├── README.md ├── combined_data.csv ├── step1_split_with_type.ipynb ├── step2_calculate_by_split.ipynb ├── step3_visualization_WebDDoS.ipynb ├── step4_concat_500_per_class.ipynb ├── step5_meachineLearning.ipynb ├── step5_meachineLearning_Linear.ipynb ├── step5_meachineLearning_heatmap.ipynb ├── step6_PCA_t-SNE.ipynb ├── step7_mlp.ipynb └── step8_cnn.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # CIC-DDoS2019-Detection 2 | 数据清洗与合并 Data Cleaning and Merging ; 3 | 4 | 机器学习模型 Machine Learning Models; 5 | 6 | 深度学习模型 Deep Learning Models ; 7 | 8 | PCA, t-SNE分析 PCA, t-SNE Analysis; 9 | 10 | 数据,结果可视化 Data and Results Visualization 11 | 12 | 13 | 14 | ## CIC-DDoS2019 15 | 16 | 17 | 18 | 对`CIC-DDoS2019`数据集进行检测,本文提供了如下内容: 19 | 20 | + 数据清洗与合并 21 | + 机器学习模型 22 | + 深度学习模型 23 | + PCA,t-SNE分析 24 | + 数据,结果可视化 25 | 26 | 27 | 28 | ### 1、数据集加载 29 | 30 | 31 | 32 | 选择的数据集是这里的csv文件[CIC-DDoS2019 (kaggle.com)](https://www.kaggle.com/datasets/dhoogla/cicddos2019) 33 | 34 | ![image-20240618203852993](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182038051.png) 35 | 36 | 37 | 38 | ``` 39 | 链接:https://pan.baidu.com/s/1gP86I08ZQhAOgcfCd5OVVw?pwd=2019 40 | 提取码:2019 41 | ``` 42 | 43 | 44 | 45 | ### 2、数据分割 46 | 47 | ```python 48 | import os 49 | import pandas as pd 50 | 51 | # 设置包含CSV文件的目录 52 | directory = 'class_split' # 替换为您的目录路径 53 | 54 | # 列出目录下所有的CSV文件 55 | csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')] 56 | 57 | # 读取每个CSV文件并打印行数 58 | for csv_file in csv_files: 59 | file_path = os.path.join(directory, csv_file) 60 | try: 61 | # 读取CSV文件 62 | data = pd.read_csv(file_path) 63 | # 获取行数 64 | num_rows = len(data) 65 | print(f"{csv_file}: {num_rows} 行") 66 | except Exception as e: 67 | print(f"无法读取 {csv_file}: {e}") 68 | 69 | ``` 70 | 71 | ![image-20240618205624924](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182056981.png) 72 | 73 | 
![image-20240618205239085](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182052134.png) 74 | 75 | 76 | 77 | 78 | 79 | ### 3、数据可视化 80 | 81 | ```python 82 | import pandas as pd 83 | import matplotlib.pyplot as plt 84 | import seaborn as sns 85 | # 加载数据 86 | file_path = './class_split/WebDDoS.csv' 87 | data = pd.read_csv(file_path) 88 | 89 | # 设置绘图样式 90 | sns.set(style="whitegrid") 91 | 92 | # 创建一个图形框架 93 | fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6)) 94 | 95 | # 散点图:流持续时间与前向数据包数量 96 | sns.scatterplot(ax=axes[0], x=data[' Flow Duration'], y=data[' Total Fwd Packets'], color='blue') 97 | axes[0].set_title('Flow Duration vs Total Fwd Packets') 98 | axes[0].set_xlabel('Flow Duration') 99 | axes[0].set_ylabel('Total Fwd Packets') 100 | 101 | # 箱线图:前向和后向数据包的分布 102 | sns.boxplot(data=data[[' Total Fwd Packets', ' Total Backward Packets']], ax=axes[1]) 103 | axes[1].set_title('Distribution of Packet Counts') 104 | axes[1].set_ylabel('Packet Counts') 105 | 106 | plt.tight_layout() 107 | plt.show() 108 | 109 | 110 | ``` 111 | 112 | ![image-20240618205846107](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182058190.png) 113 | 114 | 115 | 116 | 117 | 118 | ```python 119 | import pandas as pd 120 | import matplotlib.pyplot as plt 121 | import seaborn as sns 122 | import numpy as np 123 | 124 | # 加载数据 125 | file_path = './class_split/WebDDoS.csv' 126 | data = pd.read_csv(file_path) 127 | 128 | # 将时间列转换为 datetime 类型 129 | data[' Timestamp'] = pd.to_datetime(data[' Timestamp']) 130 | 131 | # 筛选出数值型数据列 132 | numeric_data = data.select_dtypes(include=[np.number]) 133 | 134 | # 设置绘图样式 135 | sns.set(style="whitegrid") 136 | 137 | # 创建图形框架,一行两列 138 | fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6)) 139 | 140 | # 时间序列分析:数据包数量随时间的变化 141 | data.sort_values(' Timestamp', inplace=True) 142 | data['Packet Count'] = data[' Total Fwd Packets'] + data[' Total Backward Packets'] 143 | data.plot(x=' Timestamp', y='Packet Count', ax=axes[0], 
title='Packet Count Over Time') 144 | 145 | # 热图:特征间的相关性 146 | correlation_matrix = numeric_data.corr() 147 | sns.heatmap(correlation_matrix, ax=axes[1]) 148 | axes[1].set_title('Feature Correlation Heatmap') 149 | 150 | plt.tight_layout() 151 | plt.show() 152 | 153 | ``` 154 | 155 | ![image-20240618205929292](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182059383.png) 156 | 157 | ```python 158 | import pandas as pd 159 | import matplotlib.pyplot as plt 160 | import seaborn as sns 161 | 162 | # 加载数据 163 | file_path = './class_split/WebDDoS.csv' 164 | data = pd.read_csv(file_path) 165 | 166 | # 设置绘图样式 167 | sns.set(style="whitegrid") 168 | 169 | # 创建图形框架,一行两列 170 | fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 9)) 171 | 172 | # 小提琴图:前向包大小与反向包大小 173 | sns.violinplot(data=data[[' Total Fwd Packets', ' Total Backward Packets']], ax=axes[0]) 174 | axes[0].set_title('Violin Plot of Packet Sizes') 175 | 176 | # 选择源端口和目的端口的频率最高的前5个端口 177 | top_src_ports = data[' Source Port'].value_counts().nlargest(5) 178 | top_dst_ports = data[' Destination Port'].value_counts().nlargest(5) 179 | 180 | # 圆饼图:显示源端口的计数 181 | axes[1].pie(top_src_ports, labels=top_src_ports.index, autopct='%1.1f%%', startangle=140) 182 | axes[1].set_title('Pie Chart of Top 5 Source Ports') 183 | 184 | plt.tight_layout() 185 | plt.show() 186 | 187 | 188 | ``` 189 | 190 | ![violin_pie](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182102934.png) 191 | 192 | 193 | 194 | 195 | 196 | ```python 197 | import pandas as pd 198 | import seaborn as sns 199 | import matplotlib.pyplot as plt 200 | import numpy as np 201 | 202 | # 加载数据 203 | file_path = './class_split/WebDDoS.csv' 204 | data = pd.read_csv(file_path) 205 | 206 | # 数据清洗,处理可能的无穷大或不合理的值 207 | data['Flow Bytes/s'] = pd.to_numeric(data['Flow Bytes/s'], errors='coerce').replace([np.inf, -np.inf], np.nan).fillna(0) 208 | 209 | # 选择几个数值变量进行分析 210 | selected_columns = [' Flow Duration', ' Total Fwd Packets', ' Total Backward 
Packets', 'Flow Bytes/s'] 211 | selected_data = data[selected_columns] 212 | 213 | # 设置绘图样式 214 | sns.set(style="whitegrid") 215 | 216 | # 创建对角线分布图 217 | pair_plot = sns.pairplot(selected_data) 218 | pair_plot.fig.suptitle("Pair Plot of Selected Features", y=1.02) # 添加总标题并调整位置 219 | plt.savefig("pair_plot.png") 220 | plt.show() 221 | 222 | ``` 223 | 224 | ![pair_plot](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182102335.png) 225 | 226 | 227 | 228 | 229 | 230 | ### 4、数据合并 231 | 232 | ```python 233 | import pandas as pd 234 | import os 235 | 236 | # 文件目录 237 | directory = './class_split/' 238 | 239 | # 文件列表 240 | files = [ 241 | 'BENIGN.csv', 'DrDoS_DNS.csv', 'DrDoS_LDAP.csv', 'DrDoS_MSSQL.csv', 242 | 'DrDoS_NTP.csv', 'DrDoS_NetBIOS.csv', 'DrDoS_SNMP.csv', 'DrDoS_UDP.csv', 243 | 'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'Portmap.csv', 244 | 'Syn.csv', 'TFTP.csv', 'UDP.csv', 'UDP-lag.csv' 245 | ] 246 | 247 | # 创建空的DataFrame 248 | combined_data = pd.DataFrame() 249 | 250 | # 对每个文件进行处理 251 | for file in files: 252 | file_path = os.path.join(directory, file) 253 | # 加载数据 254 | data = pd.read_csv(file_path) 255 | # 随机选取500条数据 256 | sample_data = data.sample(n=500, random_state=1) 257 | # 将数据加入到总的DataFrame中 258 | combined_data = pd.concat([combined_data, sample_data], ignore_index=True) 259 | 260 | # 保存到新的CSV文件 261 | combined_data.to_csv('./combined_data.csv', index=False) 262 | 263 | print("数据合并完成,已保存到combined_data.csv") 264 | 265 | ``` 266 | 267 | 对于每一种类型都随机选择500个样本,合并后保存为`combined_data.csv` 268 | 269 | 【注:本文提供的csv可满足简单的训练,如果需要更多的数据,可以下载官方数据】 270 | 271 | ### 5、机器学习 272 | 273 | #### Logistic 274 | 275 | ```python 276 | # 训练逻辑回归模型 277 | logreg.fit(X_train, y_train) 278 | y_pred_logreg = logreg.predict(X_test) 279 | print("Logistic Regression Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_logreg) * 100)) 280 | 281 | ``` 282 | 283 | 284 | 285 | 286 | 287 | #### Random Forest 288 | 289 | ```python 290 | # 训练随机森林模型 291 | rf.fit(X_train, y_train) 292 | y_pred_rf = 
rf.predict(X_test) 293 | print("Random Forest Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_rf) * 100)) 294 | 295 | ``` 296 | 297 | 298 | 299 | 300 | 301 | #### SVM 302 | 303 | ```python 304 | # 训练支持向量机模型 305 | svm.fit(X_train, y_train) 306 | y_pred_svm = svm.predict(X_test) 307 | print("SVM Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_svm) * 100)) 308 | 309 | ``` 310 | 311 | 312 | 313 | #### XGBoost 314 | 315 | ```python 316 | # 训练XGBoost模型 317 | xgb.fit(X_train, y_train) 318 | y_pred_xgb = xgb.predict(X_test) 319 | print("XGBoost Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_xgb) * 100)) 320 | 321 | # 打印分类报告(以XGBoost为例) 322 | print("\nClassification Report for XGBoost:") 323 | print(classification_report(y_test, y_pred_xgb)) 324 | ``` 325 | 326 | 327 | 328 | 329 | 330 | ```python 331 | from sklearn.linear_model import LogisticRegression 332 | from sklearn.ensemble import RandomForestClassifier 333 | from sklearn.svm import SVC 334 | from xgboost import XGBClassifier 335 | from sklearn.metrics import accuracy_score, classification_report 336 | 337 | # 初始化模型 338 | logreg = LogisticRegression(max_iter=1000) 339 | rf = RandomForestClassifier(n_estimators=100) 340 | svm = SVC() 341 | xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss') 342 | 343 | # 训练逻辑回归模型 344 | logreg.fit(X_train, y_train) 345 | y_pred_logreg = logreg.predict(X_test) 346 | print("Logistic Regression Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_logreg) * 100)) 347 | 348 | # 训练随机森林模型 349 | rf.fit(X_train, y_train) 350 | y_pred_rf = rf.predict(X_test) 351 | print("Random Forest Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_rf) * 100)) 352 | 353 | # 训练支持向量机模型 354 | svm.fit(X_train, y_train) 355 | y_pred_svm = svm.predict(X_test) 356 | print("SVM Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_svm) * 100)) 357 | 358 | # 训练XGBoost模型 359 | xgb.fit(X_train, y_train) 360 | y_pred_xgb = xgb.predict(X_test) 361 | print("XGBoost 
Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_xgb) * 100)) 362 | 363 | # 打印分类报告(以XGBoost为例) 364 | print("\nClassification Report for XGBoost:") 365 | print(classification_report(y_test, y_pred_xgb)) 366 | 367 | ``` 368 | 369 | ``` 370 | Logistic Regression Accuracy: 54.96% 371 | Random Forest Accuracy: 62.04% 372 | SVM Accuracy: 50.17% 373 | XGBoost Accuracy: 62.75% 374 | 375 | Classification Report for XGBoost: 376 | precision recall f1-score support 377 | 378 | 0 0.99 0.99 0.99 170 379 | 1 0.50 0.42 0.45 143 380 | 2 0.31 0.25 0.28 174 381 | 3 0.56 0.52 0.54 159 382 | 4 0.99 0.99 0.99 145 383 | 5 0.45 0.42 0.43 146 384 | 6 0.60 0.65 0.63 148 385 | 7 0.46 0.55 0.50 121 386 | 8 0.36 0.46 0.40 144 387 | 9 0.54 0.56 0.55 156 388 | 10 0.38 0.40 0.39 154 389 | 11 0.40 0.44 0.42 146 390 | 12 0.99 0.98 0.99 150 391 | 13 1.00 0.97 0.99 158 392 | 14 0.51 0.49 0.50 130 393 | 15 0.92 0.90 0.91 156 394 | 395 | accuracy 0.63 2400 396 | macro avg 0.62 0.62 0.62 2400 397 | weighted avg 0.63 0.63 0.63 2400 398 | ``` 399 | 400 | 401 | 402 | ```python 403 | from sklearn.metrics import confusion_matrix 404 | import seaborn as sns 405 | from sklearn.linear_model import LogisticRegression 406 | from sklearn.ensemble import RandomForestClassifier 407 | from sklearn.svm import SVC 408 | from xgboost import XGBClassifier 409 | from sklearn.metrics import accuracy_score, classification_report 410 | import matplotlib.pyplot as plt 411 | 412 | # 初始化模型 413 | logreg = LogisticRegression(max_iter=1000) 414 | rf = RandomForestClassifier(n_estimators=100) 415 | svm = SVC() 416 | 417 | # 训练模型 418 | logreg.fit(X_train, y_train) 419 | rf.fit(X_train, y_train) 420 | svm.fit(X_train, y_train) 421 | 422 | # 预测结果 423 | y_pred_logreg = logreg.predict(X_test) 424 | y_pred_rf = rf.predict(X_test) 425 | y_pred_svm = svm.predict(X_test) 426 | 427 | # 混淆矩阵 428 | cm_logreg = confusion_matrix(y_test, y_pred_logreg) 429 | cm_rf = confusion_matrix(y_test, y_pred_rf) 430 | cm_svm = 
confusion_matrix(y_test, y_pred_svm) 431 | 432 | # 绘制混淆矩阵的热图 433 | fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6)) 434 | sns.heatmap(cm_logreg, annot=True, fmt="d", ax=axes[0], cmap='Blues') 435 | axes[0].set_title('Logistic Regression Confusion Matrix') 436 | axes[0].set_xlabel('Predicted labels') 437 | axes[0].set_ylabel('True labels') 438 | 439 | sns.heatmap(cm_rf, annot=True, fmt="d", ax=axes[1], cmap='Blues') 440 | axes[1].set_title('Random Forest Confusion Matrix') 441 | axes[1].set_xlabel('Predicted labels') 442 | axes[1].set_ylabel('True labels') 443 | 444 | sns.heatmap(cm_svm, annot=True, fmt="d", ax=axes[2], cmap='Blues') 445 | axes[2].set_title('SVM Confusion Matrix') 446 | axes[2].set_xlabel('Predicted labels') 447 | axes[2].set_ylabel('True labels') 448 | 449 | plt.tight_layout() 450 | plt.savefig("confusion.png") 451 | plt.show() 452 | 453 | ``` 454 | 455 | ![confusion](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182109743.png) 456 | 457 | 458 | 459 | ### 6、PCA t-SNE 460 | 461 | 462 | 463 | ```python 464 | from sklearn.decomposition import PCA 465 | from sklearn.manifold import TSNE 466 | import matplotlib.pyplot as plt 467 | import pandas as pd 468 | from sklearn.preprocessing import StandardScaler, LabelEncoder 469 | from sklearn.model_selection import train_test_split 470 | import numpy as np 471 | 472 | # 加载数据 473 | data = pd.read_csv('./combined_data.csv') 474 | 475 | # 删除不需要的列,例如时间戳或IP地址(假设你的数据集中有这些列) 476 | data.drop([' Timestamp'], axis=1, inplace=True) 477 | 478 | # 类型转换,将分类标签编码 479 | label_encoder = LabelEncoder() 480 | data[' Label'] = label_encoder.fit_transform(data[' Label']) 481 | 482 | # 检查并处理无穷大和非常大的数值 483 | data.replace([np.inf, -np.inf], np.nan, inplace=True) # 将inf替换为NaN 484 | data.fillna(data.median(), inplace=True) # 使用中位数填充NaN,确保之前中位数计算不包括inf 485 | 486 | # 特征标准化 487 | scaler = StandardScaler() 488 | X = scaler.fit_transform(data.drop(' Label', axis=1)) # 确保标签列不参与标准化 489 | y = data[' Label'] 490 | # PCA 
491 | pca = PCA(n_components=2) 492 | X_pca = pca.fit_transform(X) 493 | 494 | # t-SNE 495 | tsne = TSNE(n_components=2, random_state=42) 496 | X_tsne = tsne.fit_transform(X) 497 | 498 | # 可视化 PCA 499 | plt.figure(figsize=(8, 8)) 500 | plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.5) 501 | plt.title('PCA of Dataset') 502 | plt.xlabel('Principal Component 1') 503 | plt.ylabel('Principal Component 2') 504 | plt.colorbar() 505 | plt.show() 506 | 507 | # 可视化 t-SNE 508 | plt.figure(figsize=(8, 8)) 509 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap='viridis', alpha=0.5) 510 | plt.title('t-SNE of Dataset') 511 | plt.xlabel('t-SNE Feature 1') 512 | plt.ylabel('t-SNE Feature 2') 513 | plt.colorbar() 514 | plt.show() 515 | 516 | ``` 517 | 518 | ![image-20240618211011385](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182110450.png) 519 | 520 | ![image-20240618211021915](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182110981.png) 521 | 522 | ### 7、深度学习 523 | 524 | #### MLP 525 | 526 | ```python 527 | import torch 528 | import torch.nn as nn 529 | import torch.optim as optim 530 | from torch.utils.data import DataLoader, TensorDataset 531 | 532 | # 定义模型 533 | class NeuralNetwork(nn.Module): 534 | def __init__(self, input_size, num_classes): 535 | super(NeuralNetwork, self).__init__() 536 | self.layer1 = nn.Linear(input_size, 64) 537 | self.relu = nn.ReLU() 538 | self.layer2 = nn.Linear(64, 64) 539 | self.output_layer = nn.Linear(64, num_classes) 540 | 541 | def forward(self, x): 542 | x = self.relu(self.layer1(x)) 543 | x = self.relu(self.layer2(x)) 544 | x = self.output_layer(x) 545 | return x 546 | 547 | # 初始化模型 548 | input_size = X_train.shape[1] 549 | num_classes = len(np.unique(y)) 550 | model = NeuralNetwork(input_size, num_classes) 551 | 552 | # 损失函数和优化器 553 | criterion = nn.CrossEntropyLoss() 554 | optimizer = optim.Adam(model.parameters(), lr=0.001) 555 | 556 | ``` 557 | 558 | 
![image-20240618211132917](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182111981.png) 559 | 560 | ![image-20240618211140162](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182111228.png) 561 | 562 | #### CNN 563 | 564 | ```python 565 | import torch 566 | import torch.nn as nn 567 | import torch.optim as optim 568 | from torch.utils.data import DataLoader, TensorDataset 569 | 570 | # 定义模型 571 | class CNN(nn.Module): 572 | def __init__(self, input_size, num_classes): 573 | super(CNN, self).__init__() 574 | self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1) 575 | self.relu = nn.ReLU() 576 | self.pool = nn.MaxPool1d(kernel_size=2, stride=2) 577 | self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1) 578 | # 计算池化后的尺寸 579 | conv1_out_size = (input_size + 2 * 1 - 3) / 1 + 1 # Conv1 580 | pool1_out_size = conv1_out_size / 2 # Pool1 581 | conv2_out_size = (pool1_out_size + 2 * 1 - 3) / 1 + 1 # Conv2 582 | pool2_out_size = conv2_out_size / 2 # Pool2 583 | final_size = int(pool2_out_size) * 32 # conv2 的输出通道数 * 输出长度 584 | self.fc = nn.Linear(final_size, num_classes) 585 | 586 | def forward(self, x): 587 | x = x.unsqueeze(1) # Adding a channel dimension 588 | x = self.relu(self.conv1(x)) 589 | x = self.pool(x) 590 | x = self.relu(self.conv2(x)) 591 | x = self.pool(x) 592 | x = torch.flatten(x, 1) 593 | x = self.fc(x) 594 | return x 595 | 596 | 597 | # 初始化模型 598 | input_size = X_train.shape[1] 599 | num_classes = len(np.unique(y)) 600 | model = CNN(input_size,num_classes) 601 | 602 | # 损失函数和优化器 603 | criterion = nn.CrossEntropyLoss() 604 | optimizer = optim.Adam(model.parameters(), lr=0.001) 605 | 606 | ``` 607 | 608 | ![image-20240618211231779](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182112848.png) 609 | 610 | ![image-20240618211238465](https://daetz-image.oss-cn-hangzhou.aliyuncs.com/img/202406182112545.png) 611 | -------------------------------------------------------------------------------- 
/step1_split_with_type.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !pip install pandas\n", 10 | "# !pip install scikit-learn" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Load Data" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 25 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 26 | "tags": [] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np # linear algebra\n", 31 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 32 | "from sklearn.utils import resample\n", 33 | "from sklearn import preprocessing\n", 34 | "\n", 35 | "import os\n", 36 | "dataset_path = []\n", 37 | "\n", 38 | "for dirname, _, filenames in os.walk('./data/'):\n", 39 | " for filename in filenames:\n", 40 | " if filename.endswith('.csv'):\n", 41 | " dfp = os.path.join(dirname, filename)\n", 42 | " dataset_path.append(dfp)\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "17\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "print(len(dataset_path))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# mult = 5\n", 69 | "cols = list(pd.read_csv(dataset_path[1], nrows=1))\n", 70 | "\n", 71 | "def load_file(path):\n", 72 | " # data = pd.read_csv(path, sep=',')\n", 73 | " data = pd.read_csv(path,\n", 74 | " usecols =[i for i in cols if i != \" Source IP\" \n", 75 | " and i != ' Destination IP' and i != 'Flow ID' \n", 76 | " and i != 'SimillarHTTP' and i != 'Unnamed: 0'])\n", 77 | "\n", 
78 | " return data" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "samples = pd.concat([load_file(dfp) for dfp in dataset_path], ignore_index=True)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "image/png": "\n", 98 | "text/plain": [ 99 | "
" 100 | ] 101 | }, 102 | "metadata": { 103 | "needs_background": "light" 104 | }, 105 | "output_type": "display_data" 106 | } 107 | ], 108 | "source": [ 109 | "import pandas as pd\n", 110 | "import matplotlib.pyplot as plt\n", 111 | "\n", 112 | "# Count the occurrences of each label\n", 113 | "label_counts = samples[' Label'].value_counts()\n", 114 | "\n", 115 | "# Create a bar plot to visualize the label counts\n", 116 | "plt.figure(figsize=(10, 6))\n", 117 | "label_counts.plot(kind='bar')\n", 118 | "plt.title('Comparison of Label Column')\n", 119 | "plt.xlabel('Label')\n", 120 | "plt.ylabel('Count')\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/html": [ 132 | "
\n", 133 | "\n", 146 | "\n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " 
\n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | "
Source PortDestination PortProtocolTimestampFlow DurationTotal Fwd PacketsTotal Backward PacketsTotal Length of Fwd PacketsTotal Length of Bwd PacketsFwd Packet Length Max...Active MeanActive StdActive MaxActive MinIdle MeanIdle StdIdle MaxIdle MinInboundLabel
091020073172018-12-01 11:09:58.62254548202944.00.01472.0...0.00.00.00.00.00.00.00.01DrDoS_DNS
158839159172018-12-01 11:10:43.9115812202944.00.01472.0...0.00.00.00.00.00.00.00.01DrDoS_DNS
295322161172018-12-01 11:08:16.5006551202944.00.01472.0...0.00.00.00.00.00.00.00.01DrDoS_DNS
366318811172018-12-01 11:11:00.6834381202944.00.01472.0...0.00.00.00.00.00.00.00.01DrDoS_DNS
45914168172018-12-01 11:11:08.1362061202896.00.01448.0...0.00.00.00.00.00.00.00.01DrDoS_DNS
..................................................................
1130645804628062018-11-03 11:30:16.18603653120.00.00.0...0.00.00.00.00.00.00.00.01BENIGN
1130646804628262018-11-03 11:30:16.18616366120.00.00.0...0.00.00.00.00.00.00.00.01BENIGN
113064751233123172018-11-03 11:30:16.402521262882296.096.048.0...0.00.00.00.00.00.00.00.00BENIGN
1130648529688062018-11-03 11:30:17.14519930679110.00.00.0...0.00.00.00.00.00.00.00.00BENIGN
1130649805296862018-11-03 11:30:17.17587967120.00.00.0...0.00.00.00.00.00.00.00.01BENIGN
\n", 440 | "

1130650 rows × 83 columns

\n", 441 | "
" 442 | ], 443 | "text/plain": [ 444 | " Source Port Destination Port Protocol \\\n", 445 | "0 910 20073 17 \n", 446 | "1 588 39159 17 \n", 447 | "2 953 22161 17 \n", 448 | "3 663 18811 17 \n", 449 | "4 591 4168 17 \n", 450 | "... ... ... ... \n", 451 | "1130645 80 46280 6 \n", 452 | "1130646 80 46282 6 \n", 453 | "1130647 51233 123 17 \n", 454 | "1130648 52968 80 6 \n", 455 | "1130649 80 52968 6 \n", 456 | "\n", 457 | " Timestamp Flow Duration Total Fwd Packets \\\n", 458 | "0 2018-12-01 11:09:58.622545 48 2 \n", 459 | "1 2018-12-01 11:10:43.911581 2 2 \n", 460 | "2 2018-12-01 11:08:16.500655 1 2 \n", 461 | "3 2018-12-01 11:11:00.683438 1 2 \n", 462 | "4 2018-12-01 11:11:08.136206 1 2 \n", 463 | "... ... ... ... \n", 464 | "1130645 2018-11-03 11:30:16.186036 53 1 \n", 465 | "1130646 2018-11-03 11:30:16.186163 66 1 \n", 466 | "1130647 2018-11-03 11:30:16.402521 26288 2 \n", 467 | "1130648 2018-11-03 11:30:17.145199 30679 1 \n", 468 | "1130649 2018-11-03 11:30:17.175879 67 1 \n", 469 | "\n", 470 | " Total Backward Packets Total Length of Fwd Packets \\\n", 471 | "0 0 2944.0 \n", 472 | "1 0 2944.0 \n", 473 | "2 0 2944.0 \n", 474 | "3 0 2944.0 \n", 475 | "4 0 2896.0 \n", 476 | "... ... ... \n", 477 | "1130645 2 0.0 \n", 478 | "1130646 2 0.0 \n", 479 | "1130647 2 96.0 \n", 480 | "1130648 1 0.0 \n", 481 | "1130649 2 0.0 \n", 482 | "\n", 483 | " Total Length of Bwd Packets Fwd Packet Length Max ... \\\n", 484 | "0 0.0 1472.0 ... \n", 485 | "1 0.0 1472.0 ... \n", 486 | "2 0.0 1472.0 ... \n", 487 | "3 0.0 1472.0 ... \n", 488 | "4 0.0 1448.0 ... \n", 489 | "... ... ... ... \n", 490 | "1130645 0.0 0.0 ... \n", 491 | "1130646 0.0 0.0 ... \n", 492 | "1130647 96.0 48.0 ... \n", 493 | "1130648 0.0 0.0 ... \n", 494 | "1130649 0.0 0.0 ... 
\n", 495 | "\n", 496 | " Active Mean Active Std Active Max Active Min Idle Mean \\\n", 497 | "0 0.0 0.0 0.0 0.0 0.0 \n", 498 | "1 0.0 0.0 0.0 0.0 0.0 \n", 499 | "2 0.0 0.0 0.0 0.0 0.0 \n", 500 | "3 0.0 0.0 0.0 0.0 0.0 \n", 501 | "4 0.0 0.0 0.0 0.0 0.0 \n", 502 | "... ... ... ... ... ... \n", 503 | "1130645 0.0 0.0 0.0 0.0 0.0 \n", 504 | "1130646 0.0 0.0 0.0 0.0 0.0 \n", 505 | "1130647 0.0 0.0 0.0 0.0 0.0 \n", 506 | "1130648 0.0 0.0 0.0 0.0 0.0 \n", 507 | "1130649 0.0 0.0 0.0 0.0 0.0 \n", 508 | "\n", 509 | " Idle Std Idle Max Idle Min Inbound Label \n", 510 | "0 0.0 0.0 0.0 1 DrDoS_DNS \n", 511 | "1 0.0 0.0 0.0 1 DrDoS_DNS \n", 512 | "2 0.0 0.0 0.0 1 DrDoS_DNS \n", 513 | "3 0.0 0.0 0.0 1 DrDoS_DNS \n", 514 | "4 0.0 0.0 0.0 1 DrDoS_DNS \n", 515 | "... ... ... ... ... ... \n", 516 | "1130645 0.0 0.0 0.0 1 BENIGN \n", 517 | "1130646 0.0 0.0 0.0 1 BENIGN \n", 518 | "1130647 0.0 0.0 0.0 0 BENIGN \n", 519 | "1130648 0.0 0.0 0.0 0 BENIGN \n", 520 | "1130649 0.0 0.0 0.0 1 BENIGN \n", 521 | "\n", 522 | "[1130650 rows x 83 columns]" 523 | ] 524 | }, 525 | "execution_count": 7, 526 | "metadata": {}, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "samples" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 8, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "数据已按类别分割并保存到文件中。\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "\n", 549 | "# 根据 'Label' 列分组数据\n", 550 | "grouped = samples.groupby(' Label')\n", 551 | "\n", 552 | "# 创建存储分割文件的目录\n", 553 | "output_dir = 'class_split'\n", 554 | "if not os.path.exists(output_dir):\n", 555 | " os.makedirs(output_dir)\n", 556 | "\n", 557 | "# 为每个类别保存一个CSV文件\n", 558 | "for label, group in grouped:\n", 559 | " filename = os.path.join(output_dir, f\"{label}.csv\")\n", 560 | " group.to_csv(filename, index=False)\n", 561 | "\n", 562 | "print(\"数据已按类别分割并保存到文件中。\")" 563 | ] 564 | } 565 | ], 566 | "metadata": { 
567 | "kernelspec": { 568 | "display_name": "Python 3 (ipykernel)", 569 | "language": "python", 570 | "name": "python3" 571 | }, 572 | "language_info": { 573 | "codemirror_mode": { 574 | "name": "ipython", 575 | "version": 3 576 | }, 577 | "file_extension": ".py", 578 | "mimetype": "text/x-python", 579 | "name": "python", 580 | "nbconvert_exporter": "python", 581 | "pygments_lexer": "ipython3", 582 | "version": "3.8.10" 583 | } 584 | }, 585 | "nbformat": 4, 586 | "nbformat_minor": 4 587 | } 588 | -------------------------------------------------------------------------------- /step2_calculate_by_split.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c6dca9d8-aabf-466e-ad35-31fec5211f7d", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "BENIGN.csv: 113065 行\n", 14 | "DrDoS_DNS.csv: 30618 行\n", 15 | "DrDoS_LDAP.csv: 14508 行\n", 16 | "DrDoS_MSSQL.csv: 18054 行\n", 17 | "DrDoS_NTP.csv: 129285 行\n", 18 | "DrDoS_NetBIOS.csv: 15363 行\n", 19 | "DrDoS_SNMP.csv: 13563 行\n", 20 | "DrDoS_UDP.csv: 19413 行\n", 21 | "LDAP.csv: 41801 行\n", 22 | "MSSQL.csv: 25280 行\n", 23 | "NetBIOS.csv: 16252 行\n", 24 | "Portmap.csv: 42606 行\n", 25 | "Syn.csv: 356496 行\n", 26 | "TFTP.csv: 227223 行\n", 27 | "UDP.csv: 33695 行\n", 28 | "UDP-lag.csv: 33294 行\n", 29 | "UDPLag.csv: 83 行\n", 30 | "WebDDoS.csv: 51 行\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "import os\n", 36 | "import pandas as pd\n", 37 | "\n", 38 | "# 设置包含CSV文件的目录\n", 39 | "directory = 'class_split' # 替换为您的目录路径\n", 40 | "\n", 41 | "# 列出目录下所有的CSV文件\n", 42 | "csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]\n", 43 | "\n", 44 | "# 读取每个CSV文件并打印行数\n", 45 | "for csv_file in csv_files:\n", 46 | " file_path = os.path.join(directory, csv_file)\n", 47 | " try:\n", 48 | " # 读取CSV文件\n", 49 | " data = pd.read_csv(file_path)\n", 50 | 
" # 获取行数\n", 51 | " num_rows = len(data)\n", 52 | " print(f\"{csv_file}: {num_rows} 行\")\n", 53 | " except Exception as e:\n", 54 | " print(f\"无法读取 {csv_file}: {e}\")\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "1b2fb846-4ca5-4593-a6e1-40bf82817d80", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3 (ipykernel)", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.8.10" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 5 87 | } 88 | -------------------------------------------------------------------------------- /step4_concat_500_per_class.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "9b206acf-66c3-411a-8139-f34a29dcacb9", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "数据合并完成,已保存到combined_data.csv\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import os\n", 20 | "\n", 21 | "# 文件目录\n", 22 | "directory = './class_split/'\n", 23 | "\n", 24 | "# 文件列表\n", 25 | "files = [\n", 26 | " 'BENIGN.csv', 'DrDoS_DNS.csv', 'DrDoS_LDAP.csv', 'DrDoS_MSSQL.csv',\n", 27 | " 'DrDoS_NTP.csv', 'DrDoS_NetBIOS.csv', 'DrDoS_SNMP.csv', 'DrDoS_UDP.csv',\n", 28 | " 'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'Portmap.csv',\n", 29 | " 'Syn.csv', 'TFTP.csv', 'UDP.csv', 'UDP-lag.csv'\n", 30 | "]\n", 31 | "\n", 32 | "# 创建空的DataFrame\n", 33 | "combined_data = pd.DataFrame()\n", 34 | "\n", 35 | "# 对每个文件进行处理\n", 36 | "for file in files:\n", 37 | " 
file_path = os.path.join(directory, file)\n", 38 | " # 加载数据\n", 39 | " data = pd.read_csv(file_path)\n", 40 | " # 随机选取500条数据\n", 41 | " sample_data = data.sample(n=500, random_state=1)\n", 42 | " # 将数据加入到总的DataFrame中\n", 43 | " combined_data = pd.concat([combined_data, sample_data], ignore_index=True)\n", 44 | "\n", 45 | "# 保存到新的CSV文件\n", 46 | "combined_data.to_csv('./combined_data.csv', index=False)\n", 47 | "\n", 48 | "print(\"数据合并完成,已保存到combined_data.csv\")\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "d82fdf83-9cb3-4a8f-a781-ff46c41460d1", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": "Python 3 (ipykernel)", 63 | "language": "python", 64 | "name": "python3" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.8.10" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 5 81 | } 82 | -------------------------------------------------------------------------------- /step5_meachineLearning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "id": "4f2dc226-adbc-411e-99c9-738112cb2d93", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Looking in indexes: http://mirrors.aliyun.com/pypi/simple\n", 14 | "Collecting xgboost\n", 15 | " Downloading http://mirrors.aliyun.com/pypi/packages/c3/eb/496aa2f5d356af4185f770bc76055307f8d1870e11016b10fd779b21769c/xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)\n", 16 | "\u001b[K |████████████████████████████████| 297.1 MB 357 kB/s eta 0:00:01 |███████████████████████████▎ | 
253.5 MB 386 kB/s eta 0:01:53 |███████████████████████████▍ | 254.1 MB 290 kB/s eta 0:02:28 |████████████████████████████▉ | 267.7 MB 337 kB/s eta 0:01:28\n", 17 | "\u001b[?25hRequirement already satisfied: numpy in /root/miniconda3/lib/python3.8/site-packages (from xgboost) (1.21.4)\n", 18 | "Requirement already satisfied: scipy in /root/miniconda3/lib/python3.8/site-packages (from xgboost) (1.10.1)\n", 19 | "Installing collected packages: xgboost\n", 20 | "Successfully installed xgboost-2.0.3\n", 21 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "# !pip install xgboost" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 7, 32 | "id": "83f5001c-4f04-466b-b942-ef2d3800f8b5", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "数据预处理完成,准备进行模型训练和测试。\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", 46 | "from sklearn.model_selection import train_test_split\n", 47 | "import numpy as np\n", 48 | "\n", 49 | "# 加载数据\n", 50 | "data = pd.read_csv('./combined_data.csv')\n", 51 | "\n", 52 | "# 删除不需要的列,例如时间戳或IP地址(假设你的数据集中有这些列)\n", 53 | "data.drop([' Timestamp'], axis=1, inplace=True)\n", 54 | "\n", 55 | "# 类型转换,将分类标签编码\n", 56 | "label_encoder = LabelEncoder()\n", 57 | "data[' Label'] = label_encoder.fit_transform(data[' Label'])\n", 58 | "\n", 59 | "# 检查并处理无穷大和非常大的数值\n", 60 | "data.replace([np.inf, -np.inf], np.nan, inplace=True) # 将inf替换为NaN\n", 61 | "data.fillna(data.median(), inplace=True) # 使用中位数填充NaN,确保之前中位数计算不包括inf\n", 62 | "\n", 63 | "# 特征标准化\n", 64 | "scaler = StandardScaler()\n", 65 | "X = scaler.fit_transform(data.drop(' Label', axis=1)) # 
确保标签列不参与标准化\n", 66 | "y = data[' Label']\n", 67 | "\n", 68 | "# 划分训练集和测试集\n", 69 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n", 70 | "\n", 71 | "print(\"数据预处理完成,准备进行模型训练和测试。\")\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 12, 77 | "id": "c3cb72e1-33a8-4fa6-92ad-eed5ab041fe4", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "Logistic Regression Accuracy: 54.96%\n", 85 | "Random Forest Accuracy: 62.04%\n", 86 | "SVM Accuracy: 50.17%\n", 87 | "XGBoost Accuracy: 62.75%\n", 88 | "\n", 89 | "Classification Report for XGBoost:\n", 90 | " precision recall f1-score support\n", 91 | "\n", 92 | " 0 0.99 0.99 0.99 170\n", 93 | " 1 0.50 0.42 0.45 143\n", 94 | " 2 0.31 0.25 0.28 174\n", 95 | " 3 0.56 0.52 0.54 159\n", 96 | " 4 0.99 0.99 0.99 145\n", 97 | " 5 0.45 0.42 0.43 146\n", 98 | " 6 0.60 0.65 0.63 148\n", 99 | " 7 0.46 0.55 0.50 121\n", 100 | " 8 0.36 0.46 0.40 144\n", 101 | " 9 0.54 0.56 0.55 156\n", 102 | " 10 0.38 0.40 0.39 154\n", 103 | " 11 0.40 0.44 0.42 146\n", 104 | " 12 0.99 0.98 0.99 150\n", 105 | " 13 1.00 0.97 0.99 158\n", 106 | " 14 0.51 0.49 0.50 130\n", 107 | " 15 0.92 0.90 0.91 156\n", 108 | "\n", 109 | " accuracy 0.63 2400\n", 110 | " macro avg 0.62 0.62 0.62 2400\n", 111 | "weighted avg 0.63 0.63 0.63 2400\n", 112 | "\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "from sklearn.linear_model import LogisticRegression\n", 118 | "from sklearn.ensemble import RandomForestClassifier\n", 119 | "from sklearn.svm import SVC\n", 120 | "from xgboost import XGBClassifier\n", 121 | "from sklearn.metrics import accuracy_score, classification_report\n", 122 | "\n", 123 | "# 初始化模型\n", 124 | "logreg = LogisticRegression(max_iter=1000)\n", 125 | "rf = RandomForestClassifier(n_estimators=100)\n", 126 | "svm = SVC()\n", 127 | "xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')\n", 128 | "\n", 129 | "# 
训练逻辑回归模型\n", 130 | "logreg.fit(X_train, y_train)\n", 131 | "y_pred_logreg = logreg.predict(X_test)\n", 132 | "print(\"Logistic Regression Accuracy: {:.2f}%\".format(accuracy_score(y_test, y_pred_logreg) * 100))\n", 133 | "\n", 134 | "# 训练随机森林模型\n", 135 | "rf.fit(X_train, y_train)\n", 136 | "y_pred_rf = rf.predict(X_test)\n", 137 | "print(\"Random Forest Accuracy: {:.2f}%\".format(accuracy_score(y_test, y_pred_rf) * 100))\n", 138 | "\n", 139 | "# 训练支持向量机模型\n", 140 | "svm.fit(X_train, y_train)\n", 141 | "y_pred_svm = svm.predict(X_test)\n", 142 | "print(\"SVM Accuracy: {:.2f}%\".format(accuracy_score(y_test, y_pred_svm) * 100))\n", 143 | "\n", 144 | "# 训练XGBoost模型\n", 145 | "xgb.fit(X_train, y_train)\n", 146 | "y_pred_xgb = xgb.predict(X_test)\n", 147 | "print(\"XGBoost Accuracy: {:.2f}%\".format(accuracy_score(y_test, y_pred_xgb) * 100))\n", 148 | "\n", 149 | "# 打印分类报告(以XGBoost为例)\n", 150 | "print(\"\\nClassification Report for XGBoost:\")\n", 151 | "print(classification_report(y_test, y_pred_xgb))\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "d4978d75-3b7d-49ca-93a1-7c84b1e99bc3", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3 (ipykernel)", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.8.10" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 5 184 | } 185 | -------------------------------------------------------------------------------- /step5_meachineLearning_Linear.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"code", 5 | "execution_count": 1, 6 | "id": "83f5001c-4f04-466b-b942-ef2d3800f8b5", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "数据预处理完成,准备进行模型训练和测试。\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "import numpy as np\n", 22 | "\n", 23 | "# 加载数据\n", 24 | "data = pd.read_csv('./combined_data.csv')\n", 25 | "\n", 26 | "# 删除不需要的列,例如时间戳或IP地址(假设你的数据集中有这些列)\n", 27 | "data.drop([' Timestamp'], axis=1, inplace=True)\n", 28 | "\n", 29 | "# 类型转换,将分类标签编码\n", 30 | "label_encoder = LabelEncoder()\n", 31 | "data[' Label'] = label_encoder.fit_transform(data[' Label'])\n", 32 | "\n", 33 | "# 检查并处理无穷大和非常大的数值\n", 34 | "data.replace([np.inf, -np.inf], np.nan, inplace=True) # 将inf替换为NaN\n", 35 | "data.fillna(data.median(), inplace=True) # 使用中位数填充NaN,确保之前中位数计算不包括inf\n", 36 | "\n", 37 | "# 特征标准化\n", 38 | "scaler = StandardScaler()\n", 39 | "X = scaler.fit_transform(data.drop(' Label', axis=1)) # 确保标签列不参与标准化\n", 40 | "y = data[' Label']\n", 41 | "\n", 42 | "# 划分训练集和测试集\n", 43 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n", 44 | "\n", 45 | "print(\"数据预处理完成,准备进行模型训练和测试。\")\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "id": "ed08b466-38a2-4286-bac5-a14ba7d02440", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "image/png": "\n", 57 | "text/plain": [ 58 | "
" 59 | ] 60 | }, 61 | "metadata": { 62 | "needs_background": "light" 63 | }, 64 | "output_type": "display_data" 65 | } 66 | ], 67 | "source": [ 68 | "from sklearn.metrics import confusion_matrix\n", 69 | "import seaborn as sns\n", 70 | "from sklearn.linear_model import LogisticRegression\n", 71 | "from sklearn.ensemble import RandomForestClassifier\n", 72 | "from sklearn.svm import SVC\n", 73 | "from xgboost import XGBClassifier\n", 74 | "from sklearn.metrics import accuracy_score, classification_report\n", 75 | "import matplotlib.pyplot as plt\n", 76 | "\n", 77 | "# 初始化模型\n", 78 | "logreg = LogisticRegression(max_iter=1000)\n", 79 | "rf = RandomForestClassifier(n_estimators=100)\n", 80 | "svm = SVC()\n", 81 | "\n", 82 | "# 训练模型\n", 83 | "logreg.fit(X_train, y_train)\n", 84 | "rf.fit(X_train, y_train)\n", 85 | "svm.fit(X_train, y_train)\n", 86 | "\n", 87 | "# 预测结果\n", 88 | "y_pred_logreg = logreg.predict(X_test)\n", 89 | "y_pred_rf = rf.predict(X_test)\n", 90 | "y_pred_svm = svm.predict(X_test)\n", 91 | "\n", 92 | "# 混淆矩阵\n", 93 | "cm_logreg = confusion_matrix(y_test, y_pred_logreg)\n", 94 | "cm_rf = confusion_matrix(y_test, y_pred_rf)\n", 95 | "cm_svm = confusion_matrix(y_test, y_pred_svm)\n", 96 | "\n", 97 | "# 绘制混淆矩阵的热图\n", 98 | "fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))\n", 99 | "sns.heatmap(cm_logreg, annot=True, fmt=\"d\", ax=axes[0], cmap='Blues')\n", 100 | "axes[0].set_title('Logistic Regression Confusion Matrix')\n", 101 | "axes[0].set_xlabel('Predicted labels')\n", 102 | "axes[0].set_ylabel('True labels')\n", 103 | "\n", 104 | "sns.heatmap(cm_rf, annot=True, fmt=\"d\", ax=axes[1], cmap='Blues')\n", 105 | "axes[1].set_title('Random Forest Confusion Matrix')\n", 106 | "axes[1].set_xlabel('Predicted labels')\n", 107 | "axes[1].set_ylabel('True labels')\n", 108 | "\n", 109 | "sns.heatmap(cm_svm, annot=True, fmt=\"d\", ax=axes[2], cmap='Blues')\n", 110 | "axes[2].set_title('SVM Confusion Matrix')\n", 111 | "axes[2].set_xlabel('Predicted 
labels')\n", 112 | "axes[2].set_ylabel('True labels')\n", 113 | "\n", 114 | "plt.tight_layout()\n", 115 | "plt.savefig(\"confusion.png\")\n", 116 | "plt.show()\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "7b50f7c2-f71d-4ae8-a5c7-4be8dc1d2c2e", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "from sklearn.metrics import confusion_matrix\n", 127 | "import seaborn as sns\n", 128 | "\n", 129 | "# 初始化模型\n", 130 | "logreg = LogisticRegression(max_iter=1000)\n", 131 | "rf = RandomForestClassifier(n_estimators=100)\n", 132 | "svm = SVC()\n", 133 | "\n", 134 | "# 训练模型\n", 135 | "logreg.fit(X_train, y_train)\n", 136 | "rf.fit(X_train, y_train)\n", 137 | "svm.fit(X_train, y_train)\n", 138 | "\n", 139 | "# 预测结果\n", 140 | "y_pred_logreg = logreg.predict(X_test)\n", 141 | "y_pred_rf = rf.predict(X_test)\n", 142 | "y_pred_svm = svm.predict(X_test)\n", 143 | "\n", 144 | "# 混淆矩阵\n", 145 | "cm_logreg = confusion_matrix(y_test, y_pred_logreg)\n", 146 | "cm_rf = confusion_matrix(y_test, y_pred_rf)\n", 147 | "cm_svm = confusion_matrix(y_test, y_pred_svm)\n", 148 | "\n", 149 | "# 绘制混淆矩阵的热图\n", 150 | "fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))\n", 151 | "sns.heatmap(cm_logreg, annot=True, fmt=\"d\", ax=axes[0], cmap='Blues')\n", 152 | "axes[0].set_title('Logistic Regression Confusion Matrix')\n", 153 | "axes[0].set_xlabel('Predicted labels')\n", 154 | "axes[0].set_ylabel('True labels')\n", 155 | "\n", 156 | "sns.heatmap(cm_rf, annot=True, fmt=\"d\", ax=axes[1], cmap='Blues')\n", 157 | "axes[1].set_title('Random Forest Confusion Matrix')\n", 158 | "axes[1].set_xlabel('Predicted labels')\n", 159 | "axes[1].set_ylabel('True labels')\n", 160 | "\n", 161 | "sns.heatmap(cm_svm, annot=True, fmt=\"d\", ax=axes[2], cmap='Blues')\n", 162 | "axes[2].set_title('SVM Confusion Matrix')\n", 163 | "axes[2].set_xlabel('Predicted labels')\n", 164 | "axes[2].set_ylabel('True labels')\n", 165 | "\n", 166 | 
"plt.tight_layout()\n", 167 | "plt.show()\n" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3 (ipykernel)", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.8.10" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 5 192 | } 193 | --------------------------------------------------------------------------------