├── README.md
├── imbalance processing
│   ├── RUS+SMOTE.ipynb
│   ├── K-means + SMOTE.ipynb
│   ├── SGM.ipynb
│   └── ROS,SMOTE,ADASYN.ipynb
├── .ipynb_checkpoints
│   ├── RUS+SMOTE-checkpoint.ipynb
│   ├── RUS+SMOTE-10-checkpoint.ipynb
│   ├── K-means + SMOTE-checkpoint.ipynb
│   ├── SGM-checkpoint.ipynb
│   ├── CNN-10-checkpoint.ipynb
│   ├── GMM + SMOTE-2-checkpoint.ipynb
│   ├── GMM + SMOTE -10-checkpoint.ipynb
│   ├── K-means + SMOTE -10-checkpoint.ipynb
│   ├── ROS,SMOTE,ADASYN-checkpoint.ipynb
│   └── data preprocessing(UNSW-NB15)-checkpoint.ipynb
├── classification decision
│   └── CNN.ipynb
└── data preprocessing(UNSW-NB15).ipynb
/README.md:
--------------------------------------------------------------------------------
 1 | # SGM-CNN
 2 | **A flow-based network intrusion detection model that integrates class imbalance processing with deep learning: SGM-CNN.**
 3 | 
 4 | ###### Copyright: Lulu Huang and Hongpo Zhang (School of Information Engineering, Zhengzhou University)
 5 | 
 6 | ###### The two contributors of this NIDS are Ms. Lulu Huang and Senior Engineer Hongpo Zhang (zhp@zzu.edu.cn). If you have any problems, please do not hesitate to send us an email.
 7 | 
 8 | 
 9 | Please cite our papers if you find our work useful.
10 | 
11 | (1) Hongpo Zhang, Lulu Huang, Chase Q. Wu and Zhanbo Li:
12 | An Effective Convolutional Neural Network Based on SMOTE and Gaussian Mixture Model for Intrusion Detection in Imbalanced Dataset. Computer Networks (2020), doi:10.1016/j.comnet.2020.107315
13 | 
14 | (2) Hongpo Zhang, Chase Q. Wu, Shan Gao, Zongmin Wang, Yuxiao Xu and Yongpeng Liu:
15 | An effective deep learning based scheme for network intrusion detection, in: 2018 24th International Conference on Pattern Recognition (ICPR), 2018, pp. 682–687. doi:10.1109/ICPR.2018.8546162.
16 | 
17 | In the following, we also provide a brief user manual for this NIDS.
18 | 
19 | # (1) Installation
20 | 
21 | Anaconda3 5.2.0; TensorFlow; Python 3.6; Keras 2.2.4
22 | 
23 | # (2) Dataset
24 | 
25 | The UNSW-NB15 dataset used to support the findings of
26 | this study is available at https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/.
27 | The CICIDS2017 dataset used to support the findings of this study is available at https://www.unb.ca/cic/datasets/ids-2017.html.
28 | 
29 | # (3) NIDS execution process
30 | 
31 | 1. Data preprocessing and feature selection. The preprocessing steps differ slightly between the two datasets.
32 | 2. Imbalance processing. Six imbalance processing techniques are provided: ROS, SMOTE, ADASYN, RUS + SMOTE, K-means + SMOTE, and SGM.
33 | 3. Classification decision. Three classification algorithms are provided: RF, MLP, and CNN. An end-to-end sketch of the whole process is shown below.
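For orientation, here is a hedged end-to-end sketch of the three steps above. It assumes stage 1 has already produced the `.npy` files; the file names, the balancing target, and the RF classifier are illustrative only. The notebooks in this repository implement each stage in full and use the older imblearn API (`ratio=` / `fit_sample`), whereas the sketch uses the current names (`sampling_strategy=` / `fit_resample`).

```python
# Hedged sketch of the full flow: preprocessing output -> imbalance processing -> classifier.
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

X_train = np.load('data_train.npy')                      # assumed stage-1 output
y_train = np.int32(np.load('label_train.npy').ravel())   # assumed stage-1 output

# Stage 2: bring every minority class up to a common target count.
# (The SGM and K-means notebooks additionally shrink the majority class by clustering.)
target = X_train.shape[0] // len(np.unique(y_train))
strategy = {int(c): target for c in np.unique(y_train)
            if np.sum(y_train == c) < target}
X_bal, y_bal = SMOTE(sampling_strategy=strategy,
                     random_state=42).fit_resample(X_train, y_train)
print(sorted(Counter(y_bal).items()))

# Stage 3: any of RF / MLP / CNN; RF shown as the shortest example.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_bal, y_bal)
```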
34 | -------------------------------------------------------------------------------- /imbalance processing/RUS+SMOTE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# oversample SMOTE" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 20 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 21 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 22 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 23 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 24 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 25 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 26 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 27 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 28 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 29 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 30 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from collections import Counter\n", 36 | "import os\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import tensorflow as tf\n", 40 | "\n", 41 | "data = np.load('E:/IDS/alldata/12/train/data.npy')\n", 42 | "label = np.load('E:/IDS/alldata/12/train/label_2.npy')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X=np.array(data) \n", 52 | "b=np.array(label)\n", 53 | "bb=b.reshape(b.shape[0],) \n", 54 | "y10 = np.int32(bb) " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "[(0, 1553132), (1, 224898)]" 66 | ] 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "sorted(Counter(y10).items())" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stderr", 
84 | "output_type": "stream", 85 | "text": [ 86 | "Using TensorFlow backend.\n" 87 | ] 88 | }, 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "[(0, 1553132), (1, 889015)]\n", 94 | "time: 233.08835577964783\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "from imblearn.over_sampling import SMOTE\n", 100 | "import time\n", 101 | "time_start = time.time()\n", 102 | "\n", 103 | "smo = SMOTE(ratio={1:889015},random_state=42) \n", 104 | "#smo = SMOTE(ratio={1:177803,2:177803,3:177803,4:177803,5:177803,6:177803,7:177803,8:177803,9:177803},random_state=42) \n", 105 | "\n", 106 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 107 | "print(sorted(Counter(y_smo).items()))\n", 108 | "\n", 109 | "time_end = time.time()\n", 110 | "time = time_end - time_start\n", 111 | "print(\"time:\",time)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "2442147" 123 | ] 124 | }, 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "X_smo.shape[0]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "# under-sample RUS" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "[(0, 889015), (1, 889015)]\n", 151 | "time: 0.6767761707305908\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "from imblearn.under_sampling import RandomUnderSampler\n", 157 | "import time\n", 158 | "time_start = time.time()\n", 159 | "\n", 160 | "rus = RandomUnderSampler(ratio={0:889015},random_state=42)\n", 161 | "\n", 162 | "X_rus, y_rus = rus.fit_sample(X_smo, y_smo) \n", 163 | "print(sorted(Counter(y_rus).items()))\n", 164 | "\n", 165 | "time_end = time.time()\n", 166 | "time = time_end - time_start\n", 167 | "print(\"time:\",time)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 8, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "label_end = y_rus.reshape(y_rus.shape[0],1)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 9, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "np.save(\"E:/IDS/alldata/12/RUS+SMOTE/label_2/data.npy\",X_rus)\n", 186 | "np.save(\"E:/IDS/alldata/12/RUS+SMOTE/label_2/label.npy\",label_end)" 187 | ] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "Python 3", 193 | "language": "python", 194 | "name": "python3" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython3", 206 | "version": "3.6.9" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 2 211 | } 212 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/RUS+SMOTE-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# oversample SMOTE" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": 
"stream", 18 | "text": [ 19 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 20 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 21 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 22 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 23 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 24 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 25 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 26 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 27 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 28 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 29 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 30 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from collections import Counter\n", 36 | "import os\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import tensorflow as tf\n", 40 | "\n", 41 | "data = np.load('E:/IDS/alldata/12/train/data.npy')\n", 42 | "label = np.load('E:/IDS/alldata/12/train/label_2.npy')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X=np.array(data) \n", 52 | "b=np.array(label)\n", 53 | "bb=b.reshape(b.shape[0],) \n", 54 | "y10 = np.int32(bb) " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "[(0, 1553132), (1, 224898)]" 66 | ] 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "sorted(Counter(y10).items())" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stderr", 84 | "output_type": "stream", 85 | "text": [ 86 | "Using TensorFlow backend.\n" 87 | ] 88 | }, 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "[(0, 1553132), (1, 889015)]\n", 94 | "time: 233.08835577964783\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "from imblearn.over_sampling import SMOTE\n", 100 | "import time\n", 101 | "time_start = time.time()\n", 102 | "\n", 103 | "smo = SMOTE(ratio={1:889015},random_state=42) \n", 104 | "#smo = 
/.ipynb_checkpoints/RUS+SMOTE-10-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SMOTE all minority classes to (number of data set samples / classes)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 20 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 21 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning:
Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 22 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 23 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 24 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 25 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 26 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 27 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 28 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 29 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 30 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from collections import Counter\n", 36 | "import os\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import tensorflow as tf\n", 40 | "\n", 41 | "data = np.load('E:/IDS/alldata/12/train/data.npy')\n", 42 | "label = np.load('E:/IDS/alldata/12/train/label_10.npy')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X=np.array(data) #np.array converts the list to an array, i.e. strips the brackets around each element\n", 52 | "b=np.array(label)\n", 53 | "bb=b.reshape(b.shape[0],) #reshape redefines the shape; the label data is now 1-D\n", 54 | "y10 = np.int32(bb) #convert the labels from float to int" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "(1778030, 12)" 66 | ] 67 | }, 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "X.shape" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "[(0, 1553132),\n", 86 | " (1, 1874),\n", 87 | " (2, 1630),\n", 88 | " (3, 11449),\n", 89 | " (4, 31167),\n", 90 | " (5, 16972),\n", 91 | " (6, 150836),\n", 92 | " (7, 9791),\n", 93 | " (8, 1057),\n", 94 | " (9, 122)]" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "sorted(Counter(y10).items())" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stderr", 113 | "output_type": "stream", 114 | "text": [ 115 | "Using TensorFlow backend.\n" 116 | ] 117 | }, 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "[(0, 1553132), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 177803), (7, 177803), (8, 177803), (9, 177803)]\n", 123 | "time:
139.76818656921387\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "#Use the SMOTE over-sampling interface from the imblearn library\n", 129 | "from imblearn.over_sampling import SMOTE\n", 130 | "import time\n", 131 | "time_start = time.time()\n", 132 | "\n", 133 | "smo = SMOTE(ratio={1:177803,2:177803,3:177803,4:177803,5:177803,6:177803,7:177803,8:177803,9:177803},random_state=42) #ratio specifies the target sample count for each class\n", 134 | "\n", 135 | "X_smo, y_smo = smo.fit_sample(X, y10) #apply SMOTE to the data and labels\n", 136 | "print(sorted(Counter(y_smo).items()))\n", 137 | "\n", 138 | "time_end = time.time()\n", 139 | "time = time_end - time_start\n", 140 | "print(\"time:\",time)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "3153359" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "X_smo.shape[0]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# under-sampling" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "[(0, 177803), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 177803), (7, 177803), (8, 177803), (9, 177803)]\n", 180 | "time: 0.7549483776092529\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "#Use the RandomUnderSampler under-sampling interface from the imblearn library\n", 186 | "from imblearn.under_sampling import RandomUnderSampler\n", 187 | "import time\n", 188 | "time_start = time.time()\n", 189 | "\n", 190 | "rus = RandomUnderSampler(ratio={0:177803},random_state=42)\n", 191 | "\n", 192 | "X_rus, y_rus = rus.fit_sample(X_smo, y_smo) #apply random under-sampling to the data and labels\n", 193 | "print(sorted(Counter(y_rus).items()))\n", 194 | "\n", 195 | "time_end = time.time()\n", 196 | "time = time_end - time_start\n", 197 | "print(\"time:\",time)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 8, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "label_end = y_rus.reshape(y_rus.shape[0],1)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 9, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "#Save the final resampled data to files\n", 216 | "np.save(\"E:/IDS/alldata/12/RUS+SMOTE/data2.npy\",X_rus)\n", 217 | "np.save(\"E:/IDS/alldata/12/RUS+SMOTE/label2_10.npy\",label_end)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.7.4" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 2 249 | } 250 | -------------------------------------------------------------------------------- /imbalance processing/K-means + SMOTE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from collections import Counter\n", 10
| "import os\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import tensorflow as tf\n", 14 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 15 | "\n", 16 | "data = np.load('/home/hll/IDS/2020/data/select/zuizhong/data_train.npy')\n", 17 | "label= np.load('/home/hll/IDS/2020/data/select/zuizhong/label6_train.npy')" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# SMOTE all minority classes to (number of data set samples / classes)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "X=np.array(data) \n", 34 | "b=np.array(label)\n", 35 | "bb=b.reshape(b.shape[0],) \n", 36 | "y10 = np.int32(bb) " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "(3167779, 12)" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "X.shape" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "[(0, 2443895), (1, 200334), (2, 359), (3, 458010), (4, 65144), (5, 37)]" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "sorted(Counter(y10).items())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stderr", 86 | "output_type": "stream", 87 | "text": [ 88 | "Using TensorFlow backend.\n" 89 | ] 90 | }, 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "[(0, 2443895), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]\n", 96 | "time: 2295.455605983734\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "from imblearn.over_sampling import SMOTE\n", 102 | "import time\n", 103 | "time_start = time.time()\n", 104 | "\n", 105 | "guo = 527963 #Oversampling samples\n", 106 | "\n", 107 | "smo = SMOTE(ratio={1:guo,2:guo,3:guo,4:guo,5:guo},random_state=42)\n", 108 | "\n", 109 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 110 | "print(sorted(Counter(y_smo).items()))\n", 111 | "\n", 112 | "time_end = time.time()\n", 113 | "time = time_end - time_start\n", 114 | "print(\"time:\",time)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "5083710" 126 | ] 127 | }, 128 | "execution_count": 6, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "X_smo.shape[0]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "# Extract Majority class of data" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "list0 = [] #data-0\n", 151 | "list1 = [] #other classes data\n", 152 | "list2 = [] #other classes label\n", 153 | "\n", 154 | "\n", 155 | "for i in range(X_smo.shape[0]):\n", 156 | " if y_smo[i] == 0:\n", 157 | " list0.append(X_smo[i])\n", 158 | " else:\n", 159 | " list1.append(X_smo[i])\n", 160 | " list2.append(y_smo[i])" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": 
"stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Normal class data shape: (2443895, 12)\n", 173 | "1-5 class data shape: (2639815, 12)\n", 174 | "1-5 class data shape: (2639815,)\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "data0 = np.array(list0) \n", 180 | "data1 = np.array(list1)\n", 181 | "label2 = np.array(list2)\n", 182 | "# label22 = label2.reshape(label2.shape[0],) \n", 183 | "\n", 184 | "print(\"Normal class data shape:\",data0.shape)\n", 185 | "print(\"1-5 class data shape:\",data1.shape) \n", 186 | "print(\"1-5 class data shape:\",label2.shape)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "# # Cluster majority data into C (total number of classes)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 9, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "time: 22.22838044166565\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "from sklearn.cluster import KMeans\n", 211 | "import time\n", 212 | "time_start = time.time()\n", 213 | "\n", 214 | "estimator = KMeans(n_clusters=6)\n", 215 | "estimator.fit(data0) \n", 216 | "\n", 217 | "time_end = time.time()\n", 218 | "time = time_end - time_start\n", 219 | "print(\"time:\",time)\n", 220 | "\n", 221 | "label_pred_0 = estimator.labels_ " 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "[(0, 687683), (1, 258667), (2, 718137), (3, 110457), (4, 117957), (5, 550994)]" 233 | ] 234 | }, 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "sorted(Counter(label_pred_0).items())" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 12, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "label_pred = label_pred_0" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "# # Select a certain amount of data from each cluster to form a new majority data" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 13, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "c0 = []\n", 267 | "c1 = []\n", 268 | "c2 = []\n", 269 | "c3 = []\n", 270 | "c4 = []\n", 271 | "c5 = []\n", 272 | "\n", 273 | "s0=s1=s2=s3=s4=s5=0\n", 274 | "\n", 275 | "for i in range(data0.shape[0]):\n", 276 | " if label_pred[i] == 0:\n", 277 | " c0.append(data0[i])\n", 278 | " s0=s0+1\n", 279 | " elif label_pred[i] == 1:\n", 280 | " c1.append(data0[i])\n", 281 | " s1=s1+1\n", 282 | " elif label_pred[i] == 2:\n", 283 | " c2.append(data0[i])\n", 284 | " s2=s2+1\n", 285 | " elif label_pred[i] == 3:\n", 286 | " c3.append(data0[i])\n", 287 | " s3=s3+1\n", 288 | " elif label_pred[i] == 4:\n", 289 | " c4.append(data0[i])\n", 290 | " s4=s4+1\n", 291 | " elif label_pred[i] == 5:\n", 292 | " c5.append(data0[i])\n", 293 | " s5=s5+1\n" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 14, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "a=87993" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 15, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "del c0[a:len(c0)]\n", 312 | "del c1[a:len(c1)]\n", 313 | "del c2[a:len(c2)]\n", 314 | "del c3[a:len(c3)]\n", 315 | "del c4[a:len(c4)]\n", 316 | "del c5[a:len(c5)]" 
317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 16, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "c00 = np.array(c0)\n", 333 | "c11 = np.array(c1)\n", 334 | "c22 = np.array(c2)\n", 335 | "c33 = np.array(c3)\n", 336 | "c44 = np.array(c4)\n", 337 | "c55 = np.array(c5)\n" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 17, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "(527958, 12)" 349 | ] 350 | }, 351 | "execution_count": 17, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "q0 = np.concatenate((c00,c11,c22,c33,c44,c55),axis=0)\n", 358 | "q0.shape" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 18, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "(527958,)" 370 | ] 371 | }, 372 | "execution_count": 18, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "label_zc = np.zeros((q0.shape[0],), dtype=int)\n", 379 | "label_zc.shape" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 19, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "data_end = np.concatenate((q0,data1),axis=0)\n", 389 | "label_end = np.concatenate((label_zc,label2),axis=0)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 21, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "[(0, 527958), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]" 401 | ] 402 | }, 403 | "execution_count": 21, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "sorted(Counter(label_end).items())" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 22, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "label_end = label_end.reshape(label_end.shape[0],1)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 23, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "#save dataset\n", 428 | "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/K-means+SMOTE_data_train.npy\",data_end)\n", 429 | "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/K-means+SMOTE_label_train.npy\",label_end)" 430 | ] 431 | } 432 | ], 433 | "metadata": { 434 | "kernelspec": { 435 | "display_name": "Python 3", 436 | "language": "python", 437 | "name": "python3" 438 | }, 439 | "language_info": { 440 | "codemirror_mode": { 441 | "name": "ipython", 442 | "version": 3 443 | }, 444 | "file_extension": ".py", 445 | "mimetype": "text/x-python", 446 | "name": "python", 447 | "nbconvert_exporter": "python", 448 | "pygments_lexer": "ipython3", 449 | "version": "3.6.9" 450 | } 451 | }, 452 | "nbformat": 4, 453 | "nbformat_minor": 2 454 | } 455 | --------------------------------------------------------------------------------
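The per-cluster selection in the notebook above builds six Python lists by hand and truncates each with `del`. A compact NumPy sketch of the same "cluster the majority class, then keep an equal share of each cluster" undersampling follows; it assumes `data0` holds the majority samples and reuses the notebook's cluster count (6) and per-cluster quota (87,993).

```python
# Hedged sketch of K-means-based undersampling of the majority class.
import numpy as np
from sklearn.cluster import KMeans

def cluster_undersample(majority, n_clusters=6, per_cluster=87993, seed=42):
    """Keep at most per_cluster majority rows from each K-means cluster."""
    labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(majority)
    # Like the notebook's del c[a:], keep the first per_cluster samples per cluster;
    # clusters smaller than the quota are kept whole.
    kept = [np.where(labels == k)[0][:per_cluster] for k in range(n_clusters)]
    return majority[np.concatenate(kept)]

# new_majority = cluster_undersample(data0)   # ~(527958, 12) in the notebook's run
```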
| "import tensorflow as tf\n", 14 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 15 | "\n", 16 | "data = np.load('/home/hll/IDS/2020/data/select/zuizhong/data_train.npy')\n", 17 | "label= np.load('/home/hll/IDS/2020/data/select/zuizhong/label6_train.npy')" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# SMOTE all minority classes to (number of data set samples / classes)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "X=np.array(data) \n", 34 | "b=np.array(label)\n", 35 | "bb=b.reshape(b.shape[0],) \n", 36 | "y10 = np.int32(bb) " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "(3167779, 12)" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "X.shape" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "[(0, 2443895), (1, 200334), (2, 359), (3, 458010), (4, 65144), (5, 37)]" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "sorted(Counter(y10).items())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stderr", 86 | "output_type": "stream", 87 | "text": [ 88 | "Using TensorFlow backend.\n" 89 | ] 90 | }, 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "[(0, 2443895), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]\n", 96 | "time: 2295.455605983734\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "from imblearn.over_sampling import SMOTE\n", 102 | "import time\n", 103 | "time_start = time.time()\n", 104 | "\n", 105 | "guo = 527963 #Oversampling samples\n", 106 | "\n", 107 | "smo = SMOTE(ratio={1:guo,2:guo,3:guo,4:guo,5:guo},random_state=42)\n", 108 | "\n", 109 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 110 | "print(sorted(Counter(y_smo).items()))\n", 111 | "\n", 112 | "time_end = time.time()\n", 113 | "time = time_end - time_start\n", 114 | "print(\"time:\",time)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "5083710" 126 | ] 127 | }, 128 | "execution_count": 6, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "X_smo.shape[0]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "# Extract Majority class of data" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "list0 = [] #data-0\n", 151 | "list1 = [] #other classes data\n", 152 | "list2 = [] #other classes label\n", 153 | "\n", 154 | "\n", 155 | "for i in range(X_smo.shape[0]):\n", 156 | " if y_smo[i] == 0:\n", 157 | " list0.append(X_smo[i])\n", 158 | " else:\n", 159 | " list1.append(X_smo[i])\n", 160 | " list2.append(y_smo[i])" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Normal class 
data shape: (2443895, 12)\n", 173 | "1-5 class data shape: (2639815, 12)\n", 174 | "1-5 class data shape: (2639815,)\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "data0 = np.array(list0) \n", 180 | "data1 = np.array(list1)\n", 181 | "label2 = np.array(list2)\n", 182 | "# label22 = label2.reshape(label2.shape[0],) \n", 183 | "\n", 184 | "print(\"Normal class data shape:\",data0.shape)\n", 185 | "print(\"1-5 class data shape:\",data1.shape) \n", 186 | "print(\"1-5 class data shape:\",label2.shape)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "# # Cluster majority data into C (total number of classes)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 9, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "time: 22.22838044166565\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "from sklearn.cluster import KMeans\n", 211 | "import time\n", 212 | "time_start = time.time()\n", 213 | "\n", 214 | "estimator = KMeans(n_clusters=6)\n", 215 | "estimator.fit(data0) \n", 216 | "\n", 217 | "time_end = time.time()\n", 218 | "time = time_end - time_start\n", 219 | "print(\"time:\",time)\n", 220 | "\n", 221 | "label_pred_0 = estimator.labels_ " 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "[(0, 687683), (1, 258667), (2, 718137), (3, 110457), (4, 117957), (5, 550994)]" 233 | ] 234 | }, 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "sorted(Counter(label_pred_0).items())" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 12, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "label_pred = label_pred_0" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "# # Select a certain amount of data from each cluster to form a new majority data" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 13, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "c0 = []\n", 267 | "c1 = []\n", 268 | "c2 = []\n", 269 | "c3 = []\n", 270 | "c4 = []\n", 271 | "c5 = []\n", 272 | "\n", 273 | "s0=s1=s2=s3=s4=s5=0\n", 274 | "\n", 275 | "for i in range(data0.shape[0]):\n", 276 | " if label_pred[i] == 0:\n", 277 | " c0.append(data0[i])\n", 278 | " s0=s0+1\n", 279 | " elif label_pred[i] == 1:\n", 280 | " c1.append(data0[i])\n", 281 | " s1=s1+1\n", 282 | " elif label_pred[i] == 2:\n", 283 | " c2.append(data0[i])\n", 284 | " s2=s2+1\n", 285 | " elif label_pred[i] == 3:\n", 286 | " c3.append(data0[i])\n", 287 | " s3=s3+1\n", 288 | " elif label_pred[i] == 4:\n", 289 | " c4.append(data0[i])\n", 290 | " s4=s4+1\n", 291 | " elif label_pred[i] == 5:\n", 292 | " c5.append(data0[i])\n", 293 | " s5=s5+1\n" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 14, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "a=87993" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 15, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "del c0[a:len(c0)]\n", 312 | "del c1[a:len(c1)]\n", 313 | "del c2[a:len(c2)]\n", 314 | "del c3[a:len(c3)]\n", 315 | "del c4[a:len(c4)]\n", 316 | "del c5[a:len(c5)]" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 
/imbalance processing/SGM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SMOTE all minority classes to (number of data set samples / classes)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from collections import Counter\n", 17 | "import
os\n", 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import tensorflow as tf\n", 21 | "\n", 22 | "data = np.load('/home/hll/IDS/2020/data/select/zuizhong/data_train.npy')\n", 23 | "label= np.load('/home/hll/IDS/2020/data/select/zuizhong/label6_train.npy')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "X=np.array(data) \n", 33 | "b=np.array(label)\n", 34 | "bb=b.reshape(b.shape[0],) \n", 35 | "y10 = np.int32(bb) " 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "(3167779, 12)" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "X.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "[(0, 2443895), (1, 200334), (2, 359), (3, 458010), (4, 65144), (5, 37)]" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "sorted(Counter(y10).items())" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "Using TensorFlow backend.\n" 88 | ] 89 | }, 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "[(0, 2443895), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]\n", 95 | "time: 1978.3734097480774\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "from imblearn.over_sampling import SMOTE\n", 101 | "import time\n", 102 | "time_start = time.time()\n", 103 | "a = 527963\n", 104 | "\n", 105 | "smo = SMOTE(ratio={1:a,2:a,3:a,4:a,5:a},random_state=42) \n", 106 | "\n", 107 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 108 | "print(sorted(Counter(y_smo).items()))\n", 109 | "\n", 110 | "time_end = time.time()\n", 111 | "time = time_end - time_start\n", 112 | "print(\"time:\",time)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "5083710" 124 | ] 125 | }, 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "X_smo.shape[0]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Extract Majority class of data" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 7, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "list0 = [] \n", 149 | "list1 = [] \n", 150 | "list2 = [] \n", 151 | "\n", 152 | "for i in range(X_smo.shape[0]):\n", 153 | " if y_smo[i] == 0:\n", 154 | " list0.append(X_smo[i])\n", 155 | " else:\n", 156 | " list1.append(X_smo[i])\n", 157 | " list2.append(y_smo[i])\n", 158 | " \n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Normal class data shape: (2443895, 12)\n", 171 | "Attack class data shape: (2639815, 12)\n", 172 | "Attack class label shape: (2639815,)\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "data0 = np.array(list0) \n", 178 | "data1 = np.array(list1)\n", 179 
| "label1 = np.array(list2)\n", 180 | "\n", 181 | "label11 = label1.reshape(label1.shape[0],) \n", 182 | "\n", 183 | "print(\"Normal class data shape:\",data0.shape)\n", 184 | "print(\"Attack class data shape:\",data1.shape) \n", 185 | "print(\"Attack class label shape:\",label11.shape)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "array([3, 1, 3, ..., 5, 5, 5], dtype=int32)" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "label11" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "# Cluster majority data into C (total number of classes)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 10, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "time: 173.31280207633972\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "from sklearn.mixture import GaussianMixture\n", 230 | "import time\n", 231 | "time_start = time.time()\n", 232 | "\n", 233 | "estimator = GaussianMixture(n_components=6)\n", 234 | "estimator.fit(data0) \n", 235 | "\n", 236 | "time_end = time.time()\n", 237 | "time = time_end - time_start\n", 238 | "print(\"time:\",time)\n", 239 | "\n", 240 | "label_pred = estimator.predict(data0) " 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "[(0, 68087), (1, 762293), (2, 701472), (3, 110673), (4, 176277), (5, 625093)]" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "sorted(Counter(label_pred).items())" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "# Select a certain amount of data from each cluster to form a new majority data" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "c0 = []\n", 277 | "c1 = []\n", 278 | "c2 = []\n", 279 | "c3 = []\n", 280 | "c4 = []\n", 281 | "c5 = []\n", 282 | "\n", 283 | "\n", 284 | "\n", 285 | "s0=s1=s2=s3=s4=s5=0\n", 286 | "\n", 287 | "for i in range(data0.shape[0]):\n", 288 | " if label_pred[i] == 0:\n", 289 | " c0.append(data0[i])\n", 290 | " s0=s0+1\n", 291 | " elif label_pred[i] == 1:\n", 292 | " c1.append(data0[i])\n", 293 | " s1=s1+1\n", 294 | " elif label_pred[i] == 2:\n", 295 | " c2.append(data0[i])\n", 296 | " s2=s2+1\n", 297 | " elif label_pred[i] == 3:\n", 298 | " c3.append(data0[i])\n", 299 | " s3=s3+1\n", 300 | " elif label_pred[i] == 4:\n", 301 | " c4.append(data0[i])\n", 302 | " s4=s4+1\n", 303 | " elif label_pred[i] == 5:\n", 304 | " c5.append(data0[i])\n", 305 | " s5=s5+1\n", 306 | "\n", 307 | " \n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 13, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "a = 91975" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 15, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# del c1[a:len(c1)]\n", 326 | "del c2[a:len(c2)]\n", 327 | "del c3[a:len(c3)]\n", 328 | "del c4[a:len(c4)]\n", 329 | "del c5[a:len(c5)]\n" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 
334 | "execution_count": 16, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "c00 = np.array(c0)\n", 339 | "c11 = np.array(c1)\n", 340 | "c22 = np.array(c2)\n", 341 | "c33 = np.array(c3)\n", 342 | "c44 = np.array(c4)\n", 343 | "c55 = np.array(c5)\n" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 17, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "(527962, 12)" 355 | ] 356 | }, 357 | "execution_count": 17, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "q = np.concatenate((c00,c11,c22,c33,c44,c55),axis=0)\n", 364 | "q.shape" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 18, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "(527962,)" 376 | ] 377 | }, 378 | "execution_count": 18, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "label_zc = np.zeros((q.shape[0],), dtype=int)\n", 385 | "label_zc.shape" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 19, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "data_end = np.concatenate((q,data1),axis=0)\n", 395 | "label_end = np.concatenate((label_zc,label1),axis=0)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 20, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "[(0, 527962), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]" 407 | ] 408 | }, 409 | "execution_count": 20, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "sorted(Counter(label_end).items())" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 21, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "label_end = label_end.reshape(label_end.shape[0],1)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 22, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/SGM_data_train.npy\",data_end)\n", 434 | "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/SGM_label6_train.npy\",label_end)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [] 443 | } 444 | ], 445 | "metadata": { 446 | "kernelspec": { 447 | "display_name": "Python 3", 448 | "language": "python", 449 | "name": "python3" 450 | }, 451 | "language_info": { 452 | "codemirror_mode": { 453 | "name": "ipython", 454 | "version": 3 455 | }, 456 | "file_extension": ".py", 457 | "mimetype": "text/x-python", 458 | "name": "python", 459 | "nbconvert_exporter": "python", 460 | "pygments_lexer": "ipython3", 461 | "version": "3.6.9" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 2 466 | } 467 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/SGM-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SMOTE all minority classes to (number of data set samples / classes)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from collections import Counter\n", 17 | "import os\n", 18 | 
"import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import tensorflow as tf\n", 21 | "\n", 22 | "data = np.load('/home/hll/IDS/2020/data/select/zuizhong/data_train.npy')\n", 23 | "label= np.load('/home/hll/IDS/2020/data/select/zuizhong/label6_train.npy')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "X=np.array(data) \n", 33 | "b=np.array(label)\n", 34 | "bb=b.reshape(b.shape[0],) \n", 35 | "y10 = np.int32(bb) " 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "(3167779, 12)" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "X.shape" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "[(0, 2443895), (1, 200334), (2, 359), (3, 458010), (4, 65144), (5, 37)]" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "sorted(Counter(y10).items())" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "Using TensorFlow backend.\n" 88 | ] 89 | }, 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "[(0, 2443895), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]\n", 95 | "time: 1978.3734097480774\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "from imblearn.over_sampling import SMOTE\n", 101 | "import time\n", 102 | "time_start = time.time()\n", 103 | "a = 527963\n", 104 | "\n", 105 | "smo = SMOTE(ratio={1:a,2:a,3:a,4:a,5:a},random_state=42) \n", 106 | "\n", 107 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 108 | "print(sorted(Counter(y_smo).items()))\n", 109 | "\n", 110 | "time_end = time.time()\n", 111 | "time = time_end - time_start\n", 112 | "print(\"time:\",time)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "5083710" 124 | ] 125 | }, 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "X_smo.shape[0]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Extract Majority class of data" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 7, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "list0 = [] \n", 149 | "list1 = [] \n", 150 | "list2 = [] \n", 151 | "\n", 152 | "for i in range(X_smo.shape[0]):\n", 153 | " if y_smo[i] == 0:\n", 154 | " list0.append(X_smo[i])\n", 155 | " else:\n", 156 | " list1.append(X_smo[i])\n", 157 | " list2.append(y_smo[i])\n", 158 | " \n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Normal class data shape: (2443895, 12)\n", 171 | "Attack class data shape: (2639815, 12)\n", 172 | "Attack class label shape: (2639815,)\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "data0 = np.array(list0) \n", 178 | "data1 = np.array(list1)\n", 179 | "label1 = 
np.array(list2)\n", 180 | "\n", 181 | "label11 = label1.reshape(label1.shape[0],) \n", 182 | "\n", 183 | "print(\"Normal class data shape:\",data0.shape)\n", 184 | "print(\"Attack class data shape:\",data1.shape) \n", 185 | "print(\"Attack class label shape:\",label11.shape)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "array([3, 1, 3, ..., 5, 5, 5], dtype=int32)" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "label11" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "# Cluster majority data into C (total number of classes)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 10, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "time: 173.31280207633972\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "from sklearn.mixture import GaussianMixture\n", 230 | "import time\n", 231 | "time_start = time.time()\n", 232 | "\n", 233 | "estimator = GaussianMixture(n_components=6) #one component per class (C = 6)\n", 234 | "estimator.fit(data0) \n", 235 | "\n", 236 | "time_end = time.time()\n", 237 | "elapsed = time_end - time_start #renamed to avoid shadowing the time module\n", 238 | "print(\"time:\",elapsed)\n", 239 | "\n", 240 | "label_pred = estimator.predict(data0) " 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "[(0, 68087), (1, 762293), (2, 701472), (3, 110673), (4, 176277), (5, 625093)]" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "sorted(Counter(label_pred).items())" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "# Select a certain amount of data from each cluster to form a new majority data" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "c0 = []\n", 277 | "c1 = []\n", 278 | "c2 = []\n", 279 | "c3 = []\n", 280 | "c4 = []\n", 281 | "c5 = []\n", 282 | "\n", 283 | "\n", 284 | "\n", 285 | "s0=s1=s2=s3=s4=s5=0\n", 286 | "\n", 287 | "for i in range(data0.shape[0]):\n", 288 | " if label_pred[i] == 0:\n", 289 | " c0.append(data0[i])\n", 290 | " s0=s0+1\n", 291 | " elif label_pred[i] == 1:\n", 292 | " c1.append(data0[i])\n", 293 | " s1=s1+1\n", 294 | " elif label_pred[i] == 2:\n", 295 | " c2.append(data0[i])\n", 296 | " s2=s2+1\n", 297 | " elif label_pred[i] == 3:\n", 298 | " c3.append(data0[i])\n", 299 | " s3=s3+1\n", 300 | " elif label_pred[i] == 4:\n", 301 | " c4.append(data0[i])\n", 302 | " s4=s4+1\n", 303 | " elif label_pred[i] == 5:\n", 304 | " c5.append(data0[i])\n", 305 | " s5=s5+1\n", 306 | "\n", 307 | " \n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 13, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "a = 91975 #per-cluster quota for the trimmed majority class" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 15, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# del c1[a:len(c1)]\n", 326 | "del c2[a:len(c2)]\n", 327 | "del c3[a:len(c3)]\n", 328 | "del c4[a:len(c4)]\n", 329 | "del c5[a:len(c5)]\n" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | 
"execution_count": 16, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "c00 = np.array(c0)\n", 339 | "c11 = np.array(c1)\n", 340 | "c22 = np.array(c2)\n", 341 | "c33 = np.array(c3)\n", 342 | "c44 = np.array(c4)\n", 343 | "c55 = np.array(c5)\n" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 17, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "(527962, 12)" 355 | ] 356 | }, 357 | "execution_count": 17, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "q = np.concatenate((c00,c11,c22,c33,c44,c55),axis=0)\n", 364 | "q.shape" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 18, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "(527962,)" 376 | ] 377 | }, 378 | "execution_count": 18, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "label_zc = np.zeros((q.shape[0],), dtype=int)\n", 385 | "label_zc.shape" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 19, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "data_end = np.concatenate((q,data1),axis=0)\n", 395 | "label_end = np.concatenate((label_zc,label1),axis=0)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 20, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "[(0, 527962), (1, 527963), (2, 527963), (3, 527963), (4, 527963), (5, 527963)]" 407 | ] 408 | }, 409 | "execution_count": 20, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "sorted(Counter(label_end).items())" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 21, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "label_end = label_end.reshape(label_end.shape[0],1)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 22, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/SGM_data_train.npy\",data_end)\n", 434 | "np.save(\"/home/hll/IDS/2020/data/select/zuizhong/SGM_label6_train.npy\",label_end)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [] 443 | } 444 | ], 445 | "metadata": { 446 | "kernelspec": { 447 | "display_name": "Python 3", 448 | "language": "python", 449 | "name": "python3" 450 | }, 451 | "language_info": { 452 | "codemirror_mode": { 453 | "name": "ipython", 454 | "version": 3 455 | }, 456 | "file_extension": ".py", 457 | "mimetype": "text/x-python", 458 | "name": "python", 459 | "nbconvert_exporter": "python", 460 | "pygments_lexer": "ipython3", 461 | "version": "3.6.9" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 2 466 | } 467 | -------------------------------------------------------------------------------- /classification decision/CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": 
{}, 19 | "source": [ 20 | "# load dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "x_train = np.load('E:/IDS/alldata/12/RUS+SMOTE/data2.npy')\n", 30 | "y_train = np.load('E:/IDS/alldata/12/RUS+SMOTE/label2_10.npy')\n", 31 | "\n", 32 | "x_test = np.load('E:/IDS/alldata/12/test/data.npy')\n", 33 | "y_test = np.load('E:/IDS/alldata/12/test/label_10.npy')\n", 34 | "\n", 35 | "x_val = np.load('E:/IDS/alldata/12/val/data.npy')\n", 36 | "y_val = np.load('E:/IDS/alldata/12/val/label_10.npy')\n", 37 | "\n", 38 | "\n", 39 | "print(x_train.shape,y_train.shape,x_test.shape,y_test.shape,x_val.shape,y_val.shape)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "x_train = np.expand_dims(x_train,2)\n", 49 | "x_test = np.expand_dims(x_test,2)\n", 50 | "x_val = np.expand_dims(x_val,2)\n", 51 | "# label one-hot\n", 52 | "from keras.utils import to_categorical\n", 53 | "y_train = to_categorical(y_train)\n", 54 | "y_test = to_categorical(y_test)\n", 55 | "y_val = to_categorical(y_val)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "y_train.shape" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# train model" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "from keras.layers import Conv1D,MaxPooling1D,Dense,Dropout,Input,Flatten,GlobalAveragePooling1D\n", 81 | "from keras.layers.normalization import BatchNormalization\n", 82 | "from keras.models import Model\n", 83 | "from keras.optimizers import Nadam\n", 84 | "from keras import initializers\n", 85 | "from keras import regularizers\n", 86 | "np.random.seed(4)\n", 87 | "import pickle\n", 88 | "import math\n", 89 | "from keras.callbacks import LearningRateScheduler, ModelCheckpoint\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "n_obs,feature, depth = x_train.shape\n", 99 | "batch_size = 256\n", 100 | "def build_model():\n", 101 | " input_singal = Input(shape=(feature,depth))\n", 102 | " x = Conv1D(32,3,activation='relu',padding='same',kernel_initializer='he_uniform')(input_singal)\n", 103 | " x = Conv1D(32,3,activation='relu',padding='same',kernel_initializer='he_uniform')(x)\n", 104 | " x = MaxPooling1D(pool_size=2,strides=2)(x)\n", 105 | " x = Dropout(0.2)(x)\n", 106 | " x = BatchNormalization()(x)\n", 107 | " \n", 108 | " x = Conv1D(64,3,activation='relu',padding='same',kernel_initializer='he_uniform')(x)\n", 109 | " x = Conv1D(64,3,activation='relu',padding='same',kernel_initializer='he_uniform')(x) \n", 110 | " x = MaxPooling1D(pool_size=2,strides=2)(x)\n", 111 | " x = Dropout(0.2)(x)\n", 112 | " x = BatchNormalization()(x) \n", 113 | " \n", 114 | " x = Flatten()(x)\n", 115 | " x = Dense(32,activation='relu')(x)\n", 116 | " x = Dense(10,activation='softmax')(x) #UNSW-NB15 is 2 and 10,CICIDS2017 is 15\n", 117 | " model = Model(inputs=input_singal,outputs=x)\n", 118 | " \n", 119 | " return model" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "model = build_model()\n", 129 | "model.summary()" 130 | ] 131 | }, 132 | { 133 | "cell_type": 
"code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import keras\n", 139 | "import time\n", 140 | "time_start = time.time()\n", 141 | "\n", 142 | "reduce_lr = keras.callbacks.ReduceLROnPlateau(moniter='val_loss',\n", 143 | " factor=0.1,\n", 144 | " patience=10)\n", 145 | "nadam = Nadam(lr=0.008, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)\n", 146 | "model.compile(loss = \"categorical_crossentropy\",optimizer = \"nadam\", metrics = [\"accuracy\"])\n", 147 | "\n", 148 | "history = model.fit(x_train, y_train, \n", 149 | " epochs=100, \n", 150 | " batch_size=batch_size, \n", 151 | " verbose=2,\n", 152 | " validation_data=(x_val, y_val),\n", 153 | " callbacks=[reduce_lr])\n", 154 | "time_end = time.time()\n", 155 | "train_time = time_end - time_start\n", 156 | "print(\"train_time:\",train_time)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "scores = model.evaluate(x_test, y_test)\n", 166 | "print(\"test_loss = \", scores[0],\"test_accuracy = \", scores[1])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "model.save('E:/IDS/alldata/12/RUS+SMOTE/CNN_RUS_10(2ci).h5')#save model" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "# test model" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import time\n", 192 | "time_start = time.time()\n", 193 | "\n", 194 | "y_pred_onehot = model.predict(x_test) #返回的是在类别上的概率分布.It returns the probability distribution on the category\n", 195 | "y_pred_label=np.argmax(y_pred_onehot,axis=1)#概率最大的类别就是预测类别.The category with the highest probability is the prediction category\n", 196 | "\n", 197 | "time_end = time.time()\n", 198 | "test_time = time_end - time_start\n", 199 | "print(\"test_time:\",test_time)\n", 200 | "\n", 201 | "# np.savetxt(\"E:/IDS/cicdata/GMM+SMOTE_77/2ci/CNN_pred_15.txt\",y_pred_label) \n" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "y_true_onehot=y_test\n", 211 | "y_true_label=np.argmax(y_true_onehot,axis=1)\n", 212 | "# np.savetxt(\"E:/IDS/cicdata/GMM+SMOTE_77/2ci/CNN_true_15.txt\",y_true_label) " 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# -*-coding:utf-8-*-\n", 222 | "from sklearn.metrics import confusion_matrix\n", 223 | "import matplotlib.pyplot as plt\n", 224 | "import numpy as np\n", 225 | "\n", 226 | "labels = ['Normal','Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Reconnaissance','Shellcode','Worms'] #class name\n", 227 | "\n", 228 | "y_true = y_true_label\n", 229 | "y_pred = y_pred_label\n", 230 | "\n", 231 | "tick_marks = np.array(range(len(labels))) + 0.5\n", 232 | "\n", 233 | "def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.binary):\n", 234 | " plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) \n", 235 | " plt.title(title)\n", 236 | " plt.colorbar()\n", 237 | " xlocations = np.array(range(len(labels)))\n", 238 | " plt.xticks(xlocations, labels, rotation=90)\n", 239 | " plt.yticks(xlocations, labels)\n", 240 | " plt.ylabel('True label')\n", 241 | " 
plt.xlabel('Predicted label')\n", 242 | "\n", 243 | "\n", 244 | "cm = confusion_matrix(y_true, y_pred) \n", 245 | "np.set_printoptions(precision=2) \n", 246 | "cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] \n", 247 | "\n", 248 | "plt.figure(figsize=(15, 13), dpi=120)\n", 249 | "\n", 250 | "ind_array = np.arange(len(labels))\n", 251 | "x, y = np.meshgrid(ind_array, ind_array)\n", 252 | "\n", 253 | "for x_val, y_val in zip(x.flatten(), y.flatten()):\n", 254 | " c = cm_normalized[y_val][x_val]\n", 255 | " if c > 0.001:\n", 256 | " plt.text(x_val, y_val, \"%0.2f\" % (c,), color='red', fontsize=13, va='center', ha='center')\n", 257 | "# offset the tick\n", 258 | "plt.gca().set_xticks(tick_marks, minor=True)\n", 259 | "plt.gca().set_yticks(tick_marks, minor=True)\n", 260 | "plt.gca().xaxis.set_ticks_position('none')\n", 261 | "plt.gca().yaxis.set_ticks_position('none')\n", 262 | "plt.grid(True, which='minor', linestyle='-')\n", 263 | "plt.gcf().subplots_adjust(bottom=0.15)\n", 264 | "\n", 265 | "plot_confusion_matrix(cm_normalized, title='MLP_12_10_ROS Normalized confusion matrix') \n", 266 | "#plt.savefig('/home/hll/IDS/alldata/cm/confusion_matrix.png', format='png') \n", 267 | "plt.show()\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "print(cm) #Confusion matrix" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# multi-class evaluation indicators" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from sklearn import metrics\n", 293 | "from sklearn.metrics import classification_report\n", 294 | "\n", 295 | "target_names = ['Normal','Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Reconnaissance','Shellcode','Worms']\n", 296 | "print(classification_report(y_true,y_pred,target_names=target_names))\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "acc = metrics.accuracy_score(y_true,y_pred) \n", 306 | "f1 = metrics.f1_score(y_true, y_pred,average='weighted')\n", 307 | "pre = metrics.precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted') #DR\n", 308 | "recall = metrics.recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted', sample_weight=None)\n", 309 | "\n", 310 | "print(\"acc:\",acc)\n", 311 | "print(\"pre:\",pre)\n", 312 | "print(\"DR=recall:\",recall)\n", 313 | "print(\"f1:\",f1)\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "# # binary-class evaluation indicators" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "TP=cm[1,1]\n", 330 | "FP=cm[0,1]\n", 331 | "FN=cm[1,0]\n", 332 | "TN=cm[0,0]\n", 333 | "\n", 334 | "acc = (TP+TN)/(TP+TN+FP+FN)\n", 335 | "print(\"acc:\",acc)\n", 336 | "\n", 337 | "DR = TP/(TP+FN) \n", 338 | "print(\"DR:\",DR)\n", 339 | "\n", 340 | "FPR = FP/(FP+TN) #FAR\n", 341 | "print(\"FPR:\",FPR)\n", 342 | "\n", 343 | "recall =TP/(TP+FN)\n", 344 | "print(\"recall:\",recall) \n", 345 | "\n", 346 | "precision = TP/(TP+FP)\n", 347 | "print(\"precision:\",precision)\n", 348 | "\n", 349 | "f1 = (2*precision*recall)/(precision+recall)\n", 350 | "print(\"f1:\",f1)" 351 | ] 352 | } 353 | ], 354 | 
"metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.6.9" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/CNN-10-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# load dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "x_train = np.load('E:/IDS/alldata/12/RUS+SMOTE/data2.npy')\n", 30 | "y_train = np.load('E:/IDS/alldata/12/RUS+SMOTE/label2_10.npy')\n", 31 | "\n", 32 | "x_test = np.load('E:/IDS/alldata/12/test/data.npy')\n", 33 | "y_test = np.load('E:/IDS/alldata/12/test/label_10.npy')\n", 34 | "\n", 35 | "x_val = np.load('E:/IDS/alldata/12/val/data.npy')\n", 36 | "y_val = np.load('E:/IDS/alldata/12/val/label_10.npy')\n", 37 | "\n", 38 | "\n", 39 | "print(x_train.shape,y_train.shape,x_test.shape,y_test.shape,x_val.shape,y_val.shape)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "x_train = np.expand_dims(x_train,2)\n", 49 | "x_test = np.expand_dims(x_test,2)\n", 50 | "x_val = np.expand_dims(x_val,2)\n", 51 | "# label one-hot\n", 52 | "from keras.utils import to_categorical\n", 53 | "y_train = to_categorical(y_train)\n", 54 | "y_test = to_categorical(y_test)\n", 55 | "y_val = to_categorical(y_val)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "y_train.shape" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# train model" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "from keras.layers import Conv1D,MaxPooling1D,Dense,Dropout,Input,Flatten,GlobalAveragePooling1D\n", 81 | "from keras.layers.normalization import BatchNormalization\n", 82 | "from keras.models import Model\n", 83 | "from keras.optimizers import Nadam\n", 84 | "from keras import initializers\n", 85 | "from keras import regularizers\n", 86 | "np.random.seed(4)\n", 87 | "import pickle\n", 88 | "import math\n", 89 | "from keras.callbacks import LearningRateScheduler, ModelCheckpoint\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "n_obs,feature, depth = x_train.shape\n", 99 | "batch_size = 256\n", 100 | "def build_model():\n", 101 | " input_singal = Input(shape=(feature,depth))\n", 102 | " x = 
Conv1D(32,3,activation='relu',padding='same',kernel_initializer='he_uniform')(input_singal)\n", 103 | " x = Conv1D(32,3,activation='relu',padding='same',kernel_initializer='he_uniform')(x)\n", 104 | " x = MaxPooling1D(pool_size=2,strides=2)(x)\n", 105 | " x = Dropout(0.2)(x)\n", 106 | " x = BatchNormalization()(x)\n", 107 | " \n", 108 | " x = Conv1D(64,3,activation='relu',padding='same',kernel_initializer='he_uniform')(x)\n", 109 | " x = Conv1D(64,3,activation='relu',padding='same',kernel_initializer='he_uniform')(x) \n", 110 | " x = MaxPooling1D(pool_size=2,strides=2)(x)\n", 111 | " x = Dropout(0.2)(x)\n", 112 | " x = BatchNormalization()(x) \n", 113 | " \n", 114 | " x = Flatten()(x)\n", 115 | " x = Dense(32,activation='relu')(x)\n", 116 | " x = Dense(10,activation='softmax')(x) #UNSW-NB15 is 2 and 10,CICIDS2017 is 15\n", 117 | " model = Model(inputs=input_singal,outputs=x)\n", 118 | " \n", 119 | " return model" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "model = build_model()\n", 129 | "model.summary()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import keras\n", 139 | "import time\n", 140 | "time_start = time.time()\n", 141 | "\n", 142 | "reduce_lr = keras.callbacks.ReduceLROnPlateau(moniter='val_loss',\n", 143 | " factor=0.1,\n", 144 | " patience=10)\n", 145 | "nadam = Nadam(lr=0.008, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)\n", 146 | "model.compile(loss = \"categorical_crossentropy\",optimizer = \"nadam\", metrics = [\"accuracy\"])\n", 147 | "\n", 148 | "history = model.fit(x_train, y_train, \n", 149 | " epochs=100, \n", 150 | " batch_size=batch_size, \n", 151 | " verbose=2,\n", 152 | " validation_data=(x_val, y_val),\n", 153 | " callbacks=[reduce_lr])\n", 154 | "time_end = time.time()\n", 155 | "train_time = time_end - time_start\n", 156 | "print(\"train_time:\",train_time)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "scores = model.evaluate(x_test, y_test)\n", 166 | "print(\"test_loss = \", scores[0],\"test_accuracy = \", scores[1])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "model.save('E:/IDS/alldata/12/RUS+SMOTE/CNN_RUS_10(2ci).h5')#save model" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "# test model" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import time\n", 192 | "time_start = time.time()\n", 193 | "\n", 194 | "y_pred_onehot = model.predict(x_test) #返回的是在类别上的概率分布.It returns the probability distribution on the category\n", 195 | "y_pred_label=np.argmax(y_pred_onehot,axis=1)#概率最大的类别就是预测类别.The category with the highest probability is the prediction category\n", 196 | "\n", 197 | "time_end = time.time()\n", 198 | "test_time = time_end - time_start\n", 199 | "print(\"test_time:\",test_time)\n", 200 | "\n", 201 | "# np.savetxt(\"E:/IDS/cicdata/GMM+SMOTE_77/2ci/CNN_pred_15.txt\",y_pred_label) \n" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "y_true_onehot=y_test\n", 211 | 
"y_true_label=np.argmax(y_true_onehot,axis=1)\n", 212 | "# np.savetxt(\"E:/IDS/cicdata/GMM+SMOTE_77/2ci/CNN_true_15.txt\",y_true_label) " 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# -*-coding:utf-8-*-\n", 222 | "from sklearn.metrics import confusion_matrix\n", 223 | "import matplotlib.pyplot as plt\n", 224 | "import numpy as np\n", 225 | "\n", 226 | "labels = ['Normal','Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Reconnaissance','Shellcode','Worms'] #class name\n", 227 | "\n", 228 | "y_true = y_true_label\n", 229 | "y_pred = y_pred_label\n", 230 | "\n", 231 | "tick_marks = np.array(range(len(labels))) + 0.5\n", 232 | "\n", 233 | "def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.binary):\n", 234 | " plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) \n", 235 | " plt.title(title)\n", 236 | " plt.colorbar()\n", 237 | " xlocations = np.array(range(len(labels)))\n", 238 | " plt.xticks(xlocations, labels, rotation=90)\n", 239 | " plt.yticks(xlocations, labels)\n", 240 | " plt.ylabel('True label')\n", 241 | " plt.xlabel('Predicted label')\n", 242 | "\n", 243 | "\n", 244 | "cm = confusion_matrix(y_true, y_pred) \n", 245 | "np.set_printoptions(precision=2) \n", 246 | "cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] \n", 247 | "\n", 248 | "plt.figure(figsize=(15, 13), dpi=120)\n", 249 | "\n", 250 | "ind_array = np.arange(len(labels))\n", 251 | "x, y = np.meshgrid(ind_array, ind_array)\n", 252 | "\n", 253 | "for x_val, y_val in zip(x.flatten(), y.flatten()):\n", 254 | " c = cm_normalized[y_val][x_val]\n", 255 | " if c > 0.001:\n", 256 | " plt.text(x_val, y_val, \"%0.2f\" % (c,), color='red', fontsize=13, va='center', ha='center')\n", 257 | "# offset the tick\n", 258 | "plt.gca().set_xticks(tick_marks, minor=True)\n", 259 | "plt.gca().set_yticks(tick_marks, minor=True)\n", 260 | "plt.gca().xaxis.set_ticks_position('none')\n", 261 | "plt.gca().yaxis.set_ticks_position('none')\n", 262 | "plt.grid(True, which='minor', linestyle='-')\n", 263 | "plt.gcf().subplots_adjust(bottom=0.15)\n", 264 | "\n", 265 | "plot_confusion_matrix(cm_normalized, title='MLP_12_10_ROS Normalized confusion matrix') \n", 266 | "#plt.savefig('/home/hll/IDS/alldata/cm/confusion_matrix.png', format='png') \n", 267 | "plt.show()\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "print(cm) #Confusion matrix" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "# multi-class evaluation indicators" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from sklearn import metrics\n", 293 | "from sklearn.metrics import classification_report\n", 294 | "\n", 295 | "target_names = ['Normal','Analysis','Backdoor','DoS','Exploits','Fuzzers','Generic','Reconnaissance','Shellcode','Worms']\n", 296 | "print(classification_report(y_true,y_pred,target_names=target_names))\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "acc = metrics.accuracy_score(y_true,y_pred) \n", 306 | "f1 = metrics.f1_score(y_true, y_pred,average='weighted')\n", 307 | "pre = metrics.precision_score(y_true, y_pred, labels=None, pos_label=1, average='weighted') 
#DR\n", 308 | "recall = metrics.recall_score(y_true, y_pred, labels=None, pos_label=1, average='weighted', sample_weight=None)\n", 309 | "\n", 310 | "print(\"acc:\",acc)\n", 311 | "print(\"pre:\",pre)\n", 312 | "print(\"DR=recall:\",recall)\n", 313 | "print(\"f1:\",f1)\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "# # binary-class evaluation indicators" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "TP=cm[1,1]\n", 330 | "FP=cm[0,1]\n", 331 | "FN=cm[1,0]\n", 332 | "TN=cm[0,0]\n", 333 | "\n", 334 | "acc = (TP+TN)/(TP+TN+FP+FN)\n", 335 | "print(\"acc:\",acc)\n", 336 | "\n", 337 | "DR = TP/(TP+FN) \n", 338 | "print(\"DR:\",DR)\n", 339 | "\n", 340 | "FPR = FP/(FP+TN) #FAR\n", 341 | "print(\"FPR:\",FPR)\n", 342 | "\n", 343 | "recall =TP/(TP+FN)\n", 344 | "print(\"recall:\",recall) \n", 345 | "\n", 346 | "precision = TP/(TP+FP)\n", 347 | "print(\"precision:\",precision)\n", 348 | "\n", 349 | "f1 = (2*precision*recall)/(precision+recall)\n", 350 | "print(\"f1:\",f1)" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.6.9" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/GMM + SMOTE-2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 将少数类均SMOTE到(数据集样本数/类数)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 20 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 21 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 22 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 23 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 24 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 25 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 26 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 27 | 
"f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 28 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 29 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 30 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from collections import Counter\n", 36 | "import os\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import tensorflow as tf\n", 40 | "\n", 41 | "data = np.load('E:/IDS/alldata/12/train/data.npy')\n", 42 | "label = np.load('E:/IDS/alldata/12/train/label_2.npy')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X=np.array(data) #np.array将列表转换为数组,即去掉每个元素外面的中括号\n", 52 | "b=np.array(label)\n", 53 | "bb=b.reshape(b.shape[0],) #reshape重新定义形状,此时标签数据是1维的\n", 54 | "y10 = np.int32(bb) #将标签类型从浮点型换成整形" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "(1778030, 12)" 66 | ] 67 | }, 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "X.shape" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "[(0, 1553132), (1, 224898)]" 86 | ] 87 | }, 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "sorted(Counter(y10).items())" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | "Using TensorFlow backend.\n" 107 | ] 108 | }, 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "[(0, 1553132), (1, 889015)]\n", 114 | "time: 235.95304441452026\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "#使用imlbearn库中上采样方法中的SMOTE接口\n", 120 | "from imblearn.over_sampling import SMOTE\n", 121 | "import time\n", 122 | "time_start = time.time()\n", 123 | "\n", 124 | "smo = SMOTE(ratio={1:889015},random_state=42) #可以用ratio指定采样程度\n", 125 | "\n", 126 | "X_smo, y_smo = smo.fit_sample(X, y10) #对数据和标签进行SMOTE处理\n", 127 | "print(sorted(Counter(y_smo).items()))\n", 128 | "\n", 129 | "time_end = time.time()\n", 130 | "time = time_end - time_start\n", 131 | "print(\"time:\",time)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "2442147" 143 | ] 144 | }, 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "X_smo.shape[0]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "# 将多数类数据提取出来" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 7, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "list0 = [] #正常类数据\n", 168 | "list1 = 
[] #攻击类数据\n", 169 | "list2 = [] #攻击类标签\n", 170 | "\n", 171 | "for i in range(X_smo.shape[0]):\n", 172 | " if y_smo[i] == 0:\n", 173 | " list0.append(X_smo[i])\n", 174 | " else:\n", 175 | " list1.append(X_smo[i])\n", 176 | " list2.append(y_smo[i])\n", 177 | " \n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "正常类数据形状: (1553132, 12)\n", 190 | "攻击类数据形状: (889015, 12)\n", 191 | "攻击类标签形状: (889015,)\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "data0 = np.array(list0) #np.array将列表转换为数组,即去掉每个元素外面的中括号\n", 197 | "data1 = np.array(list1)\n", 198 | "label1 = np.array(list2)\n", 199 | "\n", 200 | "label11 = label1.reshape(label1.shape[0],) #reshape重新定义形状,此时标签数据是1维的\n", 201 | "\n", 202 | "print(\"正常类数据形状:\",data0.shape)\n", 203 | "print(\"攻击类数据形状:\",data1.shape) #使用函数查看类分布需要这种格式\n", 204 | "print(\"攻击类标签形状:\",label11.shape)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "array([1, 1, 1, ..., 1, 1, 1])" 216 | ] 217 | }, 218 | "execution_count": 9, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "label11" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "# 将多数类数据聚成(总类数)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "from sklearn.mixture import GaussianMixture\n", 241 | "import time\n", 242 | "time_start = time.time()\n", 243 | "\n", 244 | "estimator = GaussianMixture(n_components=2)\n", 245 | "estimator.fit(data0) # 聚类\n", 246 | "\n", 247 | "time_end = time.time()\n", 248 | "time = time_end - time_start\n", 249 | "print(\"time:\",time)\n", 250 | "\n", 251 | "label_pred = estimator.predict(data0) # 获取聚类标签" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 11, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "[(0, 784867), (1, 768265)]" 263 | ] 264 | }, 265 | "execution_count": 11, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "sorted(Counter(label_pred).items())" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "# 从每个簇中选出一定量的数据组成新的多数类数据" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 12, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "c0 = []\n", 288 | "c1 = []\n", 289 | "\n", 290 | "s0=s1=0\n", 291 | "\n", 292 | "for i in range(data0.shape[0]):\n", 293 | " if label_pred[i] == 0:\n", 294 | " c0.append(data0[i])\n", 295 | " s0=s0+1\n", 296 | " elif label_pred[i] == 1:\n", 297 | " c1.append(data0[i])\n", 298 | " s1=s1+1" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 13, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "a = 444508" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 14, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "删除前大小: 784867\n", 320 | "删除后大小: 444508\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "print(\"删除前大小:\",len(c0))\n", 326 | "del c0[a:len(c0)]\n", 327 | "print(\"删除后大小:\",len(c0))\n" 328 | ] 329 | }, 330 | 
{ 331 | "cell_type": "code", 332 | "execution_count": 15, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "删除前大小: 768265\n", 340 | "删除后大小: 444508\n" 341 | ] 342 | } 343 | ], 344 | "source": [ 345 | "print(\"删除前大小:\",len(c1))\n", 346 | "del c1[a:len(c1)]\n", 347 | "print(\"删除后大小:\",len(c1))" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 16, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "c00 = np.array(c0)\n", 357 | "c11 = np.array(c1)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 17, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "(889016, 12)" 369 | ] 370 | }, 371 | "execution_count": 17, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "q = np.concatenate((c00,c11),axis=0)\n", 378 | "q.shape" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 18, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/plain": [ 389 | "(889016,)" 390 | ] 391 | }, 392 | "execution_count": 18, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "#创建一个全零数组为正常类的标签\n", 399 | "label_zc = np.zeros((q.shape[0],), dtype=int)\n", 400 | "label_zc.shape" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 19, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "#将正常类数据和攻击数据合并到一起,将标签也合并到一起\n", 410 | "data_end = np.concatenate((q,data1),axis=0)\n", 411 | "label_end = np.concatenate((label_zc,label1),axis=0)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 20, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "[(0, 889016), (1, 889015)]" 423 | ] 424 | }, 425 | "execution_count": 20, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "sorted(Counter(label_end).items())" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 21, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "label_end = label_end.reshape(label_end.shape[0],1)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 22, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "#将最终采样后的数据保存成文件\n", 450 | "np.save(\"E:/IDS/alldata/12/k-means+smote_train/label_2/data.npy\",data_end)\n", 451 | "np.save(\"E:/IDS/alldata/12/k-means+smote_train/label_2/label.npy\",label_end)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [] 460 | } 461 | ], 462 | "metadata": { 463 | "kernelspec": { 464 | "display_name": "Python 3", 465 | "language": "python", 466 | "name": "python3" 467 | }, 468 | "language_info": { 469 | "codemirror_mode": { 470 | "name": "ipython", 471 | "version": 3 472 | }, 473 | "file_extension": ".py", 474 | "mimetype": "text/x-python", 475 | "name": "python", 476 | "nbconvert_exporter": "python", 477 | "pygments_lexer": "ipython3", 478 | "version": "3.7.4" 479 | } 480 | }, 481 | "nbformat": 4, 482 | "nbformat_minor": 2 483 | } 484 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/GMM + SMOTE -10-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
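The checkpoint above (and the 10-class one that follows) implements the clustering-based undersampling step as explicit per-cluster Python lists truncated with del. A compact sketch of the same step, assuming a data0-like majority-class matrix; the helper name and the toy input are illustrative, not part of the original code:

import numpy as np
from sklearn.mixture import GaussianMixture

def trim_majority(data0, n_clusters, quota, seed=42):
    # cluster the majority class, then keep at most `quota` samples per cluster
    gmm = GaussianMixture(n_components=n_clusters, random_state=seed).fit(data0)
    assignments = gmm.predict(data0)
    kept = [data0[assignments == k][:quota] for k in range(n_clusters)]
    return np.concatenate(kept, axis=0)

sample = np.random.RandomState(0).rand(1000, 12)             # toy stand-in for data0
print(trim_majority(sample, n_clusters=2, quota=300).shape)  # at most (600, 12)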
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 将少数类均SMOTE到(数据集样本数/类数)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 20 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 21 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 22 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 23 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 24 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 25 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 26 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 27 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 28 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 29 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 30 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "from collections import Counter\n", 36 | "import os\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "import tensorflow as tf\n", 40 | "\n", 41 | "data = np.load('E:/IDS/alldata/12/train/data.npy')\n", 42 | "label = np.load('E:/IDS/alldata/12/train/label_10.npy')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X=np.array(data) #np.array将列表转换为数组,即去掉每个元素外面的中括号\n", 52 | "b=np.array(label)\n", 53 | "bb=b.reshape(b.shape[0],) #reshape重新定义形状,此时标签数据是1维的\n", 54 | "y10 = np.int32(bb) #将标签类型从浮点型换成整形" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "(1778030, 12)" 66 | ] 67 | }, 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "X.shape" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "[(0, 1553132),\n", 86 | " (1, 1874),\n", 87 | " (2, 1630),\n", 88 | " (3, 11449),\n", 89 | " (4, 31167),\n", 90 | " (5, 16972),\n", 91 | " (6, 150836),\n", 92 | " (7, 9791),\n", 
93 | " (8, 1057),\n", 94 | " (9, 122)]" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "sorted(Counter(y10).items())" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stderr", 113 | "output_type": "stream", 114 | "text": [ 115 | "Using TensorFlow backend.\n" 116 | ] 117 | }, 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "[(0, 1553132), (1, 177803), (2, 177803), (3, 177803), (4, 177803), (5, 177803), (6, 177803), (7, 177803), (8, 177803), (9, 177803)]\n", 123 | "time: 143.16974186897278\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "#使用imlbearn库中上采样方法中的SMOTE接口\n", 129 | "from imblearn.over_sampling import SMOTE\n", 130 | "import time\n", 131 | "time_start = time.time()\n", 132 | "\n", 133 | "smo = SMOTE(ratio={1:177803,2:177803,3:177803,4:177803,5:177803,6:177803,7:177803,8:177803,9:177803},random_state=42) #可以用ratio指定采样程度\n", 134 | "\n", 135 | "X_smo, y_smo = smo.fit_sample(X, y10) #对数据和标签进行SMOTE处理\n", 136 | "print(sorted(Counter(y_smo).items()))\n", 137 | "\n", 138 | "time_end = time.time()\n", 139 | "time = time_end - time_start\n", 140 | "print(\"time:\",time)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "3153359" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "X_smo.shape[0]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# 将多数类数据提取出来" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 7, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "list0 = [] #正常类数据\n", 177 | "list1 = [] #攻击类数据\n", 178 | "list2 = [] #攻击类标签\n", 179 | "\n", 180 | "for i in range(X_smo.shape[0]):\n", 181 | " if y_smo[i] == 0:\n", 182 | " list0.append(X_smo[i])\n", 183 | " else:\n", 184 | " list1.append(X_smo[i])\n", 185 | " list2.append(y_smo[i])\n", 186 | " \n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "正常类数据形状: (1553132, 12)\n", 199 | "攻击类数据形状: (1600227, 12)\n", 200 | "攻击类标签形状: (1600227,)\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "data0 = np.array(list0) #np.array将列表转换为数组,即去掉每个元素外面的中括号\n", 206 | "data1 = np.array(list1)\n", 207 | "label1 = np.array(list2)\n", 208 | "\n", 209 | "label11 = label1.reshape(label1.shape[0],) #reshape重新定义形状,此时标签数据是1维的\n", 210 | "\n", 211 | "print(\"正常类数据形状:\",data0.shape)\n", 212 | "print(\"攻击类数据形状:\",data1.shape) #使用函数查看类分布需要这种格式\n", 213 | "print(\"攻击类标签形状:\",label11.shape)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 9, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "array([5, 6, 6, ..., 9, 9, 9])" 225 | ] 226 | }, 227 | "execution_count": 9, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "label11" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "# 将多数类数据聚成(总类数)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | 
"outputs": [], 248 | "source": [ 249 | "from sklearn.mixture import GaussianMixture\n", 250 | "import time\n", 251 | "time_start = time.time()\n", 252 | "\n", 253 | "estimator = GaussianMixture(n_components=10)\n", 254 | "estimator.fit(data0) # 聚类\n", 255 | "\n", 256 | "time_end = time.time()\n", 257 | "time = time_end - time_start\n", 258 | "print(\"time:\",time)\n", 259 | "\n", 260 | "label_pred = estimator.predict(data0) # 获取聚类标签" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "time: 49.27290630340576\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "from sklearn.cluster import KMeans\n", 278 | "import time\n", 279 | "time_start = time.time()\n", 280 | "\n", 281 | "estimator = KMeans(n_clusters=10)\n", 282 | "estimator.fit(data0) # 聚类\n", 283 | "\n", 284 | "time_end = time.time()\n", 285 | "time = time_end - time_start\n", 286 | "print(\"time:\",time)\n", 287 | "\n", 288 | "label_pred = estimator.labels_ # 获取聚类标签" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 11, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/plain": [ 299 | "[(0, 192248),\n", 300 | " (1, 449896),\n", 301 | " (2, 277310),\n", 302 | " (3, 131107),\n", 303 | " (4, 27652),\n", 304 | " (5, 143659),\n", 305 | " (6, 229698),\n", 306 | " (7, 7405),\n", 307 | " (8, 92530),\n", 308 | " (9, 1627)]" 309 | ] 310 | }, 311 | "execution_count": 11, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "sorted(Counter(label_pred).items())" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "# 从每个簇中选出一定量的数据组成新的多数类数据" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 12, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "c0 = []\n", 334 | "c1 = []\n", 335 | "c2 = []\n", 336 | "c3 = []\n", 337 | "c4 = []\n", 338 | "c5 = []\n", 339 | "c6 = []\n", 340 | "c7 = []\n", 341 | "c8 = []\n", 342 | "c9 = []\n", 343 | "\n", 344 | "\n", 345 | "s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=0\n", 346 | "\n", 347 | "for i in range(data0.shape[0]):\n", 348 | " if label_pred[i] == 0:\n", 349 | " c0.append(data0[i])\n", 350 | " s0=s0+1\n", 351 | " elif label_pred[i] == 1:\n", 352 | " c1.append(data0[i])\n", 353 | " s1=s1+1\n", 354 | " elif label_pred[i] == 2:\n", 355 | " c2.append(data0[i])\n", 356 | " s2=s2+1\n", 357 | " elif label_pred[i] == 3:\n", 358 | " c3.append(data0[i])\n", 359 | " s3=s3+1\n", 360 | " elif label_pred[i] == 4:\n", 361 | " c4.append(data0[i])\n", 362 | " s4=s4+1\n", 363 | " elif label_pred[i] == 5:\n", 364 | " c5.append(data0[i])\n", 365 | " s5=s5+1\n", 366 | " elif label_pred[i] == 6:\n", 367 | " c6.append(data0[i])\n", 368 | " s6=s6+1\n", 369 | " elif label_pred[i] == 7:\n", 370 | " c7.append(data0[i])\n", 371 | " s7=s7+1\n", 372 | " elif label_pred[i] == 8:\n", 373 | " c8.append(data0[i])\n", 374 | " s8=s8+1\n", 375 | " elif label_pred[i] == 9:\n", 376 | " c9.append(data0[i])\n", 377 | " s9=s9+1\n", 378 | " \n", 379 | " \n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 13, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "a = 21096" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 14, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | 
"删除前大小: 192248\n", 401 | "删除后大小: 21096\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "print(\"删除前大小:\",len(c0))\n", 407 | "del c0[a:len(c0)]\n", 408 | "print(\"删除后大小:\",len(c0))\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 15, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "删除前大小: 449896\n", 421 | "删除后大小: 21096\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "print(\"删除前大小:\",len(c1))\n", 427 | "del c1[a:len(c1)]\n", 428 | "print(\"删除后大小:\",len(c1))" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 16, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "name": "stdout", 438 | "output_type": "stream", 439 | "text": [ 440 | "删除前大小: 277310\n", 441 | "删除后大小: 21096\n" 442 | ] 443 | } 444 | ], 445 | "source": [ 446 | "print(\"删除前大小:\",len(c2))\n", 447 | "del c2[a:len(c2)]\n", 448 | "print(\"删除后大小:\",len(c2))" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 17, 454 | "metadata": {}, 455 | "outputs": [ 456 | { 457 | "name": "stdout", 458 | "output_type": "stream", 459 | "text": [ 460 | "删除前大小: 131107\n", 461 | "删除后大小: 21096\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "print(\"删除前大小:\",len(c3))\n", 467 | "del c3[a:len(c3)]\n", 468 | "print(\"删除后大小:\",len(c3))" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 18, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "删除前大小: 27652\n", 481 | "删除后大小: 21096\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "print(\"删除前大小:\",len(c4))\n", 487 | "del c4[a:len(c4)]\n", 488 | "print(\"删除后大小:\",len(c4))" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 19, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "del c5[a:len(c5)]\n", 498 | "del c6[a:len(c6)]\n", 499 | "del c7[a:len(c7)]\n", 500 | "del c8[a:len(c8)]\n", 501 | "del c9[a:len(c9)]" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 20, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "c00 = np.array(c0)\n", 511 | "c11 = np.array(c1)\n", 512 | "c22 = np.array(c2)\n", 513 | "c33 = np.array(c3)\n", 514 | "c44 = np.array(c4)\n", 515 | "c55 = np.array(c5)\n", 516 | "c66 = np.array(c6)\n", 517 | "c77 = np.array(c7)\n", 518 | "c88 = np.array(c8)\n", 519 | "c99 = np.array(c9)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 21, 525 | "metadata": {}, 526 | "outputs": [ 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "(177800, 12)" 531 | ] 532 | }, 533 | "execution_count": 21, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "q = np.concatenate((c00,c11,c22,c33,c44,c55,c66,c77,c88,c99),axis=0)\n", 540 | "q.shape" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 22, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "(177800,)" 552 | ] 553 | }, 554 | "execution_count": 22, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "#创建一个全零数组为正常类的标签\n", 561 | "label_zc = np.zeros((q.shape[0],), dtype=int)\n", 562 | "label_zc.shape" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 23, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "#将正常类数据和攻击数据合并到一起,将标签也合并到一起\n", 572 | "data_end = 
np.concatenate((q,data1),axis=0)\n", 573 | "label_end = np.concatenate((label_zc,label1),axis=0)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 25, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "[(0, 177800),\n", 585 | " (1, 177803),\n", 586 | " (2, 177803),\n", 587 | " (3, 177803),\n", 588 | " (4, 177803),\n", 589 | " (5, 177803),\n", 590 | " (6, 177803),\n", 591 | " (7, 177803),\n", 592 | " (8, 177803),\n", 593 | " (9, 177803)]" 594 | ] 595 | }, 596 | "execution_count": 25, 597 | "metadata": {}, 598 | "output_type": "execute_result" 599 | } 600 | ], 601 | "source": [ 602 | "sorted(Counter(label_end).items())" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 28, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "label_end = label_end.reshape(label_end.shape[0],1)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 30, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "#将最终采样后的数据保存成文件\n", 621 | "np.save(\"E:/IDS/alldata/12/k-means+smote_train/data.npy\",data_end)\n", 622 | "np.save(\"E:/IDS/alldata/12/k-means+smote_train/label_10.npy\",label_end)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [] 631 | } 632 | ], 633 | "metadata": { 634 | "kernelspec": { 635 | "display_name": "Python 3", 636 | "language": "python", 637 | "name": "python3" 638 | }, 639 | "language_info": { 640 | "codemirror_mode": { 641 | "name": "ipython", 642 | "version": 3 643 | }, 644 | "file_extension": ".py", 645 | "mimetype": "text/x-python", 646 | "name": "python", 647 | "nbconvert_exporter": "python", 648 | "pygments_lexer": "ipython3", 649 | "version": "3.7.4" 650 | } 651 | }, 652 | "nbformat": 4, 653 | "nbformat_minor": 2 654 | } 655 | -------------------------------------------------------------------------------- /imbalance processing/ROS,SMOTE,ADASYN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stderr", 20 | "output_type": "stream", 21 | "text": [ 22 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 23 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 24 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 25 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 26 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 27 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 28 | 
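One detail the 10-class checkpoint above glosses over: truncating every cluster to a = 21096 only caps the larger clusters, while clusters 7 (7405 samples) and 9 (1627 samples) are already below the quota and keep their full size. The rebuilt majority therefore has 8 * 21096 + 7405 + 1627 = 177800 rows rather than 10 * 21096 = 210960, which is exactly the (177800, 12) shape reported for q. A quick check using the cluster sizes printed above:

quota = 21096
cluster_sizes = [192248, 449896, 277310, 131107, 27652, 143659, 229698, 7405, 92530, 1627]
print(sum(min(size, quota) for size in cluster_sizes))  # -> 177800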
"f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 29 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 30 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 31 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 32 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 33 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "WARNING:tensorflow:From :15: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n", 41 | "Instructions for updating:\n", 42 | "Use eager execution and: \n", 43 | "`tf.data.TFRecordDataset(path)`\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#把所有数据整合到一起,list1是数据,list2是二分类标签,list10是10分类标签。\n", 49 | "#Put all the data together, list1 is the data, list2 is the dichotomy label, and list10 is the 10-category label.\n", 50 | "from collections import Counter\n", 51 | "import os\n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "import tensorflow as tf\n", 55 | "\n", 56 | "#读取tf文件数据\n", 57 | "#Read the tf file data\n", 58 | "list1= []#data\n", 59 | "list2=[] #label_2\n", 60 | "list10=[]#label_10\n", 61 | "\n", 62 | "for serialized_example in tf.python_io.tf_record_iterator(\"E:/IDS/normalized/train_select_12.tfrecords\"): \n", 63 | "\n", 64 | "\n", 65 | " example = tf.train.Example()\n", 66 | " example.ParseFromString(serialized_example)\n", 67 | " \n", 68 | "\n", 69 | " feature = example.features.feature['features'].float_list.value \n", 70 | " label_2 = example.features.feature['label_2'].float_list.value\n", 71 | " label_10 = example.features.feature['label_10'].float_list.value\n", 72 | " \n", 73 | " list1.append(feature) \n", 74 | " list2.append(label_2)\n", 75 | " list10.append(label_10)\n", 76 | "\n", 77 | " \n", 78 | "# for serialized_example in tf.python_io.tf_record_iterator(\"/home/hll/IDS/normalized/test_select_12.tfrecords\"): \n", 79 | "\n", 80 | "\n", 81 | "# example = tf.train.Example()\n", 82 | "# example.ParseFromString(serialized_example)\n", 83 | " \n", 84 | "\n", 85 | "# feature = example.features.feature['features'].float_list.value \n", 86 | "# label_2 = example.features.feature['label_2'].float_list.value\n", 87 | "# label_10 = example.features.feature['label_10'].float_list.value\n", 88 | " \n", 89 | "# list1.append(feature) \n", 90 | "# list2.append(label_2)\n", 91 | "# list10.append(label_10)\n", 92 | " \n", 93 | "\n", 94 | " \n", 95 | "# for serialized_example in tf.python_io.tf_record_iterator(\"/home/hll/IDS/normalized/validation_select_12.tfrecords\"): \n", 96 | "\n", 97 | "\n", 98 | "# example = tf.train.Example()\n", 99 | "# example.ParseFromString(serialized_example)\n", 100 | " \n", 101 | "\n", 102 | "# feature = example.features.feature['features'].float_list.value \n", 103 | "# label_2 = 
example.features.feature['label_2'].float_list.value\n", 104 | "# label_10 = example.features.feature['label_10'].float_list.value\n", 105 | " \n", 106 | "# list1.append(feature) \n", 107 | "# list2.append(label_2)\n", 108 | "# list10.append(label_10)\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 3, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "array([0, 0, 0, ..., 0, 0, 0])" 120 | ] 121 | }, 122 | "execution_count": 3, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "X=np.array(list1) \n", 129 | "b2=np.array(list2)\n", 130 | "bb2=b2.reshape(b2.shape[0],) \n", 131 | "b10=np.array(list10)\n", 132 | "bb10=b10.reshape(b10.shape[0],)\n", 133 | "y2 = np.int32(bb2) \n", 134 | "y10 = np.int32(bb10)\n", 135 | "y10" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Counter({0: 1553132, 1: 224898})\n", 148 | "Counter({0: 1553132, 6: 150836, 4: 31167, 5: 16972, 3: 11449, 7: 9791, 1: 1874, 2: 1630, 8: 1057, 9: 122})\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "#View the distribution of data classes (data distribution without imbalanced processing)\n", 154 | "#查看数据类的分布(未不平衡处理的数据分布)\n", 155 | "print(Counter(y2)) \n", 156 | "print(Counter(y10))" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 11, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "time: 238.82638692855835\n" 169 | ] 170 | }, 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "[(0, 1548999), (1, 1548999)]" 175 | ] 176 | }, 177 | "execution_count": 11, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "from imblearn.over_sampling import SMOTE\n", "from imblearn.under_sampling import TomekLinks\n", 184 | "import time\n", 185 | "time_start = time.time()\n", "#先用Tomek links清除多数类的边界噪声,再对清洗后的数据做SMOTE;下面的tomektrain输出目录和\n", "#输出中多数类的1548999都对应这一步清洗。\n", "#Clean majority-class boundary noise with Tomek links before oversampling; the 'tomektrain'\n", "#save path and the 1548999 majority count in the recorded output correspond to this\n", "#cleaning step, which produces X_tl2/y_tl2.\n", "X_tl2, y_tl2 = TomekLinks().fit_sample(X, y2)\n", 186 | "smo = SMOTE(random_state=42) \n", 187 | "\n", 188 | "X_smo, y_smo = smo.fit_sample(X_tl2, y_tl2) \n", 189 | "\n", 190 | "time_end = time.time()\n", 191 | "elapsed = time_end - time_start #do not shadow the time module, so the cell can be re-run\n", 192 | "print(\"time:\",elapsed)\n", 193 | "\n", 194 | "sorted(Counter(y_smo).items())" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "(3097998, 1)\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "yy2 = y_smo.reshape(y_smo.shape[0],1) \n", 212 | "print(yy2.shape)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 13, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#Save the data as an npy file\n", 222 | "#把数据存成npy格式的文件\n", 223 | "np.save(\"E:/IDS/alldata/12/tomektrain/data.npy\",X_smo) \n", 224 | "np.save(\"E:/IDS/alldata/12/tomektrain/label_2.npy\",yy2)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# yy2 = y2.reshape(y2.shape[0],1) #reshape the labels into column vectors; the final classifier expects this format\n", 234 | "# print(yy2.shape)\n", 235 | "# yy10 = y10.reshape(y10.shape[0],1) #reshape the labels into column vectors; the final classifier expects this format\n", 236 | "# print(yy10.shape)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# np.save(\"/home/hll/IDS/alldata/12/val/data.npy\",X) 
#把数据存成npy格式的文件\n", 246 | "# np.save(\"/home/hll/IDS/alldata/12/val/label_2.npy\",yy2)\n", 247 | "# np.save(\"/home/hll/IDS/alldata/12/val/label_10.npy\",yy10)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "# test set does not do processing" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "#保存测试集数据\n", 264 | "#Save the test set data\n", 265 | "list11= []#data\n", 266 | "list22=[] #label_2\n", 267 | "list1010=[]#label_10\n", 268 | "\n", 269 | "for serialized_example in tf.python_io.tf_record_iterator(\"normalized/test_select_12.tfrecords\"): \n", 270 | "\n", 271 | " example = tf.train.Example()\n", 272 | " example.ParseFromString(serialized_example)\n", 273 | "\n", 274 | " feature = example.features.feature['features'].float_list.value \n", 275 | " label_2 = example.features.feature['label_2'].float_list.value\n", 276 | " label_10 = example.features.feature['label_10'].float_list.value\n", 277 | " \n", 278 | " list11.append(feature) \n", 279 | " list22.append(label_2)\n", 280 | " list1010.append(label_10)\n", 281 | " " 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "XX=np.array(list11) \n", 291 | "b22=np.array(list22)\n", 292 | "bb22=b22.reshape(b22.shape[0],) \n", 293 | "b1010=np.array(list1010)\n", 294 | "bb1010=b1010.reshape(b1010.shape[0],)\n", 295 | "y22 = np.int32(bb22) \n", 296 | "y1010 = np.int32(bb1010)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "print(y22.shape)\n", 306 | "print(y1010.shape) \n", 307 | "print(XX.shape)\n", 308 | "print(Counter(y22)) \n", 309 | "print(Counter(y1010))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "yy22 = y22.reshape(y22.shape[0],1) \n", 319 | "print(yy22.shape)\n", 320 | "yy1010 = y1010.reshape(y1010.shape[0],1) \n", 321 | "print(yy1010.shape)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "np.save(\"alldata/data_12_test.npy\",XX) \n", 331 | "np.save(\"alldata/label_2_12_test.npy\",yy22)\n", 332 | "np.save(\"alldata/label_10_12_test.npy\",yy1010)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "# ROS-binary-class" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "from imblearn.over_sampling import RandomOverSampler\n", 349 | "\n", 350 | "ros = RandomOverSampler(random_state=0)\n", 351 | "X_ros, y_ros = ros.fit_sample(X, y2)\n", 352 | "\n", 353 | "sorted(Counter(y_ros).items())" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "yy2 = y_ros.reshape(y_ros.shape[0],1) \n", 363 | "yy2.shape" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "np.save(\"/home/hll/IDS/cicdata/77_2/data_ros_train.npy\",X_ros) \n", 373 | "np.save(\"/home/hll/IDS/cicdata/77_2/label_ros_train.npy\",yy2)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 
378 | "metadata": {}, 379 | "source": [ 380 | "# ROS-multi-class" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "from imblearn.over_sampling import RandomOverSampler\n", 390 | "\n", 391 | "ros = RandomOverSampler(random_state=0)\n", 392 | "X_ros, y_ros = ros.fit_sample(X, y10)\n", 393 | "\n", 394 | "sorted(Counter(y_ros).items())" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "yy10 = y_ros.reshape(y_ros.shape[0],1) \n", 404 | "yy10.shape" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "np.save(\"/home/hll/IDS/alldata/12/rostrain/data_ros10.npy\",X_ros) \n", 414 | "np.save(\"/home/hll/IDS/alldata/12/rostrain/label_10_ros.npy\",yy10)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "# SMOTE-binary-class" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "from imblearn.over_sampling import SMOTE\n", 431 | "\n", 432 | "smo = SMOTE(random_state=42) \n", 433 | "\n", 434 | "X_smo, y_smo = smo.fit_sample(X, y2) \n", 435 | "sorted(Counter(y_smo).items())" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "yy2 = y_smo.reshape(y_smo.shape[0],1) \n", 445 | "yy2.shape" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "np.save(\"/home/hll/IDS/cicdata/77_2/data_smote_train.npy\",X_smo) \n", 455 | "np.save(\"/home/hll/IDS/cicdata/77_2/label_smote_train.npy\",yy2)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "# SMOTE-multi-class" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "from imblearn.over_sampling import SMOTE\n", 472 | "\n", 473 | "smo = SMOTE(random_state=42) \n", 474 | "\n", 475 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 476 | "sorted(Counter(y_smo).items())" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "yy10 = y_smo.reshape(y_smo.shape[0],1) \n", 486 | "yy10.shape\n", 487 | "np.save(\"/home/hll/IDS/alldata/12/smotetrain/data_12smo10.npy\",X_smo) \n", 488 | "np.save(\"/home/hll/IDS/alldata/12/smotetrain/label_10_12smo.npy\",yy10)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "yy10.shape" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "# ADASYN-binary-class" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "from imblearn.over_sampling import ADASYN\n", 514 | "X_adasyn, y_adasyn = ADASYN().fit_sample(X, y2)\n", 515 | "\n", 516 | "print(sorted(Counter(y_adasyn).items()))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "yy2 = 
y_adasyn.reshape(y_adasyn.shape[0],1) \n", 526 | "yy2.shape\n", 527 | "np.save(\"/home/hll/IDS/cicdata/77_2/data_adasyn_train.npy\",X_adasyn) \n", 528 | "np.save(\"/home/hll/IDS/cicdata/77_2/label_adasyn_train.npy\",yy2)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "# ADASYN-multi-class" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "from imblearn.over_sampling import ADASYN\n", 545 | "X_adasyn, y_adasyn = ADASYN(ratio={1:10000,2:10000,3:100000,4:100000,\n", 546 | " 5:100000,6:1000000,7:10000,8:10000,9:1000},random_state=42).fit_sample(X, y10)\n", 547 | "\n", 548 | "print(sorted(Counter(y_adasyn).items()))" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "yy2 = y_adasyn.reshape(y_adasyn.shape[0],1) \n", 558 | "yy2.shape\n", 559 | "np.save(\"alldata/ADASYN/data_12adasyn10.npy\",X_adasyn) \n", 560 | "np.save(\"alldata/ADASYN/label_10_12adasyn.npy\",yy2)" 561 | ] 562 | } 563 | ], 564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "Python 3", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.6.9" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 2 585 | } 586 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/K-means + SMOTE -10-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 将少数类均SMOTE到I(数据集样本数/类数)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from collections import Counter\n", 17 | "import os\n", 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import tensorflow as tf\n", 21 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n", 22 | "\n", 23 | "data = np.load('E:/IDS/cicdata/77/data_train.npy')\n", 24 | "label = np.load('E:/IDS/cicdata/77/label_train.npy')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "X=np.array(data) #np.array将列表转换为数组,即去掉每个元素外面的中括号\n", 34 | "b=np.array(label)\n", 35 | "bb=b.reshape(b.shape[0],) #reshape重新定义形状,此时标签数据是1维的\n", 36 | "y10 = np.int32(bb) #将标签类型从浮点型换成整形" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "X.shape" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "sorted(Counter(y10).items())" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "#使用imlbearn库中上采样方法中的SMOTE接口\n", 64 | "from imblearn.over_sampling import SMOTE\n", 65 | "import time\n", 66 | "time_start = time.time()\n", 67 | "\n", 68 | "guo = 132101 #需要过采样的样本数\n", 69 | "\n", 70 | "smo = SMOTE(ratio={2:guo,3:guo,4:guo,5:guo,6:guo,\n", 71 | " 
7:guo,8:guo,9:guo,10:guo,11:guo,12:guo,13:guo,14:guo,},random_state=42)\n", 72 | "\n", 73 | "X_smo, y_smo = smo.fit_sample(X, y10) #对数据和标签进行SMOTE处理\n", 74 | "print(sorted(Counter(y_smo).items()))\n", 75 | "\n", 76 | "time_end = time.time()\n", 77 | "time = time_end - time_start\n", 78 | "print(\"time:\",time)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "X_smo.shape[0]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 将多数类数据提取出来" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "list0 = [] #正常类数据\n", 104 | "list1 = [] #1类数据\n", 105 | "list2 = [] #2-14数据\n", 106 | "list3 = [] #2-14类标签\n", 107 | "\n", 108 | "for i in range(X_smo.shape[0]):\n", 109 | " if y_smo[i] == 0:\n", 110 | " list0.append(X_smo[i])\n", 111 | " elif y_smo[i] == 1:\n", 112 | " list1.append(X_smo[i])\n", 113 | " else:\n", 114 | " list2.append(X_smo[i])\n", 115 | " list3.append(y_smo[i])\n", 116 | " \n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "data0 = np.array(list0) #np.array将列表转换为数组,即去掉每个元素外面的中括号\n", 126 | "data1 = np.array(list1)\n", 127 | "data2 = np.array(list2)\n", 128 | "label2_14 = np.array(list3)\n", 129 | "\n", 130 | "label214 = label2_14.reshape(label2_14.shape[0],) #reshape重新定义形状,此时标签数据是1维的\n", 131 | "\n", 132 | "print(\"正常类数据形状:\",data0.shape)\n", 133 | "print(\"1类数据形状:\",data1.shape) #使用函数查看类分布需要这种格式\n", 134 | "print(\"2-14类数据形状:\",data2.shape)\n", 135 | "print(\"2-14类标签形状:\",label214.shape)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "label214" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# 将多数类数据聚成(总类数)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.cluster import KMeans\n", 161 | "import time\n", 162 | "time_start = time.time()\n", 163 | "\n", 164 | "estimator = KMeans(n_clusters=10)\n", 165 | "estimator.fit(data0) # 聚类\n", 166 | "# estimator.fit(data1)\n", 167 | "\n", 168 | "time_end = time.time()\n", 169 | "time = time_end - time_start\n", 170 | "print(\"time:\",time)\n", 171 | "\n", 172 | "label_pred_0 = estimator.labels_ # 获取聚类标签\n", 173 | "# label_pred_1 = estimator.predict(data1)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "sorted(Counter(label_pred_0).items())" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "label_pred = label_pred_0" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "# 从每个簇中选出一定量的数据组成新的多数类数据" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "c0 = []\n", 208 | "c1 = []\n", 209 | "c2 = []\n", 210 | "c3 = []\n", 211 | "c4 = []\n", 212 | "c5 = []\n", 213 | "c6 = []\n", 214 | "c7 = []\n", 215 | "c8 = []\n", 216 | "c9 = []\n", 217 | "c10 = []\n", 218 | "c11 = []\n", 219 | "c12 = []\n", 220 | "c13 = []\n", 221 
| "c14 = []\n", 222 | "\n", 223 | "# c9 = data0[label_pred == 9]\n", 224 | "# c7 = data0[label_pred == 7]\n", 225 | "\n", 226 | "# s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=0\n", 227 | "s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=s10=s11=s12=s13=s14=0\n", 228 | "\n", 229 | "for i in range(data0.shape[0]):\n", 230 | " if label_pred[i] == 0:\n", 231 | " c0.append(data0[i])\n", 232 | " s0=s0+1\n", 233 | " elif label_pred[i] == 1:\n", 234 | " c1.append(data0[i])\n", 235 | " s1=s1+1\n", 236 | " elif label_pred[i] == 2:\n", 237 | " c2.append(data0[i])\n", 238 | " s2=s2+1\n", 239 | " elif label_pred[i] == 3:\n", 240 | " c3.append(data0[i])\n", 241 | " s3=s3+1\n", 242 | " elif label_pred[i] == 4:\n", 243 | " c4.append(data0[i])\n", 244 | " s4=s4+1\n", 245 | " elif label_pred[i] == 5:\n", 246 | " c5.append(data0[i])\n", 247 | " s5=s5+1\n", 248 | " elif label_pred[i] == 6:\n", 249 | " c6.append(data0[i])\n", 250 | " s6=s6+1\n", 251 | " elif label_pred[i] == 7:\n", 252 | " c7.append(data0[i])\n", 253 | " s7=s7+1\n", 254 | " elif label_pred[i] == 8:\n", 255 | " c8.append(data0[i])\n", 256 | " s8=s8+1\n", 257 | " elif label_pred[i] == 9:\n", 258 | " c9.append(data0[i])\n", 259 | " s9=s9+1\n", 260 | " elif label_pred[i] == 10:\n", 261 | " c10.append(data0[i])\n", 262 | " s10=s10+1\n", 263 | " elif label_pred[i] == 11:\n", 264 | " c11.append(data0[i])\n", 265 | " s11=s11+1\n", 266 | " elif label_pred[i] == 12:\n", 267 | " c12.append(data0[i])\n", 268 | " s12=s12+1\n", 269 | " elif label_pred[i] == 13:\n", 270 | " c13.append(data0[i])\n", 271 | " s13=s13+1\n", 272 | " elif label_pred[i] == 14:\n", 273 | " c14.append(data0[i])\n", 274 | " s14=s14+1\n", 275 | " \n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "a=9405" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "print(\"删除前大小:\",len(c0))\n", 294 | "del c0[a:len(c0)]\n", 295 | "print(\"删除后大小:\",len(c0))\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "print(\"删除前大小:\",len(c1))\n", 305 | "del c1[a:len(c1)]\n", 306 | "print(\"删除后大小:\",len(c1))" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "print(\"删除前大小:\",len(c2))\n", 316 | "del c2[a:len(c2)]\n", 317 | "print(\"删除后大小:\",len(c2))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "print(\"删除前大小:\",len(c3))\n", 327 | "del c3[a:len(c3)]\n", 328 | "print(\"删除后大小:\",len(c3))" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "print(\"删除前大小:\",len(c4))\n", 338 | "del c4[a:len(c4)]\n", 339 | "print(\"删除后大小:\",len(c4))" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "del c5[a:len(c5)]\n", 349 | "del c6[a:len(c6)]\n", 350 | "del c7[a:len(c7)]\n", 351 | "# del c8[a:len(c8)]\n", 352 | "del c9[a:len(c9)]\n", 353 | "del c10[a:len(c10)]\n", 354 | "del c11[a:len(c11)]\n", 355 | "del c12[a:len(c12)]\n", 356 | "del c13[a:len(c13)]\n", 357 | "del c14[a:len(c14)]" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 
null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "c00 = np.array(c0)\n", 367 | "c11 = np.array(c1)\n", 368 | "c22 = np.array(c2)\n", 369 | "c33 = np.array(c3)\n", 370 | "c44 = np.array(c4)\n", 371 | "c55 = np.array(c5)\n", 372 | "c66 = np.array(c6)\n", 373 | "c77 = np.array(c7)\n", 374 | "c88 = np.array(c8)\n", 375 | "c99 = np.array(c9)\n", 376 | "c1010 = np.array(c10)\n", 377 | "c1111 = np.array(c11)\n", 378 | "c1212 = np.array(c12)\n", 379 | "c1313 = np.array(c13)\n", 380 | "c1414 = np.array(c14)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "q0 = np.concatenate((c00,c11,c22,c33,c44,c55,c66,c77,c88,c99,c1010,c1111,c1212,c1313,c1414),axis=0)\n", 390 | "q0.shape" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "#创建一个全零数组为正常类的标签\n", 400 | "label_zc = np.zeros((q0.shape[0],), dtype=int)\n", 401 | "label_zc.shape" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "from sklearn.cluster import KMeans\n", 425 | "import time\n", 426 | "time_start = time.time()\n", 427 | "\n", 428 | "estimator = KMeans(n_clusters=10)\n", 429 | "# estimator.fit(data0) # 聚类\n", 430 | "estimator.fit(data1)\n", 431 | "\n", 432 | "time_end = time.time()\n", 433 | "time = time_end - time_start\n", 434 | "print(\"time:\",time)\n", 435 | "\n", 436 | "# label_pred_0 = estimator.labels_ # 获取聚类标签\n", 437 | "label_pred_1 = estimator.predict(data1)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "sorted(Counter(label_pred_1).items())" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "label_pred = label_pred_1\n", 456 | "\n", 457 | "c0 = []\n", 458 | "c1 = []\n", 459 | "c2 = []\n", 460 | "c3 = []\n", 461 | "c4 = []\n", 462 | "c5 = []\n", 463 | "c6 = []\n", 464 | "c7 = []\n", 465 | "c8 = []\n", 466 | "c9 = []\n", 467 | "c10 = []\n", 468 | "c11 = []\n", 469 | "c12 = []\n", 470 | "c13 = []\n", 471 | "c14 = []\n", 472 | "\n", 473 | "# s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=0\n", 474 | "s0=s1=s2=s3=s4=s5=s6=s7=s8=s9=s10=s11=s12=s13=s14=0\n", 475 | "\n", 476 | "for i in range(data1.shape[0]):\n", 477 | " if label_pred[i] == 0:\n", 478 | " c0.append(data1[i])\n", 479 | " s0=s0+1\n", 480 | " elif label_pred[i] == 1:\n", 481 | " c1.append(data1[i])\n", 482 | " s1=s1+1\n", 483 | " elif label_pred[i] == 2:\n", 484 | " c2.append(data1[i])\n", 485 | " s2=s2+1\n", 486 | " elif label_pred[i] == 3:\n", 487 | " c3.append(data1[i])\n", 488 | " s3=s3+1\n", 489 | " elif label_pred[i] == 4:\n", 490 | " c4.append(data1[i])\n", 491 | " s4=s4+1\n", 492 | " elif label_pred[i] == 5:\n", 493 | " c5.append(data1[i])\n", 494 | " s5=s5+1\n", 495 | " elif label_pred[i] == 6:\n", 496 | " c6.append(data1[i])\n", 497 | " s6=s6+1\n", 498 | " elif label_pred[i] == 7:\n", 499 | " c7.append(data1[i])\n", 500 | " s7=s7+1\n", 501 | " elif label_pred[i] == 8:\n", 502 | " 
c8.append(data1[i])\n", 503 | " s8=s8+1\n", 504 | " elif label_pred[i] == 9:\n", 505 | " c9.append(data1[i])\n", 506 | " s9=s9+1\n", 507 | " elif label_pred[i] == 10:\n", 508 | " c10.append(data1[i])\n", 509 | " s10=s10+1\n", 510 | " elif label_pred[i] == 11:\n", 511 | " c11.append(data1[i])\n", 512 | " s11=s11+1\n", 513 | " elif label_pred[i] == 12:\n", 514 | " c12.append(data1[i])\n", 515 | " s12=s12+1\n", 516 | " elif label_pred[i] == 13:\n", 517 | " c13.append(data1[i])\n", 518 | " s13=s13+1\n", 519 | " elif label_pred[i] == 14:\n", 520 | " c14.append(data1[i])\n", 521 | " s14=s14+1\n", 522 | " \n", 523 | "a=15191" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "print(\"删除前大小:\",len(c1))\n", 533 | "del c1[a:len(c1)]\n", 534 | "print(\"删除后大小:\",len(c1))\n" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "del c2[a:len(c2)]\n", 544 | "print(\"删除前大小:\",len(c4))\n", 545 | "del c4[a:len(c4)]\n", 546 | "print(\"删除后大小:\",len(c4))\n", 547 | "print(\"删除前大小:\",len(c6))\n", 548 | "del c6[a:len(c6)]\n", 549 | "print(\"删除后大小:\",len(c6))" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "c0 = np.array(c0)\n", 559 | "c1 = np.array(c1)\n", 560 | "c2 = np.array(c2)\n", 561 | "c3 = np.array(c3)\n", 562 | "c4 = np.array(c4)\n", 563 | "c5 = np.array(c5)\n", 564 | "c6 = np.array(c6)\n", 565 | "c7 = np.array(c7)\n", 566 | "c8 = np.array(c8)\n", 567 | "c9 = np.array(c9)\n", 568 | "c10 = np.array(c10)\n", 569 | "c11 = np.array(c11)\n", 570 | "c12 = np.array(c12)\n", 571 | "c13 = np.array(c13)\n", 572 | "c14 = np.array(c14)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "q1 = np.concatenate((c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14),axis=0)\n", 582 | "q1.shape" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "#创建一个数组标签\n", 592 | "label_1 = np.ones((q1.shape[0],), dtype=int)\n", 593 | "\n", 594 | "label_1.shape" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "#将正常类数据和攻击数据合并到一起,将标签也合并到一起\n", 611 | "data_end = np.concatenate((q0,q1,data2),axis=0)\n", 612 | "label_end = np.concatenate((label_zc,label_1,label214),axis=0)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "data_end.shape" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "sorted(Counter(label_end).items())" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "label_end = label_end.reshape(label_end.shape[0],1)" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "#将最终采样后的数据保存成文件\n", 649 | 
"np.save(\"E:/IDS/cicdata/K-means+SMOTE/data.npy\",data_end)\n", 650 | "np.save(\"E:/IDS/cicdata/K-means+SMOTE/label.npy\",label_end)" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [] 659 | } 660 | ], 661 | "metadata": { 662 | "kernelspec": { 663 | "display_name": "Python 3", 664 | "language": "python", 665 | "name": "python3" 666 | }, 667 | "language_info": { 668 | "codemirror_mode": { 669 | "name": "ipython", 670 | "version": 3 671 | }, 672 | "file_extension": ".py", 673 | "mimetype": "text/x-python", 674 | "name": "python", 675 | "nbconvert_exporter": "python", 676 | "pygments_lexer": "ipython3", 677 | "version": "3.7.4" 678 | } 679 | }, 680 | "nbformat": 4, 681 | "nbformat_minor": 2 682 | } 683 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/ROS,SMOTE,ADASYN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stderr", 20 | "output_type": "stream", 21 | "text": [ 22 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 23 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 24 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 25 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 26 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 27 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 28 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 29 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 30 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 31 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 32 | "f:\\anaconda\\envs\\py37\\lib\\site-packages\\tensorflow\\python\\framework\\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 33 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "WARNING:tensorflow:From :15: tf_record_iterator (from 
tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n", 41 | "Instructions for updating:\n", 42 | "Use eager execution and: \n", 43 | "`tf.data.TFRecordDataset(path)`\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#把所有数据整合到一起,list1是数据,list2是二分类标签,list10是10分类标签。\n", 49 | "#Put all the data together, list1 is the data, list2 is the dichotomy label, and list10 is the 10-category label.\n", 50 | "from collections import Counter\n", 51 | "import os\n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "import tensorflow as tf\n", 55 | "\n", 56 | "#读取tf文件数据\n", 57 | "#Read the tf file data\n", 58 | "list1= []#data\n", 59 | "list2=[] #label_2\n", 60 | "list10=[]#label_10\n", 61 | "\n", 62 | "for serialized_example in tf.python_io.tf_record_iterator(\"E:/IDS/normalized/train_select_12.tfrecords\"): \n", 63 | "\n", 64 | "\n", 65 | " example = tf.train.Example()\n", 66 | " example.ParseFromString(serialized_example)\n", 67 | " \n", 68 | "\n", 69 | " feature = example.features.feature['features'].float_list.value \n", 70 | " label_2 = example.features.feature['label_2'].float_list.value\n", 71 | " label_10 = example.features.feature['label_10'].float_list.value\n", 72 | " \n", 73 | " list1.append(feature) \n", 74 | " list2.append(label_2)\n", 75 | " list10.append(label_10)\n", 76 | "\n", 77 | " \n", 78 | "# for serialized_example in tf.python_io.tf_record_iterator(\"/home/hll/IDS/normalized/test_select_12.tfrecords\"): \n", 79 | "\n", 80 | "\n", 81 | "# example = tf.train.Example()\n", 82 | "# example.ParseFromString(serialized_example)\n", 83 | " \n", 84 | "\n", 85 | "# feature = example.features.feature['features'].float_list.value \n", 86 | "# label_2 = example.features.feature['label_2'].float_list.value\n", 87 | "# label_10 = example.features.feature['label_10'].float_list.value\n", 88 | " \n", 89 | "# list1.append(feature) \n", 90 | "# list2.append(label_2)\n", 91 | "# list10.append(label_10)\n", 92 | " \n", 93 | "\n", 94 | " \n", 95 | "# for serialized_example in tf.python_io.tf_record_iterator(\"/home/hll/IDS/normalized/validation_select_12.tfrecords\"): \n", 96 | "\n", 97 | "\n", 98 | "# example = tf.train.Example()\n", 99 | "# example.ParseFromString(serialized_example)\n", 100 | " \n", 101 | "\n", 102 | "# feature = example.features.feature['features'].float_list.value \n", 103 | "# label_2 = example.features.feature['label_2'].float_list.value\n", 104 | "# label_10 = example.features.feature['label_10'].float_list.value\n", 105 | " \n", 106 | "# list1.append(feature) \n", 107 | "# list2.append(label_2)\n", 108 | "# list10.append(label_10)\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 3, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "array([0, 0, 0, ..., 0, 0, 0])" 120 | ] 121 | }, 122 | "execution_count": 3, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "X=np.array(list1) \n", 129 | "b2=np.array(list2)\n", 130 | "bb2=b2.reshape(b2.shape[0],) \n", 131 | "b10=np.array(list10)\n", 132 | "bb10=b10.reshape(b10.shape[0],)\n", 133 | "y2 = np.int32(bb2) \n", 134 | "y10 = np.int32(bb10)\n", 135 | "y10" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "Counter({0: 1553132, 1: 224898})\n", 148 | "Counter({0: 1553132, 6: 150836, 4: 31167, 5: 16972, 3: 11449, 
7: 9791, 1: 1874, 2: 1630, 8: 1057, 9: 122})\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "#View the distribution of data classes (data distribution without imbalanced processing)\n", 154 | "#查看数据类的分布(未不平衡处理的数据分布)\n", 155 | "print(Counter(y2)) \n", 156 | "print(Counter(y10))" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 11, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "time: 238.82638692855835\n" 169 | ] 170 | }, 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "[(0, 1548999), (1, 1548999)]" 175 | ] 176 | }, 177 | "execution_count": 11, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "from imblearn.over_sampling import SMOTE\n", 184 | "import time\n", 185 | "time_start = time.time()\n", 186 | "smo = SMOTE(random_state=42) \n", 187 | "\n", 188 | "X_smo, y_smo = smo.fit_sample(X_tl2, y_tl2) \n", 189 | "\n", 190 | "time_end = time.time()\n", 191 | "time = time_end - time_start\n", 192 | "print(\"time:\",time)\n", 193 | "\n", 194 | "sorted(Counter(y_smo).items())" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "(3097998, 1)\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "yy2 = y_smo.reshape(y_smo.shape[0],1) \n", 212 | "print(yy2.shape)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 13, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#Save the data as an npy file\n", 222 | "#把数据存成npy格式的文件\n", 223 | "np.save(\"E:/IDS/alldata/12/tomektrain/data.npy\",X_smo) \n", 224 | "np.save(\"E:/IDS/alldata/12/tomektrain/label_2.npy\",yy2)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# yy2 = y2.reshape(y2.shape[0],1) #将标签再重塑为列向量形式,最终分类需要这种格式\n", 234 | "# print(yy2.shape)\n", 235 | "# yy10 = y10.reshape(y10.shape[0],1) #将标签再重塑为列向量形式,最终分类需要这种格式\n", 236 | "# print(yy10.shape)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# np.save(\"/home/hll/IDS/alldata/12/val/data.npy\",X) #把数据存成npy格式的文件\n", 246 | "# np.save(\"/home/hll/IDS/alldata/12/val/label_2.npy\",yy2)\n", 247 | "# np.save(\"/home/hll/IDS/alldata/12/val/label_10.npy\",yy10)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "# test set does not do processing" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "#保存测试集数据\n", 264 | "#Save the test set data\n", 265 | "list11= []#data\n", 266 | "list22=[] #label_2\n", 267 | "list1010=[]#label_10\n", 268 | "\n", 269 | "for serialized_example in tf.python_io.tf_record_iterator(\"normalized/test_select_12.tfrecords\"): \n", 270 | "\n", 271 | " example = tf.train.Example()\n", 272 | " example.ParseFromString(serialized_example)\n", 273 | "\n", 274 | " feature = example.features.feature['features'].float_list.value \n", 275 | " label_2 = example.features.feature['label_2'].float_list.value\n", 276 | " label_10 = example.features.feature['label_10'].float_list.value\n", 277 | " \n", 278 | " list11.append(feature) \n", 279 | " list22.append(label_2)\n", 280 | " 
list1010.append(label_10)\n", 281 | " " 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "XX=np.array(list11) \n", 291 | "b22=np.array(list22)\n", 292 | "bb22=b22.reshape(b22.shape[0],) \n", 293 | "b1010=np.array(list1010)\n", 294 | "bb1010=b1010.reshape(b1010.shape[0],)\n", 295 | "y22 = np.int32(bb22) \n", 296 | "y1010 = np.int32(bb1010)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "print(y22.shape)\n", 306 | "print(y1010.shape) \n", 307 | "print(XX.shape)\n", 308 | "print(Counter(y22)) \n", 309 | "print(Counter(y1010))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "yy22 = y22.reshape(y22.shape[0],1) \n", 319 | "print(yy22.shape)\n", 320 | "yy1010 = y1010.reshape(y1010.shape[0],1) \n", 321 | "print(yy1010.shape)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "np.save(\"alldata/data_12_test.npy\",XX) \n", 331 | "np.save(\"alldata/label_2_12_test.npy\",yy22)\n", 332 | "np.save(\"alldata/label_10_12_test.npy\",yy1010)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "# ROS-binary-class" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "from imblearn.over_sampling import RandomOverSampler\n", 349 | "\n", 350 | "ros = RandomOverSampler(random_state=0)\n", 351 | "X_ros, y_ros = ros.fit_sample(X, y2)\n", 352 | "\n", 353 | "sorted(Counter(y_ros).items())" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "yy2 = y_ros.reshape(y_ros.shape[0],1) \n", 363 | "yy2.shape" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "np.save(\"/home/hll/IDS/cicdata/77_2/data_ros_train.npy\",X_ros) \n", 373 | "np.save(\"/home/hll/IDS/cicdata/77_2/label_ros_train.npy\",yy2)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "# ROS-multi-class" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "from imblearn.over_sampling import RandomOverSampler\n", 390 | "\n", 391 | "ros = RandomOverSampler(random_state=0)\n", 392 | "X_ros, y_ros = ros.fit_sample(X, y10)\n", 393 | "\n", 394 | "sorted(Counter(y_ros).items())" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "yy10 = y_ros.reshape(y_ros.shape[0],1) \n", 404 | "yy10.shape" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "np.save(\"/home/hll/IDS/alldata/12/rostrain/data_ros10.npy\",X_ros) \n", 414 | "np.save(\"/home/hll/IDS/alldata/12/rostrain/label_10_ros.npy\",yy10)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "# SMOTE-binary-class" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 
null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "from imblearn.over_sampling import SMOTE\n", 431 | "\n", 432 | "smo = SMOTE(random_state=42) \n", 433 | "\n", 434 | "X_smo, y_smo = smo.fit_sample(X, y2) \n", 435 | "sorted(Counter(y_smo).items())" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "yy2 = y_smo.reshape(y_smo.shape[0],1) \n", 445 | "yy2.shape" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "np.save(\"/home/hll/IDS/cicdata/77_2/data_smote_train.npy\",X_smo) \n", 455 | "np.save(\"/home/hll/IDS/cicdata/77_2/label_smote_train.npy\",yy2)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "# SMOTE-multi-class" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "from imblearn.over_sampling import SMOTE\n", 472 | "\n", 473 | "smo = SMOTE(random_state=42) \n", 474 | "\n", 475 | "X_smo, y_smo = smo.fit_sample(X, y10) \n", 476 | "sorted(Counter(y_smo).items())" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "yy10 = y_smo.reshape(y_smo.shape[0],1) \n", 486 | "yy10.shape\n", 487 | "np.save(\"/home/hll/IDS/alldata/12/smotetrain/data_12smo10.npy\",X_smo) \n", 488 | "np.save(\"/home/hll/IDS/alldata/12/smotetrain/label_10_12smo.npy\",yy10)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "yy10.shape" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "# ADASYN-binary-class" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "from imblearn.over_sampling import ADASYN\n", 514 | "X_adasyn, y_adasyn = ADASYN().fit_sample(X, y2)\n", 515 | "\n", 516 | "print(sorted(Counter(y_adasyn).items()))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "yy2 = y_adasyn.reshape(y_adasyn.shape[0],1) \n", 526 | "yy2.shape\n", 527 | "np.save(\"/home/hll/IDS/cicdata/77_2/data_adasyn_train.npy\",X_adasyn) \n", 528 | "np.save(\"/home/hll/IDS/cicdata/77_2/label_adasyn_train.npy\",yy2)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "# ADASYN-multi-class" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "from imblearn.over_sampling import ADASYN\n", 545 | "X_adasyn, y_adasyn = ADASYN(ratio={1:10000,2:10000,3:100000,4:100000,\n", 546 | " 5:100000,6:1000000,7:10000,8:10000,9:1000},random_state=42).fit_sample(X, y10)\n", 547 | "\n", 548 | "print(sorted(Counter(y_adasyn).items()))" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "yy2 = y_adasyn.reshape(y_adasyn.shape[0],1) \n", 558 | "yy2.shape\n", 559 | "np.save(\"alldata/ADASYN/data_12adasyn10.npy\",X_adasyn) \n", 560 | "np.save(\"alldata/ADASYN/label_10_12adasyn.npy\",yy2)" 561 | ] 562 | } 563 | ], 
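"#combine_dataset reads the nominal columns (IPs, ports, protocol/state/service names and\n", "#the attack label) as str so pandas does not coerce them to numbers, and everything else\n", "#as float32, which roughly halves memory versus the float64 default on the roughly\n", "#2.5-million-row UNSW-NB15 CSVs.\n", "\n", 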
564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "Python 3", 567 | "language": "python", 568 | "name": "python3" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 3 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython3", 580 | "version": "3.6.9" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 2 585 | } 586 | -------------------------------------------------------------------------------- /data preprocessing(UNSW-NB15).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stderr", 20 | "output_type": "stream", 21 | "text": [ 22 | "/home/hll/anaconda3/envs/tensorflow/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", 23 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", 24 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", 25 | " warnings.warn(msg, FutureWarning)\n", 26 | "/home/hll/anaconda3/envs/tensorflow/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", 27 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", 28 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", 29 | " warnings.warn(msg, FutureWarning)\n", 30 | "/home/hll/anaconda3/envs/tensorflow/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", 31 | "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", 32 | "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", 33 | " warnings.warn(msg, FutureWarning)\n", 34 | "/home/hll/anaconda3/envs/tensorflow/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. 
106 | "#Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric\n", 107 | "#删除不重要的特征,one-hot编码,将攻击类别转换为数值型\n", 108 | "def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal, cols_nominal_all):\n", 109 | " \n", 110 | "\n", 111 | "\t# Drop features that have no meaning, such as src ip. 删除不重要的特征\n", 112 | "\tfor cols in cols_to_drop:\n", 113 | "\t\tdataset.drop(cols, axis = 1, inplace = True)\n", 114 | "\n", 115 | "\t# Save the label and then drop it from dataset 保留标签然后将它从数据集中删除(提取出标签列)\n", 116 | "\tlabel_10 = dataset['label_10']\n", 117 | "\tlabel_2 = dataset['label_2']\n", 118 | "\tdataset.drop('label_2', axis = 1, inplace = True)\n", 119 | "\tdataset.drop('label_10', axis = 1, inplace = True)\n", 120 | "\n", 121 | "\t# replace the label with specific code 将标签数值化\n", 122 | "\treplace_dict = { np.nan: 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,\n", 123 | " 'Exploits':4,' Fuzzers': 5, ' Fuzzers ':5, 'Generic': 6,\n", 124 | " 'Reconnaissance': 7, ' Shellcode ':8, 'Shellcode': 8,\n", 125 | " 'Worms':9, ' Reconnaissance ': 7,}\n", 126 | "\tnew_label_10 = label_10.replace(replace_dict)\n", 127 | "\tnew_label_10.to_frame() \n", 128 | "\tlabel_2.to_frame() \n", 129 | "\tdel label_10\n", 130 | "\n", 131 | "\t# replace the lost values 用0替换缺失值\n", 132 | "\treplace_dict = {np.nan: 0, ' ': 0}\n", 133 | "\tfor cols in ['ct_ftp', 'ct_flw', 'is_ftp']:\n", 134 | "\t\tdataset[cols] = dataset[cols].replace(replace_dict)\n", 135 | "\n", 136 | "\t# 'is_ftp' column is wrong, correct it (I found that its values are\n", 137 | "\t# all the same as ct_ftp_cmd, so whenever the value is not 0, is_ftp\n", 138 | "\t# should be 1)\n", 139 | "\tdataset.loc[dataset['is_ftp'] != 0, 'is_ftp'] = 1\n", 142 | "\n", 
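"\t# The two-step encoding below first integer-codes each nominal column (LabelEncoder),\n", "\t# then expands the integers into 0/1 indicator columns (OneHotEncoder), e.g. a proto\n", "\t# value 'tcp' becomes a single 1 in the 'proto_tcp' column.\n", 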
143 | "\t# select and process the categorical features 选择并处理分类特征\n", 144 | "\tdata_nominal = dataset[cols_nominal] #cols_nominal holds the nominal column names; extract those columns\n", 145 | "\tdata_temp_1 = data_nominal.apply(LabelEncoder().fit_transform) #integer-encode the nominal columns\n", 146 | "\tdel data_nominal\n", 147 | "\n", 148 | "\n", 149 | "\tnew_col_names = []\n", 150 | "\tfor col_name in cols_nominal:\n", 151 | "\t\tname_unique = sorted(dataset[col_name].unique())\n", 152 | "\t\tnew_col_name = [col_name + '_' + x for x in name_unique]\n", 153 | "\n", 154 | "\t\tnew_col_names.extend(new_col_name)\n", 155 | "\t\tdataset.drop(col_name, axis = 1, inplace = True) \n", 156 | "\n", 157 | "\t#one-hot\n", 158 | "\tenc = OneHotEncoder()\n", 159 | "\tdata_temp_2 = enc.fit_transform(data_temp_1)\n", 160 | "\tdel data_temp_1 \n", 161 | "\n", 162 | "\tdata_encoded = pd.DataFrame(data_temp_2.toarray(), columns = new_col_names)\n", 163 | "\tdel data_temp_2\n", 164 | "\n", 165 | "\t# complement the nominal columns 补充名词性列\n", 166 | "\tdiff = set(cols_nominal_all) - set(new_col_names) \n", 167 | "\n", 168 | "\tif diff:\n", 169 | "\t\tfor cols in diff:\n", 170 | "\t\t\tdata_encoded[cols] = 0.\n", 171 | "\t\tdata_encoded = data_encoded[cols_nominal_all]\n", 172 | "\n", 173 | "\tdataset= dataset.join(data_encoded) \n", 174 | "\tdel data_encoded\n", 175 | "\n", 176 | "\tdataset = dataset.join(new_label_10)\n", 177 | "\tdataset = dataset.join(label_2)\n", 178 | "\n", 179 | "\treturn dataset #Complete data set (including data and labels)\n", 180 | " #完整的数据集(包括数据和标签)\n", 181 | "\n", 182 | "#Split the training set and test set and save the file as a CSV file\n", 183 | "#分裂训练集和测试集,并将文件保存成CSV文件\n", 184 | "def split_dataset(dataset, file_train, file_test): \n", 185 | "\n", 186 | "\tcols = dataset.columns\n", 187 | "\t#trainset, testset = train_test_split(dataset, test_size = 0.2)\n", 188 | "\ttrainset, testset = train_test_split(dataset, test_size = 0.2,random_state=40,stratify=dataset['label_10'])\n", 189 | "\ttrain = pd.DataFrame(trainset, columns = cols)\n", 190 | "\ttest = pd.DataFrame(testset, columns = cols)\n", 
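"\t# stratify=dataset['label_10'] keeps the per-class proportions identical in the train\n", "\t# and test splits, so even the rarest attack classes (e.g. Worms) still appear in the\n", "\t# 20% test portion of every file.\n", 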
"\n", 192 | "\ttrain.to_csv(file_train)\n", 193 | "\ttest.to_csv(file_test)\n", 194 | "\n", 195 | "#Standardize, and save the file in CSV and tf formats\n", 196 | "#标准化,并将文件保存成csv格式和tf格式\n", 197 | "def scaling(files_train, files_test, col_names_scaling, scaling_type):\n", 198 | "\n", 199 | "\tif scaling_type == 'min_max':\n", 200 | "\t\tscaler = MinMaxScaler()\n", 201 | "\t\tfile_folder = 'min_max/'\n", 202 | "\telse:\n", 203 | "\t\tscaler = StandardScaler()\n", 204 | "\t\tfile_folder = 'normalized/'\n", 205 | "\n", 206 | "\tif not os.path.exists(file_folder):\n", 207 | "\t\tos.mkdir(file_folder)\n", 208 | "\tcols = []\n", 209 | "\tfor file in files_train:\n", 210 | "\t\t# col 0 is the index in the file\n", 211 | "\t\ttrainset = pd.read_csv(file, index_col = 0, dtype = np.float32)\n", 212 | "\t\tif len(cols) == 0:\n", 213 | "\t\t\tcols = trainset.columns\n", 214 | "\t\tscaler.partial_fit(trainset[col_names_scaling])\n", 215 | "\n", 216 | "\tdel trainset\n", 217 | "\tcols_keep = list(set(cols) - set(col_names_scaling))\n", 218 | "\n", 219 | "\tfor file in files_train:\n", 220 | "\t\ttrainset = pd.read_csv(file, dtype = np.float32)\n", 221 | "\t\ttrain_scaled = scaler.transform(trainset[col_names_scaling])\n", 222 | "\t\ttrain_changed = pd.DataFrame(train_scaled, columns = col_names_scaling)\n", 223 | "\t\ttrain_unchanged = trainset[cols_keep]\n", 224 | "\t\ttrainset_final = pd.concat((train_changed, train_unchanged),\n", 225 | "\t\t axis = 1)\n", 226 | "\t\ttrainset_final = trainset_final[cols]\n", 227 | "\t\tprint(\"train:\",trainset_final.shape) #trainset shape\n", 228 | "\t\tfile_csv = file_folder + file\n", 229 | "\t\ttrainset.to_csv(file_csv, index = False)\n", 230 | "\t\tlen_tail = len('.csv') \n", 231 | "\t\tfile_tfr = file_folder + file[:-1 * len_tail] + '.tfrecords'\n", 232 | "\t\tmake_tfrecords(trainset_final, file_tfr)\n", 233 | "\n", 234 | "\tfor file in files_test:\n", 235 | "\t\ttestset = pd.read_csv(file, dtype = np.float32)\n", 236 | "\t\ttest_scaled = scaler.transform(testset[col_names_scaling])\n", 237 | "\t\ttest_changed = pd.DataFrame(test_scaled, columns = col_names_scaling)\n", 238 | "\t\ttest_unchanged = testset[cols_keep]\n", 239 | "\t\ttestset_final = pd.concat((test_changed, test_unchanged),axis = 1)\n", 240 | "\t\ttestset_final = testset_final[cols]\n", 241 | "\t\tprint(\"test:\",testset_final.shape)\n", 242 | "\t\tfile_csv = file_folder + file\n", 243 | "\t\ttestset.to_csv(file_csv, index = False)\n", 244 | "\t\tlen_tail = len('.csv')\n", 245 | "\t\tfile_tfr = file_folder + file[:-1 * len_tail] + '.tfrecords'\n", 246 | "\t\tmake_tfrecords(testset_final, file_tfr)\n", 247 | "\n", 248 | "#Save the file in tf format\n", 249 | "#将文件保存成tf格式\n", 250 | "def make_tfrecords(dataset, file_to_save): \n", 251 | "\n", 252 | "\ttry:\n", 253 | "\t\tdata = dataset.values\n", 254 | "\texcept:\n", 255 | "\t\tdata = dataset\n", 256 | "\twith tf.python_io.TFRecordWriter(file_to_save) as writer:\n", 257 | "\t\tfor rows in data:\n", 258 | "\t\t\tfeatures, label_10, label_2 = rows[:-2], rows[-2], rows[-1]\n", 259 | "\t\t\tfeature = {'features': tf.train.Feature(float_list = tf.train.FloatList(value = features)),\n", 260 | "\t\t\t 'label_2': tf.train.Feature(float_list = tf.train.FloatList(value = [label_2])),\n", 261 | "\t\t\t 'label_10': tf.train.Feature(float_list = tf.train.FloatList(value = [label_10]))}\n", 262 | "\t\t\texample = tf.train.Example(features = tf.train.Features(feature = feature))\n", 263 | "\t\t\twriter.write(example.SerializeToString())\n", 264 | "\n", 265 | "def 
265 | "def next_batch(filename, batch_size):\n",
266 | "\n",
267 | "\tlen_feature = 202 # number of features (not including the labels)\n",
268 | "\tlen_label = 1 # length of each label\n",
269 | "\n",
270 | "\tdef read_data(examples):\n",
271 | "\t\tfeatures = {\"features\": tf.FixedLenFeature([len_feature], tf.float32),\n",
272 | "                \"label_2\": tf.FixedLenFeature([len_label], tf.float32),\n",
273 | "                \"label_10\": tf.FixedLenFeature([len_label], tf.float32)}\n",
274 | "\t\tparsed_features = tf.parse_single_example(examples, features)\n",
275 | "\t\treturn parsed_features['features'], parsed_features['label_2'], \\\n",
276 | "               parsed_features['label_10']\n",
277 | "\n",
278 | "\tdata = tf.data.TFRecordDataset(filename)\n",
279 | "\tdata = data.map(read_data)\n",
280 | "\tdata = data.batch(batch_size)\n",
281 | "\titerator = data.make_one_shot_iterator()\n",
282 | "\tnext_data, next_label_2, next_label_10 = iterator.get_next()\n",
283 | "\n",
284 | "\treturn next_data, next_label_10, next_label_2\n",
285 | "\n",
286 | "\n",
287 | "\n",
288 | "\n",
289 | "if __name__ == '__main__':\n",
290 | "\n",
291 | "\tfile_folder = '/home/hll/IDS/UNSW-NB15 - CSV Files/' # location of the raw UNSW-NB15 CSV files\n",
292 | "\tcol_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',\n",
293 | "\t             'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',\n",
294 | "\t             'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',\n",
295 | "\t             'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',\n",
296 | "\t             'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',\n",
297 | "\t             'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',\n",
298 | "\t             'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',\n",
299 | "\t             'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',\n",
300 | "\t             'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2'] # feature (column) names\n",
301 | "\n",
302 | "\tcols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport'] \n",
303 | "\tcols_nominal = ['proto', 'service', 'state'] # nominal features\n",
304 | "\n",
305 | "\tfiles = [file_folder + 'UNSW-NB15_' + str(i+1) + '.csv' for i in range(4)] \n",
306 | "\tdataset = combine_dataset(files, col_names) \n",
307 | "\tcols_nominal_all = get_nominal_names(dataset, cols_nominal) \n",
308 | "\tdel dataset \n",
309 | "\n",
310 | "\tfile_tail = len('.csv') \n",
311 | "\tfile_head = len(file_folder + 'UNSW-NB15_') \n",
312 | "\tdtypes = {} \n",
313 | "\tfor col_name in col_names:\n",
314 | "\t\tnominal_names = set(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',\n",
315 | "\t\t\t 'service', 'is_ftp', 'ct_flw', 'ct_ftp', 'label_10'])\n",
316 | "\t\tif col_name in nominal_names:\n",
317 | "\t\t\tdtypes[col_name] = str \n",
318 | "\t\telse:\n",
319 | "\t\t\tdtypes[col_name] = np.float32 \n",
320 | "\n",
321 | "\tfor file in files:\n",
322 | "\t\tfile_train = file[file_head:-1 * file_tail] + '_train.csv' # per-file training-set CSV\n",
323 | "                # (each raw file is split into its own train and test CSV)\n",
324 | "\t\tfile_test = file[file_head: -1 * file_tail] + '_test.csv'\n",
325 | "\t\tdataset = pd.read_csv(file, header = None, names = col_names, dtype = dtypes) \n",
326 | "\t\tdataset = select_feature_and_encoding(dataset, cols_to_drop, cols_nominal,\n",
327 | "\t\t                                      cols_nominal_all) \n",
328 | "\t\tsplit_dataset(dataset, file_train, file_test) \n",
329 | "\n",
330 | "\tcols_unchanged = ['is_ftp', 'is_sm_ips'] + cols_nominal +\\\n",
331 | "\t                 cols_to_drop + ['label_2', 'label_10'] \n",
332 | "\tcols_scaling = [x for x in col_names if x not in cols_unchanged] # numeric columns to standardize\n",
333 | "\n",
334 | "\tfiles_train = [str(x + 1) + '_train.csv' for x in range(4)] \n",
335 | "\tfiles_test = [str(x + 1) + '_test.csv' for x in range(4)] \n",
336 | "\n",
337 | "\tscaling(files_train, files_test, cols_scaling, 'std') # standardize ('std' selects StandardScaler)\n",
338 | "\n",
339 | "\tfile_folder = 'normalized/' # folder where the standardized data is stored\n",
340 | "\tfiles_train = [file_folder + str(x + 1) + '_train.tfrecords' for x in range(4)] \n",
341 | "\tfiles_test = [file_folder + str(x + 1) + '_test.tfrecords' for x in range(4)] \n"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 10,
347 | "metadata": {},
348 | "outputs": [],
349 | "source": [
350 | "# Integrate the four separate datasets into whole train/validation/test sets\n",
351 | "# (a stratified 12.5% slice of the training data becomes the validation set)\n",
352 | "def make_whole_datasets(tfrecords_train, num_train_example, tfrecords_test,\n",
353 | "                        num_test_example): \n",
354 | "\n",
355 | "    data_train, label_10_train, label_2_train = next_batch(tfrecords_train, num_train_example)\n",
356 | "    data_test, label_10_test, label_2_test = next_batch(tfrecords_test, num_test_example)\n",
357 | "    with tf.Session() as sess:\n",
358 | "        data, label_10, label_2 = sess.run([data_train, label_10_train, label_2_train])\n",
359 | "        dataset = np.concatenate([data, label_10, label_2], axis = 1)\n",
360 | "\n",
361 | "    #trainset, valiset = train_test_split(dataset, test_size = 254004, stratify = dataset['label_10']) \n",
362 | "    trainset, valiset = train_test_split(dataset, test_size = 0.125, random_state = 40, stratify = dataset[:,-2]) # stratify on label_10\n",
363 | "    print(\"train:\", trainset.shape)\n",
364 | "    print(\"val:\", valiset.shape) \n",
365 | "\n",
366 | "    make_tfrecords(trainset, 'normalized/train.tfrecords') \n",
367 | "    make_tfrecords(valiset, 'normalized/validation.tfrecords')\n",
368 | "\n",
369 | "    del trainset, valiset\n",
370 | "\n",
371 | "    with tf.Session() as sess:\n",
372 | "        data, label_10, label_2 = sess.run([data_test, label_10_test, label_2_test])\n",
373 | "        dataset = np.concatenate([data, label_10, label_2], axis = 1)\n",
374 | "    print(\"test:\", dataset.shape) \n",
375 | "    make_tfrecords(dataset, 'normalized/test.tfrecords')"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 11,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "name": "stdout",
385 | "output_type": "stream",
386 | "text": [
387 | "train: (1778030, 204)\n",
388 | "val: (254005, 204)\n",
389 | "test: (508012, 204)\n"
390 | ]
391 | }
392 | ],
393 | "source": [
394 | "num_train_example = 2032035 # training-set size\n",
395 | "num_test_example = 508012 # test-set size \n",
396 | "make_whole_datasets(files_train, num_train_example, files_test, num_test_example)"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": []
405 | }
406 | ],
407 | "metadata": {
408 | "kernelspec": {
409 | "display_name": "Python 3",
410 | "language": "python",
411 | "name": "python3"
412 | },
413 | "language_info": {
414 | "codemirror_mode": {
415 | "name": "ipython",
416 | "version": 3
417 | },
418 | "file_extension": ".py",
419 | "mimetype": "text/x-python",
420 | "name": "python",
421 | "nbconvert_exporter": "python",
422 | "pygments_lexer": "ipython3",
423 | "version": "3.6.9"
424 | }
425 | },
426 | "nbformat": 4,
427 | "nbformat_minor": 2
428 | }
429 | --------------------------------------------------------------------------------
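The preprocessing above ends by materializing standardized train/validation/test TFRecords (and CSVs), which feed the imbalance-processing notebooks. As a minimal sketch of the RUS + SMOTE variant, assuming imbalanced-learn is installed; the `.npy` paths and the `sampling_strategy` ratios below are illustrative placeholders, not the exact settings used in the notebooks or the paper:

```python
# Sketch only: undersample the majority class, then SMOTE the minority class.
from collections import Counter

import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Hypothetical paths to arrays exported from the preprocessed training set.
X = np.load('train/data.npy')
y = np.load('train/label_2.npy').reshape(-1).astype(np.int32)

resample = Pipeline([
    # Randomly discard majority-class flows until minority/majority = 0.5 ...
    ('rus', RandomUnderSampler(sampling_strategy=0.5, random_state=40)),
    # ... then synthesize minority-class flows up to a 1:1 balance.
    ('smote', SMOTE(sampling_strategy=1.0, random_state=40)),
])
X_res, y_res = resample.fit_resample(X, y)
print(sorted(Counter(y_res).items()))
```

Undersampling the majority class first reduces the amount of synthetic data SMOTE must generate to reach balance, which keeps the resampled training set at a manageable size.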