├── .gitignore
├── README.md
├── ml
│   ├── __init__.py
│   ├── dataset.py
│   ├── model.py
│   └── utils.py
├── preprocess.py
├── train_model.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# idea
.idea/

# data
processed_data/
UNSW-NB15 - CSV Files/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
LuNet
--------------------------------------------------------------------------------
/ml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/munhouiani/LuNet/baf4b063dd648ba448b22a77df7e130dd501f305/ml/__init__.py
--------------------------------------------------------------------------------
/ml/dataset.py:
--------------------------------------------------------------------------------
import numpy as np
from torch.utils.data import Dataset


class UNSWNB15Dataset(Dataset):
    def __init__(self, data_path):
        self.data = np.load(data_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        attack_cat = self.data[idx, -2]
        label = self.data[idx, -1]
        feature = self.data[idx, 0:-2]

        # expand dims for matching batch & feature vector
        feature = np.expand_dims(feature, axis=0)

        return {
            'feature': feature,
            'attack_cat': attack_cat,
            'label': label
        }
--------------------------------------------------------------------------------
/ml/model.py:
--------------------------------------------------------------------------------
from argparse import Namespace

import torch
from pytorch_lightning import LightningModule
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

from ml.dataset import UNSWNB15Dataset


class LuNetBlock(LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(
                in_channels=1,
                out_channels=hparams.conv_out,
                kernel_size=hparams.kernel
            ),
            nn.ReLU()
        )
        self.max_pool = nn.Sequential(
            nn.MaxPool1d(kernel_size=2),
            nn.ReLU()
        )
        # calc the output size after max_pool
        dummy_x = torch.randn(1, 1, hparams.input_dim, requires_grad=False)
        dummy_x = self.conv(dummy_x)
        dummy_x = self.max_pool(dummy_x)
        max_pool_out = dummy_x.shape[1]
        lstm_in = dummy_x.shape[2]

        self.batch_norm = nn.BatchNorm1d(max_pool_out)

        self.lstm = nn.LSTM(lstm_in, hparams.conv_out)

        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.conv(x)
        x = self.max_pool(x)
        x = self.batch_norm(x)

        # lstm and relu
        x, hidden = self.lstm(x)
        x = F.relu(x)

        # reshape
        x = x.view(x.shape[0], 1, -1)

        # drop out
        x = self.dropout(x)

        return x


class LuNet(LightningModule):
    def __init__(self, hparams):
        super().__init__()

        # config
        self.train_data_path = hparams.train_data_path
        self.val_data_path = hparams.val_data_path
        self.out_dim = hparams.out

        hparams_lu_block_1 = Namespace(**{
            'input_dim': hparams.input_dim,
            'conv_out': hparams.c1_out,
            'kernel': hparams.c1_kernel
        })
        self.lu_block_1 = LuNetBlock(hparams_lu_block_1)

        # use dummy to calc output
        dummy_x = torch.randn(1, 1, hparams.input_dim, requires_grad=False)
        dummy_x = self.lu_block_1(dummy_x)

        hparams_lu_block_2 = Namespace(**{
            'input_dim': dummy_x.shape[2],
            'conv_out': hparams.c2_out,
            'kernel': hparams.c2_kernel
        })
        self.lu_block_2 = LuNetBlock(hparams_lu_block_2)

        dummy_x = self.lu_block_2(dummy_x)

        hparams_lu_block_3 = Namespace(**{
            'input_dim': dummy_x.shape[2],
            'conv_out': hparams.c3_out,
            'kernel': hparams.c3_kernel
        })
        self.lu_block_3 = LuNetBlock(hparams_lu_block_3)

        dummy_x = self.lu_block_3(dummy_x)

        self.conv = nn.Sequential(
            nn.Conv1d(
                in_channels=1,
                out_channels=hparams.final_conv,
                kernel_size=hparams.final_kernel,
            )
        )

        dummy_x = self.conv(dummy_x)
        self.avg_pool = nn.AvgPool1d(kernel_size=hparams.final_conv)

        dummy_x = self.avg_pool(dummy_x)
        self.drop_out = nn.Dropout(p=0.5)

        if self.out_dim == 2:  # binary classification
            self.out = nn.Sequential(
                nn.Linear(
                    in_features=dummy_x.shape[1] * dummy_x.shape[2],
                    out_features=1
                ),
                nn.Sigmoid()
            )
        else:
            self.out = nn.Linear(
                in_features=dummy_x.shape[1] * dummy_x.shape[2],
                out_features=self.out_dim
            )

    def forward(self, x):
        x = self.lu_block_1(x)
        x = self.lu_block_2(x)
        x = self.lu_block_3(x)
        x = self.conv(x)
        x = self.avg_pool(x)
        x = self.drop_out(x)

        # reshape
        x = x.view(x.shape[0], -1)

        x = self.out(x)

        return x

    def train_dataloader(self):
        data_loader = DataLoader(UNSWNB15Dataset(self.train_data_path), batch_size=16, shuffle=True, num_workers=12)

        return data_loader

    def val_dataloader(self):
        data_loader = DataLoader(UNSWNB15Dataset(self.val_data_path), batch_size=16, shuffle=True, num_workers=12)

        return data_loader

    def training_step(self, batch, batch_idx):
        x = batch['feature'].float()
        y_hat = self(x)

        if self.out_dim == 2:  # binary classification
            # reshape target to (batch, 1) so it matches the sigmoid output of y_hat
            y = batch['label'].float().view(-1, 1)
            loss = {'loss': F.binary_cross_entropy(y_hat, y)}
        else:
            y = batch['attack_cat'].long()
            loss = {'loss': F.cross_entropy(y_hat, y)}

        if (batch_idx % 50) == 0:
            self.logger.log_metrics(loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch['feature'].float()
        y_hat = self(x)

        if self.out_dim == 2:  # binary classification
            # reshape target to (batch, 1) so it matches the sigmoid output of y_hat
            y = batch['label'].float().view(-1, 1)
            loss = {'val_loss': F.binary_cross_entropy(y_hat, y)}
        else:
            y = batch['attack_cat'].long()
            loss = {'val_loss': F.cross_entropy(y_hat, y)}

        if (batch_idx % 50) == 0:
            self.logger.log_metrics(loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.RMSprop(self.parameters(), lr=0.001)
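
The layer sizes above are derived at construction time from the dummy forward passes, so the model only needs a consistent hparams namespace. Below is a minimal sketch (not part of the repository) for sanity-checking the forward pass; the hyperparameter values mirror those hard-coded in ml/utils.py, and the data paths are placeholders that are only required because LuNet.__init__ reads them from hparams.

from argparse import Namespace

import torch

from ml.model import LuNet

# hypothetical hparams mirroring ml/utils.py; paths are placeholders for the dataloaders
hparams = Namespace(
    input_dim=196,
    c1_out=64, c1_kernel=64,
    c2_out=128, c2_kernel=128,
    c3_out=256, c3_kernel=256,
    final_conv=512, final_kernel=512,
    out=2,
    train_data_path='processed_data/train.npy',
    val_data_path='processed_data/test.npy',
)

model = LuNet(hparams)
x = torch.randn(4, 1, 196)  # (batch, channel, feature), the shape UNSWNB15Dataset yields after batching
with torch.no_grad():
    y_hat = model(x)
print(y_hat.shape)  # torch.Size([4, 1]) -- sigmoid outputs of the binary head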
--------------------------------------------------------------------------------
/ml/utils.py:
--------------------------------------------------------------------------------
from argparse import Namespace

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

from ml.model import LuNet


def train_lunet(input_dim, c1_out, c1_kernel, c2_out, c2_kernel, c3_out, c3_kernel, final_conv, final_kernel, out_dim,
                train_data_path, test_data_path, num_epoch, gpu, model_path, logger):
    hparams = Namespace(**{
        'input_dim': input_dim,
        'c1_out': c1_out,
        'c1_kernel': c1_kernel,
        'c2_out': c2_out,
        'c2_kernel': c2_kernel,
        'c3_out': c3_out,
        'c3_kernel': c3_kernel,
        'final_conv': final_conv,
        'final_kernel': final_kernel,
        'out': out_dim,
        'train_data_path': train_data_path,
        'val_data_path': test_data_path,
    })
    model = LuNet(hparams)
    trainer = Trainer(max_epochs=num_epoch, gpus=gpu, logger=logger)
    trainer.fit(model)

    # save model
    trainer.save_checkpoint(str(model_path.absolute()))


def train_lunet_binary(train_data_path, test_data_path, gpu, model_path):
    # prepare logger
    logger = TensorBoardLogger('lunet_binary_model_logs', 'lunet_binary_model')

    train_lunet(
        input_dim=196, c1_out=64, c1_kernel=64, c2_out=128, c2_kernel=128, c3_out=256, c3_kernel=256, final_conv=512,
        final_kernel=512, out_dim=2, train_data_path=train_data_path, test_data_path=test_data_path, num_epoch=10,
        gpu=gpu, model_path=model_path, logger=logger
    )


def train_lunet_multiclass(train_data_path, test_data_path, gpu, model_path):
    # prepare logger
    logger = TensorBoardLogger('lunet_multiclass_model_logs', 'lunet_multiclass_model')

    train_lunet(
        input_dim=196, c1_out=64, c1_kernel=64, c2_out=128, c2_kernel=128, c3_out=256, c3_kernel=256, final_conv=512,
        final_kernel=512, out_dim=10, train_data_path=train_data_path, test_data_path=test_data_path, num_epoch=10,
        gpu=gpu, model_path=model_path, logger=logger
    )
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
from pathlib import Path

import click
import numpy as np
import pandas as pd

from utils import preprocess_data


def transform_and_save(source_path, target_path):
    # read data
    data = pd.read_csv(source_path)

    # transform
    data = preprocess_data(data)

    # output
    np.save(str(target_path.absolute()), data)


@click.command()
@click.option('-s', '--source', help='path to the dir containing train and test csv', required=True)
@click.option('-t', '--target', help='path to the dir for persisting transformed numpy array', required=True)
def main(source, target):
    source_dir_path = Path(source)
    train_data_path = source_dir_path / 'UNSW_NB15_training-set.csv'
    test_data_path = source_dir_path / 'UNSW_NB15_testing-set.csv'

    # prepare target path
    target_dir_path = Path(target)
    target_dir_path.mkdir(parents=True, exist_ok=True)
    target_train_path = target_dir_path / 'train'
    target_test_path = target_dir_path / 'test'

    # processing
    transform_and_save(train_data_path, target_train_path)
    transform_and_save(test_data_path, target_test_path)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/train_model.py:
--------------------------------------------------------------------------------
from pathlib import Path

import click

from ml.utils import train_lunet_binary, train_lunet_multiclass


@click.command()
@click.option('-t', '--train_data_path', help='path to train set', required=True)
@click.option('-v', '--val_data_path', help='path to val set', required=True)
@click.option('-m', '--model_path', help='path to persist model', required=True)
@click.option('-a', '--task', help='task type ("binary" or "multiclass")', required=True)
@click.option('--use_gpu', help='whether to use gpu', default=True)
def main(train_data_path, val_data_path, model_path, use_gpu, task):
    # prepare path for model
    model_path = Path(model_path)
    model_path.parent.mkdir(parents=True, exist_ok=True)

    if use_gpu:
        gpu = -1
    else:
        gpu = None

    if task == 'binary':
        train_lunet_binary(train_data_path=train_data_path, test_data_path=val_data_path, gpu=gpu,
                           model_path=model_path)
    elif task == 'multiclass':
        train_lunet_multiclass(train_data_path=train_data_path, test_data_path=val_data_path, gpu=gpu,
                               model_path=model_path)
    else:
        exit('Invalid task type: expected "binary" or "multiclass"')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np

ATTACK_CAT_TO_ID = {
    'Normal': 0,
    'Reconnaissance': 1,
    'Backdoor': 2,
    'DoS': 3,
    'Exploits': 4,
    'Analysis': 5,
    'Fuzzers': 6,
    'Worms': 7,
    'Shellcode': 8,
    'Generic': 9,
}


def proto_to_one_hot(x):
    proto_val = [
        'udp', 'arp', 'tcp', 'igmp', 'ospf', 'sctp', 'gre', 'ggp', 'ip', 'ipnip', 'st2', 'argus', 'chaos', 'egp',
        'emcon', 'nvp', 'pup', 'xnet', 'mux', 'dcn', 'hmp', 'prm', 'trunk-1', 'trunk-2', 'xns-idp', 'leaf-1', 'leaf-2',
        'irtp', 'rdp', 'netblt', 'mfe-nsp', 'merit-inp', '3pc', 'idpr', 'ddp', 'idpr-cmtp', 'tp++', 'ipv6', 'sdrp',
        'ipv6-frag', 'ipv6-route', 'idrp', 'mhrp', 'i-nlsp', 'rvd', 'mobile', 'narp', 'skip', 'tlsp', 'ipv6-no', 'any',
        'ipv6-opts', 'cftp', 'sat-expak', 'ippc', 'kryptolan', 'sat-mon', 'cpnx', 'wsn', 'pvp', 'br-sat-mon', 'sun-nd',
        'wb-mon', 'vmtp', 'ttp', 'vines', 'nsfnet-igp', 'dgp', 'eigrp', 'tcf', 'sprite-rpc', 'larp', 'mtp', 'ax.25',
        'ipip', 'aes-sp3-d', 'micp', 'encap', 'pri-enc', 'gmtp', 'ifmp', 'pnni', 'qnx', 'scps', 'cbt', 'bbn-rcc', 'igp',
        'bna', 'swipe', 'visa', 'ipcv', 'cphb', 'iso-tp4', 'wb-expak', 'sep', 'secure-vmtp', 'xtp', 'il', 'rsvp',
        'unas', 'fc', 'iso-ip', 'etherip', 'pim', 'aris', 'a/n', 'ipcomp', 'snp', 'compaq-peer', 'ipx-n-ip', 'pgm',
        'vrrp', 'l2tp', 'zero', 'ddx', 'iatp', 'stp', 'srp', 'uti', 'sm', 'smp', 'isis', 'ptp', 'fire', 'crtp', 'crudp',
        'sccopmce', 'iplt', 'pipe', 'sps', 'ib', 'icmp', 'rtp'
    ]

    one_hot = [0] * len(proto_val)
    one_hot[proto_val.index(x)] = 1

    return one_hot


def service_to_one_hot(x):
    service_val = [
        '-', 'http', 'ftp', 'ftp-data', 'smtp', 'pop3', 'dns', 'snmp', 'ssl', 'dhcp', 'irc', 'radius', 'ssh'
    ]

    one_hot = [0] * len(service_val)
    one_hot[service_val.index(x)] = 1

    return one_hot


def state_to_one_hot(x):
    state_val = ['INT', 'FIN', 'REQ', 'ACC', 'CON', 'RST', 'CLO', 'ECO', 'PAR', 'URN', 'no']

    one_hot = [0] * len(state_val)
    one_hot[state_val.index(x)] = 1

    return one_hot


def double_sided_log(x):
    return np.sign(x) * np.log(1 + np.abs(x))


def sigmoid(x):
    return np.divide(1, (1 + np.exp(np.negative(x))))


def preprocess_data(data):
    # extract label and drop id
    attack_cat = data['attack_cat'].apply(func=(lambda x: ATTACK_CAT_TO_ID.get(x))).to_numpy().reshape(-1, 1)
    label = data['label'].to_numpy().reshape(-1, 1)
    data = data.drop(['id', 'attack_cat', 'label'], axis=1)

    # categorical to one-hot encoding
    # extract and transform categorical to one-hot
    proto = np.array(data['proto'].apply(lambda x: proto_to_one_hot(x)).to_list())
    service = np.array(data['service'].apply(lambda x: service_to_one_hot(x)).to_list())
    state = np.array(data['state'].apply(lambda x: state_to_one_hot(x)).to_list())
    categorical = np.concatenate([proto, service, state], axis=1)
    data = data.drop(['proto', 'service', 'state'], axis=1)

    # transform to np matrix
    data = data.to_numpy()

    # merge categorical features back into data
    data = np.concatenate([data, categorical], axis=1)

    # double-sided log: to attenuate outliers
    data = double_sided_log(data)

    # sigmoid transform
    data = sigmoid(data)

    # merge labels back into data
    data = np.concatenate([data, attack_cat, label], axis=1)

    return data
--------------------------------------------------------------------------------
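
End to end, the repository is driven by the two click entry points: preprocess.py turns the UNSW-NB15 CSVs into numpy matrices (something like `python preprocess.py -s "UNSW-NB15 - CSV Files" -t processed_data`, where the source dir must contain UNSW_NB15_training-set.csv and UNSW_NB15_testing-set.csv; np.save appends the .npy suffix, so the outputs land at processed_data/train.npy and processed_data/test.npy), and train_model.py fits the model (something like `python train_model.py -t processed_data/train.npy -v processed_data/test.npy -m models/lunet_binary.ckpt -a binary`). The directory and file names here are assumptions inferred from preprocess.py and the .gitignore entries, not fixed by the code. A short sketch, assuming those paths, for verifying the preprocessed output before training:

import numpy as np
from torch.utils.data import DataLoader

from ml.dataset import UNSWNB15Dataset

# 'processed_data/train.npy' is an assumed output path of preprocess.py
data = np.load('processed_data/train.npy')
print(data.shape)  # should be (n_rows, 198): 196 preprocessed features + attack_cat + label

dataset = UNSWNB15Dataset('processed_data/train.npy')
loader = DataLoader(dataset, batch_size=16, shuffle=True)

batch = next(iter(loader))
print(batch['feature'].shape)  # (16, 1, 196), matching input_dim=196 used in ml/utils.py
print(batch['label'][:5], batch['attack_cat'][:5])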