├── .gitignore
├── README.md
├── ml
│   ├── __init__.py
│   ├── dataset.py
│   ├── model.py
│   └── utils.py
├── preprocess.py
├── train_model.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# idea
.idea/

# data
processed_data/
UNSW-NB15 - CSV Files/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
LuNet
--------------------------------------------------------------------------------
/ml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/munhouiani/LuNet/baf4b063dd648ba448b22a77df7e130dd501f305/ml/__init__.py
--------------------------------------------------------------------------------
/ml/dataset.py:
--------------------------------------------------------------------------------
import numpy as np
from torch.utils.data import Dataset


class UNSWNB15Dataset(Dataset):
    def __init__(self, data_path):
        self.data = np.load(data_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        attack_cat = self.data[idx, -2]
        label = self.data[idx, -1]
        feature = self.data[idx, 0:-2]

        # expand dims for matching batch & feature vector
        feature = np.expand_dims(feature, axis=0)

        return {
            'feature': feature,
            'attack_cat': attack_cat,
            'label': label
        }
--------------------------------------------------------------------------------
/ml/model.py:
--------------------------------------------------------------------------------
from argparse import Namespace

import torch
from pytorch_lightning import LightningModule
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

from ml.dataset import UNSWNB15Dataset


class LuNetBlock(LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(
                in_channels=1,
                out_channels=hparams.conv_out,
                kernel_size=hparams.kernel
            ),
            nn.ReLU()
        )
        self.max_pool = nn.Sequential(
            nn.MaxPool1d(kernel_size=2),
            nn.ReLU()
        )
        # calc the output size after max_pool
        dummy_x = torch.randn(1, 1, hparams.input_dim, requires_grad=False)
        dummy_x = self.conv(dummy_x)
        dummy_x = self.max_pool(dummy_x)
        max_pool_out = dummy_x.shape[1]
        lstm_in = dummy_x.shape[2]

        self.batch_norm = nn.BatchNorm1d(max_pool_out)

        self.lstm = nn.LSTM(lstm_in, hparams.conv_out)

        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.conv(x)
        x = self.max_pool(x)
        x = self.batch_norm(x)

        # lstm and relu
        x, hidden = self.lstm(x)
        x = F.relu(x)

        # reshape
        x = x.view(x.shape[0], 1, -1)

        # drop out
        x = self.dropout(x)

        return x


class LuNet(LightningModule):
    def __init__(self, hparams):
        super().__init__()

        # config
        self.train_data_path = hparams.train_data_path
        self.val_data_path = hparams.val_data_path
        self.out_dim = hparams.out

        hparams_lu_block_1 = Namespace(**{
            'input_dim': hparams.input_dim,
            'conv_out': hparams.c1_out,
            'kernel': hparams.c1_kernel
        })
        self.lu_block_1 = LuNetBlock(hparams_lu_block_1)

        # use dummy to calc output
        dummy_x = torch.randn(1, 1, hparams.input_dim, requires_grad=False)
        dummy_x = self.lu_block_1(dummy_x)

        hparams_lu_block_2 = Namespace(**{
            'input_dim': dummy_x.shape[2],
            'conv_out': hparams.c2_out,
            'kernel': hparams.c2_kernel
        })
        self.lu_block_2 = LuNetBlock(hparams_lu_block_2)

        dummy_x = self.lu_block_2(dummy_x)

        hparams_lu_block_3 = Namespace(**{
            'input_dim': dummy_x.shape[2],
            'conv_out': hparams.c3_out,
            'kernel': hparams.c3_kernel
        })
        self.lu_block_3 = LuNetBlock(hparams_lu_block_3)

        dummy_x = self.lu_block_3(dummy_x)

        self.conv = nn.Sequential(
            nn.Conv1d(
                in_channels=1,
                out_channels=hparams.final_conv,
                kernel_size=hparams.final_kernel,
            )
        )

        dummy_x = self.conv(dummy_x)
        self.avg_pool = nn.AvgPool1d(kernel_size=hparams.final_conv)

        dummy_x = self.avg_pool(dummy_x)
        self.drop_out = nn.Dropout(p=0.5)

        if self.out_dim == 2:  # binary classification
            self.out = nn.Sequential(
                nn.Linear(
                    in_features=dummy_x.shape[1] * dummy_x.shape[2],
                    out_features=1
                ),
                nn.Sigmoid()
            )
        else:
            self.out = nn.Linear(
                in_features=dummy_x.shape[1] * dummy_x.shape[2],
                out_features=self.out_dim
            )

    def forward(self, x):
        x = self.lu_block_1(x)
        x = self.lu_block_2(x)
        x = self.lu_block_3(x)
        x = self.conv(x)
        x = self.avg_pool(x)
        x = self.drop_out(x)

        # reshape
        x = x.view(x.shape[0], -1)

        x = self.out(x)

        return x

    def train_dataloader(self):
        data_loader = DataLoader(UNSWNB15Dataset(self.train_data_path), batch_size=16, shuffle=True, num_workers=12)

        return data_loader

    def val_dataloader(self):
        data_loader = DataLoader(UNSWNB15Dataset(self.val_data_path), batch_size=16, shuffle=True, num_workers=12)

        return data_loader

    def training_step(self, batch, batch_idx):
        x = batch['feature'].float()
        y_hat = self(x)

        if self.out_dim == 2:  # binary classification
            # reshape target to (batch, 1) so it matches the sigmoid output of y_hat
            y = batch['label'].float().view(-1, 1)
            loss = {'loss': F.binary_cross_entropy(y_hat, y)}
        else:
            y = batch['attack_cat'].long()
            loss = {'loss': F.cross_entropy(y_hat, y)}

        if (batch_idx % 50) == 0:
            self.logger.log_metrics(loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch['feature'].float()
        y_hat = self(x)

        if self.out_dim == 2:  # binary classification
            # reshape target to (batch, 1) so it matches the sigmoid output of y_hat
            y = batch['label'].float().view(-1, 1)
            loss = {'val_loss': F.binary_cross_entropy(y_hat, y)}
        else:
            y = batch['attack_cat'].long()
            loss = {'val_loss': F.cross_entropy(y_hat, y)}

        if (batch_idx % 50) == 0:
            self.logger.log_metrics(loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.RMSprop(self.parameters(), lr=0.001)
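
The layer sizes above are derived at construction time from the dummy forward passes, so the model only needs a consistent hparams namespace. Below is a minimal sketch (not part of the repository) for sanity-checking the forward pass; the hyperparameter values mirror those hard-coded in ml/utils.py, and the data paths are placeholders that are only required because LuNet.__init__ reads them from hparams.

from argparse import Namespace

import torch

from ml.model import LuNet

# hypothetical hparams mirroring ml/utils.py; paths are placeholders for the dataloaders
hparams = Namespace(
    input_dim=196,
    c1_out=64, c1_kernel=64,
    c2_out=128, c2_kernel=128,
    c3_out=256, c3_kernel=256,
    final_conv=512, final_kernel=512,
    out=2,
    train_data_path='processed_data/train.npy',
    val_data_path='processed_data/test.npy',
)

model = LuNet(hparams)
x = torch.randn(4, 1, 196)  # (batch, channel, feature), the shape UNSWNB15Dataset yields after batching
with torch.no_grad():
    y_hat = model(x)
print(y_hat.shape)  # torch.Size([4, 1]) -- sigmoid outputs of the binary head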
--------------------------------------------------------------------------------
/ml/utils.py:
--------------------------------------------------------------------------------
from argparse import Namespace

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

from ml.model import LuNet


def train_lunet(input_dim, c1_out, c1_kernel, c2_out, c2_kernel, c3_out, c3_kernel, final_conv, final_kernel, out_dim,
                train_data_path, test_data_path, num_epoch, gpu, model_path, logger):
    hparams = Namespace(**{
        'input_dim': input_dim,
        'c1_out': c1_out,
        'c1_kernel': c1_kernel,
        'c2_out': c2_out,
        'c2_kernel': c2_kernel,
        'c3_out': c3_out,
        'c3_kernel': c3_kernel,
        'final_conv': final_conv,
        'final_kernel': final_kernel,
        'out': out_dim,
        'train_data_path': train_data_path,
        'val_data_path': test_data_path,
    })
    model = LuNet(hparams)
    trainer = Trainer(max_epochs=num_epoch, gpus=gpu, logger=logger)
    trainer.fit(model)

    # save model
    trainer.save_checkpoint(str(model_path.absolute()))


def train_lunet_binary(train_data_path, test_data_path, gpu, model_path):
    # prepare logger
    logger = TensorBoardLogger('lunet_binary_model_logs', 'lunet_binary_model')

    train_lunet(
        input_dim=196, c1_out=64, c1_kernel=64, c2_out=128, c2_kernel=128, c3_out=256, c3_kernel=256, final_conv=512,
        final_kernel=512, out_dim=2, train_data_path=train_data_path, test_data_path=test_data_path, num_epoch=10,
        gpu=gpu, model_path=model_path, logger=logger
    )


def train_lunet_multiclass(train_data_path, test_data_path, gpu, model_path):
    # prepare logger
    logger = TensorBoardLogger('lunet_multiclass_model_logs', 'lunet_multiclass_model')

    train_lunet(
        input_dim=196, c1_out=64, c1_kernel=64, c2_out=128, c2_kernel=128, c3_out=256, c3_kernel=256, final_conv=512,
        final_kernel=512, out_dim=10, train_data_path=train_data_path, test_data_path=test_data_path, num_epoch=10,
        gpu=gpu, model_path=model_path, logger=logger
    )
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
from pathlib import Path

import click
import numpy as np
import pandas as pd

from utils import preprocess_data


def transform_and_save(source_path, target_path):
    # read data
    data = pd.read_csv(source_path)

    # transform
    data = preprocess_data(data)

    # output
    np.save(str(target_path.absolute()), data)


@click.command()
@click.option('-s', '--source', help='path to the dir containing train and test csv', required=True)
@click.option('-t', '--target', help='path to the dir for persisting transformed numpy array', required=True)
def main(source, target):
    source_dir_path = Path(source)
    train_data_path = source_dir_path / 'UNSW_NB15_training-set.csv'
    test_data_path = source_dir_path / 'UNSW_NB15_testing-set.csv'

    # prepare target path
    target_dir_path = Path(target)
    target_dir_path.mkdir(parents=True, exist_ok=True)
    target_train_path = target_dir_path / 'train'
    target_test_path = target_dir_path / 'test'

    # processing
    transform_and_save(train_data_path, target_train_path)
    transform_and_save(test_data_path, target_test_path)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/train_model.py:
--------------------------------------------------------------------------------
from pathlib import Path

import click

from ml.utils import train_lunet_binary, train_lunet_multiclass


@click.command()
@click.option('-t', '--train_data_path', help='path to train set', required=True)
@click.option('-v', '--val_data_path', help='path to val set', required=True)
@click.option('-m', '--model_path', help='path to persist model', required=True)
@click.option('-a', '--task', help='task type ("binary" or "multiclass")', required=True)
@click.option('--use_gpu', help='whether to use gpu', default=True)
def main(train_data_path, val_data_path, model_path, use_gpu, task):
    # prepare path for model
    model_path = Path(model_path)
    model_path.parent.mkdir(parents=True, exist_ok=True)

    if use_gpu:
        gpu = -1
    else:
        gpu = None

    if task == 'binary':
        train_lunet_binary(train_data_path=train_data_path, test_data_path=val_data_path, gpu=gpu,
                           model_path=model_path)
    elif task == 'multiclass':
        train_lunet_multiclass(train_data_path=train_data_path, test_data_path=val_data_path, gpu=gpu,
                               model_path=model_path)
    else:
        exit('Invalid task type: expected "binary" or "multiclass"')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np

ATTACK_CAT_TO_ID = {
    'Normal': 0,
    'Reconnaissance': 1,
    'Backdoor': 2,
    'DoS': 3,
    'Exploits': 4,
    'Analysis': 5,
    'Fuzzers': 6,
    'Worms': 7,
    'Shellcode': 8,
    'Generic': 9,
}


def proto_to_one_hot(x):
    proto_val = [
        'udp', 'arp', 'tcp', 'igmp', 'ospf', 'sctp', 'gre', 'ggp', 'ip', 'ipnip', 'st2', 'argus', 'chaos', 'egp',
        'emcon', 'nvp', 'pup', 'xnet', 'mux', 'dcn', 'hmp', 'prm', 'trunk-1', 'trunk-2', 'xns-idp', 'leaf-1', 'leaf-2',
        'irtp', 'rdp', 'netblt', 'mfe-nsp', 'merit-inp', '3pc', 'idpr', 'ddp', 'idpr-cmtp', 'tp++', 'ipv6', 'sdrp',
        'ipv6-frag', 'ipv6-route', 'idrp', 'mhrp', 'i-nlsp', 'rvd', 'mobile', 'narp', 'skip', 'tlsp', 'ipv6-no', 'any',
        'ipv6-opts', 'cftp', 'sat-expak', 'ippc', 'kryptolan', 'sat-mon', 'cpnx', 'wsn', 'pvp', 'br-sat-mon', 'sun-nd',
        'wb-mon', 'vmtp', 'ttp', 'vines', 'nsfnet-igp', 'dgp', 'eigrp', 'tcf', 'sprite-rpc', 'larp', 'mtp', 'ax.25',
        'ipip', 'aes-sp3-d', 'micp', 'encap', 'pri-enc', 'gmtp', 'ifmp', 'pnni', 'qnx', 'scps', 'cbt', 'bbn-rcc', 'igp',
        'bna', 'swipe', 'visa', 'ipcv', 'cphb', 'iso-tp4', 'wb-expak', 'sep', 'secure-vmtp', 'xtp', 'il', 'rsvp',
        'unas', 'fc', 'iso-ip', 'etherip', 'pim', 'aris', 'a/n', 'ipcomp', 'snp', 'compaq-peer', 'ipx-n-ip', 'pgm',
        'vrrp', 'l2tp', 'zero', 'ddx', 'iatp', 'stp', 'srp', 'uti', 'sm', 'smp', 'isis', 'ptp', 'fire', 'crtp', 'crudp',
        'sccopmce', 'iplt', 'pipe', 'sps', 'ib', 'icmp', 'rtp'
    ]

    one_hot = [0] * len(proto_val)
    one_hot[proto_val.index(x)] = 1

    return one_hot


def service_to_one_hot(x):
    service_val = [
        '-', 'http', 'ftp', 'ftp-data', 'smtp', 'pop3', 'dns', 'snmp', 'ssl', 'dhcp', 'irc', 'radius', 'ssh'
    ]

    one_hot = [0] * len(service_val)
    one_hot[service_val.index(x)] = 1

    return one_hot


def state_to_one_hot(x):
    state_val = ['INT', 'FIN', 'REQ', 'ACC', 'CON', 'RST', 'CLO', 'ECO', 'PAR', 'URN', 'no']

    one_hot = [0] * len(state_val)
    one_hot[state_val.index(x)] = 1

    return one_hot


def double_sided_log(x):
    return np.sign(x) * np.log(1 + np.abs(x))


def sigmoid(x):
    return np.divide(1, (1 + np.exp(np.negative(x))))


def preprocess_data(data):
    # extract label and drop id
    attack_cat = data['attack_cat'].apply(func=(lambda x: ATTACK_CAT_TO_ID.get(x))).to_numpy().reshape(-1, 1)
    label = data['label'].to_numpy().reshape(-1, 1)
    data = data.drop(['id', 'attack_cat', 'label'], axis=1)

    # categorical to one-hot encoding
    # extract and transform categorical to one-hot
    proto = np.array(data['proto'].apply(lambda x: proto_to_one_hot(x)).to_list())
    service = np.array(data['service'].apply(lambda x: service_to_one_hot(x)).to_list())
    state = np.array(data['state'].apply(lambda x: state_to_one_hot(x)).to_list())
    categorical = np.concatenate([proto, service, state], axis=1)
    data = data.drop(['proto', 'service', 'state'], axis=1)

    # transform to np matrix
    data = data.to_numpy()

    # merge categorical features back into data
    data = np.concatenate([data, categorical], axis=1)

    # double-sided log: to attenuate outliers
    data = double_sided_log(data)

    # sigmoid transform
    data = sigmoid(data)

    # merge labels back into data
    data = np.concatenate([data, attack_cat, label], axis=1)

    return data
--------------------------------------------------------------------------------
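
End to end, the repository is driven by the two click entry points: preprocess.py turns the UNSW-NB15 CSVs into numpy matrices (something like `python preprocess.py -s "UNSW-NB15 - CSV Files" -t processed_data`, where the source dir must contain UNSW_NB15_training-set.csv and UNSW_NB15_testing-set.csv; np.save appends the .npy suffix, so the outputs land at processed_data/train.npy and processed_data/test.npy), and train_model.py fits the model (something like `python train_model.py -t processed_data/train.npy -v processed_data/test.npy -m models/lunet_binary.ckpt -a binary`). The directory and file names here are assumptions inferred from preprocess.py and the .gitignore entries, not fixed by the code. A short sketch, assuming those paths, for verifying the preprocessed output before training:

import numpy as np
from torch.utils.data import DataLoader

from ml.dataset import UNSWNB15Dataset

# 'processed_data/train.npy' is an assumed output path of preprocess.py
data = np.load('processed_data/train.npy')
print(data.shape)  # should be (n_rows, 198): 196 preprocessed features + attack_cat + label

dataset = UNSWNB15Dataset('processed_data/train.npy')
loader = DataLoader(dataset, batch_size=16, shuffle=True)

batch = next(iter(loader))
print(batch['feature'].shape)  # (16, 1, 196), matching input_dim=196 used in ml/utils.py
print(batch['label'][:5], batch['attack_cat'][:5])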