├── .gitignore
├── README.md
├── dp_p_all.py
├── framework.png
├── numvec_dict
├── svIndex
└── sv_list

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Note
Because the dataset is too large, we put the parsed data on Google Drive: https://drive.google.com/drive/folders/1jI5SQdT7wZE64E9baQhCMWzTsudfIMoc?usp=sharing
The parsed data is in `/pased_data`. An early version of the test code and data is also available there.

# DeepSyslog
An implementation of DeepSyslog.
![Framework](framework.png)

## Requirements

- python 3.7
- pytorch >= 1.1.0

## Raw data
You can download the raw data from https://zenodo.org/record/3227177

## Log parsing and preprocessing
We use the [logparser toolkit](https://github.com/logpai/logparser): Spell to parse the HDFS dataset and Drain to parse the BGL dataset.
The raw logs are separated into text data and parameters. Composite words are then split into separate parts and stop words are removed.
Some non-numeric parameters are replaced with class labels such as "ip address", "exception", and "port", and the parameters of each log are saved as well.
A minimal sketch of this preprocessing is shown below.
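The sketch below only illustrates the preprocessing just described: the regex, the class label, and the stop-word list are assumptions, not the exact ones used by DeepSyslog.

```python
import re

# Hypothetical pattern for one parameter class; the real regexes are assumptions.
IP_RE = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?")
STOP_WORDS = {"a", "an", "the", "of", "to", "in", "on", "for"}  # toy stop-word list

def split_composite(token):
    """Split a camelCase word, e.g. 'PacketResponder' -> ['packet', 'responder']."""
    return re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", token).lower().split()

def preprocess(message):
    """Turn one raw log message into the word list fed to the embedding step."""
    message = IP_RE.sub("ip address", message)      # parameter -> class label
    tokens = re.findall(r"[A-Za-z]+|\d+", message)  # also splits snake_case words
    words = []
    for tok in tokens:
        words.extend(w for w in split_composite(tok) if w not in STOP_WORDS)
    return words

# -> ['receiving', 'block', 'blk', '123', 'src', 'ip', 'address']
print(preprocess("Receiving block blk_123 src: /10.250.19.102:54106"))
```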
## Word embedding
See `embedding/w2v model.py`.
Download the pre-trained fastText word vectors from https://fasttext.cc/docs/en/crawl-vectors.html, load the pre-trained model and convert it to a gensim fastText model, then continue training the model on the training data, as sketched below.
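A minimal sketch of this step with gensim. The file names, epoch count, and the tiny `corpus` are placeholders; `corpus` stands for the preprocessed word lists produced above. (Note that `dp_p_all.py` uses 100-dimensional sentence vectors while the crawl vectors are 300-dimensional, so a matching vector size or a dimensionality reduction is assumed.)

```python
from gensim.models.fasttext import load_facebook_model

# corpus: preprocessed log messages, one word list per message (placeholder data)
corpus = [["receiving", "block", "src", "ip", "address"],
          ["packet", "responder", "terminating"]]

# load the pre-trained fastText binary and convert it to a gensim fastText model
model = load_facebook_model("cc.en.300.bin")

# extend the vocabulary with log-specific words, then continue training
model.build_vocab(corpus, update=True)
model.train(corpus, total_examples=len(corpus), epochs=5)

model.save("log_fasttext.model")
```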
## Sentence embedding
See `embedding/sif by fse.py`.
Sentence embedding is implemented with [fse](https://github.com/oborchers/Fast_Sentence_Embeddings): load the trained word embedding and use the SIF model of fse to generate the sentence embeddings, as sketched below.
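A minimal sketch with fse; the model path and the sentences are placeholders. Each parsed log event is treated as one sentence, and the resulting vectors are what `dp_p_all.py` loads as `sv_list`.

```python
from fse import IndexedList
from fse.models import SIF
from gensim.models.fasttext import FastText

# the fastText model continued-trained in the previous step (placeholder path)
ft = FastText.load("log_fasttext.model")

# one word list per parsed log event
sentences = [["receiving", "block", "src", "ip", "address"],
             ["packet", "responder", "terminating"]]

# SIF = smooth-inverse-frequency weighted average + principal component removal
model = SIF(ft, components=1)
model.train(IndexedList(sentences))

# model.sv holds one sentence vector per input sentence
sv_list = [model.sv[i] for i in range(len(sentences))]
```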
## Model
The model is trained and tested in `dp_p_all.py`.
A window of `h` consecutive sentence vectors, concatenated with a vector of accumulated numerical parameters, is fed to an LSTM that predicts the next log event; a block is flagged as anomalous if the true next event is not among the top `g` candidates. This check is sketched below.
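A condensed sketch of the detection rule from `predict()` (shapes and values are illustrative):

```python
import torch

num_candidates = 10          # "g": how many top predictions count as normal
output = torch.randn(1, 50)  # logits over the next-event classes for one window
label = torch.tensor([17])   # index of the event that actually occurred

# the window is normal iff the true next event is among the top-g predictions
top_g = torch.topk(output, num_candidates, dim=1).indices[0]
is_anomalous = label.item() not in top_g.tolist()
print(is_anomalous)
```

`dp_p_all.py` implements the same check with `torch.argsort(output, 1)[0][-num_candidates:]` and reports a block as anomalous as soon as any of its windows fails it.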
## Results

| **Dataset** | **Precision** | **Recall** | **F1** |
| :----: | :----: | :----: | :----: |
| HDFS | 0.97 | 0.99 | 0.98 |
| BGL | 0.98 | 0.97 | 0.975 |

--------------------------------------------------------------------------------
/dp_p_all.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import time
import pickle
import logging

logging.basicConfig(
    filename="dp/dp_p_all.log",
    filemode="a",
    format="%(asctime)s %(name)s:%(levelname)s"
           ":%(filename)s-[line%(lineno)d]-:%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.DEBUG)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)


def generate(window_size):
    # load sentence vectors
    sv_list = load('sv_list')

    # load sentence vector index
    svIndex = load("svIndex")

    # load parameter dict
    d = load("numvec_dict")

    # load training data list: [[line indexes for each block, params for each block]]
    train_normal_blocks = load("train_normal_blocks")
    inputs = []
    num_vecs = []
    labels = []

    for block in train_normal_blocks:
        line = block[0]
        param = block[1]
        # slide a window of `window_size` events over the block;
        # the event right after the window is the prediction target
        for j in range(len(line) - window_size):
            vector = []
            num_vec = [0 for _ in range(numerical_dim)]
            for a in range(j, j + window_size):
                index = svIndex[line[a]]
                vector.append(sv_list[index])

            # accumulate the numerical parameters of every event seen so far
            for k in range(j + window_size):
                index = svIndex[line[k]]
                dindex = d.get(index)
                if dindex is not None:
                    cp = param[k]
                    for pi in range(1, len(cp)):
                        num_vec[dindex] += int(cp[pi])
                        dindex += 1
            # the last dimension holds the first parameter of the newest event in the window
            num_vec[numerical_dim - 1] = param[j + window_size - 1][0]

            inputs.append(vector)
            num_vecs.append(num_vec)
            labels.append(svIndex[line[j + window_size]])

    print('train block number:{}'.format(len(train_normal_blocks)))
    print('training data length:{}'.format(len(inputs)))
    dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float),
                            torch.tensor(num_vecs, dtype=torch.float),
                            torch.tensor(labels))
    return dataset


class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, numerical_dim, mid_size=100):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc0 = nn.Linear(hidden_size + numerical_dim, mid_size)
        # self.batch_norm = nn.BatchNorm1d(numerical_dim)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(mid_size, num_classes)

    def forward(self, input, num_vec):
        h0 = torch.zeros(self.num_layers, input.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, input.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(input, (h0, c0))
        # num_vec = self.batch_norm(num_vec)
        # concatenate the last LSTM hidden state with the numerical parameter vector
        out = torch.cat((out[:, -1, :], num_vec), dim=1)
        out = self.fc0(out)
        out = self.relu(out)
        out = self.fc(out)
        return out


def train(input_size, hidden_size, num_layers, num_classes, numerical_dim, window_size, batch_size, num_epochs, path,
          loginfo):
    model = Model(input_size, hidden_size, num_layers, num_classes, numerical_dim).to(device)
    seq_dataset = generate(window_size)
    dataloader = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_time1 = time.time()

    train_loss = 0
    for epoch in range(num_epochs):
        # step-wise learning rate decay
        if epoch + 1 > 250:
            optimizer.param_groups[0]['lr'] = 1e-6
        elif epoch + 1 > 200:
            optimizer.param_groups[0]['lr'] = 1e-5
        elif epoch + 1 > 100:
            optimizer.param_groups[0]['lr'] = 1e-4
        elif epoch + 1 > 50:
            optimizer.param_groups[0]['lr'] = 0.0005

        train_loss = 0
        t1 = time.time()
        for step, (seq, num_vec, label) in enumerate(dataloader):
            seq = seq.view(-1, window_size, input_size).to(device)
            num_vec = num_vec.view(-1, numerical_dim).to(device)
            output = model(seq, num_vec)
            loss = criterion(output, label.to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        t2 = time.time()
        print('Epoch [{}/{}], time_cost:{} Train_loss: {:.10f}'.format(epoch + 1, num_epochs, t2 - t1,
                                                                       train_loss / len(dataloader.dataset)))
        torch.save(model.state_dict(), path)

    print('Finished Training')
    logging.info(loginfo.format(num_epochs, train_loss / len(dataloader.dataset)))

    train_time2 = time.time()
    logging.info("time cost for training:{}".format(train_time2 - train_time1))


def predict(input_size, hidden_size, num_layers, num_classes, numerical_dim, window_size, model_path, num_candidates):
    model = Model(input_size, hidden_size, num_layers, num_classes, numerical_dim).to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    sv_list = load('sv_list')
    svIndex = load("svIndex")
    d = load("numvec_dict")
    test_normal_blocks = load("test_normal_blocks")
    abnormal_blocks = load("abnormal_blocks")

    print("normal block number:", len(test_normal_blocks))
    print("abnormal block number:", len(abnormal_blocks))

    test_time1 = time.time()
    test_logs = 0

    TP = FP = 0
    start_time = time.time()
    with torch.no_grad():
        for block in abnormal_blocks:
            line = block[0]
            param = block[1]
            # pad blocks that are too short to form a full window with -1
            line = line + [-1] * (window_size + 1 - len(line))
            for j in range(len(line) - window_size):
                test_logs += 1
                # padded (too short) abnormal blocks are counted as detected
                if -1 in line:
                    TP += 1
                    break
                input = []
                for k in range(j, j + window_size):
                    input.append(sv_list[svIndex[line[k]]])

                num_vec = [0 for _ in range(numerical_dim)]
                for k in range(j + window_size):
                    index = svIndex[line[k]]
                    dindex = d.get(index)
                    if dindex is not None:
                        cp = param[k]
                        for pi in range(1, len(cp)):
                            num_vec[dindex] += int(cp[pi])
                            dindex += 1
                num_vec[numerical_dim - 1] = param[j + window_size - 1][0]
                label = svIndex[line[j + window_size]]
                input = torch.tensor(input, dtype=torch.float).view(-1, window_size, input_size).to(device)
                num_vec = torch.tensor(num_vec, dtype=torch.float).view(1, numerical_dim).to(device)
                label = torch.tensor(label).view(-1).to(device)
                output = model(input, num_vec)
                # a block is anomalous if the true next event is not in the top-g candidates
                predicted = torch.argsort(output, 1)[0][-num_candidates:]
                if label not in predicted:
                    TP += 1
                    break
    FN = len(abnormal_blocks) - TP
    print("TP={},FN={}".format(TP, FN))
    print("abnormal test time:", time.time() - start_time)

    with torch.no_grad():
        for block in test_normal_blocks:
            line = block[0]
            param = block[1]
            for j in range(len(line) - window_size):
                test_logs += 1

                input = []
                for k in range(j, j + window_size):
                    input.append(sv_list[svIndex[line[k]]])
                num_vec = [0 for _ in range(numerical_dim)]
                for k in range(j + window_size):
                    index = svIndex[line[k]]
                    dindex = d.get(index)
                    if dindex is not None:
                        cp = param[k]
                        for pi in range(1, len(cp)):
                            num_vec[dindex] += int(cp[pi])
                            dindex += 1
                num_vec[numerical_dim - 1] = param[j + window_size - 1][0]
                label = svIndex[line[j + window_size]]
                input = torch.tensor(input, dtype=torch.float).view(-1, window_size, input_size).to(device)
                num_vec = torch.tensor(num_vec, dtype=torch.float).view(1, numerical_dim).to(device)
                label = torch.tensor(label).view(-1).to(device)
                output = model(input, num_vec)
                predicted = torch.argsort(output, 1)[0][-num_candidates:]
                # a normal block that triggers the detector is a false positive
                if label not in predicted:
                    FP += 1
                    break

    P = 100 * TP / (TP + FP)
    R = 100 * TP / (TP + FN)
    F1 = 2 * P * R / (P + R)
    logging.info('FP: {}, FN: {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('FP: {}, FN: {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('total time: {}'.format(time.time() - start_time))

    test_time2 = time.time()
    logging.info("time cost for testing:{}".format(test_time2 - test_time1))
    logging.info("number of test logs:{}".format(test_logs))


window_size = 6
input_size = 100
hidden_size = 128
num_layers = 2
num_epochs = 100

batch_size = 2048
numerical_dim = 24

num_candidates = 10

loginfo1 = 'model changed epoch={}, last train_loss={}'

if __name__ == "__main__":
    logging.info('input_dim={} hidden_dim={} num_layer={} epochs={} batch={} h={} g={}'.
                 format(input_size, hidden_size,
                        num_layers, num_epochs,
                        batch_size, window_size,
                        num_candidates))
    sv_list = load("sv_list")
    print("class_num", len(sv_list))

    model_path = 'dp/dp_p_all h={} layer={} hidden={}.pt'.format(window_size, num_layers, hidden_size)

    train(input_size, hidden_size, num_layers, len(sv_list), numerical_dim, window_size, batch_size, num_epochs,
          model_path, loginfo1)
    predict(input_size, hidden_size, num_layers, len(sv_list), numerical_dim, window_size, model_path, num_candidates)
--------------------------------------------------------------------------------
/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyjcode/deepsyslog/f95b4e3f5615639e12fd71d759838b3ece04f034/framework.png
--------------------------------------------------------------------------------
/numvec_dict:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyjcode/deepsyslog/f95b4e3f5615639e12fd71d759838b3ece04f034/numvec_dict
--------------------------------------------------------------------------------
/svIndex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyjcode/deepsyslog/f95b4e3f5615639e12fd71d759838b3ece04f034/svIndex
--------------------------------------------------------------------------------
/sv_list:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qyjcode/deepsyslog/f95b4e3f5615639e12fd71d759838b3ece04f034/sv_list
--------------------------------------------------------------------------------