├── model
│   └── gitignore
├── final_score.jpg
├── requirements.txt
├── security_test
│   └── security_test.xlsx
├── security_train
│   └── security_train.xlsx
├── .gitignore
├── loadfile.py
├── xgdboost.py
├── stack_result.py
├── train_lstm3.py
├── train_textcnn.py
├── train_lstm2.py
├── train_lstm.py
└── readme.md

/model/gitignore:
--------------------------------------------------------------------------------
1 | for model save
--------------------------------------------------------------------------------
/final_score.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RManLuo/ML_Malware_detect/HEAD/final_score.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | scikit-learn
4 | keras
5 | matplotlib
6 | tensorflow
7 | xgboost
--------------------------------------------------------------------------------
/security_test/security_test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RManLuo/ML_Malware_detect/HEAD/security_test/security_test.xlsx
--------------------------------------------------------------------------------
/security_train/security_train.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RManLuo/ML_Malware_detect/HEAD/security_train/security_train.xlsx
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /loadfile.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import numpy as np 4 | 5 | train_path = r'security_train/security_train.csv' 6 | test_path = r'security_test/security_test.csv' 7 | 8 | 9 | def read_train_file(path): 10 | labels = [] 11 | files = [] 12 | data = pd.read_csv(path) 13 | # for data in data1: 14 | goup_fileid = data.groupby('file_id') 15 | for file_name, file_group in goup_fileid: 16 | print(file_name) 17 | file_labels = file_group['label'].values[0] 18 | result = file_group.sort_values(['tid', 'index'], ascending=True) 19 | api_sequence = ' '.join(result['api']) 20 | labels.append(file_labels) 21 | files.append(api_sequence) 22 | print(len(labels)) 23 | print(len(files)) 24 | with open(path.split('/')[-1] + ".txt", 'w') as f: 25 | for i in range(len(labels)): 26 | f.write(str(labels[i]) + ' ' + files[i] + '\n') 27 | 28 | 29 | 30 | def read_test_file(path): 31 | names = [] 32 | files = [] 33 | data = pd.read_csv(path) 34 | # for data in data1: 35 | goup_fileid = data.groupby('file_id') 36 | for file_name, file_group in goup_fileid: 37 | print(file_name) 38 | # file_labels = file_group['label'].values[0] 39 | result = file_group.sort_values(['tid', 'index'], ascending=True) 40 | api_sequence = ' '.join(result['api']) 41 | # labels.append(file_labels) 42 | names.append(file_name) 43 | files.append(api_sequence) 44 | print(len(names)) 45 | print(len(files)) 46 | with open("security_test.csv.pkl", 'wb') as f: 47 | pickle.dump(names, f) 48 | pickle.dump(files, f) 49 | # with open(path.split('/')[-1] + ".txt", 'w') as f: 50 | # for i in range(len(names)): 51 | # f.write(str(names[i]) + ' ' + files[i] + '\n') 52 | 53 | 54 | def load_train2h5py(path="security_train.csv.txt"): 55 | labels = [] 56 | files = [] 57 | with open(path) as f: 58 | for i in f.readlines(): 59 | i = i.strip('\n') 60 | labels.append(i[0]) 61 | files.append(i[2:]) 62 | labels = np.asarray(labels) 63 | print(labels.shape) 64 | with open("security_train.csv.pkl", 'wb') as f: 65 | pickle.dump(labels, f) 66 | pickle.dump(files, f) 67 | 68 | 69 | # def load_test2h5py(path="D:\ML_Malware\security_test.csv.txt"): 70 | # labels = [] 71 | # files = [] 72 | # with open(path) as f: 73 | # for i in f.readlines(): 74 | # i = i.strip('\n') 75 
| # labels.append(i[0]) 76 | # files.append(' '.join(i.split(" ")[1:])) 77 | # labels = np.asarray(labels) 78 | # print(labels.shape) 79 | # with open("security_test.csv.pkl", 'wb') as f: 80 | # pickle.dump(labels, f) 81 | # pickle.dump(files, f) 82 | 83 | 84 | if __name__ == '__main__': 85 | print("read train file.....") 86 | read_train_file(train_path) 87 | load_train2h5py() 88 | print("read test file......") 89 | read_test_file(test_path) 90 | 91 | # load_test2h5py() 92 | -------------------------------------------------------------------------------- /xgdboost.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import time 5 | import csv 6 | import xgboost as xgb 7 | from sklearn.model_selection import StratifiedKFold 8 | import numpy as np 9 | 10 | with open("security_test.csv.pkl", "rb") as f: 11 | file_names = pickle.load(f) 12 | outfiles = pickle.load(f) 13 | 14 | with open("security_train.csv.pkl", "rb") as f: 15 | labels = pickle.load(f) 16 | files = pickle.load(f) 17 | 18 | print("start tfidf...") 19 | vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, ) # tf-idf特征抽取ngram_range=(1,5) 20 | 21 | train_features = vectorizer.fit_transform(files) 22 | 23 | out_features = vectorizer.transform(outfiles) 24 | 25 | # with open("tfidf_feature_no_limit.pkl", 'wb') as f: 26 | # pickle.dump(train_features, f) 27 | # pickle.dump(out_features, f) 28 | # 29 | # with open("tfidf_feature_no_limit.pkl", 'rb') as f: 30 | # train_features = pickle.load(f) 31 | # out_features = pickle.load(f) 32 | print(train_features.shape) 33 | print(out_features.shape) 34 | meta_train = np.zeros(shape=(len(files), 8)) 35 | meta_test = np.zeros(shape=(len(outfiles), 8)) 36 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 37 | for i, (tr_ind, te_ind) in enumerate(skf.split(train_features, labels)): 38 | X_train, X_train_label = train_features[tr_ind], labels[tr_ind] 39 | X_val, X_val_label = train_features[te_ind], labels[te_ind] 40 | 41 | print('FOLD: {}'.format(str(i))) 42 | print(len(te_ind), len(tr_ind)) 43 | dtrain = xgb.DMatrix(X_train, label=X_train_label) 44 | dtest = xgb.DMatrix(X_val, X_val_label) 45 | dout = xgb.DMatrix(out_features) 46 | param = {'max_depth': 6, 'eta': 0.1, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob', 47 | 'num_class': 8, 'subsample': 0.8, 48 | 'colsample_bytree': 0.85} # 参数 49 | 50 | evallist = [(dtrain, 'train'), (dtest, 'val')] # 测试 , (dtrain, 'train') 51 | num_round = 300 # 循环次数 52 | bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50) 53 | 54 | # dtr = xgb.DMatrix(train_features) 55 | pred_val = bst.predict(dtest) 56 | pred_test = bst.predict(dout) 57 | meta_train[te_ind] = pred_val 58 | meta_test += pred_test 59 | meta_test /= 5.0 60 | with open("tfidf_result.pkl", 'wb') as f: 61 | pickle.dump(meta_train, f) 62 | pickle.dump(meta_test, f) 63 | 64 | # preds = bst.predict(dout) 65 | # 66 | # 67 | # result = preds 68 | # # print(result) 69 | # out = [] 70 | # for i in range(len(file_names)): 71 | # tmp = [] 72 | # a = result[i].tolist() 73 | # # for j in range(len(a)): 74 | # # a[j] = ("%.5f" % a[j]) 75 | # 76 | # tmp.append(file_names[i]) 77 | # tmp.extend(a) 78 | # out.append(tmp) 79 | # with open("result_xgd_boost_{}.csv".format(str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))), "w", 80 | # newline='') as csvfile: 
81 | # writer = csv.writer(csvfile) 82 | 83 | # 先写入columns_name 84 | # writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 85 | # ]) 86 | # # 写入多行用writerows 87 | # writer.writerows(out) 88 | -------------------------------------------------------------------------------- /stack_result.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import time 5 | import csv 6 | import xgboost as xgb 7 | import numpy as np 8 | from sklearn.model_selection import StratifiedKFold 9 | 10 | with open("security_test.csv.pkl", "rb") as f: 11 | file_names = pickle.load(f) 12 | outfiles = pickle.load(f) 13 | 14 | with open("cnn_lstm_result.pkl", "rb") as f: 15 | cnn_train_result = pickle.load(f) 16 | cnn_out_result = pickle.load(f) 17 | 18 | with open("tfidf_result.pkl", "rb") as f: 19 | tfidf_train_result = pickle.load(f) 20 | tfidf_out_result = pickle.load(f) 21 | 22 | with open("textcnn_result.pkl", "rb") as f: 23 | textcnn_train_result = pickle.load(f) 24 | textcnn_out_result = pickle.load(f) 25 | 26 | with open("mulitl_version_lstm_result.pkl", "rb") as f: 27 | mulitl_version_lstm_train_result = pickle.load(f) 28 | mulitl_version_lstm_test_result = pickle.load(f) 29 | 30 | with open("textcnn_lstm_result.pkl", "rb") as f: 31 | textcnn_lstm_train_result = pickle.load(f) 32 | textcnn_lstm_test_result = pickle.load(f) 33 | 34 | with open("security_train.csv.pkl", "rb") as f: 35 | labels = pickle.load(f) 36 | files = pickle.load(f) 37 | 38 | train = np.hstack([tfidf_train_result, textcnn_train_result, mulitl_version_lstm_train_result, cnn_train_result, 39 | textcnn_lstm_train_result]) 40 | test = np.hstack( 41 | [tfidf_out_result, textcnn_out_result, mulitl_version_lstm_test_result, cnn_out_result, textcnn_lstm_test_result]) 42 | meta_test = np.zeros(shape=(len(outfiles), 8)) 43 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 44 | dout = xgb.DMatrix(test) 45 | for i, (tr_ind, te_ind) in enumerate(skf.split(train, labels)): 46 | print('FOLD: {}'.format(str(i))) 47 | X_train, X_train_label = train[tr_ind], labels[tr_ind] 48 | X_val, X_val_label = train[te_ind], labels[te_ind] 49 | dtrain = xgb.DMatrix(X_train, label=X_train_label) 50 | dtest = xgb.DMatrix(X_val, X_val_label) # label可以不要,此处需要是为了测试效果 51 | 52 | param = {'max_depth': 6, 'eta': 0.01, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob', 53 | 'num_class': 8, 'subsample': 0.9, 54 | 'colsample_bytree': 0.85} # 参数 55 | evallist = [(dtrain, 'train'), (dtest, 'val')] # 测试 , (dtrain, 'train') 56 | num_round = 10000 # 循环次数 57 | bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=100) 58 | preds = bst.predict(dout) 59 | meta_test += preds 60 | 61 | meta_test /= 5.0 62 | result = meta_test 63 | # print(result) 64 | out = [] 65 | for i in range(len(file_names)): 66 | tmp = [] 67 | a = result[i].tolist() 68 | # for j in range(len(a)): 69 | # a[j] = ("%.5f" % a[j]) 70 | 71 | tmp.append(file_names[i]) 72 | tmp.extend(a) 73 | out.append(tmp) 74 | with open("./submit/mulltimodel_xgd_boost_tf+cnn_mlstm{}.csv".format( 75 | str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))), 76 | "w", 77 | newline='') as csvfile: 78 | writer = csv.writer(csvfile) 79 | 80 | # 先写入columns_name 81 | writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 82 | ]) 83 
| # 写入多行用writerows 84 | writer.writerows(out) 85 | -------------------------------------------------------------------------------- /train_lstm3.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from keras.preprocessing.sequence import pad_sequences 3 | from keras_preprocessing.text import Tokenizer 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 6 | SimpleRNNCell, SpatialDropout1D, Add, Maximum 7 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, AveragePooling1D 8 | from keras import optimizers 9 | from keras import regularizers 10 | from keras.layers import BatchNormalization 11 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 12 | from keras.utils import to_categorical 13 | import time 14 | import numpy as np 15 | from keras import backend as K 16 | from sklearn.model_selection import StratifiedKFold 17 | 18 | config = K.tf.ConfigProto() 19 | config.gpu_options.allow_growth = True 20 | session = K.tf.Session(config=config) 21 | 22 | Fname = 'malware_' 23 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 24 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 25 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 26 | 27 | with open("security_test.csv.pkl", "rb") as f: 28 | file_names = pickle.load(f) 29 | outfiles = pickle.load(f) 30 | with open("security_train.csv.pkl", "rb") as f: 31 | labels_d = pickle.load(f) 32 | with open("security_train.csv.pkl", "rb") as f: 33 | labels = pickle.load(f) 34 | files = pickle.load(f) 35 | maxlen = 6000 36 | 37 | 38 | # with open("wordsdic.pkl", 'rb') as f: 39 | # tokenizer = pickle.load(f) 40 | # 41 | 42 | labels = np.asarray(labels) 43 | 44 | labels = to_categorical(labels, num_classes=8) 45 | 46 | tokenizer = Tokenizer(num_words=None, 47 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 48 | split=' ', 49 | char_level=False, 50 | oov_token=None) 51 | tokenizer.fit_on_texts(files) 52 | tokenizer.fit_on_texts(outfiles) 53 | 54 | # with open("wordsdic.pkl", 'wb') as f: 55 | # pickle.dump(tokenizer, f) 56 | 57 | vocab = tokenizer.word_index 58 | print(tokenizer.word_index) 59 | print(len(vocab)) 60 | x_train_word_ids = tokenizer.texts_to_sequences(files) 61 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 62 | 63 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 64 | 65 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 66 | 67 | 68 | # with open('datasets.pkl', 'wb') as f: 69 | # pickle.dump(x_train_padded_seqs, f) 70 | # pickle.dump(x_out_padded_seqs, f) 71 | # pickle.dump(labels, f) 72 | 73 | 74 | # with open('datasets.pkl', 'rb') as f: 75 | # x_train_padded_seqs = pickle.load(f) 76 | # # x_test_padded_seqs = pickle.load(f) 77 | # x_out_padded_seqs = pickle.load(f) 78 | # # y_train = pickle.load(f) 79 | # # y_test = pickle.load(f) 80 | # labels = pickle.load(f) 81 | 82 | 83 | def textcnn_lstm(): 84 | main_input = Input(shape=(maxlen,), dtype='float64') 85 | 86 | embedder = Embedding(304, 256, input_length=maxlen) 87 | embed = embedder(main_input) 88 | # cnn1模块,kernel_size = 3 89 | conv1_1 = Conv1D(16, 3, padding='same')(embed) 90 | bn1_1 = BatchNormalization()(conv1_1) 91 | relu1_1 = Activation('relu')(bn1_1) 92 | conv1_2 = Conv1D(32, 3, padding='same')(relu1_1) 
93 | bn1_2 = BatchNormalization()(conv1_2) 94 | relu1_2 = Activation('relu')(bn1_2) 95 | cnn1 = MaxPool1D(pool_size=4)(relu1_2) 96 | # cnn2模块,kernel_size = 4 97 | conv2_1 = Conv1D(16, 4, padding='same')(embed) 98 | bn2_1 = BatchNormalization()(conv2_1) 99 | relu2_1 = Activation('relu')(bn2_1) 100 | conv2_2 = Conv1D(32, 4, padding='same')(relu2_1) 101 | bn2_2 = BatchNormalization()(conv2_2) 102 | relu2_2 = Activation('relu')(bn2_2) 103 | cnn2 = MaxPool1D(pool_size=4)(relu2_2) 104 | # cnn3模块,kernel_size = 5 105 | conv3_1 = Conv1D(16, 5, padding='same')(embed) 106 | bn3_1 = BatchNormalization()(conv3_1) 107 | relu3_1 = Activation('relu')(bn3_1) 108 | conv3_2 = Conv1D(32, 5, padding='same')(relu3_1) 109 | bn3_2 = BatchNormalization()(conv3_2) 110 | relu3_2 = Activation('relu')(bn3_2) 111 | cnn3 = MaxPool1D(pool_size=4)(relu3_2) 112 | # 拼接三个模块 113 | cnn = concatenate([cnn1, cnn2, cnn3], axis=-1) 114 | lstm = CuDNNLSTM(256)(cnn) 115 | f = Flatten()(cnn1) 116 | fc = Dense(256, activation='relu')(f) 117 | D = Dropout(0.5)(fc) 118 | main_output = Dense(8, activation='softmax')(lstm) 119 | model = Model(inputs=main_input, outputs=main_output) 120 | return model 121 | 122 | 123 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 124 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 125 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 126 | for i, (tr_ind, te_ind) in enumerate(skf.split(x_train_padded_seqs, labels_d)): 127 | print('FOLD: {}'.format(str(i))) 128 | print(len(te_ind), len(tr_ind)) 129 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 130 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 131 | 132 | model = textcnn_lstm() 133 | # model = load_model('model_weight.h5') 134 | print(model.summary()) 135 | # exit() 136 | model.compile(loss='categorical_crossentropy', 137 | optimizer='adam', 138 | metrics=['accuracy']) 139 | model_save_path = './model/model_weight_textcnn_lstm_{}.h5'.format(str(i)) 140 | print(model_save_path) 141 | if i in [-1]: 142 | model.load_weights(model_save_path) 143 | print(model.evaluate(X_val, X_val_label)) 144 | else: 145 | 146 | checkpoint = model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True) 147 | ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min', baseline=None, 148 | restore_best_weights=False) 149 | history = model.fit(X_train, X_train_label, 150 | batch_size=128, 151 | epochs=100, 152 | validation_data=(X_val, X_val_label), callbacks=[checkpoint, ear]) 153 | 154 | # model.save('./model/model_weight_cnn_lstm_{}.h5'.format(str(i))) 155 | model.load_weights(model_save_path) 156 | # model = load_model('model_weight.h5') 157 | pred_val = model.predict(X_val) 158 | pred_test = model.predict(x_out_padded_seqs) 159 | 160 | meta_train[te_ind] = pred_val 161 | meta_test += pred_test 162 | K.clear_session() 163 | 164 | meta_test /= 5.0 165 | with open("textcnn_lstm_result.pkl", 'wb') as f: 166 | pickle.dump(meta_train, f) 167 | pickle.dump(meta_test, f) 168 | 169 | # result = model.predict(x_out_padded_seqs) 170 | # out = [] 171 | # for i in range(len(file_names)): 172 | # tmp = [] 173 | # a = result[i].tolist() 174 | # # for j in range(len(a)): 175 | # # a[j] = ("%.5f" % a[j]) 176 | # 177 | # tmp.append(file_names[i]) 178 | # tmp.extend(a) 179 | # out.append(tmp) 180 | # with open("result_lstm.csv", "w", newline='') as csvfile: 181 | # writer = csv.writer(csvfile) 182 | # 183 | # # 先写入columns_name 184 | # 
writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 185 | # ]) 186 | # # 写入多行用writerows 187 | # writer.writerows(out) 188 | 189 | 190 | # def mulitl_version_lstm(): 191 | # embed_size = 256 192 | # num_filters = 64 193 | # kernel_size = [3, 5, 7] 194 | # main_input = Input(shape=(maxlen,)) 195 | # emb = Embedding(304, 256, input_length=maxlen)(main_input) 196 | # _embed = SpatialDropout1D(0.15)(emb) 197 | # warppers = [] 198 | # warppers2 = [] 199 | # for _kernel_size in kernel_size: 200 | # conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(_embed) 201 | # warppers.append(MaxPool1D(2)(conv1d)) 202 | # for (_kernel_size, cnn) in zip(kernel_size, warppers): 203 | # conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 204 | # warppers2.append(MaxPool1D(2)(conv1d_2)) 205 | # fc = Add()(warppers2) 206 | # rl = CuDNNLSTM(512)(fc) 207 | # main_output = Dense(8, activation='softmax')(rl) 208 | # model = Model(inputs=main_input, outputs=main_output) 209 | # return model 210 | -------------------------------------------------------------------------------- /train_textcnn.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.model_selection import train_test_split 3 | from keras.preprocessing.sequence import pad_sequences 4 | from keras_preprocessing.text import Tokenizer 5 | import matplotlib.pyplot as plt 6 | import matplotlib.mlab as mlab 7 | from keras.models import Sequential, Model 8 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 9 | SimpleRNNCell, SpatialDropout1D 10 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, MaxPool2D,GlobalMaxPooling1D 11 | from keras import optimizers 12 | from keras import regularizers 13 | from keras.layers import BatchNormalization 14 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 15 | from keras.utils import to_categorical 16 | import time 17 | import numpy as np 18 | from scipy import interp 19 | from sklearn import metrics 20 | from keras import backend as K 21 | from keras.models import load_model 22 | import csv 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | config = K.tf.ConfigProto() 26 | config.gpu_options.allow_growth = True 27 | session = K.tf.Session(config=config) 28 | 29 | Fname = 'malware_' 30 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 31 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 32 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 33 | 34 | with open("security_test.csv.pkl", "rb") as f: 35 | file_names = pickle.load(f) 36 | outfiles = pickle.load(f) 37 | with open("security_train.csv.pkl", "rb") as f: 38 | labels_d = pickle.load(f) 39 | with open("security_train.csv.pkl", "rb") as f: 40 | labels = pickle.load(f) 41 | files = pickle.load(f) 42 | maxlen = 6000 43 | 44 | 45 | # with open("wordsdic.pkl", 'rb') as f: 46 | # tokenizer = pickle.load(f) 47 | # 48 | 49 | labels = np.asarray(labels) 50 | 51 | labels = to_categorical(labels, num_classes=8) 52 | 53 | tokenizer = Tokenizer(num_words=None, 54 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 55 | split=' ', 56 | char_level=False, 57 | oov_token=None) 58 | tokenizer.fit_on_texts(files) 59 | 
tokenizer.fit_on_texts(outfiles) 60 | 61 | # with open("wordsdic.pkl", 'wb') as f: 62 | # pickle.dump(tokenizer, f) 63 | 64 | vocab = tokenizer.word_index 65 | print(tokenizer.word_index) 66 | print(len(vocab)) 67 | x_train_word_ids = tokenizer.texts_to_sequences(files) 68 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 69 | 70 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 71 | 72 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 73 | 74 | 75 | # with open('datasets.pkl', 'wb') as f: 76 | # pickle.dump(x_train_padded_seqs, f) 77 | # pickle.dump(x_out_padded_seqs, f) 78 | # pickle.dump(labels, f) 79 | 80 | 81 | # with open('datasets.pkl', 'rb') as f: 82 | # x_train_padded_seqs = pickle.load(f) 83 | # # x_test_padded_seqs = pickle.load(f) 84 | # x_out_padded_seqs = pickle.load(f) 85 | # # y_train = pickle.load(f) 86 | # # y_test = pickle.load(f) 87 | # labels = pickle.load(f) 88 | 89 | 90 | def TextCNN(): 91 | num_filters = 64 92 | kernel_size = [2, 4, 6, 8, 10] 93 | conv_action = 'relu' 94 | _input = Input(shape=(maxlen,), dtype='int32') 95 | _embed = Embedding(304, 256, input_length=maxlen)(_input) 96 | _embed = SpatialDropout1D(0.15)(_embed) 97 | warppers = [] 98 | for _kernel_size in kernel_size: 99 | conv1d = Conv1D(filters=32, kernel_size=_kernel_size, activation=conv_action, padding="same")(_embed) 100 | warppers.append(MaxPool1D(2)(conv1d)) 101 | 102 | fc = concatenate(warppers) 103 | fc = Flatten()(fc) 104 | fc = Dropout(0.5)(fc) 105 | # fc = BatchNormalization()(fc) 106 | fc = Dense(256, activation='relu')(fc) 107 | fc = Dropout(0.5)(fc) 108 | # fc = BatchNormalization()(fc) 109 | preds = Dense(8, activation='softmax')(fc) 110 | 111 | model = Model(inputs=_input, outputs=preds) 112 | 113 | model.compile(loss='categorical_crossentropy', 114 | optimizer='adam', 115 | metrics=['accuracy']) 116 | return model 117 | 118 | def dila(): 119 | main_input = Input(shape=(maxlen,), dtype='float64') 120 | _embed = Embedding(304, 256, input_length=maxlen)(main_input) 121 | _embed = SpatialDropout1D(0.25)(_embed) 122 | warppers = [] 123 | num_filters = 64 124 | kernel_size = [2, 3, 4, 5] 125 | conv_action = 'relu' 126 | for _kernel_size in kernel_size: 127 | for dilated_rate in [1, 2, 3, 4]: 128 | conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action, 129 | dilation_rate=dilated_rate)(_embed) 130 | warppers.append(GlobalMaxPooling1D()(conv1d)) 131 | 132 | fc = concatenate(warppers) 133 | fc = Dropout(0.5)(fc) 134 | # fc = BatchNormalization()(fc) 135 | fc = Dense(256, activation='relu')(fc) 136 | fc = Dropout(0.25)(fc) 137 | # fc = BatchNormalization()(fc) 138 | preds = Dense(8, activation='softmax')(fc) 139 | 140 | model = Model(inputs=main_input, outputs=preds) 141 | 142 | model.compile(loss='categorical_crossentropy', 143 | optimizer='adam', 144 | metrics=['accuracy']) 145 | return model 146 | def fasttext(): 147 | main_input = Input(shape=(maxlen,), dtype='float64') 148 | embedder = Embedding(304, 256, input_length=maxlen) 149 | embed = embedder(main_input) 150 | # cnn1模块,kernel_size = 3 151 | gb = GlobalAveragePooling1D()(embed) 152 | main_output = Dense(8, activation='softmax')(gb) 153 | model = Model(inputs=main_input, outputs=main_output) 154 | return model 155 | 156 | 157 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 158 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 159 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 160 | for i, (tr_ind, te_ind) in 
enumerate(skf.split(x_train_padded_seqs, labels_d)): 161 | print('FOLD: {}'.format(str(i))) 162 | print(len(te_ind), len(tr_ind)) 163 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 164 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 165 | 166 | model = dila() 167 | # model = load_model('model_weight.h5') 168 | # print(model.summary()) 169 | # exit() 170 | model.compile(loss='categorical_crossentropy', 171 | optimizer='adam', 172 | metrics=['accuracy']) 173 | model_save_path = './model/model_weight_testcnn_{}.h5'.format(str(i)) 174 | if i in [-1]: 175 | model = model.load_weights(model_save_path) 176 | print(model.evaluate(X_val, X_val_label)) 177 | else: 178 | ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='min', baseline=None, 179 | restore_best_weights=False) 180 | 181 | checkpoint = model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True) 182 | history = model.fit(X_train, X_train_label, 183 | batch_size=32, 184 | epochs=100, 185 | shuffle=True, 186 | validation_data=(X_val, X_val_label), callbacks=[tensorboard, ear,checkpoint]) 187 | model.load_weights(model_save_path) 188 | # model.save('./model/model_weight_TextCNN_1st_{}.h5'.format(str(i))) 189 | 190 | # model = load_model('model_weight.h5') 191 | pred_val = model.predict(X_val) 192 | pred_test = model.predict(x_out_padded_seqs) 193 | 194 | meta_train[te_ind] = pred_val 195 | meta_test += pred_test 196 | K.clear_session() 197 | meta_test /= 5.0 198 | with open("textcnn_result.pkl", 'wb') as f: 199 | pickle.dump(meta_train, f) 200 | pickle.dump(meta_test, f) 201 | 202 | # 203 | # with open("TextCNN_1st_result.pkl", 'wb') as f: 204 | # pickle.dump(train_result, f) 205 | # pickle.dump(out_result, f) 206 | # 207 | # result = model.predict(x_out_padded_seqs) 208 | # out = [] 209 | # for i in range(len(file_names)): 210 | # tmp = [] 211 | # a = result[i].tolist() 212 | # # for j in range(len(a)): 213 | # # a[j] = ("%.5f" % a[j]) 214 | # 215 | # tmp.append(file_names[i]) 216 | # tmp.extend(a) 217 | # out.append(tmp) 218 | # with open("result_textcnn_TextCNN_1st.csv", "w", newline='') as csvfile: 219 | # writer = csv.writer(csvfile) 220 | # 221 | # # 先写入columns_name 222 | # writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 223 | # ]) 224 | # # 写入多行用writerows 225 | # writer.writerows(out) 226 | -------------------------------------------------------------------------------- /train_lstm2.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from keras.preprocessing.sequence import pad_sequences 3 | from keras_preprocessing.text import Tokenizer 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 6 | SimpleRNNCell, SpatialDropout1D, Add, Maximum 7 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, AveragePooling1D 8 | from keras import optimizers 9 | from keras import regularizers 10 | from keras.layers import BatchNormalization 11 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 12 | from keras.utils import to_categorical 13 | import time 14 | import numpy as np 15 | from keras import backend as K 16 | from sklearn.model_selection import StratifiedKFold 17 | 18 | config = K.tf.ConfigProto() 19 | config.gpu_options.allow_growth = True 20 | session 
= K.tf.Session(config=config) 21 | 22 | Fname = 'malware_' 23 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 24 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 25 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 26 | 27 | with open("security_test.csv.pkl", "rb") as f: 28 | file_names = pickle.load(f) 29 | outfiles = pickle.load(f) 30 | with open("security_train.csv.pkl", "rb") as f: 31 | labels_d = pickle.load(f) 32 | with open("security_train.csv.pkl", "rb") as f: 33 | labels = pickle.load(f) 34 | files = pickle.load(f) 35 | maxlen = 6000 36 | 37 | 38 | # with open("wordsdic.pkl", 'rb') as f: 39 | # tokenizer = pickle.load(f) 40 | # 41 | 42 | labels = np.asarray(labels) 43 | 44 | labels = to_categorical(labels, num_classes=8) 45 | 46 | tokenizer = Tokenizer(num_words=None, 47 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 48 | split=' ', 49 | char_level=False, 50 | oov_token=None) 51 | tokenizer.fit_on_texts(files) 52 | tokenizer.fit_on_texts(outfiles) 53 | 54 | # with open("wordsdic.pkl", 'wb') as f: 55 | # pickle.dump(tokenizer, f) 56 | 57 | vocab = tokenizer.word_index 58 | print(tokenizer.word_index) 59 | print(len(vocab)) 60 | x_train_word_ids = tokenizer.texts_to_sequences(files) 61 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 62 | 63 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 64 | 65 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 66 | 67 | 68 | # with open('datasets.pkl', 'wb') as f: 69 | # pickle.dump(x_train_padded_seqs, f) 70 | # pickle.dump(x_out_padded_seqs, f) 71 | # pickle.dump(labels, f) 72 | 73 | 74 | # with open('datasets.pkl', 'rb') as f: 75 | # x_train_padded_seqs = pickle.load(f) 76 | # # x_test_padded_seqs = pickle.load(f) 77 | # x_out_padded_seqs = pickle.load(f) 78 | # # y_train = pickle.load(f) 79 | # # y_test = pickle.load(f) 80 | # labels = pickle.load(f) 81 | 82 | 83 | def mulitl_version_lstm(): 84 | embed_size = 256 85 | num_filters = 64 86 | kernel_size = [3, 5, 7] 87 | main_input = Input(shape=(maxlen,)) 88 | emb = Embedding(304, 256, input_length=maxlen)(main_input) 89 | # _embed = SpatialDropout1D(0.15)(emb) 90 | warppers = [] 91 | warppers2 = [] # 0.42 92 | warppers3 = [] 93 | for _kernel_size in kernel_size: 94 | conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(emb) 95 | warppers.append(AveragePooling1D(2)(conv1d)) 96 | for (_kernel_size, cnn) in zip(kernel_size, warppers): 97 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 98 | warppers2.append(AveragePooling1D(2)(conv1d_2)) 99 | for (_kernel_size, cnn) in zip(kernel_size, warppers2): 100 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 101 | warppers3.append(AveragePooling1D(2)(conv1d_2)) 102 | fc = Maximum()(warppers3) 103 | rl = CuDNNLSTM(512)(fc) 104 | main_output = Dense(8, activation='softmax')(rl) 105 | model = Model(inputs=main_input, outputs=main_output) 106 | return model 107 | 108 | 109 | def Build(): 110 | main_input = Input(shape=(maxlen,), dtype='float64') 111 | embedder = Embedding(304, 256, input_length=maxlen) 112 | embed = embedder(main_input) 113 | # avg = GlobalAveragePooling1D()(embed) 114 | # cnn1模块,kernel_size = 3 115 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(embed) 116 | 117 | conv1_2 = Conv1D(64, 3, padding='same', 
activation='relu')(conv1_1) 118 | 119 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 120 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 121 | 122 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 123 | 124 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 125 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 126 | 127 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 128 | 129 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 130 | rl = CuDNNLSTM(256)(cnn1) 131 | # flat = Flatten()(cnn3) 132 | # drop = Dropout(0.5)(flat) 133 | fc = Dense(256)(rl) 134 | 135 | main_output = Dense(8, activation='softmax')(rl) 136 | model = Model(inputs=main_input, outputs=main_output) 137 | return model 138 | 139 | 140 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 141 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 142 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 143 | for i, (tr_ind, te_ind) in enumerate(skf.split(x_train_padded_seqs, labels_d)): 144 | print('FOLD: {}'.format(str(i))) 145 | print(len(te_ind), len(tr_ind)) 146 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 147 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 148 | 149 | model = Build() 150 | # model = load_model('model_weight.h5') 151 | print(model.summary()) 152 | # exit() 153 | model.compile(loss='categorical_crossentropy', 154 | optimizer='adam', 155 | metrics=['accuracy']) 156 | model_save_path = './model/model_weight_cnn_lstm_{}.h5'.format(str(i)) 157 | print(model_save_path) 158 | if i in [-1]: 159 | model.load_weights(model_save_path) 160 | print(model.evaluate(X_val, X_val_label)) 161 | else: 162 | 163 | checkpoint = model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True) 164 | ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min', baseline=None, 165 | restore_best_weights=False) 166 | history = model.fit(X_train, X_train_label, 167 | batch_size=128, 168 | epochs=100, 169 | validation_data=(X_val, X_val_label), callbacks=[checkpoint, ear]) 170 | 171 | # model.save('./model/model_weight_cnn_lstm_{}.h5'.format(str(i))) 172 | model.load_weights(model_save_path) 173 | # model = load_model('model_weight.h5') 174 | pred_val = model.predict(X_val) 175 | pred_test = model.predict(x_out_padded_seqs) 176 | 177 | meta_train[te_ind] = pred_val 178 | meta_test += pred_test 179 | K.clear_session() 180 | 181 | meta_test /= 5.0 182 | with open("cnn_lstm_result.pkl", 'wb') as f: 183 | pickle.dump(meta_train, f) 184 | pickle.dump(meta_test, f) 185 | 186 | # result = model.predict(x_out_padded_seqs) 187 | # out = [] 188 | # for i in range(len(file_names)): 189 | # tmp = [] 190 | # a = result[i].tolist() 191 | # # for j in range(len(a)): 192 | # # a[j] = ("%.5f" % a[j]) 193 | # 194 | # tmp.append(file_names[i]) 195 | # tmp.extend(a) 196 | # out.append(tmp) 197 | # with open("result_lstm.csv", "w", newline='') as csvfile: 198 | # writer = csv.writer(csvfile) 199 | # 200 | # # 先写入columns_name 201 | # writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 202 | # ]) 203 | # # 写入多行用writerows 204 | # writer.writerows(out) 205 | 206 | 207 | # def mulitl_version_lstm(): 208 | # embed_size = 256 209 | # num_filters = 64 210 | # kernel_size = [3, 5, 7] 211 | # main_input = Input(shape=(maxlen,)) 212 | # emb = Embedding(304, 256, input_length=maxlen)(main_input) 213 | # _embed = SpatialDropout1D(0.15)(emb) 214 
| # warppers = [] 215 | # warppers2 = [] 216 | # for _kernel_size in kernel_size: 217 | # conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(_embed) 218 | # warppers.append(MaxPool1D(2)(conv1d)) 219 | # for (_kernel_size, cnn) in zip(kernel_size, warppers): 220 | # conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 221 | # warppers2.append(MaxPool1D(2)(conv1d_2)) 222 | # fc = Add()(warppers2) 223 | # rl = CuDNNLSTM(512)(fc) 224 | # main_output = Dense(8, activation='softmax')(rl) 225 | # model = Model(inputs=main_input, outputs=main_output) 226 | # return model 227 | -------------------------------------------------------------------------------- /train_lstm.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from keras.preprocessing.sequence import pad_sequences 3 | from keras_preprocessing.text import Tokenizer 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 6 | SimpleRNNCell, SpatialDropout1D, Add, Maximum 7 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, AveragePooling1D 8 | from keras import optimizers 9 | from keras import regularizers 10 | from keras.layers import BatchNormalization 11 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 12 | from keras.utils import to_categorical 13 | import time 14 | import numpy as np 15 | from keras import backend as K 16 | from sklearn.model_selection import StratifiedKFold 17 | 18 | config = K.tf.ConfigProto() 19 | config.gpu_options.allow_growth = True 20 | session = K.tf.Session(config=config) 21 | 22 | Fname = 'malware_' 23 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 24 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 25 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 26 | 27 | with open("security_test.csv.pkl", "rb") as f: 28 | file_names = pickle.load(f) 29 | outfiles = pickle.load(f) 30 | with open("security_train.csv.pkl", "rb") as f: 31 | labels_d = pickle.load(f) 32 | with open("security_train.csv.pkl", "rb") as f: 33 | labels = pickle.load(f) 34 | files = pickle.load(f) 35 | maxlen = 6000 36 | 37 | 38 | # with open("wordsdic.pkl", 'rb') as f: 39 | # tokenizer = pickle.load(f) 40 | # 41 | 42 | labels = np.asarray(labels) 43 | 44 | labels = to_categorical(labels, num_classes=8) 45 | 46 | tokenizer = Tokenizer(num_words=None, 47 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 48 | split=' ', 49 | char_level=False, 50 | oov_token=None) 51 | tokenizer.fit_on_texts(files) 52 | tokenizer.fit_on_texts(outfiles) 53 | 54 | # with open("wordsdic.pkl", 'wb') as f: 55 | # pickle.dump(tokenizer, f) 56 | 57 | vocab = tokenizer.word_index 58 | print(tokenizer.word_index) 59 | print(len(vocab)) 60 | x_train_word_ids = tokenizer.texts_to_sequences(files) 61 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 62 | 63 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 64 | 65 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 66 | 67 | 68 | # with open('datasets.pkl', 'wb') as f: 69 | # pickle.dump(x_train_padded_seqs, f) 70 | # pickle.dump(x_out_padded_seqs, f) 71 | # pickle.dump(labels, f) 72 | 73 | 74 | # with open('datasets.pkl', 'rb') as f: 75 | # 
x_train_padded_seqs = pickle.load(f) 76 | # # x_test_padded_seqs = pickle.load(f) 77 | # x_out_padded_seqs = pickle.load(f) 78 | # # y_train = pickle.load(f) 79 | # # y_test = pickle.load(f) 80 | # labels = pickle.load(f) 81 | 82 | 83 | def mulitl_version_lstm(): 84 | embed_size = 256 85 | num_filters = 64 86 | kernel_size = [3, 5, 7] 87 | main_input = Input(shape=(maxlen,)) 88 | emb = Embedding(304, 256, input_length=maxlen)(main_input) 89 | # _embed = SpatialDropout1D(0.15)(emb) 90 | warppers = [] 91 | warppers2 = [] # 0.42 92 | warppers3 = [] 93 | for _kernel_size in kernel_size: 94 | conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(emb) 95 | warppers.append(AveragePooling1D(2)(conv1d)) 96 | for (_kernel_size, cnn) in zip(kernel_size, warppers): 97 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 98 | warppers2.append(AveragePooling1D(2)(conv1d_2)) 99 | for (_kernel_size, cnn) in zip(kernel_size, warppers2): 100 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 101 | warppers3.append(AveragePooling1D(2)(conv1d_2)) 102 | fc = Maximum()(warppers3) 103 | rl = CuDNNLSTM(512)(fc) 104 | main_output = Dense(8, activation='softmax')(rl) 105 | model = Model(inputs=main_input, outputs=main_output) 106 | return model 107 | 108 | 109 | def Build(): 110 | main_input = Input(shape=(maxlen,), dtype='float64') 111 | embedder = Embedding(304, 256, input_length=maxlen) 112 | embed = embedder(main_input) 113 | # avg = GlobalAveragePooling1D()(embed) 114 | # cnn1模块,kernel_size = 3 115 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(embed) 116 | 117 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 118 | 119 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 120 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 121 | 122 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 123 | 124 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 125 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 126 | 127 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 128 | 129 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 130 | rl = CuDNNLSTM(256)(cnn1) 131 | # flat = Flatten()(cnn3) 132 | # drop = Dropout(0.5)(flat) 133 | fc = Dense(256)(rl) 134 | 135 | main_output = Dense(8, activation='softmax')(rl) 136 | model = Model(inputs=main_input, outputs=main_output) 137 | return model 138 | 139 | 140 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 141 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 142 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 143 | for i, (tr_ind, te_ind) in enumerate(skf.split(x_train_padded_seqs, labels_d)): 144 | print('FOLD: {}'.format(str(i))) 145 | print(len(te_ind), len(tr_ind)) 146 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 147 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 148 | 149 | model = mulitl_version_lstm() 150 | # model = load_model('model_weight.h5') 151 | print(model.summary()) 152 | # exit() 153 | model.compile(loss='categorical_crossentropy', 154 | optimizer='adam', 155 | metrics=['accuracy']) 156 | model_save_path = './model/model_weight_mulitl_version_lstm_{}.h5'.format(str(i)) 157 | print(model_save_path) 158 | if i in [-1]: 159 | model.load_weights(model_save_path) 160 | print(model.evaluate(X_val, X_val_label)) 161 | else: 162 | 163 | checkpoint = 
model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)
164 |         ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min', baseline=None,
165 |                             restore_best_weights=False)
166 |         history = model.fit(X_train, X_train_label,
167 |                             batch_size=128,
168 |                             epochs=100,
169 |                             validation_data=(X_val, X_val_label), callbacks=[checkpoint, ear])
170 | 
171 |     # model.save('./model/model_weight_cnn_lstm_{}.h5'.format(str(i)))
172 |     model.load_weights(model_save_path)
173 |     # model = load_model('model_weight.h5')
174 |     pred_val = model.predict(X_val)
175 |     pred_test = model.predict(x_out_padded_seqs)
176 | 
177 |     meta_train[te_ind] = pred_val
178 |     meta_test += pred_test
179 |     K.clear_session()
180 | 
181 | meta_test /= 5.0
182 | with open("mulitl_version_lstm_result.pkl", 'wb') as f:
183 |     pickle.dump(meta_train, f)
184 |     pickle.dump(meta_test, f)
185 | 
186 | # result = model.predict(x_out_padded_seqs)
187 | # out = []
188 | # for i in range(len(file_names)):
189 | #     tmp = []
190 | #     a = result[i].tolist()
191 | #     # for j in range(len(a)):
192 | #     #     a[j] = ("%.5f" % a[j])
193 | #
194 | #     tmp.append(file_names[i])
195 | #     tmp.extend(a)
196 | #     out.append(tmp)
197 | # with open("result_lstm.csv", "w", newline='') as csvfile:
198 | #     writer = csv.writer(csvfile)
199 | #
200 | #     # write the header row (column names) first
201 | #     writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7"
202 | #                      ])
203 | #     # use writerows to write all prediction rows at once
204 | #     writer.writerows(out)
205 | 
206 | 
207 | # def mulitl_version_lstm():
208 | #     embed_size = 256
209 | #     num_filters = 64
210 | #     kernel_size = [3, 5, 7]
211 | #     main_input = Input(shape=(maxlen,))
212 | #     emb = Embedding(304, 256, input_length=maxlen)(main_input)
213 | #     _embed = SpatialDropout1D(0.15)(emb)
214 | #     warppers = []
215 | #     warppers2 = []
216 | #     for _kernel_size in kernel_size:
217 | #         conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(_embed)
218 | #         warppers.append(MaxPool1D(2)(conv1d))
219 | #     for (_kernel_size, cnn) in zip(kernel_size, warppers):
220 | #         conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn)
221 | #         warppers2.append(MaxPool1D(2)(conv1d_2))
222 | #     fc = Add()(warppers2)
223 | #     rl = CuDNNLSTM(512)(fc)
224 | #     main_output = Dense(8, activation='softmax')(rl)
225 | #     model = Model(inputs=main_input, outputs=main_output)
226 | #     return model
227 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Machine Learning for Malware Detection

edited by Raymond Luo

## Environment

```bash
Python 3
RTX 2080, i7-9700K, 16 GB RAM
Ubuntu 18.04
```

## Requirements

TensorFlow==1.14
keras==2.3.1

```bash
sudo pip3 install -r requirements.txt
```

Please put `security_train.xlsx` and `security_test.xlsx` into the corresponding folders (`security_train/` and `security_test/`). Note that `loadfile.py` reads `security_train/security_train.csv` and `security_test/security_test.csv`, so export the spreadsheets to CSV first.

## Loadfile

```bash
python3 loadfile.py
```

## Train multi-vision LSTM

```bash
python3 train_lstm.py
```

## Train CNN-LSTM

```bash
python3 train_lstm2.py
```

## Train TextCNN-LSTM

```bash
python3 train_lstm3.py
```

## Train TextCNN

```bash
python3 train_textcnn.py
```

## Train TF-IDF model

```bash
python3 xgdboost.py
```
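
Before stacking, make sure the five training steps above have all finished: `stack_result.py` loads the out-of-fold prediction pickles written by each script. A small sanity-check sketch (not part of the repository; the file names are the ones used by `stack_result.py`):

```python
from pathlib import Path

# Out-of-fold prediction files that stack_result.py expects to find.
expected = [
    "tfidf_result.pkl",                # xgdboost.py
    "textcnn_result.pkl",              # train_textcnn.py
    "mulitl_version_lstm_result.pkl",  # train_lstm.py
    "cnn_lstm_result.pkl",             # train_lstm2.py
    "textcnn_lstm_result.pkl",         # train_lstm3.py
]
missing = [name for name in expected if not Path(name).is_file()]
if missing:
    print("Run the corresponding training scripts first; missing:", missing)
else:
    print("All intermediate results found, ready to stack.")
```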

## Stack model

```bash
python3 stack_result.py
```

## Result

The result will be written to the `submit` folder (create the folder first if it does not exist).

# FILE PREPROCESS

### Data format

| field   | type   | explanation                                                  |
| ------- | ------ | ------------------------------------------------------------ |
| file_id | bigint | ID of the file.                                              |
| label   | bigint | Label of the file: 0 - normal / 1 - ransomware / 2 - mining program / 3 - DDoS trojan / 4 - worm / 5 - infective virus / 6 - backdoor / 7 - trojan. |
| api     | string | The API call list of the file.                               |
| tid     | bigint | The thread number of the file.                               |
| index   | string | The order of the API call.                                   |

----
[Alibaba Cloud Malware Detection Competition (阿里云安全恶意程序检测)](https://tianchi.aliyun.com/competition/entrance/231694/introduction?spm=5176.12281925.0.0.60c17137jVy9vv)

### Our method:

First, we group the records by `file_id`. Within each file, we then group the API calls by `tid` and sort them by `index`. Finally, we concatenate all threads into one line to obtain the final representation of one sample.

```
Sample
LdrLoadDll LdrGetProcedureAddress LdrGetProcedureAddress LdrGetProcedureAddress LdrGetProcedureAddress.......
label 5
```

# MODEL

For the best score we use 5 models and stack them to get our final score.

## TF-IDF model:

In order to capture the global information of each sample, we first extract TF-IDF features computed on 1- to 5-grams of the preprocessed API sequence.

```python
vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, )  # TF-IDF feature extraction, ngram_range=(1, 5)
```

Then we classify the samples with XGBoost.

```python
param = {'max_depth': 6, 'eta': 0.1, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob',
         'num_class': 8, 'subsample': 0.8,
         'colsample_bytree': 0.85}  # parameters

evallist = [(dtrain, 'train'), (dtest, 'val')]  # evaluation sets
num_round = 300  # maximum number of boosting rounds
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50)
```

## TextCNN model

In this model, we use a TextCNN to extract the features and classify the samples.

Each sample is cut to a maximum length of 6000 and the labels are one-hot encoded.

The sample goes through an embedding layer and then into the CNN layers.

We use five different kernel sizes (2, 4, 6, 8, 10), and each convolutional layer has 32 filters.

After the CNN layers, we concatenate the features extracted by the CNN layers and classify them.

```python
main_input = Input(shape=(maxlen,), dtype='float64')
_embed = Embedding(304, 256, input_length=maxlen)(main_input)
_embed = SpatialDropout1D(0.25)(_embed)
warppers = []
num_filters = 64
kernel_size = [2, 3, 4, 5]
conv_action = 'relu'
for _kernel_size in kernel_size:
    for dilated_rate in [1, 2, 3, 4]:
        conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action,
                        dilation_rate=dilated_rate)(_embed)
        warppers.append(GlobalMaxPooling1D()(conv1d))

fc = concatenate(warppers)
fc = Dropout(0.5)(fc)
# fc = BatchNormalization()(fc)
fc = Dense(256, activation='relu')(fc)
fc = Dropout(0.25)(fc)
# fc = BatchNormalization()(fc)
preds = Dense(8, activation='softmax')(fc)

model = Model(inputs=main_input, outputs=preds)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
return model

```

## CNN-LSTM model

In order to capture the sequence information of the samples, we use three LSTM-based models.

The first one is a simple CNN+LSTM in which all convolutional layers share the same kernel size.

```python
main_input = Input(shape=(maxlen,), dtype='float64')
embedder = Embedding(304, 256, input_length=maxlen)
embed = embedder(main_input)
# avg = GlobalAveragePooling1D()(embed)
# CNN block 1, kernel_size = 3
conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(embed)

conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1)

cnn1 = MaxPool1D(pool_size=2)(conv1_2)
conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1)

conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1)

cnn1 = MaxPool1D(pool_size=2)(conv1_2)
conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1)

conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1)

cnn1 = MaxPool1D(pool_size=2)(conv1_2)
rl = CuDNNLSTM(256)(cnn1)
# flat = Flatten()(cnn3)
# drop = Dropout(0.5)(flat)
fc = Dense(256)(rl)

main_output = Dense(8, activation='softmax')(rl)
model = Model(inputs=main_input, outputs=main_output)
return model

```

## Multi-vision LSTM

Inspired by our TextCNN method, we reasoned that if different kernel sizes give different "visions" of the sequence, we can also build a multi-vision LSTM model. We use three different kernel sizes (3, 5, 7). Instead of arranging them in a linear order, we use each of them to extract features from the embedding layer independently. Between the layers we use average pooling instead of max pooling, so that the pooled sequence keeps information about several API calls rather than only one. After the CNN layers we obtain three feature maps of the same size. We then take the element-wise maximum
$$
\max_{\text{element-wise}}(v_1, v_2, v_3)
$$
to keep the most significant feature of the different visions. After that we use an LSTM layer to analyze the sequence and classify it.

```python
embed_size = 256
num_filters = 64
kernel_size = [3, 5, 7]
main_input = Input(shape=(maxlen,))
emb = Embedding(304, 256, input_length=maxlen)(main_input)
# _embed = SpatialDropout1D(0.15)(emb)
warppers = []
warppers2 = []  # 0.42
warppers3 = []
for _kernel_size in kernel_size:
    conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(emb)
    warppers.append(AveragePooling1D(2)(conv1d))
for (_kernel_size, cnn) in zip(kernel_size, warppers):
    conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn)
    warppers2.append(AveragePooling1D(2)(conv1d_2))
for (_kernel_size, cnn) in zip(kernel_size, warppers2):
    conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn)
    warppers3.append(AveragePooling1D(2)(conv1d_2))
fc = Maximum()(warppers3)
rl = CuDNNLSTM(512)(fc)
main_output = Dense(8, activation='softmax')(rl)
model = Model(inputs=main_input, outputs=main_output)
return model
```

## TextCNN-LSTM

This model is mostly the same as the multi-vision LSTM, but we concatenate the three feature maps instead of taking their element-wise maximum. We believe this does not disturb the order of the sequence, because within each part the order of the API calls is unchanged.

```python
main_input = Input(shape=(maxlen,), dtype='float64')

embedder = Embedding(304, 256, input_length=maxlen)
embed = embedder(main_input)
# CNN block 1, kernel_size = 3
conv1_1 = Conv1D(16, 3, padding='same')(embed)
bn1_1 = BatchNormalization()(conv1_1)
relu1_1 = Activation('relu')(bn1_1)
conv1_2 = Conv1D(32, 3, padding='same')(relu1_1)
bn1_2 = BatchNormalization()(conv1_2)
relu1_2 = Activation('relu')(bn1_2)
cnn1 = MaxPool1D(pool_size=4)(relu1_2)
# CNN block 2, kernel_size = 4
conv2_1 = Conv1D(16, 4, padding='same')(embed)
bn2_1 = BatchNormalization()(conv2_1)
relu2_1 = Activation('relu')(bn2_1)
conv2_2 = Conv1D(32, 4, padding='same')(relu2_1)
bn2_2 = BatchNormalization()(conv2_2)
relu2_2 = Activation('relu')(bn2_2)
cnn2 = MaxPool1D(pool_size=4)(relu2_2)
# CNN block 3, kernel_size = 5
conv3_1 = Conv1D(16, 5, padding='same')(embed)
bn3_1 = BatchNormalization()(conv3_1)
relu3_1 = Activation('relu')(bn3_1)
conv3_2 = Conv1D(32, 5, padding='same')(relu3_1)
bn3_2 = BatchNormalization()(conv3_2)
relu3_2 = Activation('relu')(bn3_2)
cnn3 = MaxPool1D(pool_size=4)(relu3_2)
# concatenate the three blocks
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
lstm = CuDNNLSTM(256)(cnn)
f = Flatten()(cnn1)
fc = Dense(256, activation='relu')(f)
D = Dropout(0.5)(fc)
main_output = Dense(8, activation='softmax')(lstm)
model = Model(inputs=main_input, outputs=main_output)
return model
```

### Stack result

We stack the out-of-fold results of the models described above.

```python
train = np.hstack([tfidf_train_result, textcnn_train_result, mulitl_version_lstm_train_result, cnn_train_result,
                   textcnn_lstm_train_result])
test = np.hstack(
    [tfidf_out_result, textcnn_out_result, mulitl_version_lstm_test_result, cnn_out_result, textcnn_lstm_test_result])
meta_test = np.zeros(shape=(len(outfiles), 8))
skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True)
dout = xgb.DMatrix(test)
for i, (tr_ind, te_ind) in enumerate(skf.split(train, labels)):
    print('FOLD: {}'.format(str(i)))
    X_train, X_train_label = train[tr_ind], labels[tr_ind]
    X_val, X_val_label = train[te_ind], labels[te_ind]
    dtrain = xgb.DMatrix(X_train, label=X_train_label)
    dtest = xgb.DMatrix(X_val, X_val_label)  # the label is optional here; it lets us monitor the validation score

    param = {'max_depth': 6, 'eta': 0.01, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob',
             'num_class': 8, 'subsample': 0.9,
             'colsample_bytree': 0.85}  # parameters
    evallist = [(dtrain, 'train'), (dtest, 'val')]  # evaluation sets
    num_round = 10000  # maximum number of boosting rounds
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=100)
    preds = bst.predict(dout)
    meta_test += preds

meta_test /= 5.0
result = meta_test
```
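
The averaged fold probabilities in `result` are then written out as the submission file. The snippet below is the tail of `stack_result.py`, lightly condensed for the readme; note that the `submit` directory must exist before the script is run:

```python
import csv
import time

# Build one row per test file: file_id followed by the eight class probabilities.
out = []
for i in range(len(file_names)):
    tmp = [file_names[i]]
    tmp.extend(result[i].tolist())
    out.append(tmp)

submit_name = "./submit/mulltimodel_xgd_boost_tf+cnn_mlstm{}.csv".format(
    time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))
with open(submit_name, "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    # write the header row first
    writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3",
                     "prob4", "prob5", "prob6", "prob7"])
    # writerows writes all prediction rows at once
    writer.writerows(out)
```
--------------------------------------------------------------------------------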