├── model
│   └── gitignore
├── final_score.jpg
├── requirements.txt
├── security_test
│   └── security_test.xlsx
├── security_train
│   └── security_train.xlsx
├── .gitignore
├── loadfile.py
├── xgdboost.py
├── stack_result.py
├── train_lstm3.py
├── train_textcnn.py
├── train_lstm2.py
├── train_lstm.py
└── readme.md

/model/gitignore:
--------------------------------------------------------------------------------
1 | for model save
--------------------------------------------------------------------------------
/final_score.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RManLuo/ML_Malware_detect/HEAD/final_score.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | scikit-learn
4 | keras
5 | matplotlib
6 | tensorflow
7 | xgboost
--------------------------------------------------------------------------------
/security_test/security_test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RManLuo/ML_Malware_detect/HEAD/security_test/security_test.xlsx
--------------------------------------------------------------------------------
/security_train/security_train.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RManLuo/ML_Malware_detect/HEAD/security_train/security_train.xlsx
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /loadfile.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import numpy as np 4 | 5 | train_path = r'security_train/security_train.csv' 6 | test_path = r'security_test/security_test.csv' 7 | 8 | 9 | def read_train_file(path): 10 | labels = [] 11 | files = [] 12 | data = pd.read_csv(path) 13 | # for data in data1: 14 | goup_fileid = data.groupby('file_id') 15 | for file_name, file_group in goup_fileid: 16 | print(file_name) 17 | file_labels = file_group['label'].values[0] 18 | result = file_group.sort_values(['tid', 'index'], ascending=True) 19 | api_sequence = ' '.join(result['api']) 20 | labels.append(file_labels) 21 | files.append(api_sequence) 22 | print(len(labels)) 23 | print(len(files)) 24 | with open(path.split('/')[-1] + ".txt", 'w') as f: 25 | for i in range(len(labels)): 26 | f.write(str(labels[i]) + ' ' + files[i] + '\n') 27 | 28 | 29 | 30 | def read_test_file(path): 31 | names = [] 32 | files = [] 33 | data = pd.read_csv(path) 34 | # for data in data1: 35 | goup_fileid = data.groupby('file_id') 36 | for file_name, file_group in goup_fileid: 37 | print(file_name) 38 | # file_labels = file_group['label'].values[0] 39 | result = file_group.sort_values(['tid', 'index'], ascending=True) 40 | api_sequence = ' '.join(result['api']) 41 | # labels.append(file_labels) 42 | names.append(file_name) 43 | files.append(api_sequence) 44 | print(len(names)) 45 | print(len(files)) 46 | with open("security_test.csv.pkl", 'wb') as f: 47 | pickle.dump(names, f) 48 | pickle.dump(files, f) 49 | # with open(path.split('/')[-1] + ".txt", 'w') as f: 50 | # for i in range(len(names)): 51 | # f.write(str(names[i]) + ' ' + files[i] + '\n') 52 | 53 | 54 | def load_train2h5py(path="security_train.csv.txt"): 55 | labels = [] 56 | files = [] 57 | with open(path) as f: 58 | for i in f.readlines(): 59 | i = i.strip('\n') 60 | labels.append(i[0]) 61 | files.append(i[2:]) 62 | labels = np.asarray(labels) 63 | print(labels.shape) 64 | with open("security_train.csv.pkl", 'wb') as f: 65 | pickle.dump(labels, f) 66 | pickle.dump(files, f) 67 | 68 | 69 | # def load_test2h5py(path="D:\ML_Malware\security_test.csv.txt"): 70 | # labels = [] 71 | # files = [] 72 | # with open(path) as f: 73 | # for i in f.readlines(): 74 | # i = i.strip('\n') 75 
| # labels.append(i[0]) 76 | # files.append(' '.join(i.split(" ")[1:])) 77 | # labels = np.asarray(labels) 78 | # print(labels.shape) 79 | # with open("security_test.csv.pkl", 'wb') as f: 80 | # pickle.dump(labels, f) 81 | # pickle.dump(files, f) 82 | 83 | 84 | if __name__ == '__main__': 85 | print("read train file.....") 86 | read_train_file(train_path) 87 | load_train2h5py() 88 | print("read test file......") 89 | read_test_file(test_path) 90 | 91 | # load_test2h5py() 92 | -------------------------------------------------------------------------------- /xgdboost.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import time 5 | import csv 6 | import xgboost as xgb 7 | from sklearn.model_selection import StratifiedKFold 8 | import numpy as np 9 | 10 | with open("security_test.csv.pkl", "rb") as f: 11 | file_names = pickle.load(f) 12 | outfiles = pickle.load(f) 13 | 14 | with open("security_train.csv.pkl", "rb") as f: 15 | labels = pickle.load(f) 16 | files = pickle.load(f) 17 | 18 | print("start tfidf...") 19 | vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, ) # tf-idf特征抽取ngram_range=(1,5) 20 | 21 | train_features = vectorizer.fit_transform(files) 22 | 23 | out_features = vectorizer.transform(outfiles) 24 | 25 | # with open("tfidf_feature_no_limit.pkl", 'wb') as f: 26 | # pickle.dump(train_features, f) 27 | # pickle.dump(out_features, f) 28 | # 29 | # with open("tfidf_feature_no_limit.pkl", 'rb') as f: 30 | # train_features = pickle.load(f) 31 | # out_features = pickle.load(f) 32 | print(train_features.shape) 33 | print(out_features.shape) 34 | meta_train = np.zeros(shape=(len(files), 8)) 35 | meta_test = np.zeros(shape=(len(outfiles), 8)) 36 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 37 | for i, (tr_ind, te_ind) in enumerate(skf.split(train_features, labels)): 38 | X_train, X_train_label = train_features[tr_ind], labels[tr_ind] 39 | X_val, X_val_label = train_features[te_ind], labels[te_ind] 40 | 41 | print('FOLD: {}'.format(str(i))) 42 | print(len(te_ind), len(tr_ind)) 43 | dtrain = xgb.DMatrix(X_train, label=X_train_label) 44 | dtest = xgb.DMatrix(X_val, X_val_label) 45 | dout = xgb.DMatrix(out_features) 46 | param = {'max_depth': 6, 'eta': 0.1, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob', 47 | 'num_class': 8, 'subsample': 0.8, 48 | 'colsample_bytree': 0.85} # 参数 49 | 50 | evallist = [(dtrain, 'train'), (dtest, 'val')] # 测试 , (dtrain, 'train') 51 | num_round = 300 # 循环次数 52 | bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50) 53 | 54 | # dtr = xgb.DMatrix(train_features) 55 | pred_val = bst.predict(dtest) 56 | pred_test = bst.predict(dout) 57 | meta_train[te_ind] = pred_val 58 | meta_test += pred_test 59 | meta_test /= 5.0 60 | with open("tfidf_result.pkl", 'wb') as f: 61 | pickle.dump(meta_train, f) 62 | pickle.dump(meta_test, f) 63 | 64 | # preds = bst.predict(dout) 65 | # 66 | # 67 | # result = preds 68 | # # print(result) 69 | # out = [] 70 | # for i in range(len(file_names)): 71 | # tmp = [] 72 | # a = result[i].tolist() 73 | # # for j in range(len(a)): 74 | # # a[j] = ("%.5f" % a[j]) 75 | # 76 | # tmp.append(file_names[i]) 77 | # tmp.extend(a) 78 | # out.append(tmp) 79 | # with open("result_xgd_boost_{}.csv".format(str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))), "w", 80 | # newline='') as csvfile: 
81 | # writer = csv.writer(csvfile) 82 | 83 | # 先写入columns_name 84 | # writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 85 | # ]) 86 | # # 写入多行用writerows 87 | # writer.writerows(out) 88 | -------------------------------------------------------------------------------- /stack_result.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import time 5 | import csv 6 | import xgboost as xgb 7 | import numpy as np 8 | from sklearn.model_selection import StratifiedKFold 9 | 10 | with open("security_test.csv.pkl", "rb") as f: 11 | file_names = pickle.load(f) 12 | outfiles = pickle.load(f) 13 | 14 | with open("cnn_lstm_result.pkl", "rb") as f: 15 | cnn_train_result = pickle.load(f) 16 | cnn_out_result = pickle.load(f) 17 | 18 | with open("tfidf_result.pkl", "rb") as f: 19 | tfidf_train_result = pickle.load(f) 20 | tfidf_out_result = pickle.load(f) 21 | 22 | with open("textcnn_result.pkl", "rb") as f: 23 | textcnn_train_result = pickle.load(f) 24 | textcnn_out_result = pickle.load(f) 25 | 26 | with open("mulitl_version_lstm_result.pkl", "rb") as f: 27 | mulitl_version_lstm_train_result = pickle.load(f) 28 | mulitl_version_lstm_test_result = pickle.load(f) 29 | 30 | with open("textcnn_lstm_result.pkl", "rb") as f: 31 | textcnn_lstm_train_result = pickle.load(f) 32 | textcnn_lstm_test_result = pickle.load(f) 33 | 34 | with open("security_train.csv.pkl", "rb") as f: 35 | labels = pickle.load(f) 36 | files = pickle.load(f) 37 | 38 | train = np.hstack([tfidf_train_result, textcnn_train_result, mulitl_version_lstm_train_result, cnn_train_result, 39 | textcnn_lstm_train_result]) 40 | test = np.hstack( 41 | [tfidf_out_result, textcnn_out_result, mulitl_version_lstm_test_result, cnn_out_result, textcnn_lstm_test_result]) 42 | meta_test = np.zeros(shape=(len(outfiles), 8)) 43 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 44 | dout = xgb.DMatrix(test) 45 | for i, (tr_ind, te_ind) in enumerate(skf.split(train, labels)): 46 | print('FOLD: {}'.format(str(i))) 47 | X_train, X_train_label = train[tr_ind], labels[tr_ind] 48 | X_val, X_val_label = train[te_ind], labels[te_ind] 49 | dtrain = xgb.DMatrix(X_train, label=X_train_label) 50 | dtest = xgb.DMatrix(X_val, X_val_label) # label可以不要,此处需要是为了测试效果 51 | 52 | param = {'max_depth': 6, 'eta': 0.01, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob', 53 | 'num_class': 8, 'subsample': 0.9, 54 | 'colsample_bytree': 0.85} # 参数 55 | evallist = [(dtrain, 'train'), (dtest, 'val')] # 测试 , (dtrain, 'train') 56 | num_round = 10000 # 循环次数 57 | bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=100) 58 | preds = bst.predict(dout) 59 | meta_test += preds 60 | 61 | meta_test /= 5.0 62 | result = meta_test 63 | # print(result) 64 | out = [] 65 | for i in range(len(file_names)): 66 | tmp = [] 67 | a = result[i].tolist() 68 | # for j in range(len(a)): 69 | # a[j] = ("%.5f" % a[j]) 70 | 71 | tmp.append(file_names[i]) 72 | tmp.extend(a) 73 | out.append(tmp) 74 | with open("./submit/mulltimodel_xgd_boost_tf+cnn_mlstm{}.csv".format( 75 | str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))), 76 | "w", 77 | newline='') as csvfile: 78 | writer = csv.writer(csvfile) 79 | 80 | # 先写入columns_name 81 | writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 82 | ]) 83 
| # 写入多行用writerows 84 | writer.writerows(out) 85 | -------------------------------------------------------------------------------- /train_lstm3.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from keras.preprocessing.sequence import pad_sequences 3 | from keras_preprocessing.text import Tokenizer 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 6 | SimpleRNNCell, SpatialDropout1D, Add, Maximum 7 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, AveragePooling1D 8 | from keras import optimizers 9 | from keras import regularizers 10 | from keras.layers import BatchNormalization 11 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 12 | from keras.utils import to_categorical 13 | import time 14 | import numpy as np 15 | from keras import backend as K 16 | from sklearn.model_selection import StratifiedKFold 17 | 18 | config = K.tf.ConfigProto() 19 | config.gpu_options.allow_growth = True 20 | session = K.tf.Session(config=config) 21 | 22 | Fname = 'malware_' 23 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 24 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 25 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 26 | 27 | with open("security_test.csv.pkl", "rb") as f: 28 | file_names = pickle.load(f) 29 | outfiles = pickle.load(f) 30 | with open("security_train.csv.pkl", "rb") as f: 31 | labels_d = pickle.load(f) 32 | with open("security_train.csv.pkl", "rb") as f: 33 | labels = pickle.load(f) 34 | files = pickle.load(f) 35 | maxlen = 6000 36 | 37 | 38 | # with open("wordsdic.pkl", 'rb') as f: 39 | # tokenizer = pickle.load(f) 40 | # 41 | 42 | labels = np.asarray(labels) 43 | 44 | labels = to_categorical(labels, num_classes=8) 45 | 46 | tokenizer = Tokenizer(num_words=None, 47 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 48 | split=' ', 49 | char_level=False, 50 | oov_token=None) 51 | tokenizer.fit_on_texts(files) 52 | tokenizer.fit_on_texts(outfiles) 53 | 54 | # with open("wordsdic.pkl", 'wb') as f: 55 | # pickle.dump(tokenizer, f) 56 | 57 | vocab = tokenizer.word_index 58 | print(tokenizer.word_index) 59 | print(len(vocab)) 60 | x_train_word_ids = tokenizer.texts_to_sequences(files) 61 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 62 | 63 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 64 | 65 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 66 | 67 | 68 | # with open('datasets.pkl', 'wb') as f: 69 | # pickle.dump(x_train_padded_seqs, f) 70 | # pickle.dump(x_out_padded_seqs, f) 71 | # pickle.dump(labels, f) 72 | 73 | 74 | # with open('datasets.pkl', 'rb') as f: 75 | # x_train_padded_seqs = pickle.load(f) 76 | # # x_test_padded_seqs = pickle.load(f) 77 | # x_out_padded_seqs = pickle.load(f) 78 | # # y_train = pickle.load(f) 79 | # # y_test = pickle.load(f) 80 | # labels = pickle.load(f) 81 | 82 | 83 | def textcnn_lstm(): 84 | main_input = Input(shape=(maxlen,), dtype='float64') 85 | 86 | embedder = Embedding(304, 256, input_length=maxlen) 87 | embed = embedder(main_input) 88 | # cnn1模块,kernel_size = 3 89 | conv1_1 = Conv1D(16, 3, padding='same')(embed) 90 | bn1_1 = BatchNormalization()(conv1_1) 91 | relu1_1 = Activation('relu')(bn1_1) 92 | conv1_2 = Conv1D(32, 3, padding='same')(relu1_1) 
93 | bn1_2 = BatchNormalization()(conv1_2) 94 | relu1_2 = Activation('relu')(bn1_2) 95 | cnn1 = MaxPool1D(pool_size=4)(relu1_2) 96 | # cnn2模块,kernel_size = 4 97 | conv2_1 = Conv1D(16, 4, padding='same')(embed) 98 | bn2_1 = BatchNormalization()(conv2_1) 99 | relu2_1 = Activation('relu')(bn2_1) 100 | conv2_2 = Conv1D(32, 4, padding='same')(relu2_1) 101 | bn2_2 = BatchNormalization()(conv2_2) 102 | relu2_2 = Activation('relu')(bn2_2) 103 | cnn2 = MaxPool1D(pool_size=4)(relu2_2) 104 | # cnn3模块,kernel_size = 5 105 | conv3_1 = Conv1D(16, 5, padding='same')(embed) 106 | bn3_1 = BatchNormalization()(conv3_1) 107 | relu3_1 = Activation('relu')(bn3_1) 108 | conv3_2 = Conv1D(32, 5, padding='same')(relu3_1) 109 | bn3_2 = BatchNormalization()(conv3_2) 110 | relu3_2 = Activation('relu')(bn3_2) 111 | cnn3 = MaxPool1D(pool_size=4)(relu3_2) 112 | # 拼接三个模块 113 | cnn = concatenate([cnn1, cnn2, cnn3], axis=-1) 114 | lstm = CuDNNLSTM(256)(cnn) 115 | f = Flatten()(cnn1) 116 | fc = Dense(256, activation='relu')(f) 117 | D = Dropout(0.5)(fc) 118 | main_output = Dense(8, activation='softmax')(lstm) 119 | model = Model(inputs=main_input, outputs=main_output) 120 | return model 121 | 122 | 123 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 124 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 125 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 126 | for i, (tr_ind, te_ind) in enumerate(skf.split(x_train_padded_seqs, labels_d)): 127 | print('FOLD: {}'.format(str(i))) 128 | print(len(te_ind), len(tr_ind)) 129 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 130 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 131 | 132 | model = textcnn_lstm() 133 | # model = load_model('model_weight.h5') 134 | print(model.summary()) 135 | # exit() 136 | model.compile(loss='categorical_crossentropy', 137 | optimizer='adam', 138 | metrics=['accuracy']) 139 | model_save_path = './model/model_weight_textcnn_lstm_{}.h5'.format(str(i)) 140 | print(model_save_path) 141 | if i in [-1]: 142 | model.load_weights(model_save_path) 143 | print(model.evaluate(X_val, X_val_label)) 144 | else: 145 | 146 | checkpoint = model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True) 147 | ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min', baseline=None, 148 | restore_best_weights=False) 149 | history = model.fit(X_train, X_train_label, 150 | batch_size=128, 151 | epochs=100, 152 | validation_data=(X_val, X_val_label), callbacks=[checkpoint, ear]) 153 | 154 | # model.save('./model/model_weight_cnn_lstm_{}.h5'.format(str(i))) 155 | model.load_weights(model_save_path) 156 | # model = load_model('model_weight.h5') 157 | pred_val = model.predict(X_val) 158 | pred_test = model.predict(x_out_padded_seqs) 159 | 160 | meta_train[te_ind] = pred_val 161 | meta_test += pred_test 162 | K.clear_session() 163 | 164 | meta_test /= 5.0 165 | with open("textcnn_lstm_result.pkl", 'wb') as f: 166 | pickle.dump(meta_train, f) 167 | pickle.dump(meta_test, f) 168 | 169 | # result = model.predict(x_out_padded_seqs) 170 | # out = [] 171 | # for i in range(len(file_names)): 172 | # tmp = [] 173 | # a = result[i].tolist() 174 | # # for j in range(len(a)): 175 | # # a[j] = ("%.5f" % a[j]) 176 | # 177 | # tmp.append(file_names[i]) 178 | # tmp.extend(a) 179 | # out.append(tmp) 180 | # with open("result_lstm.csv", "w", newline='') as csvfile: 181 | # writer = csv.writer(csvfile) 182 | # 183 | # # 先写入columns_name 184 | # 
writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 185 | # ]) 186 | # # 写入多行用writerows 187 | # writer.writerows(out) 188 | 189 | 190 | # def mulitl_version_lstm(): 191 | # embed_size = 256 192 | # num_filters = 64 193 | # kernel_size = [3, 5, 7] 194 | # main_input = Input(shape=(maxlen,)) 195 | # emb = Embedding(304, 256, input_length=maxlen)(main_input) 196 | # _embed = SpatialDropout1D(0.15)(emb) 197 | # warppers = [] 198 | # warppers2 = [] 199 | # for _kernel_size in kernel_size: 200 | # conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(_embed) 201 | # warppers.append(MaxPool1D(2)(conv1d)) 202 | # for (_kernel_size, cnn) in zip(kernel_size, warppers): 203 | # conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 204 | # warppers2.append(MaxPool1D(2)(conv1d_2)) 205 | # fc = Add()(warppers2) 206 | # rl = CuDNNLSTM(512)(fc) 207 | # main_output = Dense(8, activation='softmax')(rl) 208 | # model = Model(inputs=main_input, outputs=main_output) 209 | # return model 210 | -------------------------------------------------------------------------------- /train_textcnn.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.model_selection import train_test_split 3 | from keras.preprocessing.sequence import pad_sequences 4 | from keras_preprocessing.text import Tokenizer 5 | import matplotlib.pyplot as plt 6 | import matplotlib.mlab as mlab 7 | from keras.models import Sequential, Model 8 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 9 | SimpleRNNCell, SpatialDropout1D 10 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, MaxPool2D,GlobalMaxPooling1D 11 | from keras import optimizers 12 | from keras import regularizers 13 | from keras.layers import BatchNormalization 14 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 15 | from keras.utils import to_categorical 16 | import time 17 | import numpy as np 18 | from scipy import interp 19 | from sklearn import metrics 20 | from keras import backend as K 21 | from keras.models import load_model 22 | import csv 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | config = K.tf.ConfigProto() 26 | config.gpu_options.allow_growth = True 27 | session = K.tf.Session(config=config) 28 | 29 | Fname = 'malware_' 30 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 31 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 32 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 33 | 34 | with open("security_test.csv.pkl", "rb") as f: 35 | file_names = pickle.load(f) 36 | outfiles = pickle.load(f) 37 | with open("security_train.csv.pkl", "rb") as f: 38 | labels_d = pickle.load(f) 39 | with open("security_train.csv.pkl", "rb") as f: 40 | labels = pickle.load(f) 41 | files = pickle.load(f) 42 | maxlen = 6000 43 | 44 | 45 | # with open("wordsdic.pkl", 'rb') as f: 46 | # tokenizer = pickle.load(f) 47 | # 48 | 49 | labels = np.asarray(labels) 50 | 51 | labels = to_categorical(labels, num_classes=8) 52 | 53 | tokenizer = Tokenizer(num_words=None, 54 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 55 | split=' ', 56 | char_level=False, 57 | oov_token=None) 58 | tokenizer.fit_on_texts(files) 59 | 
tokenizer.fit_on_texts(outfiles) 60 | 61 | # with open("wordsdic.pkl", 'wb') as f: 62 | # pickle.dump(tokenizer, f) 63 | 64 | vocab = tokenizer.word_index 65 | print(tokenizer.word_index) 66 | print(len(vocab)) 67 | x_train_word_ids = tokenizer.texts_to_sequences(files) 68 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 69 | 70 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 71 | 72 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 73 | 74 | 75 | # with open('datasets.pkl', 'wb') as f: 76 | # pickle.dump(x_train_padded_seqs, f) 77 | # pickle.dump(x_out_padded_seqs, f) 78 | # pickle.dump(labels, f) 79 | 80 | 81 | # with open('datasets.pkl', 'rb') as f: 82 | # x_train_padded_seqs = pickle.load(f) 83 | # # x_test_padded_seqs = pickle.load(f) 84 | # x_out_padded_seqs = pickle.load(f) 85 | # # y_train = pickle.load(f) 86 | # # y_test = pickle.load(f) 87 | # labels = pickle.load(f) 88 | 89 | 90 | def TextCNN(): 91 | num_filters = 64 92 | kernel_size = [2, 4, 6, 8, 10] 93 | conv_action = 'relu' 94 | _input = Input(shape=(maxlen,), dtype='int32') 95 | _embed = Embedding(304, 256, input_length=maxlen)(_input) 96 | _embed = SpatialDropout1D(0.15)(_embed) 97 | warppers = [] 98 | for _kernel_size in kernel_size: 99 | conv1d = Conv1D(filters=32, kernel_size=_kernel_size, activation=conv_action, padding="same")(_embed) 100 | warppers.append(MaxPool1D(2)(conv1d)) 101 | 102 | fc = concatenate(warppers) 103 | fc = Flatten()(fc) 104 | fc = Dropout(0.5)(fc) 105 | # fc = BatchNormalization()(fc) 106 | fc = Dense(256, activation='relu')(fc) 107 | fc = Dropout(0.5)(fc) 108 | # fc = BatchNormalization()(fc) 109 | preds = Dense(8, activation='softmax')(fc) 110 | 111 | model = Model(inputs=_input, outputs=preds) 112 | 113 | model.compile(loss='categorical_crossentropy', 114 | optimizer='adam', 115 | metrics=['accuracy']) 116 | return model 117 | 118 | def dila(): 119 | main_input = Input(shape=(maxlen,), dtype='float64') 120 | _embed = Embedding(304, 256, input_length=maxlen)(main_input) 121 | _embed = SpatialDropout1D(0.25)(_embed) 122 | warppers = [] 123 | num_filters = 64 124 | kernel_size = [2, 3, 4, 5] 125 | conv_action = 'relu' 126 | for _kernel_size in kernel_size: 127 | for dilated_rate in [1, 2, 3, 4]: 128 | conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action, 129 | dilation_rate=dilated_rate)(_embed) 130 | warppers.append(GlobalMaxPooling1D()(conv1d)) 131 | 132 | fc = concatenate(warppers) 133 | fc = Dropout(0.5)(fc) 134 | # fc = BatchNormalization()(fc) 135 | fc = Dense(256, activation='relu')(fc) 136 | fc = Dropout(0.25)(fc) 137 | # fc = BatchNormalization()(fc) 138 | preds = Dense(8, activation='softmax')(fc) 139 | 140 | model = Model(inputs=main_input, outputs=preds) 141 | 142 | model.compile(loss='categorical_crossentropy', 143 | optimizer='adam', 144 | metrics=['accuracy']) 145 | return model 146 | def fasttext(): 147 | main_input = Input(shape=(maxlen,), dtype='float64') 148 | embedder = Embedding(304, 256, input_length=maxlen) 149 | embed = embedder(main_input) 150 | # cnn1模块,kernel_size = 3 151 | gb = GlobalAveragePooling1D()(embed) 152 | main_output = Dense(8, activation='softmax')(gb) 153 | model = Model(inputs=main_input, outputs=main_output) 154 | return model 155 | 156 | 157 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 158 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 159 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 160 | for i, (tr_ind, te_ind) in 
enumerate(skf.split(x_train_padded_seqs, labels_d)): 161 | print('FOLD: {}'.format(str(i))) 162 | print(len(te_ind), len(tr_ind)) 163 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 164 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 165 | 166 | model = dila() 167 | # model = load_model('model_weight.h5') 168 | # print(model.summary()) 169 | # exit() 170 | model.compile(loss='categorical_crossentropy', 171 | optimizer='adam', 172 | metrics=['accuracy']) 173 | model_save_path = './model/model_weight_testcnn_{}.h5'.format(str(i)) 174 | if i in [-1]: 175 | model = model.load_weights(model_save_path) 176 | print(model.evaluate(X_val, X_val_label)) 177 | else: 178 | ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='min', baseline=None, 179 | restore_best_weights=False) 180 | 181 | checkpoint = model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True) 182 | history = model.fit(X_train, X_train_label, 183 | batch_size=32, 184 | epochs=100, 185 | shuffle=True, 186 | validation_data=(X_val, X_val_label), callbacks=[tensorboard, ear,checkpoint]) 187 | model.load_weights(model_save_path) 188 | # model.save('./model/model_weight_TextCNN_1st_{}.h5'.format(str(i))) 189 | 190 | # model = load_model('model_weight.h5') 191 | pred_val = model.predict(X_val) 192 | pred_test = model.predict(x_out_padded_seqs) 193 | 194 | meta_train[te_ind] = pred_val 195 | meta_test += pred_test 196 | K.clear_session() 197 | meta_test /= 5.0 198 | with open("textcnn_result.pkl", 'wb') as f: 199 | pickle.dump(meta_train, f) 200 | pickle.dump(meta_test, f) 201 | 202 | # 203 | # with open("TextCNN_1st_result.pkl", 'wb') as f: 204 | # pickle.dump(train_result, f) 205 | # pickle.dump(out_result, f) 206 | # 207 | # result = model.predict(x_out_padded_seqs) 208 | # out = [] 209 | # for i in range(len(file_names)): 210 | # tmp = [] 211 | # a = result[i].tolist() 212 | # # for j in range(len(a)): 213 | # # a[j] = ("%.5f" % a[j]) 214 | # 215 | # tmp.append(file_names[i]) 216 | # tmp.extend(a) 217 | # out.append(tmp) 218 | # with open("result_textcnn_TextCNN_1st.csv", "w", newline='') as csvfile: 219 | # writer = csv.writer(csvfile) 220 | # 221 | # # 先写入columns_name 222 | # writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 223 | # ]) 224 | # # 写入多行用writerows 225 | # writer.writerows(out) 226 | -------------------------------------------------------------------------------- /train_lstm2.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from keras.preprocessing.sequence import pad_sequences 3 | from keras_preprocessing.text import Tokenizer 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 6 | SimpleRNNCell, SpatialDropout1D, Add, Maximum 7 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, AveragePooling1D 8 | from keras import optimizers 9 | from keras import regularizers 10 | from keras.layers import BatchNormalization 11 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 12 | from keras.utils import to_categorical 13 | import time 14 | import numpy as np 15 | from keras import backend as K 16 | from sklearn.model_selection import StratifiedKFold 17 | 18 | config = K.tf.ConfigProto() 19 | config.gpu_options.allow_growth = True 20 | session 
= K.tf.Session(config=config) 21 | 22 | Fname = 'malware_' 23 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 24 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 25 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 26 | 27 | with open("security_test.csv.pkl", "rb") as f: 28 | file_names = pickle.load(f) 29 | outfiles = pickle.load(f) 30 | with open("security_train.csv.pkl", "rb") as f: 31 | labels_d = pickle.load(f) 32 | with open("security_train.csv.pkl", "rb") as f: 33 | labels = pickle.load(f) 34 | files = pickle.load(f) 35 | maxlen = 6000 36 | 37 | 38 | # with open("wordsdic.pkl", 'rb') as f: 39 | # tokenizer = pickle.load(f) 40 | # 41 | 42 | labels = np.asarray(labels) 43 | 44 | labels = to_categorical(labels, num_classes=8) 45 | 46 | tokenizer = Tokenizer(num_words=None, 47 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 48 | split=' ', 49 | char_level=False, 50 | oov_token=None) 51 | tokenizer.fit_on_texts(files) 52 | tokenizer.fit_on_texts(outfiles) 53 | 54 | # with open("wordsdic.pkl", 'wb') as f: 55 | # pickle.dump(tokenizer, f) 56 | 57 | vocab = tokenizer.word_index 58 | print(tokenizer.word_index) 59 | print(len(vocab)) 60 | x_train_word_ids = tokenizer.texts_to_sequences(files) 61 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 62 | 63 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 64 | 65 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 66 | 67 | 68 | # with open('datasets.pkl', 'wb') as f: 69 | # pickle.dump(x_train_padded_seqs, f) 70 | # pickle.dump(x_out_padded_seqs, f) 71 | # pickle.dump(labels, f) 72 | 73 | 74 | # with open('datasets.pkl', 'rb') as f: 75 | # x_train_padded_seqs = pickle.load(f) 76 | # # x_test_padded_seqs = pickle.load(f) 77 | # x_out_padded_seqs = pickle.load(f) 78 | # # y_train = pickle.load(f) 79 | # # y_test = pickle.load(f) 80 | # labels = pickle.load(f) 81 | 82 | 83 | def mulitl_version_lstm(): 84 | embed_size = 256 85 | num_filters = 64 86 | kernel_size = [3, 5, 7] 87 | main_input = Input(shape=(maxlen,)) 88 | emb = Embedding(304, 256, input_length=maxlen)(main_input) 89 | # _embed = SpatialDropout1D(0.15)(emb) 90 | warppers = [] 91 | warppers2 = [] # 0.42 92 | warppers3 = [] 93 | for _kernel_size in kernel_size: 94 | conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(emb) 95 | warppers.append(AveragePooling1D(2)(conv1d)) 96 | for (_kernel_size, cnn) in zip(kernel_size, warppers): 97 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 98 | warppers2.append(AveragePooling1D(2)(conv1d_2)) 99 | for (_kernel_size, cnn) in zip(kernel_size, warppers2): 100 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 101 | warppers3.append(AveragePooling1D(2)(conv1d_2)) 102 | fc = Maximum()(warppers3) 103 | rl = CuDNNLSTM(512)(fc) 104 | main_output = Dense(8, activation='softmax')(rl) 105 | model = Model(inputs=main_input, outputs=main_output) 106 | return model 107 | 108 | 109 | def Build(): 110 | main_input = Input(shape=(maxlen,), dtype='float64') 111 | embedder = Embedding(304, 256, input_length=maxlen) 112 | embed = embedder(main_input) 113 | # avg = GlobalAveragePooling1D()(embed) 114 | # cnn1模块,kernel_size = 3 115 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(embed) 116 | 117 | conv1_2 = Conv1D(64, 3, padding='same', 
activation='relu')(conv1_1) 118 | 119 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 120 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 121 | 122 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 123 | 124 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 125 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 126 | 127 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 128 | 129 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 130 | rl = CuDNNLSTM(256)(cnn1) 131 | # flat = Flatten()(cnn3) 132 | # drop = Dropout(0.5)(flat) 133 | fc = Dense(256)(rl) 134 | 135 | main_output = Dense(8, activation='softmax')(rl) 136 | model = Model(inputs=main_input, outputs=main_output) 137 | return model 138 | 139 | 140 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 141 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 142 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 143 | for i, (tr_ind, te_ind) in enumerate(skf.split(x_train_padded_seqs, labels_d)): 144 | print('FOLD: {}'.format(str(i))) 145 | print(len(te_ind), len(tr_ind)) 146 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 147 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 148 | 149 | model = Build() 150 | # model = load_model('model_weight.h5') 151 | print(model.summary()) 152 | # exit() 153 | model.compile(loss='categorical_crossentropy', 154 | optimizer='adam', 155 | metrics=['accuracy']) 156 | model_save_path = './model/model_weight_cnn_lstm_{}.h5'.format(str(i)) 157 | print(model_save_path) 158 | if i in [-1]: 159 | model.load_weights(model_save_path) 160 | print(model.evaluate(X_val, X_val_label)) 161 | else: 162 | 163 | checkpoint = model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True) 164 | ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min', baseline=None, 165 | restore_best_weights=False) 166 | history = model.fit(X_train, X_train_label, 167 | batch_size=128, 168 | epochs=100, 169 | validation_data=(X_val, X_val_label), callbacks=[checkpoint, ear]) 170 | 171 | # model.save('./model/model_weight_cnn_lstm_{}.h5'.format(str(i))) 172 | model.load_weights(model_save_path) 173 | # model = load_model('model_weight.h5') 174 | pred_val = model.predict(X_val) 175 | pred_test = model.predict(x_out_padded_seqs) 176 | 177 | meta_train[te_ind] = pred_val 178 | meta_test += pred_test 179 | K.clear_session() 180 | 181 | meta_test /= 5.0 182 | with open("cnn_lstm_result.pkl", 'wb') as f: 183 | pickle.dump(meta_train, f) 184 | pickle.dump(meta_test, f) 185 | 186 | # result = model.predict(x_out_padded_seqs) 187 | # out = [] 188 | # for i in range(len(file_names)): 189 | # tmp = [] 190 | # a = result[i].tolist() 191 | # # for j in range(len(a)): 192 | # # a[j] = ("%.5f" % a[j]) 193 | # 194 | # tmp.append(file_names[i]) 195 | # tmp.extend(a) 196 | # out.append(tmp) 197 | # with open("result_lstm.csv", "w", newline='') as csvfile: 198 | # writer = csv.writer(csvfile) 199 | # 200 | # # 先写入columns_name 201 | # writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7" 202 | # ]) 203 | # # 写入多行用writerows 204 | # writer.writerows(out) 205 | 206 | 207 | # def mulitl_version_lstm(): 208 | # embed_size = 256 209 | # num_filters = 64 210 | # kernel_size = [3, 5, 7] 211 | # main_input = Input(shape=(maxlen,)) 212 | # emb = Embedding(304, 256, input_length=maxlen)(main_input) 213 | # _embed = SpatialDropout1D(0.15)(emb) 214 
| # warppers = [] 215 | # warppers2 = [] 216 | # for _kernel_size in kernel_size: 217 | # conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(_embed) 218 | # warppers.append(MaxPool1D(2)(conv1d)) 219 | # for (_kernel_size, cnn) in zip(kernel_size, warppers): 220 | # conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 221 | # warppers2.append(MaxPool1D(2)(conv1d_2)) 222 | # fc = Add()(warppers2) 223 | # rl = CuDNNLSTM(512)(fc) 224 | # main_output = Dense(8, activation='softmax')(rl) 225 | # model = Model(inputs=main_input, outputs=main_output) 226 | # return model 227 | -------------------------------------------------------------------------------- /train_lstm.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from keras.preprocessing.sequence import pad_sequences 3 | from keras_preprocessing.text import Tokenizer 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape, LSTM, RNN, CuDNNLSTM, \ 6 | SimpleRNNCell, SpatialDropout1D, Add, Maximum 7 | from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D, concatenate, AveragePooling1D 8 | from keras import optimizers 9 | from keras import regularizers 10 | from keras.layers import BatchNormalization 11 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint 12 | from keras.utils import to_categorical 13 | import time 14 | import numpy as np 15 | from keras import backend as K 16 | from sklearn.model_selection import StratifiedKFold 17 | 18 | config = K.tf.ConfigProto() 19 | config.gpu_options.allow_growth = True 20 | session = K.tf.Session(config=config) 21 | 22 | Fname = 'malware_' 23 | Time = Fname + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) 24 | tensorboard = TensorBoard(log_dir='./Logs/' + Time, histogram_freq=0, write_graph=False, write_images=False, 25 | embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 26 | 27 | with open("security_test.csv.pkl", "rb") as f: 28 | file_names = pickle.load(f) 29 | outfiles = pickle.load(f) 30 | with open("security_train.csv.pkl", "rb") as f: 31 | labels_d = pickle.load(f) 32 | with open("security_train.csv.pkl", "rb") as f: 33 | labels = pickle.load(f) 34 | files = pickle.load(f) 35 | maxlen = 6000 36 | 37 | 38 | # with open("wordsdic.pkl", 'rb') as f: 39 | # tokenizer = pickle.load(f) 40 | # 41 | 42 | labels = np.asarray(labels) 43 | 44 | labels = to_categorical(labels, num_classes=8) 45 | 46 | tokenizer = Tokenizer(num_words=None, 47 | filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 48 | split=' ', 49 | char_level=False, 50 | oov_token=None) 51 | tokenizer.fit_on_texts(files) 52 | tokenizer.fit_on_texts(outfiles) 53 | 54 | # with open("wordsdic.pkl", 'wb') as f: 55 | # pickle.dump(tokenizer, f) 56 | 57 | vocab = tokenizer.word_index 58 | print(tokenizer.word_index) 59 | print(len(vocab)) 60 | x_train_word_ids = tokenizer.texts_to_sequences(files) 61 | x_out_word_ids = tokenizer.texts_to_sequences(outfiles) 62 | 63 | x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) 64 | 65 | x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) 66 | 67 | 68 | # with open('datasets.pkl', 'wb') as f: 69 | # pickle.dump(x_train_padded_seqs, f) 70 | # pickle.dump(x_out_padded_seqs, f) 71 | # pickle.dump(labels, f) 72 | 73 | 74 | # with open('datasets.pkl', 'rb') as f: 75 | # 
x_train_padded_seqs = pickle.load(f) 76 | # # x_test_padded_seqs = pickle.load(f) 77 | # x_out_padded_seqs = pickle.load(f) 78 | # # y_train = pickle.load(f) 79 | # # y_test = pickle.load(f) 80 | # labels = pickle.load(f) 81 | 82 | 83 | def mulitl_version_lstm(): 84 | embed_size = 256 85 | num_filters = 64 86 | kernel_size = [3, 5, 7] 87 | main_input = Input(shape=(maxlen,)) 88 | emb = Embedding(304, 256, input_length=maxlen)(main_input) 89 | # _embed = SpatialDropout1D(0.15)(emb) 90 | warppers = [] 91 | warppers2 = [] # 0.42 92 | warppers3 = [] 93 | for _kernel_size in kernel_size: 94 | conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(emb) 95 | warppers.append(AveragePooling1D(2)(conv1d)) 96 | for (_kernel_size, cnn) in zip(kernel_size, warppers): 97 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 98 | warppers2.append(AveragePooling1D(2)(conv1d_2)) 99 | for (_kernel_size, cnn) in zip(kernel_size, warppers2): 100 | conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn) 101 | warppers3.append(AveragePooling1D(2)(conv1d_2)) 102 | fc = Maximum()(warppers3) 103 | rl = CuDNNLSTM(512)(fc) 104 | main_output = Dense(8, activation='softmax')(rl) 105 | model = Model(inputs=main_input, outputs=main_output) 106 | return model 107 | 108 | 109 | def Build(): 110 | main_input = Input(shape=(maxlen,), dtype='float64') 111 | embedder = Embedding(304, 256, input_length=maxlen) 112 | embed = embedder(main_input) 113 | # avg = GlobalAveragePooling1D()(embed) 114 | # cnn1模块,kernel_size = 3 115 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(embed) 116 | 117 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 118 | 119 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 120 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 121 | 122 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 123 | 124 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 125 | conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1) 126 | 127 | conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1) 128 | 129 | cnn1 = MaxPool1D(pool_size=2)(conv1_2) 130 | rl = CuDNNLSTM(256)(cnn1) 131 | # flat = Flatten()(cnn3) 132 | # drop = Dropout(0.5)(flat) 133 | fc = Dense(256)(rl) 134 | 135 | main_output = Dense(8, activation='softmax')(rl) 136 | model = Model(inputs=main_input, outputs=main_output) 137 | return model 138 | 139 | 140 | meta_train = np.zeros(shape=(len(x_train_padded_seqs), 8)) 141 | meta_test = np.zeros(shape=(len(x_out_padded_seqs), 8)) 142 | skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True) 143 | for i, (tr_ind, te_ind) in enumerate(skf.split(x_train_padded_seqs, labels_d)): 144 | print('FOLD: {}'.format(str(i))) 145 | print(len(te_ind), len(tr_ind)) 146 | X_train, X_train_label = x_train_padded_seqs[tr_ind], labels[tr_ind] 147 | X_val, X_val_label = x_train_padded_seqs[te_ind], labels[te_ind] 148 | 149 | model = mulitl_version_lstm() 150 | # model = load_model('model_weight.h5') 151 | print(model.summary()) 152 | # exit() 153 | model.compile(loss='categorical_crossentropy', 154 | optimizer='adam', 155 | metrics=['accuracy']) 156 | model_save_path = './model/model_weight_mulitl_version_lstm_{}.h5'.format(str(i)) 157 | print(model_save_path) 158 | if i in [-1]: 159 | model.load_weights(model_save_path) 160 | print(model.evaluate(X_val, X_val_label)) 161 | else: 162 | 163 | checkpoint = 
model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)
164 |         ear = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min', baseline=None,
165 |                             restore_best_weights=False)
166 |         history = model.fit(X_train, X_train_label,
167 |                             batch_size=128,
168 |                             epochs=100,
169 |                             validation_data=(X_val, X_val_label), callbacks=[checkpoint, ear])
170 | 
171 |     # model.save('./model/model_weight_cnn_lstm_{}.h5'.format(str(i)))
172 |     model.load_weights(model_save_path)
173 |     # model = load_model('model_weight.h5')
174 |     pred_val = model.predict(X_val)
175 |     pred_test = model.predict(x_out_padded_seqs)
176 | 
177 |     meta_train[te_ind] = pred_val
178 |     meta_test += pred_test
179 |     K.clear_session()
180 | 
181 | meta_test /= 5.0
182 | with open("mulitl_version_lstm_result.pkl", 'wb') as f:
183 |     pickle.dump(meta_train, f)
184 |     pickle.dump(meta_test, f)
185 | 
186 | # result = model.predict(x_out_padded_seqs)
187 | # out = []
188 | # for i in range(len(file_names)):
189 | #     tmp = []
190 | #     a = result[i].tolist()
191 | #     # for j in range(len(a)):
192 | #     #     a[j] = ("%.5f" % a[j])
193 | #
194 | #     tmp.append(file_names[i])
195 | #     tmp.extend(a)
196 | #     out.append(tmp)
197 | # with open("result_lstm.csv", "w", newline='') as csvfile:
198 | #     writer = csv.writer(csvfile)
199 | #
200 | #     # write the header row (column names) first
201 | #     writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7"
202 | #                      ])
203 | #     # use writerows to write all prediction rows at once
204 | #     writer.writerows(out)
205 | 
206 | 
207 | # def mulitl_version_lstm():
208 | #     embed_size = 256
209 | #     num_filters = 64
210 | #     kernel_size = [3, 5, 7]
211 | #     main_input = Input(shape=(maxlen,))
212 | #     emb = Embedding(304, 256, input_length=maxlen)(main_input)
213 | #     _embed = SpatialDropout1D(0.15)(emb)
214 | #     warppers = []
215 | #     warppers2 = []
216 | #     for _kernel_size in kernel_size:
217 | #         conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(_embed)
218 | #         warppers.append(MaxPool1D(2)(conv1d))
219 | #     for (_kernel_size, cnn) in zip(kernel_size, warppers):
220 | #         conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn)
221 | #         warppers2.append(MaxPool1D(2)(conv1d_2))
222 | #     fc = Add()(warppers2)
223 | #     rl = CuDNNLSTM(512)(fc)
224 | #     main_output = Dense(8, activation='softmax')(rl)
225 | #     model = Model(inputs=main_input, outputs=main_output)
226 | #     return model
227 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Machine Learning for Malware Detection

edited by Raymond Luo

## Environment

```bash
Python 3
RTX 2080, i7-9700K, 16 GB RAM
Ubuntu 18.04
```

## Requirements

TensorFlow==1.14
keras==2.3.1

```bash
sudo pip3 install -r requirements.txt
```

Please put `security_train.xlsx` and `security_test.xlsx` into the corresponding folders (`security_train/` and `security_test/`). Note that `loadfile.py` reads `security_train/security_train.csv` and `security_test/security_test.csv`, so export the spreadsheets to CSV first.

## Loadfile

```bash
python3 loadfile.py
```

## Train multi-vision LSTM

```bash
python3 train_lstm.py
```

## Train CNN-LSTM

```bash
python3 train_lstm2.py
```

## Train TextCNN-LSTM

```bash
python3 train_lstm3.py
```

## Train TextCNN

```bash
python3 train_textcnn.py
```

## Train TF-IDF model

```bash
python3 xgdboost.py
```
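
Before stacking, make sure the five training steps above have all finished: `stack_result.py` loads the out-of-fold prediction pickles written by each script. A small sanity-check sketch (not part of the repository; the file names are the ones used by `stack_result.py`):

```python
from pathlib import Path

# Out-of-fold prediction files that stack_result.py expects to find.
expected = [
    "tfidf_result.pkl",                # xgdboost.py
    "textcnn_result.pkl",              # train_textcnn.py
    "mulitl_version_lstm_result.pkl",  # train_lstm.py
    "cnn_lstm_result.pkl",             # train_lstm2.py
    "textcnn_lstm_result.pkl",         # train_lstm3.py
]
missing = [name for name in expected if not Path(name).is_file()]
if missing:
    print("Run the corresponding training scripts first; missing:", missing)
else:
    print("All intermediate results found, ready to stack.")
```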

## Stack model

```bash
python3 stack_result.py
```

## Result

The result will be written to the `submit` folder (create the folder first if it does not exist).

# FILE PREPROCESS

### Data format

| field   | type   | explanation                                                  |
| ------- | ------ | ------------------------------------------------------------ |
| file_id | bigint | ID of the file.                                              |
| label   | bigint | Label of the file: 0 - normal / 1 - ransomware / 2 - mining program / 3 - DDoS trojan / 4 - worm / 5 - infective virus / 6 - backdoor / 7 - trojan. |
| api     | string | The API call list of the file.                               |
| tid     | bigint | The thread number of the file.                               |
| index   | string | The order of the API call.                                   |

----
[Alibaba Cloud Malware Detection Competition (阿里云安全恶意程序检测)](https://tianchi.aliyun.com/competition/entrance/231694/introduction?spm=5176.12281925.0.0.60c17137jVy9vv)

### Our method:

First, we group the records by `file_id`. Within each file, we then group the API calls by `tid` and sort them by `index`. Finally, we concatenate all threads into one line to obtain the final representation of one sample.

```
Sample
LdrLoadDll LdrGetProcedureAddress LdrGetProcedureAddress LdrGetProcedureAddress LdrGetProcedureAddress.......
label 5
```

# MODEL

For the best score we use 5 models and stack them to get our final score.

## TF-IDF model:

In order to capture the global information of each sample, we first extract TF-IDF features computed on 1- to 5-grams of the preprocessed API sequence.

```python
vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, )  # TF-IDF feature extraction, ngram_range=(1, 5)
```

Then we classify the samples with XGBoost.

```python
param = {'max_depth': 6, 'eta': 0.1, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob',
         'num_class': 8, 'subsample': 0.8,
         'colsample_bytree': 0.85}  # parameters

evallist = [(dtrain, 'train'), (dtest, 'val')]  # evaluation sets
num_round = 300  # maximum number of boosting rounds
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50)
```

## TextCNN model

In this model, we use a TextCNN to extract the features and classify the samples.

Each sample is cut to a maximum length of 6000 and the labels are one-hot encoded.

The sample goes through an embedding layer and then into the CNN layers.

We use five different kernel sizes (2, 4, 6, 8, 10), and each convolutional layer has 32 filters.

After the CNN layers, we concatenate the features extracted by the CNN layers and classify them.

```python
main_input = Input(shape=(maxlen,), dtype='float64')
_embed = Embedding(304, 256, input_length=maxlen)(main_input)
_embed = SpatialDropout1D(0.25)(_embed)
warppers = []
num_filters = 64
kernel_size = [2, 3, 4, 5]
conv_action = 'relu'
for _kernel_size in kernel_size:
    for dilated_rate in [1, 2, 3, 4]:
        conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action,
                        dilation_rate=dilated_rate)(_embed)
        warppers.append(GlobalMaxPooling1D()(conv1d))

fc = concatenate(warppers)
fc = Dropout(0.5)(fc)
# fc = BatchNormalization()(fc)
fc = Dense(256, activation='relu')(fc)
fc = Dropout(0.25)(fc)
# fc = BatchNormalization()(fc)
preds = Dense(8, activation='softmax')(fc)

model = Model(inputs=main_input, outputs=preds)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
return model

```

## CNN-LSTM model

In order to capture the sequence information of the samples, we use three LSTM-based models.

The first one is a simple CNN+LSTM in which all convolutional layers share the same kernel size.

```python
main_input = Input(shape=(maxlen,), dtype='float64')
embedder = Embedding(304, 256, input_length=maxlen)
embed = embedder(main_input)
# avg = GlobalAveragePooling1D()(embed)
# CNN block 1, kernel_size = 3
conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(embed)

conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1)

cnn1 = MaxPool1D(pool_size=2)(conv1_2)
conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1)

conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1)

cnn1 = MaxPool1D(pool_size=2)(conv1_2)
conv1_1 = Conv1D(64, 3, padding='same', activation='relu')(cnn1)

conv1_2 = Conv1D(64, 3, padding='same', activation='relu')(conv1_1)

cnn1 = MaxPool1D(pool_size=2)(conv1_2)
rl = CuDNNLSTM(256)(cnn1)
# flat = Flatten()(cnn3)
# drop = Dropout(0.5)(flat)
fc = Dense(256)(rl)

main_output = Dense(8, activation='softmax')(rl)
model = Model(inputs=main_input, outputs=main_output)
return model

```

## Multi-vision LSTM

Inspired by our TextCNN method, we reasoned that if different kernel sizes give different "visions" of the sequence, we can also build a multi-vision LSTM model. We use three different kernel sizes (3, 5, 7). Instead of arranging them in a linear order, we use each of them to extract features from the embedding layer independently. Between the layers we use average pooling instead of max pooling, so that the pooled sequence keeps information about several API calls rather than only one. After the CNN layers we obtain three feature maps of the same size. We then take the element-wise maximum
$$
\max_{\text{element-wise}}(v_1, v_2, v_3)
$$
to keep the most significant feature of the different visions. After that we use an LSTM layer to analyze the sequence and classify it.

```python
embed_size = 256
num_filters = 64
kernel_size = [3, 5, 7]
main_input = Input(shape=(maxlen,))
emb = Embedding(304, 256, input_length=maxlen)(main_input)
# _embed = SpatialDropout1D(0.15)(emb)
warppers = []
warppers2 = []  # 0.42
warppers3 = []
for _kernel_size in kernel_size:
    conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(emb)
    warppers.append(AveragePooling1D(2)(conv1d))
for (_kernel_size, cnn) in zip(kernel_size, warppers):
    conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn)
    warppers2.append(AveragePooling1D(2)(conv1d_2))
for (_kernel_size, cnn) in zip(kernel_size, warppers2):
    conv1d_2 = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation='relu', padding='same')(cnn)
    warppers3.append(AveragePooling1D(2)(conv1d_2))
fc = Maximum()(warppers3)
rl = CuDNNLSTM(512)(fc)
main_output = Dense(8, activation='softmax')(rl)
model = Model(inputs=main_input, outputs=main_output)
return model
```

## TextCNN-LSTM

This model is mostly the same as the multi-vision LSTM, but we concatenate the three feature maps instead of taking their element-wise maximum. We believe this does not disturb the order of the sequence, because within each part the order of the API calls is unchanged.

```python
main_input = Input(shape=(maxlen,), dtype='float64')

embedder = Embedding(304, 256, input_length=maxlen)
embed = embedder(main_input)
# CNN block 1, kernel_size = 3
conv1_1 = Conv1D(16, 3, padding='same')(embed)
bn1_1 = BatchNormalization()(conv1_1)
relu1_1 = Activation('relu')(bn1_1)
conv1_2 = Conv1D(32, 3, padding='same')(relu1_1)
bn1_2 = BatchNormalization()(conv1_2)
relu1_2 = Activation('relu')(bn1_2)
cnn1 = MaxPool1D(pool_size=4)(relu1_2)
# CNN block 2, kernel_size = 4
conv2_1 = Conv1D(16, 4, padding='same')(embed)
bn2_1 = BatchNormalization()(conv2_1)
relu2_1 = Activation('relu')(bn2_1)
conv2_2 = Conv1D(32, 4, padding='same')(relu2_1)
bn2_2 = BatchNormalization()(conv2_2)
relu2_2 = Activation('relu')(bn2_2)
cnn2 = MaxPool1D(pool_size=4)(relu2_2)
# CNN block 3, kernel_size = 5
conv3_1 = Conv1D(16, 5, padding='same')(embed)
bn3_1 = BatchNormalization()(conv3_1)
relu3_1 = Activation('relu')(bn3_1)
conv3_2 = Conv1D(32, 5, padding='same')(relu3_1)
bn3_2 = BatchNormalization()(conv3_2)
relu3_2 = Activation('relu')(bn3_2)
cnn3 = MaxPool1D(pool_size=4)(relu3_2)
# concatenate the three blocks
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
lstm = CuDNNLSTM(256)(cnn)
f = Flatten()(cnn1)
fc = Dense(256, activation='relu')(f)
D = Dropout(0.5)(fc)
main_output = Dense(8, activation='softmax')(lstm)
model = Model(inputs=main_input, outputs=main_output)
return model
```

### Stack result

We stack the out-of-fold results of the models described above.

```python
train = np.hstack([tfidf_train_result, textcnn_train_result, mulitl_version_lstm_train_result, cnn_train_result,
                   textcnn_lstm_train_result])
test = np.hstack(
    [tfidf_out_result, textcnn_out_result, mulitl_version_lstm_test_result, cnn_out_result, textcnn_lstm_test_result])
meta_test = np.zeros(shape=(len(outfiles), 8))
skf = StratifiedKFold(n_splits=5, random_state=4, shuffle=True)
dout = xgb.DMatrix(test)
for i, (tr_ind, te_ind) in enumerate(skf.split(train, labels)):
    print('FOLD: {}'.format(str(i)))
    X_train, X_train_label = train[tr_ind], labels[tr_ind]
    X_val, X_val_label = train[te_ind], labels[te_ind]
    dtrain = xgb.DMatrix(X_train, label=X_train_label)
    dtest = xgb.DMatrix(X_val, X_val_label)  # the label is optional here; it lets us monitor the validation score

    param = {'max_depth': 6, 'eta': 0.01, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob',
             'num_class': 8, 'subsample': 0.9,
             'colsample_bytree': 0.85}  # parameters
    evallist = [(dtrain, 'train'), (dtest, 'val')]  # evaluation sets
    num_round = 10000  # maximum number of boosting rounds
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=100)
    preds = bst.predict(dout)
    meta_test += preds

meta_test /= 5.0
result = meta_test
```
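
The averaged fold probabilities in `result` are then written out as the submission file. The snippet below is the tail of `stack_result.py`, lightly condensed for the readme; note that the `submit` directory must exist before the script is run:

```python
import csv
import time

# Build one row per test file: file_id followed by the eight class probabilities.
out = []
for i in range(len(file_names)):
    tmp = [file_names[i]]
    tmp.extend(result[i].tolist())
    out.append(tmp)

submit_name = "./submit/mulltimodel_xgd_boost_tf+cnn_mlstm{}.csv".format(
    time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))
with open(submit_name, "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    # write the header row first
    writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3",
                     "prob4", "prob5", "prob6", "prob7"])
    # writerows writes all prediction rows at once
    writer.writerows(out)
```
--------------------------------------------------------------------------------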