├── param_search
│   ├── __init__.py
│   ├── test.sh
│   ├── larger_cluster_and_others.py
│   └── best_cluster_search.py
├── feature_extraction
│   ├── __init__.py
│   ├── save_embedding.py
│   ├── save_autoencoder_mid_layer.py
│   ├── nn_train.py
│   ├── nn_autoencoder_stage2.py
│   ├── cluster_static_features.py
│   └── tfidf_features.py
├── one_time_use
│   └── save_api_dict.py
├── .gitignore
├── metrics.py
├── cluster_performance_evaluate.py
├── basic_function.py
├── README.md
├── server_cluster.py
├── test_prepare_dataset.py
├── param_search_cluster.py
├── model.py
├── xml_to_csv.py
└── prepare_dataset.py

/param_search/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/feature_extraction/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/param_search/test.sh:
--------------------------------------------------------------------------------
1 | for ((i = 2 ; i < 4 ; i++)); do
2 |     echo "python ../server_cluster.py -d 0 -c 0 -nc 210 -l $i"
3 |     python ../server_cluster.py -d 0 -c 0 -nc 210 -l $i
4 | done
5 |
--------------------------------------------------------------------------------
/feature_extraction/save_embedding.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from model import load_model
4 | import numpy as np
5 |
6 | model = load_model(os.path.join("models", "weights.01-0.06924324.hdf5"), load_type=0)
7 | weight = model.layers[1].get_weights()[0]
8 | np.save("weight", weight)
--------------------------------------------------------------------------------
/one_time_use/save_api_dict.py:
--------------------------------------------------------------------------------
1 | from basic_function import load_df, save_dict
2 | import pandas as pd
3 |
4 | full = load_df("../features/stage2")
5 |
6 | # full.fillna("None", inplace=True)
7 | api_list = set(full['api_name'])
8 | print(len(api_list))
9 | api_dict = dict(zip(api_list, range(len(api_list))))
10 | save_dict(api_dict, "../features/api_dict.txt")
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | \.idea/
2 | .idea/
3 | __pycache__/
4 | *.png
5 | transactions.csv
6 | data/
7 | *.py[cod]
8 | *.so
9 | *.egg
10 | *.egg-info
11 | dist
12 | build
13 | *.ZE
14 | checkpoint
15 | *.meta
16 |
17 | *.index
18 | *.ckpt.data*
19 | *.tsv
20 | *.pbtxt
21 | *.csv
22 | *temp*
23 | *.h5
24 | *.txt
25 | *.rar
26 | model\.h5
27 | deep_learning/model.h5
28 | *.model
29 | .ipynb_checkpoints/
30 | *.npz
31 | *.data
32 | *.dat
33 | *.png
34 | one_time_use/.ipynb_checkpoints/*
35 | train_dict
36 | *.npy
37 | *.pkl
38 | feature_vectors/
39 | prediction/
40 | *.dirlock
41 | *.tmp
42 | basic_model/classifiers/catboost_info/
43 | model
44 | *.hdf5
45 | *.zip
46 | *.7z
47 | logs/
48 | models/
49 | *.wav
50 | dask-worker-space/
51 | feature_definition*
52 | test_name_list
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | def com_acc(y_true, y_pre, threshold):
2 |     try:
3 |         # y_true may be a pandas Series; fall back to its underlying numpy values
4 |         y_true = y_true.values
5 |     except AttributeError:
6 |         pass
7 |     score = 0
8 |     for i in range(len(y_pre)):
9 |         sk = 1 if y_pre[i] >= threshold else 0
10 |
score += int(sk == y_true[i]) - (sk - y_true[i])*sk 11 | return score/len(y_true) 12 | 13 | 14 | def com_acc_keras(threshold): 15 | def acc(y_true, y_pre): 16 | y_true = y_true 17 | score = 0 18 | print(y_pre.shape) 19 | for i in range(y_pre.shape[0]): 20 | sk = 1 if y_pre[i] >= threshold else 0 21 | score += int(sk == y_true[i]) - (sk - y_true[i]) * sk 22 | return score / len(y_true) 23 | return acc 24 | -------------------------------------------------------------------------------- /feature_extraction/save_autoencoder_mid_layer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from keras_preprocessing.sequence import pad_sequences 4 | 5 | from model import load_model 6 | import numpy as np 7 | 8 | now = datetime.datetime.now() 9 | time_name = str(now.month) + "_" + str(now.day) + "_" + str(now.hour) + "_" + str( 10 | now.minute) + "_" + "features_plus_nn_mid" 11 | model = load_model("./models/weights.01-0.03171408.hdf5", load_type=2) 12 | shape = (512, 64) 13 | input_dim = 92 + 1 14 | batch_size = 16 15 | epochs = 50 16 | class_num = 2 17 | 18 | final_api_list = np.load("api_list_stage2.npy") 19 | # final_api_list = np.load("api_list_nn.npy") 20 | # label = np.load("api_list_nn.npy") 21 | 22 | fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32', padding='post', truncating='post', 23 | value=input_dim-1) 24 | 25 | features = model.predict(fixed_sequence) 26 | 27 | np.save("nn_features", features) 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /cluster_performance_evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from sklearn import metrics 4 | 5 | from prepare_dataset import load_stage2_tf_idf 6 | import pandas as pd 7 | 8 | 9 | def evaluate_cluster_performance(X, labels): 10 | sc = metrics.silhouette_score(X, labels, metric='euclidean') 11 | chs = metrics.calinski_harabaz_score(X, labels) 12 | dbs = metrics.davies_bouldin_score(X, labels) 13 | print("silhouette_score:", sc) 14 | print("calinski_harabaz_score:", chs) 15 | print("davies_bouldin_score:", dbs) 16 | return [sc, chs, dbs] 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser("cluster", fromfile_prefix_chars='@') 21 | parser.add_argument('-n', '--class_file', type=str, help='class_file path') 22 | 23 | args = parser.parse_args() 24 | 25 | train_data = load_stage2_tf_idf("") 26 | labels = pd.read_csv(args.class_file) 27 | full = pd.merge(train_data, labels, "left", left_on="file_name", right_on="id") 28 | 29 | labels = full["family_id"] 30 | # full.drop(columns=["family_id", "id"], inplace=True) 31 | train_data.drop(columns=["file_name"], inplace=True) 32 | 33 | evaluate_cluster_performance(train_data, labels) -------------------------------------------------------------------------------- /basic_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from numpy import int64 5 | from numpy import nan, array 6 | 7 | 8 | def get_root_path() -> str: 9 | return os.path.abspath(os.path.dirname(__file__)) 10 | 11 | 12 | def useless(): 13 | return nan, array, int64 14 | 15 | 16 | def extract_id_from_file_name(x: str): 17 | return x.split('.')[0] 18 | 19 | 20 | def load_df(dir_name, mode=0): 21 | files = os.listdir(dir_name) 22 | if mode == 0: 23 | k = [pd.read_csv(os.path.join(dir_name, file))[['file_name', 
'api_name']] for file in files] 24 | else: 25 | k = [pd.read_csv(os.path.join(dir_name, file)) for file in files] 26 | return pd.concat(k) 27 | 28 | 29 | def make_dir(name): 30 | if not os.path.exists(name): 31 | os.makedirs(name) 32 | 33 | 34 | def get_file_list_in_dir(path): 35 | return [name for name in os.listdir(path) if not os.path.isdir(os.path.join(path, name))] 36 | 37 | 38 | def save_dict(dictionary, path): 39 | with open(path, "w") as f: 40 | f.write(str(dictionary)) 41 | return 42 | 43 | 44 | def load_dict(path): 45 | with open(path, "r") as f: 46 | dic = f.read() 47 | return eval(dic) 48 | -------------------------------------------------------------------------------- /param_search/larger_cluster_and_others.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from queue import Queue 3 | 4 | from param_search_cluster import train_cluster 5 | 6 | 7 | def print_param(**kwargs): 8 | result = train_cluster(**kwargs) 9 | print(kwargs) 10 | return result 11 | 12 | 13 | def worker(): 14 | while True: 15 | param = q.get() 16 | print_param(**param) 17 | q.task_done() 18 | 19 | 20 | use_limited_thread = False 21 | use_threading = False 22 | 23 | params = [{"data_type": 4, "cluster_way": 0, "n_clusters": 210, "dimension_reduction": 3, "n_components": 2800}, 24 | {"data_type": 4, "cluster_way": 0, "n_clusters": 600, "dimension_reduction": 3, "n_components": 2800}, 25 | {"data_type": 4, "cluster_way": 0, "n_clusters": 420, "dimension_reduction": 3, "n_components": 2800} 26 | ] 27 | 28 | if use_threading: 29 | if use_limited_thread: 30 | num_worker_threads = 2 31 | 32 | q = Queue() 33 | 34 | for i in range(num_worker_threads): 35 | t = threading.Thread(target=worker) 36 | t.daemon = True 37 | t.start() 38 | 39 | for item in params: 40 | q.put(item) 41 | 42 | q.join() 43 | else: 44 | for param in params: 45 | threading.Thread(target=print_param, kwargs=param).start() 46 | else: 47 | for param in params: 48 | print_param(**param) 49 | -------------------------------------------------------------------------------- /param_search/best_cluster_search.py: -------------------------------------------------------------------------------- 1 | from basic_function import save_dict 2 | from param_search_cluster import connect_params, train_cluster 3 | 4 | 5 | def search(): 6 | base_parameter = {"data_type": 0, "dimension_reduction": 0} 7 | 8 | top_search = {"n_clusters": [380, 400, 420, 440, 460, 480]} 9 | 10 | cluster_ways = [0] 11 | cluster_parameter = {0: [{"linkage": 0}]} 12 | 13 | full_parameter = [] 14 | scores_list = [] 15 | for key in top_search.keys(): 16 | for i in range(len(top_search[key])): 17 | full_parameter.append({key:top_search[key][i]}) 18 | for cluster_way in cluster_ways: 19 | full_parameter.append({"cluster_way":cluster_way}) 20 | for cp in cluster_parameter[cluster_way]: 21 | full_parameter.append(cp) 22 | real_full = [base_parameter] + full_parameter 23 | print("params:", connect_params(real_full)) 24 | scores = train_cluster(**connect_params(real_full)) 25 | scores_list.append([connect_params(real_full)] + scores) 26 | full_parameter.pop(-1) 27 | full_parameter.pop(-1) 28 | full_parameter.pop(-1) 29 | 30 | print(scores_list) 31 | 32 | save_dict(scores_list, "search_b.txt") 33 | 34 | for i in range(3): # length of evaluation ways 35 | print(sorted(scores_list, key=lambda x:x[i+1])) 36 | 37 | 38 | if __name__ == '__main__': 39 | search() -------------------------------------------------------------------------------- 
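A minimal sketch (not part of the repository) of how the per-parameter results returned by train_cluster can be ranked once each entry has the form [params, silhouette, calinski_harabasz, davies_bouldin]. connect_params below mirrors the helper in param_search_cluster.py, and the score values are placeholders; note that silhouette and Calinski-Harabasz are better when higher while Davies-Bouldin is better when lower, so a single ascending sort does not rank all three metrics the same way.

def connect_params(params):
    merged = {}
    for p in params:
        merged.update(p)
    return merged

# placeholder entries: [params, silhouette, calinski_harabasz, davies_bouldin]
scores_list = [
    [connect_params([{"data_type": 0}, {"n_clusters": 380}]), 0.12, 950.0, 1.9],
    [connect_params([{"data_type": 0}, {"n_clusters": 420}]), 0.15, 990.0, 1.7],
]

best_by_silhouette = max(scores_list, key=lambda row: row[1])      # higher is better
best_by_davies_bouldin = min(scores_list, key=lambda row: row[3])  # lower is better
print(best_by_silhouette[0], best_by_davies_bouldin[0])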
/README.md: -------------------------------------------------------------------------------- 1 | 特征提取引用了此处代码:[第三届阿里云算法赛_i_hate_mcdonalds团队_解决方案](https://github.com/DeanNg/3rd_security_competition) 2 | ![Slide1](https://user-images.githubusercontent.com/31768052/58380229-4bf18780-7fe1-11e9-8cba-c78027326686.JPG) 3 | ![Slide2](https://user-images.githubusercontent.com/31768052/58380230-4c8a1e00-7fe1-11e9-9bec-dd177b31ba38.JPG) 4 | ![Slide3](https://user-images.githubusercontent.com/31768052/58380232-4dbb4b00-7fe1-11e9-87ae-1d9ca81cc506.JPG) 5 | ![Slide4](https://user-images.githubusercontent.com/31768052/58380233-4e53e180-7fe1-11e9-9af1-9c83a7a1ce8c.JPG) 6 | ![Slide5](https://user-images.githubusercontent.com/31768052/58380235-4eec7800-7fe1-11e9-84e3-977a8bb6fa87.JPG) 7 | ![Slide6](https://user-images.githubusercontent.com/31768052/58380236-4eec7800-7fe1-11e9-9d00-9bb138dbf8b0.JPG) 8 | ![Slide7](https://user-images.githubusercontent.com/31768052/58380237-4f850e80-7fe1-11e9-8d93-a97b4651ea8d.JPG) 9 | ![Slide8](https://user-images.githubusercontent.com/31768052/58380238-4f850e80-7fe1-11e9-9d74-ffc3a96cd958.JPG) 10 | ![Slide9](https://user-images.githubusercontent.com/31768052/58380240-51e76880-7fe1-11e9-8187-acf536ed1782.JPG) 11 | ![Slide10](https://user-images.githubusercontent.com/31768052/58380241-51e76880-7fe1-11e9-8a5e-b3838d239cb1.JPG) 12 | ![Slide11](https://user-images.githubusercontent.com/31768052/58380242-527fff00-7fe1-11e9-9392-f5f3656380fa.JPG) 13 | ![Slide12](https://user-images.githubusercontent.com/31768052/58380243-527fff00-7fe1-11e9-8329-5a9e1d21104b.JPG) 14 | ![Slide13](https://user-images.githubusercontent.com/31768052/58380244-53189580-7fe1-11e9-8556-1b88899f3ba1.JPG) 15 | -------------------------------------------------------------------------------- /server_cluster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from param_search_cluster import train_cluster 3 | 4 | parser = argparse.ArgumentParser("cluster", fromfile_prefix_chars='@') 5 | parser.add_argument('-d', '--dimension_reduction', type=int, default=0, help='0:none,1:pca, 2:nmf') 6 | parser.add_argument('-c', "--cluster_way", type=int, default=0, help='1:birch, 3:dbscan') 7 | parser.add_argument('-n', "--n_components", type=int, default=0, help='n_components') 8 | parser.add_argument('-e', "--eps", type=float, default=1.0, help='dbscan') 9 | parser.add_argument('-t', "--threshold", type=float, default=2, help='birch threshold') 10 | parser.add_argument('-nc', "--n_clusters", type=int, default=200, help='birch n_clusters') 11 | parser.add_argument('-bc', "--branching_factor", type=int, default=50, help='birch branching_factor') 12 | parser.add_argument('-l', "--linkage", type=int, default=0, 13 | help='AgglomerativeClustering: ["ward", "complete", "average", "single"]') 14 | parser.add_argument('-dt', "--data_type", type=int, default=0, help='0:only tfidf, 1:all, 2:only nn') 15 | parser.add_argument('-i', "--max_iter", type=int, default=200, help='max_iter for NMF') 16 | 17 | args = parser.parse_args() 18 | 19 | data_type = args.data_type 20 | dimension_reduction = args.dimension_reduction 21 | cluster_way = args.cluster_way 22 | n_components = args.n_components 23 | threshold = args.threshold 24 | n_clusters = args.n_clusters 25 | branching_factor = args.branching_factor 26 | linkage = args.linkage 27 | max_iter = args.max_iter 28 | eps = args.eps 29 | 30 | train_cluster(data_type, dimension_reduction, cluster_way, n_components, threshold, n_clusters, 
branching_factor, 31 | linkage, max_iter, eps) 32 | -------------------------------------------------------------------------------- /test_prepare_dataset.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from prepare_dataset import * 4 | 5 | 6 | class TestPrepareDataSet(TestCase): 7 | def test_get_outside_train_features(self): 8 | train_data, label, test_data = get_outside_train_features() 9 | print(train_data.shape) 10 | print(test_data.shape) 11 | 12 | self.assertEqual(train_data.shape[0], 30000) 13 | self.assertEqual(test_data.shape[0], 15000) 14 | 15 | def test_load_ft_features(self): 16 | train_data, label, test_data = load_ft_features() 17 | print(train_data.shape) 18 | print(test_data.shape) 19 | 20 | self.assertEqual(train_data.shape[0], 30000) 21 | self.assertEqual(test_data.shape[0], 15000) 22 | 23 | def test_load_nn_features(self): 24 | train_data, label, test_data = load_nn_features() 25 | print(train_data.shape) 26 | print(test_data.shape) 27 | 28 | self.assertEqual(train_data.shape[0], 30000) 29 | self.assertEqual(test_data.shape[0], 15000) 30 | 31 | def test_load_depth_three_features(self): 32 | train_data, label, test_data = load_depth_three_features() 33 | 34 | print(train_data.shape) 35 | print(test_data.shape) 36 | 37 | self.assertEqual(train_data.shape[0], 30000) 38 | self.assertEqual(test_data.shape[0], 15000) 39 | 40 | def test_load_tfidf_features(self): 41 | train_data, label, test_data = load_tfidf_features("") 42 | print(train_data.shape) 43 | print(test_data.shape) 44 | 45 | self.assertEqual(train_data.shape[0], 30000) 46 | self.assertEqual(test_data.shape[0], 15000) 47 | 48 | train_data, label, test_data = load_tfidf_features("_hkey") 49 | print(train_data.shape) 50 | print(test_data.shape) 51 | 52 | self.assertEqual(train_data.shape[0], 21154) 53 | self.assertEqual(test_data.shape[0], 10661) 54 | 55 | train_data, label, test_data = load_tfidf_features("_dll") 56 | print(train_data.shape) 57 | print(test_data.shape) 58 | 59 | self.assertEqual(train_data.shape[0], 29990) 60 | self.assertEqual(test_data.shape[0], 14989) 61 | 62 | 63 | -------------------------------------------------------------------------------- /feature_extraction/nn_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras.callbacks import ModelCheckpoint 6 | from keras.utils import to_categorical 7 | from keras_preprocessing.sequence import pad_sequences 8 | from sklearn.model_selection import train_test_split 9 | 10 | from basic_function import load_dict, load_df, get_root_path, save_dict 11 | from metrics import com_acc 12 | from model import get_model 13 | 14 | shape = (500, 64) 15 | input_dim = 92 + 1 16 | batch_size = 16 17 | epochs = 50 18 | class_num = 2 19 | 20 | api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt")) 21 | white = pd.read_csv(os.path.join(get_root_path(), "features", "white.csv"))[['file_name', 'api_name', 'call_time']] 22 | black = load_df(os.path.join(get_root_path(), "features", "black"), mode=1)[['file_name', 'api_name', 'call_time']] 23 | 24 | white_label = np.zeros(white.shape[0]) 25 | black_label = np.ones(black.shape[0]) 26 | 27 | full = pd.concat([white, black], sort=False) 28 | label = np.concatenate((white_label, black_label)) 29 | full['label'] = label 30 | 31 | full['api_name'] = full['api_name'].map(api_dict) 32 | 33 | final_api_list = [] 34 | 
label = [] 35 | 36 | less = False 37 | length = 3 38 | for file_name, api_df in full.groupby('file_name'): 39 | api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort") 40 | final_api_list.append(api_df['api_name'].values) 41 | label.append(api_df['label'].values[0]) 42 | 43 | fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32', padding='post', truncating='post', 44 | value=input_dim-1) 45 | 46 | label = to_categorical(label, num_classes=class_num) 47 | x, x_test, y, y_test = train_test_split(fixed_sequence, label, test_size=0.25) 48 | 49 | checkpoint = ModelCheckpoint(filepath='./models/weights.{epoch:02d}-{val_loss:.8f}.hdf5', 50 | monitor='val_acc', 51 | verbose=1, 52 | save_best_only=False) 53 | 54 | callbacks = [checkpoint] 55 | 56 | model = get_model(shape=shape, model_type=4, n=1, input_dim=input_dim, class_num=class_num, use_attention=True) 57 | for i in range(epochs): 58 | model.fit(np.array(x), y, 59 | batch_size=batch_size, 60 | epochs=1, 61 | validation_data=(np.array(x_test), y_test), 62 | shuffle=True, 63 | callbacks=callbacks) 64 | 65 | predictions = model.predict(np.array(x_test))[:, 1] 66 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.1) 67 | print("valid:", acc, 0.1) 68 | 69 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.3) 70 | print("valid:", acc, 0.3) 71 | 72 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.5) 73 | print("valid:", acc, 0.5) 74 | 75 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.6) 76 | print("valid:", acc, 0.6) 77 | 78 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.7) 79 | print("valid:", acc, 0.7) 80 | 81 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.9) 82 | print("valid:", acc, 0.9) 83 | 84 | 85 | -------------------------------------------------------------------------------- /feature_extraction/nn_autoencoder_stage2.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras.callbacks import ModelCheckpoint 6 | from keras.utils import to_categorical 7 | from keras_preprocessing.sequence import pad_sequences 8 | from sklearn.model_selection import train_test_split 9 | 10 | from basic_function import load_dict, load_df, get_root_path, save_dict 11 | from metrics import com_acc 12 | from model import get_model 13 | from shorten_api_list import delete_repeat_pattern, delete_same_pattern 14 | import pickle 15 | 16 | shape = (512, 64) 17 | input_dim = 92 + 1 18 | batch_size = 32 19 | epochs = 50 20 | class_num = 2 21 | 22 | api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt")) 23 | white = load_df(os.path.join(get_root_path(), "features", "white"), mode=1)[['file_name', 'api_name', 'call_time']] 24 | black = load_df(os.path.join(get_root_path(), "features", "black"), mode=1)[['file_name', 'api_name', 'call_time']] 25 | 26 | white_label = np.zeros(white.shape[0]) 27 | black_label = np.ones(black.shape[0]) 28 | 29 | full = pd.concat([white, black], sort=False) 30 | label = np.concatenate((white_label, black_label)) 31 | full['label'] = label 32 | 33 | full['api_name'] = full['api_name'].map(api_dict) 34 | 35 | # full = load_df(os.path.join(get_root_path(), "features", "stage2"), mode=1)[['file_name', 'api_name', 'call_time']] 36 | full['label'] = np.zeros((full.shape[0],)) 37 | full['api_name'] = 
full['api_name'].map(api_dict) 38 | 39 | 40 | final_api_list = [] 41 | file_name_list = [] 42 | label = [] 43 | 44 | less = False 45 | length = 3 46 | if not less: 47 | for file_name, api_df in full.groupby('file_name'): 48 | api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort") 49 | final_api_list.append(api_df['api_name'].values) 50 | label.append(api_df['label'].values[0]) 51 | file_name_list.append(file_name) 52 | 53 | 54 | try: 55 | # pickle.dump(final_api_list, "./api_list.pk") 56 | # pickle.dump(label, "./label.pk") 57 | np.save("api_list_stage2", final_api_list) 58 | np.save("file_name_list_stage2", file_name_list) 59 | # np.save("label_", label) 60 | except: 61 | print("error") 62 | else: 63 | for file_name, api_df in full.groupby('file_name'): 64 | api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort") 65 | result = delete_repeat_pattern(api_df['api_name'].values.tolist(), 2) 66 | result = delete_same_pattern(result, 3) 67 | 68 | final_api_list.append(result) 69 | 70 | label.append(api_df['label'].values[0]) 71 | try: 72 | save_dict(final_api_list, "./api_list_less.txt") 73 | save_dict(label, "./label_less.txt") 74 | except: 75 | print("error") 76 | 77 | # final_api_list = np.load("api_list_stage2.npy") 78 | # label = np.zeros((len(final_api_list))) 79 | # print(api_df) 80 | # print(final_api_list) 81 | fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32', padding='post', truncating='post', 82 | value=input_dim-1) 83 | 84 | label = to_categorical(label, num_classes=class_num) 85 | x, x_test, y, y_test = train_test_split(fixed_sequence, label, test_size=0.25) 86 | 87 | checkpoint = ModelCheckpoint(filepath='./models/weights.{epoch:02d}-{val_loss:.8f}.hdf5', 88 | monitor='val_acc', 89 | verbose=1, 90 | save_best_only=False) 91 | 92 | callbacks = [checkpoint] 93 | 94 | model = get_model(shape=shape, model_type=6, n=1, input_dim=input_dim, class_num=class_num, use_attention=True) 95 | for i in range(epochs): 96 | model.fit(np.array(x), y, 97 | batch_size=batch_size, 98 | epochs=1, 99 | validation_data=(np.array(x_test), y_test), 100 | shuffle=True, 101 | callbacks=callbacks) 102 | 103 | predictions = model.predict(np.array(x_test))[:, 1] 104 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.1) 105 | print("valid:", acc, 0.1) 106 | 107 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.3) 108 | print("valid:", acc, 0.3) 109 | 110 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.5) 111 | print("valid:", acc, 0.5) 112 | 113 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.6) 114 | print("valid:", acc, 0.6) 115 | 116 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.7) 117 | print("valid:", acc, 0.7) 118 | 119 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.9) 120 | print("valid:", acc, 0.9) 121 | 122 | 123 | -------------------------------------------------------------------------------- /param_search_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.cluster import DBSCAN, AgglomerativeClustering, AffinityPropagation 6 | from sklearn.cluster import KMeans, Birch 7 | from sklearn.decomposition import NMF, IncrementalPCA, PCA 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | from cluster_performance_evaluate import evaluate_cluster_performance 11 | 
from prepare_dataset import load_nn_stage2_features, load_stage2_tf_idf, load_clustering_statics_files 12 | 13 | 14 | def train_cluster(data_type=0, dimension_reduction=0, cluster_way=0, n_components=50, threshold=2, n_clusters=210, 15 | branching_factor=50, linkage=0, max_iter=500, eps=1.0): 16 | if data_type == 0: 17 | train_data = load_stage2_tf_idf("") 18 | elif data_type == 1: 19 | train_data = load_stage2_tf_idf("") 20 | nn_data = load_nn_stage2_features() 21 | train_data = pd.merge(train_data, nn_data, 'left', on="file_name") 22 | elif data_type == 2: 23 | train_data = load_nn_stage2_features() 24 | elif data_type == 3: 25 | train_data = load_stage2_tf_idf("1000") 26 | nn_data = load_nn_stage2_features() 27 | train_data = pd.merge(train_data, nn_data, 'left', on="file_name") 28 | dll = load_stage2_tf_idf("_dll") 29 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 30 | dll = load_stage2_tf_idf("_hkey", "first") 31 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 32 | dll = load_stage2_tf_idf("_hkey", "last") 33 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 34 | train_data.fillna(0, inplace=True) 35 | elif data_type == 4: 36 | train_data = load_stage2_tf_idf("1000") 37 | nn_data = load_nn_stage2_features() 38 | train_data = pd.merge(train_data, nn_data, 'left', on="file_name") 39 | dll = load_stage2_tf_idf("_dll") 40 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 41 | dll = load_stage2_tf_idf("_hkey", "first") 42 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 43 | dll = load_stage2_tf_idf("_hkey", "last") 44 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 45 | dll = load_clustering_statics_files() 46 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 47 | train_data.fillna(0, inplace=True) 48 | 49 | file_name = train_data["file_name"] 50 | train_data.drop(columns=["file_name"], inplace=True) 51 | X = StandardScaler(with_mean=False).fit_transform(train_data) 52 | origin_data = X 53 | 54 | if dimension_reduction == 0: 55 | pass 56 | elif dimension_reduction == 1: 57 | model = IncrementalPCA(n_components=n_components) 58 | X = model.fit_transform(X) 59 | elif dimension_reduction == 2: 60 | model = NMF(n_components=n_components, init='random', random_state=0, max_iter=max_iter) 61 | X = model.fit_transform(X) 62 | elif dimension_reduction == 3: 63 | model = PCA(n_components=n_components) 64 | X = model.fit_transform(X) 65 | 66 | print(len(X[0])) 67 | if cluster_way == 0: 68 | mode = ["ward", "complete", "average", "single"] 69 | db = AgglomerativeClustering(n_clusters=n_clusters, linkage=mode[linkage]).fit(X) 70 | labels = db.labels_ 71 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 72 | os.path.join("predictions", "aggcl" + "_" + str(n_clusters) + "_" + str(data_type) + "_" + str( 73 | dimension_reduction) + "_" + str(n_components) + ".csv"), index=False) 74 | print(len(set(labels))) 75 | elif cluster_way == 1: 76 | db = Birch(branching_factor=branching_factor, n_clusters=n_clusters, threshold=threshold).fit(X) 77 | labels = db.predict(X) 78 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 79 | os.path.join("predictions", "birch" + ".csv"), 80 | index=False) 81 | print(len(set(labels))) 82 | elif cluster_way == 2: 83 | db = hdbscan.HDBSCAN(min_cluster_size=40) 84 | db.fit(X) 85 | labels = db.labels_ 86 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 87 | os.path.join("predictions", "hdb_40" + ".csv"), 
88 | index=False) 89 | print(len(set(labels))) 90 | elif cluster_way == 3: 91 | db = DBSCAN(eps=eps, n_jobs=-1).fit(X) 92 | labels = db.labels_ 93 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 94 | os.path.join("predictions", "db" + "_" + str(eps) + "_" + str(dimension_reduction) + ".csv"), 95 | index=False) 96 | print(len(set(labels))) 97 | elif cluster_way == 4: 98 | labels = np.zeros((len(file_name),)) 99 | pd.DataFrame(data={"id": file_name, "family_id": np.zeros((len(file_name),))}).to_csv( 100 | os.path.join("predictions", "zeros" + ".csv"), 101 | index=False) 102 | elif cluster_way == 5: 103 | db = KMeans(n_clusters=n_clusters, random_state=0).fit(X) 104 | labels = db.labels_ 105 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 106 | os.path.join("predictions", "kmeans" + str(n_clusters) + ".csv"), 107 | index=False) 108 | print(len(set(labels))) 109 | elif cluster_way == 6: 110 | db = AffinityPropagation() 111 | 112 | # Number of clusters in labels, ignoring noise if present. 113 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 114 | n_noise_ = list(labels).count(-1) 115 | 116 | print('Estimated number of clusters: %d' % n_clusters_) 117 | print('Estimated number of noise points: %d' % n_noise_) 118 | 119 | scores = evaluate_cluster_performance(origin_data, labels) 120 | evaluate_cluster_performance(X, labels) 121 | return scores 122 | 123 | 124 | def connect_params(params): 125 | full = {} 126 | for i in params: 127 | full.update(i) 128 | return full 129 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import numpy as np 3 | from keras import Model 4 | from keras import layers 5 | from keras.layers import Dropout, BatchNormalization, Embedding, SpatialDropout1D, GlobalMaxPooling1D, Conv1D, \ 6 | concatenate, Dense, Activation, MaxPool1D, Flatten, Lambda 7 | from keras.utils import plot_model 8 | 9 | 10 | def get_model(shape=(500, 64), num_classes=2, input_dim=93, model_type=0, **kwargs): 11 | if model_type == 4: 12 | return deeper_textcnn(shape, num_classes, input_dim, **kwargs) 13 | elif model_type == 6: 14 | return cnn_autoencoder(shape, num_classes, input_dim, **kwargs) 15 | else: 16 | print("error") 17 | 18 | 19 | def deeper_textcnn(shape=(500, 64), num_classes=2, input_dim=93, n=1, use_attention=False, **kwargs): 20 | input_array = keras.Input(shape=(shape[0],), name='input') 21 | 22 | embedding = Embedding(input_dim=input_dim, output_dim=shape[1]) 23 | embedding_output = embedding(input_array) 24 | 25 | _embed = SpatialDropout1D(0.25)(embedding_output) 26 | warppers = [] 27 | kernel_size = [2, 3, 4, 5] 28 | 29 | for _kernel_size in kernel_size: 30 | for dilated_rate in [1, 2, 3, 4]: 31 | num_res_blocks = 1 32 | num_filters_in = 64 33 | conv1d = Conv1D(filters=num_filters_in, kernel_size=_kernel_size, dilation_rate=dilated_rate)(_embed) 34 | b = BatchNormalization()(conv1d) 35 | r = Activation("elu")(b) 36 | x = r 37 | 38 | conv1 = Conv1D(filters=num_filters_in, kernel_size=1, padding="same")(x) 39 | b = BatchNormalization()(conv1) 40 | r = Activation("elu")(b) 41 | conv2 = Conv1D(filters=num_filters_in, kernel_size=3, padding="same")(r) 42 | b = BatchNormalization()(conv2) 43 | r = Activation("elu")(b) 44 | 45 | x = keras.layers.add([x, r]) 46 | 47 | x = BatchNormalization()(x) 48 | x = Activation('elu')(x) 49 | warppers.append(GlobalMaxPooling1D()(x)) 50 | 51 | fc = 
concatenate(warppers) 52 | fc = Dropout(0.25)(fc) 53 | fc = Dense(50, activation='relu', name="feature_layer")(fc) 54 | preds = Dense(num_classes, activation='softmax')(fc) 55 | 56 | model = Model(inputs=input_array, outputs=preds) 57 | 58 | model.compile(loss=l2_softmax(5), 59 | optimizer='adam', 60 | metrics=['accuracy']) 61 | model.summary() 62 | plot_model(model, "attention.png") 63 | return model 64 | 65 | 66 | def cnn_autoencoder(shape=(500, 64), num_classes=2, input_dim=93, n=1, use_attention=False, **kwargs): 67 | input_array = keras.Input(shape=(shape[0],), name='input') 68 | 69 | embedding = Embedding(input_dim=input_dim, output_dim=shape[1], weights=[np.load("weight.npy")]) 70 | embedding.trainable = False 71 | embedding_output = embedding(input_array) 72 | 73 | _embed = SpatialDropout1D(0.25)(embedding_output) 74 | kernel_size = 3 75 | dilated_rate = 1 76 | 77 | num_filters_in = 64 78 | conv1d = Conv1D(filters=num_filters_in, kernel_size=kernel_size, padding="same")(_embed) 79 | b = BatchNormalization()(conv1d) 80 | r = Activation("elu")(b) 81 | x = r 82 | 83 | conv1 = Conv1D(filters=num_filters_in, kernel_size=1, padding="same")(x) 84 | b = BatchNormalization()(conv1) 85 | r = Activation("elu")(b) 86 | conv2 = Conv1D(filters=num_filters_in, kernel_size=3, padding="same")(r) 87 | b = BatchNormalization()(conv2) 88 | r = Activation("elu")(b) 89 | 90 | x = keras.layers.add([x, r]) 91 | 92 | x = BatchNormalization()(x) 93 | x = Activation('elu')(x) 94 | x = MaxPool1D(pool_size=128, strides=128, data_format='channels_last')(x) 95 | fc = Activation("sigmoid", name="feature_output")(x) 96 | 97 | fc = layers.UpSampling1D(size=128)(fc) 98 | fc = BatchNormalization()(fc) 99 | fc = Activation('elu')(fc) # add later 100 | # mid = Flatten(name="feature_output")(fc) 101 | 102 | conv2 = Conv1D(filters=num_filters_in, kernel_size=3, padding="same")(fc) 103 | b = BatchNormalization()(conv2) 104 | r = Activation("elu")(b) 105 | 106 | conv1 = Conv1D(filters=num_filters_in, kernel_size=1, padding="same")(r) 107 | b = BatchNormalization()(conv1) 108 | r = Activation("elu")(b) 109 | 110 | x = keras.layers.add([fc, r]) 111 | conv1d = Conv1D(filters=num_filters_in, kernel_size=kernel_size, padding="same")(x) 112 | b = BatchNormalization()(conv1d) 113 | r = Activation("elu")(b) 114 | 115 | output = r 116 | 117 | output = Lambda(lambda x: keras.losses.mean_squared_error(x[0], x[1]), name='loss', 118 | output_shape=(1,))([output, embedding_output]) 119 | 120 | model = Model(inputs=input_array, outputs=output) 121 | 122 | model.compile(loss=loss_first, 123 | optimizer='adam') 124 | model.summary() 125 | plot_model(model, "attention.png") 126 | return model 127 | 128 | 129 | def loss_first(x, y): 130 | return y 131 | 132 | 133 | def l2_softmax(alpha): 134 | def a(y_true, y_pred): 135 | y_normal = alpha * keras.backend.l2_normalize(y_pred) 136 | 137 | return keras.losses.categorical_crossentropy(y_true, y_normal) 138 | 139 | return a 140 | 141 | 142 | def load_model(model_path, load_type=0) -> keras.Model: 143 | """ 144 | 返回训练好的模型 145 | :return: 146 | """ 147 | 148 | def temp(a, b=0): 149 | return a 150 | 151 | if load_type == 0: 152 | a = temp 153 | return keras.models.load_model(model_path, custom_objects={'a': l2_softmax(5)}) 154 | elif load_type == 1: 155 | a = temp 156 | model = keras.models.load_model(model_path, custom_objects={'a': l2_softmax(5)}) 157 | plot_model(model, "attention.png") 158 | 159 | output = model.get_layer('feature_layer').output 160 | model = Model(model.input, output) 161 | 
model.summary() 162 | return model 163 | elif load_type == 2: 164 | a = temp 165 | model = keras.models.load_model(model_path, 166 | custom_objects={'a': l2_softmax(5), "keras": keras, "loss_first": loss_first}) 167 | plot_model(model, "attention.png") 168 | 169 | output = model.get_layer('feature_output').output 170 | output = Flatten()(output) 171 | model = Model(model.input, output) 172 | model.summary() 173 | return model 174 | 175 | 176 | if __name__ == '__main__': 177 | model = load_model("weights.01-0.06816901.hdf5", 1) 178 | plot_model(model, "attention.png") 179 | -------------------------------------------------------------------------------- /feature_extraction/cluster_static_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from contextlib import contextmanager 4 | 5 | import pandas as pd 6 | 7 | # FEATURE ENGINEERING V1 8 | from basic_function import load_df 9 | 10 | 11 | def makeFeature(data, is_train=True): 12 | ''' 13 | file_cnt: file有多少样本; 14 | tid_distinct_cnt: file发起了多少线程; 15 | api_distinct_cnt: file调用了多少不同的API ; 16 | value_distinct_cnt: file有多少不同的返回值; 17 | tid_api_cnt_max,tid_api_cnt_min,tid_api_cnt_mean: ","file中的线程调用的 最多/最少/平均 api数目; 18 | tid_api_distinct_cnt_max, tid_api_distinct_cnt_min, tid_api_distinct_cnt_mean:; 19 | file中的线程调用的 最多/最少/平均 不同api数目 ; 20 | value_equals0_cnt: file返回值为0的样本数; 21 | value_equals0_rate: file返回值为0的样本比率; 22 | ''' 23 | if is_train: 24 | return_data = data[['file_id', 'label']].drop_duplicates() 25 | else: 26 | return_data = data[['file_id']].drop_duplicates() 27 | ################################################################################ 28 | 29 | feat = data.groupby(['file_id']).agg( 30 | {'api': pd.Series.nunique, 'return_value': pd.Series.nunique}).reset_index() 31 | feat.columns = ['file_id', 'api_distinct_cnt', 'value_distinct_cnt'] 32 | return_data = return_data.merge(feat, on='file_id', how='left') 33 | ################################################################################ 34 | feat = data[data.return_value == 0].groupby(['file_id']).return_value.count().reset_index(name='value_equals0_cnt') 35 | return_data = return_data.merge(feat, on='file_id', how='left') 36 | ################################################################################ 37 | return_data.loc[:, 'value_equals0_rate'] = (return_data.value_equals0_cnt + 1) / (return_data.file_cnt + 1) 38 | 39 | return return_data 40 | 41 | 42 | # FEATURE ENGINEERING V2 43 | def makeFeature_v2(data): 44 | ''' 45 | api_index_min: api首次出现的index; 46 | api_cnt: api出现的次数; 47 | api_rate: api出现的次数占所有api调用次数的比例; 48 | api_value_equals_0_cnt: api返回值为0的次数; 49 | ''' 50 | return_data = data[['file_id']].drop_duplicates() 51 | 52 | # 统计file调用api的次数 53 | tmp = data.groupby(['file_id']).api.count() 54 | 55 | # 统计api调用的最小Index 56 | feat = data.groupby(['file_id', 'api'])['index'].min().reset_index(name='val') 57 | feat = feat.pivot(index='file_id', columns='api', values='val') 58 | feat.columns = [feat.columns[i] + '_index_min' for i in range(feat.shape[1])] 59 | feat_withFileid = feat.reset_index() 60 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 61 | # 统计api调用的次数 62 | feat = data.groupby(['file_id', 'api'])['index'].count().reset_index(name='val') 63 | feat = feat.pivot(index='file_id', columns='api', values='val') 64 | feat.columns = [feat.columns[i] + '_cnt' for i in range(feat.shape[1])] 65 | feat_withFileid = feat.reset_index() 66 | return_data = 
return_data.merge(feat_withFileid, on='file_id', how='left') 67 | # 统计api调用的比例 68 | feat_rate = pd.concat([feat, tmp], axis=1) 69 | feat_rate = feat_rate.apply(lambda x: x / feat_rate.api) 70 | feat_rate.columns = [feat_rate.columns[i] + '_rate' for i in range(feat_rate.shape[1])] 71 | feat_rate_withFileid = feat_rate.reset_index().drop(['api_rate'], axis=1) 72 | return_data = return_data.merge(feat_rate_withFileid, on='file_id', how='left') 73 | 74 | # 统计api返回值为0的次数 75 | feat = data[data.return_value == 0].groupby(['file_id', 'api'])['index'].count().reset_index(name='val') 76 | feat = feat.pivot(index='file_id', columns='api', values='val') 77 | feat.columns = [feat.columns[i] + '_value_equals_0_cnt' for i in range(feat.shape[1])] 78 | feat_withFileid = feat.reset_index() 79 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 80 | 81 | return return_data 82 | 83 | 84 | # FEATURE ENGINEERING V3 85 | def makeFeature_v3(data): 86 | ''' 87 | api_not0_index_min: api返回值不为0的index的最小值; 88 | api_not0_index_min_diff: api返回值不为0时最小index和该api出现的最小index的差; 89 | api_equals0_rate: api返回值为0的次数占该api次数的比例 90 | ''' 91 | return_data = data[['file_id']].drop_duplicates() 92 | # 统计api调用的最小Index 93 | feat_api_min_index = data.groupby(['file_id', 'api'])['index'].min().reset_index(name='min_index') 94 | feat_api_not0_min_index = data[data.return_value != 0].groupby(['file_id', 'api'])['index'].min().reset_index( 95 | name='value_not0_min_index') 96 | # 统计return_value不为0的最小Index 97 | feat = feat_api_not0_min_index.pivot(index='file_id', columns='api', values='value_not0_min_index') 98 | feat.columns = [feat.columns[i] + '_not0_index_min' for i in range(feat.shape[1])] 99 | feat_withFileid = feat.reset_index() 100 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 101 | # 统计return_value不为0的最小Index和api最小index的差 102 | feat = feat_api_min_index.merge(feat_api_not0_min_index, on=['file_id', 'api'], how='left') 103 | feat.loc[:, 'api_index_not0_min_diff'] = feat['value_not0_min_index'] - feat['min_index'] 104 | feat = feat.pivot(index='file_id', columns='api', values='api_index_not0_min_diff') 105 | feat.columns = [feat.columns[i] + '_not0_index_min_diff' for i in range(feat.shape[1])] 106 | feat_withFileid = feat.reset_index() 107 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 108 | # 统计api返回值为0的次数 109 | feat = data[data.return_value == 0].groupby(['file_id', 'api'])['index'].count().reset_index( 110 | name='value_equals0_cnt') 111 | feat_api_cnt = data.groupby(['file_id', 'api']).return_value.count().reset_index(name='file_api_cnt') 112 | feat = feat.merge(feat_api_cnt, on=['file_id', 'api'], how='left') 113 | feat.loc[:, 'value_equals0_rate'] = feat['value_equals0_cnt'] / (feat['file_api_cnt'] * 1.0) 114 | # 统计return_value为0的比例 115 | feat = feat.pivot(index='file_id', columns='api', values='value_equals0_rate') 116 | feat.columns = [feat.columns[i] + '_equals0_rate' for i in range(feat.shape[1])] 117 | feat_withFileid = feat.reset_index() 118 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 119 | 120 | return return_data 121 | 122 | 123 | # TIME-COST FUNCTION 124 | @contextmanager 125 | def timer(title): 126 | t0 = time.time() 127 | yield 128 | print("{} - done in {:.2f}s".format(title, time.time() - t0)) 129 | 130 | 131 | def extract_features(load_name, extract_function): 132 | white = load_df(load_name, mode=1)[['file_name', 'api_name', 'call_name', 'call_pid', 133 | 'ret_value', 'apiArg_list_count', 
'exInfo_list_count', 'call_time']] 134 | white.rename(columns={'ret_value': 'return_value', 'api_name': 'api', 'file_name': 'file_id', 'call_time': 'index'}, 135 | inplace=True) 136 | a = extract_function(white) 137 | a.rename(columns={'file_id': 'file_name'}, inplace=True) 138 | return a 139 | 140 | 141 | if __name__ == '__main__': 142 | def inter(load_name): 143 | white = load_df(load_name, mode=1)[['file_name', 'api_name', 'ret_value', 'call_time']] 144 | white.rename(columns={'ret_value': 'return_value', 'api_name': 'api', 'file_name': 'file_id', 'call_time': 'index'}, 145 | inplace=True) 146 | return white 147 | 148 | 149 | extract_functions = [makeFeature_v2, makeFeature_v3] 150 | data = inter(os.path.join("features", "stage2")) 151 | full = [] 152 | for func in extract_functions: 153 | full.append(func(data)) 154 | pd.merge(full[0], full[1], 'outer', on="file_id").rename(columns={'file_id': 'file_name'}).to_csv( 155 | os.path.join("features", "outside_stage2" + ".csv")) 156 | 157 | -------------------------------------------------------------------------------- /xml_to_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | from datetime import datetime 4 | 5 | import pandas as pd 6 | 7 | from basic_function import make_dir 8 | 9 | 10 | def get_file_list_in_dir(path): 11 | return [name for name in os.listdir(path) if not os.path.isdir(os.path.join(path, name))] 12 | 13 | 14 | def save_file_info(path, csv_name): 15 | file = get_file_list_in_dir(path=path) 16 | print("file number:", len(file)) 17 | file_data = [] 18 | for i in range(len(file)): 19 | if i % 200 == 0: 20 | print("percent:", i / len(file) * 100) 21 | file_name = file[i] 22 | root = ET.parse(os.path.join(path, file_name)).getroot() 23 | 24 | file_list_node = root.findall('file_list')[0] 25 | 26 | file_error = int(file_list_node.get('file_error')) 27 | file_name = file_list_node.get('file_name') 28 | file_uid = file_list_node.get('file_uid') 29 | 30 | file_data.append([file_error, file_name, file_uid]) 31 | 32 | file_data_df = pd.DataFrame(data=file_data, columns=['file_error', 'file_name', 'file_uid']) 33 | file_data_df.to_csv(csv_name) 34 | 35 | 36 | def save_action_data_to_csv(action_data, csv_name, columns=None): 37 | if columns is None: 38 | action_data_df = pd.DataFrame(data=action_data, 39 | columns=['file_name', 'api_name', 'call_name', 'call_pid', 'ret_value', 40 | 'apiArg_list_count', 41 | 'exInfo_list_count', 'call_time']) 42 | action_data_df.to_csv(csv_name) 43 | else: 44 | action_data_df = pd.DataFrame(data=action_data, columns=columns) 45 | action_data_df.to_csv(csv_name) 46 | 47 | 48 | def time_string_to_ns(time_string): 49 | time_format = '%H:%M:%S.%f' 50 | datetime_object = datetime.strptime(time_string, time_format) 51 | return (datetime_object - datetime(1970, 1, 1)).total_seconds() 52 | 53 | 54 | def is_dll(name): 55 | return name[-3:] == "dll" or name[-3:] == "DLL" 56 | 57 | 58 | def is_hkey(name): 59 | return name[:4] == "HKEY" 60 | 61 | 62 | def save_action_stream(path, csv_name, dir_name="", mode=0): 63 | if dir_name != "": 64 | make_dir(os.path.join("features", dir_name)) 65 | file = get_file_list_in_dir(path=path) 66 | print("file number:", len(file)) 67 | action_data = [] 68 | for i in range(len(file)): 69 | if i % 200 == 0: 70 | print("percent:", i / len(file) * 100) 71 | 72 | if i % 1000 == 0 and mode == 1 and i != 0: 73 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, str(i) + 
csv_name)) 74 | action_data = [] 75 | 76 | file_name = file[i] 77 | root = ET.parse(os.path.join(path, file_name)).getroot() 78 | 79 | action_list = root.findall('./file_list/file/start_boot/action_list/action') 80 | start = 0 81 | start_time = 0 82 | last = 0 83 | for action in action_list: 84 | api_name = action.get('api_name') 85 | call_name = action.get('call_name') 86 | call_pid = action.get('call_pid') 87 | call_time = action.get('call_time') 88 | ret_value = action.get('ret_value') 89 | 90 | # convert time string to seconds since software start 91 | try: 92 | call_time = time_string_to_ns(call_time) 93 | last = call_time 94 | except ValueError: 95 | print(call_time) 96 | call_time = last 97 | 98 | if start == 0: 99 | start_time = call_time 100 | start += 1 101 | 102 | call_time = call_time - start_time 103 | 104 | try: 105 | apiArg_list = action.findall('apiArg_list')[0] 106 | apiArg_list_count = apiArg_list.get('count') 107 | except IndexError: 108 | apiArg_list_count = -1 109 | print(action.attrib) 110 | 111 | exInfo_list = action.findall('exInfo_list')[0] 112 | exInfo_list_count = exInfo_list.get('count') 113 | 114 | action_data.append( 115 | [file_name, api_name, call_name, call_pid, ret_value, apiArg_list_count, exInfo_list_count, call_time]) 116 | 117 | if mode == 1: 118 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, 'final_' + csv_name)) 119 | else: 120 | save_action_data_to_csv(action_data, csv_name) 121 | 122 | 123 | def save_attribute_list(path, csv_name, dir_name="", mode=0, dll_or_hkey="dll"): 124 | if dir_name != "": 125 | make_dir(os.path.join("features", dir_name)) 126 | file = get_file_list_in_dir(path=path) 127 | print("file number:", len(file)) 128 | column_name = ['file_name', 'api_name', 'call_name', 'call_pid', 'ret_value', 'value', "call_time"] 129 | determination = is_hkey 130 | if dll_or_hkey == "dll": 131 | determination = is_dll 132 | elif dll_or_hkey == "hkey": 133 | determination = is_hkey 134 | 135 | action_data = [] 136 | for i in range(len(file)): 137 | if i % 200 == 0: 138 | print("percent:", i / len(file) * 100) 139 | 140 | if i % 1000 == 0 and mode == 1 and i != 0: 141 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, str(i) + csv_name), column_name) 142 | action_data = [] 143 | 144 | file_name = file[i] 145 | root = ET.parse(os.path.join(path, file_name)).getroot() 146 | 147 | action_list = root.findall('./file_list/file/start_boot/action_list/action') 148 | for action in action_list: 149 | 150 | api_name = action.get('api_name') 151 | call_name = action.get('call_name') 152 | call_pid = action.get('call_pid') 153 | call_time = action.get('call_time') 154 | ret_value = action.get('ret_value') 155 | 156 | apiArg_list = action.findall("./apiArg_list/apiArg") 157 | exInfo_list = action.findall('./exInfo_list/exInfo') 158 | 159 | for apiArg in apiArg_list: 160 | value = apiArg.get('value') 161 | # print(value) 162 | 163 | if determination(value): 164 | print(1) 165 | action_data.append( 166 | [file_name, api_name, call_name, call_pid, ret_value, value, call_time]) 167 | 168 | for exInfo in exInfo_list: 169 | value = exInfo.get('value') 170 | # print(value) 171 | if determination(value): 172 | action_data.append( 173 | [file_name, api_name, call_name, call_pid, ret_value, value, call_time]) 174 | 175 | if mode == 1: 176 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, 'final_' + csv_name), column_name) 177 | else: 178 | save_action_data_to_csv(action_data, csv_name, column_name) 
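# For reference, a sketch of the sandbox report layout that save_action_stream() and
# save_attribute_list() assume, reconstructed from the findall()/get() calls above. The
# root tag name and all attribute values are illustrative; only the element and attribute
# names actually read by this module are shown:
#
#   <root>
#     <file_list file_error="0" file_name="sample.xml" file_uid="...">
#       <file>
#         <start_boot>
#           <action_list>
#             <action api_name="..." call_name="..." call_pid="..."
#                     call_time="12:00:00.000000" ret_value="0">
#               <apiArg_list count="1">
#                 <apiArg value="C:\Windows\System32\kernel32.dll"/>
#               </apiArg_list>
#               <exInfo_list count="1">
#                 <exInfo value="HKEY_LOCAL_MACHINE\SOFTWARE\..."/>
#               </exInfo_list>
#             </action>
#           </action_list>
#         </start_boot>
#       </file>
#     </file_list>
#   </root>
#
# save_action_stream() emits one row per <action>, with call_time converted to seconds since
# the first call of the file; save_attribute_list() keeps only apiArg/exInfo values that look
# like DLL paths (is_dll) or registry keys (is_hkey).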
179 | 180 | 181 | if __name__ == '__main__': 182 | save_action_stream('stage2_dataset', "stage2.csv", dir_name="stage2", mode=1) 183 | 184 | save_attribute_list('stage2_dataset', "stage2.csv", 185 | dir_name="stage2_dll", mode=1, dll_or_hkey="dll") 186 | 187 | save_attribute_list('stage2_dataset', "stage2.csv", 188 | dir_name="stage2_hkey", mode=1, dll_or_hkey="hkey") 189 | 190 | # save_action_stream('stage1_dataset\\test', "test.csv", dir_name="test", mode=1) 191 | # save_action_stream('stage1_dataset\\train\\white', "white.csv", dir_name="white", 192 | # mode=1) 193 | # save_action_stream('stage1_dataset\\stage1_dataset\\train\\black', "black.csv", dir_name="black", 194 | # mode=1) 195 | 196 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\white', "white.csv", 197 | # dir_name="white_dll", mode=1, dll_or_hkey="dll") 198 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\black', "black.csv", 199 | # dir_name="black_dll", mode=1, dll_or_hkey="dll") 200 | # save_attribute_list('stage1_dataset\\stage1_dataset\\test', "test.csv", 201 | # dir_name="test_dll", mode=1, dll_or_hkey="dll") 202 | # 203 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\white', "white.csv", 204 | # dir_name="white_hkey", mode=1, dll_or_hkey="hkey") 205 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\black', "black.csv", 206 | # dir_name="black_hkey", mode=1, dll_or_hkey="hkey") 207 | # save_attribute_list('stage1_dataset\\stage1_dataset\\test', "test.csv", 208 | # dir_name="test_hkey", mode=1, dll_or_hkey="hkey") 209 | -------------------------------------------------------------------------------- /feature_extraction/tfidf_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import scipy 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | 7 | from basic_function import load_df, save_dict, get_root_path 8 | 9 | 10 | def to_str(df, mode=0, column_name=None): 11 | if column_name is None: 12 | column_name = 'api_name' 13 | string_list = [] 14 | name_list = [] 15 | for i in df.groupby('file_name')[column_name]: 16 | name_list.append(i[0]) 17 | api_str = "" 18 | for p in i[1].iteritems(): 19 | api_str += " " + str(p[1]) 20 | string_list.append(api_str) 21 | # print(name_list) 22 | if mode == 1: 23 | return string_list, name_list 24 | else: 25 | return string_list 26 | 27 | 28 | def api(): 29 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 30 | min_df=3, max_df=0.9, 31 | strip_accents='unicode', 32 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=500) 33 | 34 | white = pd.read_csv("white.csv")[['file_name', 'api_name']] 35 | black = load_df("black") 36 | test = load_df("test") 37 | 38 | full = pd.concat([white, black, test]) 39 | full_str = to_str(full) 40 | 41 | print(1) 42 | api_vec.fit(full_str) 43 | 44 | print(2) 45 | black_output, name_list = to_str(black, mode=1) 46 | save_dict(name_list, "black_name_list") 47 | black_output = api_vec.transform(black_output) 48 | scipy.sparse.save_npz("black.npz", black_output) 49 | 50 | white_output, name_list = to_str(white, mode=1) 51 | save_dict(name_list, "white_name_list") 52 | white_output = api_vec.transform(white_output) 53 | scipy.sparse.save_npz("white.npz", white_output) 54 | 55 | test_str, name_list = to_str(test, mode=1) 56 | save_dict(name_list, "test_name_list") 57 | test_output = api_vec.transform(test_str) 58 | scipy.sparse.save_npz("test.npz", test_output) 59 | 60 | 61 | def 
stage2_api(feature_num=500): 62 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 63 | min_df=3, max_df=0.9, 64 | strip_accents='unicode', 65 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=feature_num) 66 | 67 | white = pd.read_csv(os.path.join("features", "white.csv"))[['file_name', 'api_name']] 68 | black = load_df(os.path.join("features", "black")) 69 | test = load_df(os.path.join("features", "test")) 70 | stage2 = load_df(os.path.join("features", "stage2")) 71 | 72 | full = pd.concat([white, black, test]) 73 | full_str = to_str(full) 74 | 75 | print(1) 76 | api_vec.fit(full_str) 77 | 78 | print(2) 79 | 80 | black_output, name_list = to_str(stage2, mode=1) 81 | save_dict(name_list, os.path.join("features", "stage2_name_list"+str(feature_num))) 82 | black_output = api_vec.transform(black_output) 83 | scipy.sparse.save_npz(os.path.join("features", "stage2"+str(feature_num)+".npz"), black_output) 84 | 85 | 86 | def tianchi_api(): 87 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 88 | min_df=3, max_df=0.9, 89 | strip_accents='unicode', 90 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=500) 91 | 92 | white = pd.read_csv(os.path.join("features", "white.csv"))[['file_name', 'api_name']] 93 | black = load_df(os.path.join("features", "black")) 94 | test = load_df(os.path.join("features", "test")) 95 | tianchi = pd.read_csv("security_train.csv").rename(columns={"file_id":"file_name"}) 96 | 97 | full = pd.concat([white, black, test]) 98 | full_str = to_str(full) 99 | 100 | print(1) 101 | api_vec.fit(full_str) 102 | 103 | print(2) 104 | 105 | black_output, name_list = to_str(tianchi, mode=1, column_name="api") 106 | save_dict(name_list, os.path.join("features", "tianchi_name_list")) 107 | black_output = api_vec.transform(black_output) 108 | scipy.sparse.save_npz(os.path.join("features", "tianchi.npz"), black_output) 109 | 110 | 111 | def stage_2_attribute(suffix="_dll", use_less_value=False, type_name="", map_func=None, max_feature=1000): 112 | stage2 = load_df(os.path.join("features", "stage2"+suffix), mode=1) 113 | 114 | if use_less_value: 115 | if map_func is None: 116 | stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1]) 117 | else: 118 | stage2["value"] = stage2["value"].map(lambda x: map_func(x)) 119 | stage2_output, name_list = to_str(stage2, mode=1, column_name="value") 120 | api_vec, _ = train_tf_idf(suffix="_dll", use_less_value=use_less_value, map_func=map_func, data=stage2_output, max_feature=max_feature) 121 | 122 | save_dict(name_list, os.path.join(get_root_path(), "features", "stage2_name_list" + suffix + type_name)) 123 | stage2_output = api_vec.transform(stage2_output) 124 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "stage2" + suffix + type_name + ".npz"), stage2_output) 125 | 126 | 127 | def attribution(suffix="_dll", use_less_value=False, type_name="", map_func=None, max_feature=2000): 128 | api_vec, data = train_tf_idf(suffix="_dll", use_less_value=use_less_value, map_func=map_func, 129 | max_feature=max_feature) 130 | 131 | white, black, test = data 132 | 133 | black_output, name_list = to_str(black, mode=1, column_name="value") 134 | save_dict(name_list, os.path.join(get_root_path(), "features", "black_name_list" + suffix + type_name)) 135 | black_output = api_vec.transform(black_output) 136 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "black" + suffix + type_name + ".npz"), black_output) 137 | 138 | white_output, name_list = to_str(white, mode=1, column_name="value") 139 | save_dict(name_list, 
os.path.join(get_root_path(), "features", "white_name_list" + suffix + type_name)) 140 | white_output = api_vec.transform(white_output) 141 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "white" + suffix + type_name + ".npz"), white_output) 142 | 143 | test_str, name_list = to_str(test, mode=1, column_name="value") 144 | save_dict(name_list, os.path.join(get_root_path(), "features", "test_name_list" + suffix + type_name)) 145 | test_output = api_vec.transform(test_str) 146 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "test" + suffix + type_name + ".npz"), test_output) 147 | 148 | 149 | def train_tf_idf(suffix="_dll", use_less_value=False, map_func=None, max_feature=2000, data=None): 150 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 151 | min_df=3, max_df=0.9, 152 | strip_accents='unicode', 153 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=max_feature) 154 | 155 | if data is None: 156 | white = load_df(os.path.join(get_root_path(), "features", "white" + suffix), mode=1) 157 | black = load_df(os.path.join(get_root_path(), "features", "black" + suffix), mode=1) 158 | test = load_df(os.path.join(get_root_path(), "features", "test" + suffix), mode=1) 159 | 160 | if use_less_value: 161 | if map_func is None: 162 | for i in [white, black, test]: 163 | i["value"] = i["value"].map(lambda x: x.split("\\")[-1]) 164 | else: 165 | for i in [white, black, test]: 166 | i["value"] = i["value"].map(lambda x: map_func(x)) 167 | 168 | full = pd.concat([white, black, test]) 169 | full_str = to_str(full, column_name="value") 170 | else: 171 | full_str = data 172 | 173 | print(1) 174 | api_vec.fit(full_str) 175 | print(2) 176 | if data is None: 177 | return api_vec, [white, black, test] 178 | else: 179 | return api_vec, None 180 | 181 | 182 | def last_hkey(x): 183 | return "speech" if len(x.split("\\")) == 1 else x.split("\\")[-1] 184 | 185 | 186 | def second_hkey(x): 187 | return "speech" if len(x.split("\\")) == 1 else x.split("\\")[1] 188 | 189 | 190 | def stage2_api_new(feature_num=500): 191 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 192 | min_df=3, max_df=0.9, 193 | strip_accents='unicode', 194 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=feature_num) 195 | 196 | 197 | stage2 = load_df(os.path.join("features", "stage2")) 198 | 199 | black_output, name_list = to_str(stage2, mode=1) 200 | 201 | print(1) 202 | api_vec.fit(black_output) 203 | 204 | print(2) 205 | 206 | # black_output, name_list = to_str(stage2, mode=1) 207 | save_dict(name_list, os.path.join(get_root_path(), "features", "stage2_name_list"+str(feature_num))) 208 | black_output = api_vec.transform(black_output) 209 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "stage2"+str(feature_num)+".npz"), black_output) 210 | 211 | # attribution("_hkey", use_less_value=True, type_name="second", map_func=second_hkey, max_feature=100) 212 | # attribution("_hkey", use_less_value=True, type_name="last", map_func=last_hkey, max_feature=200) 213 | # attribution("_hkey", use_less_value=False, type_name="", max_feature=500) 214 | # attribution("_dll", use_less_value=False, type_name="", max_feature=500) 215 | # tianchi_api() 216 | # api() 217 | 218 | 219 | stage_2_attribute("_dll", False, max_feature=1000) 220 | stage_2_attribute("_hkey", use_less_value=True, type_name="last", map_func=last_hkey, max_feature=1000) 221 | stage_2_attribute("_hkey", use_less_value=True, type_name="first", map_func=second_hkey, max_feature=100) 222 | stage2_api_new(1000) 223 | 
--------------------------------------------------------------------------------
/prepare_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import List
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | import scipy.sparse
7 | 
8 | from basic_function import extract_id_from_file_name, load_dict, get_root_path
9 | 
10 | 
11 | def get_outside_train_features():
12 |     train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv"))
13 |     train.rename(columns={"id": "file_name"}, inplace=True)
14 |     full_features = pd.read_csv(os.path.join(get_root_path(), "features", "outside.csv"), index_col=0)
15 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
16 | 
17 |     test_name_list = load_dict(os.path.join(get_root_path(), "features", "test_name_list"))
18 |     test_data = pd.DataFrame(columns=["file_name"], data=np.array(test_name_list))
19 |     test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x))
20 | 
21 |     # merge
22 |     train_data = pd.merge(train, full_features, "left", on="file_name")
23 |     test_data = pd.merge(test_data, full_features, "left", on="file_name")
24 | 
25 |     label = train_data["safe_type"]
26 | 
27 |     train_data.drop(columns=["safe_type"], inplace=True)
28 | 
29 |     return train_data, label, test_data
30 | 
31 | 
32 | def load_clustering_statics_files():
33 |     full_features = pd.read_csv(os.path.join(get_root_path(), "features", "outside_stage2.csv"), index_col=0)
34 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
35 |     return full_features
36 | 
37 | 
38 | def load_ft_features(feature_files=None):
39 |     if feature_files is None:
40 |         feature_files = {"black": "black_features.csv", "white": "white_features.csv", "test": "test_features.csv"}
41 |     train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv"))
42 |     train.rename(columns={"id": "file_name"}, inplace=True)
43 | 
44 |     black_features = pd.read_csv(os.path.join(get_root_path(), "features", feature_files["black"]))
45 |     white_features = pd.read_csv(os.path.join(get_root_path(), "features", feature_files["white"]))
46 |     full_features = pd.concat([black_features, white_features])
47 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
48 | 
49 |     # load test data
50 |     test_data = pd.read_csv(os.path.join(get_root_path(), "features", feature_files["test"]))
51 |     test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x))
52 | 
53 |     # merge
54 |     train_dat = pd.merge(train, full_features, "inner", on="file_name")
55 | 
56 |     label = train_dat["safe_type"]
57 |     train_dat.drop(columns=["safe_type"], inplace=True)
58 |     return train_dat, label, test_data
59 | 
60 | 
61 | def load_runtime_features():
62 |     train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv"))
63 |     train.rename(columns={"id": "file_name"}, inplace=True)
64 | 
65 |     full_features = pd.read_csv(os.path.join("features", "train_used_time_feauture.csv"))  # note the "feauture" spelling of the file name
66 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
67 | 
68 |     # load test data
69 |     test_data = pd.read_csv(os.path.join("features", "test_used_time_feauture.csv"))
70 |     test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x))
71 | 
72 |     # merge
73 |     train_dat = pd.merge(train, full_features, "inner", on="file_name")
on="file_name") 74 | 75 | label = train_dat["safe_type"] 76 | train_dat.drop(columns=["safe_type"], inplace=True) 77 | return train_dat, label, test_data 78 | 79 | 80 | def load_depth_three_features(): 81 | return load_ft_features({"black": "black_features_depth_3.csv", "white": "white_features_depth_3.csv", 82 | "test": "test_features_depth_3.csv"}) 83 | 84 | 85 | def load_nn_features(): 86 | train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv")) 87 | train.rename(columns={"id": "file_name"}, inplace=True) 88 | 89 | train_features = pd.read_csv(os.path.join(get_root_path(), "features", "train_nn.csv")) 90 | train_features["file_name"] = train_features["file_name"].map(lambda x: extract_id_from_file_name(x)) 91 | 92 | # load test data 93 | test_data = pd.read_csv(os.path.join(get_root_path(), "features", "test_nn.csv")) 94 | test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x)) 95 | 96 | # merge 97 | train_dat = pd.merge(train, train_features, "inner", on="file_name") 98 | 99 | label = train_dat["safe_type"] 100 | train_dat.drop(columns=["safe_type"], inplace=True) 101 | return train_dat, label, test_data 102 | 103 | 104 | def load_tfidf_features(suffix, type_name=""): 105 | black = scipy.sparse.load_npz( 106 | os.path.join(get_root_path(), "features", "black" + suffix + type_name + ".npz")).toarray() 107 | white = scipy.sparse.load_npz( 108 | os.path.join(get_root_path(), "features", "white" + suffix + type_name + ".npz")).toarray() 109 | test = scipy.sparse.load_npz( 110 | os.path.join(get_root_path(), "features", "test" + suffix + type_name + ".npz")).toarray() 111 | 112 | black_l = np.ones((black.shape[0],)) 113 | white_l = np.zeros((white.shape[0],)) 114 | train_data = pd.DataFrame(np.concatenate((black, white), axis=0)) 115 | 116 | label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0)) 117 | 118 | test_df = pd.DataFrame(test) 119 | 120 | black_name_list = load_dict(os.path.join(get_root_path(), "features", "black_name_list" + suffix + type_name)) 121 | white_name_list = load_dict(os.path.join(get_root_path(), "features", "white_name_list" + suffix + type_name)) 122 | train_name_list = np.concatenate((black_name_list, white_name_list), axis=0) 123 | 124 | test_name_list = load_dict(os.path.join(get_root_path(), "features", "test_name_list" + suffix + type_name)) 125 | 126 | train_data["file_name"] = train_name_list 127 | train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x)) 128 | 129 | test_df["file_name"] = test_name_list 130 | test_df["file_name"] = test_df["file_name"].map(lambda x: extract_id_from_file_name(x)) 131 | 132 | return train_data, label, test_df 133 | 134 | 135 | def load_autoencoder_features(): 136 | features = np.load("train_nn.npy") 137 | print(features.shape) 138 | name = np.load("file_name_list_stage2.npy") 139 | label = np.load("label_nn.npy") 140 | features = pd.DataFrame(data=features) 141 | features["file_name"] = name 142 | features["file_name"] = features["file_name"].map(lambda x: extract_id_from_file_name(x)) 143 | 144 | """ 145 | useless test_df. 
146 |     """
147 |     test = scipy.sparse.load_npz(os.path.join(get_root_path(), "features", "test.npz")).toarray()
148 |     test_df = pd.DataFrame(test)
149 |     test_name_list = load_dict(os.path.join(get_root_path(), "features", "test_name_list"))
150 | 
151 |     test_df["file_name"] = test_name_list
152 |     test_df["file_name"] = test_df["file_name"].map(lambda x: extract_id_from_file_name(x))
153 |     return features, label, test_df
154 | 
155 | 
156 | def load_stage2_tf_idf(suffix, type_name=""):
157 |     stage2 = scipy.sparse.load_npz(
158 |         os.path.join(get_root_path(), "features", "stage2" + suffix + type_name + ".npz")).toarray()
159 | 
160 |     train_data = pd.DataFrame(stage2)
161 | 
162 |     stage2_name_list = load_dict(os.path.join(get_root_path(), "features", "stage2_name_list" + suffix + type_name))
163 | 
164 |     train_data["file_name"] = stage2_name_list
165 |     train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x))
166 | 
167 |     return train_data
168 | 
169 | 
170 | def load_nn_stage2_features():
171 |     nn_features = np.load("nn_features.npy")
172 |     name = np.load("file_name_list_stage2.npy")
173 | 
174 |     train_data = pd.DataFrame(nn_features)
175 |     train_data["file_name"] = name
176 |     train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x))
177 |     return train_data
178 | 
179 | 
180 | def load_tianchi_tf_idf():
181 |     stage2 = scipy.sparse.load_npz(
182 |         os.path.join(get_root_path(), "features", "tianchi" + ".npz")).toarray()
183 | 
184 |     train_data = pd.DataFrame(stage2)
185 | 
186 |     stage2_name_list = load_dict(os.path.join(get_root_path(), "features", "tianchi_name_list"))
187 | 
188 |     train_data["file_name"] = stage2_name_list
189 |     train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x))
190 |     tianchi = pd.read_csv("security_train.csv")[["label", "file_id"]].drop_duplicates()
191 |     tianchi = tianchi.rename(columns={"file_id": "file_name"})
192 |     full = pd.merge(train_data, tianchi, how="left", on="file_name")
193 |     label = full["label"]
194 | 
195 |     return train_data, label
196 | 
197 | 
198 | def load_tfidf_sparse_features(suffix):
199 |     black = scipy.sparse.load_npz(os.path.join(get_root_path(), "black" + suffix + ".npz"))
200 |     white = scipy.sparse.load_npz(os.path.join(get_root_path(), "white" + suffix + ".npz"))
201 |     test = scipy.sparse.load_npz(os.path.join(get_root_path(), "test" + suffix + ".npz"))
202 | 
203 |     white_file_id = load_dict("white_name_list")
204 |     black_file_id = load_dict("black_name_list")
205 | 
206 |     black_l = np.ones((black.shape[0],))
207 |     white_l = np.zeros((white.shape[0],))
208 |     train_data = scipy.sparse.vstack([black, white])
209 |     label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0))
210 | 
211 |     test_df = test
212 |     file_id = load_dict(os.path.join(get_root_path(), "test_name_list" + suffix))
213 |     return train_data, label, test_df, file_id, np.array(black_file_id + white_file_id)
214 | 
215 | 
216 | def merge_features(features: List):
217 |     train_data, label, test_data = features.pop(0)
218 |     train_data["label"] = label
219 | 
220 |     for i in range(len(features)):
221 |         train_data = pd.merge(train_data, features[i][0], how="left", on="file_name")
222 |         test_data = pd.merge(test_data, features[i][2], how="left", on="file_name")
223 | 
224 |     label = train_data["label"]
225 |     train_data.drop(columns=["label"], inplace=True)
226 | 
227 |     return train_data, label, test_data
228 | 
229 | 
230 | def drop_id(features: List):
231 |     features[0].drop(columns=["file_name"], inplace=True)
232 |     features[2].drop(columns=["file_name"], inplace=True)
233 |     return features
234 | 
235 | 
236 | if __name__ == '__main__':
237 |     train_data = load_stage2_tf_idf("1000")  # loads the stage2 tf-idf matrix written by feature_extraction/tfidf_features.py (stage2_api_new(1000))
238 | 
--------------------------------------------------------------------------------