├── param_search
│   ├── __init__.py
│   ├── test.sh
│   ├── larger_cluster_and_others.py
│   └── best_cluster_search.py
├── feature_extraction
│   ├── __init__.py
│   ├── save_embedding.py
│   ├── save_autoencoder_mid_layer.py
│   ├── nn_train.py
│   ├── nn_autoencoder_stage2.py
│   ├── cluster_static_features.py
│   └── tfidf_features.py
├── one_time_use
│   └── save_api_dict.py
├── .gitignore
├── metrics.py
├── cluster_performance_evaluate.py
├── basic_function.py
├── README.md
├── server_cluster.py
├── test_prepare_dataset.py
├── param_search_cluster.py
├── model.py
├── xml_to_csv.py
└── prepare_dataset.py

/param_search/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/feature_extraction/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/param_search/test.sh:
--------------------------------------------------------------------------------
1 | for ((i = 2 ; i < 4 ; i++)); do
2 |     echo "python ../server_cluster.py -d 0 -c 0 -nc 210 -l $i"
3 |     python ../server_cluster.py -d 0 -c 0 -nc 210 -l $i
4 | done
5 |
--------------------------------------------------------------------------------
/feature_extraction/save_embedding.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from model import load_model
4 | import numpy as np
5 |
6 | model = load_model(os.path.join("models", "weights.01-0.06924324.hdf5"), load_type=0)
7 | weight = model.layers[1].get_weights()[0]
8 | np.save("weight", weight)
--------------------------------------------------------------------------------
/one_time_use/save_api_dict.py:
--------------------------------------------------------------------------------
1 | from basic_function import load_df, save_dict
2 | import pandas as pd
3 |
4 | full = load_df("../features/stage2")
5 |
6 | # full.fillna("None", inplace=True)
7 | api_list = set(full['api_name'])
8 | print(len(api_list))
9 | api_dict = dict(zip(api_list, range(len(api_list))))
10 | save_dict(api_dict, "../features/api_dict.txt")
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | \.idea/
2 | .idea/
3 | __pycache__/
4 | *.png
5 | transactions.csv
6 | data/
7 | *.py[cod]
8 | *.so
9 | *.egg
10 | *.egg-info
11 | dist
12 | build
13 | *.ZE
14 | checkpoint
15 | *.meta
16 |
17 | *.index
18 | *.ckpt.data*
19 | *.tsv
20 | *.pbtxt
21 | *.csv
22 | *temp*
23 | *.h5
24 | *.txt
25 | *.rar
26 | model\.h5
27 | deep_learning/model.h5
28 | *.model
29 | .ipynb_checkpoints/
30 | *.npz
31 | *.data
32 | *.dat
33 | *.png
34 | one_time_use/.ipynb_checkpoints/*
35 | train_dict
36 | *.npy
37 | *.pkl
38 | feature_vectors/
39 | prediction/
40 | *.dirlock
41 | *.tmp
42 | basic_model/classifiers/catboost_info/
43 | model
44 | *.hdf5
45 | *.zip
46 | *.7z
47 | logs/
48 | models/
49 | *.wav
50 | dask-worker-space/
51 | feature_definition*
52 | test_name_list
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | def com_acc(y_true, y_pre, threshold):
2 |     try:
3 |         # y_true may be a pandas Series; fall back to its underlying numpy values
4 |         y_true = y_true.values
5 |     except AttributeError:
6 |         pass
7 |     score = 0
8 |     for i in range(len(y_pre)):
9 |         sk = 1 if y_pre[i] >= threshold else 0
10 |
score += int(sk == y_true[i]) - (sk - y_true[i])*sk 11 | return score/len(y_true) 12 | 13 | 14 | def com_acc_keras(threshold): 15 | def acc(y_true, y_pre): 16 | y_true = y_true 17 | score = 0 18 | print(y_pre.shape) 19 | for i in range(y_pre.shape[0]): 20 | sk = 1 if y_pre[i] >= threshold else 0 21 | score += int(sk == y_true[i]) - (sk - y_true[i]) * sk 22 | return score / len(y_true) 23 | return acc 24 | -------------------------------------------------------------------------------- /feature_extraction/save_autoencoder_mid_layer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from keras_preprocessing.sequence import pad_sequences 4 | 5 | from model import load_model 6 | import numpy as np 7 | 8 | now = datetime.datetime.now() 9 | time_name = str(now.month) + "_" + str(now.day) + "_" + str(now.hour) + "_" + str( 10 | now.minute) + "_" + "features_plus_nn_mid" 11 | model = load_model("./models/weights.01-0.03171408.hdf5", load_type=2) 12 | shape = (512, 64) 13 | input_dim = 92 + 1 14 | batch_size = 16 15 | epochs = 50 16 | class_num = 2 17 | 18 | final_api_list = np.load("api_list_stage2.npy") 19 | # final_api_list = np.load("api_list_nn.npy") 20 | # label = np.load("api_list_nn.npy") 21 | 22 | fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32', padding='post', truncating='post', 23 | value=input_dim-1) 24 | 25 | features = model.predict(fixed_sequence) 26 | 27 | np.save("nn_features", features) 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /cluster_performance_evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from sklearn import metrics 4 | 5 | from prepare_dataset import load_stage2_tf_idf 6 | import pandas as pd 7 | 8 | 9 | def evaluate_cluster_performance(X, labels): 10 | sc = metrics.silhouette_score(X, labels, metric='euclidean') 11 | chs = metrics.calinski_harabaz_score(X, labels) 12 | dbs = metrics.davies_bouldin_score(X, labels) 13 | print("silhouette_score:", sc) 14 | print("calinski_harabaz_score:", chs) 15 | print("davies_bouldin_score:", dbs) 16 | return [sc, chs, dbs] 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser("cluster", fromfile_prefix_chars='@') 21 | parser.add_argument('-n', '--class_file', type=str, help='class_file path') 22 | 23 | args = parser.parse_args() 24 | 25 | train_data = load_stage2_tf_idf("") 26 | labels = pd.read_csv(args.class_file) 27 | full = pd.merge(train_data, labels, "left", left_on="file_name", right_on="id") 28 | 29 | labels = full["family_id"] 30 | # full.drop(columns=["family_id", "id"], inplace=True) 31 | train_data.drop(columns=["file_name"], inplace=True) 32 | 33 | evaluate_cluster_performance(train_data, labels) -------------------------------------------------------------------------------- /basic_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from numpy import int64 5 | from numpy import nan, array 6 | 7 | 8 | def get_root_path() -> str: 9 | return os.path.abspath(os.path.dirname(__file__)) 10 | 11 | 12 | def useless(): 13 | return nan, array, int64 14 | 15 | 16 | def extract_id_from_file_name(x: str): 17 | return x.split('.')[0] 18 | 19 | 20 | def load_df(dir_name, mode=0): 21 | files = os.listdir(dir_name) 22 | if mode == 0: 23 | k = [pd.read_csv(os.path.join(dir_name, file))[['file_name', 
'api_name']] for file in files] 24 | else: 25 | k = [pd.read_csv(os.path.join(dir_name, file)) for file in files] 26 | return pd.concat(k) 27 | 28 | 29 | def make_dir(name): 30 | if not os.path.exists(name): 31 | os.makedirs(name) 32 | 33 | 34 | def get_file_list_in_dir(path): 35 | return [name for name in os.listdir(path) if not os.path.isdir(os.path.join(path, name))] 36 | 37 | 38 | def save_dict(dictionary, path): 39 | with open(path, "w") as f: 40 | f.write(str(dictionary)) 41 | return 42 | 43 | 44 | def load_dict(path): 45 | with open(path, "r") as f: 46 | dic = f.read() 47 | return eval(dic) 48 | -------------------------------------------------------------------------------- /param_search/larger_cluster_and_others.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from queue import Queue 3 | 4 | from param_search_cluster import train_cluster 5 | 6 | 7 | def print_param(**kwargs): 8 | result = train_cluster(**kwargs) 9 | print(kwargs) 10 | return result 11 | 12 | 13 | def worker(): 14 | while True: 15 | param = q.get() 16 | print_param(**param) 17 | q.task_done() 18 | 19 | 20 | use_limited_thread = False 21 | use_threading = False 22 | 23 | params = [{"data_type": 4, "cluster_way": 0, "n_clusters": 210, "dimension_reduction": 3, "n_components": 2800}, 24 | {"data_type": 4, "cluster_way": 0, "n_clusters": 600, "dimension_reduction": 3, "n_components": 2800}, 25 | {"data_type": 4, "cluster_way": 0, "n_clusters": 420, "dimension_reduction": 3, "n_components": 2800} 26 | ] 27 | 28 | if use_threading: 29 | if use_limited_thread: 30 | num_worker_threads = 2 31 | 32 | q = Queue() 33 | 34 | for i in range(num_worker_threads): 35 | t = threading.Thread(target=worker) 36 | t.daemon = True 37 | t.start() 38 | 39 | for item in params: 40 | q.put(item) 41 | 42 | q.join() 43 | else: 44 | for param in params: 45 | threading.Thread(target=print_param, kwargs=param).start() 46 | else: 47 | for param in params: 48 | print_param(**param) 49 | -------------------------------------------------------------------------------- /param_search/best_cluster_search.py: -------------------------------------------------------------------------------- 1 | from basic_function import save_dict 2 | from param_search_cluster import connect_params, train_cluster 3 | 4 | 5 | def search(): 6 | base_parameter = {"data_type": 0, "dimension_reduction": 0} 7 | 8 | top_search = {"n_clusters": [380, 400, 420, 440, 460, 480]} 9 | 10 | cluster_ways = [0] 11 | cluster_parameter = {0: [{"linkage": 0}]} 12 | 13 | full_parameter = [] 14 | scores_list = [] 15 | for key in top_search.keys(): 16 | for i in range(len(top_search[key])): 17 | full_parameter.append({key:top_search[key][i]}) 18 | for cluster_way in cluster_ways: 19 | full_parameter.append({"cluster_way":cluster_way}) 20 | for cp in cluster_parameter[cluster_way]: 21 | full_parameter.append(cp) 22 | real_full = [base_parameter] + full_parameter 23 | print("params:", connect_params(real_full)) 24 | scores = train_cluster(**connect_params(real_full)) 25 | scores_list.append([connect_params(real_full)] + scores) 26 | full_parameter.pop(-1) 27 | full_parameter.pop(-1) 28 | full_parameter.pop(-1) 29 | 30 | print(scores_list) 31 | 32 | save_dict(scores_list, "search_b.txt") 33 | 34 | for i in range(3): # length of evaluation ways 35 | print(sorted(scores_list, key=lambda x:x[i+1])) 36 | 37 | 38 | if __name__ == '__main__': 39 | search() -------------------------------------------------------------------------------- 
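A minimal sketch (not part of the repository) of how the per-parameter results returned by train_cluster can be ranked once each entry has the form [params, silhouette, calinski_harabasz, davies_bouldin]. connect_params below mirrors the helper in param_search_cluster.py, and the score values are placeholders; note that silhouette and Calinski-Harabasz are better when higher while Davies-Bouldin is better when lower, so a single ascending sort does not rank all three metrics the same way.

def connect_params(params):
    merged = {}
    for p in params:
        merged.update(p)
    return merged

# placeholder entries: [params, silhouette, calinski_harabasz, davies_bouldin]
scores_list = [
    [connect_params([{"data_type": 0}, {"n_clusters": 380}]), 0.12, 950.0, 1.9],
    [connect_params([{"data_type": 0}, {"n_clusters": 420}]), 0.15, 990.0, 1.7],
]

best_by_silhouette = max(scores_list, key=lambda row: row[1])      # higher is better
best_by_davies_bouldin = min(scores_list, key=lambda row: row[3])  # lower is better
print(best_by_silhouette[0], best_by_davies_bouldin[0])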
/README.md: -------------------------------------------------------------------------------- 1 | 特征提取引用了此处代码:[第三届阿里云算法赛_i_hate_mcdonalds团队_解决方案](https://github.com/DeanNg/3rd_security_competition) 2 | ![Slide1](https://user-images.githubusercontent.com/31768052/58380229-4bf18780-7fe1-11e9-8cba-c78027326686.JPG) 3 | ![Slide2](https://user-images.githubusercontent.com/31768052/58380230-4c8a1e00-7fe1-11e9-9bec-dd177b31ba38.JPG) 4 | ![Slide3](https://user-images.githubusercontent.com/31768052/58380232-4dbb4b00-7fe1-11e9-87ae-1d9ca81cc506.JPG) 5 | ![Slide4](https://user-images.githubusercontent.com/31768052/58380233-4e53e180-7fe1-11e9-9af1-9c83a7a1ce8c.JPG) 6 | ![Slide5](https://user-images.githubusercontent.com/31768052/58380235-4eec7800-7fe1-11e9-84e3-977a8bb6fa87.JPG) 7 | ![Slide6](https://user-images.githubusercontent.com/31768052/58380236-4eec7800-7fe1-11e9-9d00-9bb138dbf8b0.JPG) 8 | ![Slide7](https://user-images.githubusercontent.com/31768052/58380237-4f850e80-7fe1-11e9-8d93-a97b4651ea8d.JPG) 9 | ![Slide8](https://user-images.githubusercontent.com/31768052/58380238-4f850e80-7fe1-11e9-9d74-ffc3a96cd958.JPG) 10 | ![Slide9](https://user-images.githubusercontent.com/31768052/58380240-51e76880-7fe1-11e9-8187-acf536ed1782.JPG) 11 | ![Slide10](https://user-images.githubusercontent.com/31768052/58380241-51e76880-7fe1-11e9-8a5e-b3838d239cb1.JPG) 12 | ![Slide11](https://user-images.githubusercontent.com/31768052/58380242-527fff00-7fe1-11e9-9392-f5f3656380fa.JPG) 13 | ![Slide12](https://user-images.githubusercontent.com/31768052/58380243-527fff00-7fe1-11e9-8329-5a9e1d21104b.JPG) 14 | ![Slide13](https://user-images.githubusercontent.com/31768052/58380244-53189580-7fe1-11e9-8556-1b88899f3ba1.JPG) 15 | -------------------------------------------------------------------------------- /server_cluster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from param_search_cluster import train_cluster 3 | 4 | parser = argparse.ArgumentParser("cluster", fromfile_prefix_chars='@') 5 | parser.add_argument('-d', '--dimension_reduction', type=int, default=0, help='0:none,1:pca, 2:nmf') 6 | parser.add_argument('-c', "--cluster_way", type=int, default=0, help='1:birch, 3:dbscan') 7 | parser.add_argument('-n', "--n_components", type=int, default=0, help='n_components') 8 | parser.add_argument('-e', "--eps", type=float, default=1.0, help='dbscan') 9 | parser.add_argument('-t', "--threshold", type=float, default=2, help='birch threshold') 10 | parser.add_argument('-nc', "--n_clusters", type=int, default=200, help='birch n_clusters') 11 | parser.add_argument('-bc', "--branching_factor", type=int, default=50, help='birch branching_factor') 12 | parser.add_argument('-l', "--linkage", type=int, default=0, 13 | help='AgglomerativeClustering: ["ward", "complete", "average", "single"]') 14 | parser.add_argument('-dt', "--data_type", type=int, default=0, help='0:only tfidf, 1:all, 2:only nn') 15 | parser.add_argument('-i', "--max_iter", type=int, default=200, help='max_iter for NMF') 16 | 17 | args = parser.parse_args() 18 | 19 | data_type = args.data_type 20 | dimension_reduction = args.dimension_reduction 21 | cluster_way = args.cluster_way 22 | n_components = args.n_components 23 | threshold = args.threshold 24 | n_clusters = args.n_clusters 25 | branching_factor = args.branching_factor 26 | linkage = args.linkage 27 | max_iter = args.max_iter 28 | eps = args.eps 29 | 30 | train_cluster(data_type, dimension_reduction, cluster_way, n_components, threshold, n_clusters, 
branching_factor, 31 | linkage, max_iter, eps) 32 | -------------------------------------------------------------------------------- /test_prepare_dataset.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from prepare_dataset import * 4 | 5 | 6 | class TestPrepareDataSet(TestCase): 7 | def test_get_outside_train_features(self): 8 | train_data, label, test_data = get_outside_train_features() 9 | print(train_data.shape) 10 | print(test_data.shape) 11 | 12 | self.assertEqual(train_data.shape[0], 30000) 13 | self.assertEqual(test_data.shape[0], 15000) 14 | 15 | def test_load_ft_features(self): 16 | train_data, label, test_data = load_ft_features() 17 | print(train_data.shape) 18 | print(test_data.shape) 19 | 20 | self.assertEqual(train_data.shape[0], 30000) 21 | self.assertEqual(test_data.shape[0], 15000) 22 | 23 | def test_load_nn_features(self): 24 | train_data, label, test_data = load_nn_features() 25 | print(train_data.shape) 26 | print(test_data.shape) 27 | 28 | self.assertEqual(train_data.shape[0], 30000) 29 | self.assertEqual(test_data.shape[0], 15000) 30 | 31 | def test_load_depth_three_features(self): 32 | train_data, label, test_data = load_depth_three_features() 33 | 34 | print(train_data.shape) 35 | print(test_data.shape) 36 | 37 | self.assertEqual(train_data.shape[0], 30000) 38 | self.assertEqual(test_data.shape[0], 15000) 39 | 40 | def test_load_tfidf_features(self): 41 | train_data, label, test_data = load_tfidf_features("") 42 | print(train_data.shape) 43 | print(test_data.shape) 44 | 45 | self.assertEqual(train_data.shape[0], 30000) 46 | self.assertEqual(test_data.shape[0], 15000) 47 | 48 | train_data, label, test_data = load_tfidf_features("_hkey") 49 | print(train_data.shape) 50 | print(test_data.shape) 51 | 52 | self.assertEqual(train_data.shape[0], 21154) 53 | self.assertEqual(test_data.shape[0], 10661) 54 | 55 | train_data, label, test_data = load_tfidf_features("_dll") 56 | print(train_data.shape) 57 | print(test_data.shape) 58 | 59 | self.assertEqual(train_data.shape[0], 29990) 60 | self.assertEqual(test_data.shape[0], 14989) 61 | 62 | 63 | -------------------------------------------------------------------------------- /feature_extraction/nn_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras.callbacks import ModelCheckpoint 6 | from keras.utils import to_categorical 7 | from keras_preprocessing.sequence import pad_sequences 8 | from sklearn.model_selection import train_test_split 9 | 10 | from basic_function import load_dict, load_df, get_root_path, save_dict 11 | from metrics import com_acc 12 | from model import get_model 13 | 14 | shape = (500, 64) 15 | input_dim = 92 + 1 16 | batch_size = 16 17 | epochs = 50 18 | class_num = 2 19 | 20 | api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt")) 21 | white = pd.read_csv(os.path.join(get_root_path(), "features", "white.csv"))[['file_name', 'api_name', 'call_time']] 22 | black = load_df(os.path.join(get_root_path(), "features", "black"), mode=1)[['file_name', 'api_name', 'call_time']] 23 | 24 | white_label = np.zeros(white.shape[0]) 25 | black_label = np.ones(black.shape[0]) 26 | 27 | full = pd.concat([white, black], sort=False) 28 | label = np.concatenate((white_label, black_label)) 29 | full['label'] = label 30 | 31 | full['api_name'] = full['api_name'].map(api_dict) 32 | 33 | final_api_list = [] 34 | 
label = [] 35 | 36 | less = False 37 | length = 3 38 | for file_name, api_df in full.groupby('file_name'): 39 | api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort") 40 | final_api_list.append(api_df['api_name'].values) 41 | label.append(api_df['label'].values[0]) 42 | 43 | fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32', padding='post', truncating='post', 44 | value=input_dim-1) 45 | 46 | label = to_categorical(label, num_classes=class_num) 47 | x, x_test, y, y_test = train_test_split(fixed_sequence, label, test_size=0.25) 48 | 49 | checkpoint = ModelCheckpoint(filepath='./models/weights.{epoch:02d}-{val_loss:.8f}.hdf5', 50 | monitor='val_acc', 51 | verbose=1, 52 | save_best_only=False) 53 | 54 | callbacks = [checkpoint] 55 | 56 | model = get_model(shape=shape, model_type=4, n=1, input_dim=input_dim, class_num=class_num, use_attention=True) 57 | for i in range(epochs): 58 | model.fit(np.array(x), y, 59 | batch_size=batch_size, 60 | epochs=1, 61 | validation_data=(np.array(x_test), y_test), 62 | shuffle=True, 63 | callbacks=callbacks) 64 | 65 | predictions = model.predict(np.array(x_test))[:, 1] 66 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.1) 67 | print("valid:", acc, 0.1) 68 | 69 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.3) 70 | print("valid:", acc, 0.3) 71 | 72 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.5) 73 | print("valid:", acc, 0.5) 74 | 75 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.6) 76 | print("valid:", acc, 0.6) 77 | 78 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.7) 79 | print("valid:", acc, 0.7) 80 | 81 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.9) 82 | print("valid:", acc, 0.9) 83 | 84 | 85 | -------------------------------------------------------------------------------- /feature_extraction/nn_autoencoder_stage2.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras.callbacks import ModelCheckpoint 6 | from keras.utils import to_categorical 7 | from keras_preprocessing.sequence import pad_sequences 8 | from sklearn.model_selection import train_test_split 9 | 10 | from basic_function import load_dict, load_df, get_root_path, save_dict 11 | from metrics import com_acc 12 | from model import get_model 13 | from shorten_api_list import delete_repeat_pattern, delete_same_pattern 14 | import pickle 15 | 16 | shape = (512, 64) 17 | input_dim = 92 + 1 18 | batch_size = 32 19 | epochs = 50 20 | class_num = 2 21 | 22 | api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt")) 23 | white = load_df(os.path.join(get_root_path(), "features", "white"), mode=1)[['file_name', 'api_name', 'call_time']] 24 | black = load_df(os.path.join(get_root_path(), "features", "black"), mode=1)[['file_name', 'api_name', 'call_time']] 25 | 26 | white_label = np.zeros(white.shape[0]) 27 | black_label = np.ones(black.shape[0]) 28 | 29 | full = pd.concat([white, black], sort=False) 30 | label = np.concatenate((white_label, black_label)) 31 | full['label'] = label 32 | 33 | full['api_name'] = full['api_name'].map(api_dict) 34 | 35 | # full = load_df(os.path.join(get_root_path(), "features", "stage2"), mode=1)[['file_name', 'api_name', 'call_time']] 36 | full['label'] = np.zeros((full.shape[0],)) 37 | full['api_name'] = 
full['api_name'].map(api_dict) 38 | 39 | 40 | final_api_list = [] 41 | file_name_list = [] 42 | label = [] 43 | 44 | less = False 45 | length = 3 46 | if not less: 47 | for file_name, api_df in full.groupby('file_name'): 48 | api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort") 49 | final_api_list.append(api_df['api_name'].values) 50 | label.append(api_df['label'].values[0]) 51 | file_name_list.append(file_name) 52 | 53 | 54 | try: 55 | # pickle.dump(final_api_list, "./api_list.pk") 56 | # pickle.dump(label, "./label.pk") 57 | np.save("api_list_stage2", final_api_list) 58 | np.save("file_name_list_stage2", file_name_list) 59 | # np.save("label_", label) 60 | except: 61 | print("error") 62 | else: 63 | for file_name, api_df in full.groupby('file_name'): 64 | api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort") 65 | result = delete_repeat_pattern(api_df['api_name'].values.tolist(), 2) 66 | result = delete_same_pattern(result, 3) 67 | 68 | final_api_list.append(result) 69 | 70 | label.append(api_df['label'].values[0]) 71 | try: 72 | save_dict(final_api_list, "./api_list_less.txt") 73 | save_dict(label, "./label_less.txt") 74 | except: 75 | print("error") 76 | 77 | # final_api_list = np.load("api_list_stage2.npy") 78 | # label = np.zeros((len(final_api_list))) 79 | # print(api_df) 80 | # print(final_api_list) 81 | fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32', padding='post', truncating='post', 82 | value=input_dim-1) 83 | 84 | label = to_categorical(label, num_classes=class_num) 85 | x, x_test, y, y_test = train_test_split(fixed_sequence, label, test_size=0.25) 86 | 87 | checkpoint = ModelCheckpoint(filepath='./models/weights.{epoch:02d}-{val_loss:.8f}.hdf5', 88 | monitor='val_acc', 89 | verbose=1, 90 | save_best_only=False) 91 | 92 | callbacks = [checkpoint] 93 | 94 | model = get_model(shape=shape, model_type=6, n=1, input_dim=input_dim, class_num=class_num, use_attention=True) 95 | for i in range(epochs): 96 | model.fit(np.array(x), y, 97 | batch_size=batch_size, 98 | epochs=1, 99 | validation_data=(np.array(x_test), y_test), 100 | shuffle=True, 101 | callbacks=callbacks) 102 | 103 | predictions = model.predict(np.array(x_test))[:, 1] 104 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.1) 105 | print("valid:", acc, 0.1) 106 | 107 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.3) 108 | print("valid:", acc, 0.3) 109 | 110 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.5) 111 | print("valid:", acc, 0.5) 112 | 113 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.6) 114 | print("valid:", acc, 0.6) 115 | 116 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.7) 117 | print("valid:", acc, 0.7) 118 | 119 | acc = com_acc(np.array(y_test)[:, 1].flatten(), predictions.flatten(), 0.9) 120 | print("valid:", acc, 0.9) 121 | 122 | 123 | -------------------------------------------------------------------------------- /param_search_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.cluster import DBSCAN, AgglomerativeClustering, AffinityPropagation 6 | from sklearn.cluster import KMeans, Birch 7 | from sklearn.decomposition import NMF, IncrementalPCA, PCA 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | from cluster_performance_evaluate import evaluate_cluster_performance 11 | 
from prepare_dataset import load_nn_stage2_features, load_stage2_tf_idf, load_clustering_statics_files 12 | 13 | 14 | def train_cluster(data_type=0, dimension_reduction=0, cluster_way=0, n_components=50, threshold=2, n_clusters=210, 15 | branching_factor=50, linkage=0, max_iter=500, eps=1.0): 16 | if data_type == 0: 17 | train_data = load_stage2_tf_idf("") 18 | elif data_type == 1: 19 | train_data = load_stage2_tf_idf("") 20 | nn_data = load_nn_stage2_features() 21 | train_data = pd.merge(train_data, nn_data, 'left', on="file_name") 22 | elif data_type == 2: 23 | train_data = load_nn_stage2_features() 24 | elif data_type == 3: 25 | train_data = load_stage2_tf_idf("1000") 26 | nn_data = load_nn_stage2_features() 27 | train_data = pd.merge(train_data, nn_data, 'left', on="file_name") 28 | dll = load_stage2_tf_idf("_dll") 29 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 30 | dll = load_stage2_tf_idf("_hkey", "first") 31 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 32 | dll = load_stage2_tf_idf("_hkey", "last") 33 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 34 | train_data.fillna(0, inplace=True) 35 | elif data_type == 4: 36 | train_data = load_stage2_tf_idf("1000") 37 | nn_data = load_nn_stage2_features() 38 | train_data = pd.merge(train_data, nn_data, 'left', on="file_name") 39 | dll = load_stage2_tf_idf("_dll") 40 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 41 | dll = load_stage2_tf_idf("_hkey", "first") 42 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 43 | dll = load_stage2_tf_idf("_hkey", "last") 44 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 45 | dll = load_clustering_statics_files() 46 | train_data = pd.merge(train_data, dll, 'left', on="file_name") 47 | train_data.fillna(0, inplace=True) 48 | 49 | file_name = train_data["file_name"] 50 | train_data.drop(columns=["file_name"], inplace=True) 51 | X = StandardScaler(with_mean=False).fit_transform(train_data) 52 | origin_data = X 53 | 54 | if dimension_reduction == 0: 55 | pass 56 | elif dimension_reduction == 1: 57 | model = IncrementalPCA(n_components=n_components) 58 | X = model.fit_transform(X) 59 | elif dimension_reduction == 2: 60 | model = NMF(n_components=n_components, init='random', random_state=0, max_iter=max_iter) 61 | X = model.fit_transform(X) 62 | elif dimension_reduction == 3: 63 | model = PCA(n_components=n_components) 64 | X = model.fit_transform(X) 65 | 66 | print(len(X[0])) 67 | if cluster_way == 0: 68 | mode = ["ward", "complete", "average", "single"] 69 | db = AgglomerativeClustering(n_clusters=n_clusters, linkage=mode[linkage]).fit(X) 70 | labels = db.labels_ 71 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 72 | os.path.join("predictions", "aggcl" + "_" + str(n_clusters) + "_" + str(data_type) + "_" + str( 73 | dimension_reduction) + "_" + str(n_components) + ".csv"), index=False) 74 | print(len(set(labels))) 75 | elif cluster_way == 1: 76 | db = Birch(branching_factor=branching_factor, n_clusters=n_clusters, threshold=threshold).fit(X) 77 | labels = db.predict(X) 78 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 79 | os.path.join("predictions", "birch" + ".csv"), 80 | index=False) 81 | print(len(set(labels))) 82 | elif cluster_way == 2: 83 | db = hdbscan.HDBSCAN(min_cluster_size=40) 84 | db.fit(X) 85 | labels = db.labels_ 86 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 87 | os.path.join("predictions", "hdb_40" + ".csv"), 
88 | index=False) 89 | print(len(set(labels))) 90 | elif cluster_way == 3: 91 | db = DBSCAN(eps=eps, n_jobs=-1).fit(X) 92 | labels = db.labels_ 93 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 94 | os.path.join("predictions", "db" + "_" + str(eps) + "_" + str(dimension_reduction) + ".csv"), 95 | index=False) 96 | print(len(set(labels))) 97 | elif cluster_way == 4: 98 | labels = np.zeros((len(file_name),)) 99 | pd.DataFrame(data={"id": file_name, "family_id": np.zeros((len(file_name),))}).to_csv( 100 | os.path.join("predictions", "zeros" + ".csv"), 101 | index=False) 102 | elif cluster_way == 5: 103 | db = KMeans(n_clusters=n_clusters, random_state=0).fit(X) 104 | labels = db.labels_ 105 | pd.DataFrame(data={"id": file_name, "family_id": db.labels_}).to_csv( 106 | os.path.join("predictions", "kmeans" + str(n_clusters) + ".csv"), 107 | index=False) 108 | print(len(set(labels))) 109 | elif cluster_way == 6: 110 | db = AffinityPropagation() 111 | 112 | # Number of clusters in labels, ignoring noise if present. 113 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 114 | n_noise_ = list(labels).count(-1) 115 | 116 | print('Estimated number of clusters: %d' % n_clusters_) 117 | print('Estimated number of noise points: %d' % n_noise_) 118 | 119 | scores = evaluate_cluster_performance(origin_data, labels) 120 | evaluate_cluster_performance(X, labels) 121 | return scores 122 | 123 | 124 | def connect_params(params): 125 | full = {} 126 | for i in params: 127 | full.update(i) 128 | return full 129 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import numpy as np 3 | from keras import Model 4 | from keras import layers 5 | from keras.layers import Dropout, BatchNormalization, Embedding, SpatialDropout1D, GlobalMaxPooling1D, Conv1D, \ 6 | concatenate, Dense, Activation, MaxPool1D, Flatten, Lambda 7 | from keras.utils import plot_model 8 | 9 | 10 | def get_model(shape=(500, 64), num_classes=2, input_dim=93, model_type=0, **kwargs): 11 | if model_type == 4: 12 | return deeper_textcnn(shape, num_classes, input_dim, **kwargs) 13 | elif model_type == 6: 14 | return cnn_autoencoder(shape, num_classes, input_dim, **kwargs) 15 | else: 16 | print("error") 17 | 18 | 19 | def deeper_textcnn(shape=(500, 64), num_classes=2, input_dim=93, n=1, use_attention=False, **kwargs): 20 | input_array = keras.Input(shape=(shape[0],), name='input') 21 | 22 | embedding = Embedding(input_dim=input_dim, output_dim=shape[1]) 23 | embedding_output = embedding(input_array) 24 | 25 | _embed = SpatialDropout1D(0.25)(embedding_output) 26 | warppers = [] 27 | kernel_size = [2, 3, 4, 5] 28 | 29 | for _kernel_size in kernel_size: 30 | for dilated_rate in [1, 2, 3, 4]: 31 | num_res_blocks = 1 32 | num_filters_in = 64 33 | conv1d = Conv1D(filters=num_filters_in, kernel_size=_kernel_size, dilation_rate=dilated_rate)(_embed) 34 | b = BatchNormalization()(conv1d) 35 | r = Activation("elu")(b) 36 | x = r 37 | 38 | conv1 = Conv1D(filters=num_filters_in, kernel_size=1, padding="same")(x) 39 | b = BatchNormalization()(conv1) 40 | r = Activation("elu")(b) 41 | conv2 = Conv1D(filters=num_filters_in, kernel_size=3, padding="same")(r) 42 | b = BatchNormalization()(conv2) 43 | r = Activation("elu")(b) 44 | 45 | x = keras.layers.add([x, r]) 46 | 47 | x = BatchNormalization()(x) 48 | x = Activation('elu')(x) 49 | warppers.append(GlobalMaxPooling1D()(x)) 50 | 51 | fc = 
concatenate(warppers) 52 | fc = Dropout(0.25)(fc) 53 | fc = Dense(50, activation='relu', name="feature_layer")(fc) 54 | preds = Dense(num_classes, activation='softmax')(fc) 55 | 56 | model = Model(inputs=input_array, outputs=preds) 57 | 58 | model.compile(loss=l2_softmax(5), 59 | optimizer='adam', 60 | metrics=['accuracy']) 61 | model.summary() 62 | plot_model(model, "attention.png") 63 | return model 64 | 65 | 66 | def cnn_autoencoder(shape=(500, 64), num_classes=2, input_dim=93, n=1, use_attention=False, **kwargs): 67 | input_array = keras.Input(shape=(shape[0],), name='input') 68 | 69 | embedding = Embedding(input_dim=input_dim, output_dim=shape[1], weights=[np.load("weight.npy")]) 70 | embedding.trainable = False 71 | embedding_output = embedding(input_array) 72 | 73 | _embed = SpatialDropout1D(0.25)(embedding_output) 74 | kernel_size = 3 75 | dilated_rate = 1 76 | 77 | num_filters_in = 64 78 | conv1d = Conv1D(filters=num_filters_in, kernel_size=kernel_size, padding="same")(_embed) 79 | b = BatchNormalization()(conv1d) 80 | r = Activation("elu")(b) 81 | x = r 82 | 83 | conv1 = Conv1D(filters=num_filters_in, kernel_size=1, padding="same")(x) 84 | b = BatchNormalization()(conv1) 85 | r = Activation("elu")(b) 86 | conv2 = Conv1D(filters=num_filters_in, kernel_size=3, padding="same")(r) 87 | b = BatchNormalization()(conv2) 88 | r = Activation("elu")(b) 89 | 90 | x = keras.layers.add([x, r]) 91 | 92 | x = BatchNormalization()(x) 93 | x = Activation('elu')(x) 94 | x = MaxPool1D(pool_size=128, strides=128, data_format='channels_last')(x) 95 | fc = Activation("sigmoid", name="feature_output")(x) 96 | 97 | fc = layers.UpSampling1D(size=128)(fc) 98 | fc = BatchNormalization()(fc) 99 | fc = Activation('elu')(fc) # add later 100 | # mid = Flatten(name="feature_output")(fc) 101 | 102 | conv2 = Conv1D(filters=num_filters_in, kernel_size=3, padding="same")(fc) 103 | b = BatchNormalization()(conv2) 104 | r = Activation("elu")(b) 105 | 106 | conv1 = Conv1D(filters=num_filters_in, kernel_size=1, padding="same")(r) 107 | b = BatchNormalization()(conv1) 108 | r = Activation("elu")(b) 109 | 110 | x = keras.layers.add([fc, r]) 111 | conv1d = Conv1D(filters=num_filters_in, kernel_size=kernel_size, padding="same")(x) 112 | b = BatchNormalization()(conv1d) 113 | r = Activation("elu")(b) 114 | 115 | output = r 116 | 117 | output = Lambda(lambda x: keras.losses.mean_squared_error(x[0], x[1]), name='loss', 118 | output_shape=(1,))([output, embedding_output]) 119 | 120 | model = Model(inputs=input_array, outputs=output) 121 | 122 | model.compile(loss=loss_first, 123 | optimizer='adam') 124 | model.summary() 125 | plot_model(model, "attention.png") 126 | return model 127 | 128 | 129 | def loss_first(x, y): 130 | return y 131 | 132 | 133 | def l2_softmax(alpha): 134 | def a(y_true, y_pred): 135 | y_normal = alpha * keras.backend.l2_normalize(y_pred) 136 | 137 | return keras.losses.categorical_crossentropy(y_true, y_normal) 138 | 139 | return a 140 | 141 | 142 | def load_model(model_path, load_type=0) -> keras.Model: 143 | """ 144 | 返回训练好的模型 145 | :return: 146 | """ 147 | 148 | def temp(a, b=0): 149 | return a 150 | 151 | if load_type == 0: 152 | a = temp 153 | return keras.models.load_model(model_path, custom_objects={'a': l2_softmax(5)}) 154 | elif load_type == 1: 155 | a = temp 156 | model = keras.models.load_model(model_path, custom_objects={'a': l2_softmax(5)}) 157 | plot_model(model, "attention.png") 158 | 159 | output = model.get_layer('feature_layer').output 160 | model = Model(model.input, output) 161 | 
model.summary() 162 | return model 163 | elif load_type == 2: 164 | a = temp 165 | model = keras.models.load_model(model_path, 166 | custom_objects={'a': l2_softmax(5), "keras": keras, "loss_first": loss_first}) 167 | plot_model(model, "attention.png") 168 | 169 | output = model.get_layer('feature_output').output 170 | output = Flatten()(output) 171 | model = Model(model.input, output) 172 | model.summary() 173 | return model 174 | 175 | 176 | if __name__ == '__main__': 177 | model = load_model("weights.01-0.06816901.hdf5", 1) 178 | plot_model(model, "attention.png") 179 | -------------------------------------------------------------------------------- /feature_extraction/cluster_static_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from contextlib import contextmanager 4 | 5 | import pandas as pd 6 | 7 | # FEATURE ENGINEERING V1 8 | from basic_function import load_df 9 | 10 | 11 | def makeFeature(data, is_train=True): 12 | ''' 13 | file_cnt: file有多少样本; 14 | tid_distinct_cnt: file发起了多少线程; 15 | api_distinct_cnt: file调用了多少不同的API ; 16 | value_distinct_cnt: file有多少不同的返回值; 17 | tid_api_cnt_max,tid_api_cnt_min,tid_api_cnt_mean: ","file中的线程调用的 最多/最少/平均 api数目; 18 | tid_api_distinct_cnt_max, tid_api_distinct_cnt_min, tid_api_distinct_cnt_mean:; 19 | file中的线程调用的 最多/最少/平均 不同api数目 ; 20 | value_equals0_cnt: file返回值为0的样本数; 21 | value_equals0_rate: file返回值为0的样本比率; 22 | ''' 23 | if is_train: 24 | return_data = data[['file_id', 'label']].drop_duplicates() 25 | else: 26 | return_data = data[['file_id']].drop_duplicates() 27 | ################################################################################ 28 | 29 | feat = data.groupby(['file_id']).agg( 30 | {'api': pd.Series.nunique, 'return_value': pd.Series.nunique}).reset_index() 31 | feat.columns = ['file_id', 'api_distinct_cnt', 'value_distinct_cnt'] 32 | return_data = return_data.merge(feat, on='file_id', how='left') 33 | ################################################################################ 34 | feat = data[data.return_value == 0].groupby(['file_id']).return_value.count().reset_index(name='value_equals0_cnt') 35 | return_data = return_data.merge(feat, on='file_id', how='left') 36 | ################################################################################ 37 | return_data.loc[:, 'value_equals0_rate'] = (return_data.value_equals0_cnt + 1) / (return_data.file_cnt + 1) 38 | 39 | return return_data 40 | 41 | 42 | # FEATURE ENGINEERING V2 43 | def makeFeature_v2(data): 44 | ''' 45 | api_index_min: api首次出现的index; 46 | api_cnt: api出现的次数; 47 | api_rate: api出现的次数占所有api调用次数的比例; 48 | api_value_equals_0_cnt: api返回值为0的次数; 49 | ''' 50 | return_data = data[['file_id']].drop_duplicates() 51 | 52 | # 统计file调用api的次数 53 | tmp = data.groupby(['file_id']).api.count() 54 | 55 | # 统计api调用的最小Index 56 | feat = data.groupby(['file_id', 'api'])['index'].min().reset_index(name='val') 57 | feat = feat.pivot(index='file_id', columns='api', values='val') 58 | feat.columns = [feat.columns[i] + '_index_min' for i in range(feat.shape[1])] 59 | feat_withFileid = feat.reset_index() 60 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 61 | # 统计api调用的次数 62 | feat = data.groupby(['file_id', 'api'])['index'].count().reset_index(name='val') 63 | feat = feat.pivot(index='file_id', columns='api', values='val') 64 | feat.columns = [feat.columns[i] + '_cnt' for i in range(feat.shape[1])] 65 | feat_withFileid = feat.reset_index() 66 | return_data = 
return_data.merge(feat_withFileid, on='file_id', how='left') 67 | # 统计api调用的比例 68 | feat_rate = pd.concat([feat, tmp], axis=1) 69 | feat_rate = feat_rate.apply(lambda x: x / feat_rate.api) 70 | feat_rate.columns = [feat_rate.columns[i] + '_rate' for i in range(feat_rate.shape[1])] 71 | feat_rate_withFileid = feat_rate.reset_index().drop(['api_rate'], axis=1) 72 | return_data = return_data.merge(feat_rate_withFileid, on='file_id', how='left') 73 | 74 | # 统计api返回值为0的次数 75 | feat = data[data.return_value == 0].groupby(['file_id', 'api'])['index'].count().reset_index(name='val') 76 | feat = feat.pivot(index='file_id', columns='api', values='val') 77 | feat.columns = [feat.columns[i] + '_value_equals_0_cnt' for i in range(feat.shape[1])] 78 | feat_withFileid = feat.reset_index() 79 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 80 | 81 | return return_data 82 | 83 | 84 | # FEATURE ENGINEERING V3 85 | def makeFeature_v3(data): 86 | ''' 87 | api_not0_index_min: api返回值不为0的index的最小值; 88 | api_not0_index_min_diff: api返回值不为0时最小index和该api出现的最小index的差; 89 | api_equals0_rate: api返回值为0的次数占该api次数的比例 90 | ''' 91 | return_data = data[['file_id']].drop_duplicates() 92 | # 统计api调用的最小Index 93 | feat_api_min_index = data.groupby(['file_id', 'api'])['index'].min().reset_index(name='min_index') 94 | feat_api_not0_min_index = data[data.return_value != 0].groupby(['file_id', 'api'])['index'].min().reset_index( 95 | name='value_not0_min_index') 96 | # 统计return_value不为0的最小Index 97 | feat = feat_api_not0_min_index.pivot(index='file_id', columns='api', values='value_not0_min_index') 98 | feat.columns = [feat.columns[i] + '_not0_index_min' for i in range(feat.shape[1])] 99 | feat_withFileid = feat.reset_index() 100 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 101 | # 统计return_value不为0的最小Index和api最小index的差 102 | feat = feat_api_min_index.merge(feat_api_not0_min_index, on=['file_id', 'api'], how='left') 103 | feat.loc[:, 'api_index_not0_min_diff'] = feat['value_not0_min_index'] - feat['min_index'] 104 | feat = feat.pivot(index='file_id', columns='api', values='api_index_not0_min_diff') 105 | feat.columns = [feat.columns[i] + '_not0_index_min_diff' for i in range(feat.shape[1])] 106 | feat_withFileid = feat.reset_index() 107 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 108 | # 统计api返回值为0的次数 109 | feat = data[data.return_value == 0].groupby(['file_id', 'api'])['index'].count().reset_index( 110 | name='value_equals0_cnt') 111 | feat_api_cnt = data.groupby(['file_id', 'api']).return_value.count().reset_index(name='file_api_cnt') 112 | feat = feat.merge(feat_api_cnt, on=['file_id', 'api'], how='left') 113 | feat.loc[:, 'value_equals0_rate'] = feat['value_equals0_cnt'] / (feat['file_api_cnt'] * 1.0) 114 | # 统计return_value为0的比例 115 | feat = feat.pivot(index='file_id', columns='api', values='value_equals0_rate') 116 | feat.columns = [feat.columns[i] + '_equals0_rate' for i in range(feat.shape[1])] 117 | feat_withFileid = feat.reset_index() 118 | return_data = return_data.merge(feat_withFileid, on='file_id', how='left') 119 | 120 | return return_data 121 | 122 | 123 | # TIME-COST FUNCTION 124 | @contextmanager 125 | def timer(title): 126 | t0 = time.time() 127 | yield 128 | print("{} - done in {:.2f}s".format(title, time.time() - t0)) 129 | 130 | 131 | def extract_features(load_name, extract_function): 132 | white = load_df(load_name, mode=1)[['file_name', 'api_name', 'call_name', 'call_pid', 133 | 'ret_value', 'apiArg_list_count', 
'exInfo_list_count', 'call_time']] 134 | white.rename(columns={'ret_value': 'return_value', 'api_name': 'api', 'file_name': 'file_id', 'call_time': 'index'}, 135 | inplace=True) 136 | a = extract_function(white) 137 | a.rename(columns={'file_id': 'file_name'}, inplace=True) 138 | return a 139 | 140 | 141 | if __name__ == '__main__': 142 | def inter(load_name): 143 | white = load_df(load_name, mode=1)[['file_name', 'api_name', 'ret_value', 'call_time']] 144 | white.rename(columns={'ret_value': 'return_value', 'api_name': 'api', 'file_name': 'file_id', 'call_time': 'index'}, 145 | inplace=True) 146 | return white 147 | 148 | 149 | extract_functions = [makeFeature_v2, makeFeature_v3] 150 | data = inter(os.path.join("features", "stage2")) 151 | full = [] 152 | for func in extract_functions: 153 | full.append(func(data)) 154 | pd.merge(full[0], full[1], 'outer', on="file_id").rename(columns={'file_id': 'file_name'}).to_csv( 155 | os.path.join("features", "outside_stage2" + ".csv")) 156 | 157 | -------------------------------------------------------------------------------- /xml_to_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | from datetime import datetime 4 | 5 | import pandas as pd 6 | 7 | from basic_function import make_dir 8 | 9 | 10 | def get_file_list_in_dir(path): 11 | return [name for name in os.listdir(path) if not os.path.isdir(os.path.join(path, name))] 12 | 13 | 14 | def save_file_info(path, csv_name): 15 | file = get_file_list_in_dir(path=path) 16 | print("file number:", len(file)) 17 | file_data = [] 18 | for i in range(len(file)): 19 | if i % 200 == 0: 20 | print("percent:", i / len(file) * 100) 21 | file_name = file[i] 22 | root = ET.parse(os.path.join(path, file_name)).getroot() 23 | 24 | file_list_node = root.findall('file_list')[0] 25 | 26 | file_error = int(file_list_node.get('file_error')) 27 | file_name = file_list_node.get('file_name') 28 | file_uid = file_list_node.get('file_uid') 29 | 30 | file_data.append([file_error, file_name, file_uid]) 31 | 32 | file_data_df = pd.DataFrame(data=file_data, columns=['file_error', 'file_name', 'file_uid']) 33 | file_data_df.to_csv(csv_name) 34 | 35 | 36 | def save_action_data_to_csv(action_data, csv_name, columns=None): 37 | if columns is None: 38 | action_data_df = pd.DataFrame(data=action_data, 39 | columns=['file_name', 'api_name', 'call_name', 'call_pid', 'ret_value', 40 | 'apiArg_list_count', 41 | 'exInfo_list_count', 'call_time']) 42 | action_data_df.to_csv(csv_name) 43 | else: 44 | action_data_df = pd.DataFrame(data=action_data, columns=columns) 45 | action_data_df.to_csv(csv_name) 46 | 47 | 48 | def time_string_to_ns(time_string): 49 | time_format = '%H:%M:%S.%f' 50 | datetime_object = datetime.strptime(time_string, time_format) 51 | return (datetime_object - datetime(1970, 1, 1)).total_seconds() 52 | 53 | 54 | def is_dll(name): 55 | return name[-3:] == "dll" or name[-3:] == "DLL" 56 | 57 | 58 | def is_hkey(name): 59 | return name[:4] == "HKEY" 60 | 61 | 62 | def save_action_stream(path, csv_name, dir_name="", mode=0): 63 | if dir_name != "": 64 | make_dir(os.path.join("features", dir_name)) 65 | file = get_file_list_in_dir(path=path) 66 | print("file number:", len(file)) 67 | action_data = [] 68 | for i in range(len(file)): 69 | if i % 200 == 0: 70 | print("percent:", i / len(file) * 100) 71 | 72 | if i % 1000 == 0 and mode == 1 and i != 0: 73 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, str(i) + 
csv_name)) 74 | action_data = [] 75 | 76 | file_name = file[i] 77 | root = ET.parse(os.path.join(path, file_name)).getroot() 78 | 79 | action_list = root.findall('./file_list/file/start_boot/action_list/action') 80 | start = 0 81 | start_time = 0 82 | last = 0 83 | for action in action_list: 84 | api_name = action.get('api_name') 85 | call_name = action.get('call_name') 86 | call_pid = action.get('call_pid') 87 | call_time = action.get('call_time') 88 | ret_value = action.get('ret_value') 89 | 90 | # convert time string to seconds since software start 91 | try: 92 | call_time = time_string_to_ns(call_time) 93 | last = call_time 94 | except ValueError: 95 | print(call_time) 96 | call_time = last 97 | 98 | if start == 0: 99 | start_time = call_time 100 | start += 1 101 | 102 | call_time = call_time - start_time 103 | 104 | try: 105 | apiArg_list = action.findall('apiArg_list')[0] 106 | apiArg_list_count = apiArg_list.get('count') 107 | except IndexError: 108 | apiArg_list_count = -1 109 | print(action.attrib) 110 | 111 | exInfo_list = action.findall('exInfo_list')[0] 112 | exInfo_list_count = exInfo_list.get('count') 113 | 114 | action_data.append( 115 | [file_name, api_name, call_name, call_pid, ret_value, apiArg_list_count, exInfo_list_count, call_time]) 116 | 117 | if mode == 1: 118 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, 'final_' + csv_name)) 119 | else: 120 | save_action_data_to_csv(action_data, csv_name) 121 | 122 | 123 | def save_attribute_list(path, csv_name, dir_name="", mode=0, dll_or_hkey="dll"): 124 | if dir_name != "": 125 | make_dir(os.path.join("features", dir_name)) 126 | file = get_file_list_in_dir(path=path) 127 | print("file number:", len(file)) 128 | column_name = ['file_name', 'api_name', 'call_name', 'call_pid', 'ret_value', 'value', "call_time"] 129 | determination = is_hkey 130 | if dll_or_hkey == "dll": 131 | determination = is_dll 132 | elif dll_or_hkey == "hkey": 133 | determination = is_hkey 134 | 135 | action_data = [] 136 | for i in range(len(file)): 137 | if i % 200 == 0: 138 | print("percent:", i / len(file) * 100) 139 | 140 | if i % 1000 == 0 and mode == 1 and i != 0: 141 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, str(i) + csv_name), column_name) 142 | action_data = [] 143 | 144 | file_name = file[i] 145 | root = ET.parse(os.path.join(path, file_name)).getroot() 146 | 147 | action_list = root.findall('./file_list/file/start_boot/action_list/action') 148 | for action in action_list: 149 | 150 | api_name = action.get('api_name') 151 | call_name = action.get('call_name') 152 | call_pid = action.get('call_pid') 153 | call_time = action.get('call_time') 154 | ret_value = action.get('ret_value') 155 | 156 | apiArg_list = action.findall("./apiArg_list/apiArg") 157 | exInfo_list = action.findall('./exInfo_list/exInfo') 158 | 159 | for apiArg in apiArg_list: 160 | value = apiArg.get('value') 161 | # print(value) 162 | 163 | if determination(value): 164 | print(1) 165 | action_data.append( 166 | [file_name, api_name, call_name, call_pid, ret_value, value, call_time]) 167 | 168 | for exInfo in exInfo_list: 169 | value = exInfo.get('value') 170 | # print(value) 171 | if determination(value): 172 | action_data.append( 173 | [file_name, api_name, call_name, call_pid, ret_value, value, call_time]) 174 | 175 | if mode == 1: 176 | save_action_data_to_csv(action_data, os.path.join("features", dir_name, 'final_' + csv_name), column_name) 177 | else: 178 | save_action_data_to_csv(action_data, csv_name, column_name) 
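# For reference, a sketch of the sandbox report layout that save_action_stream() and
# save_attribute_list() assume, reconstructed from the findall()/get() calls above. The
# root tag name and all attribute values are illustrative; only the element and attribute
# names actually read by this module are shown:
#
#   <root>
#     <file_list file_error="0" file_name="sample.xml" file_uid="...">
#       <file>
#         <start_boot>
#           <action_list>
#             <action api_name="..." call_name="..." call_pid="..."
#                     call_time="12:00:00.000000" ret_value="0">
#               <apiArg_list count="1">
#                 <apiArg value="C:\Windows\System32\kernel32.dll"/>
#               </apiArg_list>
#               <exInfo_list count="1">
#                 <exInfo value="HKEY_LOCAL_MACHINE\SOFTWARE\..."/>
#               </exInfo_list>
#             </action>
#           </action_list>
#         </start_boot>
#       </file>
#     </file_list>
#   </root>
#
# save_action_stream() emits one row per <action>, with call_time converted to seconds since
# the first call of the file; save_attribute_list() keeps only apiArg/exInfo values that look
# like DLL paths (is_dll) or registry keys (is_hkey).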
179 | 180 | 181 | if __name__ == '__main__': 182 | save_action_stream('stage2_dataset', "stage2.csv", dir_name="stage2", mode=1) 183 | 184 | save_attribute_list('stage2_dataset', "stage2.csv", 185 | dir_name="stage2_dll", mode=1, dll_or_hkey="dll") 186 | 187 | save_attribute_list('stage2_dataset', "stage2.csv", 188 | dir_name="stage2_hkey", mode=1, dll_or_hkey="hkey") 189 | 190 | # save_action_stream('stage1_dataset\\test', "test.csv", dir_name="test", mode=1) 191 | # save_action_stream('stage1_dataset\\train\\white', "white.csv", dir_name="white", 192 | # mode=1) 193 | # save_action_stream('stage1_dataset\\stage1_dataset\\train\\black', "black.csv", dir_name="black", 194 | # mode=1) 195 | 196 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\white', "white.csv", 197 | # dir_name="white_dll", mode=1, dll_or_hkey="dll") 198 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\black', "black.csv", 199 | # dir_name="black_dll", mode=1, dll_or_hkey="dll") 200 | # save_attribute_list('stage1_dataset\\stage1_dataset\\test', "test.csv", 201 | # dir_name="test_dll", mode=1, dll_or_hkey="dll") 202 | # 203 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\white', "white.csv", 204 | # dir_name="white_hkey", mode=1, dll_or_hkey="hkey") 205 | # save_attribute_list('stage1_dataset\\stage1_dataset\\train\\black', "black.csv", 206 | # dir_name="black_hkey", mode=1, dll_or_hkey="hkey") 207 | # save_attribute_list('stage1_dataset\\stage1_dataset\\test', "test.csv", 208 | # dir_name="test_hkey", mode=1, dll_or_hkey="hkey") 209 | -------------------------------------------------------------------------------- /feature_extraction/tfidf_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import scipy 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | 7 | from basic_function import load_df, save_dict, get_root_path 8 | 9 | 10 | def to_str(df, mode=0, column_name=None): 11 | if column_name is None: 12 | column_name = 'api_name' 13 | string_list = [] 14 | name_list = [] 15 | for i in df.groupby('file_name')[column_name]: 16 | name_list.append(i[0]) 17 | api_str = "" 18 | for p in i[1].iteritems(): 19 | api_str += " " + str(p[1]) 20 | string_list.append(api_str) 21 | # print(name_list) 22 | if mode == 1: 23 | return string_list, name_list 24 | else: 25 | return string_list 26 | 27 | 28 | def api(): 29 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 30 | min_df=3, max_df=0.9, 31 | strip_accents='unicode', 32 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=500) 33 | 34 | white = pd.read_csv("white.csv")[['file_name', 'api_name']] 35 | black = load_df("black") 36 | test = load_df("test") 37 | 38 | full = pd.concat([white, black, test]) 39 | full_str = to_str(full) 40 | 41 | print(1) 42 | api_vec.fit(full_str) 43 | 44 | print(2) 45 | black_output, name_list = to_str(black, mode=1) 46 | save_dict(name_list, "black_name_list") 47 | black_output = api_vec.transform(black_output) 48 | scipy.sparse.save_npz("black.npz", black_output) 49 | 50 | white_output, name_list = to_str(white, mode=1) 51 | save_dict(name_list, "white_name_list") 52 | white_output = api_vec.transform(white_output) 53 | scipy.sparse.save_npz("white.npz", white_output) 54 | 55 | test_str, name_list = to_str(test, mode=1) 56 | save_dict(name_list, "test_name_list") 57 | test_output = api_vec.transform(test_str) 58 | scipy.sparse.save_npz("test.npz", test_output) 59 | 60 | 61 | def 
stage2_api(feature_num=500): 62 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 63 | min_df=3, max_df=0.9, 64 | strip_accents='unicode', 65 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=feature_num) 66 | 67 | white = pd.read_csv(os.path.join("features", "white.csv"))[['file_name', 'api_name']] 68 | black = load_df(os.path.join("features", "black")) 69 | test = load_df(os.path.join("features", "test")) 70 | stage2 = load_df(os.path.join("features", "stage2")) 71 | 72 | full = pd.concat([white, black, test]) 73 | full_str = to_str(full) 74 | 75 | print(1) 76 | api_vec.fit(full_str) 77 | 78 | print(2) 79 | 80 | black_output, name_list = to_str(stage2, mode=1) 81 | save_dict(name_list, os.path.join("features", "stage2_name_list"+str(feature_num))) 82 | black_output = api_vec.transform(black_output) 83 | scipy.sparse.save_npz(os.path.join("features", "stage2"+str(feature_num)+".npz"), black_output) 84 | 85 | 86 | def tianchi_api(): 87 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 88 | min_df=3, max_df=0.9, 89 | strip_accents='unicode', 90 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=500) 91 | 92 | white = pd.read_csv(os.path.join("features", "white.csv"))[['file_name', 'api_name']] 93 | black = load_df(os.path.join("features", "black")) 94 | test = load_df(os.path.join("features", "test")) 95 | tianchi = pd.read_csv("security_train.csv").rename(columns={"file_id":"file_name"}) 96 | 97 | full = pd.concat([white, black, test]) 98 | full_str = to_str(full) 99 | 100 | print(1) 101 | api_vec.fit(full_str) 102 | 103 | print(2) 104 | 105 | black_output, name_list = to_str(tianchi, mode=1, column_name="api") 106 | save_dict(name_list, os.path.join("features", "tianchi_name_list")) 107 | black_output = api_vec.transform(black_output) 108 | scipy.sparse.save_npz(os.path.join("features", "tianchi.npz"), black_output) 109 | 110 | 111 | def stage_2_attribute(suffix="_dll", use_less_value=False, type_name="", map_func=None, max_feature=1000): 112 | stage2 = load_df(os.path.join("features", "stage2"+suffix), mode=1) 113 | 114 | if use_less_value: 115 | if map_func is None: 116 | stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1]) 117 | else: 118 | stage2["value"] = stage2["value"].map(lambda x: map_func(x)) 119 | stage2_output, name_list = to_str(stage2, mode=1, column_name="value") 120 | api_vec, _ = train_tf_idf(suffix="_dll", use_less_value=use_less_value, map_func=map_func, data=stage2_output, max_feature=max_feature) 121 | 122 | save_dict(name_list, os.path.join(get_root_path(), "features", "stage2_name_list" + suffix + type_name)) 123 | stage2_output = api_vec.transform(stage2_output) 124 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "stage2" + suffix + type_name + ".npz"), stage2_output) 125 | 126 | 127 | def attribution(suffix="_dll", use_less_value=False, type_name="", map_func=None, max_feature=2000): 128 | api_vec, data = train_tf_idf(suffix="_dll", use_less_value=use_less_value, map_func=map_func, 129 | max_feature=max_feature) 130 | 131 | white, black, test = data 132 | 133 | black_output, name_list = to_str(black, mode=1, column_name="value") 134 | save_dict(name_list, os.path.join(get_root_path(), "features", "black_name_list" + suffix + type_name)) 135 | black_output = api_vec.transform(black_output) 136 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "black" + suffix + type_name + ".npz"), black_output) 137 | 138 | white_output, name_list = to_str(white, mode=1, column_name="value") 139 | save_dict(name_list, 
os.path.join(get_root_path(), "features", "white_name_list" + suffix + type_name)) 140 | white_output = api_vec.transform(white_output) 141 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "white" + suffix + type_name + ".npz"), white_output) 142 | 143 | test_str, name_list = to_str(test, mode=1, column_name="value") 144 | save_dict(name_list, os.path.join(get_root_path(), "features", "test_name_list" + suffix + type_name)) 145 | test_output = api_vec.transform(test_str) 146 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "test" + suffix + type_name + ".npz"), test_output) 147 | 148 | 149 | def train_tf_idf(suffix="_dll", use_less_value=False, map_func=None, max_feature=2000, data=None): 150 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 151 | min_df=3, max_df=0.9, 152 | strip_accents='unicode', 153 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=max_feature) 154 | 155 | if data is None: 156 | white = load_df(os.path.join(get_root_path(), "features", "white" + suffix), mode=1) 157 | black = load_df(os.path.join(get_root_path(), "features", "black" + suffix), mode=1) 158 | test = load_df(os.path.join(get_root_path(), "features", "test" + suffix), mode=1) 159 | 160 | if use_less_value: 161 | if map_func is None: 162 | for i in [white, black, test]: 163 | i["value"] = i["value"].map(lambda x: x.split("\\")[-1]) 164 | else: 165 | for i in [white, black, test]: 166 | i["value"] = i["value"].map(lambda x: map_func(x)) 167 | 168 | full = pd.concat([white, black, test]) 169 | full_str = to_str(full, column_name="value") 170 | else: 171 | full_str = data 172 | 173 | print(1) 174 | api_vec.fit(full_str) 175 | print(2) 176 | if data is None: 177 | return api_vec, [white, black, test] 178 | else: 179 | return api_vec, None 180 | 181 | 182 | def last_hkey(x): 183 | return "speech" if len(x.split("\\")) == 1 else x.split("\\")[-1] 184 | 185 | 186 | def second_hkey(x): 187 | return "speech" if len(x.split("\\")) == 1 else x.split("\\")[1] 188 | 189 | 190 | def stage2_api_new(feature_num=500): 191 | api_vec = TfidfVectorizer(ngram_range=(1, 5), 192 | min_df=3, max_df=0.9, 193 | strip_accents='unicode', 194 | use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=feature_num) 195 | 196 | 197 | stage2 = load_df(os.path.join("features", "stage2")) 198 | 199 | black_output, name_list = to_str(stage2, mode=1) 200 | 201 | print(1) 202 | api_vec.fit(black_output) 203 | 204 | print(2) 205 | 206 | # black_output, name_list = to_str(stage2, mode=1) 207 | save_dict(name_list, os.path.join(get_root_path(), "features", "stage2_name_list"+str(feature_num))) 208 | black_output = api_vec.transform(black_output) 209 | scipy.sparse.save_npz(os.path.join(get_root_path(), "features", "stage2"+str(feature_num)+".npz"), black_output) 210 | 211 | # attribution("_hkey", use_less_value=True, type_name="second", map_func=second_hkey, max_feature=100) 212 | # attribution("_hkey", use_less_value=True, type_name="last", map_func=last_hkey, max_feature=200) 213 | # attribution("_hkey", use_less_value=False, type_name="", max_feature=500) 214 | # attribution("_dll", use_less_value=False, type_name="", max_feature=500) 215 | # tianchi_api() 216 | # api() 217 | 218 | 219 | stage_2_attribute("_dll", False, max_feature=1000) 220 | stage_2_attribute("_hkey", use_less_value=True, type_name="last", map_func=last_hkey, max_feature=1000) 221 | stage_2_attribute("_hkey", use_less_value=True, type_name="first", map_func=second_hkey, max_feature=100) 222 | stage2_api_new(1000) 223 | 
--------------------------------------------------------------------------------
/prepare_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import List
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | import scipy.sparse
7 | 
8 | from basic_function import extract_id_from_file_name, load_dict, get_root_path
9 | 
10 | 
11 | def get_outside_train_features():
12 |     train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv"))
13 |     train.rename(columns={"id": "file_name"}, inplace=True)
14 |     full_features = pd.read_csv(os.path.join(get_root_path(), "features", "outside.csv"), index_col=0)
15 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
16 | 
17 |     test_name_list = load_dict(os.path.join(get_root_path(), "features", "test_name_list"))
18 |     test_data = pd.DataFrame(columns=["file_name"], data=np.array(test_name_list))
19 |     test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x))
20 | 
21 |     # merge
22 |     train_data = pd.merge(train, full_features, "left", on="file_name")
23 |     test_data = pd.merge(test_data, full_features, "left", on="file_name")
24 | 
25 |     label = train_data["safe_type"]
26 | 
27 |     train_data.drop(columns=["safe_type"], inplace=True)
28 | 
29 |     return train_data, label, test_data
30 | 
31 | 
32 | def load_clustering_statics_files():
33 |     full_features = pd.read_csv(os.path.join(get_root_path(), "features", "outside_stage2.csv"), index_col=0)
34 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
35 |     return full_features
36 | 
37 | 
38 | def load_ft_features(feature_files=None):
39 |     if feature_files is None:
40 |         feature_files = {"black": "black_features.csv", "white": "white_features.csv", "test": "test_features.csv"}
41 |     train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv"))
42 |     train.rename(columns={"id": "file_name"}, inplace=True)
43 | 
44 |     black_features = pd.read_csv(os.path.join(get_root_path(), "features", feature_files["black"]))
45 |     white_features = pd.read_csv(os.path.join(get_root_path(), "features", feature_files["white"]))
46 |     full_features = pd.concat([black_features, white_features])
47 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
48 | 
49 |     # load test data
50 |     test_data = pd.read_csv(os.path.join(get_root_path(), "features", feature_files["test"]))
51 |     test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x))
52 | 
53 |     # merge
54 |     train_dat = pd.merge(train, full_features, "inner", on="file_name")
55 | 
56 |     label = train_dat["safe_type"]
57 |     train_dat.drop(columns=["safe_type"], inplace=True)
58 |     return train_dat, label, test_data
59 | 
60 | 
61 | def load_runtime_features():
62 |     train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv"))
63 |     train.rename(columns={"id": "file_name"}, inplace=True)
64 | 
65 |     full_features = pd.read_csv(os.path.join("features", "train_used_time_feauture.csv"))  # note the "feauture" spelling of the file name
66 |     full_features["file_name"] = full_features["file_name"].map(lambda x: extract_id_from_file_name(x))
67 | 
68 |     # load test data
69 |     test_data = pd.read_csv(os.path.join("features", "test_used_time_feauture.csv"))
70 |     test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x))
71 | 
72 |     # merge
73 |     train_dat = pd.merge(train, full_features, "inner", on="file_name")
on="file_name") 74 | 75 | label = train_dat["safe_type"] 76 | train_dat.drop(columns=["safe_type"], inplace=True) 77 | return train_dat, label, test_data 78 | 79 | 80 | def load_depth_three_features(): 81 | return load_ft_features({"black": "black_features_depth_3.csv", "white": "white_features_depth_3.csv", 82 | "test": "test_features_depth_3.csv"}) 83 | 84 | 85 | def load_nn_features(): 86 | train = pd.read_csv(os.path.join(get_root_path(), "features", "safe_type_train.csv")) 87 | train.rename(columns={"id": "file_name"}, inplace=True) 88 | 89 | train_features = pd.read_csv(os.path.join(get_root_path(), "features", "train_nn.csv")) 90 | train_features["file_name"] = train_features["file_name"].map(lambda x: extract_id_from_file_name(x)) 91 | 92 | # load test data 93 | test_data = pd.read_csv(os.path.join(get_root_path(), "features", "test_nn.csv")) 94 | test_data["file_name"] = test_data["file_name"].map(lambda x: extract_id_from_file_name(x)) 95 | 96 | # merge 97 | train_dat = pd.merge(train, train_features, "inner", on="file_name") 98 | 99 | label = train_dat["safe_type"] 100 | train_dat.drop(columns=["safe_type"], inplace=True) 101 | return train_dat, label, test_data 102 | 103 | 104 | def load_tfidf_features(suffix, type_name=""): 105 | black = scipy.sparse.load_npz( 106 | os.path.join(get_root_path(), "features", "black" + suffix + type_name + ".npz")).toarray() 107 | white = scipy.sparse.load_npz( 108 | os.path.join(get_root_path(), "features", "white" + suffix + type_name + ".npz")).toarray() 109 | test = scipy.sparse.load_npz( 110 | os.path.join(get_root_path(), "features", "test" + suffix + type_name + ".npz")).toarray() 111 | 112 | black_l = np.ones((black.shape[0],)) 113 | white_l = np.zeros((white.shape[0],)) 114 | train_data = pd.DataFrame(np.concatenate((black, white), axis=0)) 115 | 116 | label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0)) 117 | 118 | test_df = pd.DataFrame(test) 119 | 120 | black_name_list = load_dict(os.path.join(get_root_path(), "features", "black_name_list" + suffix + type_name)) 121 | white_name_list = load_dict(os.path.join(get_root_path(), "features", "white_name_list" + suffix + type_name)) 122 | train_name_list = np.concatenate((black_name_list, white_name_list), axis=0) 123 | 124 | test_name_list = load_dict(os.path.join(get_root_path(), "features", "test_name_list" + suffix + type_name)) 125 | 126 | train_data["file_name"] = train_name_list 127 | train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x)) 128 | 129 | test_df["file_name"] = test_name_list 130 | test_df["file_name"] = test_df["file_name"].map(lambda x: extract_id_from_file_name(x)) 131 | 132 | return train_data, label, test_df 133 | 134 | 135 | def load_autoencoder_features(): 136 | features = np.load("train_nn.npy") 137 | print(features.shape) 138 | name = np.load("file_name_list_stage2.npy") 139 | label = np.load("label_nn.npy") 140 | features = pd.DataFrame(data=features) 141 | features["file_name"] = name 142 | features["file_name"] = features["file_name"].map(lambda x: extract_id_from_file_name(x)) 143 | 144 | """ 145 | useless test_df. 
146 |     """
147 |     test = scipy.sparse.load_npz(os.path.join(get_root_path(), "features", "test.npz")).toarray()
148 |     test_df = pd.DataFrame(test)
149 |     test_name_list = load_dict(os.path.join(get_root_path(), "features", "test_name_list"))
150 | 
151 |     test_df["file_name"] = test_name_list
152 |     test_df["file_name"] = test_df["file_name"].map(lambda x: extract_id_from_file_name(x))
153 |     return features, label, test_df
154 | 
155 | 
156 | def load_stage2_tf_idf(suffix, type_name=""):
157 |     stage2 = scipy.sparse.load_npz(
158 |         os.path.join(get_root_path(), "features", "stage2" + suffix + type_name + ".npz")).toarray()
159 | 
160 |     train_data = pd.DataFrame(stage2)
161 | 
162 |     stage2_name_list = load_dict(os.path.join(get_root_path(), "features", "stage2_name_list" + suffix + type_name))
163 | 
164 |     train_data["file_name"] = stage2_name_list
165 |     train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x))
166 | 
167 |     return train_data
168 | 
169 | 
170 | def load_nn_stage2_features():
171 |     nn_features = np.load("nn_features.npy")
172 |     name = np.load("file_name_list_stage2.npy")
173 | 
174 |     train_data = pd.DataFrame(nn_features)
175 |     train_data["file_name"] = name
176 |     train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x))
177 |     return train_data
178 | 
179 | 
180 | def load_tianchi_tf_idf():
181 |     stage2 = scipy.sparse.load_npz(
182 |         os.path.join(get_root_path(), "features", "tianchi" + ".npz")).toarray()
183 | 
184 |     train_data = pd.DataFrame(stage2)
185 | 
186 |     stage2_name_list = load_dict(os.path.join(get_root_path(), "features", "tianchi_name_list"))
187 | 
188 |     train_data["file_name"] = stage2_name_list
189 |     train_data["file_name"] = train_data["file_name"].map(lambda x: extract_id_from_file_name(x))
190 |     tianchi = pd.read_csv("security_train.csv")[["label", "file_id"]].drop_duplicates()
191 |     tianchi = tianchi.rename(columns={"file_id": "file_name"})
192 |     full = pd.merge(train_data, tianchi, how="left", on="file_name")
193 |     label = full["label"]
194 | 
195 |     return train_data, label
196 | 
197 | 
198 | def load_tfidf_sparse_features(suffix):
199 |     black = scipy.sparse.load_npz(os.path.join(get_root_path(), "black" + suffix + ".npz"))
200 |     white = scipy.sparse.load_npz(os.path.join(get_root_path(), "white" + suffix + ".npz"))
201 |     test = scipy.sparse.load_npz(os.path.join(get_root_path(), "test" + suffix + ".npz"))
202 | 
203 |     white_file_id = load_dict("white_name_list")
204 |     black_file_id = load_dict("black_name_list")
205 | 
206 |     black_l = np.ones((black.shape[0],))
207 |     white_l = np.zeros((white.shape[0],))
208 |     train_data = scipy.sparse.vstack([black, white])
209 |     label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0))
210 | 
211 |     test_df = test
212 |     file_id = load_dict(os.path.join(get_root_path(), "test_name_list" + suffix))
213 |     return train_data, label, test_df, file_id, np.array(black_file_id + white_file_id)
214 | 
215 | 
216 | def merge_features(features: List):
217 |     train_data, label, test_data = features.pop(0)
218 |     train_data["label"] = label
219 | 
220 |     for i in range(len(features)):
221 |         train_data = pd.merge(train_data, features[i][0], how="left", on="file_name")
222 |         test_data = pd.merge(test_data, features[i][2], how="left", on="file_name")
223 | 
224 |     label = train_data["label"]
225 |     train_data.drop(columns=["label"], inplace=True)
226 | 
227 |     return train_data, label, test_data
228 | 
229 | 
230 | def drop_id(features: List):
231 |     features[0].drop(columns=["file_name"], inplace=True)
232 |     features[2].drop(columns=["file_name"], inplace=True)
233 |     return features
234 | 
235 | 
236 | if __name__ == '__main__':
237 |     train_data = load_stage2_tf_idf("1000")  # loads the stage2 tf-idf matrix written by feature_extraction/tfidf_features.py (stage2_api_new(1000))
238 | 
--------------------------------------------------------------------------------