├── doc
│   └── 云安全比赛_PPT.pdf
├── readme.md
└── src
    ├── basic_feature.py
    ├── conf.py
    ├── dataset.py
    ├── feature.py
    ├── file_split.py
    ├── funcname_type_map.json
    ├── light_gbm_model.py
    ├── model_predict.py
    ├── preprocess.py
    ├── tfidf_model.py
    └── utility.py

/doc/云安全比赛_PPT.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goodskillprogramer/MalwareClassify/5ef3c6267a9469a48ae94ae9d480d715aa675348/doc/云安全比赛_PPT.pdf
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Malware Classification
2 | - Machine learning for malware classification
3 | - Based on API call sequences, mainly n-gram and TF-IDF features
4 | - LightGBM is used as the machine learning tool
5 | 
6 | # Malware classification based on API sequences
7 | - Uses machine learning to classify malware into types
8 | - Most features are extracted from the API call sequence
9 | - n-gram and TF-IDF are used to build the feature vectors
10 | - The training set can be downloaded from this [website](https://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.66304169Zp1gFH&raceId=231668)
11 | 
12 | 
13 | ## Program overview
14 | - file_split.py reads the csv file and splits it into per-sample files grouped by file ID
15 | - preprocess.py converts each per-sample file to JSON and serializes the API calls
16 | - basic_feature.py extracts simple statistical features
17 | - tfidf_model.py builds the TF-IDF models
18 | - feature.py transforms the training and test data with the saved TF-IDF models
19 | - light_gbm_model.py tunes the model parameters and trains the model
20 | - model_predict.py predicts on the test set (a minimal sketch of the overall flow is shown right after this readme)
21 | 
22 | ## Notes
23 | - This is the source code from the 3rd Alibaba Cloud Security Algorithm Challenge (阿里云安全算法挑战赛); the final result was inside the Top 30 but outside the Top 10.
24 | - Teams in the Top 30 had to submit slides, so a deck was written. [The PPT explains the detection approach](https://github.com/jingwang3235/MalwareClassify/blob/master/doc/%E4%BA%91%E5%AE%89%E5%85%A8%E6%AF%94%E8%B5%9B_PPT.pdf).
25 | - Top 10 teams advanced to the final and the Top 5 teams defended their solutions; their defense videos are at http://tb.cn/qYyB8Mw and are worth studying for ideas.
26 | - Competition link (with the training and test data): https://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.66304169Zp1gFH&raceId=231668
--------------------------------------------------------------------------------
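Below is a minimal, self-contained sketch of the flow the readme describes: API-call "documents" are turned into n-gram TF-IDF vectors and fed to a multiclass LightGBM model. The toy documents, labels and parameters are illustrative only; they are not the competition data or the settings used in src/.

```python
# Sketch only: n-gram TF-IDF over API names -> LightGBM multiclass model.
# Real documents are produced by preprocess.py: one line per sample, API names
# separated by spaces, each thread's call sequence terminated with '.'.
import numpy as np
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "ldrloaddll ldrgetprocedureaddress ntcreatefile ntwritefile .",
    "ntcreatefile ntwritefile ntclose .",
    "regopenkeyexw regqueryvalueexw regsetvalueexw regclosekey .",
    "regcreatekeyexw regsetvalueexw regclosekey .",
    "socket connect send recv closesocket .",
    "wsastartup socket connect wsasend wsarecv .",
]
labels = np.array([0, 0, 1, 1, 2, 2])          # toy class ids

# word-level 1-2 grams over API names (the repo fits several such vectorizers)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
x = vectorizer.fit_transform(docs)             # sparse matrix, one row per sample

params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "min_data_in_leaf": 1,                     # only needed for this tiny toy set
    "verbose": -1,
}
booster = lgb.train(params, lgb.Dataset(x, label=labels), num_boost_round=20)

proba = booster.predict(x)                     # shape (n_samples, num_class)
print(proba.argmax(axis=1))
```

The actual pipeline differs mainly in scale and bookkeeping: tfidf_model.py fits the vectorizers on the combined train and test documents and saves them with utility.save_model, feature.py transforms the documents and merges all feature tables on file_id, and light_gbm_model.py tunes and trains the final model.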
/src/basic_feature.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import traceback
4 | import json
5 | from utility import write_to_file
6 | from conf import STORAGE
7 | 
8 | def basic_feature(fpath):
9 | 
10 | feature={}
11 | with open(fpath) as f:
12 | jsondata = json.load(f)
13 | threads=jsondata['threads']
14 | file_id =jsondata['file_id']
15 | label = jsondata.get('lable','-1')
16 | 
17 | threadnum=len(threads)
18 | totalapicall = 0
19 | maxapicall = 0
20 | minapicall = 0
21 | meanapicallperthread=0.0
22 | for tid in threads:
23 | info = threads[tid]
24 | totalapicall += len(info['api_calls'])
25 | maxapicall=max(maxapicall,len(info['api_calls']))
26 | minapicall=len(info['api_calls']) if minapicall==0 else min(minapicall,len(info['api_calls']))
27 | 
28 | meanapicallperthread = (totalapicall*1.0)/threadnum
29 | 
30 | print('{} {} {} {} {} {} {}'.format(file_id,label,threadnum,totalapicall,maxapicall,minapicall,meanapicallperthread))
31 | feature['file_id'] = file_id
32 | feature['label'] = label
33 | feature['threadnum'] = threadnum
34 | feature['totalapicall'] = totalapicall
35 | feature['maxapicall'] = maxapicall
36 | feature['minapicall'] = minapicall
37 | feature['meanapicallperthread'] = meanapicallperthread
38 | return feature
39 | 
40 | def folder_to_basic_feature(folder,feature_save_path):
41 | 
42 | featureheader=['file_id','label','threadnum','totalapicall','maxapicall','minapicall','meanapicallperthread']
43 | stime=time.time()
44 | count=0
45 | write_to_file(feature_save_path,','.join(featureheader)+'\n',mode='w+')
46 | for subfolder in os.listdir(folder):
47 | fullsubfolder = os.path.join(folder,subfolder)
48 | for fname in os.listdir(fullsubfolder):
49 | fullname = os.path.join(fullsubfolder,fname)
50 | try:
51 | feature = basic_feature(fullname)
52 | attr=[]
53 | for head in featureheader:
54 | attr.append(str(feature.get(head,0)))
55 | txt =','.join(attr)
56 | write_to_file(feature_save_path,txt+'\n',mode='a+')
57 | 
58 | except Exception as e:
59 | traceback.print_exc()
60 | 
61 | def main():
62 | 
63 | folder_to_basic_feature(os.path.join(STORAGE,'train_json'),os.path.join(STORAGE,'data','basicfeature.csv'))
64 | folder_to_basic_feature(os.path.join(STORAGE,'test_json'),os.path.join(STORAGE,'testdata','basicfeature.csv'))
65 | 
66 | if __name__ == '__main__':
67 | main()
--------------------------------------------------------------------------------
/src/conf.py:
--------------------------------------------------------------------------------
1 | 
2 | STORAGE=r'G:\sample\binary'
--------------------------------------------------------------------------------
/src/dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | from collections import Counter
4 | 
5 | def get_feature_header(fpath):
6 | 
7 | with open(fpath) as f:
8 | data = json.load(f)
9 | return data
10 | 
11 | def load_data(savepath,featureheaders,targethearders,nrows=None):
12 | 
13 | dataset = pd.read_csv(savepath,header = 0,nrows =nrows)
14 | 
15 | x = dataset[featureheaders]
16 | y = dataset[targethearders]
17 | return x.values,y.values.reshape((len(y.values),))
18 | 
19 | def get_class_weights(y):
20 | counter = Counter(y)
21 | majority = max(counter.values())
22 | return {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}
23 | 
--------------------------------------------------------------------------------
/src/feature.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | 
4 | import pandas as pd
5 | 
6 | from conf import STORAGE
7 | from utility import write_to_file,load_model
8 | 
9 | 
10 | def load_documents(document_save_path):
11 | 
12 | dataset = pd.read_csv(document_save_path,names=['file_id','txt'],nrows = None)
13 | 
14 | labels = []
15 | docs = []
16 | filesid=[]
17 | 
18 | for index,row in dataset.iterrows():
19 | # label = row['lable']
20 | content = row['txt']
21 | fileid = row['file_id']
22 | labels.append(-1)
23 | docs.append(content)
24 | filesid.append(fileid)
25 | 
26 | return labels,docs,filesid
27 | 
28 | def load_train_documents(document_save_path):
29 | 
30 | dataset = pd.read_csv(document_save_path,names=['file_id','label','txt'],nrows = None)
31 | 
32 | labels = []
33 | docs = []
34 | filesid=[]
35 | 
36 | for index,row in dataset.iterrows():
37 | label = row['label']
38 | content = row['txt']
39 | fileid = row['file_id']
40 | labels.append(label)
41 | docs.append(content)
42 | filesid.append(fileid)
43 | 
44 | return labels,docs,filesid
45 | 
46 | 
47 | def funcname23_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc):
48 | 
49 | if os.path.exists(newsavepath):
50 | print('Exists',newsavepath)
51 | return
52 | 
53 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model')
54 | # pca_save_path = os.path.join(STORAGE,'apicall_tfidf_2_3_50000/webpage.tfidf.pca.model')
55 | 
56 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path)
57 | 
58 | print(time.asctime(),len(rawy),len(raw_documents))
59 | 
60 | model = load_model(tfidf_save_path)
61 | 
62 | x = 
model.transform(raw_documents) 63 | 64 | samples,feautre = x.shape 65 | 66 | write_to_file(newsavepath, '', mode='w+') 67 | 68 | headers=['file_id','label'] 69 | headers+=model.get_feature_names() 70 | txt =','.join(headers) 71 | txt+='\n' 72 | write_to_file(newsavepath,txt,mode='a+') 73 | 74 | for i in range(samples): 75 | 76 | attr = [str(filesid[i]),str(rawy[i])] 77 | 78 | feature = x[i].toarray()[0] 79 | # print(feature.shape) 80 | attr += [str(f) for f in feature] 81 | txt=','.join(attr) 82 | txt+='\n' 83 | write_to_file(newsavepath, txt, mode='a+') 84 | 85 | def api_type_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc): 86 | 87 | if os.path.exists(newsavepath): 88 | print('Exists',newsavepath) 89 | return 90 | 91 | tfidf_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 92 | # pca_save_path = os.path.join(STORAGE,'apicall_tfidf_2_3_50000/webpage.tfidf.pca.model') 93 | 94 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path) 95 | 96 | print(time.asctime(),len(rawy),len(raw_documents)) 97 | 98 | model = load_model(tfidf_save_path) 99 | 100 | x = model.transform(raw_documents) 101 | 102 | samples,feautre = x.shape 103 | 104 | write_to_file(newsavepath, '', mode='w+') 105 | 106 | headers=['file_id','label'] 107 | headers+=model.get_feature_names() 108 | txt =','.join(headers) 109 | txt+='\n' 110 | write_to_file(newsavepath,txt,mode='a+') 111 | 112 | for i in range(samples): 113 | 114 | attr = [str(filesid[i]),str(rawy[i])] 115 | 116 | feature = x[i].toarray()[0] 117 | # print(feature.shape) 118 | attr += [str(f) for f in feature] 119 | txt=','.join(attr) 120 | txt+='\n' 121 | write_to_file(newsavepath, txt, mode='a+') 122 | 123 | def rtn_funcname_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc): 124 | 125 | if os.path.exists(newsavepath): 126 | print('Exists',newsavepath) 127 | return 128 | 129 | tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 130 | 131 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path) 132 | 133 | print(time.asctime(),len(rawy),len(raw_documents)) 134 | 135 | model = load_model(tfidf_save_path) 136 | 137 | x = model.transform(raw_documents) 138 | 139 | samples,feautre = x.shape 140 | 141 | write_to_file(newsavepath, '', mode='w+') 142 | 143 | headers=['file_id','label'] 144 | headers+=model.get_feature_names() 145 | txt =','.join(headers) 146 | txt+='\n' 147 | write_to_file(newsavepath,txt,mode='a+') 148 | 149 | for i in range(samples): 150 | 151 | attr = [str(filesid[i]),str(rawy[i])] 152 | 153 | feature = x[i].toarray()[0] 154 | # print(feature.shape) 155 | attr += [str(f) for f in feature] 156 | txt=','.join(attr) 157 | txt+='\n' 158 | write_to_file(newsavepath, txt, mode='a+') 159 | 160 | def funcname1_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc): 161 | 162 | if os.path.exists(newsavepath): 163 | print('Exists',newsavepath) 164 | return 165 | 166 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 167 | 168 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path) 169 | 170 | print(time.asctime(),len(rawy),len(raw_documents)) 171 | 172 | model = load_model(tfidf_save_path) 173 | 174 | x = model.transform(raw_documents) 175 | 176 | samples,feautre = x.shape 177 | 178 | write_to_file(newsavepath, '', mode='w+') 179 | 180 | headers=['file_id','label'] 181 | headers+=model.get_feature_names() 182 | txt =','.join(headers) 183 | txt+='\n' 184 | write_to_file(newsavepath,txt,mode='a+') 185 | 186 | for i 
in range(samples): 187 | 188 | attr = [str(filesid[i]),str(rawy[i])] 189 | 190 | feature = x[i].toarray()[0] 191 | # print(feature.shape) 192 | attr += [str(f) for f in feature] 193 | txt=','.join(attr) 194 | txt+='\n' 195 | write_to_file(newsavepath, txt, mode='a+') 196 | 197 | def rtvalue_tfidf_feature(document_rtnvalue_save_path,newsavepath,loadfunc): 198 | 199 | if os.path.exists(newsavepath): 200 | print('Exists',newsavepath) 201 | return 202 | 203 | tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 204 | 205 | rawy ,raw_documents,filesid = loadfunc(document_rtnvalue_save_path) 206 | 207 | print(time.asctime(),len(rawy),len(raw_documents)) 208 | 209 | model = load_model(tfidf_save_path) 210 | 211 | x = model.transform(raw_documents) 212 | 213 | samples,feautre = x.shape 214 | 215 | write_to_file(newsavepath, '', mode='w+') 216 | 217 | headers=['file_id','label'] 218 | 219 | headers+=model.get_feature_names() 220 | 221 | txt =','.join(headers) 222 | txt+='\n' 223 | 224 | write_to_file(newsavepath,txt,mode='a+') 225 | 226 | for i in range(samples): 227 | 228 | attr = [str(filesid[i]),str(rawy[i])] 229 | 230 | feature = x[i].toarray()[0] 231 | 232 | attr += [str(f) for f in feature] 233 | txt=','.join(attr) 234 | txt+='\n' 235 | write_to_file(newsavepath, txt, mode='a+') 236 | 237 | def feature_merge(merge_save_path,selected_save_path,webpage_vectore_save_path): 238 | 239 | if os.path.exists(merge_save_path): 240 | print('Exists',merge_save_path) 241 | return True 242 | 243 | txt = 'merge from {} {} to merge_save_path {}'.format(selected_save_path,webpage_vectore_save_path,merge_save_path) 244 | print(txt) 245 | selected_feature_set = pd.read_csv(selected_save_path,encoding='utf-8') 246 | txt_feature_set = pd.read_csv(webpage_vectore_save_path,encoding='utf-8') 247 | print('finish reading the two csv file .merging...') 248 | merge = pd.merge(selected_feature_set,txt_feature_set,on=['file_id']) 249 | print('finish merging,export to csv..') 250 | merge.to_csv(merge_save_path,index=False,encoding='utf-8') 251 | 252 | def test_data(): 253 | document_apicall_save_path=os.path.join(STORAGE,'test_apicall.txt') 254 | document_rtnvalue_save_path=os.path.join(STORAGE,'test_rtvalue.txt') 255 | document_rtnvalue_api_save_path=os.path.join(STORAGE,'test_rtnvalue_apicall.txt') 256 | 257 | document_api_type_save_path=os.path.join(STORAGE,'test_apicall.type.txt') 258 | 259 | 260 | funcname1_tfidf_savepath = os.path.join(STORAGE,'testdata/funcname1.tfidf.csv') 261 | funcname23_tfidf_savepath = os.path.join(STORAGE,'testdata/funcname23.tfidf.csv') 262 | rtn_tfidf_savepath = os.path.join(STORAGE,'testdata/rtn.tfidf.csv') 263 | basicfeature = os.path.join(STORAGE,'testdata/basicfeature.csv') 264 | rtn_funcname_tfidf_savepath = os.path.join(STORAGE,'testdata/rtnfuncname.tfidf.csv') 265 | apitype_tfidf_savepath = os.path.join(STORAGE,'testdata/apitype.tfidf.csv') 266 | 267 | funcname1_tfidf_feature(document_apicall_save_path,funcname1_tfidf_savepath,load_documents) 268 | rtvalue_tfidf_feature(document_rtnvalue_save_path,rtn_tfidf_savepath,load_documents) 269 | funcname23_tfidf_feature(document_apicall_save_path,funcname23_tfidf_savepath,load_documents) 270 | rtn_funcname_tfidf_feature(document_rtnvalue_api_save_path,rtn_funcname_tfidf_savepath,load_documents) 271 | api_type_tfidf_feature(document_api_type_save_path,apitype_tfidf_savepath,load_documents) 272 | 273 | merge_basic_funcnam1=os.path.join(STORAGE,'testdata/merge.basic.func1.csv') 274 | 
feature_merge(merge_basic_funcnam1,basicfeature,funcname1_tfidf_savepath) 275 | 276 | merge_basic_funcnam1_rtnvalue=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.csv') 277 | feature_merge(merge_basic_funcnam1_rtnvalue,merge_basic_funcnam1,rtn_tfidf_savepath) 278 | 279 | merge_basic_funcnam1_rtnvalue_funcname23=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.csv') 280 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23,merge_basic_funcnam1_rtnvalue,funcname23_tfidf_savepath) 281 | 282 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.rtnapi.csv') 283 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,merge_basic_funcnam1_rtnvalue_funcname23,rtn_funcname_tfidf_savepath) 284 | 285 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv') 286 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_,merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,apitype_tfidf_savepath) 287 | 288 | def train_data(): 289 | document_apicall_save_path=os.path.join(STORAGE,'apicall.txt') 290 | document_rtnvalue_save_path=os.path.join(STORAGE,'rtvalue.txt') 291 | document_rtnvalue_api_save_path=os.path.join(STORAGE,'rtnvalue_apicall.txt') 292 | document_api_type_save_path=os.path.join(STORAGE,'apicall.type.txt') 293 | 294 | 295 | funcname1_tfidf_savepath = os.path.join(STORAGE,'data/funcname1.tfidf.csv') 296 | funcname23_tfidf_savepath = os.path.join(STORAGE,'data/funcname23.tfidf.csv') 297 | rtn_funcname_tfidf_savepath = os.path.join(STORAGE,'data/rtnfuncname.tfidf.csv') 298 | rtn_tfidf_savepath = os.path.join(STORAGE,'data/rtn.tfidf.csv') 299 | basicfeature = os.path.join(STORAGE,'data/basicfeature.csv') 300 | apitype_tfidf_savepath = os.path.join(STORAGE,'data/apitype.tfidf.csv') 301 | 302 | 303 | funcname1_tfidf_feature(document_apicall_save_path,funcname1_tfidf_savepath,load_train_documents) 304 | rtvalue_tfidf_feature(document_rtnvalue_save_path,rtn_tfidf_savepath,load_train_documents) 305 | funcname23_tfidf_feature(document_apicall_save_path,funcname23_tfidf_savepath,load_train_documents) 306 | rtn_funcname_tfidf_feature(document_rtnvalue_api_save_path,rtn_funcname_tfidf_savepath,load_train_documents) 307 | api_type_tfidf_feature(document_api_type_save_path,apitype_tfidf_savepath,load_train_documents) 308 | 309 | 310 | merge_basic_funcnam1=os.path.join(STORAGE,'data/merge.basic.func1.csv') 311 | feature_merge(merge_basic_funcnam1,basicfeature,funcname1_tfidf_savepath) 312 | 313 | merge_basic_funcnam1_rtnvalue=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.csv') 314 | feature_merge(merge_basic_funcnam1_rtnvalue,merge_basic_funcnam1,rtn_tfidf_savepath) 315 | 316 | merge_basic_funcnam1_rtnvalue_funcname23=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.csv') 317 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23,merge_basic_funcnam1_rtnvalue,funcname23_tfidf_savepath) 318 | 319 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.rtnapi.csv') 320 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,merge_basic_funcnam1_rtnvalue_funcname23,rtn_funcname_tfidf_savepath) 321 | 322 | 323 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv') 324 | 
feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_,merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,apitype_tfidf_savepath) 325 | 326 | def main(): 327 | train_data() 328 | test_data() 329 | 330 | if __name__ == "__main__": 331 | 332 | main() 333 | 334 | 335 | -------------------------------------------------------------------------------- /src/file_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from conf import STORAGE 4 | from utility import write_to_file 5 | 6 | trainpath=os.path.join(STORAGE,'3rd_security_train/train.csv') 7 | testpath=os.path.join(STORAGE,'3rd_security_test/test.csv') 8 | 9 | def train_file_split(): 10 | 'split the train file to many files by file id.Rows with the same file id should be save in one file' 11 | count=0 12 | with open(trainpath,'r') as f: 13 | for line in f: 14 | count+=1 15 | if count==1: 16 | continue 17 | 18 | splits=line.split(',') 19 | if len(splits)>=2: 20 | lable = splits[1] 21 | fileid=splits[0] 22 | subfolder = lable 23 | 24 | savefolder=os.path.join(STORAGE,'train_flat',subfolder) 25 | os.makedirs(savefolder,exist_ok=True) 26 | savefile=os.path.join(savefolder,str(fileid)+'.txt') 27 | write_to_file(savefile,line) 28 | if count%10000==0: 29 | print(count) 30 | 31 | def test_file_split(mode,left): 32 | 'split the test file to many files by file id.Rows with the same file id should be save in one file' 33 | count=0 34 | 35 | with open(testpath,'r') as f: 36 | for line in f: 37 | count+=1 38 | if count==1: 39 | continue 40 | 41 | splits=line.split(',') 42 | if len(splits)>=2: 43 | fileid=splits[0] 44 | moderesut =int(fileid)%mode 45 | if moderesut == left: 46 | savefolder=os.path.join(STORAGE,'test_flat') 47 | savefolder=os.path.join(savefolder,str(left)) 48 | os.makedirs(savefolder,exist_ok=True) 49 | savefile=os.path.join(savefolder,str(fileid)+'.txt') 50 | write_to_file(savefile,line,mode='a+') 51 | if count%10000==0: 52 | print(count,fileid,mode,left) 53 | 54 | def main(): 55 | 56 | train_file_split() 57 | # u can use muti-process below for a more faster way 58 | test_file_split(4,0) 59 | test_file_split(4,1) 60 | test_file_split(4,2) 61 | test_file_split(4,3) 62 | 63 | if __name__ =='__main__': 64 | 65 | main() 66 | -------------------------------------------------------------------------------- /src/funcname_type_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "regdeletekeya": "AM", 3 | "getaddrinfo": "Z", 4 | "regqueryvalueexa": "AP", 5 | "wsaconnect": "BA", 6 | "internetgetconnectedstateexa": "N", 7 | "ntopenkey": "AO", 8 | "ntopenmutant": "Y", 9 | "internetclosehandle": "M", 10 | "regopenkeyexw": "AP", 11 | "internetgetconnectedstateexw": "N", 12 | "getfileversioninfow": "H", 13 | "ntsavekey": "AQ", 14 | "ntqueryvaluekey": "AP", 15 | "regdeletekeyw": "AM", 16 | "httpqueryinfoa": "L", 17 | "regopenkeyexa": "AP", 18 | "getsystemdirectorya": "BC", 19 | "exitwindowsex": "BD", 20 | "getaddrinfow": "Z", 21 | "regdeletevaluew": "AM", 22 | "bind": "BA", 23 | "setwindowshookexa": "K", 24 | "ldrgetprocedureaddress": "X", 25 | "send": "BA", 26 | "ntdeletekey": "AM", 27 | "regdeletevaluea": "AM", 28 | "regcreatekeyexa": "AL", 29 | "getsystemwindowsdirectoryw": "BC", 30 | "ntprotectvirtualmemory": "AJ", 31 | "oleinitialize": "AB", 32 | "writeconsolea": "T", 33 | "getsystemdirectoryw": "BC", 34 | "getadaptersinfo": "Z", 35 | "ntquerydirectoryfile": "F", 36 | "setfileattributesw": "AZ", 37 | "netgetjoininformation": 
"Z", 38 | "deleteservice": "AV", 39 | "deletefilew": "F", 40 | "wsasend": "BA", 41 | "writeconsolew": "T", 42 | "ldrunloaddll": "AS", 43 | "findwindowexa": "BE", 44 | "getvolumepathnamew": "F", 45 | "enumservicesstatusa": "AX", 46 | "setwindowshookexw": "K", 47 | "wsarecvfrom": "BA", 48 | "regenumvaluea": "AP", 49 | "ntcreatekey": "AL", 50 | "regenumkeyexw": "AP", 51 | "regqueryvalueexw": "AP", 52 | "recvfrom": "BA", 53 | "getdiskfreespaceexw": "H", 54 | "isdebuggerpresent": "B", 55 | "findwindowexw": "BE", 56 | "sendto": "BA", 57 | "getadaptersaddresses": "BA", 58 | "regenumkeyexa": "AP", 59 | "netusergetlocalgroups": "Z", 60 | "process32nextw": "AI", 61 | "wsasendto": "BA", 62 | "ntwritevirtualmemory": "AJ", 63 | "readprocessmemory": "AI", 64 | "ntopenthread": "AH", 65 | "connect": "BA", 66 | "getsockname": "BA", 67 | "regcreatekeyexw": "AL", 68 | "ldrloaddll": "V", 69 | "ntgetcontextthread": "AI", 70 | "thread32first": "AF", 71 | "setfileinformationbyhandle": "AZ", 72 | "lookupaccountsidw": "BC", 73 | "getfileinformationbyhandleex": "H", 74 | "netusergetinfo": "Z", 75 | "writeprocessmemory": "AJ", 76 | "createdirectoryexw": "F", 77 | "regenumkeyw": "AP", 78 | "findwindoww": "BE", 79 | "ntsavekeyex": "AQ", 80 | "enumservicesstatusw": "AX", 81 | "httpsendrequestw": "L", 82 | "select": "BA", 83 | "ntallocatevirtualmemory": "AJ", 84 | "ntcreatethread": "AE", 85 | "ntopenprocess": "AH", 86 | "getinterfaceinfo": "Z", 87 | "copyfileexw": "F", 88 | "ntfreevirtualmemory": "W", 89 | "ntcreatemutant": "Y", 90 | "getcomputernamew": "G", 91 | "getsystemmetrics": "BC", 92 | "ntenumeratevaluekey": "AP", 93 | "ntcreatesection": "AT", 94 | "startservicew": "BB", 95 | "getfilesizeex": "H", 96 | "createthread": "AE", 97 | "ntdeletevaluekey": "AM", 98 | "findresourcew": "AR", 99 | "enumwindows": "BE", 100 | "startservicea": "BB", 101 | "ntclose": "Q", 102 | "getcomputernamea": "G", 103 | "getfileattributesw": "H", 104 | "findresourcea": "AR", 105 | "recv": "BA", 106 | "setinformationjobobject": "AZ", 107 | "process32firstw": "AI", 108 | "getusernameexw": "I", 109 | "setsockopt": "BA", 110 | "socket": "BA", 111 | "ntunloaddriver": "D", 112 | "removedirectoryw": "F", 113 | "ntterminatethread": "AG", 114 | "getusernameexa": "I", 115 | "removedirectorya": "F", 116 | "getasynckeystate": "AP", 117 | "setfilepointer": "AZ", 118 | "getusernamew": "I", 119 | "createservicea": "AW", 120 | "ioctlsocket": "BA", 121 | "getusernamea": "I", 122 | "controlservice": "AU", 123 | "internetgetconnectedstate": "N", 124 | "createremotethreadex": "AE", 125 | "shutdown": "BA", 126 | "createdirectoryw": "F", 127 | "ntdeletefile": "F", 128 | "createservicew": "AW", 129 | "ntopendirectoryobject": "F", 130 | "wsasocketa": "BA", 131 | "wsaaccept": "BA", 132 | "findwindowa": "BE", 133 | "ntloadkey": "AA", 134 | "regsetvalueexa": "AQ", 135 | "ntcreatefile": "F", 136 | "wsasocketw": "BA", 137 | "ntloaddriver": "D", 138 | "createprocessinternalw": "AE", 139 | "listen": "BA", 140 | "getnativesysteminfo": "BC", 141 | "findresourceexa": "AR", 142 | "httpopenrequesta": "L", 143 | "ntsetcontextthread": "AD", 144 | "getfilesize": "H", 145 | "openscmanagerw": "AC", 146 | "ntenumeratekey": "AP", 147 | "accept": "BA", 148 | "ntreadfile": "S", 149 | "getfileversioninfosizew": "H", 150 | "ntterminateprocess": "AG", 151 | "openscmanagera": "AC", 152 | "closesocket": "BA", 153 | "httpopenrequestw": "L", 154 | "urldownloadtofilew": "O", 155 | "getvolumepathnamesforvolumenamew": "F", 156 | "ntquerymultiplevaluekey": "AP", 157 | "regenumvaluew": "AP", 
158 | "internetopenurla": "O", 159 | "ntcreatethreadex": "AE", 160 | "ntopenfile": "R", 161 | "unhookwindowshookex": "K", 162 | "findfirstfileexa": "E", 163 | "shellexecuteexw": "AE", 164 | "internetopenurlw": "O", 165 | "ntopenkeyex": "AO", 166 | "findfirstfileexw": "E", 167 | "getfiletype": "H", 168 | "getfileattributesexw": "H", 169 | "ntsetvaluekey": "AQ", 170 | "module32firstw": "X", 171 | "regsetvalueexw": "AQ", 172 | "wsarecv": "BA", 173 | "setfiletime": "AZ", 174 | "wsastartup": "BA", 175 | "thread32next": "AF", 176 | "getvolumenameforvolumemountpointw": "F", 177 | "ntsuspendthread": "AD", 178 | "ntloadkeyex": "AA", 179 | "ntwritefile": "T", 180 | "internetopenw": "O", 181 | "getfileversioninfoexw": "H", 182 | "getsystemwindowsdirectorya": "BC", 183 | "regclosekey": "AK", 184 | "copyfilew": "F", 185 | "openservicea": "AY", 186 | "ntquerykey": "AP", 187 | "regqueryinfokeyw": "AP", 188 | "openservicew": "AY", 189 | "copyfilea": "F", 190 | "ntloadkey2": "AA", 191 | "setfilepointerex": "AZ", 192 | "wnetgetprovidernamew": "BC", 193 | "ntcreatedirectoryobject": "F", 194 | "outputdebugstringa": "B", 195 | "ldrgetdllhandle": "X", 196 | "regqueryinfokeya": "AP", 197 | "createremotethread": "AE", 198 | "ntopensection": "AT", 199 | "getfileversioninfosizeexw": "H", 200 | "internetconnectw": "U", 201 | "httpsendrequesta": "L", 202 | "internetconnecta": "U", 203 | "gethostbyname": "Z", 204 | "getsystemtimeasfiletime": "BC", 205 | "getfileinformationbyhandle": "H", 206 | "ntreadvirtualmemory": "AI", 207 | "ntresumethread": "AD", 208 | "internetwritefile": "P", 209 | "internetreadfile": "O", 210 | "getdiskfreespacew": "H", 211 | "module32nextw": "X", 212 | "dnsquery_w": "C", 213 | "getsysteminfo": "BC" 214 | } -------------------------------------------------------------------------------- /src/light_gbm_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import time 4 | import json 5 | import traceback 6 | import pandas as pd 7 | import os 8 | import numpy as np 9 | 10 | from sklearn.metrics import log_loss 11 | from sklearn.model_selection import StratifiedShuffleSplit 12 | import lightgbm as lgb 13 | 14 | from dataset import load_data 15 | from utility import write_to_file 16 | from utility import save_model,load_model 17 | from conf import STORAGE 18 | 19 | def model_select_main(storage,datasetpath,featureheaders,targethearders): 20 | 21 | savefolder = 'estimatelgbm' 22 | 23 | x,y = load_data(datasetpath,featureheaders,targethearders,nrows= None) 24 | 25 | depthlist =[-1]+list(range(3,8,2)) 26 | depthlist= [5] #set 27 | 28 | minchildlist=[5,15,20,25,30,40] #grid search min_child_sample 29 | # minchildlist=[60]# set min_child_sample to 60 30 | leavelist=[31,41,51] 31 | min_sum_hessian_in_leafs = [0.007,0.001] 32 | # min_sum_hessian_in_leafs = [0.007] 33 | feature_fractions=[1,0.9,0.8,0.6,0.5] 34 | # feature_fractions=[1] 35 | 36 | drop_rates =[0.1,0.2,0.3,0.4] 37 | # skip_drops =[0.5,0.4,0.6] 38 | skip_drops =[0.5] 39 | boost_rounds = [600,800,900,1000,11000,500,1300] 40 | 41 | for max_depth in depthlist : 42 | for num_leaves in leavelist: 43 | for min_child_sample in minchildlist: 44 | for feature_fraction in feature_fractions: 45 | for min_sum_hessian_in_leaf in min_sum_hessian_in_leafs: 46 | for drop_rate in drop_rates: 47 | for skip_drop in skip_drops: 48 | for boost_round in boost_rounds: 49 | params = { 50 | 'boosting_type': 'dart', 51 | 'objective': 'multiclass', 52 | 'metric': 'multi_logloss', 53 | 
'min_child_samples':min_child_sample, 54 | 'num_leaves':num_leaves, 55 | 'max_depth':max_depth, 56 | 'learning_rate': 0.1, 57 | 'num_class':6, 58 | 'num_threads':27, 59 | 'max_bin':6, 60 | 'lambda_l1':1, 61 | 'lambda_l2':0.8, 62 | 'feature_fraction':feature_fraction, 63 | 'min_sum_hessian_in_leaf':min_sum_hessian_in_leaf, 64 | 'drop_rate':drop_rate, 65 | 'skip_drop':skip_drop, 66 | 'verbose': 1 67 | } 68 | rst = cross_validation(x, y, 5,params,boost_round) 69 | 70 | rst['datasetpath'] = datasetpath 71 | 72 | txt= json.dumps(rst,indent=4) 73 | summarypath =os.path.join(storage,'model/'+savefolder,str(int(time.time()))+'.model.esimate') 74 | write_to_file(summarypath,txt.encode('utf-8'),mode='wb+') 75 | 76 | def cross_validation(x,y,kfolder,params,num_boost_round): 77 | print(time.asctime(),'cross validation') 78 | stime = time.time() 79 | 80 | num, num_feature = x.shape 81 | 82 | # create dataset for lightgbm 83 | # if you want to re-use data, remember to set free_raw_data=False 84 | lgb_train = lgb.Dataset(x, y, 85 | weight=None, free_raw_data=False) 86 | 87 | 88 | # generate a feature name 89 | feature_name = ['feature_' + str(col) for col in range(num_feature)] 90 | 91 | print('Start cross validation...') 92 | # feature_name and categorical_feature 93 | # specify your configurations as a dict 94 | 95 | 96 | result = lgb.cv(params, 97 | lgb_train, 98 | num_boost_round=num_boost_round, 99 | nfold=kfolder, 100 | feature_name=feature_name, 101 | verbose_eval=-1, 102 | show_stdv=True) 103 | print(params) 104 | print(result) 105 | 106 | # print('7th feature name is:', repr(model.feature_name[6])) 107 | return {'parameter':params,'result':result,'total_sample':num,'total_feautre':num_feature,'boost_round':num_boost_round} 108 | 109 | 110 | def model_build(datasetpath,featureheaders,targethearders): 111 | """ 112 | choose the selected model 113 | """ 114 | print(time.asctime(),'loading data start') 115 | 116 | x,y = load_data(datasetpath,featureheaders,targethearders,nrows=None) 117 | 118 | print(time.asctime(),'loading data end','total len',len(x)) 119 | 120 | modelinfo={} 121 | model = None 122 | try : 123 | 124 | modelinfo['train_start_time']=time.asctime() 125 | modelinfo={'name':'xgboost','detail':[],'buildtime':time.asctime()} 126 | 127 | test_size =0 128 | 129 | sample_weight=None 130 | 131 | # specify your configurations as a dict 132 | params = { 133 | 'boosting_type': 'dart', 134 | 'objective': 'multiclass', 135 | 'metric': 'multi_logloss', 136 | 'num_leaves': 31, 137 | 'learning_rate': 0.1, 138 | 'min_child_samples':60, 139 | 'max_depth':5, 140 | 'num_class':6, 141 | 'num_threads':27, 142 | 'max_bin':6, 143 | 'lambda_l1':1, 144 | 'lambda_l2':0.8, 145 | 'min_sum_hessian_in_leaf':0.007, 146 | 'verbose': 1 147 | } 148 | 149 | if test_size == 0: 150 | print('full fit') 151 | lgb_train = lgb.Dataset(x, y, 152 | weight=None, free_raw_data=False) 153 | stime = time.time() 154 | num_train, num_feature = x.shape 155 | # generate a feature name 156 | feature_name = ['feature_' + str(col) for col in range(num_feature)] 157 | 158 | print('Start training...') 159 | # feature_name and categorical_feature 160 | model = lgb.train(params, 161 | lgb_train, 162 | num_boost_round=1000, 163 | # learning_rates=lambda iter: 0.1 * (0.99 ** iter), 164 | feature_name=feature_name) 165 | 166 | else: 167 | sss = StratifiedShuffleSplit(n_splits=1,test_size=test_size) 168 | for train, test in sss.split(x, y): 169 | 170 | print(time.asctime(),'fit') 171 | stime = time.time() 172 | 173 | X_train = x[train] 174 | 
y_train = y[train] 175 | 176 | X_test = x[test] 177 | Y_test = y[test] 178 | 179 | num_train, num_feature = X_train.shape 180 | 181 | # create dataset for lightgbm 182 | # if you want to re-use data, remember to set free_raw_data=False 183 | lgb_train = lgb.Dataset(X_train, y_train, 184 | weight=None, free_raw_data=False) 185 | lgb_eval = lgb.Dataset(X_test, Y_test, reference=lgb_train, 186 | weight=None, free_raw_data=False) 187 | 188 | 189 | 190 | # generate a feature name 191 | feature_name = ['feature_' + str(col) for col in range(num_feature)] 192 | 193 | print('Start training...') 194 | # feature_name and categorical_feature 195 | model = lgb.train(params, 196 | lgb_train, 197 | num_boost_round=2000, 198 | valid_sets=lgb_eval, # eval training data 199 | # learning_rates=lambda iter: 0.1 * (0.99 ** iter), 200 | early_stopping_rounds=40, 201 | feature_name=feature_name) 202 | 203 | x_test_proba = model.predict(X_test) 204 | 205 | loss = log_loss(y[test], x_test_proba,labels=[0,1,2,3,4,5]) 206 | 207 | modelinfo['train_len']=len(train) 208 | modelinfo['train_malicious_len']= str(y[train].sum()) 209 | modelinfo['test_len']=len(test) 210 | modelinfo['test_malicious_len']= str(y[test].sum()) 211 | 212 | x_test_class = np.argmax(x_test_proba,axis=1) 213 | print('x_test_proba',type(x_test_proba)) 214 | print(loss) 215 | 216 | print(pd.crosstab(y[test], x_test_class, rownames=['Actual Species'], colnames=['Predicted Species'])) 217 | 218 | print(time.asctime(),'sample weight',sample_weight) 219 | 220 | result = {'loss': loss, 221 | 'costtime': time.time()-stime} 222 | 223 | modelinfo['detail'].append(result) 224 | 225 | samples,probas = x_test_proba.shape 226 | 227 | print('shape of x_test_proba.shape',x_test_proba.shape) 228 | 229 | predict_save_path = os.path.join(STORAGE,'train_predict.lightgbm.csv') 230 | 231 | write_to_file(predict_save_path,'',mode='w+') 232 | 233 | txt='{},{},{},{},{},{},{},{}\n'.format('file_id','prob0','prob1','prob2','prob3','prob4','prob5','label') 234 | 235 | write_to_file(predict_save_path,txt,mode='w+') 236 | 237 | 238 | for i in range(samples): 239 | attr=[] 240 | attr+=[str(p) for p in x_test_proba[i]] 241 | attr+=[str(y[test][i])] 242 | txt=','.join(attr) 243 | write_to_file(predict_save_path,txt+'\n',mode='a+') 244 | 245 | modelinfo['name']='lightgbm' 246 | modelinfo['attibutes_num']=x.shape[1] 247 | modelinfo['dataset_path']=datasetpath 248 | 249 | modelinfo['sample_weight']=sample_weight 250 | 251 | modelinfo['train_end_time']=time.asctime() 252 | modelinfo['test_size']=test_size 253 | 254 | except Exception as e: 255 | traceback.print_exc() 256 | print('excepion',str(e)) 257 | return 'lightgbm',model,modelinfo 258 | 259 | def model_build_main(storage,datasetpath,featureheaders,targethearders): 260 | 261 | name,clf,modelinfo = model_build(datasetpath,featureheaders,targethearders) 262 | 263 | summarypath = os.path.join(storage,'model/lightgbm.model.esimate') 264 | modelsavepath=os.path.join(storage,'model/lightgbm.model') 265 | modelinfosavepath = os.path.join(storage,'model/lightgbm.modelinfo') 266 | 267 | txt=json.dumps(modelinfo,indent=4) 268 | 269 | write_to_file(modelinfosavepath, txt.encode('utf-8'), mode='wb+') 270 | 271 | save_model(clf,modelsavepath) 272 | 273 | print('model summary:' ) 274 | 275 | print('save model summary->',summarypath ) 276 | write_to_file(summarypath,txt.encode('utf-8'),mode='wb+') 277 | 278 | def main(): 279 | 280 | feature_save_path = os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv') 281 | 282 
| rtn_tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 283 | func1_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 284 | func23_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model') 285 | rtnfunc_tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 286 | 287 | featureheaders = [ 288 | 289 | ] 290 | 291 | basicfeature = [ 292 | 'threadnum', 293 | 'totalapicall', 294 | 'maxapicall', 295 | 'minapicall', 296 | 'meanapicallperthread' 297 | ] 298 | 299 | featureheaders+=basicfeature 300 | 301 | model = load_model(func1_tfidf_save_path) 302 | featureheaders += model.get_feature_names() 303 | 304 | model = load_model(rtn_tfidf_save_path) 305 | featureheaders += model.get_feature_names() 306 | 307 | model = load_model(func23_tfidf_save_path) 308 | featureheaders += model.get_feature_names() 309 | 310 | model = load_model(rtnfunc_tfidf_save_path) 311 | featureheaders += model.get_feature_names() 312 | 313 | apitype_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 314 | model = load_model(apitype_save_path) 315 | featureheaders += model.get_feature_names() 316 | 317 | print(time.asctime(),'total header len',len(featureheaders)) 318 | 319 | targethearders=['label_x'] 320 | 321 | # set ModelSelect to True for parameter tuning 322 | ModelSelect = False 323 | 324 | if ModelSelect: 325 | model_select_main(STORAGE,feature_save_path,featureheaders,targethearders) 326 | else: 327 | model_build_main(STORAGE, 328 | feature_save_path, 329 | featureheaders, 330 | targethearders) 331 | 332 | if __name__ == '__main__': 333 | 334 | main() 335 | 336 | 337 | 338 | 339 | -------------------------------------------------------------------------------- /src/model_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from conf import STORAGE 4 | 5 | from utility import load_model, write_to_file 6 | 7 | def load_data(savepath,featureheaders,targethearders,nrows=None): 8 | 9 | dataset = pd.read_csv(savepath,header = 0,nrows =nrows) 10 | 11 | x = dataset[featureheaders] 12 | y = dataset[targethearders] 13 | l =dataset['label_x'] 14 | 15 | return x.values,y.values.reshape((len(y.values),)),l.values.reshape((len(y.values),)) 16 | 17 | def main(): 18 | 19 | modelsavepath=os.path.join(STORAGE,'model/lightgbm.model') 20 | 21 | clf = load_model(modelsavepath) 22 | 23 | rtn_tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 24 | func1_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 25 | func23_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model') 26 | rtn_apicall_tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 27 | apitype_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 28 | 29 | 30 | featureheaders = [ 31 | 32 | ] 33 | 34 | basicfeature = [ 35 | 'threadnum', 36 | 'totalapicall', 37 | 'maxapicall', 38 | 'minapicall', 39 | 'meanapicallperthread' 40 | ] 41 | 42 | featureheaders+=basicfeature 43 | 44 | model = load_model(func1_tfidf_save_path) 45 | 46 | featureheaders += model.get_feature_names() 47 | 48 | model = load_model(rtn_tfidf_save_path) 49 | 50 | featureheaders += model.get_feature_names() 51 | 52 | model = load_model(func23_tfidf_save_path) 53 | 54 | featureheaders += model.get_feature_names() 55 | 56 | model = load_model(rtn_apicall_tfidf_save_path) 57 | 58 | featureheaders += 
model.get_feature_names() 59 | 60 | model = load_model(apitype_save_path) 61 | 62 | featureheaders += model.get_feature_names() 63 | 64 | targethearders=['file_id'] 65 | 66 | x,file_ids,labels = load_data(os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv'), 67 | featureheaders, 68 | targethearders) 69 | 70 | x_test_proba = clf.predict(x)#predict_proba 71 | 72 | samples,probas = x_test_proba.shape 73 | 74 | print('shape of x_test_proba.shape',x_test_proba.shape) 75 | 76 | predict_save_path = os.path.join(STORAGE,'test_predict.lightgbm.csv') 77 | 78 | write_to_file(predict_save_path,'',mode='w+') 79 | 80 | txt='{},{},{},{},{},{},{}\n'.format('file_id','prob0','prob1','prob2','prob3','prob4','prob5') 81 | 82 | write_to_file(predict_save_path,txt,mode='w+') 83 | 84 | 85 | for i in range(samples): 86 | attr=[str(file_ids[i])] 87 | attr+=[str(p) for p in x_test_proba[i]] 88 | # attr+=[str(labels[i])] 89 | txt=','.join(attr) 90 | write_to_file(predict_save_path,txt+'\n',mode='a+') 91 | 92 | 93 | if __name__ =="__main__": 94 | main() -------------------------------------------------------------------------------- /src/preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import traceback 5 | 6 | from conf import STORAGE 7 | from utility import write_to_file,read_replace,get_replace_type_name 8 | 9 | 10 | def load_file(filepath): 11 | """load the file change the all the row to json format""" 12 | stime=time.time() 13 | document={} 14 | if not os.path.exists(filepath): 15 | print('file not exits',filepath) 16 | return 17 | with open(filepath,'r') as f: 18 | for line in f: 19 | line=line.strip('\n') 20 | if line: 21 | splits = line.split(',') 22 | fileid =int(splits[0]) 23 | lable=int(splits[1]) 24 | funcname=splits[2] 25 | tid=splits[3] 26 | rtnvalue=splits[4] 27 | index=splits[5] 28 | 29 | if 'threads' not in document: 30 | document['threads']={} 31 | document['file_id']=fileid 32 | document['lable']=lable 33 | if tid not in document['threads']: 34 | document['threads'][tid]={'api_calls':[]} 35 | document['threads'][tid]['api_calls'].append([index,funcname,rtnvalue]) 36 | for tid in document['threads']: 37 | apicalls = document['threads'][tid]['api_calls'] 38 | sortedcalls = sorted(apicalls,key=lambda s: int(s[0])) 39 | document['threads'][tid]['api_calls'] = sortedcalls 40 | 41 | return document 42 | 43 | 44 | def load_test_file(filepath): 45 | 46 | stime=time.time() 47 | document={} 48 | if not os.path.exists(filepath): 49 | print('file not exits',filepath) 50 | return 51 | with open(filepath,'r') as f: 52 | for line in f: 53 | line=line.strip('\n') 54 | if line: 55 | splits = line.split(',') 56 | fileid =int(splits[0]) 57 | funcname=splits[1] 58 | tid=splits[2] 59 | rtnvalue=splits[3] 60 | index=splits[4] 61 | 62 | if 'threads' not in document: 63 | document['threads']={} 64 | document['file_id']=fileid 65 | 66 | if tid not in document['threads']: 67 | document['threads'][tid]={'api_calls':[]} 68 | document['threads'][tid]['api_calls'].append([index,funcname,rtnvalue]) 69 | for tid in document['threads']: 70 | apicalls = document['threads'][tid]['api_calls'] 71 | sortedcalls = sorted(apicalls,key=lambda s: int(s[0])) 72 | document['threads'][tid]['api_calls'] = sortedcalls 73 | 74 | return document 75 | 76 | def train_json_to_txt(): 77 | 78 | folder=os.path.join(STORAGE,'train_json') 79 | save_apicalls=os.path.join(STORAGE,'apicall.txt') 80 | 
save_rtvalue=os.path.join(STORAGE,'rtvalue.txt') 81 | 82 | write_to_file(save_apicalls, '',mode='w+') 83 | write_to_file(save_rtvalue, '',mode='w+') 84 | 85 | 86 | for subfolder in os.listdir(folder): 87 | fullsubfolder = os.path.join(folder,subfolder) 88 | for fname in os.listdir(fullsubfolder): 89 | fullname=os.path.join(fullsubfolder,fname) 90 | with open(fullname) as f: 91 | jsondata = json.load(f) 92 | common=[str(jsondata['file_id']),str(jsondata['lable'])] 93 | apis=[] 94 | rtns=[] 95 | for tid in jsondata['threads']: 96 | 97 | api_calls = jsondata['threads'][tid]['api_calls'] 98 | 99 | apis+=[apicall[1] for apicall in api_calls] 100 | rtns+=[apicall[2] for apicall in api_calls] 101 | 102 | apis+=['.'] 103 | rtns+=['.'] 104 | 105 | txt=' '.join(apis) 106 | txt=','.join(common+[txt]) 107 | write_to_file(save_apicalls, txt+'\n') 108 | 109 | txt =' '.join(rtns) 110 | txt=','.join(common+[txt]) 111 | 112 | write_to_file(save_rtvalue, txt+'\n') 113 | 114 | 115 | def test_json_to_txt(): 116 | 117 | folder=os.path.join(STORAGE,'test_json') 118 | save_apicalls=os.path.join(STORAGE,'test_apicall.txt') 119 | save_rtvalue=os.path.join(STORAGE,'test_rtvalue.txt') 120 | 121 | write_to_file(save_apicalls, '',mode='w+') 122 | write_to_file(save_rtvalue, '',mode='w+') 123 | 124 | for subfolder in os.listdir(folder): 125 | fullsubfolder = os.path.join(folder,subfolder) 126 | for fname in os.listdir(fullsubfolder): 127 | fullname=os.path.join(fullsubfolder,fname) 128 | with open(fullname) as f: 129 | jsondata = json.load(f) 130 | common=[str(jsondata['file_id'])] 131 | apis=[] 132 | rtns=[] 133 | for tid in jsondata['threads']: 134 | 135 | api_calls = jsondata['threads'][tid]['api_calls'] 136 | apis+=[apicall[1] for apicall in api_calls] 137 | rtns+=[apicall[2] for apicall in api_calls] 138 | 139 | apis+=['.'] 140 | rtns+=['.'] 141 | 142 | txt=' '.join(apis) 143 | txt=','.join(common+[txt]) 144 | write_to_file(save_apicalls, txt+'\n',mode='a+') 145 | 146 | txt =' '.join(rtns) 147 | txt=','.join(common+[txt]) 148 | 149 | write_to_file(save_rtvalue, txt+'\n',mode='a+') 150 | 151 | def train_json_to_txt_type(replacedic): 152 | 153 | folder=os.path.join(STORAGE,'train_json') 154 | save_apicalls=os.path.join(STORAGE,'apicall.type.txt') 155 | 156 | write_to_file(save_apicalls, '',mode='w+') 157 | 158 | for subfolder in os.listdir(folder): 159 | fullsubfolder = os.path.join(folder,subfolder) 160 | for fname in os.listdir(fullsubfolder): 161 | fullname=os.path.join(fullsubfolder,fname) 162 | with open(fullname) as f: 163 | jsondata = json.load(f) 164 | common=[str(jsondata['file_id']),str(jsondata['lable'])] 165 | apis=[] 166 | rtns=[] 167 | for tid in jsondata['threads']: 168 | 169 | api_calls = jsondata['threads'][tid]['api_calls'] 170 | 171 | apis+=[get_replace_type_name(apicall[1],replacedic) for apicall in api_calls] 172 | rtns+=[apicall[2] for apicall in api_calls] 173 | 174 | apis+=['.'] 175 | rtns+=['.'] 176 | 177 | txt=' '.join(apis) 178 | txt=','.join(common+[txt]) 179 | write_to_file(save_apicalls, txt+'\n') 180 | 181 | txt =' '.join(rtns) 182 | txt=','.join(common+[txt]) 183 | 184 | 185 | def test_json_to_txt_type(replacedic): 186 | 187 | folder=os.path.join(STORAGE,'test_json') 188 | 189 | save_apicalls=os.path.join(STORAGE,'test_apicall.type.txt') 190 | 191 | write_to_file(save_apicalls, '',mode='w+') 192 | 193 | for subfolder in os.listdir(folder): 194 | fullsubfolder = os.path.join(folder,subfolder) 195 | for fname in os.listdir(fullsubfolder): 196 | fullname=os.path.join(fullsubfolder,fname) 
197 | with open(fullname) as f: 198 | jsondata = json.load(f) 199 | common=[str(jsondata['file_id'])] 200 | apis=[] 201 | rtns=[] 202 | for tid in jsondata['threads']: 203 | 204 | api_calls = jsondata['threads'][tid]['api_calls'] 205 | apis+=[get_replace_type_name(apicall[1],replacedic) for apicall in api_calls] 206 | rtns+=[apicall[2] for apicall in api_calls] 207 | 208 | apis+=['.'] 209 | rtns+=['.'] 210 | 211 | txt=' '.join(apis) 212 | txt=','.join(common+[txt]) 213 | write_to_file(save_apicalls, txt+'\n',mode='a+') 214 | 215 | txt =' '.join(rtns) 216 | txt=','.join(common+[txt]) 217 | 218 | def folder_to_json(folder,fileparser,save_folder): 219 | stime=time.time() 220 | count=0 221 | for subfolder in os.listdir(folder): 222 | fullsubfolder = os.path.join(folder,subfolder) 223 | save_subfolder = os.path.join(save_folder,subfolder) 224 | os.makedirs(save_subfolder,exist_ok=True) 225 | for fname in os.listdir(fullsubfolder): 226 | try: 227 | fullname=os.path.join(save_subfolder,fname) 228 | document = fileparser(fullname) 229 | txt = json.dumps(document) 230 | write_to_file(fullname,txt,mode='a+') 231 | count+=1 232 | if count%1000==0: 233 | print(count) 234 | except Exception as e: 235 | traceback.print_exc() 236 | print(fullname) 237 | 238 | print(time.time()-stime,count) 239 | 240 | def train_json_to_txt_merge(): 241 | """json file to rtn_api sequence""" 242 | 243 | folder=os.path.join(STORAGE,'train_json') 244 | save_rtn_apicalls=os.path.join(STORAGE,'rtnvalue_apicall.txt') 245 | 246 | write_to_file(save_rtn_apicalls, '',mode='w+') 247 | # write_to_file(save_rtn_apicalls, ['file_id','label','text'],mode='a+') 248 | for subfolder in os.listdir(folder): 249 | fullsubfolder = os.path.join(folder,subfolder) 250 | for fname in os.listdir(fullsubfolder): 251 | fullname=os.path.join(fullsubfolder,fname) 252 | with open(fullname) as f: 253 | jsondata = json.load(f) 254 | common=[str(jsondata['file_id']),str(jsondata['lable'])] 255 | 256 | rtn_apis=[] 257 | for tid in jsondata['threads']: 258 | api_calls = jsondata['threads'][tid]['api_calls'] 259 | rtn_apis+=[apicall[2]+'_'+apicall[1] for apicall in api_calls] 260 | rtn_apis+=['.'] 261 | 262 | txt=' '.join(rtn_apis) 263 | txt=','.join(common+[txt]) 264 | write_to_file(save_rtn_apicalls, txt+'\n') 265 | 266 | def test_json_to_txt_merge(): 267 | """json file to rtn_api sequence""" 268 | 269 | folder=os.path.join(STORAGE,'test_json') 270 | save_rtn_apicalls=os.path.join(STORAGE,'test_rtnvalue_apicall.txt') 271 | 272 | write_to_file(save_rtn_apicalls, '',mode='w+') 273 | 274 | for subfolder in os.listdir(folder): 275 | fullsubfolder = os.path.join(folder,subfolder) 276 | for fname in os.listdir(fullsubfolder): 277 | fullname=os.path.join(fullsubfolder,fname) 278 | with open(fullname) as f: 279 | jsondata = json.load(f) 280 | common=[str(jsondata['file_id'])] 281 | 282 | rtn_apis=[] 283 | for tid in jsondata['threads']: 284 | api_calls = jsondata['threads'][tid]['api_calls'] 285 | rtn_apis+=[apicall[2]+'_'+apicall[1] for apicall in api_calls] 286 | rtn_apis+=['.'] 287 | 288 | txt=' '.join(rtn_apis) 289 | txt=','.join(common+[txt]) 290 | write_to_file(save_rtn_apicalls, txt+'\n',mode='a+') 291 | 292 | def main(): 293 | 294 | replacedic = read_replace('./funcname_type_map.json') 295 | 296 | folder_to_json(os.path.join(STORAGE,'train_flat'), 297 | load_test_file, 298 | os.path.join(STORAGE,'train_json')) 299 | 300 | folder_to_json(os.path.join(STORAGE,'test_flat'), 301 | load_test_file, 302 | os.path.join(STORAGE,'test_json')) 303 | 304 | 
train_json_to_txt() 305 | test_json_to_txt() 306 | 307 | train_json_to_txt_merge() 308 | test_json_to_txt_merge() 309 | 310 | train_json_to_txt_type(replacedic) 311 | test_json_to_txt_type(replacedic) 312 | 313 | 314 | if __name__=="__main__": 315 | main() -------------------------------------------------------------------------------- /src/tfidf_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Use tfidf to extract the text feature; 3 | The text feature include page title and page text. 4 | """ 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | 11 | from utility import write_to_file,load_model,save_model 12 | from conf import STORAGE 13 | 14 | def load_documents(document_save_path): 15 | 16 | dataset = pd.read_csv(document_save_path,names=['file_id','label','txt'],nrows = None) 17 | 18 | labels = [] 19 | docs = [] 20 | 21 | for index,row in dataset.iterrows(): 22 | label = row['label'] 23 | content = row['txt'] 24 | 25 | labels.append(int(label)) 26 | docs.append(content) 27 | 28 | return labels,docs 29 | 30 | def load_test_documents(document_save_path): 31 | 32 | dataset = pd.read_csv(document_save_path,names=['file_id','txt'],nrows = None) 33 | 34 | labels = [] 35 | docs = [] 36 | filesid=[] 37 | 38 | for index,row in dataset.iterrows(): 39 | # label = row['lable'] 40 | content = row['txt'] 41 | fileid = row['file_id'] 42 | labels.append(-1) 43 | docs.append(content) 44 | filesid.append(fileid) 45 | 46 | return labels,docs,filesid 47 | 48 | def display_scores(vectorizer, tfidf_result,savesubfolder): 49 | # http://stackoverflow.com/questions/16078015/ 50 | 51 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary.txt'.format(savesubfolder)), b'','wb+') 52 | for fea_name in vectorizer.get_feature_names(): 53 | fea_name=fea_name+'\n' 54 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary.txt'.format(savesubfolder)), fea_name.encode('utf-8')) 55 | 56 | scores = zip(vectorizer.get_feature_names(), 57 | np.asarray(tfidf_result.sum(axis=0)).ravel()) 58 | 59 | sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True) 60 | 61 | index = 0 62 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary_top.txt'.format(savesubfolder)), b'','wb+') 63 | for item in sorted_scores: 64 | index+=1 65 | txt = "{0} {1:50} Score: {2}\n".format(index,repr(item[0]), item[1]) 66 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary_top.txt'.format(savesubfolder)), txt.encode('utf-8')) 67 | 68 | 69 | def api_call_tfidf(): 70 | 71 | document_title_content_save_path=os.path.join(STORAGE,'apicall.txt') 72 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.txt') 73 | 74 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model') 75 | 76 | rawy ,raw_documents = load_documents(document_title_content_save_path) 77 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 78 | 79 | documents=raw_documents+docs 80 | 81 | print(len(documents),len(documents)) 82 | 83 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 84 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(2, 3),max_features=5000) #apicall 85 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 86 | 87 | x = model.fit_transform(documents) 88 | 89 | 
save_model(model,tfidf_save_path) 90 | 91 | display_scores(model,x,'apicall_tfidf') 92 | 93 | def api_type_tfidf(): 94 | 95 | 96 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.type.txt') 97 | document_title_content_save_path=os.path.join(STORAGE,'apicall.type.txt') 98 | 99 | 100 | 101 | tfidf_save_path = os.path.join(STORAGE,'api_type_tfidf/webpage.tfidf.model') 102 | 103 | rawy ,raw_documents = load_documents(document_title_content_save_path) 104 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 105 | 106 | documents=raw_documents+docs 107 | 108 | print(len(documents),len(documents)) 109 | 110 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 111 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(4, 5),max_features=2000) #apicall 112 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 113 | 114 | x = model.fit_transform(documents) 115 | 116 | save_model(model,tfidf_save_path) 117 | 118 | display_scores(model,x,'api_type_tfidf') 119 | 120 | def api_type_tfidf_5(): 121 | 122 | 123 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.type.txt') 124 | document_title_content_save_path=os.path.join(STORAGE,'apicall.type.txt') 125 | 126 | tfidf_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 127 | 128 | rawy ,raw_documents = load_documents(document_title_content_save_path) 129 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 130 | 131 | documents=raw_documents+docs 132 | 133 | print(len(documents),len(documents)) 134 | 135 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 136 | model = TfidfVectorizer(min_df = 3,decode_error ='ignore',stop_words='english',ngram_range=(5, 5),max_features=1000) #apicall 137 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 138 | 139 | x = model.fit_transform(documents) 140 | 141 | save_model(model,tfidf_save_path) 142 | 143 | display_scores(model,x,'api_type_tfidf_5') 144 | def api_call_tfidf_1(): 145 | 146 | document_title_content_save_path=os.path.join(STORAGE,'apicall.txt') 147 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.txt') 148 | 149 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 150 | 151 | rawy ,raw_documents = load_documents(document_title_content_save_path) 152 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 153 | 154 | documents=raw_documents+docs 155 | 156 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1)) 157 | 158 | x = model.fit_transform(documents) 159 | 160 | save_model(model,tfidf_save_path) 161 | 162 | display_scores(model,x,'apicall_tfidf_1') 163 | 164 | def rtnvalue_api_call_tfidf(): 165 | 166 | document_title_content_save_path=os.path.join(STORAGE,'rtnvalue_apicall.txt') 167 | test_document_title_content_save_path=os.path.join(STORAGE,'test_rtnvalue_apicall.txt') 168 | 169 | tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 170 | 171 | rawy ,raw_documents = load_documents(document_title_content_save_path) 172 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 173 | 174 | documents=raw_documents+docs 175 | 176 | 
print(len(documents),len(documents)) 177 | 178 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 179 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(2, 3),max_features=5000) #apicall 180 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 181 | 182 | x = model.fit_transform(documents) 183 | 184 | save_model(model,tfidf_save_path) 185 | 186 | display_scores(model,x,'rtn_apicall_tfidf') 187 | 188 | def rtnvalue__tfidf(): 189 | 190 | document_title_content_save_path=os.path.join(STORAGE,'rtvalue.txt') 191 | test_document_title_content_save_path=os.path.join(STORAGE,'test_rtvalue.txt') 192 | 193 | tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 194 | 195 | rawy ,raw_documents = load_documents(document_title_content_save_path) 196 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 197 | 198 | documents=raw_documents+docs 199 | 200 | print(len(documents),len(documents)) 201 | 202 | model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 3),max_features=3000) 203 | 204 | x = model.fit_transform(documents) 205 | 206 | save_model(model,tfidf_save_path) 207 | 208 | display_scores(model,x,'rtnvalue_tfidf') 209 | 210 | 211 | def main(): 212 | 213 | api_type_tfidf_5() 214 | api_call_tfidf_1() 215 | api_call_tfidf() 216 | rtnvalue_api_call_tfidf() 217 | rtnvalue__tfidf() 218 | 219 | if __name__ == "__main__": 220 | 221 | main() 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /src/utility.py: -------------------------------------------------------------------------------- 1 | try: 2 | import _pickle as cPickle 3 | except Exception as e: 4 | import cPickle 5 | 6 | import logging 7 | import json 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | def save_model(clf,modelpath): 12 | with open(modelpath, 'wb') as f: 13 | cPickle.dump(clf, f) 14 | 15 | def load_model(modelpath): 16 | try: 17 | with open(modelpath, 'rb') as f: 18 | rf = cPickle.load(f) 19 | return rf 20 | except Exception as e: 21 | log.exception('load model error {}'.format(modelpath)) 22 | return None 23 | 24 | def write_to_file(path,txt,mode='ab+'): 25 | with open(path,mode=mode) as f: 26 | f.write(txt) 27 | 28 | def get_replace_type_name(name,replacedic): 29 | name=name.lower() 30 | return replacedic.get(name,'') 31 | 32 | def read_replace(path): 33 | with open(path) as f: 34 | data = json.load(f) 35 | return data 36 | --------------------------------------------------------------------------------
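For reference, a small usage sketch of the helpers above: read_replace and get_replace_type_name are the functions preprocess.py uses to map raw API names onto the short type codes from funcname_type_map.json (the "api type" documents). The path and the example API names are illustrative.

```python
# Sketch only: map an API-call sequence to its type codes via utility.py helpers.
from utility import read_replace, get_replace_type_name

# assumes the script is run from src/, as preprocess.main() does
replacedic = read_replace('./funcname_type_map.json')

apis = ['LdrLoadDll', 'NtCreateFile', 'NtWriteFile', 'RegOpenKeyExW', 'connect']
types = [get_replace_type_name(name, replacedic) for name in apis]
print(types)   # -> ['V', 'F', 'T', 'AP', 'BA'] with the shipped mapping
```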