├── doc
│   └── 云安全比赛_PPT.pdf
├── readme.md
└── src
    ├── basic_feature.py
    ├── conf.py
    ├── dataset.py
    ├── feature.py
    ├── file_split.py
    ├── funcname_type_map.json
    ├── light_gbm_model.py
    ├── model_predict.py
    ├── preprocess.py
    ├── tfidf_model.py
    └── utility.py

/doc/云安全比赛_PPT.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goodskillprogramer/MalwareClassify/5ef3c6267a9469a48ae94ae9d480d715aa675348/doc/云安全比赛_PPT.pdf
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Malware Classification
2 | - Machine learning for malware classification
3 | - Based on API call sequences, mainly n-gram and TF-IDF features
4 | - LightGBM is used as the machine learning tool
5 | 
6 | # Malware classification based on API sequences
7 | - Uses machine learning to classify malware into types
8 | - Most features are extracted from the API call sequence
9 | - n-gram and TF-IDF are used to build the feature vectors
10 | - The training set can be downloaded from this [website](https://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.66304169Zp1gFH&raceId=231668)
11 | 
12 | 
13 | ## Program overview
14 | - file_split.py reads the csv file and splits it into per-sample files grouped by file ID
15 | - preprocess.py converts each per-sample file to JSON and serializes the API calls
16 | - basic_feature.py extracts simple statistical features
17 | - tfidf_model.py builds the TF-IDF models
18 | - feature.py transforms the training and test data with the saved TF-IDF models
19 | - light_gbm_model.py tunes the model parameters and trains the model
20 | - model_predict.py predicts on the test set (a minimal sketch of the overall flow is shown right after this readme)
21 | 
22 | ## Notes
23 | - This is the source code from the 3rd Alibaba Cloud Security Algorithm Challenge (阿里云安全算法挑战赛); the final result was inside the Top 30 but outside the Top 10.
24 | - Teams in the Top 30 had to submit slides, so a deck was written. [The PPT explains the detection approach](https://github.com/jingwang3235/MalwareClassify/blob/master/doc/%E4%BA%91%E5%AE%89%E5%85%A8%E6%AF%94%E8%B5%9B_PPT.pdf).
25 | - Top 10 teams advanced to the final and the Top 5 teams defended their solutions; their defense videos are at http://tb.cn/qYyB8Mw and are worth studying for ideas.
26 | - Competition link (with the training and test data): https://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.66304169Zp1gFH&raceId=231668
--------------------------------------------------------------------------------
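Below is a minimal, self-contained sketch of the flow the readme describes: API-call "documents" are turned into n-gram TF-IDF vectors and fed to a multiclass LightGBM model. The toy documents, labels and parameters are illustrative only; they are not the competition data or the settings used in src/.

```python
# Sketch only: n-gram TF-IDF over API names -> LightGBM multiclass model.
# Real documents are produced by preprocess.py: one line per sample, API names
# separated by spaces, each thread's call sequence terminated with '.'.
import numpy as np
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "ldrloaddll ldrgetprocedureaddress ntcreatefile ntwritefile .",
    "ntcreatefile ntwritefile ntclose .",
    "regopenkeyexw regqueryvalueexw regsetvalueexw regclosekey .",
    "regcreatekeyexw regsetvalueexw regclosekey .",
    "socket connect send recv closesocket .",
    "wsastartup socket connect wsasend wsarecv .",
]
labels = np.array([0, 0, 1, 1, 2, 2])          # toy class ids

# word-level 1-2 grams over API names (the repo fits several such vectorizers)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
x = vectorizer.fit_transform(docs)             # sparse matrix, one row per sample

params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "min_data_in_leaf": 1,                     # only needed for this tiny toy set
    "verbose": -1,
}
booster = lgb.train(params, lgb.Dataset(x, label=labels), num_boost_round=20)

proba = booster.predict(x)                     # shape (n_samples, num_class)
print(proba.argmax(axis=1))
```

The actual pipeline differs mainly in scale and bookkeeping: tfidf_model.py fits the vectorizers on the combined train and test documents and saves them with utility.save_model, feature.py transforms the documents and merges all feature tables on file_id, and light_gbm_model.py tunes and trains the final model.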
/src/basic_feature.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import traceback
4 | import json
5 | from utility import write_to_file
6 | from conf import STORAGE
7 | 
8 | def basic_feature(fpath):
9 | 
10 | feature={}
11 | with open(fpath) as f:
12 | jsondata = json.load(f)
13 | threads=jsondata['threads']
14 | file_id =jsondata['file_id']
15 | label = jsondata.get('lable','-1')
16 | 
17 | threadnum=len(threads)
18 | totalapicall = 0
19 | maxapicall = 0
20 | minapicall = 0
21 | meanapicallperthread=0.0
22 | for tid in threads:
23 | info = threads[tid]
24 | totalapicall += len(info['api_calls'])
25 | maxapicall=max(maxapicall,len(info['api_calls']))
26 | minapicall=len(info['api_calls']) if minapicall==0 else min(minapicall,len(info['api_calls']))
27 | 
28 | meanapicallperthread = (totalapicall*1.0)/threadnum
29 | 
30 | print('{} {} {} {} {} {} {}'.format(file_id,label,threadnum,totalapicall,maxapicall,minapicall,meanapicallperthread))
31 | feature['file_id'] = file_id
32 | feature['label'] = label
33 | feature['threadnum'] = threadnum
34 | feature['totalapicall'] = totalapicall
35 | feature['maxapicall'] = maxapicall
36 | feature['minapicall'] = minapicall
37 | feature['meanapicallperthread'] = meanapicallperthread
38 | return feature
39 | 
40 | def folder_to_basic_feature(folder,feature_save_path):
41 | 
42 | featureheader=['file_id','label','threadnum','totalapicall','maxapicall','minapicall','meanapicallperthread']
43 | stime=time.time()
44 | count=0
45 | write_to_file(feature_save_path,','.join(featureheader)+'\n',mode='w+')
46 | for subfolder in os.listdir(folder):
47 | fullsubfolder = os.path.join(folder,subfolder)
48 | for fname in os.listdir(fullsubfolder):
49 | fullname = os.path.join(fullsubfolder,fname)
50 | try:
51 | feature = basic_feature(fullname)
52 | attr=[]
53 | for head in featureheader:
54 | attr.append(str(feature.get(head,0)))
55 | txt =','.join(attr)
56 | write_to_file(feature_save_path,txt+'\n',mode='a+')
57 | 
58 | except Exception as e:
59 | traceback.print_exc()
60 | 
61 | def main():
62 | 
63 | folder_to_basic_feature(os.path.join(STORAGE,'train_json'),os.path.join(STORAGE,'data','basicfeature.csv'))
64 | folder_to_basic_feature(os.path.join(STORAGE,'test_json'),os.path.join(STORAGE,'testdata','basicfeature.csv'))
65 | 
66 | if __name__ == '__main__':
67 | main()
--------------------------------------------------------------------------------
/src/conf.py:
--------------------------------------------------------------------------------
1 | 
2 | STORAGE=r'G:\sample\binary'
--------------------------------------------------------------------------------
/src/dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | from collections import Counter
4 | 
5 | def get_feature_header(fpath):
6 | 
7 | with open(fpath) as f:
8 | data = json.load(f)
9 | return data
10 | 
11 | def load_data(savepath,featureheaders,targethearders,nrows=None):
12 | 
13 | dataset = pd.read_csv(savepath,header = 0,nrows =nrows)
14 | 
15 | x = dataset[featureheaders]
16 | y = dataset[targethearders]
17 | return x.values,y.values.reshape((len(y.values),))
18 | 
19 | def get_class_weights(y):
20 | counter = Counter(y)
21 | majority = max(counter.values())
22 | return {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}
23 | 
--------------------------------------------------------------------------------
/src/feature.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | 
4 | import pandas as pd
5 | 
6 | from conf import STORAGE
7 | from utility import write_to_file,load_model
8 | 
9 | 
10 | def load_documents(document_save_path):
11 | 
12 | dataset = pd.read_csv(document_save_path,names=['file_id','txt'],nrows = None)
13 | 
14 | labels = []
15 | docs = []
16 | filesid=[]
17 | 
18 | for index,row in dataset.iterrows():
19 | # label = row['lable']
20 | content = row['txt']
21 | fileid = row['file_id']
22 | labels.append(-1)
23 | docs.append(content)
24 | filesid.append(fileid)
25 | 
26 | return labels,docs,filesid
27 | 
28 | def load_train_documents(document_save_path):
29 | 
30 | dataset = pd.read_csv(document_save_path,names=['file_id','label','txt'],nrows = None)
31 | 
32 | labels = []
33 | docs = []
34 | filesid=[]
35 | 
36 | for index,row in dataset.iterrows():
37 | label = row['label']
38 | content = row['txt']
39 | fileid = row['file_id']
40 | labels.append(label)
41 | docs.append(content)
42 | filesid.append(fileid)
43 | 
44 | return labels,docs,filesid
45 | 
46 | 
47 | def funcname23_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc):
48 | 
49 | if os.path.exists(newsavepath):
50 | print('Exists',newsavepath)
51 | return
52 | 
53 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model')
54 | # pca_save_path = os.path.join(STORAGE,'apicall_tfidf_2_3_50000/webpage.tfidf.pca.model')
55 | 
56 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path)
57 | 
58 | print(time.asctime(),len(rawy),len(raw_documents))
59 | 
60 | model = load_model(tfidf_save_path)
61 | 
62 | x = 
model.transform(raw_documents) 63 | 64 | samples,feautre = x.shape 65 | 66 | write_to_file(newsavepath, '', mode='w+') 67 | 68 | headers=['file_id','label'] 69 | headers+=model.get_feature_names() 70 | txt =','.join(headers) 71 | txt+='\n' 72 | write_to_file(newsavepath,txt,mode='a+') 73 | 74 | for i in range(samples): 75 | 76 | attr = [str(filesid[i]),str(rawy[i])] 77 | 78 | feature = x[i].toarray()[0] 79 | # print(feature.shape) 80 | attr += [str(f) for f in feature] 81 | txt=','.join(attr) 82 | txt+='\n' 83 | write_to_file(newsavepath, txt, mode='a+') 84 | 85 | def api_type_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc): 86 | 87 | if os.path.exists(newsavepath): 88 | print('Exists',newsavepath) 89 | return 90 | 91 | tfidf_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 92 | # pca_save_path = os.path.join(STORAGE,'apicall_tfidf_2_3_50000/webpage.tfidf.pca.model') 93 | 94 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path) 95 | 96 | print(time.asctime(),len(rawy),len(raw_documents)) 97 | 98 | model = load_model(tfidf_save_path) 99 | 100 | x = model.transform(raw_documents) 101 | 102 | samples,feautre = x.shape 103 | 104 | write_to_file(newsavepath, '', mode='w+') 105 | 106 | headers=['file_id','label'] 107 | headers+=model.get_feature_names() 108 | txt =','.join(headers) 109 | txt+='\n' 110 | write_to_file(newsavepath,txt,mode='a+') 111 | 112 | for i in range(samples): 113 | 114 | attr = [str(filesid[i]),str(rawy[i])] 115 | 116 | feature = x[i].toarray()[0] 117 | # print(feature.shape) 118 | attr += [str(f) for f in feature] 119 | txt=','.join(attr) 120 | txt+='\n' 121 | write_to_file(newsavepath, txt, mode='a+') 122 | 123 | def rtn_funcname_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc): 124 | 125 | if os.path.exists(newsavepath): 126 | print('Exists',newsavepath) 127 | return 128 | 129 | tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 130 | 131 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path) 132 | 133 | print(time.asctime(),len(rawy),len(raw_documents)) 134 | 135 | model = load_model(tfidf_save_path) 136 | 137 | x = model.transform(raw_documents) 138 | 139 | samples,feautre = x.shape 140 | 141 | write_to_file(newsavepath, '', mode='w+') 142 | 143 | headers=['file_id','label'] 144 | headers+=model.get_feature_names() 145 | txt =','.join(headers) 146 | txt+='\n' 147 | write_to_file(newsavepath,txt,mode='a+') 148 | 149 | for i in range(samples): 150 | 151 | attr = [str(filesid[i]),str(rawy[i])] 152 | 153 | feature = x[i].toarray()[0] 154 | # print(feature.shape) 155 | attr += [str(f) for f in feature] 156 | txt=','.join(attr) 157 | txt+='\n' 158 | write_to_file(newsavepath, txt, mode='a+') 159 | 160 | def funcname1_tfidf_feature(document_apicall_save_path,newsavepath,loadfunc): 161 | 162 | if os.path.exists(newsavepath): 163 | print('Exists',newsavepath) 164 | return 165 | 166 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 167 | 168 | rawy ,raw_documents,filesid = loadfunc(document_apicall_save_path) 169 | 170 | print(time.asctime(),len(rawy),len(raw_documents)) 171 | 172 | model = load_model(tfidf_save_path) 173 | 174 | x = model.transform(raw_documents) 175 | 176 | samples,feautre = x.shape 177 | 178 | write_to_file(newsavepath, '', mode='w+') 179 | 180 | headers=['file_id','label'] 181 | headers+=model.get_feature_names() 182 | txt =','.join(headers) 183 | txt+='\n' 184 | write_to_file(newsavepath,txt,mode='a+') 185 | 186 | for i 
in range(samples): 187 | 188 | attr = [str(filesid[i]),str(rawy[i])] 189 | 190 | feature = x[i].toarray()[0] 191 | # print(feature.shape) 192 | attr += [str(f) for f in feature] 193 | txt=','.join(attr) 194 | txt+='\n' 195 | write_to_file(newsavepath, txt, mode='a+') 196 | 197 | def rtvalue_tfidf_feature(document_rtnvalue_save_path,newsavepath,loadfunc): 198 | 199 | if os.path.exists(newsavepath): 200 | print('Exists',newsavepath) 201 | return 202 | 203 | tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 204 | 205 | rawy ,raw_documents,filesid = loadfunc(document_rtnvalue_save_path) 206 | 207 | print(time.asctime(),len(rawy),len(raw_documents)) 208 | 209 | model = load_model(tfidf_save_path) 210 | 211 | x = model.transform(raw_documents) 212 | 213 | samples,feautre = x.shape 214 | 215 | write_to_file(newsavepath, '', mode='w+') 216 | 217 | headers=['file_id','label'] 218 | 219 | headers+=model.get_feature_names() 220 | 221 | txt =','.join(headers) 222 | txt+='\n' 223 | 224 | write_to_file(newsavepath,txt,mode='a+') 225 | 226 | for i in range(samples): 227 | 228 | attr = [str(filesid[i]),str(rawy[i])] 229 | 230 | feature = x[i].toarray()[0] 231 | 232 | attr += [str(f) for f in feature] 233 | txt=','.join(attr) 234 | txt+='\n' 235 | write_to_file(newsavepath, txt, mode='a+') 236 | 237 | def feature_merge(merge_save_path,selected_save_path,webpage_vectore_save_path): 238 | 239 | if os.path.exists(merge_save_path): 240 | print('Exists',merge_save_path) 241 | return True 242 | 243 | txt = 'merge from {} {} to merge_save_path {}'.format(selected_save_path,webpage_vectore_save_path,merge_save_path) 244 | print(txt) 245 | selected_feature_set = pd.read_csv(selected_save_path,encoding='utf-8') 246 | txt_feature_set = pd.read_csv(webpage_vectore_save_path,encoding='utf-8') 247 | print('finish reading the two csv file .merging...') 248 | merge = pd.merge(selected_feature_set,txt_feature_set,on=['file_id']) 249 | print('finish merging,export to csv..') 250 | merge.to_csv(merge_save_path,index=False,encoding='utf-8') 251 | 252 | def test_data(): 253 | document_apicall_save_path=os.path.join(STORAGE,'test_apicall.txt') 254 | document_rtnvalue_save_path=os.path.join(STORAGE,'test_rtvalue.txt') 255 | document_rtnvalue_api_save_path=os.path.join(STORAGE,'test_rtnvalue_apicall.txt') 256 | 257 | document_api_type_save_path=os.path.join(STORAGE,'test_apicall.type.txt') 258 | 259 | 260 | funcname1_tfidf_savepath = os.path.join(STORAGE,'testdata/funcname1.tfidf.csv') 261 | funcname23_tfidf_savepath = os.path.join(STORAGE,'testdata/funcname23.tfidf.csv') 262 | rtn_tfidf_savepath = os.path.join(STORAGE,'testdata/rtn.tfidf.csv') 263 | basicfeature = os.path.join(STORAGE,'testdata/basicfeature.csv') 264 | rtn_funcname_tfidf_savepath = os.path.join(STORAGE,'testdata/rtnfuncname.tfidf.csv') 265 | apitype_tfidf_savepath = os.path.join(STORAGE,'testdata/apitype.tfidf.csv') 266 | 267 | funcname1_tfidf_feature(document_apicall_save_path,funcname1_tfidf_savepath,load_documents) 268 | rtvalue_tfidf_feature(document_rtnvalue_save_path,rtn_tfidf_savepath,load_documents) 269 | funcname23_tfidf_feature(document_apicall_save_path,funcname23_tfidf_savepath,load_documents) 270 | rtn_funcname_tfidf_feature(document_rtnvalue_api_save_path,rtn_funcname_tfidf_savepath,load_documents) 271 | api_type_tfidf_feature(document_api_type_save_path,apitype_tfidf_savepath,load_documents) 272 | 273 | merge_basic_funcnam1=os.path.join(STORAGE,'testdata/merge.basic.func1.csv') 274 | 
feature_merge(merge_basic_funcnam1,basicfeature,funcname1_tfidf_savepath) 275 | 276 | merge_basic_funcnam1_rtnvalue=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.csv') 277 | feature_merge(merge_basic_funcnam1_rtnvalue,merge_basic_funcnam1,rtn_tfidf_savepath) 278 | 279 | merge_basic_funcnam1_rtnvalue_funcname23=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.csv') 280 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23,merge_basic_funcnam1_rtnvalue,funcname23_tfidf_savepath) 281 | 282 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.rtnapi.csv') 283 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,merge_basic_funcnam1_rtnvalue_funcname23,rtn_funcname_tfidf_savepath) 284 | 285 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_=os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv') 286 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_,merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,apitype_tfidf_savepath) 287 | 288 | def train_data(): 289 | document_apicall_save_path=os.path.join(STORAGE,'apicall.txt') 290 | document_rtnvalue_save_path=os.path.join(STORAGE,'rtvalue.txt') 291 | document_rtnvalue_api_save_path=os.path.join(STORAGE,'rtnvalue_apicall.txt') 292 | document_api_type_save_path=os.path.join(STORAGE,'apicall.type.txt') 293 | 294 | 295 | funcname1_tfidf_savepath = os.path.join(STORAGE,'data/funcname1.tfidf.csv') 296 | funcname23_tfidf_savepath = os.path.join(STORAGE,'data/funcname23.tfidf.csv') 297 | rtn_funcname_tfidf_savepath = os.path.join(STORAGE,'data/rtnfuncname.tfidf.csv') 298 | rtn_tfidf_savepath = os.path.join(STORAGE,'data/rtn.tfidf.csv') 299 | basicfeature = os.path.join(STORAGE,'data/basicfeature.csv') 300 | apitype_tfidf_savepath = os.path.join(STORAGE,'data/apitype.tfidf.csv') 301 | 302 | 303 | funcname1_tfidf_feature(document_apicall_save_path,funcname1_tfidf_savepath,load_train_documents) 304 | rtvalue_tfidf_feature(document_rtnvalue_save_path,rtn_tfidf_savepath,load_train_documents) 305 | funcname23_tfidf_feature(document_apicall_save_path,funcname23_tfidf_savepath,load_train_documents) 306 | rtn_funcname_tfidf_feature(document_rtnvalue_api_save_path,rtn_funcname_tfidf_savepath,load_train_documents) 307 | api_type_tfidf_feature(document_api_type_save_path,apitype_tfidf_savepath,load_train_documents) 308 | 309 | 310 | merge_basic_funcnam1=os.path.join(STORAGE,'data/merge.basic.func1.csv') 311 | feature_merge(merge_basic_funcnam1,basicfeature,funcname1_tfidf_savepath) 312 | 313 | merge_basic_funcnam1_rtnvalue=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.csv') 314 | feature_merge(merge_basic_funcnam1_rtnvalue,merge_basic_funcnam1,rtn_tfidf_savepath) 315 | 316 | merge_basic_funcnam1_rtnvalue_funcname23=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.csv') 317 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23,merge_basic_funcnam1_rtnvalue,funcname23_tfidf_savepath) 318 | 319 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.rtnapi.csv') 320 | feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,merge_basic_funcnam1_rtnvalue_funcname23,rtn_funcname_tfidf_savepath) 321 | 322 | 323 | merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_=os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv') 324 | 
feature_merge(merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_apitype_,merge_basic_funcnam1_rtnvalue_funcname23_rtnapi_,apitype_tfidf_savepath) 325 | 326 | def main(): 327 | train_data() 328 | test_data() 329 | 330 | if __name__ == "__main__": 331 | 332 | main() 333 | 334 | 335 | -------------------------------------------------------------------------------- /src/file_split.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from conf import STORAGE 4 | from utility import write_to_file 5 | 6 | trainpath=os.path.join(STORAGE,'3rd_security_train/train.csv') 7 | testpath=os.path.join(STORAGE,'3rd_security_test/test.csv') 8 | 9 | def train_file_split(): 10 | 'split the train file to many files by file id.Rows with the same file id should be save in one file' 11 | count=0 12 | with open(trainpath,'r') as f: 13 | for line in f: 14 | count+=1 15 | if count==1: 16 | continue 17 | 18 | splits=line.split(',') 19 | if len(splits)>=2: 20 | lable = splits[1] 21 | fileid=splits[0] 22 | subfolder = lable 23 | 24 | savefolder=os.path.join(STORAGE,'train_flat',subfolder) 25 | os.makedirs(savefolder,exist_ok=True) 26 | savefile=os.path.join(savefolder,str(fileid)+'.txt') 27 | write_to_file(savefile,line) 28 | if count%10000==0: 29 | print(count) 30 | 31 | def test_file_split(mode,left): 32 | 'split the test file to many files by file id.Rows with the same file id should be save in one file' 33 | count=0 34 | 35 | with open(testpath,'r') as f: 36 | for line in f: 37 | count+=1 38 | if count==1: 39 | continue 40 | 41 | splits=line.split(',') 42 | if len(splits)>=2: 43 | fileid=splits[0] 44 | moderesut =int(fileid)%mode 45 | if moderesut == left: 46 | savefolder=os.path.join(STORAGE,'test_flat') 47 | savefolder=os.path.join(savefolder,str(left)) 48 | os.makedirs(savefolder,exist_ok=True) 49 | savefile=os.path.join(savefolder,str(fileid)+'.txt') 50 | write_to_file(savefile,line,mode='a+') 51 | if count%10000==0: 52 | print(count,fileid,mode,left) 53 | 54 | def main(): 55 | 56 | train_file_split() 57 | # u can use muti-process below for a more faster way 58 | test_file_split(4,0) 59 | test_file_split(4,1) 60 | test_file_split(4,2) 61 | test_file_split(4,3) 62 | 63 | if __name__ =='__main__': 64 | 65 | main() 66 | -------------------------------------------------------------------------------- /src/funcname_type_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "regdeletekeya": "AM", 3 | "getaddrinfo": "Z", 4 | "regqueryvalueexa": "AP", 5 | "wsaconnect": "BA", 6 | "internetgetconnectedstateexa": "N", 7 | "ntopenkey": "AO", 8 | "ntopenmutant": "Y", 9 | "internetclosehandle": "M", 10 | "regopenkeyexw": "AP", 11 | "internetgetconnectedstateexw": "N", 12 | "getfileversioninfow": "H", 13 | "ntsavekey": "AQ", 14 | "ntqueryvaluekey": "AP", 15 | "regdeletekeyw": "AM", 16 | "httpqueryinfoa": "L", 17 | "regopenkeyexa": "AP", 18 | "getsystemdirectorya": "BC", 19 | "exitwindowsex": "BD", 20 | "getaddrinfow": "Z", 21 | "regdeletevaluew": "AM", 22 | "bind": "BA", 23 | "setwindowshookexa": "K", 24 | "ldrgetprocedureaddress": "X", 25 | "send": "BA", 26 | "ntdeletekey": "AM", 27 | "regdeletevaluea": "AM", 28 | "regcreatekeyexa": "AL", 29 | "getsystemwindowsdirectoryw": "BC", 30 | "ntprotectvirtualmemory": "AJ", 31 | "oleinitialize": "AB", 32 | "writeconsolea": "T", 33 | "getsystemdirectoryw": "BC", 34 | "getadaptersinfo": "Z", 35 | "ntquerydirectoryfile": "F", 36 | "setfileattributesw": "AZ", 37 | "netgetjoininformation": 
"Z", 38 | "deleteservice": "AV", 39 | "deletefilew": "F", 40 | "wsasend": "BA", 41 | "writeconsolew": "T", 42 | "ldrunloaddll": "AS", 43 | "findwindowexa": "BE", 44 | "getvolumepathnamew": "F", 45 | "enumservicesstatusa": "AX", 46 | "setwindowshookexw": "K", 47 | "wsarecvfrom": "BA", 48 | "regenumvaluea": "AP", 49 | "ntcreatekey": "AL", 50 | "regenumkeyexw": "AP", 51 | "regqueryvalueexw": "AP", 52 | "recvfrom": "BA", 53 | "getdiskfreespaceexw": "H", 54 | "isdebuggerpresent": "B", 55 | "findwindowexw": "BE", 56 | "sendto": "BA", 57 | "getadaptersaddresses": "BA", 58 | "regenumkeyexa": "AP", 59 | "netusergetlocalgroups": "Z", 60 | "process32nextw": "AI", 61 | "wsasendto": "BA", 62 | "ntwritevirtualmemory": "AJ", 63 | "readprocessmemory": "AI", 64 | "ntopenthread": "AH", 65 | "connect": "BA", 66 | "getsockname": "BA", 67 | "regcreatekeyexw": "AL", 68 | "ldrloaddll": "V", 69 | "ntgetcontextthread": "AI", 70 | "thread32first": "AF", 71 | "setfileinformationbyhandle": "AZ", 72 | "lookupaccountsidw": "BC", 73 | "getfileinformationbyhandleex": "H", 74 | "netusergetinfo": "Z", 75 | "writeprocessmemory": "AJ", 76 | "createdirectoryexw": "F", 77 | "regenumkeyw": "AP", 78 | "findwindoww": "BE", 79 | "ntsavekeyex": "AQ", 80 | "enumservicesstatusw": "AX", 81 | "httpsendrequestw": "L", 82 | "select": "BA", 83 | "ntallocatevirtualmemory": "AJ", 84 | "ntcreatethread": "AE", 85 | "ntopenprocess": "AH", 86 | "getinterfaceinfo": "Z", 87 | "copyfileexw": "F", 88 | "ntfreevirtualmemory": "W", 89 | "ntcreatemutant": "Y", 90 | "getcomputernamew": "G", 91 | "getsystemmetrics": "BC", 92 | "ntenumeratevaluekey": "AP", 93 | "ntcreatesection": "AT", 94 | "startservicew": "BB", 95 | "getfilesizeex": "H", 96 | "createthread": "AE", 97 | "ntdeletevaluekey": "AM", 98 | "findresourcew": "AR", 99 | "enumwindows": "BE", 100 | "startservicea": "BB", 101 | "ntclose": "Q", 102 | "getcomputernamea": "G", 103 | "getfileattributesw": "H", 104 | "findresourcea": "AR", 105 | "recv": "BA", 106 | "setinformationjobobject": "AZ", 107 | "process32firstw": "AI", 108 | "getusernameexw": "I", 109 | "setsockopt": "BA", 110 | "socket": "BA", 111 | "ntunloaddriver": "D", 112 | "removedirectoryw": "F", 113 | "ntterminatethread": "AG", 114 | "getusernameexa": "I", 115 | "removedirectorya": "F", 116 | "getasynckeystate": "AP", 117 | "setfilepointer": "AZ", 118 | "getusernamew": "I", 119 | "createservicea": "AW", 120 | "ioctlsocket": "BA", 121 | "getusernamea": "I", 122 | "controlservice": "AU", 123 | "internetgetconnectedstate": "N", 124 | "createremotethreadex": "AE", 125 | "shutdown": "BA", 126 | "createdirectoryw": "F", 127 | "ntdeletefile": "F", 128 | "createservicew": "AW", 129 | "ntopendirectoryobject": "F", 130 | "wsasocketa": "BA", 131 | "wsaaccept": "BA", 132 | "findwindowa": "BE", 133 | "ntloadkey": "AA", 134 | "regsetvalueexa": "AQ", 135 | "ntcreatefile": "F", 136 | "wsasocketw": "BA", 137 | "ntloaddriver": "D", 138 | "createprocessinternalw": "AE", 139 | "listen": "BA", 140 | "getnativesysteminfo": "BC", 141 | "findresourceexa": "AR", 142 | "httpopenrequesta": "L", 143 | "ntsetcontextthread": "AD", 144 | "getfilesize": "H", 145 | "openscmanagerw": "AC", 146 | "ntenumeratekey": "AP", 147 | "accept": "BA", 148 | "ntreadfile": "S", 149 | "getfileversioninfosizew": "H", 150 | "ntterminateprocess": "AG", 151 | "openscmanagera": "AC", 152 | "closesocket": "BA", 153 | "httpopenrequestw": "L", 154 | "urldownloadtofilew": "O", 155 | "getvolumepathnamesforvolumenamew": "F", 156 | "ntquerymultiplevaluekey": "AP", 157 | "regenumvaluew": "AP", 
158 | "internetopenurla": "O", 159 | "ntcreatethreadex": "AE", 160 | "ntopenfile": "R", 161 | "unhookwindowshookex": "K", 162 | "findfirstfileexa": "E", 163 | "shellexecuteexw": "AE", 164 | "internetopenurlw": "O", 165 | "ntopenkeyex": "AO", 166 | "findfirstfileexw": "E", 167 | "getfiletype": "H", 168 | "getfileattributesexw": "H", 169 | "ntsetvaluekey": "AQ", 170 | "module32firstw": "X", 171 | "regsetvalueexw": "AQ", 172 | "wsarecv": "BA", 173 | "setfiletime": "AZ", 174 | "wsastartup": "BA", 175 | "thread32next": "AF", 176 | "getvolumenameforvolumemountpointw": "F", 177 | "ntsuspendthread": "AD", 178 | "ntloadkeyex": "AA", 179 | "ntwritefile": "T", 180 | "internetopenw": "O", 181 | "getfileversioninfoexw": "H", 182 | "getsystemwindowsdirectorya": "BC", 183 | "regclosekey": "AK", 184 | "copyfilew": "F", 185 | "openservicea": "AY", 186 | "ntquerykey": "AP", 187 | "regqueryinfokeyw": "AP", 188 | "openservicew": "AY", 189 | "copyfilea": "F", 190 | "ntloadkey2": "AA", 191 | "setfilepointerex": "AZ", 192 | "wnetgetprovidernamew": "BC", 193 | "ntcreatedirectoryobject": "F", 194 | "outputdebugstringa": "B", 195 | "ldrgetdllhandle": "X", 196 | "regqueryinfokeya": "AP", 197 | "createremotethread": "AE", 198 | "ntopensection": "AT", 199 | "getfileversioninfosizeexw": "H", 200 | "internetconnectw": "U", 201 | "httpsendrequesta": "L", 202 | "internetconnecta": "U", 203 | "gethostbyname": "Z", 204 | "getsystemtimeasfiletime": "BC", 205 | "getfileinformationbyhandle": "H", 206 | "ntreadvirtualmemory": "AI", 207 | "ntresumethread": "AD", 208 | "internetwritefile": "P", 209 | "internetreadfile": "O", 210 | "getdiskfreespacew": "H", 211 | "module32nextw": "X", 212 | "dnsquery_w": "C", 213 | "getsysteminfo": "BC" 214 | } -------------------------------------------------------------------------------- /src/light_gbm_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import time 4 | import json 5 | import traceback 6 | import pandas as pd 7 | import os 8 | import numpy as np 9 | 10 | from sklearn.metrics import log_loss 11 | from sklearn.model_selection import StratifiedShuffleSplit 12 | import lightgbm as lgb 13 | 14 | from dataset import load_data 15 | from utility import write_to_file 16 | from utility import save_model,load_model 17 | from conf import STORAGE 18 | 19 | def model_select_main(storage,datasetpath,featureheaders,targethearders): 20 | 21 | savefolder = 'estimatelgbm' 22 | 23 | x,y = load_data(datasetpath,featureheaders,targethearders,nrows= None) 24 | 25 | depthlist =[-1]+list(range(3,8,2)) 26 | depthlist= [5] #set 27 | 28 | minchildlist=[5,15,20,25,30,40] #grid search min_child_sample 29 | # minchildlist=[60]# set min_child_sample to 60 30 | leavelist=[31,41,51] 31 | min_sum_hessian_in_leafs = [0.007,0.001] 32 | # min_sum_hessian_in_leafs = [0.007] 33 | feature_fractions=[1,0.9,0.8,0.6,0.5] 34 | # feature_fractions=[1] 35 | 36 | drop_rates =[0.1,0.2,0.3,0.4] 37 | # skip_drops =[0.5,0.4,0.6] 38 | skip_drops =[0.5] 39 | boost_rounds = [600,800,900,1000,11000,500,1300] 40 | 41 | for max_depth in depthlist : 42 | for num_leaves in leavelist: 43 | for min_child_sample in minchildlist: 44 | for feature_fraction in feature_fractions: 45 | for min_sum_hessian_in_leaf in min_sum_hessian_in_leafs: 46 | for drop_rate in drop_rates: 47 | for skip_drop in skip_drops: 48 | for boost_round in boost_rounds: 49 | params = { 50 | 'boosting_type': 'dart', 51 | 'objective': 'multiclass', 52 | 'metric': 'multi_logloss', 53 | 
'min_child_samples':min_child_sample, 54 | 'num_leaves':num_leaves, 55 | 'max_depth':max_depth, 56 | 'learning_rate': 0.1, 57 | 'num_class':6, 58 | 'num_threads':27, 59 | 'max_bin':6, 60 | 'lambda_l1':1, 61 | 'lambda_l2':0.8, 62 | 'feature_fraction':feature_fraction, 63 | 'min_sum_hessian_in_leaf':min_sum_hessian_in_leaf, 64 | 'drop_rate':drop_rate, 65 | 'skip_drop':skip_drop, 66 | 'verbose': 1 67 | } 68 | rst = cross_validation(x, y, 5,params,boost_round) 69 | 70 | rst['datasetpath'] = datasetpath 71 | 72 | txt= json.dumps(rst,indent=4) 73 | summarypath =os.path.join(storage,'model/'+savefolder,str(int(time.time()))+'.model.esimate') 74 | write_to_file(summarypath,txt.encode('utf-8'),mode='wb+') 75 | 76 | def cross_validation(x,y,kfolder,params,num_boost_round): 77 | print(time.asctime(),'cross validation') 78 | stime = time.time() 79 | 80 | num, num_feature = x.shape 81 | 82 | # create dataset for lightgbm 83 | # if you want to re-use data, remember to set free_raw_data=False 84 | lgb_train = lgb.Dataset(x, y, 85 | weight=None, free_raw_data=False) 86 | 87 | 88 | # generate a feature name 89 | feature_name = ['feature_' + str(col) for col in range(num_feature)] 90 | 91 | print('Start cross validation...') 92 | # feature_name and categorical_feature 93 | # specify your configurations as a dict 94 | 95 | 96 | result = lgb.cv(params, 97 | lgb_train, 98 | num_boost_round=num_boost_round, 99 | nfold=kfolder, 100 | feature_name=feature_name, 101 | verbose_eval=-1, 102 | show_stdv=True) 103 | print(params) 104 | print(result) 105 | 106 | # print('7th feature name is:', repr(model.feature_name[6])) 107 | return {'parameter':params,'result':result,'total_sample':num,'total_feautre':num_feature,'boost_round':num_boost_round} 108 | 109 | 110 | def model_build(datasetpath,featureheaders,targethearders): 111 | """ 112 | choose the selected model 113 | """ 114 | print(time.asctime(),'loading data start') 115 | 116 | x,y = load_data(datasetpath,featureheaders,targethearders,nrows=None) 117 | 118 | print(time.asctime(),'loading data end','total len',len(x)) 119 | 120 | modelinfo={} 121 | model = None 122 | try : 123 | 124 | modelinfo['train_start_time']=time.asctime() 125 | modelinfo={'name':'xgboost','detail':[],'buildtime':time.asctime()} 126 | 127 | test_size =0 128 | 129 | sample_weight=None 130 | 131 | # specify your configurations as a dict 132 | params = { 133 | 'boosting_type': 'dart', 134 | 'objective': 'multiclass', 135 | 'metric': 'multi_logloss', 136 | 'num_leaves': 31, 137 | 'learning_rate': 0.1, 138 | 'min_child_samples':60, 139 | 'max_depth':5, 140 | 'num_class':6, 141 | 'num_threads':27, 142 | 'max_bin':6, 143 | 'lambda_l1':1, 144 | 'lambda_l2':0.8, 145 | 'min_sum_hessian_in_leaf':0.007, 146 | 'verbose': 1 147 | } 148 | 149 | if test_size == 0: 150 | print('full fit') 151 | lgb_train = lgb.Dataset(x, y, 152 | weight=None, free_raw_data=False) 153 | stime = time.time() 154 | num_train, num_feature = x.shape 155 | # generate a feature name 156 | feature_name = ['feature_' + str(col) for col in range(num_feature)] 157 | 158 | print('Start training...') 159 | # feature_name and categorical_feature 160 | model = lgb.train(params, 161 | lgb_train, 162 | num_boost_round=1000, 163 | # learning_rates=lambda iter: 0.1 * (0.99 ** iter), 164 | feature_name=feature_name) 165 | 166 | else: 167 | sss = StratifiedShuffleSplit(n_splits=1,test_size=test_size) 168 | for train, test in sss.split(x, y): 169 | 170 | print(time.asctime(),'fit') 171 | stime = time.time() 172 | 173 | X_train = x[train] 174 | 
y_train = y[train] 175 | 176 | X_test = x[test] 177 | Y_test = y[test] 178 | 179 | num_train, num_feature = X_train.shape 180 | 181 | # create dataset for lightgbm 182 | # if you want to re-use data, remember to set free_raw_data=False 183 | lgb_train = lgb.Dataset(X_train, y_train, 184 | weight=None, free_raw_data=False) 185 | lgb_eval = lgb.Dataset(X_test, Y_test, reference=lgb_train, 186 | weight=None, free_raw_data=False) 187 | 188 | 189 | 190 | # generate a feature name 191 | feature_name = ['feature_' + str(col) for col in range(num_feature)] 192 | 193 | print('Start training...') 194 | # feature_name and categorical_feature 195 | model = lgb.train(params, 196 | lgb_train, 197 | num_boost_round=2000, 198 | valid_sets=lgb_eval, # eval training data 199 | # learning_rates=lambda iter: 0.1 * (0.99 ** iter), 200 | early_stopping_rounds=40, 201 | feature_name=feature_name) 202 | 203 | x_test_proba = model.predict(X_test) 204 | 205 | loss = log_loss(y[test], x_test_proba,labels=[0,1,2,3,4,5]) 206 | 207 | modelinfo['train_len']=len(train) 208 | modelinfo['train_malicious_len']= str(y[train].sum()) 209 | modelinfo['test_len']=len(test) 210 | modelinfo['test_malicious_len']= str(y[test].sum()) 211 | 212 | x_test_class = np.argmax(x_test_proba,axis=1) 213 | print('x_test_proba',type(x_test_proba)) 214 | print(loss) 215 | 216 | print(pd.crosstab(y[test], x_test_class, rownames=['Actual Species'], colnames=['Predicted Species'])) 217 | 218 | print(time.asctime(),'sample weight',sample_weight) 219 | 220 | result = {'loss': loss, 221 | 'costtime': time.time()-stime} 222 | 223 | modelinfo['detail'].append(result) 224 | 225 | samples,probas = x_test_proba.shape 226 | 227 | print('shape of x_test_proba.shape',x_test_proba.shape) 228 | 229 | predict_save_path = os.path.join(STORAGE,'train_predict.lightgbm.csv') 230 | 231 | write_to_file(predict_save_path,'',mode='w+') 232 | 233 | txt='{},{},{},{},{},{},{},{}\n'.format('file_id','prob0','prob1','prob2','prob3','prob4','prob5','label') 234 | 235 | write_to_file(predict_save_path,txt,mode='w+') 236 | 237 | 238 | for i in range(samples): 239 | attr=[] 240 | attr+=[str(p) for p in x_test_proba[i]] 241 | attr+=[str(y[test][i])] 242 | txt=','.join(attr) 243 | write_to_file(predict_save_path,txt+'\n',mode='a+') 244 | 245 | modelinfo['name']='lightgbm' 246 | modelinfo['attibutes_num']=x.shape[1] 247 | modelinfo['dataset_path']=datasetpath 248 | 249 | modelinfo['sample_weight']=sample_weight 250 | 251 | modelinfo['train_end_time']=time.asctime() 252 | modelinfo['test_size']=test_size 253 | 254 | except Exception as e: 255 | traceback.print_exc() 256 | print('excepion',str(e)) 257 | return 'lightgbm',model,modelinfo 258 | 259 | def model_build_main(storage,datasetpath,featureheaders,targethearders): 260 | 261 | name,clf,modelinfo = model_build(datasetpath,featureheaders,targethearders) 262 | 263 | summarypath = os.path.join(storage,'model/lightgbm.model.esimate') 264 | modelsavepath=os.path.join(storage,'model/lightgbm.model') 265 | modelinfosavepath = os.path.join(storage,'model/lightgbm.modelinfo') 266 | 267 | txt=json.dumps(modelinfo,indent=4) 268 | 269 | write_to_file(modelinfosavepath, txt.encode('utf-8'), mode='wb+') 270 | 271 | save_model(clf,modelsavepath) 272 | 273 | print('model summary:' ) 274 | 275 | print('save model summary->',summarypath ) 276 | write_to_file(summarypath,txt.encode('utf-8'),mode='wb+') 277 | 278 | def main(): 279 | 280 | feature_save_path = os.path.join(STORAGE,'data/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv') 281 | 282 
| rtn_tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 283 | func1_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 284 | func23_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model') 285 | rtnfunc_tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 286 | 287 | featureheaders = [ 288 | 289 | ] 290 | 291 | basicfeature = [ 292 | 'threadnum', 293 | 'totalapicall', 294 | 'maxapicall', 295 | 'minapicall', 296 | 'meanapicallperthread' 297 | ] 298 | 299 | featureheaders+=basicfeature 300 | 301 | model = load_model(func1_tfidf_save_path) 302 | featureheaders += model.get_feature_names() 303 | 304 | model = load_model(rtn_tfidf_save_path) 305 | featureheaders += model.get_feature_names() 306 | 307 | model = load_model(func23_tfidf_save_path) 308 | featureheaders += model.get_feature_names() 309 | 310 | model = load_model(rtnfunc_tfidf_save_path) 311 | featureheaders += model.get_feature_names() 312 | 313 | apitype_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 314 | model = load_model(apitype_save_path) 315 | featureheaders += model.get_feature_names() 316 | 317 | print(time.asctime(),'total header len',len(featureheaders)) 318 | 319 | targethearders=['label_x'] 320 | 321 | # set ModelSelect to True for parameter tuning 322 | ModelSelect = False 323 | 324 | if ModelSelect: 325 | model_select_main(STORAGE,feature_save_path,featureheaders,targethearders) 326 | else: 327 | model_build_main(STORAGE, 328 | feature_save_path, 329 | featureheaders, 330 | targethearders) 331 | 332 | if __name__ == '__main__': 333 | 334 | main() 335 | 336 | 337 | 338 | 339 | -------------------------------------------------------------------------------- /src/model_predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from conf import STORAGE 4 | 5 | from utility import load_model, write_to_file 6 | 7 | def load_data(savepath,featureheaders,targethearders,nrows=None): 8 | 9 | dataset = pd.read_csv(savepath,header = 0,nrows =nrows) 10 | 11 | x = dataset[featureheaders] 12 | y = dataset[targethearders] 13 | l =dataset['label_x'] 14 | 15 | return x.values,y.values.reshape((len(y.values),)),l.values.reshape((len(y.values),)) 16 | 17 | def main(): 18 | 19 | modelsavepath=os.path.join(STORAGE,'model/lightgbm.model') 20 | 21 | clf = load_model(modelsavepath) 22 | 23 | rtn_tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 24 | func1_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 25 | func23_tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model') 26 | rtn_apicall_tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 27 | apitype_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 28 | 29 | 30 | featureheaders = [ 31 | 32 | ] 33 | 34 | basicfeature = [ 35 | 'threadnum', 36 | 'totalapicall', 37 | 'maxapicall', 38 | 'minapicall', 39 | 'meanapicallperthread' 40 | ] 41 | 42 | featureheaders+=basicfeature 43 | 44 | model = load_model(func1_tfidf_save_path) 45 | 46 | featureheaders += model.get_feature_names() 47 | 48 | model = load_model(rtn_tfidf_save_path) 49 | 50 | featureheaders += model.get_feature_names() 51 | 52 | model = load_model(func23_tfidf_save_path) 53 | 54 | featureheaders += model.get_feature_names() 55 | 56 | model = load_model(rtn_apicall_tfidf_save_path) 57 | 58 | featureheaders += 
model.get_feature_names() 59 | 60 | model = load_model(apitype_save_path) 61 | 62 | featureheaders += model.get_feature_names() 63 | 64 | targethearders=['file_id'] 65 | 66 | x,file_ids,labels = load_data(os.path.join(STORAGE,'testdata/merge.basic.func1.rtnvalue.func23.rtnapi.apitype.csv'), 67 | featureheaders, 68 | targethearders) 69 | 70 | x_test_proba = clf.predict(x)#predict_proba 71 | 72 | samples,probas = x_test_proba.shape 73 | 74 | print('shape of x_test_proba.shape',x_test_proba.shape) 75 | 76 | predict_save_path = os.path.join(STORAGE,'test_predict.lightgbm.csv') 77 | 78 | write_to_file(predict_save_path,'',mode='w+') 79 | 80 | txt='{},{},{},{},{},{},{}\n'.format('file_id','prob0','prob1','prob2','prob3','prob4','prob5') 81 | 82 | write_to_file(predict_save_path,txt,mode='w+') 83 | 84 | 85 | for i in range(samples): 86 | attr=[str(file_ids[i])] 87 | attr+=[str(p) for p in x_test_proba[i]] 88 | # attr+=[str(labels[i])] 89 | txt=','.join(attr) 90 | write_to_file(predict_save_path,txt+'\n',mode='a+') 91 | 92 | 93 | if __name__ =="__main__": 94 | main() -------------------------------------------------------------------------------- /src/preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import traceback 5 | 6 | from conf import STORAGE 7 | from utility import write_to_file,read_replace,get_replace_type_name 8 | 9 | 10 | def load_file(filepath): 11 | """load the file change the all the row to json format""" 12 | stime=time.time() 13 | document={} 14 | if not os.path.exists(filepath): 15 | print('file not exits',filepath) 16 | return 17 | with open(filepath,'r') as f: 18 | for line in f: 19 | line=line.strip('\n') 20 | if line: 21 | splits = line.split(',') 22 | fileid =int(splits[0]) 23 | lable=int(splits[1]) 24 | funcname=splits[2] 25 | tid=splits[3] 26 | rtnvalue=splits[4] 27 | index=splits[5] 28 | 29 | if 'threads' not in document: 30 | document['threads']={} 31 | document['file_id']=fileid 32 | document['lable']=lable 33 | if tid not in document['threads']: 34 | document['threads'][tid]={'api_calls':[]} 35 | document['threads'][tid]['api_calls'].append([index,funcname,rtnvalue]) 36 | for tid in document['threads']: 37 | apicalls = document['threads'][tid]['api_calls'] 38 | sortedcalls = sorted(apicalls,key=lambda s: int(s[0])) 39 | document['threads'][tid]['api_calls'] = sortedcalls 40 | 41 | return document 42 | 43 | 44 | def load_test_file(filepath): 45 | 46 | stime=time.time() 47 | document={} 48 | if not os.path.exists(filepath): 49 | print('file not exits',filepath) 50 | return 51 | with open(filepath,'r') as f: 52 | for line in f: 53 | line=line.strip('\n') 54 | if line: 55 | splits = line.split(',') 56 | fileid =int(splits[0]) 57 | funcname=splits[1] 58 | tid=splits[2] 59 | rtnvalue=splits[3] 60 | index=splits[4] 61 | 62 | if 'threads' not in document: 63 | document['threads']={} 64 | document['file_id']=fileid 65 | 66 | if tid not in document['threads']: 67 | document['threads'][tid]={'api_calls':[]} 68 | document['threads'][tid]['api_calls'].append([index,funcname,rtnvalue]) 69 | for tid in document['threads']: 70 | apicalls = document['threads'][tid]['api_calls'] 71 | sortedcalls = sorted(apicalls,key=lambda s: int(s[0])) 72 | document['threads'][tid]['api_calls'] = sortedcalls 73 | 74 | return document 75 | 76 | def train_json_to_txt(): 77 | 78 | folder=os.path.join(STORAGE,'train_json') 79 | save_apicalls=os.path.join(STORAGE,'apicall.txt') 80 | 
save_rtvalue=os.path.join(STORAGE,'rtvalue.txt') 81 | 82 | write_to_file(save_apicalls, '',mode='w+') 83 | write_to_file(save_rtvalue, '',mode='w+') 84 | 85 | 86 | for subfolder in os.listdir(folder): 87 | fullsubfolder = os.path.join(folder,subfolder) 88 | for fname in os.listdir(fullsubfolder): 89 | fullname=os.path.join(fullsubfolder,fname) 90 | with open(fullname) as f: 91 | jsondata = json.load(f) 92 | common=[str(jsondata['file_id']),str(jsondata['lable'])] 93 | apis=[] 94 | rtns=[] 95 | for tid in jsondata['threads']: 96 | 97 | api_calls = jsondata['threads'][tid]['api_calls'] 98 | 99 | apis+=[apicall[1] for apicall in api_calls] 100 | rtns+=[apicall[2] for apicall in api_calls] 101 | 102 | apis+=['.'] 103 | rtns+=['.'] 104 | 105 | txt=' '.join(apis) 106 | txt=','.join(common+[txt]) 107 | write_to_file(save_apicalls, txt+'\n') 108 | 109 | txt =' '.join(rtns) 110 | txt=','.join(common+[txt]) 111 | 112 | write_to_file(save_rtvalue, txt+'\n') 113 | 114 | 115 | def test_json_to_txt(): 116 | 117 | folder=os.path.join(STORAGE,'test_json') 118 | save_apicalls=os.path.join(STORAGE,'test_apicall.txt') 119 | save_rtvalue=os.path.join(STORAGE,'test_rtvalue.txt') 120 | 121 | write_to_file(save_apicalls, '',mode='w+') 122 | write_to_file(save_rtvalue, '',mode='w+') 123 | 124 | for subfolder in os.listdir(folder): 125 | fullsubfolder = os.path.join(folder,subfolder) 126 | for fname in os.listdir(fullsubfolder): 127 | fullname=os.path.join(fullsubfolder,fname) 128 | with open(fullname) as f: 129 | jsondata = json.load(f) 130 | common=[str(jsondata['file_id'])] 131 | apis=[] 132 | rtns=[] 133 | for tid in jsondata['threads']: 134 | 135 | api_calls = jsondata['threads'][tid]['api_calls'] 136 | apis+=[apicall[1] for apicall in api_calls] 137 | rtns+=[apicall[2] for apicall in api_calls] 138 | 139 | apis+=['.'] 140 | rtns+=['.'] 141 | 142 | txt=' '.join(apis) 143 | txt=','.join(common+[txt]) 144 | write_to_file(save_apicalls, txt+'\n',mode='a+') 145 | 146 | txt =' '.join(rtns) 147 | txt=','.join(common+[txt]) 148 | 149 | write_to_file(save_rtvalue, txt+'\n',mode='a+') 150 | 151 | def train_json_to_txt_type(replacedic): 152 | 153 | folder=os.path.join(STORAGE,'train_json') 154 | save_apicalls=os.path.join(STORAGE,'apicall.type.txt') 155 | 156 | write_to_file(save_apicalls, '',mode='w+') 157 | 158 | for subfolder in os.listdir(folder): 159 | fullsubfolder = os.path.join(folder,subfolder) 160 | for fname in os.listdir(fullsubfolder): 161 | fullname=os.path.join(fullsubfolder,fname) 162 | with open(fullname) as f: 163 | jsondata = json.load(f) 164 | common=[str(jsondata['file_id']),str(jsondata['lable'])] 165 | apis=[] 166 | rtns=[] 167 | for tid in jsondata['threads']: 168 | 169 | api_calls = jsondata['threads'][tid]['api_calls'] 170 | 171 | apis+=[get_replace_type_name(apicall[1],replacedic) for apicall in api_calls] 172 | rtns+=[apicall[2] for apicall in api_calls] 173 | 174 | apis+=['.'] 175 | rtns+=['.'] 176 | 177 | txt=' '.join(apis) 178 | txt=','.join(common+[txt]) 179 | write_to_file(save_apicalls, txt+'\n') 180 | 181 | txt =' '.join(rtns) 182 | txt=','.join(common+[txt]) 183 | 184 | 185 | def test_json_to_txt_type(replacedic): 186 | 187 | folder=os.path.join(STORAGE,'test_json') 188 | 189 | save_apicalls=os.path.join(STORAGE,'test_apicall.type.txt') 190 | 191 | write_to_file(save_apicalls, '',mode='w+') 192 | 193 | for subfolder in os.listdir(folder): 194 | fullsubfolder = os.path.join(folder,subfolder) 195 | for fname in os.listdir(fullsubfolder): 196 | fullname=os.path.join(fullsubfolder,fname) 
197 | with open(fullname) as f: 198 | jsondata = json.load(f) 199 | common=[str(jsondata['file_id'])] 200 | apis=[] 201 | rtns=[] 202 | for tid in jsondata['threads']: 203 | 204 | api_calls = jsondata['threads'][tid]['api_calls'] 205 | apis+=[get_replace_type_name(apicall[1],replacedic) for apicall in api_calls] 206 | rtns+=[apicall[2] for apicall in api_calls] 207 | 208 | apis+=['.'] 209 | rtns+=['.'] 210 | 211 | txt=' '.join(apis) 212 | txt=','.join(common+[txt]) 213 | write_to_file(save_apicalls, txt+'\n',mode='a+') 214 | 215 | txt =' '.join(rtns) 216 | txt=','.join(common+[txt]) 217 | 218 | def folder_to_json(folder,fileparser,save_folder): 219 | stime=time.time() 220 | count=0 221 | for subfolder in os.listdir(folder): 222 | fullsubfolder = os.path.join(folder,subfolder) 223 | save_subfolder = os.path.join(save_folder,subfolder) 224 | os.makedirs(save_subfolder,exist_ok=True) 225 | for fname in os.listdir(fullsubfolder): 226 | try: 227 | fullname=os.path.join(save_subfolder,fname) 228 | document = fileparser(fullname) 229 | txt = json.dumps(document) 230 | write_to_file(fullname,txt,mode='a+') 231 | count+=1 232 | if count%1000==0: 233 | print(count) 234 | except Exception as e: 235 | traceback.print_exc() 236 | print(fullname) 237 | 238 | print(time.time()-stime,count) 239 | 240 | def train_json_to_txt_merge(): 241 | """json file to rtn_api sequence""" 242 | 243 | folder=os.path.join(STORAGE,'train_json') 244 | save_rtn_apicalls=os.path.join(STORAGE,'rtnvalue_apicall.txt') 245 | 246 | write_to_file(save_rtn_apicalls, '',mode='w+') 247 | # write_to_file(save_rtn_apicalls, ['file_id','label','text'],mode='a+') 248 | for subfolder in os.listdir(folder): 249 | fullsubfolder = os.path.join(folder,subfolder) 250 | for fname in os.listdir(fullsubfolder): 251 | fullname=os.path.join(fullsubfolder,fname) 252 | with open(fullname) as f: 253 | jsondata = json.load(f) 254 | common=[str(jsondata['file_id']),str(jsondata['lable'])] 255 | 256 | rtn_apis=[] 257 | for tid in jsondata['threads']: 258 | api_calls = jsondata['threads'][tid]['api_calls'] 259 | rtn_apis+=[apicall[2]+'_'+apicall[1] for apicall in api_calls] 260 | rtn_apis+=['.'] 261 | 262 | txt=' '.join(rtn_apis) 263 | txt=','.join(common+[txt]) 264 | write_to_file(save_rtn_apicalls, txt+'\n') 265 | 266 | def test_json_to_txt_merge(): 267 | """json file to rtn_api sequence""" 268 | 269 | folder=os.path.join(STORAGE,'test_json') 270 | save_rtn_apicalls=os.path.join(STORAGE,'test_rtnvalue_apicall.txt') 271 | 272 | write_to_file(save_rtn_apicalls, '',mode='w+') 273 | 274 | for subfolder in os.listdir(folder): 275 | fullsubfolder = os.path.join(folder,subfolder) 276 | for fname in os.listdir(fullsubfolder): 277 | fullname=os.path.join(fullsubfolder,fname) 278 | with open(fullname) as f: 279 | jsondata = json.load(f) 280 | common=[str(jsondata['file_id'])] 281 | 282 | rtn_apis=[] 283 | for tid in jsondata['threads']: 284 | api_calls = jsondata['threads'][tid]['api_calls'] 285 | rtn_apis+=[apicall[2]+'_'+apicall[1] for apicall in api_calls] 286 | rtn_apis+=['.'] 287 | 288 | txt=' '.join(rtn_apis) 289 | txt=','.join(common+[txt]) 290 | write_to_file(save_rtn_apicalls, txt+'\n',mode='a+') 291 | 292 | def main(): 293 | 294 | replacedic = read_replace('./funcname_type_map.json') 295 | 296 | folder_to_json(os.path.join(STORAGE,'train_flat'), 297 | load_test_file, 298 | os.path.join(STORAGE,'train_json')) 299 | 300 | folder_to_json(os.path.join(STORAGE,'test_flat'), 301 | load_test_file, 302 | os.path.join(STORAGE,'test_json')) 303 | 304 | 
train_json_to_txt() 305 | test_json_to_txt() 306 | 307 | train_json_to_txt_merge() 308 | test_json_to_txt_merge() 309 | 310 | train_json_to_txt_type(replacedic) 311 | test_json_to_txt_type(replacedic) 312 | 313 | 314 | if __name__=="__main__": 315 | main() -------------------------------------------------------------------------------- /src/tfidf_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Use tfidf to extract the text feature; 3 | The text feature include page title and page text. 4 | """ 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | 11 | from utility import write_to_file,load_model,save_model 12 | from conf import STORAGE 13 | 14 | def load_documents(document_save_path): 15 | 16 | dataset = pd.read_csv(document_save_path,names=['file_id','label','txt'],nrows = None) 17 | 18 | labels = [] 19 | docs = [] 20 | 21 | for index,row in dataset.iterrows(): 22 | label = row['label'] 23 | content = row['txt'] 24 | 25 | labels.append(int(label)) 26 | docs.append(content) 27 | 28 | return labels,docs 29 | 30 | def load_test_documents(document_save_path): 31 | 32 | dataset = pd.read_csv(document_save_path,names=['file_id','txt'],nrows = None) 33 | 34 | labels = [] 35 | docs = [] 36 | filesid=[] 37 | 38 | for index,row in dataset.iterrows(): 39 | # label = row['lable'] 40 | content = row['txt'] 41 | fileid = row['file_id'] 42 | labels.append(-1) 43 | docs.append(content) 44 | filesid.append(fileid) 45 | 46 | return labels,docs,filesid 47 | 48 | def display_scores(vectorizer, tfidf_result,savesubfolder): 49 | # http://stackoverflow.com/questions/16078015/ 50 | 51 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary.txt'.format(savesubfolder)), b'','wb+') 52 | for fea_name in vectorizer.get_feature_names(): 53 | fea_name=fea_name+'\n' 54 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary.txt'.format(savesubfolder)), fea_name.encode('utf-8')) 55 | 56 | scores = zip(vectorizer.get_feature_names(), 57 | np.asarray(tfidf_result.sum(axis=0)).ravel()) 58 | 59 | sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True) 60 | 61 | index = 0 62 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary_top.txt'.format(savesubfolder)), b'','wb+') 63 | for item in sorted_scores: 64 | index+=1 65 | txt = "{0} {1:50} Score: {2}\n".format(index,repr(item[0]), item[1]) 66 | write_to_file(os.path.join(STORAGE,'{}/webpage.vocabulary_top.txt'.format(savesubfolder)), txt.encode('utf-8')) 67 | 68 | 69 | def api_call_tfidf(): 70 | 71 | document_title_content_save_path=os.path.join(STORAGE,'apicall.txt') 72 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.txt') 73 | 74 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf/webpage.tfidf.model') 75 | 76 | rawy ,raw_documents = load_documents(document_title_content_save_path) 77 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 78 | 79 | documents=raw_documents+docs 80 | 81 | print(len(documents),len(documents)) 82 | 83 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 84 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(2, 3),max_features=5000) #apicall 85 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 86 | 87 | x = model.fit_transform(documents) 88 | 89 | 
save_model(model,tfidf_save_path) 90 | 91 | display_scores(model,x,'apicall_tfidf') 92 | 93 | def api_type_tfidf(): 94 | 95 | 96 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.type.txt') 97 | document_title_content_save_path=os.path.join(STORAGE,'apicall.type.txt') 98 | 99 | 100 | 101 | tfidf_save_path = os.path.join(STORAGE,'api_type_tfidf/webpage.tfidf.model') 102 | 103 | rawy ,raw_documents = load_documents(document_title_content_save_path) 104 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 105 | 106 | documents=raw_documents+docs 107 | 108 | print(len(documents),len(documents)) 109 | 110 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 111 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(4, 5),max_features=2000) #apicall 112 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 113 | 114 | x = model.fit_transform(documents) 115 | 116 | save_model(model,tfidf_save_path) 117 | 118 | display_scores(model,x,'api_type_tfidf') 119 | 120 | def api_type_tfidf_5(): 121 | 122 | 123 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.type.txt') 124 | document_title_content_save_path=os.path.join(STORAGE,'apicall.type.txt') 125 | 126 | tfidf_save_path = os.path.join(STORAGE,'api_type_tfidf_5/webpage.tfidf.model') 127 | 128 | rawy ,raw_documents = load_documents(document_title_content_save_path) 129 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 130 | 131 | documents=raw_documents+docs 132 | 133 | print(len(documents),len(documents)) 134 | 135 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 136 | model = TfidfVectorizer(min_df = 3,decode_error ='ignore',stop_words='english',ngram_range=(5, 5),max_features=1000) #apicall 137 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 138 | 139 | x = model.fit_transform(documents) 140 | 141 | save_model(model,tfidf_save_path) 142 | 143 | display_scores(model,x,'api_type_tfidf_5') 144 | def api_call_tfidf_1(): 145 | 146 | document_title_content_save_path=os.path.join(STORAGE,'apicall.txt') 147 | test_document_title_content_save_path=os.path.join(STORAGE,'test_apicall.txt') 148 | 149 | tfidf_save_path = os.path.join(STORAGE,'apicall_tfidf_1/webpage.tfidf.model') 150 | 151 | rawy ,raw_documents = load_documents(document_title_content_save_path) 152 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 153 | 154 | documents=raw_documents+docs 155 | 156 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1)) 157 | 158 | x = model.fit_transform(documents) 159 | 160 | save_model(model,tfidf_save_path) 161 | 162 | display_scores(model,x,'apicall_tfidf_1') 163 | 164 | def rtnvalue_api_call_tfidf(): 165 | 166 | document_title_content_save_path=os.path.join(STORAGE,'rtnvalue_apicall.txt') 167 | test_document_title_content_save_path=os.path.join(STORAGE,'test_rtnvalue_apicall.txt') 168 | 169 | tfidf_save_path = os.path.join(STORAGE,'rtn_apicall_tfidf/webpage.tfidf.model') 170 | 171 | rawy ,raw_documents = load_documents(document_title_content_save_path) 172 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 173 | 174 | documents=raw_documents+docs 175 | 176 | 
print(len(documents),len(documents)) 177 | 178 | # model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=50000) 179 | # model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(2, 3),max_features=5000) #apicall 180 | model = TfidfVectorizer(decode_error ='ignore',stop_words='english',ngram_range=(1, 1),max_features=1500) 181 | 182 | x = model.fit_transform(documents) 183 | 184 | save_model(model,tfidf_save_path) 185 | 186 | display_scores(model,x,'rtn_apicall_tfidf') 187 | 188 | def rtnvalue__tfidf(): 189 | 190 | document_title_content_save_path=os.path.join(STORAGE,'rtvalue.txt') 191 | test_document_title_content_save_path=os.path.join(STORAGE,'test_rtvalue.txt') 192 | 193 | tfidf_save_path = os.path.join(STORAGE,'rtnvalue_tfidf/webpage.tfidf.model') 194 | 195 | rawy ,raw_documents = load_documents(document_title_content_save_path) 196 | labels,docs,filesid = load_test_documents(test_document_title_content_save_path) 197 | 198 | documents=raw_documents+docs 199 | 200 | print(len(documents),len(documents)) 201 | 202 | model = TfidfVectorizer(min_df = 4,decode_error ='ignore',stop_words='english',ngram_range=(1, 3),max_features=3000) 203 | 204 | x = model.fit_transform(documents) 205 | 206 | save_model(model,tfidf_save_path) 207 | 208 | display_scores(model,x,'rtnvalue_tfidf') 209 | 210 | 211 | def main(): 212 | 213 | api_type_tfidf_5() 214 | api_call_tfidf_1() 215 | api_call_tfidf() 216 | rtnvalue_api_call_tfidf() 217 | rtnvalue__tfidf() 218 | 219 | if __name__ == "__main__": 220 | 221 | main() 222 | 223 | 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /src/utility.py: -------------------------------------------------------------------------------- 1 | try: 2 | import _pickle as cPickle 3 | except Exception as e: 4 | import cPickle 5 | 6 | import logging 7 | import json 8 | 9 | log = logging.getLogger(__name__) 10 | 11 | def save_model(clf,modelpath): 12 | with open(modelpath, 'wb') as f: 13 | cPickle.dump(clf, f) 14 | 15 | def load_model(modelpath): 16 | try: 17 | with open(modelpath, 'rb') as f: 18 | rf = cPickle.load(f) 19 | return rf 20 | except Exception as e: 21 | log.exception('load model error {}'.format(modelpath)) 22 | return None 23 | 24 | def write_to_file(path,txt,mode='ab+'): 25 | with open(path,mode=mode) as f: 26 | f.write(txt) 27 | 28 | def get_replace_type_name(name,replacedic): 29 | name=name.lower() 30 | return replacedic.get(name,'') 31 | 32 | def read_replace(path): 33 | with open(path) as f: 34 | data = json.load(f) 35 | return data 36 | --------------------------------------------------------------------------------
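For reference, a small usage sketch of the helpers above: read_replace and get_replace_type_name are the functions preprocess.py uses to map raw API names onto the short type codes from funcname_type_map.json (the "api type" documents). The path and the example API names are illustrative.

```python
# Sketch only: map an API-call sequence to its type codes via utility.py helpers.
from utility import read_replace, get_replace_type_name

# assumes the script is run from src/, as preprocess.main() does
replacedic = read_replace('./funcname_type_map.json')

apis = ['LdrLoadDll', 'NtCreateFile', 'NtWriteFile', 'RegOpenKeyExW', 'connect']
types = [get_replace_type_name(name, replacedic) for name in apis]
print(types)   # -> ['V', 'F', 'T', 'AP', 'BA'] with the shipped mapping
```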