├── run.sh
├── README.md
├── secondrandomforest.py
├── combine.py
├── firstrandomforest.py
├── randomsubset.py
├── asmimage.py
└── opcode_n-gram.py


/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Run the whole experiment: extract both feature sets, train one random
# forest per feature set, then train one on the combined features.
#
# Abort on the first failing step so that later steps never train on a
# missing or stale feature CSV left over from an earlier run.
set -e

python asmimage.py            # writes imgfeature.csv
python opcode_n-gram.py       # writes 3gramfeature.csv
python firstrandomforest.py   # reads imgfeature.csv
python secondrandomforest.py  # reads 3gramfeature.csv
python combine.py             # reads both feature files


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 微软恶意代码分类
 2 | 
 3 | - 比赛说明 https://www.kaggle.com/c/malware-classification/
 4 | 
 5 | - 数据下载 [Google Drive](https://drive.google.com/open?id=1C1Tc1dh8W8f3x6vn-Ado4mcBhn-EpaCO), [OneDrive](https://dlpwn-my.sharepoint.com/:f:/g/personal/deep_dlpwn_onmicrosoft_com/EhljOjr5llZBuUKWjWwKHS8BS8JrHyS76pzRU9RDKMDeoQ?e=dUgBNU)
 6 | 
 7 | ## 代码说明
 8 | - `randomsubset.py` 抽取训练子集
 9 | - `asmimage.py` ASM文件图像纹理特征
10 | - `opcode_n-gram.py` Opcode n-gram特征
11 | - `firstrandomforest.py` 基于ASM文件图像纹理特征的随机森林
12 | - `secondrandomforest.py` 基于Opcode n-gram特征的随机森林
13 | - `combine.py` 将两种类型的特征结合
14 | 
15 | ## 运行说明
16 | 
17 | 1. 将完整的训练数据集解压，修改`randomsubset.py`中的路径并运行
18 | 2. 修改`asmimage.py`和`opcode_n-gram.py`中的路径，并运行`run.sh`，耐心等待即可看到结果
19 | 


--------------------------------------------------------------------------------
/secondrandomforest.py:
--------------------------------------------------------------------------------
 1 | from sklearn.ensemble import RandomForestClassifier as RF
 2 | from sklearn import cross_validation
 3 | from sklearn.metrics import confusion_matrix
 4 | import pandas as pd
 5 | 
 6 | 
 7 | subtrainLabel = pd.read_csv('subtrainLabels.csv')
 8 | subtrainfeature = pd.read_csv("3gramfeature.csv")
 9 | subtrain = pd.merge(subtrainLabel,subtrainfeature,on='Id')
10 | labels = subtrain.Class
11 | subtrain.drop(["Class","Id"], axis=1, inplace=True)
12 | subtrain = subtrain.as_matrix()
13 | 
14 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(subtrain,labels,test_size=0.4)
15 | 
16 | srf = RF(n_estimators=500, n_jobs=-1)
17 | srf.fit(X_train,y_train)
18 | print srf.score(X_test,y_test)
19 | # y_pred = srf.predict(X_test)
20 | # print confusion_matrix(y_test, y_pred)


--------------------------------------------------------------------------------
/combine.py:
--------------------------------------------------------------------------------
 1 | from sklearn.ensemble import RandomForestClassifier as RF
 2 | from sklearn import cross_validation
 3 | from sklearn.metrics import confusion_matrix
 4 | import pandas as pd
 5 | 
 6 | 
 7 | subtrainLabel = pd.read_csv('subtrainLabels.csv')
 8 | subtrainfeature1 = pd.read_csv("3gramfeature.csv")
 9 | subtrainfeature2 = pd.read_csv("imgfeature.csv")
10 | subtrain = pd.merge(subtrainfeature1,subtrainfeature2,on='Id')
11 | subtrain = pd.merge(subtrain,subtrainLabel,on='Id')
12 | labels = subtrain.Class
13 | subtrain.drop(["Class","Id"], axis=1, inplace=True)
14 | subtrain = subtrain.as_matrix()
15 | 
16 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(subtrain,labels,test_size=0.4)
17 | 
18 | srf = RF(n_estimators=500, n_jobs=-1)
19 | srf.fit(X_train,y_train)
20 | print srf.score(X_test,y_test)


--------------------------------------------------------------------------------
/firstrandomforest.py:
--------------------------------------------------------------------------------
 1 | from sklearn.ensemble import RandomForestClassifier as RF
 2 | from sklearn import cross_validation
 3 | from sklearn.metrics import confusion_matrix
 4 | import pandas as pd
 5 | import numpy as np
 6 | 
 7 | 
 8 | subtrainLabel = pd.read_csv('subtrainLabels.csv')
 9 | subtrainfeature = pd.read_csv("imgfeature.csv")
10 | subtrain = pd.merge(subtrainLabel,subtrainfeature,on='Id')
11 | labels = subtrain.Class
12 | subtrain.drop(["Class","Id"], axis=1, inplace=True)
13 | subtrain = subtrain.as_matrix()
14 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(subtrain,labels,test_size=0.4)
15 | 
16 | srf = RF(n_estimators=500, n_jobs=-1)
17 | srf.fit(X_train,y_train)
18 | print srf.score(X_test,y_test)
19 | 
20 | # importances = srf.feature_importances_
21 | # std = np.std([tree.feature_importances_ for tree in srf.estimators_],axis=0)
22 | # indices = np.argsort(importances)[::-1]
23 | # print("Feature ranking:")
24 | # for f in range(20):
25 | #     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
26 | 


--------------------------------------------------------------------------------
/randomsubset.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from random import *
 3 | import pandas as pd
 4 | import shutil
 5 | 
 6 | rs = Random()
 7 | rs.seed(1)
 8 | 
 9 | trainlabels = pd.read_csv('trainLabels.csv')
10 | fids = []
11 | opd = pd.DataFrame()
12 | for clabel in range (1,10):
13 |     mids = trainlabels[trainlabels.Class == clabel]
14 |     mids = mids.reset_index(drop=True)
15 | 
16 |     rchoice = [rs.randint(0,len(mids)-1) for i in range(100)]
17 |     print rchoice   
18 |     
19 | #     for i in rchoice:
20 | #         fids.append(mids.loc[i].Id)
21 | #         opd = opd.append(mids.loc[i])
22 | 
23 |     rids = [mids.loc[i].Id for i in rchoice]
24 |     fids.extend(rids)
25 |     opd = opd.append(mids.loc[rchoice])
26 |     
27 | 
28 | print len(fids)
29 | opd = opd.reset_index(drop=True)
30 | print opd
31 | opd.to_csv('subtrainLabels.csv', encoding='utf-8', index=False)
32 | 
33 | sbase = '/home/moon/train/'
34 | tbase = '/home/moon/subtrain/'
35 | 
36 | for fid in fids:
37 | 	fnames = ['{0}.asm'.format(fid),'{0}.bytes'.format(fid)]
38 | 	for fname in fnames:
39 | 		cspath = sbase + fname
40 | 		ctpath = tbase + fname
41 | 		shutil.copy(cspath,ctpath)


--------------------------------------------------------------------------------
/asmimage.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy
 3 | from collections import *
 4 | import pandas as pd
 5 | import binascii
 6 | 
 7 | 
 8 | def getMatrixfrom_bin(filename, width = 512, oneRow = False):
 9 |     with open(filename, 'rb') as f:
10 |         content = f.read()
11 |     hexst = binascii.hexlify(content)
12 |     fh = numpy.array([int(hexst[i:i+2],16) for i in range(0, len(hexst), 2)])
13 |     if oneRow is False:
14 |         rn = len(fh)/width
15 |         fh = numpy.reshape(fh[:rn*width],(-1,width))
16 |     fh = numpy.uint8(fh)
17 |     return fh
18 | 
def getMatrixfrom_asm(filename, startindex=0, pixnum=5000):
    """Return up to ``pixnum`` raw bytes of a file as a flat uint8 array.

    Args:
        filename: path of the file, read in binary mode.
        startindex: absolute byte offset at which reading starts.
        pixnum: maximum number of bytes to read.

    Returns:
        1-D numpy.ndarray of dtype uint8 holding the bytes that were read; it
        is shorter than ``pixnum`` when the file ends first.
    """
    with open(filename, 'rb') as src:
        src.seek(startindex, 0)
        raw = src.read(pixnum)
    # Copy so the result is an owned, writable array rather than a view.
    return numpy.frombuffer(raw, dtype=numpy.uint8).copy()
27 | 
def getMatrixfrom_hex(filename, width):
    """Parse a whitespace-separated hex dump into a uint8 matrix.

    The first token of every line is skipped and each remaining token is
    parsed as a hexadecimal byte; ``??`` tokens are ignored.

    Args:
        filename: path of the hex-dump file.
        width: number of bytes per row of the result.

    Returns:
        numpy.ndarray of dtype uint8 and shape ``(n_bytes // width, width)``;
        trailing bytes that do not fill a complete row are dropped.
    """
    hexar = []
    with open(filename, 'rb') as f:
        for line in f:
            # The file is read as bytes, so compare against a bytes literal:
            # a text literal never equals a bytes token on Python 3 and "??"
            # would then reach int() and raise.
            hexar.extend(int(el, 16) for el in line.split()[1:] if el != b"??")
    # Floor division: a float row count cannot be used for slicing.
    rn = len(hexar) // width
    return numpy.array(hexar[:rn * width], dtype=numpy.uint8).reshape((rn, width))
37 | 
def read_hexbytes(filename):
    """Parse a whitespace-separated hex dump into a uint8 matrix of width 256.

    The first token of every line is skipped and each remaining token is
    parsed as a hexadecimal byte; ``??`` tokens are ignored.  This does the
    same as ``getMatrixfrom_hex(filename, 256)``.

    Args:
        filename: path of the hex-dump file.

    Returns:
        numpy.ndarray of dtype uint8 and shape ``(n_bytes // 256, 256)``;
        trailing bytes that do not fill a complete row are dropped.
    """
    width = 256
    hexar = []
    with open(filename, 'rb') as f:
        for line in f:
            # The file is read as bytes, so compare against a bytes literal:
            # a text literal never equals a bytes token on Python 3 and "??"
            # would then reach int() and raise.
            hexar.extend(int(el, 16) for el in line.split()[1:] if el != b"??")
    # Floor division: a float row count cannot be used for slicing.
    rn = len(hexar) // width
    return numpy.array(hexar[:rn * width], dtype=numpy.uint8).reshape((rn, width))
47 | 
# Build imgfeature.csv: one row per sample Id whose columns pix0, pix1, ...
# hold the leading bytes of that sample's .asm file.
basepath = "/home/moon/subtrain/"
subtrain = pd.read_csv('subtrainLabels.csv')

# Keyed by Id, so an Id that is listed more than once still yields one row.
mapimg = {}
for i, sid in enumerate(subtrain.Id, 1):
    print("dealing with {0}th file...".format(i))
    filename = os.path.join(basepath, sid + ".asm")
    mapimg[sid] = getMatrixfrom_asm(filename, startindex=0, pixnum=1500)

dataframelist = []
for sid, imf in mapimg.items():
    standard = {"Id": sid}
    for index, value in enumerate(imf):
        standard["pix{0}".format(index)] = value
    dataframelist.append(standard)

df = pd.DataFrame(dataframelist)
df.to_csv("imgfeature.csv", index=False)


--------------------------------------------------------------------------------
/opcode_n-gram.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from collections import *
 3 | import os
 4 | import pandas as pd
 5 | 
 6 | def getOpcodeSequence(filename):
 7 |     opcode_seq = []
 8 |     p = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)')
 9 |     with open(filename) as f:
10 |         for line in f:
11 |             if line.startswith(".text"):
12 |                 m = re.findall(p,line)
13 |                 if m:
14 |                     opc = m[0][1]
15 |                     if opc != "align":
16 |                         opcode_seq.append(opc)
17 |     return opcode_seq
18 | 
def train_opcode_lm(ops, order=4):
    """Count which opcode follows each ``order``-long opcode history.

    The sequence is left-padded with ``order`` copies of ``"~"`` so that the
    first opcodes also have a full-length history.

    Args:
        ops: iterable of opcode strings.
        order: length of the history window.

    Returns:
        dict mapping each history (a tuple of ``order`` items) to a Counter of
        the opcodes observed right after it.  The counts are raw occurrence
        counts; they are not normalised to probabilities.
    """
    lm = defaultdict(Counter)
    data = ["~"] * order + list(ops)
    for i in range(len(data) - order):
        history, char = tuple(data[i:i + order]), data[i + order]
        lm[history][char] += 1
    # Hand back a plain dict so that looking up an unseen history raises
    # KeyError instead of silently inserting an empty Counter.
    return dict(lm)
32 | 
def getOpcodeNgram(ops, n=3):
    """Count every run of ``n`` consecutive opcodes in a sequence.

    Args:
        ops: sequence of opcode strings.
        n: n-gram length.

    Returns:
        Counter mapping each n-gram (a tuple of ``n`` opcodes) to the number
        of times it occurs.  Empty when ``ops`` has fewer than ``n`` items.
    """
    # A sequence of length L holds L - n + 1 n-grams; the "+ 1" keeps the
    # final one, which starts at index L - n.
    return Counter(tuple(ops[i:i + n]) for i in range(len(ops) - n + 1))
37 | 
# Build 3gramfeature.csv: one row per sample Id with the per-file count of
# every opcode 3-gram that is frequent enough across the whole subset.
basepath = "/home/moon/subtrain/"
# A 3-gram becomes a feature when its count summed over all files reaches this.
MIN_TOTAL_COUNT = 500

subtrain = pd.read_csv('subtrainLabels.csv')

# Keyed by Id, so an Id that is listed more than once still yields one row.
map3gram = {}
for count, sid in enumerate(subtrain.Id, 1):
    print("counting the 3-gram of the {0} file...".format(count))
    filename = os.path.join(basepath, sid + ".asm")
    ops = getOpcodeSequence(filename)
    map3gram[sid] = getOpcodeNgram(ops)

# Corpus-wide totals, accumulated in place.
cc = Counter()
for d in map3gram.values():
    cc.update(d)

selectedfeatures = {}
for k, v in cc.items():
    if v >= MIN_TOTAL_COUNT:
        selectedfeatures[k] = v
        print("{0} {1}".format(k, v))

dataframelist = []
for fid, op3gram in map3gram.items():
    standard = {"Id": fid}
    for feature in selectedfeatures:
        # A Counter returns 0 for a 3-gram this file never contained.
        standard[feature] = op3gram[feature]
    dataframelist.append(standard)
df = pd.DataFrame(dataframelist)
df.to_csv("3gramfeature.csv", index=False)
72 | 


--------------------------------------------------------------------------------