├── run.sh ├── README.md ├── secondrandomforest.py ├── combine.py ├── firstrandomforest.py ├── randomsubset.py ├── asmimage.py └── opcode_n-gram.py /run.sh: -------------------------------------------------------------------------------- 1 | python asmimage.py 2 | python opcode_n-gram.py 3 | python firstrandomforest.py 4 | python secondrandomforest.py 5 | python combine.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 微软恶意代码分类 2 | 3 | - 比赛说明 https://www.kaggle.com/c/malware-classification/ 4 | 5 | - 数据下载 [Google Drive](https://drive.google.com/open?id=1C1Tc1dh8W8f3x6vn-Ado4mcBhn-EpaCO), [OneDrive](https://dlpwn-my.sharepoint.com/:f:/g/personal/deep_dlpwn_onmicrosoft_com/EhljOjr5llZBuUKWjWwKHS8BS8JrHyS76pzRU9RDKMDeoQ?e=dUgBNU) 6 | 7 | ## 代码说明 8 | - `randomsubset.py` 抽取训练子集 9 | - `asmimage.py` ASM文件图像纹理特征 10 | - `opcode_n-gram.py` Opcode n-gram特征 11 | - `firstrandomforest.py` 基于ASM文件图像纹理特征的随机森林 12 | - `secondrandomforest.py` 基于Opcode n-gram特征特征的随机森林 13 | - `combine.py` 将两种类型的特征结合 14 | 15 | ## 运行说明 16 | 17 | 1. 将完整的训练数据集解压,修改`randomsubset.py`中的路径并运行 18 | 2. 修改`asmimage.py`和`opcode_n-gram.py`中的路径,并运行`run.sh`,耐心等待即可看到结果 19 | -------------------------------------------------------------------------------- /secondrandomforest.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier as RF 2 | from sklearn import cross_validation 3 | from sklearn.metrics import confusion_matrix 4 | import pandas as pd 5 | 6 | 7 | subtrainLabel = pd.read_csv('subtrainLabels.csv') 8 | subtrainfeature = pd.read_csv("3gramfeature.csv") 9 | subtrain = pd.merge(subtrainLabel,subtrainfeature,on='Id') 10 | labels = subtrain.Class 11 | subtrain.drop(["Class","Id"], axis=1, inplace=True) 12 | subtrain = subtrain.as_matrix() 13 | 14 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(subtrain,labels,test_size=0.4) 15 | 16 | srf = RF(n_estimators=500, n_jobs=-1) 17 | srf.fit(X_train,y_train) 18 | print srf.score(X_test,y_test) 19 | # y_pred = srf.predict(X_test) 20 | # print confusion_matrix(y_test, y_pred) -------------------------------------------------------------------------------- /combine.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier as RF 2 | from sklearn import cross_validation 3 | from sklearn.metrics import confusion_matrix 4 | import pandas as pd 5 | 6 | 7 | subtrainLabel = pd.read_csv('subtrainLabels.csv') 8 | subtrainfeature1 = pd.read_csv("3gramfeature.csv") 9 | subtrainfeature2 = pd.read_csv("imgfeature.csv") 10 | subtrain = pd.merge(subtrainfeature1,subtrainfeature2,on='Id') 11 | subtrain = pd.merge(subtrain,subtrainLabel,on='Id') 12 | labels = subtrain.Class 13 | subtrain.drop(["Class","Id"], axis=1, inplace=True) 14 | subtrain = subtrain.as_matrix() 15 | 16 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(subtrain,labels,test_size=0.4) 17 | 18 | srf = RF(n_estimators=500, n_jobs=-1) 19 | srf.fit(X_train,y_train) 20 | print srf.score(X_test,y_test) -------------------------------------------------------------------------------- /firstrandomforest.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier as RF 2 | from sklearn import cross_validation 3 | from sklearn.metrics import confusion_matrix 4 | import pandas as pd 5 | import numpy as np 6 | 7 | 8 | subtrainLabel = pd.read_csv('subtrainLabels.csv') 9 | subtrainfeature = pd.read_csv("imgfeature.csv") 10 | subtrain = pd.merge(subtrainLabel,subtrainfeature,on='Id') 11 | labels = subtrain.Class 12 | subtrain.drop(["Class","Id"], axis=1, inplace=True) 13 | subtrain = subtrain.as_matrix() 14 | X_train, X_test, y_train, y_test = cross_validation.train_test_split(subtrain,labels,test_size=0.4) 15 | 16 | srf = RF(n_estimators=500, n_jobs=-1) 17 | srf.fit(X_train,y_train) 18 | print srf.score(X_test,y_test) 19 | 20 | # importances = srf.feature_importances_ 21 | # std = np.std([tree.feature_importances_ for tree in srf.estimators_],axis=0) 22 | # indices = np.argsort(importances)[::-1] 23 | # print("Feature ranking:") 24 | # for f in range(20): 25 | # print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) 26 | -------------------------------------------------------------------------------- /randomsubset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from random import * 3 | import pandas as pd 4 | import shutil 5 | 6 | rs = Random() 7 | rs.seed(1) 8 | 9 | trainlabels = pd.read_csv('trainLabels.csv') 10 | fids = [] 11 | opd = pd.DataFrame() 12 | for clabel in range (1,10): 13 | mids = trainlabels[trainlabels.Class == clabel] 14 | mids = mids.reset_index(drop=True) 15 | 16 | rchoice = [rs.randint(0,len(mids)-1) for i in range(100)] 17 | print rchoice 18 | 19 | # for i in rchoice: 20 | # fids.append(mids.loc[i].Id) 21 | # opd = opd.append(mids.loc[i]) 22 | 23 | rids = [mids.loc[i].Id for i in rchoice] 24 | fids.extend(rids) 25 | opd = opd.append(mids.loc[rchoice]) 26 | 27 | 28 | print len(fids) 29 | opd = opd.reset_index(drop=True) 30 | print opd 31 | opd.to_csv('subtrainLabels.csv', encoding='utf-8', index=False) 32 | 33 | sbase = '/home/moon/train/' 34 | tbase = '/home/moon/subtrain/' 35 | 36 | for fid in fids: 37 | fnames = ['{0}.asm'.format(fid),'{0}.bytes'.format(fid)] 38 | for fname in fnames: 39 | cspath = sbase + fname 40 | ctpath = tbase + fname 41 | shutil.copy(cspath,ctpath) -------------------------------------------------------------------------------- /asmimage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy 3 | from collections import * 4 | import pandas as pd 5 | import binascii 6 | 7 | 8 | def getMatrixfrom_bin(filename, width = 512, oneRow = False): 9 | with open(filename, 'rb') as f: 10 | content = f.read() 11 | hexst = binascii.hexlify(content) 12 | fh = numpy.array([int(hexst[i:i+2],16) for i in range(0, len(hexst), 2)]) 13 | if oneRow is False: 14 | rn = len(fh)/width 15 | fh = numpy.reshape(fh[:rn*width],(-1,width)) 16 | fh = numpy.uint8(fh) 17 | return fh 18 | 19 | def getMatrixfrom_asm(filename, startindex = 0, pixnum = 5000): 20 | with open(filename, 'rb') as f: 21 | f.seek(startindex, 0) 22 | content = f.read(pixnum) 23 | hexst = binascii.hexlify(content) 24 | fh = numpy.array([int(hexst[i:i+2],16) for i in range(0, len(hexst), 2)]) 25 | fh = numpy.uint8(fh) 26 | return fh 27 | 28 | def getMatrixfrom_hex(filename, width): 29 | hexar = [] 30 | with open(filename,'rb') as f: 31 | for line in f: 32 | hexar.extend(int(el,16) for el in line.split()[1:] if el != "??") 33 | rn = len(hexar)/width 34 | fh = numpy.reshape(hexar[:rn*width],(-1,width)) 35 | fh = numpy.uint8(fh) 36 | return fh 37 | 38 | def read_hexbytes(filename): 39 | hexar = [] 40 | with open(filename,'rb') as f: 41 | for line in f: 42 | hexar.extend(int(el,16) for el in line.split()[1:] if el != "??") 43 | rn = len(hexar)/256 44 | fh = numpy.reshape(hexar[:rn*256],(-1,256)) 45 | fh = numpy.uint8(fh) 46 | return fh 47 | 48 | basepath = "/home/moon/subtrain/" 49 | mapimg = defaultdict(list) 50 | subtrain = pd.read_csv('subtrainLabels.csv') 51 | i = 0 52 | for sid in subtrain.Id: 53 | i += 1 54 | print "dealing with {0}th file...".format(str(i)) 55 | filename = basepath + sid + ".asm" 56 | im = getMatrixfrom_asm(filename, startindex = 0, pixnum = 1500) 57 | mapimg[sid] = im 58 | 59 | dataframelist = [] 60 | for sid,imf in mapimg.iteritems(): 61 | standard = {} 62 | standard["Id"] = sid 63 | for index,value in enumerate(imf): 64 | colName = "pix{0}".format(str(index)) 65 | standard[colName] = value 66 | dataframelist.append(standard) 67 | 68 | df = pd.DataFrame(dataframelist) 69 | df.to_csv("imgfeature.csv",index=False) -------------------------------------------------------------------------------- /opcode_n-gram.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import * 3 | import os 4 | import pandas as pd 5 | 6 | def getOpcodeSequence(filename): 7 | opcode_seq = [] 8 | p = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)') 9 | with open(filename) as f: 10 | for line in f: 11 | if line.startswith(".text"): 12 | m = re.findall(p,line) 13 | if m: 14 | opc = m[0][1] 15 | if opc != "align": 16 | opcode_seq.append(opc) 17 | return opcode_seq 18 | 19 | def train_opcode_lm(ops, order=4): 20 | lm = defaultdict(Counter) 21 | prefix = ["~"] * order 22 | prefix.extend(ops) 23 | data = prefix 24 | for i in xrange(len(data)-order): 25 | history, char = tuple(data[i:i+order]), data[i+order] 26 | lm[history][char]+=1 27 | def normalize(counter): 28 | s = float(sum(counter.values())) 29 | return [(c,cnt/s) for c,cnt in counter.iteritems()] 30 | outlm = {hist:chars for hist, chars in lm.iteritems()} 31 | return outlm 32 | 33 | def getOpcodeNgram(ops, n=3): 34 | opngramlist = [tuple(ops[i:i+n]) for i in range(len(ops)-n)] 35 | opngram = Counter(opngramlist) 36 | return opngram 37 | 38 | basepath = "/home/moon/subtrain/" 39 | map3gram = defaultdict(Counter) 40 | subtrain = pd.read_csv('subtrainLabels.csv') 41 | count = 1 42 | for sid in subtrain.Id: 43 | print "counting the 3-gram of the {0} file...".format(str(count)) 44 | count += 1 45 | filename = basepath + sid + ".asm" 46 | ops = getOpcodeSequence(filename) 47 | op3gram = getOpcodeNgram(ops) 48 | map3gram[sid] = op3gram 49 | 50 | cc = Counter([]) 51 | for d in map3gram.values(): 52 | cc += d 53 | selectedfeatures = {} 54 | tc = 0 55 | for k,v in cc.iteritems(): 56 | if v >= 500: 57 | selectedfeatures[k] = v 58 | print k,v 59 | tc += 1 60 | dataframelist = [] 61 | for fid,op3gram in map3gram.iteritems(): 62 | standard = {} 63 | standard["Id"] = fid 64 | for feature in selectedfeatures: 65 | if feature in op3gram: 66 | standard[feature] = op3gram[feature] 67 | else: 68 | standard[feature] = 0 69 | dataframelist.append(standard) 70 | df = pd.DataFrame(dataframelist) 71 | df.to_csv("3gramfeature.csv",index=False) 72 | --------------------------------------------------------------------------------