├── Eval.py
├── Malware.py
├── TranAsmToByte.py
└── load_file.py

/Eval.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import tensorflow as tf

import load_file
import Malware
import numpy as np
import os

EVAL_INTERVAL_SECS = 10
BATCH_SIZE = 1

def evaluate(X_test, Y_test):
    #f=open("feature","w")
    with tf.Graph().as_default() as g:
        x = tf.placeholder(tf.float32, [
            BATCH_SIZE,                # first dimension: number of examples in a batch
            Malware.IMAGE_SIZE,        # second and third dimensions: image height and width
            Malware.IMAGE_SIZE,
            Malware.NUM_CHANNELS],     # fourth dimension: image depth (channels)
            name='x-input')

        y_ = tf.placeholder(tf.float32, [None, Malware.OUTPUT_NODE], name='y-input')
        y = Malware.inference(x, False, None)
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        variable_averages = tf.train.ExponentialMovingAverage(Malware.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        while True:
            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state(Malware.MODEL_SAVE_PATH)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

                    for i in range(len(X_test)):
                        xs = X_test[i:i+1]
                        ys = Y_test[i:i+1]
                        reshaped_xs = np.reshape(xs, (    # reshape xs into the 4-D shape the placeholder expects
                            BATCH_SIZE,
                            Malware.IMAGE_SIZE,
                            Malware.IMAGE_SIZE,
                            Malware.NUM_CHANNELS))
                        accuracy_score = sess.run(accuracy, feed_dict={x: reshaped_xs, y_: ys})
                        if accuracy_score == 0.0:    # print the 1-based index of each misclassified file
                            print (i+1)
                        #print ("After %s training step(s),%d validation accuracy = %g"%(global_step,i,accuracy_score))

                    '''
                    xs = X_test
                    ys = Y_test    # the test set is small, so it can be evaluated in one pass
                    reshaped_xs = np.reshape(xs, (    # reshape xs here as well
                        BATCH_SIZE,
                        Malware.IMAGE_SIZE,
                        Malware.IMAGE_SIZE,
                        Malware.NUM_CHANNELS))
                    accuracy_score = sess.run(accuracy, feed_dict={x: reshaped_xs, y_: ys})    # compute the accuracy

                    print ("After %s training step(s), validation accuracy = %g"%(global_step,accuracy_score))
                    '''
                    #np.savetxt("result.txt",fc11)
                    #print fc11
                    #print type(fc11)
                    #f.write(fc11)
                    #f.write("\n")

                else:
                    print ("No checkpoint file found")
                    return

            time.sleep(EVAL_INTERVAL_SECS)

def main(argv=None):
    BASE_DIR = os.getcwd()
    #X_test,Y_test = load_file.LoadDataMatrix(BASE_DIR + "/test/", "test_label.csv")    # evaluate on the test set
    X_test, Y_test = load_file.LoadDataMatrix(BASE_DIR + "/train/", "train_label.csv")  # evaluate on the training set
    evaluate(X_test, Y_test)

if __name__ == '__main__':
    tf.app.run()

/Malware.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import tensorflow as tf
import os
import numpy as np
import load_file

# number of input nodes
INPUT_NODE = 1024
# number of output nodes
OUTPUT_NODE = 2
# image size (the input is a 32*32 grayscale image)
IMAGE_SIZE = 32
# number of channels
NUM_CHANNELS = 1
# number of class labels
NUM_LABELS = 2
# filter depth and size of the first convolutional layer
CONV1_DEEP = 16
CONV1_SIZE = 3
# filter depth and size of the second convolutional layer
CONV2_DEEP = 32
CONV2_SIZE = 3
# number of nodes in the fully connected layer
FC_SIZE = 512

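# Sanity check of the size bookkeeping above (a sketch added for illustration,
# not part of the original file; it only restates the arithmetic spelled out
# in the comments of inference() below):
#   input:        32*32*1 = 1024 = INPUT_NODE
#   conv1 params: 3*3*1*16 + 16 = 160   (filter h * filter w * input depth * filter depth + biases)
#   conv2 params: 3*3*16*32 + 32 = 4640
#   fc1 input:    (32/2/2) * (32/2/2) * CONV2_DEEP = 8*8*32 = 2048 nodes
assert INPUT_NODE == IMAGE_SIZE * IMAGE_SIZE * NUM_CHANNELS
assert CONV1_SIZE * CONV1_SIZE * NUM_CHANNELS * CONV1_DEEP + CONV1_DEEP == 160
assert CONV2_SIZE * CONV2_SIZE * CONV1_DEEP * CONV2_DEEP + CONV2_DEEP == 4640
assert (IMAGE_SIZE // 4) * (IMAGE_SIZE // 4) * CONV2_DEEP == 2048
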
def inference(input_tensor, train, regularizer):
    # Layer 1 (conv): the input is 32*32*1 = 1024 = INPUT_NODE.
    # Parameter count: CONV1_SIZE*CONV1_SIZE*NUM_CHANNELS*CONV1_DEEP + biases = 3*3*1*16 + 16
    # (filter height * filter width * input depth * filter depth, plus one bias per output channel).
    # The filters are 3*3 with depth 16; strides=[1, 1, 1, 1] gives a stride of 1 in both the height
    # and width directions (the second and third entries).
    # The output depth is CONV1_DEEP = 16. With SAME (zero) padding, the output size is the input
    # size divided by the stride, i.e. 32*32; the number of biases equals the output depth.
    with tf.variable_scope('layer1-conv1'):
        # weight: the first two dimensions are the filter size, the third is the input depth,
        # the fourth is the filter (output) depth
        conv1_weights = tf.get_variable(
            "weight", [CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv1_biases = tf.get_variable("bias", [CONV1_DEEP], initializer=tf.constant_initializer(0.0))
        conv1 = tf.nn.conv2d(input_tensor, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))    # add the bias to the convolution output, then apply ReLU

    # Layer 2 (max pool): the input is the 32*32*16 output of layer 1. Pooling keeps the input
    # depth (16), uses a 2*2 window, SAME zero padding, and a stride of 2 in both directions,
    # so the output is 32/2 = 16, i.e. a 16*16*16 tensor. Pooling changes the spatial size but
    # not the depth.
    with tf.name_scope("layer2-pool1"):
        # relu1 is the layer input, ksize the window size, strides the stride. SAME pads with
        # zeros (output size = input size / stride); VALID does not pad
        # (output size = ceil((input size - window size + 1) / stride)).
        pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    # Layer 3 (conv): the input is 16*16*16; the stride is 1, so the output size is 16/1 = 16,
    # i.e. a 16*16*32 tensor.
    with tf.variable_scope("layer3-conv2"):
        # weight: 3*3 filters with output depth 32; the input depth is 16
        conv2_weights = tf.get_variable(
            "weight", [CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv2_biases = tf.get_variable("bias", [CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        conv2 = tf.nn.conv2d(pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))

    # Layer 4 (max pool): the input is the 16*16*32 output of layer 3; 2*2 window, stride 2,
    # depth unchanged, so the output is 16/2 = 8, i.e. an 8*8*32 tensor.
    with tf.name_scope("layer4-pool2"):
        pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool_shape = pool2.get_shape().as_list()    # dimensions of the layer-4 output
        # every layer consumes and produces one batch at a time, so the shape also includes
        # the number of examples in the batch, pool_shape[0]
        nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
        reshaped = tf.reshape(pool2, [pool_shape[0], nodes])    # flatten the layer-4 output into a batch of vectors

    # Layer 5 (fully connected): the input is a batch of vectors of length 8*8*32 = 2048 = nodes;
    # the output is a batch of vectors of length FC_SIZE = 512.
    with tf.variable_scope('layer5-fc1'):
        fc1_weights = tf.get_variable("weight", [nodes, FC_SIZE],
                                      initializer=tf.truncated_normal_initializer(stddev=0.1))
        # only the fully connected weights are regularized
        if regularizer is not None: tf.add_to_collection('losses', regularizer(fc1_weights))
        fc1_biases = tf.get_variable("bias", [FC_SIZE], initializer=tf.constant_initializer(0.1))

        fc1 = tf.nn.relu(tf.matmul(reshaped, fc1_weights) + fc1_biases)
        # dropout randomly zeroes some outputs during training to reduce overfitting;
        # it is usually applied only to fully connected layers
        if train: fc1 = tf.nn.dropout(fc1, 0.5)

    # Layer 6 (fully connected, output layer): the input is a batch of vectors of length 512;
    # the output is a batch of vectors of length 2, which softmax turns into the classification result.
    with tf.variable_scope('layer6-fc2'):
        fc2_weights = tf.get_variable("weight", [FC_SIZE, NUM_LABELS],
                                      initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None: tf.add_to_collection('losses', regularizer(fc2_weights))
        fc2_biases = tf.get_variable("bias", [NUM_LABELS], initializer=tf.constant_initializer(0.1))
        logit = tf.matmul(fc1, fc2_weights) + fc2_biases

    return logit

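# Usage sketch (not part of the original file): wiring inference() to a single
# 32x32 grayscale sample and checking the logit shape. Eval.py builds
# essentially this graph with BATCH_SIZE = 1.
#
#   x = tf.placeholder(tf.float32, [1, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
#   y = inference(x, False, None)     # train=False, no regularizer
#   print(y.get_shape().as_list())    # -> [1, 2], one logit per class
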
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.001
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 6000
MOVING_AVERAGE_DECAY = 0.99

MODEL_SAVE_PATH = "/home/xujinghui/model"
MODEL_NAME = "model_malware.ckpt"

# the training procedure
def train(X_train, Y_train):
    # placeholder for a 4-D input batch
    x = tf.placeholder(tf.float32, [
        BATCH_SIZE,
        IMAGE_SIZE,
        IMAGE_SIZE,
        NUM_CHANNELS],
        name='x-input')
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')

    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    y = inference(x, True, regularizer)    # train=True so dropout is active during training
    global_step = tf.Variable(0, trainable=False)

    # define the loss function, learning rate, moving-average op, and training op
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        len(X_train) // BATCH_SIZE, LEARNING_RATE_DECAY,
        staircase=True)

    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    with tf.control_dependencies([train_step, variables_averages_op]):
        train_op = tf.no_op(name='train')

    # set up the TensorFlow saver for checkpoints
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(TRAINING_STEPS):

            # pick the start and end indices of this batch
            start = (i*BATCH_SIZE) % len(X_train)
            end = min(start+BATCH_SIZE, len(X_train))
            if (end-start)
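# The dump is truncated here, mid-loop. A plausible continuation following the
# standard TF1 pattern this file already uses (a sketch, not the original code;
# the incomplete-batch skip, log format, and save cadence are assumptions):
#
#     if (end - start) < BATCH_SIZE:   # the placeholder is fixed-size, so
#         continue                     # skip the last incomplete batch
#     reshaped_xs = np.reshape(X_train[start:end],
#                              (BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
#     _, loss_value, step = sess.run([train_op, loss, global_step],
#                                    feed_dict={x: reshaped_xs, y_: Y_train[start:end]})
#     if i % 1000 == 0:
#         print ("After %d training step(s), loss on training batch is %g." % (step, loss_value))
#         saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME),
#                    global_step=global_step)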