├── Eval.py
├── Malware.py
├── TranAsmToByte.py
└── load_file.py

/Eval.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import time
import tensorflow as tf

import load_file
import Malware
import numpy as np
import os

EVAL_INTERVAL_SECS = 10
BATCH_SIZE = 1

def evaluate(X_test, Y_test):
    #f=open("feature","w")
    with tf.Graph().as_default() as g:
        x = tf.placeholder(tf.float32, [
            BATCH_SIZE,                # first dimension: number of examples in a batch
            Malware.IMAGE_SIZE,        # second and third dimensions: image height and width
            Malware.IMAGE_SIZE,
            Malware.NUM_CHANNELS],     # fourth dimension: image depth (channels)
            name='x-input')

        y_ = tf.placeholder(tf.float32, [None, Malware.OUTPUT_NODE], name='y-input')
        y = Malware.inference(x, False, None)
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        variable_averages = tf.train.ExponentialMovingAverage(Malware.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        while True:
            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state(Malware.MODEL_SAVE_PATH)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

                    for i in range(len(X_test)):
                        xs = X_test[i:i+1]
                        ys = Y_test[i:i+1]
                        reshaped_xs = np.reshape(xs, (    # reshape xs into the 4-D shape the placeholder expects
                            BATCH_SIZE,
                            Malware.IMAGE_SIZE,
                            Malware.IMAGE_SIZE,
                            Malware.NUM_CHANNELS))
                        accuracy_score = sess.run(accuracy, feed_dict={x: reshaped_xs, y_: ys})
                        if accuracy_score == 0.0:    # print the 1-based index of each misclassified file
                            print (i+1)
                        #print ("After %s training step(s),%d validation accuracy = %g"%(global_step,i,accuracy_score))

                    '''
                    xs = X_test
                    ys = Y_test    # the test set is small, so it can be evaluated in one pass
                    reshaped_xs = np.reshape(xs, (    # reshape xs here as well
                        BATCH_SIZE,
                        Malware.IMAGE_SIZE,
                        Malware.IMAGE_SIZE,
                        Malware.NUM_CHANNELS))
                    accuracy_score = sess.run(accuracy, feed_dict={x: reshaped_xs, y_: ys})    # compute the accuracy

                    print ("After %s training step(s), validation accuracy = %g"%(global_step,accuracy_score))
                    '''
                    #np.savetxt("result.txt",fc11)
                    #print fc11
                    #print type(fc11)
                    #f.write(fc11)
                    #f.write("\n")

                else:
                    print ("No checkpoint file found")
                    return

            time.sleep(EVAL_INTERVAL_SECS)

def main(argv=None):
    BASE_DIR = os.getcwd()
    #X_test,Y_test = load_file.LoadDataMatrix(BASE_DIR + "/test/", "test_label.csv")    # evaluate on the test set
    X_test, Y_test = load_file.LoadDataMatrix(BASE_DIR + "/train/", "train_label.csv")  # evaluate on the training set
    evaluate(X_test, Y_test)

if __name__ == '__main__':
    tf.app.run()

/Malware.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import tensorflow as tf
import os
import numpy as np
import load_file

# number of input nodes
INPUT_NODE = 1024
# number of output nodes
OUTPUT_NODE = 2
# image size (the input is a 32*32 grayscale image)
IMAGE_SIZE = 32
# number of channels
NUM_CHANNELS = 1
# number of class labels
NUM_LABELS = 2
# filter depth and size of the first convolutional layer
CONV1_DEEP = 16
CONV1_SIZE = 3
# filter depth and size of the second convolutional layer
CONV2_DEEP = 32
CONV2_SIZE = 3
# number of nodes in the fully connected layer
FC_SIZE = 512

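# Sanity check of the size bookkeeping above (a sketch added for illustration,
# not part of the original file; it only restates the arithmetic spelled out
# in the comments of inference() below):
#   input:        32*32*1 = 1024 = INPUT_NODE
#   conv1 params: 3*3*1*16 + 16 = 160   (filter h * filter w * input depth * filter depth + biases)
#   conv2 params: 3*3*16*32 + 32 = 4640
#   fc1 input:    (32/2/2) * (32/2/2) * CONV2_DEEP = 8*8*32 = 2048 nodes
assert INPUT_NODE == IMAGE_SIZE * IMAGE_SIZE * NUM_CHANNELS
assert CONV1_SIZE * CONV1_SIZE * NUM_CHANNELS * CONV1_DEEP + CONV1_DEEP == 160
assert CONV2_SIZE * CONV2_SIZE * CONV1_DEEP * CONV2_DEEP + CONV2_DEEP == 4640
assert (IMAGE_SIZE // 4) * (IMAGE_SIZE // 4) * CONV2_DEEP == 2048
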
def inference(input_tensor, train, regularizer):
    # Layer 1 (conv): the input is 32*32*1 = 1024 = INPUT_NODE.
    # Parameter count: CONV1_SIZE*CONV1_SIZE*NUM_CHANNELS*CONV1_DEEP + biases = 3*3*1*16 + 16
    # (filter height * filter width * input depth * filter depth, plus one bias per output channel).
    # The filters are 3*3 with depth 16; strides=[1, 1, 1, 1] gives a stride of 1 in both the height
    # and width directions (the second and third entries).
    # The output depth is CONV1_DEEP = 16. With SAME (zero) padding, the output size is the input
    # size divided by the stride, i.e. 32*32; the number of biases equals the output depth.
    with tf.variable_scope('layer1-conv1'):
        # weight: the first two dimensions are the filter size, the third is the input depth,
        # the fourth is the filter (output) depth
        conv1_weights = tf.get_variable(
            "weight", [CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv1_biases = tf.get_variable("bias", [CONV1_DEEP], initializer=tf.constant_initializer(0.0))
        conv1 = tf.nn.conv2d(input_tensor, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))    # add the bias to the convolution output, then apply ReLU

    # Layer 2 (max pool): the input is the 32*32*16 output of layer 1. Pooling keeps the input
    # depth (16), uses a 2*2 window, SAME zero padding, and a stride of 2 in both directions,
    # so the output is 32/2 = 16, i.e. a 16*16*16 tensor. Pooling changes the spatial size but
    # not the depth.
    with tf.name_scope("layer2-pool1"):
        # relu1 is the layer input, ksize the window size, strides the stride. SAME pads with
        # zeros (output size = input size / stride); VALID does not pad
        # (output size = ceil((input size - window size + 1) / stride)).
        pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    # Layer 3 (conv): the input is 16*16*16; the stride is 1, so the output size is 16/1 = 16,
    # i.e. a 16*16*32 tensor.
    with tf.variable_scope("layer3-conv2"):
        # weight: 3*3 filters with output depth 32; the input depth is 16
        conv2_weights = tf.get_variable(
            "weight", [CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv2_biases = tf.get_variable("bias", [CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        conv2 = tf.nn.conv2d(pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))

    # Layer 4 (max pool): the input is the 16*16*32 output of layer 3; 2*2 window, stride 2,
    # depth unchanged, so the output is 16/2 = 8, i.e. an 8*8*32 tensor.
    with tf.name_scope("layer4-pool2"):
        pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool_shape = pool2.get_shape().as_list()    # dimensions of the layer-4 output
        # every layer consumes and produces one batch at a time, so the shape also includes
        # the number of examples in the batch, pool_shape[0]
        nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
        reshaped = tf.reshape(pool2, [pool_shape[0], nodes])    # flatten the layer-4 output into a batch of vectors

    # Layer 5 (fully connected): the input is a batch of vectors of length 8*8*32 = 2048 = nodes;
    # the output is a batch of vectors of length FC_SIZE = 512.
    with tf.variable_scope('layer5-fc1'):
        fc1_weights = tf.get_variable("weight", [nodes, FC_SIZE],
                                      initializer=tf.truncated_normal_initializer(stddev=0.1))
        # only the fully connected weights are regularized
        if regularizer is not None: tf.add_to_collection('losses', regularizer(fc1_weights))
        fc1_biases = tf.get_variable("bias", [FC_SIZE], initializer=tf.constant_initializer(0.1))

        fc1 = tf.nn.relu(tf.matmul(reshaped, fc1_weights) + fc1_biases)
        # dropout randomly zeroes some outputs during training to reduce overfitting;
        # it is usually applied only to fully connected layers
        if train: fc1 = tf.nn.dropout(fc1, 0.5)

    # Layer 6 (fully connected, output layer): the input is a batch of vectors of length 512;
    # the output is a batch of vectors of length 2, which softmax turns into the classification result.
    with tf.variable_scope('layer6-fc2'):
        fc2_weights = tf.get_variable("weight", [FC_SIZE, NUM_LABELS],
                                      initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None: tf.add_to_collection('losses', regularizer(fc2_weights))
        fc2_biases = tf.get_variable("bias", [NUM_LABELS], initializer=tf.constant_initializer(0.1))
        logit = tf.matmul(fc1, fc2_weights) + fc2_biases

    return logit

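# Usage sketch (not part of the original file): wiring inference() to a single
# 32x32 grayscale sample and checking the logit shape. Eval.py builds
# essentially this graph with BATCH_SIZE = 1.
#
#   x = tf.placeholder(tf.float32, [1, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
#   y = inference(x, False, None)     # train=False, no regularizer
#   print(y.get_shape().as_list())    # -> [1, 2], one logit per class
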
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.001
LEARNING_RATE_DECAY = 0.99
REGULARIZATION_RATE = 0.0001
TRAINING_STEPS = 6000
MOVING_AVERAGE_DECAY = 0.99

MODEL_SAVE_PATH = "/home/xujinghui/model"
MODEL_NAME = "model_malware.ckpt"

# the training procedure
def train(X_train, Y_train):
    # placeholder for a 4-D input batch
    x = tf.placeholder(tf.float32, [
        BATCH_SIZE,
        IMAGE_SIZE,
        IMAGE_SIZE,
        NUM_CHANNELS],
        name='x-input')
    y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')

    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    y = inference(x, True, regularizer)    # train=True so dropout is active during training
    global_step = tf.Variable(0, trainable=False)

    # define the loss function, learning rate, moving-average op, and training op
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    loss = cross_entropy_mean + tf.add_n(tf.get_collection('losses'))
    learning_rate = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        len(X_train) // BATCH_SIZE, LEARNING_RATE_DECAY,
        staircase=True)

    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    with tf.control_dependencies([train_step, variables_averages_op]):
        train_op = tf.no_op(name='train')

    # set up the TensorFlow saver for checkpoints
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(TRAINING_STEPS):

            # pick the start and end indices of this batch
            start = (i*BATCH_SIZE) % len(X_train)
            end = min(start+BATCH_SIZE, len(X_train))
            if (end-start)
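# The dump is truncated here, mid-loop. A plausible continuation following the
# standard TF1 pattern this file already uses (a sketch, not the original code;
# the incomplete-batch skip, log format, and save cadence are assumptions):
#
#     if (end - start) < BATCH_SIZE:   # the placeholder is fixed-size, so
#         continue                     # skip the last incomplete batch
#     reshaped_xs = np.reshape(X_train[start:end],
#                              (BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
#     _, loss_value, step = sess.run([train_op, loss, global_step],
#                                    feed_dict={x: reshaped_xs, y_: Y_train[start:end]})
#     if i % 1000 == 0:
#         print ("After %d training step(s), loss on training batch is %g." % (step, loss_value))
#         saver.save(sess, os.path.join(MODEL_SAVE_PATH, MODEL_NAME),
#                    global_step=global_step)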