├── Tensorflow基础使用与图像识别应用 ├── Tensorflow的基础使用与图像识别应用.pdf └── 程序 │ ├── 1创建图,启动图.py │ ├── 3Fetch_and_Feed.py │ ├── 2变量.py │ ├── 4MNIST分类.py │ ├── 5下载google图像识别网络inception-v3.py │ ├── 3Fetch_and_Feed.ipynb │ ├── 1创建图,启动图.ipynb │ ├── 2变量.ipynb │ ├── 5下载google图像识别网络inception-v3.ipynb │ ├── 6使用inception-v3做各种图像的识别.py │ └── 4MNIST分类.ipynb ├── Tensorflow基础使用与文本分类应用 ├── Tensorflow的基础使用与文本分类应用.pdf └── 程序 │ ├── MNIST分类.py │ ├── zhihu_predict.py │ ├── data_handle.py │ ├── MNIST分类.ipynb │ ├── zhihu_eval.py │ ├── zhihu_predict.ipynb │ ├── cnn.py │ ├── data_handle.ipynb │ ├── cnn.ipynb │ └── zhihu_eval.ipynb └── README.md /Tensorflow基础使用与图像识别应用/Tensorflow的基础使用与图像识别应用.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Qinbf/Tensorflow/HEAD/Tensorflow基础使用与图像识别应用/Tensorflow的基础使用与图像识别应用.pdf -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/Tensorflow的基础使用与文本分类应用.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Qinbf/Tensorflow/HEAD/Tensorflow基础使用与文本分类应用/Tensorflow的基础使用与文本分类应用.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow 2 | 3 | 4 | ## 关注公众号 5 | ![](https://raw.githubusercontent.com/Qinbf/tf-model-zoo/master/README_IMG/01.jpg) 6 | 7 | 8 | ## 一起交流 9 | 我的微信号:**sdxxqbf** 10 | 以下为微信二维码: 11 | 12 | ![](https://raw.githubusercontent.com/Qinbf/tf-model-zoo/master/README_IMG/02.GIF) 13 | 14 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/1创建图,启动图.py: -------------------------------------------------------------------------------- 1 | #51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html 2 | #优酷频道:http://i.youku.com/sdxxqbf 3 | #微信公众号:深度学习与神经网络 4 | #Github:https://github.com/Qinbf 5 | 6 | # coding: utf-8 7 | 8 | # In[1]: 9 | 10 | import tensorflow as tf 11 | 12 | 13 | # In[2]: 14 | 15 | #创建一个常量op 16 | m1 = tf.constant([[3,3]]) 17 | #创建一个常量op 18 | m2 = tf.constant([[2],[3]]) 19 | #创建一个矩阵乘法op,把m1和m2传入 20 | product = tf.matmul(m1,m2) 21 | #这个时候打印product,只能看到product的属性,不能计算它的值 22 | print(product) 23 | 24 | 25 | # In[3]: 26 | 27 | #第一种定义会话的方式: 28 | #定义一个会话,启动默认图 29 | sess = tf.Session() 30 | #调用sess的run方法来执行矩阵乘法op 31 | #run(product)触发了图中3个op 32 | result = sess.run(product) 33 | print(result) 34 | sess.close() 35 | 36 | 37 | # In[4]: 38 | 39 | #第二种定义会话的方式: 40 | with tf.Session() as sess: 41 | #调用sess的run方法来执行矩阵乘法op 42 | #run(product)触发了图中3个op 43 | result = sess.run(product) 44 | print(result) 45 | 46 | 47 | # In[ ]: 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/3Fetch_and_Feed.py: -------------------------------------------------------------------------------- 1 | #51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html 2 | #优酷频道:http://i.youku.com/sdxxqbf 3 | #微信公众号:深度学习与神经网络 4 | #Github:https://github.com/Qinbf 5 | 6 | # coding: utf-8 7 | 8 | # In[1]: 9 | 10 | import tensorflow as tf 11 | 12 | 13 | # In[2]: 14 | 15 | #Fetch:可以在session中同时计算多个op 16 | #定义三个常量 17 | input1 = tf.constant(3.0) 18 | input2 = tf.constant(2.0) 19 | input3 = tf.constant(5.0) 20 | #定义一个加法op 21 | add = tf.add(input2,input3) 22 | #定义一个乘法op 23 | mul = tf.multiply(input1,add) 24 | 25 | with tf.Session() as sess: 26 | #同时执行乘法op和加法op 27 | result = 
sess.run([mul,add]) 28 | print(result) 29 | 30 | 31 | # In[4]: 32 | 33 | #Feed:先定义占位符,等需要的时候再传入数据 34 | #创建占位符 35 | input1 = tf.placeholder(tf.float32) 36 | input2 = tf.placeholder(tf.float32) 37 | #定义乘法op 38 | output = tf.multiply(input1,input2) 39 | 40 | with tf.Session() as sess: 41 | #feed的数据以字典的形式传入 42 | print(sess.run(output,feed_dict={input1:[8.],input2:[2.]})) 43 | 44 | 45 | # In[ ]: 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/2变量.py: -------------------------------------------------------------------------------- 1 | #51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html 2 | #优酷频道:http://i.youku.com/sdxxqbf 3 | #微信公众号:深度学习与神经网络 4 | #Github:https://github.com/Qinbf 5 | 6 | # coding: utf-8 7 | 8 | # In[1]: 9 | 10 | import tensorflow as tf 11 | 12 | 13 | # In[3]: 14 | 15 | #定义一个变量 16 | x = tf.Variable([1,2]) 17 | #定义一个常量 18 | a = tf.constant([3,3]) 19 | #增加一个减法op 20 | sub = tf.subtract(x,a) 21 | #增加一个加法op 22 | add = tf.add(x,sub) 23 | 24 | #所有变量初始化 25 | init = tf.global_variables_initializer() 26 | 27 | with tf.Session() as sess: 28 | #执行变量初始化 29 | sess.run(init) 30 | print(sess.run(sub)) 31 | print(sess.run(add)) 32 | 33 | 34 | # In[4]: 35 | 36 | #创建一个变量初始化为0 37 | state = tf.Variable(0,name='counter') 38 | #创建一个op,作用是使state加1 39 | new_value = tf.add(state,1) 40 | #赋值op 41 | update = tf.assign(state,new_value) 42 | #所有变量初始化 43 | init = tf.global_variables_initializer() 44 | 45 | with tf.Session() as sess: 46 | #执行变量初始化 47 | sess.run(init) 48 | print(sess.run(state)) 49 | for _ in range(5): 50 | sess.run(update) 51 | print(sess.run(state)) 52 | 53 | 54 | # In[ ]: 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/4MNIST分类.py: -------------------------------------------------------------------------------- 1 | #51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html 2 | #优酷频道:http://i.youku.com/sdxxqbf 3 | #微信公众号:深度学习与神经网络 4 | #Github:https://github.com/Qinbf 5 | 6 | # coding: utf-8 7 | 8 | # In[1]: 9 | 10 | import tensorflow as tf 11 | from tensorflow.examples.tutorials.mnist import input_data 12 | 13 | 14 | # In[2]: 15 | 16 | #载入数据集 17 | mnist = input_data.read_data_sets("MNIST_data",one_hot=True) 18 | 19 | #每个批次100张照片 20 | batch_size = 100 21 | #计算一共有多少个批次 22 | n_batch = mnist.train.num_examples // batch_size 23 | 24 | #定义两个placeholder 25 | x = tf.placeholder(tf.float32,[None,784]) 26 | y = tf.placeholder(tf.float32,[None,10]) 27 | 28 | #创建一个简单的神经网络,输入层784个神经元,输出层10个神经元 29 | W = tf.Variable(tf.zeros([784,10])) 30 | b = tf.Variable(tf.zeros([10])) 31 | prediction = tf.nn.softmax(tf.matmul(x,W)+b) 32 | 33 | #二次代价函数 34 | #square是求平方 35 | #reduce_mean是求平均值 36 | loss = tf.reduce_mean(tf.square(y-prediction)) 37 | 38 | #使用梯度下降法来最小化loss,学习率是0.2 39 | train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss) 40 | 41 | #初始化变量 42 | init = tf.global_variables_initializer() 43 | 44 | #结果存放在一个布尔型列表中 45 | correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置 46 | #求准确率 47 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型 48 | 49 | with tf.Session() as sess: 50 | #执行初始化 51 | sess.run(init) 52 | #迭代21个周期 53 | for epoch in range(21): 54 | #每个周期迭代n_batch个batch,每个batch为100 55 | for batch in range(n_batch): 56 | #获得一个batch的数据和标签 57 | batch_xs,batch_ys = mnist.train.next_batch(batch_size) 58 | #通过feed喂到模型中进行训练 59 | 
sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys}) 60 | 61 | #计算准确率 62 | acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels}) 63 | print("Iter " + str(epoch) + ",Testing Accuracy " + str(acc)) 64 | 65 | 66 | # In[ ]: 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/MNIST分类.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # 51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # 优酷频道:http://i.youku.com/sdxxqbf
6 | # 微信公众号:深度学习与神经网络
7 | # Github:https://github.com/Qinbf
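#
# Editor's note -- a hedged sketch, not part of the original lesson: the
# network below minimizes a quadratic cost on the softmax output. A common
# variant is to feed the raw logits into softmax cross-entropy, which
# usually converges faster on MNIST. Assuming the same x, y, W, b defined
# below, the swap would look like:
#
#     logits = tf.matmul(x, W) + b
#     prediction = tf.nn.softmax(logits)
#     loss = tf.reduce_mean(
#         tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
#     train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)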
8 | 9 | # In[2]: 10 | 11 | import tensorflow as tf 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | 14 | 15 | # In[3]: 16 | 17 | #载入数据集 18 | mnist = input_data.read_data_sets("MNIST_data",one_hot=True) 19 | 20 | #每个批次100张照片 21 | batch_size = 100 22 | #计算一共有多少个批次 23 | n_batch = mnist.train.num_examples // batch_size 24 | 25 | #定义两个placeholder 26 | x = tf.placeholder(tf.float32,[None,784]) 27 | y = tf.placeholder(tf.float32,[None,10]) 28 | 29 | #创建一个简单的神经网络,输入层784个神经元,输出层10个神经元 30 | W = tf.Variable(tf.zeros([784,10])) 31 | b = tf.Variable(tf.zeros([10])) 32 | prediction = tf.nn.softmax(tf.matmul(x,W)+b) 33 | 34 | #二次代价函数 35 | #square是求平方 36 | #reduce_mean是求平均值 37 | loss = tf.reduce_mean(tf.square(y-prediction)) 38 | 39 | #使用梯度下降法来最小化loss,学习率是0.2 40 | train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss) 41 | 42 | #初始化变量 43 | init = tf.global_variables_initializer() 44 | 45 | #结果存放在一个布尔型列表中 46 | correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置 47 | #求准确率 48 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型 49 | 50 | with tf.Session() as sess: 51 | #执行初始化 52 | sess.run(init) 53 | #迭代21个周期 54 | for epoch in range(21): 55 | #每个周期迭代n_batch个batch,每个batch为100 56 | for batch in range(n_batch): 57 | #获得一个batch的数据和标签 58 | batch_xs,batch_ys = mnist.train.next_batch(batch_size) 59 | #通过feed喂到模型中进行训练 60 | sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys}) 61 | 62 | #计算准确率 63 | acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels}) 64 | print("Iter " + str(epoch) + ",Testing Accuracy " + str(acc)) 65 | 66 | 67 | # In[ ]: 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/5下载google图像识别网络inception-v3.py: -------------------------------------------------------------------------------- 1 | #51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html 2 | #优酷频道:http://i.youku.com/sdxxqbf 3 | #微信公众号:深度学习与神经网络 4 | #Github:https://github.com/Qinbf 5 | 6 | # coding: utf-8 7 | 8 | # In[1]: 9 | 10 | import tensorflow as tf 11 | import os 12 | import tarfile 13 | import requests 14 | 15 | 16 | # In[ ]: 17 | 18 | #inception模型下载地址 19 | inception_pretrain_model_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz' 20 | 21 | #模型存放地址,存放在当前目录下inception_model文件夹下 22 | inception_pretrain_model_dir = "inception_model" 23 | if not os.path.exists(inception_pretrain_model_dir): 24 | os.makedirs(inception_pretrain_model_dir) 25 | 26 | #获取文件名,以及文件路径 27 | filename = inception_pretrain_model_url.split('/')[-1] 28 | filepath = os.path.join(inception_pretrain_model_dir, filename) 29 | 30 | #下载模型 31 | if not os.path.exists(filepath): 32 | print("download: ", filename) 33 | r = requests.get(inception_pretrain_model_url, stream=True) 34 | with open(filepath, 'wb') as f: 35 | for chunk in r.iter_content(chunk_size=1024): 36 | if chunk: 37 | f.write(chunk) 38 | print("finish: ", filename) 39 | 40 | #解压文件 41 | tarfile.open(filepath, 'r:gz').extractall(inception_pretrain_model_dir) 42 | 43 | #模型结构存放文件 44 | log_dir = 'inception_log' 45 | if not os.path.exists(log_dir): 46 | os.makedirs(log_dir) 47 | 48 | #classify_image_graph_def.pb为google训练好的模型 49 | inception_graph_def_file = os.path.join(inception_pretrain_model_dir, 'classify_image_graph_def.pb') 50 | with tf.Session() as sess: 51 | #创建一个图来存放google训练好的模型 52 | with tf.gfile.FastGFile(inception_graph_def_file, 'rb') as f: 53 | 
graph_def = tf.GraphDef() 54 | graph_def.ParseFromString(f.read()) 55 | tf.import_graph_def(graph_def, name='') 56 | #保存图的结构 57 | writer = tf.summary.FileWriter(log_dir, sess.graph) 58 | writer.close() 59 | 60 | 61 | # In[ ]: 62 | 63 | 64 | 65 | 66 | # In[ ]: 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/zhihu_predict.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # 51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # 优酷频道:http://i.youku.com/sdxxqbf
6 | # 微信公众号:深度学习与神经网络
7 | # Github:https://github.com/Qinbf
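#
# Editor's note -- what this script does, with a minimal sketch: predict.txt
# holds five predicted class indices (0-1998) per question; topic_dict maps
# each index back to the original topic id taken from topic_info.txt, and the
# result is joined with the question ids to produce final_predict.csv.
# Illustration (the two ids are real rows 0 and 3 of topic_info.txt, the
# rest is assumed):
#
#     topic_dict = {0: 738845194850773558, 3: 7739004195693774975}
#     line = ['3', '0']                      # indices read from predict.txt
#     [topic_dict[int(i)] for i in line]
#     # -> [7739004195693774975, 738845194850773558]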
8 | 9 | # In[1]: 10 | 11 | import pandas as pd 12 | from tqdm import tqdm 13 | import re 14 | import numpy as np 15 | from six.moves import xrange 16 | 17 | 18 | # In[2]: 19 | 20 | topic_info = pd.read_table("./ieee_zhihu_cup/topic_info.txt",sep='\t',header=None) 21 | print(topic_info.iloc[0:5]) 22 | 23 | 24 | # In[3]: 25 | 26 | # 话题字典 27 | topic_dict = {} 28 | for i in xrange(topic_info.shape[0]): 29 | topic_dict[i] = topic_info.iloc[i][0] 30 | 31 | 32 | # In[4]: 33 | 34 | predict = open('predict.txt', "r") 35 | examples = predict.readlines() 36 | text = np.array([line.split(" ") for line in examples]) 37 | 38 | 39 | # In[5]: 40 | 41 | label = [] 42 | for line in tqdm(text): 43 | num2label = [] 44 | for i in xrange(5): 45 | num2label.append(topic_dict[int(line[i])]) # 把0-1999编号转成原来的id 46 | label.append(num2label) 47 | label = np.array(label) 48 | 49 | 50 | # In[6]: 51 | 52 | np.savetxt("temp.txt",label,fmt='%d') 53 | 54 | 55 | # In[7]: 56 | 57 | def clean_str(string): 58 | string = re.sub(r" ", ",", string) 59 | return string 60 | 61 | file1 = open('temp.txt', "r") 62 | examples = file1.readlines() 63 | examples = [clean_str(line) for line in examples] 64 | file1.close() 65 | 66 | file1 = open('temp.txt', "w") 67 | file1.writelines(examples) 68 | file1.close() 69 | 70 | 71 | # In[8]: 72 | 73 | # predict文件导入 74 | predict_file = 'temp.txt' 75 | predict_reader = pd.read_table(predict_file,sep=' ',header=None) 76 | print(predict_reader.iloc[0:5]) 77 | 78 | 79 | # In[9]: 80 | 81 | # 导入question_train_set 82 | eval_reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\t',header=None) 83 | print(eval_reader.iloc[0:3]) 84 | 85 | 86 | # In[10]: 87 | 88 | final_predict = pd.concat([eval_reader.ix[:,0],predict_reader],axis=1) 89 | print(final_predict.iloc[0:5]) 90 | 91 | 92 | # In[11]: 93 | 94 | final_predict.to_csv('temp.txt', header=None, index=None, sep=',') 95 | 96 | final_file = open('temp.txt', "r") 97 | final_examples = final_file.readlines() 98 | final_examples = [re.sub(r'"',"",line) for line in final_examples] 99 | final_file.close() 100 | 101 | final_file = open('final_predict.csv', "w") 102 | final_file.writelines(final_examples) 103 | final_file.close() 104 | 105 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/3Fetch_and_Feed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n", 12 | "#优酷频道:http://i.youku.com/sdxxqbf\n", 13 | "#微信公众号:深度学习与神经网络\n", 14 | "#Github:https://github.com/Qinbf" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import tensorflow as tf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "[21.0, 7.0]\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "#Fetch:可以在session中同时计算多个op\n", 45 | "#定义三个常量\n", 46 | "input1 = tf.constant(3.0)\n", 47 | "input2 = tf.constant(2.0)\n", 48 | "input3 = tf.constant(5.0)\n", 49 | "#定义一个加法op\n", 50 | "add = tf.add(input2,input3)\n", 51 | "#定义一个乘法op\n", 52 | "mul = tf.multiply(input1,add)\n", 53 | "\n", 54 | 
"with tf.Session() as sess:\n", 55 | " #同时执行乘法op和加法op\n", 56 | " result = sess.run([mul,add])\n", 57 | " print(result)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "[ 16.]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "#Feed:先定义占位符,等需要的时候再传入数据\n", 77 | "#创建占位符\n", 78 | "input1 = tf.placeholder(tf.float32)\n", 79 | "input2 = tf.placeholder(tf.float32)\n", 80 | "#定义乘法op\n", 81 | "output = tf.multiply(input1,input2)\n", 82 | "\n", 83 | "with tf.Session() as sess:\n", 84 | " #feed的数据以字典的形式传入\n", 85 | " print(sess.run(output,feed_dict={input1:[8.],input2:[2.]}))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | } 97 | ], 98 | "metadata": { 99 | "anaconda-cloud": {}, 100 | "kernelspec": { 101 | "display_name": "Python [default]", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.5.2" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 1 120 | } 121 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/1创建图,启动图.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n", 12 | "#优酷频道:http://i.youku.com/sdxxqbf\n", 13 | "#微信公众号:深度学习与神经网络\n", 14 | "#Github:https://github.com/Qinbf" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import tensorflow as tf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Tensor(\"MatMul:0\", shape=(1, 1), dtype=int32)\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "#创建一个常量op\n", 45 | "m1 = tf.constant([[3,3]])\n", 46 | "#创建一个常量op\n", 47 | "m2 = tf.constant([[2],[3]])\n", 48 | "#创建一个矩阵乘法op,把m1和m2传入\n", 49 | "product = tf.matmul(m1,m2)\n", 50 | "#这个时候打印product,只能看到product的属性,不能计算它的值\n", 51 | "print(product)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "[[15]]\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "#第一种定义会话的方式:\n", 71 | "#定义一个会话,启动默认图\n", 72 | "sess = tf.Session()\n", 73 | "#调用sess的run方法来执行矩阵乘法op\n", 74 | "#run(product)触发了图中3个op\n", 75 | "result = sess.run(product)\n", 76 | "print(result)\n", 77 | "sess.close()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "[[15]]\n" 92 | ] 93 | } 94 | ], 95 | "source": 
[ 96 | "#第二种定义会话的方式:\n", 97 | "with tf.Session() as sess:\n", 98 | " #调用sess的run方法来执行矩阵乘法op\n", 99 | " #run(product)触发了图中3个op\n", 100 | " result = sess.run(product)\n", 101 | " print(result)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "anaconda-cloud": {}, 116 | "kernelspec": { 117 | "display_name": "Python [default]", 118 | "language": "python", 119 | "name": "python3" 120 | }, 121 | "language_info": { 122 | "codemirror_mode": { 123 | "name": "ipython", 124 | "version": 3 125 | }, 126 | "file_extension": ".py", 127 | "mimetype": "text/x-python", 128 | "name": "python", 129 | "nbconvert_exporter": "python", 130 | "pygments_lexer": "ipython3", 131 | "version": "3.5.2" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 1 136 | } 137 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/data_handle.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # 51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # 优酷频道:http://i.youku.com/sdxxqbf
6 | # 微信公众号:深度学习与神经网络
7 | # Github:https://github.com/Qinbf
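#
# Editor's note -- a hedged sketch of the id-to-number mapping built below:
# each topic id becomes its row number 0-1998 in topic_info.txt, and the
# comma-separated label column is rewritten with those numbers. Using two
# real ids from the file (rows 0 and 3):
#
#     topic_dict = {738845194850773558: 0, 7739004195693774975: 3}
#     ','.join(str(topic_dict[int(t)]) for t in
#              '7739004195693774975,738845194850773558'.split(','))
#     # -> '3,0'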
8 | # 9 | # question_train_set.txt: 10 | # 第一列为 问题id; 11 | # 第二列为 title 的字符编号序列; 12 | # 第三列为 title 的词语编号序列; 13 | # 第四列为描述的字符编号序列; 14 | # 第五列为描述的词语标号序列。 15 | # 16 | # question_topic_train_set.txt: 17 | # 第一列 问题 id; 18 | # 第二列 话题 id。 19 | # 20 | # topic_info.txt: 21 | # 第一列为话题 id 22 | # 第二列为话题的父话题 id。话题之间是有向无环图结构,一个话题可能有 0 到多个父话题; 23 | # 第三列为话题名字的字符编号序列; 24 | # 第四列为话题名字的词语编号序列; 25 | # 第五列为话题描述的字符编号序列; 26 | # 第六列为话题描述的词语编号序列。 27 | # 28 | # 1.title通常来说包含的信息最重要。对于question_train_set.txt文件,为了简单起见,我们只取第三列,title的词语编号序列。 29 | # 2.对于topic_info.txt,为了简单起见,我们不考虑2,3,4,5,6列。只是简单的提取话题id,然后转为0-1998的数字(一共有1999个话题) 30 | # 3.然后合并以上一些数据,得到最后处理后的数据。 31 | 32 | # In[1]: 33 | 34 | import pandas as pd 35 | from tqdm import tqdm # pip install tqdm 36 | from six.moves import xrange 37 | 38 | 39 | # In[2]: 40 | 41 | # 导入question_train_set 42 | reader = pd.read_table('./ieee_zhihu_cup/question_train_set.txt',sep='\t',header=None) 43 | print(reader.iloc[0:5]) 44 | 45 | 46 | # In[3]: 47 | 48 | # 导入question_topic_eval_set 49 | topic_reader = pd.read_table('./ieee_zhihu_cup/question_topic_train_set.txt',sep='\t',header=None) 50 | print(topic_reader.iloc[0:5]) 51 | 52 | 53 | # In[4]: 54 | 55 | # 合并title 的词语编号序列和话题 id 56 | data_topic = pd.concat([reader.ix[:,2], topic_reader.ix[:,1]], axis=1, ignore_index=True) 57 | print(data_topic.iloc[0:5]) 58 | 59 | 60 | # In[5]: 61 | 62 | # 导入topic_info 63 | label_reader = pd.read_table('./ieee_zhihu_cup/topic_info.txt',sep='\t',header=None) 64 | print(label_reader.iloc[0:5]) 65 | 66 | 67 | # In[6]: 68 | 69 | # 把标签转为0-1998的编号 70 | labels = list(label_reader.iloc[:,0]) 71 | my_labels = [] 72 | for label in labels: 73 | my_labels.append(label) 74 | 75 | # 建立topic字典 76 | topic_dict = {} 77 | for i,label in enumerate(my_labels): 78 | topic_dict[label] = i 79 | 80 | print(topic_dict[7739004195693774975]) 81 | 82 | 83 | # In[7]: 84 | 85 | for i in tqdm(xrange(data_topic.shape[0])): 86 | new_label = '' 87 | # 根据“,”切分话题id 88 | temp_topic = data_topic.iloc[i][1].split(',') 89 | for topic in temp_topic: 90 | # 判断该label是否在label文件中,并得到该行 91 | label_num = topic_dict[int(topic)] 92 | new_label = new_label + str(label_num) + ',' 93 | data_topic.iloc[i][1] = new_label[:-1] 94 | print(data_topic.iloc[:5]) 95 | 96 | 97 | # In[8]: 98 | 99 | # 保存处理过后的文件 100 | data_topic.to_csv("./ieee_zhihu_cup/data_topic.txt", header=None, index=None, sep='\t') 101 | 102 | # 切分成10块保存 103 | for i in xrange(10): 104 | data_topic_filename = './ieee_zhihu_cup/data_topic_block_' + str(i) + '.txt' 105 | if (i+1)*300000 < data_topic.shape[0]: 106 | data_topic.iloc[i*300000:(i+1)*300000].to_csv( 107 | data_topic_filename, header=None, index=None, sep='\t') 108 | else: 109 | data_topic.iloc[i*300000:data_topic.shape[0]].to_csv( 110 | data_topic_filename, header=None, index=None, sep='\t') 111 | 112 | 113 | # In[ ]: 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/2变量.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n", 12 | "#优酷频道:http://i.youku.com/sdxxqbf\n", 13 | "#微信公众号:深度学习与神经网络\n", 14 | "#Github:https://github.com/Qinbf" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | 
"source": [ 25 | "import tensorflow as tf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "[-2 -1]\n", 40 | "[-1 1]\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "#定义一个变量\n", 46 | "x = tf.Variable([1,2])\n", 47 | "#定义一个常量\n", 48 | "a = tf.constant([3,3])\n", 49 | "#增加一个减法op\n", 50 | "sub = tf.subtract(x,a)\n", 51 | "#增加一个加法op\n", 52 | "add = tf.add(x,sub)\n", 53 | "\n", 54 | "#所有变量初始化\n", 55 | "init = tf.global_variables_initializer()\n", 56 | "\n", 57 | "with tf.Session() as sess:\n", 58 | " #执行变量初始化\n", 59 | " sess.run(init)\n", 60 | " print(sess.run(sub))\n", 61 | " print(sess.run(add))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "0\n", 76 | "1\n", 77 | "2\n", 78 | "3\n", 79 | "4\n", 80 | "5\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "#创建一个变量初始化为0\n", 86 | "state = tf.Variable(0,name='counter')\n", 87 | "#创建一个op,作用是使state加1\n", 88 | "new_value = tf.add(state,1)\n", 89 | "#赋值op\n", 90 | "update = tf.assign(state,new_value)\n", 91 | "#所有变量初始化\n", 92 | "init = tf.global_variables_initializer()\n", 93 | "\n", 94 | "with tf.Session() as sess:\n", 95 | " #执行变量初始化\n", 96 | " sess.run(init)\n", 97 | " print(sess.run(state))\n", 98 | " for _ in range(5):\n", 99 | " sess.run(update)\n", 100 | " print(sess.run(state))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "anaconda-cloud": {}, 115 | "kernelspec": { 116 | "display_name": "Python [default]", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.5.2" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 1 135 | } 136 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/5下载google图像识别网络inception-v3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n", 12 | "#优酷频道:http://i.youku.com/sdxxqbf\n", 13 | "#微信公众号:深度学习与神经网络\n", 14 | "#Github:https://github.com/Qinbf" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import tensorflow as tf\n", 26 | "import os\n", 27 | "import tarfile\n", 28 | "import requests" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "finish: inception-2015-12-05.tgz\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "#inception模型下载地址\n", 48 | "inception_pretrain_model_url = 
'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'\n", 49 | "\n", 50 | "#模型存放地址,存放在当前目录下inception_model文件夹下\n", 51 | "inception_pretrain_model_dir = \"inception_model\"\n", 52 | "if not os.path.exists(inception_pretrain_model_dir):\n", 53 | " os.makedirs(inception_pretrain_model_dir)\n", 54 | " \n", 55 | "#获取文件名,以及文件路径\n", 56 | "filename = inception_pretrain_model_url.split('/')[-1]\n", 57 | "filepath = os.path.join(inception_pretrain_model_dir, filename)\n", 58 | "\n", 59 | "#下载模型\n", 60 | "if not os.path.exists(filepath):\n", 61 | " print(\"download: \", filename)\n", 62 | " r = requests.get(inception_pretrain_model_url, stream=True)\n", 63 | " with open(filepath, 'wb') as f:\n", 64 | " for chunk in r.iter_content(chunk_size=1024):\n", 65 | " if chunk:\n", 66 | " f.write(chunk)\n", 67 | "print(\"finish: \", filename)\n", 68 | "\n", 69 | "#解压文件\n", 70 | "tarfile.open(filepath, 'r:gz').extractall(inception_pretrain_model_dir)\n", 71 | " \n", 72 | "#模型结构存放文件\n", 73 | "log_dir = 'inception_log'\n", 74 | "if not os.path.exists(log_dir):\n", 75 | " os.makedirs(log_dir)\n", 76 | "\n", 77 | "#classify_image_graph_def.pb为google训练好的模型\n", 78 | "inception_graph_def_file = os.path.join(inception_pretrain_model_dir, 'classify_image_graph_def.pb')\n", 79 | "with tf.Session() as sess:\n", 80 | " #创建一个图来存放google训练好的模型\n", 81 | " with tf.gfile.FastGFile(inception_graph_def_file, 'rb') as f:\n", 82 | " graph_def = tf.GraphDef()\n", 83 | " graph_def.ParseFromString(f.read())\n", 84 | " tf.import_graph_def(graph_def, name='')\n", 85 | " #保存图的结构\n", 86 | " writer = tf.summary.FileWriter(log_dir, sess.graph)\n", 87 | " writer.close()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [] 98 | } 99 | ], 100 | "metadata": { 101 | "anaconda-cloud": {}, 102 | "kernelspec": { 103 | "display_name": "Python [default]", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.5.2" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 1 122 | } 123 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/6使用inception-v3做各种图像的识别.py: -------------------------------------------------------------------------------- 1 | #51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html 2 | #优酷频道:http://i.youku.com/sdxxqbf 3 | #微信公众号:深度学习与神经网络 4 | #Github:https://github.com/Qinbf 5 | 6 | # coding: utf-8 7 | 8 | # In[4]: 9 | 10 | import tensorflow as tf 11 | import os 12 | import numpy as np 13 | import re 14 | from PIL import Image 15 | import matplotlib.pyplot as plt 16 | 17 | 18 | # In[5]: 19 | 20 | class NodeLookup(object): 21 | def __init__(self): 22 | label_lookup_path = 'inception_model/imagenet_2012_challenge_label_map_proto.pbtxt' 23 | uid_lookup_path = 'inception_model/imagenet_synset_to_human_label_map.txt' 24 | self.node_lookup = self.load(label_lookup_path, uid_lookup_path) 25 | 26 | def load(self, label_lookup_path, uid_lookup_path): 27 | # 加载分类字符串n********对应分类名称的文件 28 | proto_as_ascii_lines = tf.gfile.GFile(uid_lookup_path).readlines() 29 | uid_to_human = {} 30 | #匹配0或多个n或数字,匹配0或多个空格,非空白字符,逗号 31 | p = re.compile(r'[n\d]*[ 
\S,]*') 32 | for line in proto_as_ascii_lines: 33 | parsed_items = p.findall(line) 34 | #获取编号字符串n******** 35 | uid = parsed_items[0] 36 | #获取分类名称 37 | human_string = parsed_items[2] 38 | #保存编号字符串n********与分类名称映射关系 39 | uid_to_human[uid] = human_string 40 | 41 | # 加载分类字符串n********对应分类编号1-1000的文件 42 | proto_as_ascii = tf.gfile.GFile(label_lookup_path).readlines() 43 | node_id_to_uid = {} 44 | for line in proto_as_ascii: 45 | if line.startswith(' target_class:'): 46 | #获取分类编号1-1000 47 | target_class = int(line.split(': ')[1]) 48 | if line.startswith(' target_class_string:'): 49 | #获取编号字符串n******** 50 | target_class_string = line.split(': ')[1] 51 | #保存分类编号1-1000与编号字符串n********映射关系 52 | node_id_to_uid[target_class] = target_class_string[1:-2] 53 | 54 | #建立分类编号1-1000对应分类名称的映射关系 55 | node_id_to_name = {} 56 | for key, val in node_id_to_uid.items(): 57 | #获取分类名称 58 | name = uid_to_human[val] 59 | #建立分类编号1-1000到分类名称的映射关系 60 | node_id_to_name[key] = name 61 | return node_id_to_name 62 | 63 | #传入分类编号1-1000返回分类名称 64 | def id_to_string(self, node_id): 65 | if node_id not in self.node_lookup: 66 | return '' 67 | return self.node_lookup[node_id] 68 | 69 | 70 | #创建一个图来存放google训练好的模型 71 | with tf.gfile.FastGFile('inception_model/classify_image_graph_def.pb', 'rb') as f: 72 | graph_def = tf.GraphDef() 73 | graph_def.ParseFromString(f.read()) 74 | tf.import_graph_def(graph_def, name='') 75 | 76 | 77 | with tf.Session() as sess: 78 | softmax_tensor = sess.graph.get_tensor_by_name('softmax:0') 79 | #遍历目录 80 | for root,dirs,files in os.walk('images/'): 81 | for file in files: 82 | #载入图片 83 | image_data = tf.gfile.FastGFile(os.path.join(root,file), 'rb').read() 84 | predictions = sess.run(softmax_tensor,{'DecodeJpeg/contents:0': image_data})#图片格式是jpg格式 85 | predictions = np.squeeze(predictions)#把结果转为1维数据 86 | 87 | #打印图片路径及名称 88 | image_path = os.path.join(root,file) 89 | print(image_path) 90 | #显示图片 91 | img=Image.open(image_path) 92 | plt.imshow(img) 93 | plt.axis('off') 94 | plt.show() 95 | 96 | #排序 97 | top_k = predictions.argsort()[-5:][::-1] 98 | node_lookup = NodeLookup() 99 | for node_id in top_k: 100 | #获取分类名称 101 | human_string = node_lookup.id_to_string(node_id) 102 | #获取该分类的置信度 103 | score = predictions[node_id] 104 | print('%s (score = %.5f)' % (human_string, score)) 105 | print() 106 | 107 | 108 | # In[ ]: 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/MNIST分类.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n", 10 | "优酷频道:http://i.youku.com/sdxxqbf
\n", 11 | "微信公众号:深度学习与神经网络
\n", 12 | "Github:https://github.com/Qinbf
" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import tensorflow as tf\n", 24 | "from tensorflow.examples.tutorials.mnist import input_data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "Extracting MNIST_data\\train-images-idx3-ubyte.gz\n", 39 | "Extracting MNIST_data\\train-labels-idx1-ubyte.gz\n", 40 | "Extracting MNIST_data\\t10k-images-idx3-ubyte.gz\n", 41 | "Extracting MNIST_data\\t10k-labels-idx1-ubyte.gz\n", 42 | "Iter 0,Testing Accuracy 0.8304\n", 43 | "Iter 1,Testing Accuracy 0.8702\n", 44 | "Iter 2,Testing Accuracy 0.8821\n", 45 | "Iter 3,Testing Accuracy 0.8884\n", 46 | "Iter 4,Testing Accuracy 0.894\n", 47 | "Iter 5,Testing Accuracy 0.8968\n", 48 | "Iter 6,Testing Accuracy 0.9011\n", 49 | "Iter 7,Testing Accuracy 0.9019\n", 50 | "Iter 8,Testing Accuracy 0.9034\n", 51 | "Iter 9,Testing Accuracy 0.9049\n", 52 | "Iter 10,Testing Accuracy 0.9057\n", 53 | "Iter 11,Testing Accuracy 0.9073\n", 54 | "Iter 12,Testing Accuracy 0.9081\n", 55 | "Iter 13,Testing Accuracy 0.9088\n", 56 | "Iter 14,Testing Accuracy 0.9098\n", 57 | "Iter 15,Testing Accuracy 0.9108\n", 58 | "Iter 16,Testing Accuracy 0.9118\n", 59 | "Iter 17,Testing Accuracy 0.9123\n", 60 | "Iter 18,Testing Accuracy 0.9127\n", 61 | "Iter 19,Testing Accuracy 0.9137\n", 62 | "Iter 20,Testing Accuracy 0.9138\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "#载入数据集\n", 68 | "mnist = input_data.read_data_sets(\"MNIST_data\",one_hot=True)\n", 69 | "\n", 70 | "#每个批次100张照片\n", 71 | "batch_size = 100\n", 72 | "#计算一共有多少个批次\n", 73 | "n_batch = mnist.train.num_examples // batch_size\n", 74 | "\n", 75 | "#定义两个placeholder\n", 76 | "x = tf.placeholder(tf.float32,[None,784])\n", 77 | "y = tf.placeholder(tf.float32,[None,10])\n", 78 | "\n", 79 | "#创建一个简单的神经网络,输入层784个神经元,输出层10个神经元\n", 80 | "W = tf.Variable(tf.zeros([784,10]))\n", 81 | "b = tf.Variable(tf.zeros([10]))\n", 82 | "prediction = tf.nn.softmax(tf.matmul(x,W)+b)\n", 83 | "\n", 84 | "#二次代价函数\n", 85 | "#square是求平方\n", 86 | "#reduce_mean是求平均值\n", 87 | "loss = tf.reduce_mean(tf.square(y-prediction))\n", 88 | "\n", 89 | "#使用梯度下降法来最小化loss,学习率是0.2\n", 90 | "train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n", 91 | "\n", 92 | "#初始化变量\n", 93 | "init = tf.global_variables_initializer()\n", 94 | "\n", 95 | "#结果存放在一个布尔型列表中\n", 96 | "correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置\n", 97 | "#求准确率\n", 98 | "accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型\n", 99 | "\n", 100 | "with tf.Session() as sess:\n", 101 | " #执行初始化\n", 102 | " sess.run(init)\n", 103 | " #迭代21个周期\n", 104 | " for epoch in range(21):\n", 105 | " #每个周期迭代n_batch个batch,每个batch为100\n", 106 | " for batch in range(n_batch):\n", 107 | " #获得一个batch的数据和标签\n", 108 | " batch_xs,batch_ys = mnist.train.next_batch(batch_size)\n", 109 | " #通过feed喂到模型中进行训练\n", 110 | " sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})\n", 111 | " \n", 112 | " #计算准确率\n", 113 | " acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})\n", 114 | " print(\"Iter \" + str(epoch) + \",Testing Accuracy \" + str(acc))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": true 
122 | }, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "anaconda-cloud": {}, 129 | "kernelspec": { 130 | "display_name": "Python [default]", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.5.2" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 1 149 | } 150 | -------------------------------------------------------------------------------- /Tensorflow基础使用与图像识别应用/程序/4MNIST分类.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n", 12 | "#优酷频道:http://i.youku.com/sdxxqbf\n", 13 | "#微信公众号:深度学习与神经网络\n", 14 | "#Github:https://github.com/Qinbf" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import tensorflow as tf\n", 26 | "from tensorflow.examples.tutorials.mnist import input_data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Extracting MNIST_data\\train-images-idx3-ubyte.gz\n", 41 | "Extracting MNIST_data\\train-labels-idx1-ubyte.gz\n", 42 | "Extracting MNIST_data\\t10k-images-idx3-ubyte.gz\n", 43 | "Extracting MNIST_data\\t10k-labels-idx1-ubyte.gz\n", 44 | "Iter 0,Testing Accuracy 0.8304\n", 45 | "Iter 1,Testing Accuracy 0.8702\n", 46 | "Iter 2,Testing Accuracy 0.8821\n", 47 | "Iter 3,Testing Accuracy 0.8884\n", 48 | "Iter 4,Testing Accuracy 0.894\n", 49 | "Iter 5,Testing Accuracy 0.8968\n", 50 | "Iter 6,Testing Accuracy 0.9011\n", 51 | "Iter 7,Testing Accuracy 0.9019\n", 52 | "Iter 8,Testing Accuracy 0.9034\n", 53 | "Iter 9,Testing Accuracy 0.9049\n", 54 | "Iter 10,Testing Accuracy 0.9057\n", 55 | "Iter 11,Testing Accuracy 0.9073\n", 56 | "Iter 12,Testing Accuracy 0.9081\n", 57 | "Iter 13,Testing Accuracy 0.9088\n", 58 | "Iter 14,Testing Accuracy 0.9098\n", 59 | "Iter 15,Testing Accuracy 0.9108\n", 60 | "Iter 16,Testing Accuracy 0.9118\n", 61 | "Iter 17,Testing Accuracy 0.9123\n", 62 | "Iter 18,Testing Accuracy 0.9127\n", 63 | "Iter 19,Testing Accuracy 0.9137\n", 64 | "Iter 20,Testing Accuracy 0.9138\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "#载入数据集\n", 70 | "mnist = input_data.read_data_sets(\"MNIST_data\",one_hot=True)\n", 71 | "\n", 72 | "#每个批次100张照片\n", 73 | "batch_size = 100\n", 74 | "#计算一共有多少个批次\n", 75 | "n_batch = mnist.train.num_examples // batch_size\n", 76 | "\n", 77 | "#定义两个placeholder\n", 78 | "x = tf.placeholder(tf.float32,[None,784])\n", 79 | "y = tf.placeholder(tf.float32,[None,10])\n", 80 | "\n", 81 | "#创建一个简单的神经网络,输入层784个神经元,输出层10个神经元\n", 82 | "W = tf.Variable(tf.zeros([784,10]))\n", 83 | "b = tf.Variable(tf.zeros([10]))\n", 84 | "prediction = tf.nn.softmax(tf.matmul(x,W)+b)\n", 85 | "\n", 86 | "#二次代价函数\n", 87 | "#square是求平方\n", 88 | "#reduce_mean是求平均值\n", 89 | "loss = tf.reduce_mean(tf.square(y-prediction))\n", 90 | "\n", 91 | "#使用梯度下降法来最小化loss,学习率是0.2\n", 92 | 
"train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n", 93 | "\n", 94 | "#初始化变量\n", 95 | "init = tf.global_variables_initializer()\n", 96 | "\n", 97 | "#结果存放在一个布尔型列表中\n", 98 | "correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置\n", 99 | "#求准确率\n", 100 | "accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型\n", 101 | "\n", 102 | "with tf.Session() as sess:\n", 103 | " #执行初始化\n", 104 | " sess.run(init)\n", 105 | " #迭代21个周期\n", 106 | " for epoch in range(21):\n", 107 | " #每个周期迭代n_batch个batch,每个batch为100\n", 108 | " for batch in range(n_batch):\n", 109 | " #获得一个batch的数据和标签\n", 110 | " batch_xs,batch_ys = mnist.train.next_batch(batch_size)\n", 111 | " #通过feed喂到模型中进行训练\n", 112 | " sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})\n", 113 | " \n", 114 | " #计算准确率\n", 115 | " acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})\n", 116 | " print(\"Iter \" + str(epoch) + \",Testing Accuracy \" + str(acc))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "anaconda-cloud": {}, 131 | "kernelspec": { 132 | "display_name": "Python [default]", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.5.2" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 1 151 | } 152 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/zhihu_eval.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # 51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # 优酷频道:http://i.youku.com/sdxxqbf
6 | # 微信公众号:深度学习与神经网络
7 | # Github:https://github.com/Qinbf
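#
# Editor's note -- worked example for the eval() score defined below (the
# Zhihu cup position-weighted F1), using the two tuples from its docstring:
#     ([1,2,3,4,5], [4,5,6,7])  hits at positions 3 and 4
#     ([3,2,1,4,7], [5,7,3])    hits at positions 0 and 4
# so right_label_at_pos_num = [1,0,0,1,2], right_label_num = 4,
# all_marked_label_num = 7, sample_num = 2, giving
#     precision = 0.5/ln(2) + 0.5/ln(5) + 1.0/ln(6) ~= 1.590
#     recall    = 4/7 ~= 0.571
#     score     = 2*precision*recall/(precision+recall) ~= 0.841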
8 | 9 | # In[1]: 10 | 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from tqdm import tqdm 15 | import tensorflow as tf 16 | import pickle 17 | import math 18 | from six.moves import xrange 19 | 20 | 21 | # In[2]: 22 | 23 | # 导入question_train_set 24 | reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\t',header=None) 25 | print(reader.iloc[0:5]) 26 | 27 | 28 | # In[3]: 29 | 30 | # 计算一段文本中最大词汇数 31 | x_text = reader.iloc[:,2] 32 | max_document_length = 0 33 | for i,line in enumerate(x_text): 34 | try: 35 | temp = line.split(',') 36 | max_document_length = max(max_document_length,len(temp)) 37 | except: 38 | # 其中有一行数据为空 39 | pass 40 | # x_text[i] = " " 41 | 42 | print("max_document_length:",max_document_length) 43 | 44 | # 载入字典 45 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore("vocab_dict") 46 | 47 | 48 | # In[4]: 49 | 50 | # 按','切分数据 51 | text = [] 52 | for line in x_text: 53 | try: 54 | text.append(line.split(',')) 55 | except: 56 | # 其中有一行数据为空 57 | text.append(' ') 58 | 59 | 60 | # In[5]: 61 | 62 | # 把数据集变成编号的形式 63 | x = [] 64 | for line in tqdm(text): 65 | line_len = len(line) 66 | text2num = [] 67 | for i in xrange(max_document_length): 68 | if(i < line_len): 69 | try: 70 | text2num.append(vocab_processor.vocabulary_.get(line[i])) # 把词转为数字 71 | except: 72 | text2num.append(0) # 没有对应的词 73 | else: 74 | text2num.append(0) # 填充0 75 | x.append(text2num) 76 | x = np.array(x) 77 | x[:5] 78 | 79 | 80 | # In[6]: 81 | 82 | def batch_iter(data, batch_size, num_epochs, shuffle=False): 83 | """ 84 | Generates a batch iterator for a dataset. 85 | """ 86 | data = np.array(data) 87 | data_size = len(data) 88 | # 每个epoch的num_batch 89 | num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 90 | print("num_batches_per_epoch:",num_batches_per_epoch) 91 | for epoch in range(num_epochs): 92 | # Shuffle the data at each epoch 93 | if shuffle: 94 | shuffle_indices = np.random.permutation(np.arange(data_size)) 95 | shuffled_data = data[shuffle_indices] 96 | else: 97 | shuffled_data = data 98 | for batch_num in range(num_batches_per_epoch): 99 | start_index = batch_num * batch_size 100 | end_index = min((batch_num + 1) * batch_size, data_size) 101 | yield shuffled_data[start_index:end_index] 102 | 103 | 104 | # In[7]: 105 | 106 | def eval(predict_label_and_marked_label_list): 107 | """ 108 | :param predict_label_and_marked_label_list: 一个元组列表。例如 109 | [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]), 110 | ([3, 2, 1, 4, 7], [5, 7, 3]) 111 | ] 112 | 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6] 113 | 114 | marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为 115 | [0,0,0,1,1] (4,5命中) 116 | [1,0,0,0,1] (3,7命中) 117 | 118 | """ 119 | right_label_num = 0 #总命中标签数量 120 | right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量 121 | sample_num = 0 #总问题数量 122 | all_marked_label_num = 0 #总标签数量 123 | for predict_labels, marked_labels in predict_label_and_marked_label_list: 124 | sample_num += 1 125 | marked_label_set = set(marked_labels) 126 | all_marked_label_num += len(marked_label_set) 127 | for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels): 128 | if label in marked_label_set: #命中 129 | right_label_num += 1 130 | right_label_at_pos_num[pos] += 1 131 | 132 | precision = 0.0 133 | for pos, right_num in zip(range(0, 5), right_label_at_pos_num): 134 | precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2 135 | recall = float(right_label_num) / all_marked_label_num 136 | 137 | return 
2*(precision * recall) / (precision + recall ) 138 | 139 | 140 | # In[8]: 141 | 142 | # 定义三个placeholder 143 | input_x = tf.placeholder(tf.int32, [None, x.shape[1]], name="input_x") 144 | dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 145 | 146 | # sequence_length-最长词汇数 147 | sequence_length=x.shape[1] 148 | # num_classes-分类数 149 | num_classes=1999 150 | # vocab_size-总词汇数 151 | vocab_size=len(vocab_processor.vocabulary_) 152 | # embedding_size-词向量长度 153 | embedding_size=256 154 | # filter_sizes-卷积核尺寸3,4,5 155 | filter_sizes=list(map(int, [3,4,5])) 156 | # num_filters-卷积核数量 157 | num_filters=1024 158 | 159 | Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="Weights") 160 | # [None, sequence_length, embedding_size] 161 | embedded_chars = tf.nn.embedding_lookup(Weights, input_x) 162 | # 添加一个维度,[None, sequence_length, embedding_size, 1] 163 | embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) 164 | # Create a convolution + maxpool layer for each filter size 165 | pooled_outputs = [] 166 | for i, filter_size in enumerate(filter_sizes): 167 | with tf.name_scope("conv-maxpool-%s" % filter_size): 168 | # Convolution Layer 169 | filter_shape = [filter_size, embedding_size, 1, num_filters] 170 | W = tf.Variable( 171 | tf.truncated_normal(filter_shape, stddev=0.1), name="W") 172 | b = tf.Variable( 173 | tf.constant(0.1, shape=[num_filters]), name="b") 174 | conv = tf.nn.conv2d( 175 | embedded_chars_expanded, 176 | W, 177 | strides=[1, 1, 1, 1], 178 | padding="VALID", 179 | name="conv") 180 | # Apply nonlinearity 181 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 182 | # Maxpooling over the outputs 183 | pooled = tf.nn.max_pool( 184 | h, 185 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 186 | strides=[1, 1, 1, 1], 187 | padding='VALID', 188 | name="pool") 189 | pooled_outputs.append(pooled) 190 | 191 | # Combine all the pooled features 192 | num_filters_total = num_filters * len(filter_sizes) 193 | print("num_filters_total:", num_filters_total) 194 | h_pool = tf.concat(pooled_outputs, 3) 195 | h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total]) 196 | 197 | # Add dropout 198 | with tf.name_scope("dropout"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob) 199 | 200 | # Final (unnormalized) scores and predictions 201 | with tf.name_scope("output"): 202 | W = tf.get_variable( 203 | "W", 204 | shape=[num_filters_total, num_classes], 205 | initializer=tf.contrib.layers.xavier_initializer()) 206 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 207 | scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores") 208 | 209 | 210 | # In[9]: 211 | 212 | # 选择模型 213 | checkpoint_file = "./models/model-10000" 214 | 215 | with tf.Session() as sess: 216 | predict_top_5 = tf.nn.top_k(scores, k=5) 217 | sess.run(tf.global_variables_initializer()) 218 | i = 0 219 | saver = tf.train.Saver() 220 | saver.restore(sess, checkpoint_file) 221 | 222 | # Generate batches 223 | batches = batch_iter(list(x), 1000, 1) 224 | 225 | for x_batch in batches: 226 | i = i + 1 227 | predict_5 = sess.run(predict_top_5,feed_dict={input_x:x_batch,dropout_keep_prob:1.0}) 228 | if i == 1: 229 | predict = predict_5[1] 230 | else: 231 | predict = np.concatenate((predict,predict_5[1])) 232 | if (i%5==0): 233 | print ("Evaluation:step",i) 234 | 235 | np.savetxt("predict.txt",predict,fmt='%d') 236 | 237 | 238 | # In[ ]: 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- 
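Editor's note: a minimal, self-contained sketch of the hand-off between zhihu_eval.py above and zhihu_predict.py -- the eval script keeps only the index tensor (element [1]) returned by tf.nn.top_k and writes it to predict.txt, which the predict script then maps back to topic ids. The scores below are fabricated for illustration:

import tensorflow as tf

scores = tf.constant([[0.1, 0.7, 0.05, 0.9, 0.3]])  # fake logits, 5 classes
top_k = tf.nn.top_k(scores, k=3)                     # (values, indices)
with tf.Session() as sess:
    print(sess.run(top_k[1]))  # [[3 1 4]] -- indices, highest score first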
/Tensorflow基础使用与文本分类应用/程序/zhihu_predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n", 8 | "优酷频道:http://i.youku.com/sdxxqbf
\n", 9 | "微信公众号:深度学习与神经网络
\n", 10 | "Github:https://github.com/Qinbf
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "from tqdm import tqdm\n", 23 | "import re\n", 24 | "import numpy as np\n", 25 | "from six.moves import xrange" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | " 0 1 \\\n", 40 | "0 738845194850773558 -5833678375673307423 \n", 41 | "1 3738968195649774859 2027693463582123305 \n", 42 | "2 4738849194894773882 1127459907694805235 \n", 43 | "3 7739004195693774975 2904932941037075699,1160326435131345730,725917... \n", 44 | "4 -7261194805221226386 -5833678375673307423 \n", 45 | "\n", 46 | " 2 3 4 \\\n", 47 | "0 c0,c1 w0 c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1... \n", 48 | "1 c39,c40 w24 c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49... \n", 49 | "2 c172,c31,c0,c1 w102 NaN \n", 50 | "3 c39,c40,c5,c173 w103 c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c... \n", 51 | "4 c36,c31,c45,c237 w148 c238,c239 \n", 52 | "\n", 53 | " 5 \n", 54 | "0 w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,... \n", 55 | "1 w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33... \n", 56 | "2 NaN \n", 57 | "3 w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107... \n", 58 | "4 w149,w150 \n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "topic_info = pd.read_table(\"./ieee_zhihu_cup/topic_info.txt\",sep='\\t',header=None)\n", 64 | "print(topic_info.iloc[0:5])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# 话题字典\n", 76 | "topic_dict = {}\n", 77 | "for i in xrange(topic_info.shape[0]):\n", 78 | " topic_dict[i] = topic_info.iloc[i][0]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "predict = open('predict.txt', \"r\")\n", 90 | "examples = predict.readlines()\n", 91 | "text = np.array([line.split(\" \") for line in examples])" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "100%|██████████| 217360/217360 [00:01<00:00, 160389.86it/s]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "label = []\n", 111 | "for line in tqdm(text):\n", 112 | " num2label = []\n", 113 | " for i in xrange(5):\n", 114 | " num2label.append(topic_dict[int(line[i])]) # 把0-1999编号转成原来的id\n", 115 | " label.append(num2label)\n", 116 | "label = np.array(label)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "np.savetxt(\"temp.txt\",label,fmt='%d')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 7, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "def clean_str(string):\n", 139 | " string = re.sub(r\" \", \",\", string)\n", 140 | " return string\n", 141 | "\n", 142 | "file1 = open('temp.txt', \"r\")\n", 143 | "examples = file1.readlines()\n", 144 | "examples = [clean_str(line) for line in examples]\n", 145 | "file1.close()\n", 146 | "\n", 147 | "file1 = open('temp.txt', 
\"w\")\n", 148 | "file1.writelines(examples)\n", 149 | "file1.close()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 8, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | " 0\n", 164 | "0 -3517637179126242000,-4653836020042332281,4715...\n", 165 | "1 3418451812342379591,2858911571784840089,238291...\n", 166 | "2 -7358589937244777363,-5265476641576484497,7477...\n", 167 | "3 -7046289575185911002,-4653836020042332281,-587...\n", 168 | "4 4715442001886462944,-8963554618409314978,11274...\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "# predict文件导入\n", 174 | "predict_file = 'temp.txt'\n", 175 | "predict_reader = pd.read_table(predict_file,sep=' ',header=None)\n", 176 | "print(predict_reader.iloc[0:5])" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 9, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | " 0 1 \\\n", 191 | "0 6215603645409872328 c924,c531,c102,c284,c188,c104,c98,c107,c11,c11... \n", 192 | "1 6649324930261961840 c346,c1549,c413,c294,c675,c504,c183,c74,c541,c... \n", 193 | "2 -4251899610700378615 c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10... \n", 194 | "\n", 195 | " 2 \\\n", 196 | "0 w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47... \n", 197 | "1 w40132,w1357,w1556,w1380,w2464,w33,w16791,w109... \n", 198 | "2 w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w... \n", 199 | "\n", 200 | " 3 \\\n", 201 | "0 c1128,c529,c636,c572,c1321,c139,c540,c223,c510... \n", 202 | "1 NaN \n", 203 | "2 c149,c148,c148,c42,c185,c95,c95,c186,c186,c186... \n", 204 | "\n", 205 | " 4 \n", 206 | "0 w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2... 
\n", 207 | "1 NaN \n", 208 | "2 NaN \n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "# 导入question_train_set\n", 214 | "eval_reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\\t',header=None)\n", 215 | "print(eval_reader.iloc[0:3])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | " 0 0\n", 230 | "0 6215603645409872328 -3517637179126242000,-4653836020042332281,4715...\n", 231 | "1 6649324930261961840 3418451812342379591,2858911571784840089,238291...\n", 232 | "2 -4251899610700378615 -7358589937244777363,-5265476641576484497,7477...\n", 233 | "3 6213817087034420233 -7046289575185911002,-4653836020042332281,-587...\n", 234 | "4 -8930652370334418373 4715442001886462944,-8963554618409314978,11274...\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "final_predict = pd.concat([eval_reader.ix[:,0],predict_reader],axis=1)\n", 240 | "print(final_predict.iloc[0:5])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "final_predict.to_csv('temp.txt', header=None, index=None, sep=',')\n", 252 | "\n", 253 | "final_file = open('temp.txt', \"r\")\n", 254 | "final_examples = final_file.readlines()\n", 255 | "final_examples = [re.sub(r'\"',\"\",line) for line in final_examples]\n", 256 | "final_file.close()\n", 257 | "\n", 258 | "final_file = open('final_predict.csv', \"w\")\n", 259 | "final_file.writelines(final_examples)\n", 260 | "final_file.close()" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "anaconda-cloud": {}, 266 | "kernelspec": { 267 | "display_name": "Python [default]", 268 | "language": "python", 269 | "name": "python3" 270 | }, 271 | "language_info": { 272 | "codemirror_mode": { 273 | "name": "ipython", 274 | "version": 3 275 | }, 276 | "file_extension": ".py", 277 | "mimetype": "text/x-python", 278 | "name": "python", 279 | "nbconvert_exporter": "python", 280 | "pygments_lexer": "ipython3", 281 | "version": "3.5.2" 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 2 286 | } 287 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/cnn.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # 51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络 (Deep Learning and Neural Networks)
7 | # Github: https://github.com/Qinbf
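The loading loop below turns each question's comma-separated topic indices (keeping at most the first five) into a 1999-dimensional multi-hot target vector. A minimal standalone sketch of that encoding, using a label string taken from the printed data_topic rows:

import numpy as np

num_topics = 1999                    # total number of topic classes
raw_label = "1842,12"                # comma-separated class indices, as in data_topic_block_*.txt
indices = raw_label.split(',')[:5]   # keep at most the first 5 topics

label = np.zeros(num_topics, dtype=np.float32)
for idx in indices:
    label[int(idx)] = 1.0            # 1 at every labelled topic, 0 elsewhere
print(int(label.sum()))              # -> 2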
8 | 9 | # In[1]: 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | import os 14 | import time 15 | import numpy as np 16 | import pandas as pd 17 | import math 18 | from tqdm import tqdm 19 | from six.moves import xrange 20 | 21 | 22 | # In[2]: 23 | 24 | # Parameters 25 | # ================================================== 26 | 27 | # Data loading params 28 | # validation数据集占比 29 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 30 | # 数据集 31 | tf.flags.DEFINE_string("data_file", "./ieee_zhihu_cup/data_topic_block_0.txt", "Data source for the positive data.") 32 | 33 | # Model Hyperparameters 34 | # 词向量长度 35 | tf.flags.DEFINE_integer("embedding_dim", 256, "Dimensionality of character embedding (default: 256)") 36 | # 卷积核大小 37 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 38 | # 每一种卷积核个数 39 | tf.flags.DEFINE_integer("num_filters", 1024, "Number of filters per filter size (default: 1024)") 40 | # dropout参数 41 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 42 | # l2正则化参数 43 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0005, "L2 regularization lambda (default: 0.0005)") 44 | 45 | # Training parameters 46 | # 批次大小 47 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 48 | # 迭代周期 49 | tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)") 50 | # 多少step测试一次 51 | tf.flags.DEFINE_integer("evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)") 52 | # 多少step保存一次模型 53 | tf.flags.DEFINE_integer("checkpoint_every", 200, "Save model after this many steps (default: 200)") 54 | # 保存多少个模型 55 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 56 | 57 | # flags解析 58 | FLAGS = tf.flags.FLAGS 59 | FLAGS._parse_flags() 60 | 61 | # 打印所有参数 62 | print("\nParameters:") 63 | for attr, value in sorted(FLAGS.__flags.items()): 64 | print("{}={}".format(attr.upper(), value)) 65 | print("") 66 | 67 | 68 | # In[3]: 69 | 70 | y = [] 71 | x_text = [] 72 | 73 | # 读取训练数据和标签 74 | reader = pd.read_table(FLAGS.data_file,sep='\t',header=None) 75 | for i in tqdm(xrange(reader.shape[0])): 76 | # 按','切分标签 77 | temp = reader.iloc[i][1].split(',') 78 | # 如果分类数大于5,只取前5个分类 79 | if (len(temp)>5): 80 | temp = temp[0:5] 81 | # 设置标签的对应位置为1,其余位置为0 82 | label = np.zeros(1999) 83 | for temp_label in temp: 84 | label[int(temp_label)] = 1 85 | y.append(label) 86 | x_text.append(reader.iloc[i][0]) 87 | 88 | 89 | # In[4]: 90 | 91 | # 打印x_text和y的前5行 92 | print(x_text[0:5]) 93 | y = np.array(y, dtype = np.float32) 94 | print(y[0:5]) 95 | 96 | 97 | # In[5]: 98 | 99 | # Build vocabulary 100 | # 计算一段文本中最多的词汇数 101 | max_document_length = max([len(x.split(",")) for x in x_text]) 102 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length) 103 | 104 | x = np.array(list(vocab_processor.fit_transform(x_text))) 105 | print("x_shape:",x.shape) 106 | print("y_shape:",y.shape) 107 | 108 | # 保存字典 109 | vocab_processor.save("vocab_dict") 110 | 111 | # Split train/test set 112 | # 数据集切分为两部分,训练集和验证集 113 | dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 114 | x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:] 115 | y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:] 116 | 117 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) 118 | print("Train/Dev split: 
{:d}/{:d}".format(len(y_train), len(y_dev))) 119 | print("x:",x_train[0:5]) 120 | print("y:",y_train[0:5]) 121 | 122 | 123 | # In[6]: 124 | 125 | # 定义三个placeholder 126 | input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name="input_x") 127 | input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name="input_y") 128 | dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 129 | 130 | # sequence_length-最长词汇数 131 | sequence_length=x_train.shape[1] 132 | # num_classes-分类数 133 | num_classes=y_train.shape[1] 134 | # vocab_size-总词汇数 135 | vocab_size=len(vocab_processor.vocabulary_) 136 | # embedding_size-词向量长度 137 | embedding_size=FLAGS.embedding_dim 138 | # filter_sizes-卷积核尺寸3,4,5 139 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))) 140 | # num_filters-卷积核数量 141 | num_filters=FLAGS.num_filters 142 | 143 | Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="Weights") 144 | # shape:[None, sequence_length, embedding_size] 145 | embedded_chars = tf.nn.embedding_lookup(Weights, input_x) 146 | # 添加一个维度,shape:[None, sequence_length, embedding_size, 1] 147 | embedded_chars_expanded = tf.expand_dims(embedded_chars, -1) 148 | 149 | # Create a convolution + maxpool layer for each filter size 150 | pooled_outputs = [] 151 | for i, filter_size in enumerate(filter_sizes): 152 | with tf.name_scope("conv-maxpool-%s" % filter_size): 153 | # Convolution Layer 154 | filter_shape = [filter_size, embedding_size, 1, num_filters] 155 | W = tf.Variable( 156 | tf.truncated_normal(filter_shape, stddev=0.1), name="W") 157 | b = tf.Variable( 158 | tf.constant(0.1, shape=[num_filters]), name="b") 159 | conv = tf.nn.conv2d( 160 | embedded_chars_expanded, 161 | W, 162 | strides=[1, 1, 1, 1], 163 | padding="VALID", 164 | name="conv") 165 | # Apply nonlinearity 166 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 167 | # Maxpooling over the outputs 168 | pooled = tf.nn.max_pool( 169 | h, 170 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 171 | strides=[1, 1, 1, 1], 172 | padding='VALID', 173 | name="pool") 174 | pooled_outputs.append(pooled) 175 | 176 | # Combine all the pooled features 177 | num_filters_total = num_filters * len(filter_sizes) 178 | print("num_filters_total:", num_filters_total) 179 | h_pool = tf.concat(pooled_outputs, 3) 180 | h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total]) 181 | 182 | # Add dropout 183 | with tf.name_scope("dropout"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob) 184 | 185 | # Final (unnormalized) scores and predictions 186 | with tf.name_scope("output"): 187 | W = tf.get_variable( 188 | "W", 189 | shape=[num_filters_total, num_classes], 190 | initializer=tf.contrib.layers.xavier_initializer()) 191 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 192 | scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores") 193 | 194 | # 定义loss 195 | with tf.name_scope("loss"): 196 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y)) 197 | 198 | # 定义优化器 199 | with tf.name_scope("optimizer"): 200 | optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss) 201 | 202 | 203 | # In[7]: 204 | 205 | # 生成批次数据 206 | def batch_iter(data, batch_size, num_epochs, shuffle=False): 207 | """ 208 | Generates a batch iterator for a dataset. 
209 | """ 210 | data = np.array(data) 211 | data_size = len(data) 212 | # 每个epoch的num_batch 213 | num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 214 | print("num_batches_per_epoch:",num_batches_per_epoch) 215 | for epoch in range(num_epochs): 216 | # Shuffle the data at each epoch 217 | if shuffle: 218 | shuffle_indices = np.random.permutation(np.arange(data_size)) 219 | shuffled_data = data[shuffle_indices] 220 | else: 221 | shuffled_data = data 222 | for batch_num in range(num_batches_per_epoch): 223 | start_index = batch_num * batch_size 224 | end_index = min((batch_num + 1) * batch_size, data_size) 225 | yield shuffled_data[start_index:end_index] 226 | 227 | 228 | # In[ ]: 229 | 230 | # 知乎提供的评测方案 231 | def eval(predict_label_and_marked_label_list): 232 | """ 233 | :param predict_label_and_marked_label_list: 一个元组列表。例如 234 | [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]), 235 | ([3, 2, 1, 4, 7], [5, 7, 3]) 236 | ] 237 | 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6] 238 | 239 | marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为 240 | [0,0,0,1,1] (4,5命中) 241 | [1,0,0,0,1] (3,7命中) 242 | 243 | """ 244 | right_label_num = 0 #总命中标签数量 245 | right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量 246 | sample_num = 0 #总问题数量 247 | all_marked_label_num = 0 #总标签数量 248 | for predict_labels, marked_labels in predict_label_and_marked_label_list: 249 | sample_num += 1 250 | marked_label_set = set(marked_labels) 251 | all_marked_label_num += len(marked_label_set) 252 | for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels): 253 | if label in marked_label_set: #命中 254 | right_label_num += 1 255 | right_label_at_pos_num[pos] += 1 256 | 257 | precision = 0.0 258 | for pos, right_num in zip(range(0, 5), right_label_at_pos_num): 259 | precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2 260 | recall = float(right_label_num) / all_marked_label_num 261 | 262 | return 2*(precision * recall) / (precision + recall ) 263 | 264 | 265 | # In[ ]: 266 | 267 | # 定义saver,只保存最新的5个模型 268 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 269 | 270 | with tf.Session() as sess: 271 | predict_top_5 = tf.nn.top_k(scores, k=5) 272 | label_top_5 = tf.nn.top_k(input_y, k=5) 273 | sess.run(tf.global_variables_initializer()) 274 | i = 0 275 | # 生成数据 276 | batches = batch_iter( 277 | list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) 278 | for batch in batches: 279 | i = i + 1 280 | # 得到一个batch的数据 281 | x_batch, y_batch = zip(*batch) 282 | # 优化模型 283 | sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob}) 284 | 285 | # 每训练50次测试1次 286 | if (i % FLAGS.evaluate_every == 0): 287 | print ("Evaluation:step",i) 288 | predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch, 289 | input_y:y_batch, 290 | dropout_keep_prob:1.0}) 291 | print ("label:",label_5[1][:5]) 292 | print ("predict:",predict_5[1][:5]) 293 | print ("predict:",predict_5[0][:5]) 294 | print ("loss:",_loss) 295 | predict_label_and_marked_label_list = [] 296 | for predict,label in zip(predict_5[1],label_5[1]): 297 | predict_label_and_marked_label_list.append((list(predict),list(label))) 298 | score = eval(predict_label_and_marked_label_list) 299 | print("score:",score) 300 | 301 | # 每训练200次保存1次模型 302 | if (i % FLAGS.checkpoint_every == 0): 303 | path = saver.save(sess, "models/model", global_step=i) 304 | print("Saved model checkpoint to {}".format(path)) 
305 | 306 | 307 | # In[ ]: 308 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/data_handle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n", 8 | "优酷频道:http://i.youku.com/sdxxqbf
\n", 9 | "微信公众号:深度学习与神经网络
\n", 10 | "Github:https://github.com/Qinbf
\n", 11 | "\n", 12 | "question_train_set.txt: \n", 13 | " 第一列为 问题id; \n", 14 | " 第二列为 title 的字符编号序列; \n", 15 | " 第三列为 title 的词语编号序列; \n", 16 | " 第四列为描述的字符编号序列; \n", 17 | " 第五列为描述的词语标号序列。 \n", 18 | " \n", 19 | "question_topic_train_set.txt: \n", 20 | " 第一列 问题 id; \n", 21 | " 第二列 话题 id。 \n", 22 | "\n", 23 | "topic_info.txt: \n", 24 | " 第一列为话题 id \n", 25 | " 第二列为话题的父话题 id。话题之间是有向无环图结构,一个话题可能有 0 到多个父话题; \n", 26 | " 第三列为话题名字的字符编号序列; \n", 27 | " 第四列为话题名字的词语编号序列; \n", 28 | " 第五列为话题描述的字符编号序列; \n", 29 | " 第六列为话题描述的词语编号序列。 \n", 30 | "\n", 31 | "1.title通常来说包含的信息最重要。对于question_train_set.txt文件,为了简单起见,我们只取第三列,title的词语编号序列。 \n", 32 | "2.对于topic_info.txt,为了简单起见,我们不考虑2,3,4,5,6列。只是简单的提取话题id,然后转为0-1998的数字(一共有1999个话题) \n", 33 | "3.然后合并以上一些数据,得到最后处理后的数据。 " 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "from tqdm import tqdm # pip install tqdm\n", 46 | "from six.moves import xrange" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | " 0 1 \\\n", 61 | "0 6555699376639805223 c324,c39,c40,c155,c180,c180,c181,c17,c4,c1153,... \n", 62 | "1 2887834264226772863 c44,c110,c101,c286,c106,c150,c101,c892,c632,c1... \n", 63 | "2 -2687466858632038806 c15,c768,c769,c1363,c650,c1218,c2361,c11,c90,c... \n", 64 | "3 -5698296155734268 c473,c1528,c528,c428,c295,c15,c101,c188,c146,c... \n", 65 | "4 -6719100304248915192 c190,c147,c105,c219,c220,c101,c647,c219,c220,c... \n", 66 | "\n", 67 | " 2 \\\n", 68 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... \n", 69 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... \n", 70 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... \n", 71 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... \n", 72 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... \n", 73 | "\n", 74 | " 3 \\\n", 75 | "0 c335,c101,c611,c189,c97,c144,c147,c101,c15,c76... \n", 76 | "1 c1265,c518,c74,c131,c274,c57,c768,c769,c368,c3... \n", 77 | "2 c693,c100,c279,c99,c189,c532,c101,c189,c145,c1... \n", 78 | "3 NaN \n", 79 | "4 c644,c1212,c253,c199,c431,c452,c424,c207,c2,c1... \n", 80 | "\n", 81 | " 4 \n", 82 | "0 w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1... \n", 83 | "1 w12508,w1380,w72,w27045,w276,w111 \n", 84 | "2 w140340,w54,w48398,w54,w140341,w54,w12856,w54,... \n", 85 | "3 NaN \n", 86 | "4 w4821,w1301,w16003,w928,w1961,w2565,w50803,w11... 
\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "# 导入question_train_set\n", 92 | "reader = pd.read_table('./ieee_zhihu_cup/question_train_set.txt',sep='\\t',header=None)\n", 93 | "print(reader.iloc[0:5])" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | " 0 1\n", 108 | "0 6555699376639805223 7739004195693774975,3738968195649774859\n", 109 | "1 2887834264226772863 -3149765934180654494\n", 110 | "2 -2687466858632038806 -760432988437306018\n", 111 | "3 -5698296155734268 -6758942141122113907,3195914392210930723\n", 112 | "4 -6719100304248915192 3804601920633030746,4797226510592237555,435133...\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# 导入question_topic_eval_set\n", 118 | "topic_reader = pd.read_table('./ieee_zhihu_cup/question_topic_train_set.txt',sep='\\t',header=None)\n", 119 | "print(topic_reader.iloc[0:5])" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | " 0 \\\n", 134 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... \n", 135 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... \n", 136 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... \n", 137 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... \n", 138 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... \n", 139 | "\n", 140 | " 1 \n", 141 | "0 7739004195693774975,3738968195649774859 \n", 142 | "1 -3149765934180654494 \n", 143 | "2 -760432988437306018 \n", 144 | "3 -6758942141122113907,3195914392210930723 \n", 145 | "4 3804601920633030746,4797226510592237555,435133... \n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "# 合并title 的词语编号序列和话题 id\n", 151 | "data_topic = pd.concat([reader.ix[:,2], topic_reader.ix[:,1]], axis=1, ignore_index=True)\n", 152 | "print(data_topic.iloc[0:5])" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | " 0 1 \\\n", 167 | "0 738845194850773558 -5833678375673307423 \n", 168 | "1 3738968195649774859 2027693463582123305 \n", 169 | "2 4738849194894773882 1127459907694805235 \n", 170 | "3 7739004195693774975 2904932941037075699,1160326435131345730,725917... \n", 171 | "4 -7261194805221226386 -5833678375673307423 \n", 172 | "\n", 173 | " 2 3 4 \\\n", 174 | "0 c0,c1 w0 c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1... \n", 175 | "1 c39,c40 w24 c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49... \n", 176 | "2 c172,c31,c0,c1 w102 NaN \n", 177 | "3 c39,c40,c5,c173 w103 c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c... \n", 178 | "4 c36,c31,c45,c237 w148 c238,c239 \n", 179 | "\n", 180 | " 5 \n", 181 | "0 w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,... \n", 182 | "1 w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33... \n", 183 | "2 NaN \n", 184 | "3 w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107... 
\n", 185 | "4 w149,w150 \n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "# 导入topic_info\n", 191 | "label_reader = pd.read_table('./ieee_zhihu_cup/topic_info.txt',sep='\\t',header=None)\n", 192 | "print(label_reader.iloc[0:5])" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "3\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "# 把标签转为0-1998的编号\n", 212 | "labels = list(label_reader.iloc[:,0])\n", 213 | "my_labels = []\n", 214 | "for label in labels:\n", 215 | " my_labels.append(label)\n", 216 | " \n", 217 | "# 建立topic字典\n", 218 | "topic_dict = {}\n", 219 | "for i,label in enumerate(my_labels):\n", 220 | " topic_dict[label] = i\n", 221 | "\n", 222 | "print(topic_dict[7739004195693774975])" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 7, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "100%|██████████████████████████████████████████████████████████████████████| 2999967/2999967 [12:15<00:00, 4076.87it/s]\n" 237 | ] 238 | }, 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | " 0 1\n", 244 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... 3,1\n", 245 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... 769\n", 246 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... 342\n", 247 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... 1842,12\n", 248 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... 155,150,110,7,6\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "for i in tqdm(xrange(data_topic.shape[0])):\n", 254 | " new_label = ''\n", 255 | " # 根据“,”切分话题id\n", 256 | " temp_topic = data_topic.iloc[i][1].split(',')\n", 257 | " for topic in temp_topic:\n", 258 | " # 判断该label是否在label文件中,并得到该行\n", 259 | " label_num = topic_dict[int(topic)]\n", 260 | " new_label = new_label + str(label_num) + ','\n", 261 | " data_topic.iloc[i][1] = new_label[:-1]\n", 262 | "print(data_topic.iloc[:5])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 8, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "# 保存处理过后的文件\n", 274 | "data_topic.to_csv(\"./ieee_zhihu_cup/data_topic.txt\", header=None, index=None, sep='\\t')\n", 275 | "\n", 276 | "# 切分成10块保存\n", 277 | "for i in xrange(10):\n", 278 | " data_topic_filename = './ieee_zhihu_cup/data_topic_block_' + str(i) + '.txt'\n", 279 | " if (i+1)*300000 < data_topic.shape[0]:\n", 280 | " data_topic.iloc[i*300000:(i+1)*300000].to_csv(\n", 281 | " data_topic_filename, header=None, index=None, sep='\\t')\n", 282 | " else:\n", 283 | " data_topic.iloc[i*300000:data_topic.shape[0]].to_csv(\n", 284 | " data_topic_filename, header=None, index=None, sep='\\t')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "anaconda-cloud": {}, 299 | "kernelspec": { 300 | "display_name": "Python [default]", 301 | "language": "python", 302 | "name": "python3" 303 | }, 304 | "language_info": { 305 | "codemirror_mode": { 306 | "name": "ipython", 307 | "version": 3 308 | }, 309 | "file_extension": ".py", 310 | "mimetype": "text/x-python", 311 | "name": "python", 312 | 
"nbconvert_exporter": "python", 313 | "pygments_lexer": "ipython3", 314 | "version": "3.5.2" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 2 319 | } 320 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n", 8 | "优酷频道:http://i.youku.com/sdxxqbf
\n", 9 | "微信公众号:深度学习与神经网络
\n", 10 | "Github:https://github.com/Qinbf
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import tensorflow as tf\n", 22 | "import numpy as np\n", 23 | "import os\n", 24 | "import time\n", 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import math\n", 28 | "from tqdm import tqdm\n", 29 | "from six.moves import xrange" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "\n", 44 | "Parameters:\n", 45 | "BATCH_SIZE=64\n", 46 | "CHECKPOINT_EVERY=200\n", 47 | "DATA_FILE=./ieee_zhihu_cup/data_topic_block_0.txt\n", 48 | "DEV_SAMPLE_PERCENTAGE=0.1\n", 49 | "DROPOUT_KEEP_PROB=0.5\n", 50 | "EMBEDDING_DIM=256\n", 51 | "EVALUATE_EVERY=50\n", 52 | "FILTER_SIZES=3,4,5\n", 53 | "L2_REG_LAMBDA=0.0005\n", 54 | "NUM_CHECKPOINTS=5\n", 55 | "NUM_EPOCHS=10\n", 56 | "NUM_FILTERS=1024\n", 57 | "\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "# Parameters\n", 63 | "# ==================================================\n", 64 | "\n", 65 | "# Data loading params\n", 66 | "# validation数据集占比\n", 67 | "tf.flags.DEFINE_float(\"dev_sample_percentage\", .1, \"Percentage of the training data to use for validation\")\n", 68 | "# 数据集\n", 69 | "tf.flags.DEFINE_string(\"data_file\", \"./ieee_zhihu_cup/data_topic_block_0.txt\", \"Data source for the positive data.\")\n", 70 | "\n", 71 | "# Model Hyperparameters\n", 72 | "# 词向量长度\n", 73 | "tf.flags.DEFINE_integer(\"embedding_dim\", 256, \"Dimensionality of character embedding (default: 256)\")\n", 74 | "# 卷积核大小\n", 75 | "tf.flags.DEFINE_string(\"filter_sizes\", \"3,4,5\", \"Comma-separated filter sizes (default: '3,4,5')\")\n", 76 | "# 每一种卷积核个数\n", 77 | "tf.flags.DEFINE_integer(\"num_filters\", 1024, \"Number of filters per filter size (default: 1024)\")\n", 78 | "# dropout参数\n", 79 | "tf.flags.DEFINE_float(\"dropout_keep_prob\", 0.5, \"Dropout keep probability (default: 0.5)\")\n", 80 | "# l2正则化参数\n", 81 | "tf.flags.DEFINE_float(\"l2_reg_lambda\", 0.0005, \"L2 regularization lambda (default: 0.0005)\")\n", 82 | "\n", 83 | "# Training parameters\n", 84 | "# 批次大小\n", 85 | "tf.flags.DEFINE_integer(\"batch_size\", 64, \"Batch Size (default: 64)\")\n", 86 | "# 迭代周期\n", 87 | "tf.flags.DEFINE_integer(\"num_epochs\", 10, \"Number of training epochs (default: 10)\")\n", 88 | "# 多少step测试一次\n", 89 | "tf.flags.DEFINE_integer(\"evaluate_every\", 50, \"Evaluate model on dev set after this many steps (default: 50)\")\n", 90 | "# 多少step保存一次模型\n", 91 | "tf.flags.DEFINE_integer(\"checkpoint_every\", 200, \"Save model after this many steps (default: 200)\")\n", 92 | "# 保存多少个模型\n", 93 | "tf.flags.DEFINE_integer(\"num_checkpoints\", 5, \"Number of checkpoints to store (default: 5)\")\n", 94 | "\n", 95 | "# flags解析\n", 96 | "FLAGS = tf.flags.FLAGS\n", 97 | "FLAGS._parse_flags()\n", 98 | "\n", 99 | "# 打印所有参数\n", 100 | "print(\"\\nParameters:\")\n", 101 | "for attr, value in sorted(FLAGS.__flags.items()):\n", 102 | " print(\"{}={}\".format(attr.upper(), value))\n", 103 | "print(\"\")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stderr", 115 | "output_type": "stream", 116 | "text": [ 117 | "100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [01:15<00:00, 
3959.17it/s]\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "y = []\n", 123 | "x_text = []\n", 124 | "\n", 125 | "# 读取训练数据和标签\n", 126 | "reader = pd.read_table(FLAGS.data_file,sep='\\t',header=None)\n", 127 | "for i in tqdm(xrange(reader.shape[0])):\n", 128 | " # 按','切分标签\n", 129 | " temp = reader.iloc[i][1].split(',')\n", 130 | " # 如果分类数大于5,只取前5个分类\n", 131 | " if (len(temp)>5):\n", 132 | " temp = temp[0:5]\n", 133 | " # 设置标签的对应位置为1,其余位置为0\n", 134 | " label = np.zeros(1999)\n", 135 | " for temp_label in temp:\n", 136 | " label[int(temp_label)] = 1\n", 137 | " y.append(label)\n", 138 | " x_text.append(reader.iloc[i][0])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "['w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w31389,w6,w1019,w69288,w111,w3332,w109,w11,w25,w1110,w111', 'w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w875,w3352,w500,w21790,w12144,w111', 'w875,w15450,w42394,w15863,w6,w95421,w25,w803,w346,w6,w3763,w347,w88,w111', 'w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w140344,w111,w112,w49270,w2129,w6,w6978,w359,w10147,w111', 'w380,w54,w674,w133,w54,w134,w614,w54,w929,w307,w109,w110,w19045,w6,w5830,w111']\n", 153 | "[[ 0. 1. 0. ..., 0. 0. 0.]\n", 154 | " [ 0. 0. 0. ..., 0. 0. 0.]\n", 155 | " [ 0. 0. 0. ..., 0. 0. 0.]\n", 156 | " [ 0. 0. 0. ..., 0. 0. 0.]\n", 157 | " [ 0. 0. 0. ..., 0. 0. 0.]]\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# 打印x_text和y的前5行\n", 163 | "print(x_text[0:5])\n", 164 | "y = np.array(y, dtype = np.float32)\n", 165 | "print(y[0:5])" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 5, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "x_shape: (300000, 72)\n", 180 | "y_shape: (300000, 1999)\n", 181 | "Vocabulary Size: 131900\n", 182 | "Train/Dev split: 270000/30000\n", 183 | "x: [[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 4 16 17 13 0 0 0 0 0\n", 184 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 185 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", 186 | " [18 19 20 21 22 19 23 10 24 25 26 27 28 29 13 0 0 0 0 0 0 0 0 0\n", 187 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 188 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", 189 | " [25 30 31 32 10 33 16 34 35 10 36 37 38 13 0 0 0 0 0 0 0 0 0 0\n", 190 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 191 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", 192 | " [39 40 41 42 19 43 19 44 45 46 13 47 48 49 10 50 51 52 13 0 0 0 0 0\n", 193 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 194 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", 195 | " [53 19 54 55 19 56 57 19 58 59 15 45 60 10 61 13 0 0 0 0 0 0 0 0\n", 196 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 197 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n", 198 | "y: [[ 0. 1. 0. ..., 0. 0. 0.]\n", 199 | " [ 0. 0. 0. ..., 0. 0. 0.]\n", 200 | " [ 0. 0. 0. ..., 0. 0. 0.]\n", 201 | " [ 0. 0. 0. ..., 0. 0. 0.]\n", 202 | " [ 0. 0. 0. ..., 0. 0. 
0.]]\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "# Build vocabulary\n", 208 | "# 计算一段文本中最多的词汇数\n", 209 | "max_document_length = max([len(x.split(\",\")) for x in x_text])\n", 210 | "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)\n", 211 | "\n", 212 | "x = np.array(list(vocab_processor.fit_transform(x_text)))\n", 213 | "print(\"x_shape:\",x.shape)\n", 214 | "print(\"y_shape:\",y.shape)\n", 215 | "\n", 216 | "# 保存字典\n", 217 | "vocab_processor.save(\"vocab_dict\")\n", 218 | "\n", 219 | "# Split train/test set\n", 220 | "# 数据集切分为两部分,训练集和验证集\n", 221 | "dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))\n", 222 | "x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]\n", 223 | "y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]\n", 224 | "\n", 225 | "print(\"Vocabulary Size: {:d}\".format(len(vocab_processor.vocabulary_)))\n", 226 | "print(\"Train/Dev split: {:d}/{:d}\".format(len(y_train), len(y_dev)))\n", 227 | "print(\"x:\",x_train[0:5])\n", 228 | "print(\"y:\",y_train[0:5])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 6, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "num_filters_total: 3072\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "# 定义三个placeholder\n", 248 | "input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name=\"input_x\")\n", 249 | "input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name=\"input_y\")\n", 250 | "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n", 251 | "\n", 252 | "# sequence_length-最长词汇数\n", 253 | "sequence_length=x_train.shape[1]\n", 254 | "# num_classes-分类数\n", 255 | "num_classes=y_train.shape[1]\n", 256 | "# vocab_size-总词汇数\n", 257 | "vocab_size=len(vocab_processor.vocabulary_)\n", 258 | "# embedding_size-词向量长度\n", 259 | "embedding_size=FLAGS.embedding_dim\n", 260 | "# filter_sizes-卷积核尺寸3,4,5\n", 261 | "filter_sizes=list(map(int, FLAGS.filter_sizes.split(\",\")))\n", 262 | "# num_filters-卷积核数量\n", 263 | "num_filters=FLAGS.num_filters\n", 264 | " \n", 265 | "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n", 266 | "# shape:[None, sequence_length, embedding_size]\n", 267 | "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n", 268 | "# 添加一个维度,shape:[None, sequence_length, embedding_size, 1]\n", 269 | "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n", 270 | "\n", 271 | "# Create a convolution + maxpool layer for each filter size\n", 272 | "pooled_outputs = []\n", 273 | "for i, filter_size in enumerate(filter_sizes):\n", 274 | " with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n", 275 | " # Convolution Layer\n", 276 | " filter_shape = [filter_size, embedding_size, 1, num_filters]\n", 277 | " W = tf.Variable(\n", 278 | " tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n", 279 | " b = tf.Variable(\n", 280 | " tf.constant(0.1, shape=[num_filters]), name=\"b\")\n", 281 | " conv = tf.nn.conv2d(\n", 282 | " embedded_chars_expanded,\n", 283 | " W,\n", 284 | " strides=[1, 1, 1, 1],\n", 285 | " padding=\"VALID\",\n", 286 | " name=\"conv\")\n", 287 | " # Apply nonlinearity\n", 288 | " h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n", 289 | " # Maxpooling over the outputs\n", 290 | " pooled = tf.nn.max_pool(\n", 291 | " h,\n", 292 | " ksize=[1, sequence_length - filter_size + 1, 1, 1],\n", 
293 | " strides=[1, 1, 1, 1],\n", 294 | " padding='VALID',\n", 295 | " name=\"pool\")\n", 296 | " pooled_outputs.append(pooled)\n", 297 | "\n", 298 | "# Combine all the pooled features\n", 299 | "num_filters_total = num_filters * len(filter_sizes)\n", 300 | "print(\"num_filters_total:\", num_filters_total)\n", 301 | "h_pool = tf.concat(pooled_outputs, 3)\n", 302 | "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n", 303 | "\n", 304 | "# Add dropout\n", 305 | "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n", 306 | "\n", 307 | "# Final (unnormalized) scores and predictions\n", 308 | "with tf.name_scope(\"output\"):\n", 309 | " W = tf.get_variable(\n", 310 | " \"W\",\n", 311 | " shape=[num_filters_total, num_classes],\n", 312 | " initializer=tf.contrib.layers.xavier_initializer())\n", 313 | " b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n", 314 | " scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")\n", 315 | " \n", 316 | "# 定义loss\n", 317 | "with tf.name_scope(\"loss\"):\n", 318 | " loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y))\n", 319 | "\n", 320 | "# 定义优化器\n", 321 | "with tf.name_scope(\"optimizer\"):\n", 322 | " optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 7, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "# 生成批次数据\n", 334 | "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n", 335 | " \"\"\"\n", 336 | " Generates a batch iterator for a dataset.\n", 337 | " \"\"\"\n", 338 | " data = np.array(data)\n", 339 | " data_size = len(data)\n", 340 | " # 每个epoch的num_batch\n", 341 | " num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n", 342 | " print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n", 343 | " for epoch in range(num_epochs):\n", 344 | " # Shuffle the data at each epoch\n", 345 | " if shuffle:\n", 346 | " shuffle_indices = np.random.permutation(np.arange(data_size))\n", 347 | " shuffled_data = data[shuffle_indices]\n", 348 | " else:\n", 349 | " shuffled_data = data\n", 350 | " for batch_num in range(num_batches_per_epoch):\n", 351 | " start_index = batch_num * batch_size\n", 352 | " end_index = min((batch_num + 1) * batch_size, data_size)\n", 353 | " yield shuffled_data[start_index:end_index]" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "# 知乎提供的评测方案\n", 365 | "def eval(predict_label_and_marked_label_list):\n", 366 | " \"\"\"\n", 367 | " :param predict_label_and_marked_label_list: 一个元组列表。例如\n", 368 | " [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n", 369 | " ([3, 2, 1, 4, 7], [5, 7, 3])\n", 370 | " ]\n", 371 | " 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6]\n", 372 | " \n", 373 | " marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为\n", 374 | " [0,0,0,1,1] (4,5命中)\n", 375 | " [1,0,0,0,1] (3,7命中)\n", 376 | "\n", 377 | " \"\"\"\n", 378 | " right_label_num = 0 #总命中标签数量\n", 379 | " right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量\n", 380 | " sample_num = 0 #总问题数量\n", 381 | " all_marked_label_num = 0 #总标签数量\n", 382 | " for predict_labels, marked_labels in predict_label_and_marked_label_list:\n", 383 | " sample_num += 1\n", 384 | " marked_label_set = set(marked_labels)\n", 385 | " all_marked_label_num += len(marked_label_set)\n", 386 | " for pos, label in 
zip(range(0, min(len(predict_labels), 5)), predict_labels):\n", 387 | " if label in marked_label_set: #命中\n", 388 | " right_label_num += 1\n", 389 | " right_label_at_pos_num[pos] += 1\n", 390 | "\n", 391 | " precision = 0.0\n", 392 | " for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n", 393 | " precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2\n", 394 | " recall = float(right_label_num) / all_marked_label_num\n", 395 | "\n", 396 | " return 2*(precision * recall) / (precision + recall )" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": false 404 | }, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | "num_batches_per_epoch: 4219\n" 411 | ] 412 | } 413 | ], 414 | "source": [ 415 | "# 定义saver,只保存最新的5个模型\n", 416 | "saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)\n", 417 | "\n", 418 | "with tf.Session() as sess:\n", 419 | " predict_top_5 = tf.nn.top_k(scores, k=5)\n", 420 | " label_top_5 = tf.nn.top_k(input_y, k=5) \n", 421 | " sess.run(tf.global_variables_initializer())\n", 422 | " i = 0\n", 423 | " # 生成数据\n", 424 | " batches = batch_iter(\n", 425 | " list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)\n", 426 | " for batch in batches:\n", 427 | " i = i + 1\n", 428 | " # 得到一个batch的数据\n", 429 | " x_batch, y_batch = zip(*batch)\n", 430 | " # 优化模型\n", 431 | " sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob})\n", 432 | "\n", 433 | " # 每训练50次测试1次\n", 434 | " if (i % FLAGS.evaluate_every == 0):\n", 435 | " print (\"Evaluation:step\",i)\n", 436 | " predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch,\n", 437 | " input_y:y_batch,\n", 438 | " dropout_keep_prob:1.0})\n", 439 | " print (\"label:\",label_5[1][:5])\n", 440 | " print (\"predict:\",predict_5[1][:5])\n", 441 | " print (\"predict:\",predict_5[0][:5])\n", 442 | " print (\"loss:\",_loss)\n", 443 | " predict_label_and_marked_label_list = []\n", 444 | " for predict,label in zip(predict_5[1],label_5[1]):\n", 445 | " predict_label_and_marked_label_list.append((list(predict),list(label)))\n", 446 | " score = eval(predict_label_and_marked_label_list)\n", 447 | " print(\"score:\",score)\n", 448 | "\n", 449 | " # 每训练200次保存1次模型\n", 450 | " if (i % FLAGS.checkpoint_every == 0):\n", 451 | " path = saver.save(sess, \"models/model\", global_step=i)\n", 452 | " print(\"Saved model checkpoint to {}\".format(path))" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": { 459 | "collapsed": false 460 | }, 461 | "outputs": [], 462 | "source": [] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": { 468 | "collapsed": false 469 | }, 470 | "outputs": [], 471 | "source": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "collapsed": true 478 | }, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "anaconda-cloud": {}, 485 | "kernelspec": { 486 | "display_name": "Python [default]", 487 | "language": "python", 488 | "name": "python3" 489 | }, 490 | "language_info": { 491 | "codemirror_mode": { 492 | "name": "ipython", 493 | "version": 3 494 | }, 495 | "file_extension": ".py", 496 | "mimetype": "text/x-python", 497 | "name": "python", 498 | "nbconvert_exporter": 
"python", 499 | "pygments_lexer": "ipython3", 500 | "version": "3.5.2" 501 | } 502 | }, 503 | "nbformat": 4, 504 | "nbformat_minor": 2 505 | } 506 | -------------------------------------------------------------------------------- /Tensorflow基础使用与文本分类应用/程序/zhihu_eval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n", 8 | "优酷频道:http://i.youku.com/sdxxqbf
\n", 9 | "微信公众号:深度学习与神经网络
\n", 10 | "Github:https://github.com/Qinbf
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "#coding:utf-8\n", 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "from tqdm import tqdm\n", 25 | "import tensorflow as tf\n", 26 | "import pickle\n", 27 | "import math\n", 28 | "from six.moves import xrange" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | " 0 1 \\\n", 43 | "0 6215603645409872328 c924,c531,c102,c284,c188,c104,c98,c107,c11,c11... \n", 44 | "1 6649324930261961840 c346,c1549,c413,c294,c675,c504,c183,c74,c541,c... \n", 45 | "2 -4251899610700378615 c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10... \n", 46 | "3 6213817087034420233 c504,c157,c221,c221,c633,c468,c469,c1637,c1072... \n", 47 | "4 -8930652370334418373 c0,c310,c35,c122,c123,c11,c317,c91,c175,c476,c... \n", 48 | "\n", 49 | " 2 \\\n", 50 | "0 w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47... \n", 51 | "1 w40132,w1357,w1556,w1380,w2464,w33,w16791,w109... \n", 52 | "2 w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w... \n", 53 | "3 w5083,w12537,w10427,w29724,w6,w2566,w11,w18476... \n", 54 | "4 w33792,w21,w83,w6,w21542,w21,w140670,w25,w1110... \n", 55 | "\n", 56 | " 3 \\\n", 57 | "0 c1128,c529,c636,c572,c1321,c139,c540,c223,c510... \n", 58 | "1 NaN \n", 59 | "2 c149,c148,c148,c42,c185,c95,c95,c186,c186,c186... \n", 60 | "3 c15,c131,c39,c40,c85,c166,c969,c2456,c17,c636,... \n", 61 | "4 NaN \n", 62 | "\n", 63 | " 4 \n", 64 | "0 w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2... \n", 65 | "1 NaN \n", 66 | "2 NaN \n", 67 | "3 w2550,w24,w239,w98,w19456,w11,w108710,w3483,w2... 
\n", 68 | "4 NaN \n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "# 导入question_train_set\n", 74 | "reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\\t',header=None)\n", 75 | "print(reader.iloc[0:5])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "('max_document_length:', 76)\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "# 计算一段文本中最大词汇数\n", 95 | "x_text = reader.iloc[:,2]\n", 96 | "max_document_length = 0\n", 97 | "for i,line in enumerate(x_text):\n", 98 | " try:\n", 99 | " temp = line.split(',')\n", 100 | " max_document_length = max(max_document_length,len(temp))\n", 101 | " except:\n", 102 | " # 其中有一行数据为空\n", 103 | " pass\n", 104 | "# x_text[i] = \" \"\n", 105 | "\n", 106 | "print(\"max_document_length:\",max_document_length)\n", 107 | "\n", 108 | "# 载入字典\n", 109 | "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(\"vocab_dict\")" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "# 按','切分数据\n", 121 | "text = []\n", 122 | "for line in x_text:\n", 123 | " try:\n", 124 | " text.append(line.split(','))\n", 125 | " except:\n", 126 | " # 其中有一行数据为空\n", 127 | " text.append(' ')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stderr", 139 | "output_type": "stream", 140 | "text": [ 141 | "100%|██████████| 217360/217360 [00:05<00:00, 40820.07it/s]\n" 142 | ] 143 | }, 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "array([[ 4507, 2664, 423, 3387, 425, 10, 84669, 1744,\n", 148 | " 152, 13, 90, 152, 1556, 403, 17192, 10,\n", 149 | " 3686, 13, 0, 0, 0, 0, 0, 0,\n", 150 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 151 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 152 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 153 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 154 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 155 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 156 | " 0, 0, 0, 0],\n", 157 | " [ 18531, 861, 1538, 490, 16758, 197, 4225, 658,\n", 158 | " 18551, 10, 4100, 15, 1929, 52, 13, 0,\n", 159 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 160 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 161 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 162 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 163 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 164 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 165 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 166 | " 0, 0, 0, 0],\n", 167 | " [ 1207, 19, 810, 19, 126081, 19, 501, 2249,\n", 168 | " 85078, 35, 218, 308, 99, 105, 313, 13,\n", 169 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 170 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 171 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 172 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 173 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 174 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 175 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 176 | " 0, 0, 0, 0],\n", 177 | " [ 1040, 11856, 360, 23102, 10, 4100, 4, 432,\n", 178 | " 17, 1424, 0, 13, 0, 0, 0, 0,\n", 179 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 180 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 181 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 182 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 183 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 184 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 185 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 186 | " 0, 0, 0, 0],\n", 187 | " [ 3538, 137, 1628, 10, 8450, 137, 0, 16,\n", 188 | " 17, 13, 0, 0, 0, 0, 0, 0,\n", 189 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 190 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 191 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 
192 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 193 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 194 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 195 | " 0, 0, 0, 0, 0, 0, 0, 0,\n", 196 | " 0, 0, 0, 0]])" 197 | ] 198 | }, 199 | "execution_count": 5, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "# 把数据集变成编号的形式\n", 206 | "x = []\n", 207 | "for line in tqdm(text):\n", 208 | " line_len = len(line)\n", 209 | " text2num = []\n", 210 | " for i in xrange(max_document_length):\n", 211 | " if(i < line_len):\n", 212 | " try:\n", 213 | " text2num.append(vocab_processor.vocabulary_.get(line[i])) # 把词转为数字\n", 214 | " except:\n", 215 | " text2num.append(0) # 没有对应的词\n", 216 | " else:\n", 217 | " text2num.append(0) # 填充0\n", 218 | " x.append(text2num)\n", 219 | "x = np.array(x)\n", 220 | "x[:5]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 6, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n", 232 | " \"\"\"\n", 233 | " Generates a batch iterator for a dataset.\n", 234 | " \"\"\"\n", 235 | " data = np.array(data)\n", 236 | " data_size = len(data)\n", 237 | " # 每个epoch的num_batch\n", 238 | " num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n", 239 | " print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n", 240 | " for epoch in range(num_epochs):\n", 241 | " # Shuffle the data at each epoch\n", 242 | " if shuffle:\n", 243 | " shuffle_indices = np.random.permutation(np.arange(data_size))\n", 244 | " shuffled_data = data[shuffle_indices]\n", 245 | " else:\n", 246 | " shuffled_data = data\n", 247 | " for batch_num in range(num_batches_per_epoch):\n", 248 | " start_index = batch_num * batch_size\n", 249 | " end_index = min((batch_num + 1) * batch_size, data_size)\n", 250 | " yield shuffled_data[start_index:end_index]" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 7, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "def eval(predict_label_and_marked_label_list):\n", 262 | " \"\"\"\n", 263 | " :param predict_label_and_marked_label_list: 一个元组列表。例如\n", 264 | " [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n", 265 | " ([3, 2, 1, 4, 7], [5, 7, 3])\n", 266 | " ]\n", 267 | " 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6]\n", 268 | " \n", 269 | " marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为\n", 270 | " [0,0,0,1,1] (4,5命中)\n", 271 | " [1,0,0,0,1] (3,7命中)\n", 272 | "\n", 273 | " \"\"\"\n", 274 | " right_label_num = 0 #总命中标签数量\n", 275 | " right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量\n", 276 | " sample_num = 0 #总问题数量\n", 277 | " all_marked_label_num = 0 #总标签数量\n", 278 | " for predict_labels, marked_labels in predict_label_and_marked_label_list:\n", 279 | " sample_num += 1\n", 280 | " marked_label_set = set(marked_labels)\n", 281 | " all_marked_label_num += len(marked_label_set)\n", 282 | " for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):\n", 283 | " if label in marked_label_set: #命中\n", 284 | " right_label_num += 1\n", 285 | " right_label_at_pos_num[pos] += 1\n", 286 | "\n", 287 | " precision = 0.0\n", 288 | " for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n", 289 | " precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2\n", 290 | " recall = float(right_label_num) / all_marked_label_num\n", 291 | "\n", 292 | " return 2*(precision * recall) / (precision + recall )" 
293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 8, 298 | "metadata": { 299 | "collapsed": false 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "('num_filters_total:', 3072)\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "# 定义三个placeholder\n", 312 | "input_x = tf.placeholder(tf.int32, [None, x.shape[1]], name=\"input_x\")\n", 313 | "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n", 314 | "\n", 315 | "# sequence_length-最长词汇数\n", 316 | "sequence_length=x.shape[1]\n", 317 | "# num_classes-分类数\n", 318 | "num_classes=1999\n", 319 | "# vocab_size-总词汇数\n", 320 | "vocab_size=len(vocab_processor.vocabulary_)\n", 321 | "# embedding_size-词向量长度\n", 322 | "embedding_size=256\n", 323 | "# filter_sizes-卷积核尺寸3,4,5\n", 324 | "filter_sizes=list(map(int, [3,4,5]))\n", 325 | "# num_filters-卷积核数量\n", 326 | "num_filters=1024\n", 327 | "\n", 328 | "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n", 329 | "# [None, sequence_length, embedding_size]\n", 330 | "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n", 331 | "# 添加一个维度,[None, sequence_length, embedding_size, 1]\n", 332 | "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n", 333 | "# Create a convolution + maxpool layer for each filter size\n", 334 | "pooled_outputs = []\n", 335 | "for i, filter_size in enumerate(filter_sizes):\n", 336 | " with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n", 337 | " # Convolution Layer\n", 338 | " filter_shape = [filter_size, embedding_size, 1, num_filters]\n", 339 | " W = tf.Variable(\n", 340 | " tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n", 341 | " b = tf.Variable(\n", 342 | " tf.constant(0.1, shape=[num_filters]), name=\"b\")\n", 343 | " conv = tf.nn.conv2d(\n", 344 | " embedded_chars_expanded,\n", 345 | " W,\n", 346 | " strides=[1, 1, 1, 1],\n", 347 | " padding=\"VALID\",\n", 348 | " name=\"conv\")\n", 349 | " # Apply nonlinearity\n", 350 | " h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n", 351 | " # Maxpooling over the outputs\n", 352 | " pooled = tf.nn.max_pool(\n", 353 | " h,\n", 354 | " ksize=[1, sequence_length - filter_size + 1, 1, 1],\n", 355 | " strides=[1, 1, 1, 1],\n", 356 | " padding='VALID',\n", 357 | " name=\"pool\")\n", 358 | " pooled_outputs.append(pooled)\n", 359 | "\n", 360 | "# Combine all the pooled features\n", 361 | "num_filters_total = num_filters * len(filter_sizes)\n", 362 | "print(\"num_filters_total:\", num_filters_total)\n", 363 | "h_pool = tf.concat(pooled_outputs, 3)\n", 364 | "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n", 365 | "\n", 366 | "# Add dropout\n", 367 | "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n", 368 | "\n", 369 | "# Final (unnormalized) scores and predictions\n", 370 | "with tf.name_scope(\"output\"):\n", 371 | " W = tf.get_variable(\n", 372 | " \"W\",\n", 373 | " shape=[num_filters_total, num_classes],\n", 374 | " initializer=tf.contrib.layers.xavier_initializer())\n", 375 | " b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n", 376 | " scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 9, 382 | "metadata": { 383 | "collapsed": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "INFO:tensorflow:Restoring parameters from 
./models/model_-7200\n", 391 | "('num_batches_per_epoch:', 218)\n", 392 | "('Evaluation:step', 5)\n", 393 | "('Evaluation:step', 10)\n", 394 | "('Evaluation:step', 15)\n", 395 | "('Evaluation:step', 20)\n", 396 | "('Evaluation:step', 25)\n", 397 | "('Evaluation:step', 30)\n", 398 | "('Evaluation:step', 35)\n", 399 | "('Evaluation:step', 40)\n", 400 | "('Evaluation:step', 45)\n", 401 | "('Evaluation:step', 50)\n", 402 | "('Evaluation:step', 55)\n", 403 | "('Evaluation:step', 60)\n", 404 | "('Evaluation:step', 65)\n", 405 | "('Evaluation:step', 70)\n", 406 | "('Evaluation:step', 75)\n", 407 | "('Evaluation:step', 80)\n", 408 | "('Evaluation:step', 85)\n", 409 | "('Evaluation:step', 90)\n", 410 | "('Evaluation:step', 95)\n", 411 | "('Evaluation:step', 100)\n", 412 | "('Evaluation:step', 105)\n", 413 | "('Evaluation:step', 110)\n", 414 | "('Evaluation:step', 115)\n", 415 | "('Evaluation:step', 120)\n", 416 | "('Evaluation:step', 125)\n", 417 | "('Evaluation:step', 130)\n", 418 | "('Evaluation:step', 135)\n", 419 | "('Evaluation:step', 140)\n", 420 | "('Evaluation:step', 145)\n", 421 | "('Evaluation:step', 150)\n", 422 | "('Evaluation:step', 155)\n", 423 | "('Evaluation:step', 160)\n", 424 | "('Evaluation:step', 165)\n", 425 | "('Evaluation:step', 170)\n", 426 | "('Evaluation:step', 175)\n", 427 | "('Evaluation:step', 180)\n", 428 | "('Evaluation:step', 185)\n", 429 | "('Evaluation:step', 190)\n", 430 | "('Evaluation:step', 195)\n", 431 | "('Evaluation:step', 200)\n", 432 | "('Evaluation:step', 205)\n", 433 | "('Evaluation:step', 210)\n", 434 | "('Evaluation:step', 215)\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "# 选择模型\n", 440 | "checkpoint_file = \"./models/model-10000\"\n", 441 | " \n", 442 | "with tf.Session() as sess:\n", 443 | " predict_top_5 = tf.nn.top_k(scores, k=5)\n", 444 | " sess.run(tf.global_variables_initializer())\n", 445 | " i = 0\n", 446 | " saver = tf.train.Saver()\n", 447 | " saver.restore(sess, checkpoint_file)\n", 448 | "\n", 449 | " # Generate batches\n", 450 | " batches = batch_iter(list(x), 1000, 1)\n", 451 | " \n", 452 | " for x_batch in batches:\n", 453 | " i = i + 1\n", 454 | " predict_5 = sess.run(predict_top_5,feed_dict={input_x:x_batch,dropout_keep_prob:1.0})\n", 455 | " if i == 1:\n", 456 | " predict = predict_5[1]\n", 457 | " else:\n", 458 | " predict = np.concatenate((predict,predict_5[1]))\n", 459 | " if (i%5==0):\n", 460 | " print (\"Evaluation:step\",i)\n", 461 | "\n", 462 | " np.savetxt(\"predict.txt\",predict,fmt='%d')" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | } 474 | ], 475 | "metadata": { 476 | "anaconda-cloud": {}, 477 | "kernelspec": { 478 | "display_name": "Python [default]", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.5.2" 493 | } 494 | }, 495 | "nbformat": 4, 496 | "nbformat_minor": 2 497 | } 498 | --------------------------------------------------------------------------------
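Finally, a sketch of the submission step that zhihu_predict.ipynb performs before the cells shown earlier, under the assumption that each prediction is an index into the topic list built in data_handle.ipynb; the ids here are illustrative values from the printed topic_info rows:

my_labels = [738845194850773558, 3738968195649774859, 4738849194894773882]
predicted = [1, 0, 2]                 # top predicted class indices for one question
topic_line = ','.join(str(my_labels[p]) for p in predicted)
print(topic_line)                     # -> 3738968195649774859,738845194850773558,4738849194894773882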