├── Tensorflow基础使用与图像识别应用
│   ├── Tensorflow的基础使用与图像识别应用.pdf
│   └── 程序
│       ├── 1创建图,启动图.py
│       ├── 3Fetch_and_Feed.py
│       ├── 2变量.py
│       ├── 4MNIST分类.py
│       ├── 5下载google图像识别网络inception-v3.py
│       ├── 3Fetch_and_Feed.ipynb
│       ├── 1创建图,启动图.ipynb
│       ├── 2变量.ipynb
│       ├── 5下载google图像识别网络inception-v3.ipynb
│       ├── 6使用inception-v3做各种图像的识别.py
│       └── 4MNIST分类.ipynb
├── Tensorflow基础使用与文本分类应用
│   ├── Tensorflow的基础使用与文本分类应用.pdf
│   └── 程序
│       ├── MNIST分类.py
│       ├── zhihu_predict.py
│       ├── data_handle.py
│       ├── MNIST分类.ipynb
│       ├── zhihu_eval.py
│       ├── zhihu_predict.ipynb
│       ├── cnn.py
│       ├── data_handle.ipynb
│       ├── cnn.ipynb
│       └── zhihu_eval.ipynb
└── README.md
/Tensorflow基础使用与图像识别应用/Tensorflow的基础使用与图像识别应用.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Qinbf/Tensorflow/HEAD/Tensorflow基础使用与图像识别应用/Tensorflow的基础使用与图像识别应用.pdf
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/Tensorflow的基础使用与文本分类应用.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Qinbf/Tensorflow/HEAD/Tensorflow基础使用与文本分类应用/Tensorflow的基础使用与文本分类应用.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tensorflow
2 |
3 |
4 | ## Follow the WeChat official account
5 | 
6 |
7 |
8 | ## Get in touch
9 | My WeChat ID: **sdxxqbf**
10 | The WeChat QR code is below:
11 |
12 | 
13 |
14 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/1创建图,启动图.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 |
12 |
13 | # In[2]:
14 |
15 | # Create a constant op
16 | m1 = tf.constant([[3,3]])
17 | # Create another constant op
18 | m2 = tf.constant([[2],[3]])
19 | # Create a matrix-multiplication op, taking m1 and m2 as inputs
20 | product = tf.matmul(m1,m2)
21 | # Printing product here only shows the tensor's attributes; its value has not been computed yet
22 | print(product)
23 |
24 |
25 | # In[3]:
26 |
27 | # First way to define a session:
28 | # Define a session, which launches the default graph
29 | sess = tf.Session()
30 | # Call sess.run to execute the matmul op
31 | # run(product) triggers all 3 ops in the graph
32 | result = sess.run(product)
33 | print(result)
34 | sess.close()
35 |
36 |
37 | # In[4]:
38 |
39 | # Second way to define a session (closed automatically):
40 | with tf.Session() as sess:
41 |     # Call sess.run to execute the matmul op
42 |     # run(product) triggers all 3 ops in the graph
43 |     result = sess.run(product)
44 |     print(result)
45 |
46 |
47 | # In[ ]:
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
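
The script above relies on TensorFlow's implicit default graph. A minimal sketch of the same computation with an explicitly created graph (a hypothetical companion snippet, TF 1.x API; not part of the repo):

import tensorflow as tf

# Build the ops inside an explicitly created graph instead of the default one
graph = tf.Graph()
with graph.as_default():
    m1 = tf.constant([[3, 3]])
    m2 = tf.constant([[2], [3]])
    product = tf.matmul(m1, m2)

# Bind the session to that graph to evaluate the op
with tf.Session(graph=graph) as sess:
    print(sess.run(product))  # [[15]]
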
/Tensorflow基础使用与图像识别应用/程序/3Fetch_and_Feed.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 |
12 |
13 | # In[2]:
14 |
15 | # Fetch: evaluate multiple ops in a single session.run call
16 | # Define three constants
17 | input1 = tf.constant(3.0)
18 | input2 = tf.constant(2.0)
19 | input3 = tf.constant(5.0)
20 | # Define an addition op
21 | add = tf.add(input2,input3)
22 | # Define a multiplication op
23 | mul = tf.multiply(input1,add)
24 |
25 | with tf.Session() as sess:
26 |     # Run the multiplication op and the addition op together
27 |     result = sess.run([mul,add])
28 |     print(result)
29 |
30 |
31 | # In[4]:
32 |
33 | # Feed: define placeholders first, then supply data when needed
34 | # Create placeholders
35 | input1 = tf.placeholder(tf.float32)
36 | input2 = tf.placeholder(tf.float32)
37 | # Define a multiplication op
38 | output = tf.multiply(input1,input2)
39 |
40 | with tf.Session() as sess:
41 |     # Fed data is passed in as a dictionary
42 |     print(sess.run(output,feed_dict={input1:[8.],input2:[2.]}))
43 |
44 |
45 | # In[ ]:
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
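
feed_dict is not limited to placeholders: in TF 1.x most feedable tensors in the graph, constants included, can be overridden at run time. A minimal sketch (hypothetical, not part of the repo):

import tensorflow as tf

a = tf.constant(3.0)
b = tf.constant(2.0)
c = tf.multiply(a, b)

with tf.Session() as sess:
    print(sess.run(c))                      # 6.0, computed from the constants
    print(sess.run(c, feed_dict={a: 5.0}))  # 10.0, the fed value overrides a
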
/Tensorflow基础使用与图像识别应用/程序/2变量.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 |
12 |
13 | # In[3]:
14 |
15 | # Define a variable
16 | x = tf.Variable([1,2])
17 | # Define a constant
18 | a = tf.constant([3,3])
19 | # Add a subtraction op
20 | sub = tf.subtract(x,a)
21 | # Add an addition op
22 | add = tf.add(x,sub)
23 |
24 | # Initializer for all variables
25 | init = tf.global_variables_initializer()
26 |
27 | with tf.Session() as sess:
28 |     # Run the variable initializer
29 |     sess.run(init)
30 |     print(sess.run(sub))
31 |     print(sess.run(add))
32 |
33 |
34 | # In[4]:
35 |
36 | # Create a variable initialized to 0
37 | state = tf.Variable(0,name='counter')
38 | # Create an op that adds 1 to state
39 | new_value = tf.add(state,1)
40 | # Assignment op
41 | update = tf.assign(state,new_value)
42 | # Initializer for all variables
43 | init = tf.global_variables_initializer()
44 |
45 | with tf.Session() as sess:
46 |     # Run the variable initializer
47 |     sess.run(init)
48 |     print(sess.run(state))
49 |     for _ in range(5):
50 |         sess.run(update)
51 |         print(sess.run(state))
52 |
53 |
54 | # In[ ]:
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
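
The add-then-assign pair above can be collapsed into a single op with tf.assign_add; a minimal sketch (hypothetical, TF 1.x; not part of the repo):

import tensorflow as tf

state = tf.Variable(0, name='counter')
# assign_add increments the variable and returns the updated value
update = tf.assign_add(state, 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        print(sess.run(update))  # prints 1, 2, 3, 4, 5
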
/Tensorflow基础使用与图像识别应用/程序/4MNIST分类.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 | from tensorflow.examples.tutorials.mnist import input_data
12 |
13 |
14 | # In[2]:
15 |
16 | # Load the dataset
17 | mnist = input_data.read_data_sets("MNIST_data",one_hot=True)
18 |
19 | # 100 images per batch
20 | batch_size = 100
21 | # Compute the total number of batches
22 | n_batch = mnist.train.num_examples // batch_size
23 |
24 | # Define two placeholders
25 | x = tf.placeholder(tf.float32,[None,784])
26 | y = tf.placeholder(tf.float32,[None,10])
27 |
28 | # Build a simple network: 784 input neurons, 10 output neurons
29 | W = tf.Variable(tf.zeros([784,10]))
30 | b = tf.Variable(tf.zeros([10]))
31 | prediction = tf.nn.softmax(tf.matmul(x,W)+b)
32 |
33 | # Quadratic cost function
34 | # square computes the element-wise square
35 | # reduce_mean computes the mean
36 | loss = tf.reduce_mean(tf.square(y-prediction))
37 |
38 | # Minimize loss with gradient descent, learning rate 0.2
39 | train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
40 |
41 | # Variable initializer
42 | init = tf.global_variables_initializer()
43 |
44 | # Store the results in a list of booleans
45 | correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))  # argmax returns the index of the largest value in a 1-D tensor
46 | # Compute the accuracy
47 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))  # cast converts booleans to float32
48 |
49 | with tf.Session() as sess:
50 |     # Run the initializer
51 |     sess.run(init)
52 |     # Train for 21 epochs
53 |     for epoch in range(21):
54 |         # Each epoch iterates over n_batch batches of 100 images
55 |         for batch in range(n_batch):
56 |             # Fetch one batch of images and labels
57 |             batch_xs,batch_ys = mnist.train.next_batch(batch_size)
58 |             # Feed the batch into the model for training
59 |             sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})
60 |
61 |         # Compute the test accuracy
62 |         acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})
63 |         print("Iter " + str(epoch) + ",Testing Accuracy " + str(acc))
64 |
65 |
66 | # In[ ]:
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
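
The quadratic cost above pairs poorly with a softmax output; for classification, a cross-entropy loss usually converges faster and reaches higher accuracy. A minimal sketch of the swap (hypothetical, TF 1.x; only the loss changes, and it is fed the raw logits):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
logits = tf.matmul(x, W) + b  # unnormalized scores; no softmax here

# softmax_cross_entropy_with_logits applies the softmax internally
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
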
/Tensorflow基础使用与文本分类应用/程序/MNIST分类.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[2]:
10 |
11 | import tensorflow as tf
12 | from tensorflow.examples.tutorials.mnist import input_data
13 |
14 |
15 | # In[3]:
16 |
17 | # Load the dataset
18 | mnist = input_data.read_data_sets("MNIST_data",one_hot=True)
19 |
20 | # 100 images per batch
21 | batch_size = 100
22 | # Compute the total number of batches
23 | n_batch = mnist.train.num_examples // batch_size
24 |
25 | # Define two placeholders
26 | x = tf.placeholder(tf.float32,[None,784])
27 | y = tf.placeholder(tf.float32,[None,10])
28 |
29 | # Build a simple network: 784 input neurons, 10 output neurons
30 | W = tf.Variable(tf.zeros([784,10]))
31 | b = tf.Variable(tf.zeros([10]))
32 | prediction = tf.nn.softmax(tf.matmul(x,W)+b)
33 |
34 | # Quadratic cost function
35 | # square computes the element-wise square
36 | # reduce_mean computes the mean
37 | loss = tf.reduce_mean(tf.square(y-prediction))
38 |
39 | # Minimize loss with gradient descent, learning rate 0.2
40 | train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
41 |
42 | # Variable initializer
43 | init = tf.global_variables_initializer()
44 |
45 | # Store the results in a list of booleans
46 | correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))  # argmax returns the index of the largest value in a 1-D tensor
47 | # Compute the accuracy
48 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))  # cast converts booleans to float32
49 |
50 | with tf.Session() as sess:
51 |     # Run the initializer
52 |     sess.run(init)
53 |     # Train for 21 epochs
54 |     for epoch in range(21):
55 |         # Each epoch iterates over n_batch batches of 100 images
56 |         for batch in range(n_batch):
57 |             # Fetch one batch of images and labels
58 |             batch_xs,batch_ys = mnist.train.next_batch(batch_size)
59 |             # Feed the batch into the model for training
60 |             sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})
61 |
62 |         # Compute the test accuracy
63 |         acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})
64 |         print("Iter " + str(epoch) + ",Testing Accuracy " + str(acc))
65 |
66 |
67 | # In[ ]:
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
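
To see what the accuracy graph computes, here is the same argmax/equal/cast chain in NumPy on a toy batch of two samples (illustrative values only):

import numpy as np

labels = np.array([[0, 1, 0], [1, 0, 0]])             # one-hot targets
preds = np.array([[0.1, 0.7, 0.2], [0.2, 0.5, 0.3]])  # softmax outputs

correct = np.argmax(labels, 1) == np.argmax(preds, 1)  # [True, False]
print(correct.astype(np.float32).mean())               # 0.5
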
/Tensorflow基础使用与图像识别应用/程序/5下载google图像识别网络inception-v3.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 | import os
12 | import tarfile
13 | import requests
14 |
15 |
16 | # In[ ]:
17 |
18 | # Download URL for the Inception model
19 | inception_pretrain_model_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
20 |
21 | # The model is stored in the inception_model folder under the current directory
22 | inception_pretrain_model_dir = "inception_model"
23 | if not os.path.exists(inception_pretrain_model_dir):
24 |     os.makedirs(inception_pretrain_model_dir)
25 |
26 | # Get the file name and file path
27 | filename = inception_pretrain_model_url.split('/')[-1]
28 | filepath = os.path.join(inception_pretrain_model_dir, filename)
29 |
30 | # Download the model
31 | if not os.path.exists(filepath):
32 |     print("download: ", filename)
33 |     r = requests.get(inception_pretrain_model_url, stream=True)
34 |     with open(filepath, 'wb') as f:
35 |         for chunk in r.iter_content(chunk_size=1024):
36 |             if chunk:
37 |                 f.write(chunk)
38 | print("finish: ", filename)
39 |
40 | # Extract the archive
41 | tarfile.open(filepath, 'r:gz').extractall(inception_pretrain_model_dir)
42 |
43 | # Folder for the graph-structure log
44 | log_dir = 'inception_log'
45 | if not os.path.exists(log_dir):
46 |     os.makedirs(log_dir)
47 |
48 | # classify_image_graph_def.pb is the model pretrained by Google
49 | inception_graph_def_file = os.path.join(inception_pretrain_model_dir, 'classify_image_graph_def.pb')
50 | with tf.Session() as sess:
51 |     # Create a graph to hold Google's pretrained model
52 |     with tf.gfile.FastGFile(inception_graph_def_file, 'rb') as f:
53 |         graph_def = tf.GraphDef()
54 |         graph_def.ParseFromString(f.read())
55 |         tf.import_graph_def(graph_def, name='')
56 |     # Save the graph structure
57 |     writer = tf.summary.FileWriter(log_dir, sess.graph)
58 |     writer.close()
59 |
60 |
61 | # In[ ]:
62 |
63 |
64 |
65 |
66 | # In[ ]:
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
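
Once the graph is imported, its node names can be listed directly; that is how tensor names such as 'softmax:0', used in the next script, can be discovered. A minimal sketch (hypothetical, TF 1.x; not part of the repo):

import tensorflow as tf

with tf.Session() as sess:
    with tf.gfile.FastGFile('inception_model/classify_image_graph_def.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')
    # Print the first 10 operation names of the imported graph
    for op in sess.graph.get_operations()[:10]:
        print(op.name)
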
/Tensorflow基础使用与文本分类应用/程序/zhihu_predict.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[1]:
10 |
11 | import pandas as pd
12 | from tqdm import tqdm
13 | import re
14 | import numpy as np
15 | from six.moves import xrange
16 |
17 |
18 | # In[2]:
19 |
20 | topic_info = pd.read_table("./ieee_zhihu_cup/topic_info.txt",sep='\t',header=None)
21 | print(topic_info.iloc[0:5])
22 |
23 |
24 | # In[3]:
25 |
26 | # Topic dictionary: row index -> original topic id
27 | topic_dict = {}
28 | for i in xrange(topic_info.shape[0]):
29 |     topic_dict[i] = topic_info.iloc[i][0]
30 |
31 |
32 | # In[4]:
33 |
34 | predict = open('predict.txt', "r")
35 | examples = predict.readlines()
36 | text = np.array([line.split(" ") for line in examples])
37 |
38 |
39 | # In[5]:
40 |
41 | label = []
42 | for line in tqdm(text):
43 |     num2label = []
44 |     for i in xrange(5):
45 |         num2label.append(topic_dict[int(line[i])])  # map the 0-1998 index back to the original topic id
46 |     label.append(num2label)
47 | label = np.array(label)
48 |
49 |
50 | # In[6]:
51 |
52 | np.savetxt("temp.txt",label,fmt='%d')
53 |
54 |
55 | # In[7]:
56 |
57 | def clean_str(string):
58 |     string = re.sub(r" ", ",", string)
59 |     return string
60 |
61 | file1 = open('temp.txt', "r")
62 | examples = file1.readlines()
63 | examples = [clean_str(line) for line in examples]
64 | file1.close()
65 |
66 | file1 = open('temp.txt', "w")
67 | file1.writelines(examples)
68 | file1.close()
69 |
70 |
71 | # In[8]:
72 |
73 | # Load the predict file
74 | predict_file = 'temp.txt'
75 | predict_reader = pd.read_table(predict_file,sep=' ',header=None)
76 | print(predict_reader.iloc[0:5])
77 |
78 |
79 | # In[9]:
80 |
81 | # Load question_eval_set
82 | eval_reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\t',header=None)
83 | print(eval_reader.iloc[0:3])
84 |
85 |
86 | # In[10]:
87 |
88 | final_predict = pd.concat([eval_reader.iloc[:,0],predict_reader],axis=1)
89 | print(final_predict.iloc[0:5])
90 |
91 |
92 | # In[11]:
93 |
94 | final_predict.to_csv('temp.txt', header=None, index=None, sep=',')
95 |
96 | final_file = open('temp.txt', "r")
97 | final_examples = final_file.readlines()
98 | final_examples = [re.sub(r'"',"",line) for line in final_examples]
99 | final_file.close()
100 |
101 | final_file = open('final_predict.csv', "w")
102 | final_file.writelines(final_examples)
103 | final_file.close()
104 |
105 |
--------------------------------------------------------------------------------
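
The write-then-rewrite pass through clean_str could likely be avoided altogether: np.savetxt accepts a delimiter argument, so the labels can be written comma-separated in one step (a sketch, assuming label is the (N, 5) array built above):

import numpy as np

np.savetxt("temp.txt", label, fmt='%d', delimiter=',')
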
/Tensorflow基础使用与图像识别应用/程序/3Fetch_and_Feed.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "[21.0, 7.0]\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "#Fetch:可以在session中同时计算多个op\n",
45 | "#定义三个常量\n",
46 | "input1 = tf.constant(3.0)\n",
47 | "input2 = tf.constant(2.0)\n",
48 | "input3 = tf.constant(5.0)\n",
49 | "#定义一个加法op\n",
50 | "add = tf.add(input2,input3)\n",
51 | "#定义一个乘法op\n",
52 | "mul = tf.multiply(input1,add)\n",
53 | "\n",
54 | "with tf.Session() as sess:\n",
55 | " #同时执行乘法op和加法op\n",
56 | " result = sess.run([mul,add])\n",
57 | " print(result)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "[ 16.]\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "#Feed:先定义占位符,等需要的时候再传入数据\n",
77 | "#创建占位符\n",
78 | "input1 = tf.placeholder(tf.float32)\n",
79 | "input2 = tf.placeholder(tf.float32)\n",
80 | "#定义乘法op\n",
81 | "output = tf.multiply(input1,input2)\n",
82 | "\n",
83 | "with tf.Session() as sess:\n",
84 | " #feed的数据以字典的形式传入\n",
85 | " print(sess.run(output,feed_dict={input1:[8.],input2:[2.]}))"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": true
93 | },
94 | "outputs": [],
95 | "source": []
96 | }
97 | ],
98 | "metadata": {
99 | "anaconda-cloud": {},
100 | "kernelspec": {
101 | "display_name": "Python [default]",
102 | "language": "python",
103 | "name": "python3"
104 | },
105 | "language_info": {
106 | "codemirror_mode": {
107 | "name": "ipython",
108 | "version": 3
109 | },
110 | "file_extension": ".py",
111 | "mimetype": "text/x-python",
112 | "name": "python",
113 | "nbconvert_exporter": "python",
114 | "pygments_lexer": "ipython3",
115 | "version": "3.5.2"
116 | }
117 | },
118 | "nbformat": 4,
119 | "nbformat_minor": 1
120 | }
121 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/1创建图,启动图.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Tensor(\"MatMul:0\", shape=(1, 1), dtype=int32)\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "#创建一个常量op\n",
45 | "m1 = tf.constant([[3,3]])\n",
46 | "#创建一个常量op\n",
47 | "m2 = tf.constant([[2],[3]])\n",
48 | "#创建一个矩阵乘法op,把m1和m2传入\n",
49 | "product = tf.matmul(m1,m2)\n",
50 | "#这个时候打印product,只能看到product的属性,不能计算它的值\n",
51 | "print(product)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "metadata": {
58 | "collapsed": false
59 | },
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "[[15]]\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#第一种定义会话的方式:\n",
71 | "#定义一个会话,启动默认图\n",
72 | "sess = tf.Session()\n",
73 | "#调用sess的run方法来执行矩阵乘法op\n",
74 | "#run(product)触发了图中3个op\n",
75 | "result = sess.run(product)\n",
76 | "print(result)\n",
77 | "sess.close()"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "[[15]]\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "#第二种定义会话的方式:\n",
97 | "with tf.Session() as sess:\n",
98 | " #调用sess的run方法来执行矩阵乘法op\n",
99 | " #run(product)触发了图中3个op\n",
100 | " result = sess.run(product)\n",
101 | " print(result)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {
108 | "collapsed": true
109 | },
110 | "outputs": [],
111 | "source": []
112 | }
113 | ],
114 | "metadata": {
115 | "anaconda-cloud": {},
116 | "kernelspec": {
117 | "display_name": "Python [default]",
118 | "language": "python",
119 | "name": "python3"
120 | },
121 | "language_info": {
122 | "codemirror_mode": {
123 | "name": "ipython",
124 | "version": 3
125 | },
126 | "file_extension": ".py",
127 | "mimetype": "text/x-python",
128 | "name": "python",
129 | "nbconvert_exporter": "python",
130 | "pygments_lexer": "ipython3",
131 | "version": "3.5.2"
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 1
136 | }
137 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/程序/data_handle.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 | #
9 | # question_train_set.txt:
10 | # Column 1: question id;
11 | # Column 2: character-id sequence of the title;
12 | # Column 3: word-id sequence of the title;
13 | # Column 4: character-id sequence of the description;
14 | # Column 5: word-id sequence of the description.
15 | #
16 | # question_topic_train_set.txt:
17 | # Column 1: question id;
18 | # Column 2: topic ids.
19 | #
20 | # topic_info.txt:
21 | # Column 1: topic id;
22 | # Column 2: parent-topic ids; topics form a directed acyclic graph, and a topic may have 0 or more parents;
23 | # Column 3: character-id sequence of the topic name;
24 | # Column 4: word-id sequence of the topic name;
25 | # Column 5: character-id sequence of the topic description;
26 | # Column 6: word-id sequence of the topic description.
27 | #
28 | # 1. The title usually carries the most important information, so from question_train_set.txt we keep only column 3, the word-id sequence of the title.
29 | # 2. From topic_info.txt we ignore columns 2-6 for simplicity and just extract the topic ids, mapping them to the integers 0-1998 (there are 1999 topics in total).
30 | # 3. Finally, the pieces above are merged into the processed dataset.
31 |
32 | # In[1]:
33 |
34 | import pandas as pd
35 | from tqdm import tqdm # pip install tqdm
36 | from six.moves import xrange
37 |
38 |
39 | # In[2]:
40 |
41 | # Load question_train_set
42 | reader = pd.read_table('./ieee_zhihu_cup/question_train_set.txt',sep='\t',header=None)
43 | print(reader.iloc[0:5])
44 |
45 |
46 | # In[3]:
47 |
48 | # Load question_topic_train_set
49 | topic_reader = pd.read_table('./ieee_zhihu_cup/question_topic_train_set.txt',sep='\t',header=None)
50 | print(topic_reader.iloc[0:5])
51 |
52 |
53 | # In[4]:
54 |
55 | # Merge the title word-id sequence with the topic ids
56 | data_topic = pd.concat([reader.iloc[:,2], topic_reader.iloc[:,1]], axis=1, ignore_index=True)
57 | print(data_topic.iloc[0:5])
58 |
59 |
60 | # In[5]:
61 |
62 | # Load topic_info
63 | label_reader = pd.read_table('./ieee_zhihu_cup/topic_info.txt',sep='\t',header=None)
64 | print(label_reader.iloc[0:5])
65 |
66 |
67 | # In[6]:
68 |
69 | # Map the topic ids to indices 0-1998
70 | labels = list(label_reader.iloc[:,0])
71 | my_labels = []
72 | for label in labels:
73 |     my_labels.append(label)
74 |
75 | # Build the topic dictionary: topic id -> index
76 | topic_dict = {}
77 | for i,label in enumerate(my_labels):
78 |     topic_dict[label] = i
79 |
80 | print(topic_dict[7739004195693774975])
81 |
82 |
83 | # In[7]:
84 |
85 | for i in tqdm(xrange(data_topic.shape[0])):
86 |     new_label = ''
87 |     # Split the topic ids on ","
88 |     temp_topic = data_topic.iloc[i][1].split(',')
89 |     for topic in temp_topic:
90 |         # Look up this topic id in the dictionary built from topic_info
91 |         label_num = topic_dict[int(topic)]
92 |         new_label = new_label + str(label_num) + ','
93 |     data_topic.iloc[i][1] = new_label[:-1]
94 | print(data_topic.iloc[:5])
95 |
96 |
97 | # In[8]:
98 |
99 | # Save the processed file
100 | data_topic.to_csv("./ieee_zhihu_cup/data_topic.txt", header=None, index=None, sep='\t')
101 |
102 | # Save in 10 blocks of 300000 rows each
103 | for i in xrange(10):
104 |     data_topic_filename = './ieee_zhihu_cup/data_topic_block_' + str(i) + '.txt'
105 |     if (i+1)*300000 < data_topic.shape[0]:
106 |         data_topic.iloc[i*300000:(i+1)*300000].to_csv(
107 |             data_topic_filename, header=None, index=None, sep='\t')
108 |     else:
109 |         data_topic.iloc[i*300000:data_topic.shape[0]].to_csv(
110 |             data_topic_filename, header=None, index=None, sep='\t')
111 |
112 |
113 | # In[ ]:
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
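
data_handle.py encodes each topic id as its row index 0-1998, and zhihu_predict.py later inverts that mapping; a minimal round-trip sketch of the two dictionaries (topic ids taken from the topic_info sample printed in zhihu_predict.ipynb):

# Topic ids as they appear in column 0 of topic_info.txt
topic_ids = [738845194850773558, 3738968195649774859, 4738849194894773882]

# Encoding (data_handle.py): topic id -> index
topic_dict = {label: i for i, label in enumerate(topic_ids)}
# Decoding (zhihu_predict.py): index -> topic id
inverse_dict = {i: label for i, label in enumerate(topic_ids)}

assert inverse_dict[topic_dict[3738968195649774859]] == 3738968195649774859
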
/Tensorflow基础使用与图像识别应用/程序/2变量.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "[-2 -1]\n",
40 | "[-1 1]\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "#定义一个变量\n",
46 | "x = tf.Variable([1,2])\n",
47 | "#定义一个常量\n",
48 | "a = tf.constant([3,3])\n",
49 | "#增加一个减法op\n",
50 | "sub = tf.subtract(x,a)\n",
51 | "#增加一个加法op\n",
52 | "add = tf.add(x,sub)\n",
53 | "\n",
54 | "#所有变量初始化\n",
55 | "init = tf.global_variables_initializer()\n",
56 | "\n",
57 | "with tf.Session() as sess:\n",
58 | " #执行变量初始化\n",
59 | " sess.run(init)\n",
60 | " print(sess.run(sub))\n",
61 | " print(sess.run(add))"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "0\n",
76 | "1\n",
77 | "2\n",
78 | "3\n",
79 | "4\n",
80 | "5\n"
81 | ]
82 | }
83 | ],
84 | "source": [
85 | "#创建一个变量初始化为0\n",
86 | "state = tf.Variable(0,name='counter')\n",
87 | "#创建一个op,作用是使state加1\n",
88 | "new_value = tf.add(state,1)\n",
89 | "#赋值op\n",
90 | "update = tf.assign(state,new_value)\n",
91 | "#所有变量初始化\n",
92 | "init = tf.global_variables_initializer()\n",
93 | "\n",
94 | "with tf.Session() as sess:\n",
95 | " #执行变量初始化\n",
96 | " sess.run(init)\n",
97 | " print(sess.run(state))\n",
98 | " for _ in range(5):\n",
99 | " sess.run(update)\n",
100 | " print(sess.run(state))"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": []
111 | }
112 | ],
113 | "metadata": {
114 | "anaconda-cloud": {},
115 | "kernelspec": {
116 | "display_name": "Python [default]",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.5.2"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 1
135 | }
136 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/5下载google图像识别网络inception-v3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf\n",
26 | "import os\n",
27 | "import tarfile\n",
28 | "import requests"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "finish: inception-2015-12-05.tgz\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "#inception模型下载地址\n",
48 | "inception_pretrain_model_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'\n",
49 | "\n",
50 | "#模型存放地址,存放在当前目录下inception_model文件夹下\n",
51 | "inception_pretrain_model_dir = \"inception_model\"\n",
52 | "if not os.path.exists(inception_pretrain_model_dir):\n",
53 | " os.makedirs(inception_pretrain_model_dir)\n",
54 | " \n",
55 | "#获取文件名,以及文件路径\n",
56 | "filename = inception_pretrain_model_url.split('/')[-1]\n",
57 | "filepath = os.path.join(inception_pretrain_model_dir, filename)\n",
58 | "\n",
59 | "#下载模型\n",
60 | "if not os.path.exists(filepath):\n",
61 | " print(\"download: \", filename)\n",
62 | " r = requests.get(inception_pretrain_model_url, stream=True)\n",
63 | " with open(filepath, 'wb') as f:\n",
64 | " for chunk in r.iter_content(chunk_size=1024):\n",
65 | " if chunk:\n",
66 | " f.write(chunk)\n",
67 | "print(\"finish: \", filename)\n",
68 | "\n",
69 | "#解压文件\n",
70 | "tarfile.open(filepath, 'r:gz').extractall(inception_pretrain_model_dir)\n",
71 | " \n",
72 | "#模型结构存放文件\n",
73 | "log_dir = 'inception_log'\n",
74 | "if not os.path.exists(log_dir):\n",
75 | " os.makedirs(log_dir)\n",
76 | "\n",
77 | "#classify_image_graph_def.pb为google训练好的模型\n",
78 | "inception_graph_def_file = os.path.join(inception_pretrain_model_dir, 'classify_image_graph_def.pb')\n",
79 | "with tf.Session() as sess:\n",
80 | " #创建一个图来存放google训练好的模型\n",
81 | " with tf.gfile.FastGFile(inception_graph_def_file, 'rb') as f:\n",
82 | " graph_def = tf.GraphDef()\n",
83 | " graph_def.ParseFromString(f.read())\n",
84 | " tf.import_graph_def(graph_def, name='')\n",
85 | " #保存图的结构\n",
86 | " writer = tf.summary.FileWriter(log_dir, sess.graph)\n",
87 | " writer.close()"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": true
95 | },
96 | "outputs": [],
97 | "source": []
98 | }
99 | ],
100 | "metadata": {
101 | "anaconda-cloud": {},
102 | "kernelspec": {
103 | "display_name": "Python [default]",
104 | "language": "python",
105 | "name": "python3"
106 | },
107 | "language_info": {
108 | "codemirror_mode": {
109 | "name": "ipython",
110 | "version": 3
111 | },
112 | "file_extension": ".py",
113 | "mimetype": "text/x-python",
114 | "name": "python",
115 | "nbconvert_exporter": "python",
116 | "pygments_lexer": "ipython3",
117 | "version": "3.5.2"
118 | }
119 | },
120 | "nbformat": 4,
121 | "nbformat_minor": 1
122 | }
123 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/6使用inception-v3做各种图像的识别.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[4]:
9 |
10 | import tensorflow as tf
11 | import os
12 | import numpy as np
13 | import re
14 | from PIL import Image
15 | import matplotlib.pyplot as plt
16 |
17 |
18 | # In[5]:
19 |
20 | class NodeLookup(object):
21 |     def __init__(self):
22 |         label_lookup_path = 'inception_model/imagenet_2012_challenge_label_map_proto.pbtxt'
23 |         uid_lookup_path = 'inception_model/imagenet_synset_to_human_label_map.txt'
24 |         self.node_lookup = self.load(label_lookup_path, uid_lookup_path)
25 |
26 |     def load(self, label_lookup_path, uid_lookup_path):
27 |         # Load the file mapping uid strings n******** to human-readable class names
28 |         proto_as_ascii_lines = tf.gfile.GFile(uid_lookup_path).readlines()
29 |         uid_to_human = {}
30 |         # Match 0 or more of n/digits, then 0 or more of spaces, non-whitespace and commas
31 |         p = re.compile(r'[n\d]*[ \S,]*')
32 |         for line in proto_as_ascii_lines:
33 |             parsed_items = p.findall(line)
34 |             # Get the uid string n********
35 |             uid = parsed_items[0]
36 |             # Get the class name
37 |             human_string = parsed_items[2]
38 |             # Store the uid string -> class name mapping
39 |             uid_to_human[uid] = human_string
40 |
41 |         # Load the file mapping uid strings n******** to class ids 1-1000
42 |         proto_as_ascii = tf.gfile.GFile(label_lookup_path).readlines()
43 |         node_id_to_uid = {}
44 |         for line in proto_as_ascii:
45 |             if line.startswith('  target_class:'):
46 |                 # Get the class id (1-1000)
47 |                 target_class = int(line.split(': ')[1])
48 |             if line.startswith('  target_class_string:'):
49 |                 # Get the uid string n********
50 |                 target_class_string = line.split(': ')[1]
51 |                 # Store the class id -> uid string mapping
52 |                 node_id_to_uid[target_class] = target_class_string[1:-2]
53 |
54 |         # Build the mapping from class ids 1-1000 to class names
55 |         node_id_to_name = {}
56 |         for key, val in node_id_to_uid.items():
57 |             # Get the class name
58 |             name = uid_to_human[val]
59 |             # Map the class id to the class name
60 |             node_id_to_name[key] = name
61 |         return node_id_to_name
62 |
63 |     # Given a class id 1-1000, return the class name
64 |     def id_to_string(self, node_id):
65 |         if node_id not in self.node_lookup:
66 |             return ''
67 |         return self.node_lookup[node_id]
68 |
69 |
70 | # Create a graph to hold Google's pretrained model
71 | with tf.gfile.FastGFile('inception_model/classify_image_graph_def.pb', 'rb') as f:
72 |     graph_def = tf.GraphDef()
73 |     graph_def.ParseFromString(f.read())
74 |     tf.import_graph_def(graph_def, name='')
75 |
76 |
77 | with tf.Session() as sess:
78 |     softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
79 |     # Walk the directory
80 |     for root,dirs,files in os.walk('images/'):
81 |         for file in files:
82 |             # Load the image
83 |             image_data = tf.gfile.FastGFile(os.path.join(root,file), 'rb').read()
84 |             predictions = sess.run(softmax_tensor,{'DecodeJpeg/contents:0': image_data})  # the image must be in jpg format
85 |             predictions = np.squeeze(predictions)  # flatten the result to 1-D
86 |
87 |             # Print the image path and name
88 |             image_path = os.path.join(root,file)
89 |             print(image_path)
90 |             # Display the image
91 |             img=Image.open(image_path)
92 |             plt.imshow(img)
93 |             plt.axis('off')
94 |             plt.show()
95 |
96 |             # Sort to get the top-5 predictions
97 |             top_k = predictions.argsort()[-5:][::-1]
98 |             node_lookup = NodeLookup()
99 |             for node_id in top_k:
100 |                 # Get the class name
101 |                 human_string = node_lookup.id_to_string(node_id)
102 |                 # Get the confidence score for this class
103 |                 score = predictions[node_id]
104 |                 print('%s (score = %.5f)' % (human_string, score))
105 |             print()
106 |
107 |
108 | # In[ ]:
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
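
The top-5 extraction via argsort is the one subtle line above; the same pattern on a toy array (illustrative values only):

import numpy as np

predictions = np.array([0.1, 0.4, 0.05, 0.3, 0.15])
# argsort sorts ascending: [2, 0, 4, 3, 1]
# [-3:] keeps the indices of the three largest values: [4, 3, 1]
# [::-1] reverses them into descending order: [1, 3, 4]
top_k = predictions.argsort()[-3:][::-1]
print(top_k)  # [1 3 4]
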
/Tensorflow基础使用与文本分类应用/程序/MNIST分类.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
10 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
11 | "微信公众号:深度学习与神经网络
\n",
12 | "Github:https://github.com/Qinbf
"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import tensorflow as tf\n",
24 | "from tensorflow.examples.tutorials.mnist import input_data"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": false
32 | },
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | "Extracting MNIST_data\\train-images-idx3-ubyte.gz\n",
39 | "Extracting MNIST_data\\train-labels-idx1-ubyte.gz\n",
40 | "Extracting MNIST_data\\t10k-images-idx3-ubyte.gz\n",
41 | "Extracting MNIST_data\\t10k-labels-idx1-ubyte.gz\n",
42 | "Iter 0,Testing Accuracy 0.8304\n",
43 | "Iter 1,Testing Accuracy 0.8702\n",
44 | "Iter 2,Testing Accuracy 0.8821\n",
45 | "Iter 3,Testing Accuracy 0.8884\n",
46 | "Iter 4,Testing Accuracy 0.894\n",
47 | "Iter 5,Testing Accuracy 0.8968\n",
48 | "Iter 6,Testing Accuracy 0.9011\n",
49 | "Iter 7,Testing Accuracy 0.9019\n",
50 | "Iter 8,Testing Accuracy 0.9034\n",
51 | "Iter 9,Testing Accuracy 0.9049\n",
52 | "Iter 10,Testing Accuracy 0.9057\n",
53 | "Iter 11,Testing Accuracy 0.9073\n",
54 | "Iter 12,Testing Accuracy 0.9081\n",
55 | "Iter 13,Testing Accuracy 0.9088\n",
56 | "Iter 14,Testing Accuracy 0.9098\n",
57 | "Iter 15,Testing Accuracy 0.9108\n",
58 | "Iter 16,Testing Accuracy 0.9118\n",
59 | "Iter 17,Testing Accuracy 0.9123\n",
60 | "Iter 18,Testing Accuracy 0.9127\n",
61 | "Iter 19,Testing Accuracy 0.9137\n",
62 | "Iter 20,Testing Accuracy 0.9138\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "#载入数据集\n",
68 | "mnist = input_data.read_data_sets(\"MNIST_data\",one_hot=True)\n",
69 | "\n",
70 | "#每个批次100张照片\n",
71 | "batch_size = 100\n",
72 | "#计算一共有多少个批次\n",
73 | "n_batch = mnist.train.num_examples // batch_size\n",
74 | "\n",
75 | "#定义两个placeholder\n",
76 | "x = tf.placeholder(tf.float32,[None,784])\n",
77 | "y = tf.placeholder(tf.float32,[None,10])\n",
78 | "\n",
79 | "#创建一个简单的神经网络,输入层784个神经元,输出层10个神经元\n",
80 | "W = tf.Variable(tf.zeros([784,10]))\n",
81 | "b = tf.Variable(tf.zeros([10]))\n",
82 | "prediction = tf.nn.softmax(tf.matmul(x,W)+b)\n",
83 | "\n",
84 | "#二次代价函数\n",
85 | "#square是求平方\n",
86 | "#reduce_mean是求平均值\n",
87 | "loss = tf.reduce_mean(tf.square(y-prediction))\n",
88 | "\n",
89 | "#使用梯度下降法来最小化loss,学习率是0.2\n",
90 | "train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n",
91 | "\n",
92 | "#初始化变量\n",
93 | "init = tf.global_variables_initializer()\n",
94 | "\n",
95 | "#结果存放在一个布尔型列表中\n",
96 | "correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置\n",
97 | "#求准确率\n",
98 | "accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型\n",
99 | "\n",
100 | "with tf.Session() as sess:\n",
101 | " #执行初始化\n",
102 | " sess.run(init)\n",
103 | " #迭代21个周期\n",
104 | " for epoch in range(21):\n",
105 | " #每个周期迭代n_batch个batch,每个batch为100\n",
106 | " for batch in range(n_batch):\n",
107 | " #获得一个batch的数据和标签\n",
108 | " batch_xs,batch_ys = mnist.train.next_batch(batch_size)\n",
109 | " #通过feed喂到模型中进行训练\n",
110 | " sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})\n",
111 | " \n",
112 | " #计算准确率\n",
113 | " acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})\n",
114 | " print(\"Iter \" + str(epoch) + \",Testing Accuracy \" + str(acc))"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": []
125 | }
126 | ],
127 | "metadata": {
128 | "anaconda-cloud": {},
129 | "kernelspec": {
130 | "display_name": "Python [default]",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.5.2"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 1
149 | }
150 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/4MNIST分类.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf\n",
26 | "from tensorflow.examples.tutorials.mnist import input_data"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Extracting MNIST_data\\train-images-idx3-ubyte.gz\n",
41 | "Extracting MNIST_data\\train-labels-idx1-ubyte.gz\n",
42 | "Extracting MNIST_data\\t10k-images-idx3-ubyte.gz\n",
43 | "Extracting MNIST_data\\t10k-labels-idx1-ubyte.gz\n",
44 | "Iter 0,Testing Accuracy 0.8304\n",
45 | "Iter 1,Testing Accuracy 0.8702\n",
46 | "Iter 2,Testing Accuracy 0.8821\n",
47 | "Iter 3,Testing Accuracy 0.8884\n",
48 | "Iter 4,Testing Accuracy 0.894\n",
49 | "Iter 5,Testing Accuracy 0.8968\n",
50 | "Iter 6,Testing Accuracy 0.9011\n",
51 | "Iter 7,Testing Accuracy 0.9019\n",
52 | "Iter 8,Testing Accuracy 0.9034\n",
53 | "Iter 9,Testing Accuracy 0.9049\n",
54 | "Iter 10,Testing Accuracy 0.9057\n",
55 | "Iter 11,Testing Accuracy 0.9073\n",
56 | "Iter 12,Testing Accuracy 0.9081\n",
57 | "Iter 13,Testing Accuracy 0.9088\n",
58 | "Iter 14,Testing Accuracy 0.9098\n",
59 | "Iter 15,Testing Accuracy 0.9108\n",
60 | "Iter 16,Testing Accuracy 0.9118\n",
61 | "Iter 17,Testing Accuracy 0.9123\n",
62 | "Iter 18,Testing Accuracy 0.9127\n",
63 | "Iter 19,Testing Accuracy 0.9137\n",
64 | "Iter 20,Testing Accuracy 0.9138\n"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "#载入数据集\n",
70 | "mnist = input_data.read_data_sets(\"MNIST_data\",one_hot=True)\n",
71 | "\n",
72 | "#每个批次100张照片\n",
73 | "batch_size = 100\n",
74 | "#计算一共有多少个批次\n",
75 | "n_batch = mnist.train.num_examples // batch_size\n",
76 | "\n",
77 | "#定义两个placeholder\n",
78 | "x = tf.placeholder(tf.float32,[None,784])\n",
79 | "y = tf.placeholder(tf.float32,[None,10])\n",
80 | "\n",
81 | "#创建一个简单的神经网络,输入层784个神经元,输出层10个神经元\n",
82 | "W = tf.Variable(tf.zeros([784,10]))\n",
83 | "b = tf.Variable(tf.zeros([10]))\n",
84 | "prediction = tf.nn.softmax(tf.matmul(x,W)+b)\n",
85 | "\n",
86 | "#二次代价函数\n",
87 | "#square是求平方\n",
88 | "#reduce_mean是求平均值\n",
89 | "loss = tf.reduce_mean(tf.square(y-prediction))\n",
90 | "\n",
91 | "#使用梯度下降法来最小化loss,学习率是0.2\n",
92 | "train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n",
93 | "\n",
94 | "#初始化变量\n",
95 | "init = tf.global_variables_initializer()\n",
96 | "\n",
97 | "#结果存放在一个布尔型列表中\n",
98 | "correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置\n",
99 | "#求准确率\n",
100 | "accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型\n",
101 | "\n",
102 | "with tf.Session() as sess:\n",
103 | " #执行初始化\n",
104 | " sess.run(init)\n",
105 | " #迭代21个周期\n",
106 | " for epoch in range(21):\n",
107 | " #每个周期迭代n_batch个batch,每个batch为100\n",
108 | " for batch in range(n_batch):\n",
109 | " #获得一个batch的数据和标签\n",
110 | " batch_xs,batch_ys = mnist.train.next_batch(batch_size)\n",
111 | " #通过feed喂到模型中进行训练\n",
112 | " sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})\n",
113 | " \n",
114 | " #计算准确率\n",
115 | " acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})\n",
116 | " print(\"Iter \" + str(epoch) + \",Testing Accuracy \" + str(acc))"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": []
127 | }
128 | ],
129 | "metadata": {
130 | "anaconda-cloud": {},
131 | "kernelspec": {
132 | "display_name": "Python [default]",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.5.2"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 1
151 | }
152 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/程序/zhihu_eval.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[1]:
10 |
11 |
12 | import numpy as np
13 | import pandas as pd
14 | from tqdm import tqdm
15 | import tensorflow as tf
16 | import pickle
17 | import math
18 | from six.moves import xrange
19 |
20 |
21 | # In[2]:
22 |
23 | # Load question_eval_set
24 | reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\t',header=None)
25 | print(reader.iloc[0:5])
26 |
27 |
28 | # In[3]:
29 |
30 | # Compute the maximum number of words in any title
31 | x_text = reader.iloc[:,2]
32 | max_document_length = 0
33 | for i,line in enumerate(x_text):
34 |     try:
35 |         temp = line.split(',')
36 |         max_document_length = max(max_document_length,len(temp))
37 |     except:
38 |         # One row in the data is empty
39 |         pass
40 |         # x_text[i] = " "
41 |
42 | print("max_document_length:",max_document_length)
43 |
44 | # Load the saved vocabulary
45 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore("vocab_dict")
46 |
47 |
48 | # In[4]:
49 |
50 | # Split the data on ','
51 | text = []
52 | for line in x_text:
53 |     try:
54 |         text.append(line.split(','))
55 |     except:
56 |         # One row in the data is empty
57 |         text.append(' ')
58 |
59 |
60 | # In[5]:
61 |
62 | # Convert the dataset into word ids
63 | x = []
64 | for line in tqdm(text):
65 |     line_len = len(line)
66 |     text2num = []
67 |     for i in xrange(max_document_length):
68 |         if(i < line_len):
69 |             try:
70 |                 text2num.append(vocab_processor.vocabulary_.get(line[i]))  # map the word to its id
71 |             except:
72 |                 text2num.append(0)  # no matching word
73 |         else:
74 |             text2num.append(0)  # pad with 0
75 |     x.append(text2num)
76 | x = np.array(x)
77 | x[:5]
78 |
79 |
80 | # In[6]:
81 |
82 | def batch_iter(data, batch_size, num_epochs, shuffle=False):
83 |     """
84 |     Generates a batch iterator for a dataset.
85 |     """
86 |     data = np.array(data)
87 |     data_size = len(data)
88 |     # Number of batches per epoch
89 |     num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
90 |     print("num_batches_per_epoch:",num_batches_per_epoch)
91 |     for epoch in range(num_epochs):
92 |         # Shuffle the data at each epoch
93 |         if shuffle:
94 |             shuffle_indices = np.random.permutation(np.arange(data_size))
95 |             shuffled_data = data[shuffle_indices]
96 |         else:
97 |             shuffled_data = data
98 |         for batch_num in range(num_batches_per_epoch):
99 |             start_index = batch_num * batch_size
100 |             end_index = min((batch_num + 1) * batch_size, data_size)
101 |             yield shuffled_data[start_index:end_index]
102 |
103 |
104 | # In[7]:
105 |
106 | def eval(predict_label_and_marked_label_list):
107 |     """
108 |     :param predict_label_and_marked_label_list: a list of tuples, e.g.
109 |     [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),
110 |       ([3, 2, 1, 4, 7], [5, 7, 3])
111 |     ]
112 |     Note that predict_label has been deduplicated, e.g. [1,2,3,2,4,1,6] becomes [1,2,3,4,6].
113 |
114 |     marked_label_list itself is unordered, but the submitted predictions are ordered; for the example above the hits per position are
115 |     [0,0,0,1,1] (4 and 5 hit)
116 |     [1,0,0,0,1] (3 and 7 hit)
117 |
118 |     """
119 |     right_label_num = 0  # total number of hit labels
120 |     right_label_at_pos_num = [0, 0, 0, 0, 0]  # total hits at each position
121 |     sample_num = 0  # total number of questions
122 |     all_marked_label_num = 0  # total number of marked labels
123 |     for predict_labels, marked_labels in predict_label_and_marked_label_list:
124 |         sample_num += 1
125 |         marked_label_set = set(marked_labels)
126 |         all_marked_label_num += len(marked_label_set)
127 |         for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):
128 |             if label in marked_label_set:  # hit
129 |                 right_label_num += 1
130 |                 right_label_at_pos_num[pos] += 1
131 |
132 |     precision = 0.0
133 |     for pos, right_num in zip(range(0, 5), right_label_at_pos_num):
134 |         precision += ((right_num / float(sample_num))) / math.log(2.0 + pos)  # index 0-4 maps to position 1-5, hence log(pos + 2)
135 |     recall = float(right_label_num) / all_marked_label_num
136 |
137 |     return 2*(precision * recall) / (precision + recall)
138 |
139 |
140 | # In[8]:
141 |
142 | # Define the placeholders
143 | input_x = tf.placeholder(tf.int32, [None, x.shape[1]], name="input_x")
144 | dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
145 |
146 | # sequence_length - maximum number of words
147 | sequence_length=x.shape[1]
148 | # num_classes - number of classes
149 | num_classes=1999
150 | # vocab_size - vocabulary size
151 | vocab_size=len(vocab_processor.vocabulary_)
152 | # embedding_size - word-vector length
153 | embedding_size=256
154 | # filter_sizes - convolution kernel sizes 3, 4 and 5
155 | filter_sizes=list(map(int, [3,4,5]))
156 | # num_filters - number of filters per size
157 | num_filters=1024
158 |
159 | Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="Weights")
160 | # [None, sequence_length, embedding_size]
161 | embedded_chars = tf.nn.embedding_lookup(Weights, input_x)
162 | # Add a channel dimension: [None, sequence_length, embedding_size, 1]
163 | embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
164 | # Create a convolution + maxpool layer for each filter size
165 | pooled_outputs = []
166 | for i, filter_size in enumerate(filter_sizes):
167 |     with tf.name_scope("conv-maxpool-%s" % filter_size):
168 |         # Convolution Layer
169 |         filter_shape = [filter_size, embedding_size, 1, num_filters]
170 |         W = tf.Variable(
171 |             tf.truncated_normal(filter_shape, stddev=0.1), name="W")
172 |         b = tf.Variable(
173 |             tf.constant(0.1, shape=[num_filters]), name="b")
174 |         conv = tf.nn.conv2d(
175 |             embedded_chars_expanded,
176 |             W,
177 |             strides=[1, 1, 1, 1],
178 |             padding="VALID",
179 |             name="conv")
180 |         # Apply nonlinearity
181 |         h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
182 |         # Maxpooling over the outputs
183 |         pooled = tf.nn.max_pool(
184 |             h,
185 |             ksize=[1, sequence_length - filter_size + 1, 1, 1],
186 |             strides=[1, 1, 1, 1],
187 |             padding='VALID',
188 |             name="pool")
189 |         pooled_outputs.append(pooled)
190 |
191 | # Combine all the pooled features
192 | num_filters_total = num_filters * len(filter_sizes)
193 | print("num_filters_total:", num_filters_total)
194 | h_pool = tf.concat(pooled_outputs, 3)
195 | h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
196 |
197 | # Add dropout
198 | with tf.name_scope("dropout"): h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
199 |
200 | # Final (unnormalized) scores and predictions
201 | with tf.name_scope("output"):
202 |     W = tf.get_variable(
203 |         "W",
204 |         shape=[num_filters_total, num_classes],
205 |         initializer=tf.contrib.layers.xavier_initializer())
206 |     b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
207 |     scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
208 |
209 |
210 | # In[9]:
211 |
212 | # Choose the model checkpoint
213 | checkpoint_file = "./models/model-10000"
214 |
215 | with tf.Session() as sess:
216 |     predict_top_5 = tf.nn.top_k(scores, k=5)
217 |     sess.run(tf.global_variables_initializer())
218 |     i = 0
219 |     saver = tf.train.Saver()
220 |     saver.restore(sess, checkpoint_file)
221 |
222 |     # Generate batches
223 |     batches = batch_iter(list(x), 1000, 1)
224 |
225 |     for x_batch in batches:
226 |         i = i + 1
227 |         predict_5 = sess.run(predict_top_5,feed_dict={input_x:x_batch,dropout_keep_prob:1.0})
228 |         if i == 1:
229 |             predict = predict_5[1]
230 |         else:
231 |             predict = np.concatenate((predict,predict_5[1]))
232 |         if (i%5==0):
233 |             print ("Evaluation:step",i)
234 |
235 | np.savetxt("predict.txt",predict,fmt='%d')
236 |
237 |
238 | # In[ ]:
239 |
240 |
241 |
242 |
--------------------------------------------------------------------------------
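
Working the docstring example through eval() by hand: right_label_at_pos_num = [1, 0, 0, 1, 2], sample_num = 2, right_label_num = 4 and all_marked_label_num = 7, so precision = 0.5/ln(2) + 0.5/ln(5) + 1.0/ln(6) ≈ 1.590, recall = 4/7 ≈ 0.571, and the returned score 2pr/(p+r) ≈ 0.84. A quick check with the function defined above (note that this eval shadows Python's builtin of the same name):

print(eval([([1, 2, 3, 4, 5], [4, 5, 6, 7]),
            ([3, 2, 1, 4, 7], [5, 7, 3])]))  # ≈ 0.8407
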
/Tensorflow基础使用与文本分类应用/程序/zhihu_predict.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "from tqdm import tqdm\n",
23 | "import re\n",
24 | "import numpy as np\n",
25 | "from six.moves import xrange"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | " 0 1 \\\n",
40 | "0 738845194850773558 -5833678375673307423 \n",
41 | "1 3738968195649774859 2027693463582123305 \n",
42 | "2 4738849194894773882 1127459907694805235 \n",
43 | "3 7739004195693774975 2904932941037075699,1160326435131345730,725917... \n",
44 | "4 -7261194805221226386 -5833678375673307423 \n",
45 | "\n",
46 | " 2 3 4 \\\n",
47 | "0 c0,c1 w0 c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1... \n",
48 | "1 c39,c40 w24 c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49... \n",
49 | "2 c172,c31,c0,c1 w102 NaN \n",
50 | "3 c39,c40,c5,c173 w103 c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c... \n",
51 | "4 c36,c31,c45,c237 w148 c238,c239 \n",
52 | "\n",
53 | " 5 \n",
54 | "0 w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,... \n",
55 | "1 w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33... \n",
56 | "2 NaN \n",
57 | "3 w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107... \n",
58 | "4 w149,w150 \n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "topic_info = pd.read_table(\"./ieee_zhihu_cup/topic_info.txt\",sep='\\t',header=None)\n",
64 | "print(topic_info.iloc[0:5])"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "# 话题字典\n",
76 | "topic_dict = {}\n",
77 | "for i in xrange(topic_info.shape[0]):\n",
78 | " topic_dict[i] = topic_info.iloc[i][0]"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {
85 | "collapsed": true
86 | },
87 | "outputs": [],
88 | "source": [
89 | "predict = open('predict.txt', \"r\")\n",
90 | "examples = predict.readlines()\n",
91 | "text = np.array([line.split(\" \") for line in examples])"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {
98 | "collapsed": false
99 | },
100 | "outputs": [
101 | {
102 | "name": "stderr",
103 | "output_type": "stream",
104 | "text": [
105 | "100%|██████████| 217360/217360 [00:01<00:00, 160389.86it/s]\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "label = []\n",
111 | "for line in tqdm(text):\n",
112 | " num2label = []\n",
113 | " for i in xrange(5):\n",
114 | " num2label.append(topic_dict[int(line[i])]) # 把0-1999编号转成原来的id\n",
115 | " label.append(num2label)\n",
116 | "label = np.array(label)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 6,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [],
126 | "source": [
127 | "np.savetxt(\"temp.txt\",label,fmt='%d')"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 7,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [],
137 | "source": [
138 | "def clean_str(string):\n",
139 | " string = re.sub(r\" \", \",\", string)\n",
140 | " return string\n",
141 | "\n",
142 | "file1 = open('temp.txt', \"r\")\n",
143 | "examples = file1.readlines()\n",
144 | "examples = [clean_str(line) for line in examples]\n",
145 | "file1.close()\n",
146 | "\n",
147 | "file1 = open('temp.txt', \"w\")\n",
148 | "file1.writelines(examples)\n",
149 | "file1.close()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 8,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | " 0\n",
164 | "0 -3517637179126242000,-4653836020042332281,4715...\n",
165 | "1 3418451812342379591,2858911571784840089,238291...\n",
166 | "2 -7358589937244777363,-5265476641576484497,7477...\n",
167 | "3 -7046289575185911002,-4653836020042332281,-587...\n",
168 | "4 4715442001886462944,-8963554618409314978,11274...\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "# predict文件导入\n",
174 | "predict_file = 'temp.txt'\n",
175 | "predict_reader = pd.read_table(predict_file,sep=' ',header=None)\n",
176 | "print(predict_reader.iloc[0:5])"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 9,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | " 0 1 \\\n",
191 | "0 6215603645409872328 c924,c531,c102,c284,c188,c104,c98,c107,c11,c11... \n",
192 | "1 6649324930261961840 c346,c1549,c413,c294,c675,c504,c183,c74,c541,c... \n",
193 | "2 -4251899610700378615 c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10... \n",
194 | "\n",
195 | " 2 \\\n",
196 | "0 w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47... \n",
197 | "1 w40132,w1357,w1556,w1380,w2464,w33,w16791,w109... \n",
198 | "2 w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w... \n",
199 | "\n",
200 | " 3 \\\n",
201 | "0 c1128,c529,c636,c572,c1321,c139,c540,c223,c510... \n",
202 | "1 NaN \n",
203 | "2 c149,c148,c148,c42,c185,c95,c95,c186,c186,c186... \n",
204 | "\n",
205 | " 4 \n",
206 | "0 w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2... \n",
207 | "1 NaN \n",
208 | "2 NaN \n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "# 导入question_train_set\n",
214 | "eval_reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\\t',header=None)\n",
215 | "print(eval_reader.iloc[0:3])"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": false
223 | },
224 | "outputs": [
225 | {
226 | "name": "stdout",
227 | "output_type": "stream",
228 | "text": [
229 | " 0 0\n",
230 | "0 6215603645409872328 -3517637179126242000,-4653836020042332281,4715...\n",
231 | "1 6649324930261961840 3418451812342379591,2858911571784840089,238291...\n",
232 | "2 -4251899610700378615 -7358589937244777363,-5265476641576484497,7477...\n",
233 | "3 6213817087034420233 -7046289575185911002,-4653836020042332281,-587...\n",
234 | "4 -8930652370334418373 4715442001886462944,-8963554618409314978,11274...\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "final_predict = pd.concat([eval_reader.ix[:,0],predict_reader],axis=1)\n",
240 | "print(final_predict.iloc[0:5])"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 11,
246 | "metadata": {
247 | "collapsed": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "final_predict.to_csv('temp.txt', header=None, index=None, sep=',')\n",
252 | "\n",
253 | "final_file = open('temp.txt', \"r\")\n",
254 | "final_examples = final_file.readlines()\n",
255 | "final_examples = [re.sub(r'\"',\"\",line) for line in final_examples]\n",
256 | "final_file.close()\n",
257 | "\n",
258 | "final_file = open('final_predict.csv', \"w\")\n",
259 | "final_file.writelines(final_examples)\n",
260 | "final_file.close()"
261 | ]
262 | }
263 | ],
264 | "metadata": {
265 | "anaconda-cloud": {},
266 | "kernelspec": {
267 | "display_name": "Python [default]",
268 | "language": "python",
269 | "name": "python3"
270 | },
271 | "language_info": {
272 | "codemirror_mode": {
273 | "name": "ipython",
274 | "version": 3
275 | },
276 | "file_extension": ".py",
277 | "mimetype": "text/x-python",
278 | "name": "python",
279 | "nbconvert_exporter": "python",
280 | "pygments_lexer": "ipython3",
281 | "version": "3.5.2"
282 | }
283 | },
284 | "nbformat": 4,
285 | "nbformat_minor": 2
286 | }
287 |
--------------------------------------------------------------------------------
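
A note on the quote-stripping pass at the end of zhihu_predict above: when a field itself contains the separator, pandas' to_csv wraps that field in double quotes, which the submission format does not accept; hence the second pass that rewrites the file with re.sub. A minimal standalone sketch (illustrative data only):

    import pandas as pd

    df = pd.DataFrame([["q1", "1,2,3"]])
    print(df.to_csv(header=None, index=None, sep=','))
    # q1,"1,2,3"   <- the label list gets quoted because it contains the
    #                 separator, which is why the script strips the quotes
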
/Tensorflow基础使用与文本分类应用/程序/cnn.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[1]:
10 |
11 | import tensorflow as tf
12 | import numpy as np
13 | import os
14 | import time
15 | import numpy as np
16 | import pandas as pd
17 | import math
18 | from tqdm import tqdm
19 | from six.moves import xrange
20 |
21 |
22 | # In[2]:
23 |
24 | # Parameters
25 | # ==================================================
26 |
27 | # Data loading params
28 | # Fraction of the training data held out for validation
29 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
30 | # Data file
31 | tf.flags.DEFINE_string("data_file", "./ieee_zhihu_cup/data_topic_block_0.txt", "Data source for the positive data.")
32 |
33 | # Model Hyperparameters
34 | # Word-embedding length
35 | tf.flags.DEFINE_integer("embedding_dim", 256, "Dimensionality of character embedding (default: 256)")
36 | # Filter sizes
37 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
38 | # Number of filters per filter size
39 | tf.flags.DEFINE_integer("num_filters", 1024, "Number of filters per filter size (default: 1024)")
40 | # Dropout keep probability
41 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
42 | # L2 regularization strength (defined here but not used in the loss below)
43 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0005, "L2 regularization lambda (default: 0.0005)")
44 |
45 | # Training parameters
46 | # Batch size
47 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
48 | # Number of training epochs
49 | tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)")
50 | # How many steps between evaluations
51 | tf.flags.DEFINE_integer("evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)")
52 | # How many steps between checkpoints
53 | tf.flags.DEFINE_integer("checkpoint_every", 200, "Save model after this many steps (default: 200)")
54 | # How many checkpoints to keep
55 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
56 |
57 | # Parse the flags
58 | FLAGS = tf.flags.FLAGS
59 | FLAGS._parse_flags()
60 |
61 | # Print all parameters
62 | print("\nParameters:")
63 | for attr, value in sorted(FLAGS.__flags.items()):
64 | print("{}={}".format(attr.upper(), value))
65 | print("")
66 |
67 |
68 | # In[3]:
69 |
70 | y = []
71 | x_text = []
72 |
73 | # Read the training data and labels
74 | reader = pd.read_table(FLAGS.data_file,sep='\t',header=None)
75 | for i in tqdm(xrange(reader.shape[0])):
76 |     # Split the labels on ','
77 |     temp = reader.iloc[i][1].split(',')
78 |     # If there are more than 5 labels, keep only the first 5
79 |     if (len(temp)>5):
80 |         temp = temp[0:5]
81 |     # Multi-hot encoding: set the label positions to 1, the rest to 0
82 | label = np.zeros(1999)
83 | for temp_label in temp:
84 | label[int(temp_label)] = 1
85 | y.append(label)
86 | x_text.append(reader.iloc[i][0])
87 |
88 |
89 | # In[4]:
90 |
91 | # Print the first 5 rows of x_text and y
92 | print(x_text[0:5])
93 | y = np.array(y, dtype = np.float32)
94 | print(y[0:5])
95 |
96 |
97 | # In[5]:
98 |
99 | # Build vocabulary
100 | # Maximum number of tokens in any one document
101 | max_document_length = max([len(x.split(",")) for x in x_text])
102 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)
103 |
104 | x = np.array(list(vocab_processor.fit_transform(x_text)))
105 | print("x_shape:",x.shape)
106 | print("y_shape:",y.shape)
107 |
108 | # Save the vocabulary
109 | vocab_processor.save("vocab_dict")
110 |
111 | # Split train/test set
112 | # Split the data into a training set and a validation set
113 | dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
114 | x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
115 | y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]
116 |
117 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
118 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
119 | print("x:",x_train[0:5])
120 | print("y:",y_train[0:5])
121 |
122 |
123 | # In[6]:
124 |
125 | # Define three placeholders
126 | input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name="input_x")
127 | input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name="input_y")
128 | dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
129 |
130 | # sequence_length - maximum document length
131 | sequence_length=x_train.shape[1]
132 | # num_classes - number of classes
133 | num_classes=y_train.shape[1]
134 | # vocab_size - vocabulary size
135 | vocab_size=len(vocab_processor.vocabulary_)
136 | # embedding_size - word-embedding length
137 | embedding_size=FLAGS.embedding_dim
138 | # filter_sizes - filter sizes 3, 4 and 5
139 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(",")))
140 | # num_filters - number of filters per size
141 | num_filters=FLAGS.num_filters
142 |
143 | Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="Weights")
144 | # shape: [None, sequence_length, embedding_size]
145 | embedded_chars = tf.nn.embedding_lookup(Weights, input_x)
146 | # Add a channel dimension, shape: [None, sequence_length, embedding_size, 1]
147 | embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
148 |
149 | # Create a convolution + maxpool layer for each filter size
150 | pooled_outputs = []
151 | for i, filter_size in enumerate(filter_sizes):
152 | with tf.name_scope("conv-maxpool-%s" % filter_size):
153 | # Convolution Layer
154 | filter_shape = [filter_size, embedding_size, 1, num_filters]
155 | W = tf.Variable(
156 | tf.truncated_normal(filter_shape, stddev=0.1), name="W")
157 | b = tf.Variable(
158 | tf.constant(0.1, shape=[num_filters]), name="b")
159 | conv = tf.nn.conv2d(
160 | embedded_chars_expanded,
161 | W,
162 | strides=[1, 1, 1, 1],
163 | padding="VALID",
164 | name="conv")
165 | # Apply nonlinearity
166 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
167 | # Maxpooling over the outputs
168 | pooled = tf.nn.max_pool(
169 | h,
170 | ksize=[1, sequence_length - filter_size + 1, 1, 1],
171 | strides=[1, 1, 1, 1],
172 | padding='VALID',
173 | name="pool")
174 | pooled_outputs.append(pooled)
175 |
176 | # Combine all the pooled features
177 | num_filters_total = num_filters * len(filter_sizes)
178 | print("num_filters_total:", num_filters_total)
179 | h_pool = tf.concat(pooled_outputs, 3)
180 | h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
181 |
182 | # Add dropout
183 | with tf.name_scope("dropout"): h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
184 |
185 | # Final (unnormalized) scores and predictions
186 | with tf.name_scope("output"):
187 | W = tf.get_variable(
188 | "W",
189 | shape=[num_filters_total, num_classes],
190 | initializer=tf.contrib.layers.xavier_initializer())
191 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
192 | scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
193 |
194 | # Define the loss
195 | with tf.name_scope("loss"):
196 |     loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y))
197 |
198 | # Define the optimizer
199 | with tf.name_scope("optimizer"):
200 | optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
201 |
202 |
203 | # In[7]:
204 |
205 | # Generate batches of data
206 | def batch_iter(data, batch_size, num_epochs, shuffle=False):
207 | """
208 | Generates a batch iterator for a dataset.
209 | """
210 | data = np.array(data)
211 | data_size = len(data)
212 |     # Number of batches per epoch
213 | num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
214 | print("num_batches_per_epoch:",num_batches_per_epoch)
215 | for epoch in range(num_epochs):
216 | # Shuffle the data at each epoch
217 | if shuffle:
218 | shuffle_indices = np.random.permutation(np.arange(data_size))
219 | shuffled_data = data[shuffle_indices]
220 | else:
221 | shuffled_data = data
222 | for batch_num in range(num_batches_per_epoch):
223 | start_index = batch_num * batch_size
224 | end_index = min((batch_num + 1) * batch_size, data_size)
225 | yield shuffled_data[start_index:end_index]
226 |
227 |
228 | # In[ ]:
229 |
230 | # The evaluation metric provided by Zhihu
231 | def eval(predict_label_and_marked_label_list):
232 |     """
233 |     :param predict_label_and_marked_label_list: a list of tuples, e.g.
234 |        [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),
235 |          ([3, 2, 1, 4, 7], [5, 7, 3])
236 |         ]
237 |     Note that predict_label here must be deduplicated: e.g. [1,2,3,2,4,1,6] becomes [1,2,3,4,6]
238 |
239 |     marked_label_list itself has no ordering, but the submitted predictions do; for the example above the hits are
240 |     [0,0,0,1,1] (4 and 5 hit)
241 |     [1,0,0,0,1] (3 and 7 hit)
242 |
243 |     """
244 |     right_label_num = 0  # total number of hit labels
245 |     right_label_at_pos_num = [0, 0, 0, 0, 0]  # number of hits at each position
246 |     sample_num = 0  # total number of questions
247 |     all_marked_label_num = 0  # total number of marked labels
248 |     for predict_labels, marked_labels in predict_label_and_marked_label_list:
249 |         sample_num += 1
250 |         marked_label_set = set(marked_labels)
251 |         all_marked_label_num += len(marked_label_set)
252 |         for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):
253 |             if label in marked_label_set:  # hit
254 |                 right_label_num += 1
255 |                 right_label_at_pos_num[pos] += 1
256 |
257 |     precision = 0.0
258 |     for pos, right_num in zip(range(0, 5), right_label_at_pos_num):
259 |         precision += ((right_num / float(sample_num))) / math.log(2.0 + pos)  # indices 0-4 map to positions 1-5, plus 1 more, hence the 2.0 + pos
260 |     recall = float(right_label_num) / all_marked_label_num
261 |
262 |     return 2*(precision * recall) / (precision + recall)
263 |
264 |
265 | # In[ ]:
266 |
267 | # Define the saver; keep only the 5 most recent checkpoints
268 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
269 |
270 | with tf.Session() as sess:
271 | predict_top_5 = tf.nn.top_k(scores, k=5)
272 | label_top_5 = tf.nn.top_k(input_y, k=5)
273 | sess.run(tf.global_variables_initializer())
274 | i = 0
275 |     # Generate the batches
276 |     batches = batch_iter(
277 |         list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
278 |     for batch in batches:
279 |         i = i + 1
280 |         # Get one batch of data
281 |         x_batch, y_batch = zip(*batch)
282 |         # Run one optimization step
283 |         sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob})
284 |
285 |         # Evaluate once every 50 training steps
286 | if (i % FLAGS.evaluate_every == 0):
287 | print ("Evaluation:step",i)
288 | predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch,
289 | input_y:y_batch,
290 | dropout_keep_prob:1.0})
291 | print ("label:",label_5[1][:5])
292 | print ("predict:",predict_5[1][:5])
293 | print ("predict:",predict_5[0][:5])
294 | print ("loss:",_loss)
295 | predict_label_and_marked_label_list = []
296 | for predict,label in zip(predict_5[1],label_5[1]):
297 | predict_label_and_marked_label_list.append((list(predict),list(label)))
298 | score = eval(predict_label_and_marked_label_list)
299 | print("score:",score)
300 |
301 |         # Save a checkpoint every 200 training steps
302 | if (i % FLAGS.checkpoint_every == 0):
303 | path = saver.save(sess, "models/model", global_step=i)
304 | print("Saved model checkpoint to {}".format(path))
305 |
306 |
307 | # In[ ]:
308 |
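309 | # A quick, optional sanity check of eval() on the example from its docstring
310 | # (illustrative only, independent of the training run above): the prediction
311 | # [1, 2, 3, 4, 5] against marked labels [4, 5, 6, 7] hits at positions 4 and 5,
312 | # so precision = 1/log(5) + 1/log(6) ~ 1.18 and recall = 2/4 = 0.5.
313 | print(eval([([1, 2, 3, 4, 5], [4, 5, 6, 7])]))  # ~ 0.7023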
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/程序/data_handle.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
\n",
11 | "\n",
12 | "question_train_set.txt: \n",
13 | " 第一列为 问题id; \n",
14 | " 第二列为 title 的字符编号序列; \n",
15 | " 第三列为 title 的词语编号序列; \n",
16 | " 第四列为描述的字符编号序列; \n",
17 | " 第五列为描述的词语标号序列。 \n",
18 | " \n",
19 | "question_topic_train_set.txt: \n",
20 | " 第一列 问题 id; \n",
21 | " 第二列 话题 id。 \n",
22 | "\n",
23 | "topic_info.txt: \n",
24 | " 第一列为话题 id \n",
25 | " 第二列为话题的父话题 id。话题之间是有向无环图结构,一个话题可能有 0 到多个父话题; \n",
26 | " 第三列为话题名字的字符编号序列; \n",
27 | " 第四列为话题名字的词语编号序列; \n",
28 | " 第五列为话题描述的字符编号序列; \n",
29 | " 第六列为话题描述的词语编号序列。 \n",
30 | "\n",
31 | "1.title通常来说包含的信息最重要。对于question_train_set.txt文件,为了简单起见,我们只取第三列,title的词语编号序列。 \n",
32 | "2.对于topic_info.txt,为了简单起见,我们不考虑2,3,4,5,6列。只是简单的提取话题id,然后转为0-1998的数字(一共有1999个话题) \n",
33 | "3.然后合并以上一些数据,得到最后处理后的数据。 "
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {
40 | "collapsed": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "import pandas as pd\n",
45 | "from tqdm import tqdm # pip install tqdm\n",
46 | "from six.moves import xrange"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | " 0 1 \\\n",
61 | "0 6555699376639805223 c324,c39,c40,c155,c180,c180,c181,c17,c4,c1153,... \n",
62 | "1 2887834264226772863 c44,c110,c101,c286,c106,c150,c101,c892,c632,c1... \n",
63 | "2 -2687466858632038806 c15,c768,c769,c1363,c650,c1218,c2361,c11,c90,c... \n",
64 | "3 -5698296155734268 c473,c1528,c528,c428,c295,c15,c101,c188,c146,c... \n",
65 | "4 -6719100304248915192 c190,c147,c105,c219,c220,c101,c647,c219,c220,c... \n",
66 | "\n",
67 | " 2 \\\n",
68 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... \n",
69 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... \n",
70 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... \n",
71 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... \n",
72 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... \n",
73 | "\n",
74 | " 3 \\\n",
75 | "0 c335,c101,c611,c189,c97,c144,c147,c101,c15,c76... \n",
76 | "1 c1265,c518,c74,c131,c274,c57,c768,c769,c368,c3... \n",
77 | "2 c693,c100,c279,c99,c189,c532,c101,c189,c145,c1... \n",
78 | "3 NaN \n",
79 | "4 c644,c1212,c253,c199,c431,c452,c424,c207,c2,c1... \n",
80 | "\n",
81 | " 4 \n",
82 | "0 w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1... \n",
83 | "1 w12508,w1380,w72,w27045,w276,w111 \n",
84 | "2 w140340,w54,w48398,w54,w140341,w54,w12856,w54,... \n",
85 | "3 NaN \n",
86 | "4 w4821,w1301,w16003,w928,w1961,w2565,w50803,w11... \n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "# 导入question_train_set\n",
92 | "reader = pd.read_table('./ieee_zhihu_cup/question_train_set.txt',sep='\\t',header=None)\n",
93 | "print(reader.iloc[0:5])"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 3,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | " 0 1\n",
108 | "0 6555699376639805223 7739004195693774975,3738968195649774859\n",
109 | "1 2887834264226772863 -3149765934180654494\n",
110 | "2 -2687466858632038806 -760432988437306018\n",
111 | "3 -5698296155734268 -6758942141122113907,3195914392210930723\n",
112 | "4 -6719100304248915192 3804601920633030746,4797226510592237555,435133...\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "# 导入question_topic_eval_set\n",
118 | "topic_reader = pd.read_table('./ieee_zhihu_cup/question_topic_train_set.txt',sep='\\t',header=None)\n",
119 | "print(topic_reader.iloc[0:5])"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 4,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | " 0 \\\n",
134 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... \n",
135 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... \n",
136 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... \n",
137 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... \n",
138 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... \n",
139 | "\n",
140 | " 1 \n",
141 | "0 7739004195693774975,3738968195649774859 \n",
142 | "1 -3149765934180654494 \n",
143 | "2 -760432988437306018 \n",
144 | "3 -6758942141122113907,3195914392210930723 \n",
145 | "4 3804601920633030746,4797226510592237555,435133... \n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "# 合并title 的词语编号序列和话题 id\n",
151 | "data_topic = pd.concat([reader.ix[:,2], topic_reader.ix[:,1]], axis=1, ignore_index=True)\n",
152 | "print(data_topic.iloc[0:5])"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 5,
158 | "metadata": {
159 | "collapsed": false
160 | },
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | " 0 1 \\\n",
167 | "0 738845194850773558 -5833678375673307423 \n",
168 | "1 3738968195649774859 2027693463582123305 \n",
169 | "2 4738849194894773882 1127459907694805235 \n",
170 | "3 7739004195693774975 2904932941037075699,1160326435131345730,725917... \n",
171 | "4 -7261194805221226386 -5833678375673307423 \n",
172 | "\n",
173 | " 2 3 4 \\\n",
174 | "0 c0,c1 w0 c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1... \n",
175 | "1 c39,c40 w24 c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49... \n",
176 | "2 c172,c31,c0,c1 w102 NaN \n",
177 | "3 c39,c40,c5,c173 w103 c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c... \n",
178 | "4 c36,c31,c45,c237 w148 c238,c239 \n",
179 | "\n",
180 | " 5 \n",
181 | "0 w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,... \n",
182 | "1 w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33... \n",
183 | "2 NaN \n",
184 | "3 w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107... \n",
185 | "4 w149,w150 \n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "# 导入topic_info\n",
191 | "label_reader = pd.read_table('./ieee_zhihu_cup/topic_info.txt',sep='\\t',header=None)\n",
192 | "print(label_reader.iloc[0:5])"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 6,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "3\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "# 把标签转为0-1998的编号\n",
212 | "labels = list(label_reader.iloc[:,0])\n",
213 | "my_labels = []\n",
214 | "for label in labels:\n",
215 | " my_labels.append(label)\n",
216 | " \n",
217 | "# 建立topic字典\n",
218 | "topic_dict = {}\n",
219 | "for i,label in enumerate(my_labels):\n",
220 | " topic_dict[label] = i\n",
221 | "\n",
222 | "print(topic_dict[7739004195693774975])"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 7,
228 | "metadata": {
229 | "collapsed": false
230 | },
231 | "outputs": [
232 | {
233 | "name": "stderr",
234 | "output_type": "stream",
235 | "text": [
236 | "100%|██████████████████████████████████████████████████████████████████████| 2999967/2999967 [12:15<00:00, 4076.87it/s]\n"
237 | ]
238 | },
239 | {
240 | "name": "stdout",
241 | "output_type": "stream",
242 | "text": [
243 | " 0 1\n",
244 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... 3,1\n",
245 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... 769\n",
246 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... 342\n",
247 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... 1842,12\n",
248 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... 155,150,110,7,6\n"
249 | ]
250 | }
251 | ],
252 | "source": [
253 | "for i in tqdm(xrange(data_topic.shape[0])):\n",
254 | " new_label = ''\n",
255 | " # 根据“,”切分话题id\n",
256 | " temp_topic = data_topic.iloc[i][1].split(',')\n",
257 | " for topic in temp_topic:\n",
258 | " # 判断该label是否在label文件中,并得到该行\n",
259 | " label_num = topic_dict[int(topic)]\n",
260 | " new_label = new_label + str(label_num) + ','\n",
261 | " data_topic.iloc[i][1] = new_label[:-1]\n",
262 | "print(data_topic.iloc[:5])"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 8,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "# 保存处理过后的文件\n",
274 | "data_topic.to_csv(\"./ieee_zhihu_cup/data_topic.txt\", header=None, index=None, sep='\\t')\n",
275 | "\n",
276 | "# 切分成10块保存\n",
277 | "for i in xrange(10):\n",
278 | " data_topic_filename = './ieee_zhihu_cup/data_topic_block_' + str(i) + '.txt'\n",
279 | " if (i+1)*300000 < data_topic.shape[0]:\n",
280 | " data_topic.iloc[i*300000:(i+1)*300000].to_csv(\n",
281 | " data_topic_filename, header=None, index=None, sep='\\t')\n",
282 | " else:\n",
283 | " data_topic.iloc[i*300000:data_topic.shape[0]].to_csv(\n",
284 | " data_topic_filename, header=None, index=None, sep='\\t')"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {
291 | "collapsed": true
292 | },
293 | "outputs": [],
294 | "source": []
295 | }
296 | ],
297 | "metadata": {
298 | "anaconda-cloud": {},
299 | "kernelspec": {
300 | "display_name": "Python [default]",
301 | "language": "python",
302 | "name": "python3"
303 | },
304 | "language_info": {
305 | "codemirror_mode": {
306 | "name": "ipython",
307 | "version": 3
308 | },
309 | "file_extension": ".py",
310 | "mimetype": "text/x-python",
311 | "name": "python",
312 | "nbconvert_exporter": "python",
313 | "pygments_lexer": "ipython3",
314 | "version": "3.5.2"
315 | }
316 | },
317 | "nbformat": 4,
318 | "nbformat_minor": 2
319 | }
320 |
--------------------------------------------------------------------------------
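
The per-row loop in data_handle above (a dict lookup plus an .iloc assignment for each of the ~3M rows) is what takes the ~12 minutes shown in the tqdm output. A vectorized sketch of the same id-to-number mapping, assuming the label_reader and data_topic frames built above are in scope:

    # Hypothetical faster variant of the mapping loop in data_handle.ipynb
    topic_dict = {label: i for i, label in enumerate(label_reader.iloc[:, 0])}
    data_topic[1] = data_topic[1].map(
        lambda s: ','.join(str(topic_dict[int(t)]) for t in s.split(',')))
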
/Tensorflow基础使用与文本分类应用/程序/cnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import tensorflow as tf\n",
22 | "import numpy as np\n",
23 | "import os\n",
24 | "import time\n",
25 | "import numpy as np\n",
26 | "import pandas as pd\n",
27 | "import math\n",
28 | "from tqdm import tqdm\n",
29 | "from six.moves import xrange"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {
36 | "collapsed": false
37 | },
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "\n",
44 | "Parameters:\n",
45 | "BATCH_SIZE=64\n",
46 | "CHECKPOINT_EVERY=200\n",
47 | "DATA_FILE=./ieee_zhihu_cup/data_topic_block_0.txt\n",
48 | "DEV_SAMPLE_PERCENTAGE=0.1\n",
49 | "DROPOUT_KEEP_PROB=0.5\n",
50 | "EMBEDDING_DIM=256\n",
51 | "EVALUATE_EVERY=50\n",
52 | "FILTER_SIZES=3,4,5\n",
53 | "L2_REG_LAMBDA=0.0005\n",
54 | "NUM_CHECKPOINTS=5\n",
55 | "NUM_EPOCHS=10\n",
56 | "NUM_FILTERS=1024\n",
57 | "\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "# Parameters\n",
63 | "# ==================================================\n",
64 | "\n",
65 | "# Data loading params\n",
66 | "# validation数据集占比\n",
67 | "tf.flags.DEFINE_float(\"dev_sample_percentage\", .1, \"Percentage of the training data to use for validation\")\n",
68 | "# 数据集\n",
69 | "tf.flags.DEFINE_string(\"data_file\", \"./ieee_zhihu_cup/data_topic_block_0.txt\", \"Data source for the positive data.\")\n",
70 | "\n",
71 | "# Model Hyperparameters\n",
72 | "# 词向量长度\n",
73 | "tf.flags.DEFINE_integer(\"embedding_dim\", 256, \"Dimensionality of character embedding (default: 256)\")\n",
74 | "# 卷积核大小\n",
75 | "tf.flags.DEFINE_string(\"filter_sizes\", \"3,4,5\", \"Comma-separated filter sizes (default: '3,4,5')\")\n",
76 | "# 每一种卷积核个数\n",
77 | "tf.flags.DEFINE_integer(\"num_filters\", 1024, \"Number of filters per filter size (default: 1024)\")\n",
78 | "# dropout参数\n",
79 | "tf.flags.DEFINE_float(\"dropout_keep_prob\", 0.5, \"Dropout keep probability (default: 0.5)\")\n",
80 | "# l2正则化参数\n",
81 | "tf.flags.DEFINE_float(\"l2_reg_lambda\", 0.0005, \"L2 regularization lambda (default: 0.0005)\")\n",
82 | "\n",
83 | "# Training parameters\n",
84 | "# 批次大小\n",
85 | "tf.flags.DEFINE_integer(\"batch_size\", 64, \"Batch Size (default: 64)\")\n",
86 | "# 迭代周期\n",
87 | "tf.flags.DEFINE_integer(\"num_epochs\", 10, \"Number of training epochs (default: 10)\")\n",
88 | "# 多少step测试一次\n",
89 | "tf.flags.DEFINE_integer(\"evaluate_every\", 50, \"Evaluate model on dev set after this many steps (default: 50)\")\n",
90 | "# 多少step保存一次模型\n",
91 | "tf.flags.DEFINE_integer(\"checkpoint_every\", 200, \"Save model after this many steps (default: 200)\")\n",
92 | "# 保存多少个模型\n",
93 | "tf.flags.DEFINE_integer(\"num_checkpoints\", 5, \"Number of checkpoints to store (default: 5)\")\n",
94 | "\n",
95 | "# flags解析\n",
96 | "FLAGS = tf.flags.FLAGS\n",
97 | "FLAGS._parse_flags()\n",
98 | "\n",
99 | "# 打印所有参数\n",
100 | "print(\"\\nParameters:\")\n",
101 | "for attr, value in sorted(FLAGS.__flags.items()):\n",
102 | " print(\"{}={}\".format(attr.upper(), value))\n",
103 | "print(\"\")"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 3,
109 | "metadata": {
110 | "collapsed": false
111 | },
112 | "outputs": [
113 | {
114 | "name": "stderr",
115 | "output_type": "stream",
116 | "text": [
117 | "100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [01:15<00:00, 3959.17it/s]\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "y = []\n",
123 | "x_text = []\n",
124 | "\n",
125 | "# 读取训练数据和标签\n",
126 | "reader = pd.read_table(FLAGS.data_file,sep='\\t',header=None)\n",
127 | "for i in tqdm(xrange(reader.shape[0])):\n",
128 | " # 按','切分标签\n",
129 | " temp = reader.iloc[i][1].split(',')\n",
130 | " # 如果分类数大于5,只取前5个分类\n",
131 | " if (len(temp)>5):\n",
132 | " temp = temp[0:5]\n",
133 | " # 设置标签的对应位置为1,其余位置为0\n",
134 | " label = np.zeros(1999)\n",
135 | " for temp_label in temp:\n",
136 | " label[int(temp_label)] = 1\n",
137 | " y.append(label)\n",
138 | " x_text.append(reader.iloc[i][0])"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 4,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "['w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w31389,w6,w1019,w69288,w111,w3332,w109,w11,w25,w1110,w111', 'w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w875,w3352,w500,w21790,w12144,w111', 'w875,w15450,w42394,w15863,w6,w95421,w25,w803,w346,w6,w3763,w347,w88,w111', 'w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w140344,w111,w112,w49270,w2129,w6,w6978,w359,w10147,w111', 'w380,w54,w674,w133,w54,w134,w614,w54,w929,w307,w109,w110,w19045,w6,w5830,w111']\n",
153 | "[[ 0. 1. 0. ..., 0. 0. 0.]\n",
154 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
155 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
156 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
157 | " [ 0. 0. 0. ..., 0. 0. 0.]]\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "# 打印x_text和y的前5行\n",
163 | "print(x_text[0:5])\n",
164 | "y = np.array(y, dtype = np.float32)\n",
165 | "print(y[0:5])"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 5,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "x_shape: (300000, 72)\n",
180 | "y_shape: (300000, 1999)\n",
181 | "Vocabulary Size: 131900\n",
182 | "Train/Dev split: 270000/30000\n",
183 | "x: [[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 4 16 17 13 0 0 0 0 0\n",
184 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
185 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
186 | " [18 19 20 21 22 19 23 10 24 25 26 27 28 29 13 0 0 0 0 0 0 0 0 0\n",
187 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
188 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
189 | " [25 30 31 32 10 33 16 34 35 10 36 37 38 13 0 0 0 0 0 0 0 0 0 0\n",
190 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
191 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
192 | " [39 40 41 42 19 43 19 44 45 46 13 47 48 49 10 50 51 52 13 0 0 0 0 0\n",
193 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
194 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
195 | " [53 19 54 55 19 56 57 19 58 59 15 45 60 10 61 13 0 0 0 0 0 0 0 0\n",
196 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
197 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n",
198 | "y: [[ 0. 1. 0. ..., 0. 0. 0.]\n",
199 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
200 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
201 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
202 | " [ 0. 0. 0. ..., 0. 0. 0.]]\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "# Build vocabulary\n",
208 | "# 计算一段文本中最多的词汇数\n",
209 | "max_document_length = max([len(x.split(\",\")) for x in x_text])\n",
210 | "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)\n",
211 | "\n",
212 | "x = np.array(list(vocab_processor.fit_transform(x_text)))\n",
213 | "print(\"x_shape:\",x.shape)\n",
214 | "print(\"y_shape:\",y.shape)\n",
215 | "\n",
216 | "# 保存字典\n",
217 | "vocab_processor.save(\"vocab_dict\")\n",
218 | "\n",
219 | "# Split train/test set\n",
220 | "# 数据集切分为两部分,训练集和验证集\n",
221 | "dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))\n",
222 | "x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]\n",
223 | "y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]\n",
224 | "\n",
225 | "print(\"Vocabulary Size: {:d}\".format(len(vocab_processor.vocabulary_)))\n",
226 | "print(\"Train/Dev split: {:d}/{:d}\".format(len(y_train), len(y_dev)))\n",
227 | "print(\"x:\",x_train[0:5])\n",
228 | "print(\"y:\",y_train[0:5])"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 6,
234 | "metadata": {
235 | "collapsed": false
236 | },
237 | "outputs": [
238 | {
239 | "name": "stdout",
240 | "output_type": "stream",
241 | "text": [
242 | "num_filters_total: 3072\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "# 定义三个placeholder\n",
248 | "input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name=\"input_x\")\n",
249 | "input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name=\"input_y\")\n",
250 | "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n",
251 | "\n",
252 | "# sequence_length-最长词汇数\n",
253 | "sequence_length=x_train.shape[1]\n",
254 | "# num_classes-分类数\n",
255 | "num_classes=y_train.shape[1]\n",
256 | "# vocab_size-总词汇数\n",
257 | "vocab_size=len(vocab_processor.vocabulary_)\n",
258 | "# embedding_size-词向量长度\n",
259 | "embedding_size=FLAGS.embedding_dim\n",
260 | "# filter_sizes-卷积核尺寸3,4,5\n",
261 | "filter_sizes=list(map(int, FLAGS.filter_sizes.split(\",\")))\n",
262 | "# num_filters-卷积核数量\n",
263 | "num_filters=FLAGS.num_filters\n",
264 | " \n",
265 | "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n",
266 | "# shape:[None, sequence_length, embedding_size]\n",
267 | "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n",
268 | "# 添加一个维度,shape:[None, sequence_length, embedding_size, 1]\n",
269 | "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n",
270 | "\n",
271 | "# Create a convolution + maxpool layer for each filter size\n",
272 | "pooled_outputs = []\n",
273 | "for i, filter_size in enumerate(filter_sizes):\n",
274 | " with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n",
275 | " # Convolution Layer\n",
276 | " filter_shape = [filter_size, embedding_size, 1, num_filters]\n",
277 | " W = tf.Variable(\n",
278 | " tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n",
279 | " b = tf.Variable(\n",
280 | " tf.constant(0.1, shape=[num_filters]), name=\"b\")\n",
281 | " conv = tf.nn.conv2d(\n",
282 | " embedded_chars_expanded,\n",
283 | " W,\n",
284 | " strides=[1, 1, 1, 1],\n",
285 | " padding=\"VALID\",\n",
286 | " name=\"conv\")\n",
287 | " # Apply nonlinearity\n",
288 | " h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n",
289 | " # Maxpooling over the outputs\n",
290 | " pooled = tf.nn.max_pool(\n",
291 | " h,\n",
292 | " ksize=[1, sequence_length - filter_size + 1, 1, 1],\n",
293 | " strides=[1, 1, 1, 1],\n",
294 | " padding='VALID',\n",
295 | " name=\"pool\")\n",
296 | " pooled_outputs.append(pooled)\n",
297 | "\n",
298 | "# Combine all the pooled features\n",
299 | "num_filters_total = num_filters * len(filter_sizes)\n",
300 | "print(\"num_filters_total:\", num_filters_total)\n",
301 | "h_pool = tf.concat(pooled_outputs, 3)\n",
302 | "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n",
303 | "\n",
304 | "# Add dropout\n",
305 | "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n",
306 | "\n",
307 | "# Final (unnormalized) scores and predictions\n",
308 | "with tf.name_scope(\"output\"):\n",
309 | " W = tf.get_variable(\n",
310 | " \"W\",\n",
311 | " shape=[num_filters_total, num_classes],\n",
312 | " initializer=tf.contrib.layers.xavier_initializer())\n",
313 | " b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n",
314 | " scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")\n",
315 | " \n",
316 | "# 定义loss\n",
317 | "with tf.name_scope(\"loss\"):\n",
318 | " loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y))\n",
319 | "\n",
320 | "# 定义优化器\n",
321 | "with tf.name_scope(\"optimizer\"):\n",
322 | " optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 7,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "# 生成批次数据\n",
334 | "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n",
335 | " \"\"\"\n",
336 | " Generates a batch iterator for a dataset.\n",
337 | " \"\"\"\n",
338 | " data = np.array(data)\n",
339 | " data_size = len(data)\n",
340 | " # 每个epoch的num_batch\n",
341 | " num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n",
342 | " print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n",
343 | " for epoch in range(num_epochs):\n",
344 | " # Shuffle the data at each epoch\n",
345 | " if shuffle:\n",
346 | " shuffle_indices = np.random.permutation(np.arange(data_size))\n",
347 | " shuffled_data = data[shuffle_indices]\n",
348 | " else:\n",
349 | " shuffled_data = data\n",
350 | " for batch_num in range(num_batches_per_epoch):\n",
351 | " start_index = batch_num * batch_size\n",
352 | " end_index = min((batch_num + 1) * batch_size, data_size)\n",
353 | " yield shuffled_data[start_index:end_index]"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {
360 | "collapsed": true
361 | },
362 | "outputs": [],
363 | "source": [
364 | "# 知乎提供的评测方案\n",
365 | "def eval(predict_label_and_marked_label_list):\n",
366 | " \"\"\"\n",
367 | " :param predict_label_and_marked_label_list: 一个元组列表。例如\n",
368 | " [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n",
369 | " ([3, 2, 1, 4, 7], [5, 7, 3])\n",
370 | " ]\n",
371 | " 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6]\n",
372 | " \n",
373 | " marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为\n",
374 | " [0,0,0,1,1] (4,5命中)\n",
375 | " [1,0,0,0,1] (3,7命中)\n",
376 | "\n",
377 | " \"\"\"\n",
378 | " right_label_num = 0 #总命中标签数量\n",
379 | " right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量\n",
380 | " sample_num = 0 #总问题数量\n",
381 | " all_marked_label_num = 0 #总标签数量\n",
382 | " for predict_labels, marked_labels in predict_label_and_marked_label_list:\n",
383 | " sample_num += 1\n",
384 | " marked_label_set = set(marked_labels)\n",
385 | " all_marked_label_num += len(marked_label_set)\n",
386 | " for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):\n",
387 | " if label in marked_label_set: #命中\n",
388 | " right_label_num += 1\n",
389 | " right_label_at_pos_num[pos] += 1\n",
390 | "\n",
391 | " precision = 0.0\n",
392 | " for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n",
393 | " precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2\n",
394 | " recall = float(right_label_num) / all_marked_label_num\n",
395 | "\n",
396 | " return 2*(precision * recall) / (precision + recall )"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {
403 | "collapsed": false
404 | },
405 | "outputs": [
406 | {
407 | "name": "stdout",
408 | "output_type": "stream",
409 | "text": [
410 | "num_batches_per_epoch: 4219\n"
411 | ]
412 | }
413 | ],
414 | "source": [
415 | "# 定义saver,只保存最新的5个模型\n",
416 | "saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)\n",
417 | "\n",
418 | "with tf.Session() as sess:\n",
419 | " predict_top_5 = tf.nn.top_k(scores, k=5)\n",
420 | " label_top_5 = tf.nn.top_k(input_y, k=5) \n",
421 | " sess.run(tf.global_variables_initializer())\n",
422 | " i = 0\n",
423 | " # 生成数据\n",
424 | " batches = batch_iter(\n",
425 | " list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)\n",
426 | " for batch in batches:\n",
427 | " i = i + 1\n",
428 | " # 得到一个batch的数据\n",
429 | " x_batch, y_batch = zip(*batch)\n",
430 | " # 优化模型\n",
431 | " sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob})\n",
432 | "\n",
433 | " # 每训练50次测试1次\n",
434 | " if (i % FLAGS.evaluate_every == 0):\n",
435 | " print (\"Evaluation:step\",i)\n",
436 | " predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch,\n",
437 | " input_y:y_batch,\n",
438 | " dropout_keep_prob:1.0})\n",
439 | " print (\"label:\",label_5[1][:5])\n",
440 | " print (\"predict:\",predict_5[1][:5])\n",
441 | " print (\"predict:\",predict_5[0][:5])\n",
442 | " print (\"loss:\",_loss)\n",
443 | " predict_label_and_marked_label_list = []\n",
444 | " for predict,label in zip(predict_5[1],label_5[1]):\n",
445 | " predict_label_and_marked_label_list.append((list(predict),list(label)))\n",
446 | " score = eval(predict_label_and_marked_label_list)\n",
447 | " print(\"score:\",score)\n",
448 | "\n",
449 | " # 每训练200次保存1次模型\n",
450 | " if (i % FLAGS.checkpoint_every == 0):\n",
451 | " path = saver.save(sess, \"models/model\", global_step=i)\n",
452 | " print(\"Saved model checkpoint to {}\".format(path))"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "metadata": {
459 | "collapsed": false
460 | },
461 | "outputs": [],
462 | "source": []
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {
468 | "collapsed": false
469 | },
470 | "outputs": [],
471 | "source": []
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": []
481 | }
482 | ],
483 | "metadata": {
484 | "anaconda-cloud": {},
485 | "kernelspec": {
486 | "display_name": "Python [default]",
487 | "language": "python",
488 | "name": "python3"
489 | },
490 | "language_info": {
491 | "codemirror_mode": {
492 | "name": "ipython",
493 | "version": 3
494 | },
495 | "file_extension": ".py",
496 | "mimetype": "text/x-python",
497 | "name": "python",
498 | "nbconvert_exporter": "python",
499 | "pygments_lexer": "ipython3",
500 | "version": "3.5.2"
501 | }
502 | },
503 | "nbformat": 4,
504 | "nbformat_minor": 2
505 | }
506 |
--------------------------------------------------------------------------------
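
Both cnn.py and cnn.ipynb read their predictions out of tf.nn.top_k, which returns a (values, indices) pair; the code uses predict_5[1] for the predicted label positions and predict_5[0] for their scores. A minimal illustration in the same TF 1.x style as the scripts:

    import tensorflow as tf

    scores = tf.constant([[0.1, 0.9, 0.3]])
    with tf.Session() as sess:
        values, indices = sess.run(tf.nn.top_k(scores, k=2))
    print(indices)  # [[1 2]]
    print(values)   # approximately [[0.9 0.3]]
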
/Tensorflow基础使用与文本分类应用/程序/zhihu_eval.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "#coding:utf-8\n",
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "from tqdm import tqdm\n",
25 | "import tensorflow as tf\n",
26 | "import pickle\n",
27 | "import math\n",
28 | "from six.moves import xrange"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | " 0 1 \\\n",
43 | "0 6215603645409872328 c924,c531,c102,c284,c188,c104,c98,c107,c11,c11... \n",
44 | "1 6649324930261961840 c346,c1549,c413,c294,c675,c504,c183,c74,c541,c... \n",
45 | "2 -4251899610700378615 c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10... \n",
46 | "3 6213817087034420233 c504,c157,c221,c221,c633,c468,c469,c1637,c1072... \n",
47 | "4 -8930652370334418373 c0,c310,c35,c122,c123,c11,c317,c91,c175,c476,c... \n",
48 | "\n",
49 | " 2 \\\n",
50 | "0 w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47... \n",
51 | "1 w40132,w1357,w1556,w1380,w2464,w33,w16791,w109... \n",
52 | "2 w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w... \n",
53 | "3 w5083,w12537,w10427,w29724,w6,w2566,w11,w18476... \n",
54 | "4 w33792,w21,w83,w6,w21542,w21,w140670,w25,w1110... \n",
55 | "\n",
56 | " 3 \\\n",
57 | "0 c1128,c529,c636,c572,c1321,c139,c540,c223,c510... \n",
58 | "1 NaN \n",
59 | "2 c149,c148,c148,c42,c185,c95,c95,c186,c186,c186... \n",
60 | "3 c15,c131,c39,c40,c85,c166,c969,c2456,c17,c636,... \n",
61 | "4 NaN \n",
62 | "\n",
63 | " 4 \n",
64 | "0 w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2... \n",
65 | "1 NaN \n",
66 | "2 NaN \n",
67 | "3 w2550,w24,w239,w98,w19456,w11,w108710,w3483,w2... \n",
68 | "4 NaN \n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "# 导入question_train_set\n",
74 | "reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\\t',header=None)\n",
75 | "print(reader.iloc[0:5])"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "('max_document_length:', 76)\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "# 计算一段文本中最大词汇数\n",
95 | "x_text = reader.iloc[:,2]\n",
96 | "max_document_length = 0\n",
97 | "for i,line in enumerate(x_text):\n",
98 | " try:\n",
99 | " temp = line.split(',')\n",
100 | " max_document_length = max(max_document_length,len(temp))\n",
101 | " except:\n",
102 | " # 其中有一行数据为空\n",
103 | " pass\n",
104 | "# x_text[i] = \" \"\n",
105 | "\n",
106 | "print(\"max_document_length:\",max_document_length)\n",
107 | "\n",
108 | "# 载入字典\n",
109 | "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(\"vocab_dict\")"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 4,
115 | "metadata": {
116 | "collapsed": true
117 | },
118 | "outputs": [],
119 | "source": [
120 | "# 按','切分数据\n",
121 | "text = []\n",
122 | "for line in x_text:\n",
123 | " try:\n",
124 | " text.append(line.split(','))\n",
125 | " except:\n",
126 | " # 其中有一行数据为空\n",
127 | " text.append(' ')"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 5,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [
137 | {
138 | "name": "stderr",
139 | "output_type": "stream",
140 | "text": [
141 | "100%|██████████| 217360/217360 [00:05<00:00, 40820.07it/s]\n"
142 | ]
143 | },
144 | {
145 | "data": {
146 | "text/plain": [
147 | "array([[ 4507, 2664, 423, 3387, 425, 10, 84669, 1744,\n",
148 | " 152, 13, 90, 152, 1556, 403, 17192, 10,\n",
149 | " 3686, 13, 0, 0, 0, 0, 0, 0,\n",
150 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
151 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
152 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
153 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
154 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
155 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
156 | " 0, 0, 0, 0],\n",
157 | " [ 18531, 861, 1538, 490, 16758, 197, 4225, 658,\n",
158 | " 18551, 10, 4100, 15, 1929, 52, 13, 0,\n",
159 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
160 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
161 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
162 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
163 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
164 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
165 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
166 | " 0, 0, 0, 0],\n",
167 | " [ 1207, 19, 810, 19, 126081, 19, 501, 2249,\n",
168 | " 85078, 35, 218, 308, 99, 105, 313, 13,\n",
169 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
170 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
171 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
172 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
173 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
174 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
175 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
176 | " 0, 0, 0, 0],\n",
177 | " [ 1040, 11856, 360, 23102, 10, 4100, 4, 432,\n",
178 | " 17, 1424, 0, 13, 0, 0, 0, 0,\n",
179 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
180 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
181 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
182 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
183 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
184 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
185 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
186 | " 0, 0, 0, 0],\n",
187 | " [ 3538, 137, 1628, 10, 8450, 137, 0, 16,\n",
188 | " 17, 13, 0, 0, 0, 0, 0, 0,\n",
189 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
190 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
191 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
192 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
193 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
194 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
195 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
196 | " 0, 0, 0, 0]])"
197 | ]
198 | },
199 | "execution_count": 5,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "# 把数据集变成编号的形式\n",
206 | "x = []\n",
207 | "for line in tqdm(text):\n",
208 | " line_len = len(line)\n",
209 | " text2num = []\n",
210 | " for i in xrange(max_document_length):\n",
211 | " if(i < line_len):\n",
212 | " try:\n",
213 | " text2num.append(vocab_processor.vocabulary_.get(line[i])) # 把词转为数字\n",
214 | " except:\n",
215 | " text2num.append(0) # 没有对应的词\n",
216 | " else:\n",
217 | " text2num.append(0) # 填充0\n",
218 | " x.append(text2num)\n",
219 | "x = np.array(x)\n",
220 | "x[:5]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 6,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [],
230 | "source": [
231 | "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n",
232 | " \"\"\"\n",
233 | " Generates a batch iterator for a dataset.\n",
234 | " \"\"\"\n",
235 | " data = np.array(data)\n",
236 | " data_size = len(data)\n",
237 | " # 每个epoch的num_batch\n",
238 | " num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n",
239 | " print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n",
240 | " for epoch in range(num_epochs):\n",
241 | " # Shuffle the data at each epoch\n",
242 | " if shuffle:\n",
243 | " shuffle_indices = np.random.permutation(np.arange(data_size))\n",
244 | " shuffled_data = data[shuffle_indices]\n",
245 | " else:\n",
246 | " shuffled_data = data\n",
247 | " for batch_num in range(num_batches_per_epoch):\n",
248 | " start_index = batch_num * batch_size\n",
249 | " end_index = min((batch_num + 1) * batch_size, data_size)\n",
250 | " yield shuffled_data[start_index:end_index]"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 7,
256 | "metadata": {
257 | "collapsed": false
258 | },
259 | "outputs": [],
260 | "source": [
261 | "def eval(predict_label_and_marked_label_list):\n",
262 | " \"\"\"\n",
263 | " :param predict_label_and_marked_label_list: 一个元组列表。例如\n",
264 | " [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n",
265 | " ([3, 2, 1, 4, 7], [5, 7, 3])\n",
266 | " ]\n",
267 | " 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6]\n",
268 | " \n",
269 | " marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为\n",
270 | " [0,0,0,1,1] (4,5命中)\n",
271 | " [1,0,0,0,1] (3,7命中)\n",
272 | "\n",
273 | " \"\"\"\n",
274 | " right_label_num = 0 #总命中标签数量\n",
275 | " right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量\n",
276 | " sample_num = 0 #总问题数量\n",
277 | " all_marked_label_num = 0 #总标签数量\n",
278 | " for predict_labels, marked_labels in predict_label_and_marked_label_list:\n",
279 | " sample_num += 1\n",
280 | " marked_label_set = set(marked_labels)\n",
281 | " all_marked_label_num += len(marked_label_set)\n",
282 | " for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):\n",
283 | " if label in marked_label_set: #命中\n",
284 | " right_label_num += 1\n",
285 | " right_label_at_pos_num[pos] += 1\n",
286 | "\n",
287 | " precision = 0.0\n",
288 | " for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n",
289 | " precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2\n",
290 | " recall = float(right_label_num) / all_marked_label_num\n",
291 | "\n",
292 | " return 2*(precision * recall) / (precision + recall )"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 8,
298 | "metadata": {
299 | "collapsed": false
300 | },
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "('num_filters_total:', 3072)\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "# 定义三个placeholder\n",
312 | "input_x = tf.placeholder(tf.int32, [None, x.shape[1]], name=\"input_x\")\n",
313 | "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n",
314 | "\n",
315 | "# sequence_length-最长词汇数\n",
316 | "sequence_length=x.shape[1]\n",
317 | "# num_classes-分类数\n",
318 | "num_classes=1999\n",
319 | "# vocab_size-总词汇数\n",
320 | "vocab_size=len(vocab_processor.vocabulary_)\n",
321 | "# embedding_size-词向量长度\n",
322 | "embedding_size=256\n",
323 | "# filter_sizes-卷积核尺寸3,4,5\n",
324 | "filter_sizes=list(map(int, [3,4,5]))\n",
325 | "# num_filters-卷积核数量\n",
326 | "num_filters=1024\n",
327 | "\n",
328 | "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n",
329 | "# [None, sequence_length, embedding_size]\n",
330 | "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n",
331 | "# 添加一个维度,[None, sequence_length, embedding_size, 1]\n",
332 | "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n",
333 | "# Create a convolution + maxpool layer for each filter size\n",
334 | "pooled_outputs = []\n",
335 | "for i, filter_size in enumerate(filter_sizes):\n",
336 | " with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n",
337 | " # Convolution Layer\n",
338 | " filter_shape = [filter_size, embedding_size, 1, num_filters]\n",
339 | " W = tf.Variable(\n",
340 | " tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n",
341 | " b = tf.Variable(\n",
342 | " tf.constant(0.1, shape=[num_filters]), name=\"b\")\n",
343 | " conv = tf.nn.conv2d(\n",
344 | " embedded_chars_expanded,\n",
345 | " W,\n",
346 | " strides=[1, 1, 1, 1],\n",
347 | " padding=\"VALID\",\n",
348 | " name=\"conv\")\n",
349 | " # Apply nonlinearity\n",
350 | " h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n",
351 | " # Maxpooling over the outputs\n",
352 | " pooled = tf.nn.max_pool(\n",
353 | " h,\n",
354 | " ksize=[1, sequence_length - filter_size + 1, 1, 1],\n",
355 | " strides=[1, 1, 1, 1],\n",
356 | " padding='VALID',\n",
357 | " name=\"pool\")\n",
358 | " pooled_outputs.append(pooled)\n",
359 | "\n",
360 | "# Combine all the pooled features\n",
361 | "num_filters_total = num_filters * len(filter_sizes)\n",
362 | "print(\"num_filters_total:\", num_filters_total)\n",
363 | "h_pool = tf.concat(pooled_outputs, 3)\n",
364 | "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n",
365 | "\n",
366 | "# Add dropout\n",
367 | "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n",
368 | "\n",
369 | "# Final (unnormalized) scores and predictions\n",
370 | "with tf.name_scope(\"output\"):\n",
371 | " W = tf.get_variable(\n",
372 | " \"W\",\n",
373 | " shape=[num_filters_total, num_classes],\n",
374 | " initializer=tf.contrib.layers.xavier_initializer())\n",
375 | " b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n",
376 | " scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 9,
382 | "metadata": {
383 | "collapsed": false
384 | },
385 | "outputs": [
386 | {
387 | "name": "stdout",
388 | "output_type": "stream",
389 | "text": [
390 | "INFO:tensorflow:Restoring parameters from ./models/model_-7200\n",
391 | "('num_batches_per_epoch:', 218)\n",
392 | "('Evaluation:step', 5)\n",
393 | "('Evaluation:step', 10)\n",
394 | "('Evaluation:step', 15)\n",
395 | "('Evaluation:step', 20)\n",
396 | "('Evaluation:step', 25)\n",
397 | "('Evaluation:step', 30)\n",
398 | "('Evaluation:step', 35)\n",
399 | "('Evaluation:step', 40)\n",
400 | "('Evaluation:step', 45)\n",
401 | "('Evaluation:step', 50)\n",
402 | "('Evaluation:step', 55)\n",
403 | "('Evaluation:step', 60)\n",
404 | "('Evaluation:step', 65)\n",
405 | "('Evaluation:step', 70)\n",
406 | "('Evaluation:step', 75)\n",
407 | "('Evaluation:step', 80)\n",
408 | "('Evaluation:step', 85)\n",
409 | "('Evaluation:step', 90)\n",
410 | "('Evaluation:step', 95)\n",
411 | "('Evaluation:step', 100)\n",
412 | "('Evaluation:step', 105)\n",
413 | "('Evaluation:step', 110)\n",
414 | "('Evaluation:step', 115)\n",
415 | "('Evaluation:step', 120)\n",
416 | "('Evaluation:step', 125)\n",
417 | "('Evaluation:step', 130)\n",
418 | "('Evaluation:step', 135)\n",
419 | "('Evaluation:step', 140)\n",
420 | "('Evaluation:step', 145)\n",
421 | "('Evaluation:step', 150)\n",
422 | "('Evaluation:step', 155)\n",
423 | "('Evaluation:step', 160)\n",
424 | "('Evaluation:step', 165)\n",
425 | "('Evaluation:step', 170)\n",
426 | "('Evaluation:step', 175)\n",
427 | "('Evaluation:step', 180)\n",
428 | "('Evaluation:step', 185)\n",
429 | "('Evaluation:step', 190)\n",
430 | "('Evaluation:step', 195)\n",
431 | "('Evaluation:step', 200)\n",
432 | "('Evaluation:step', 205)\n",
433 | "('Evaluation:step', 210)\n",
434 | "('Evaluation:step', 215)\n"
435 | ]
436 | }
437 | ],
438 | "source": [
439 | "# 选择模型\n",
440 | "checkpoint_file = \"./models/model-10000\"\n",
441 | " \n",
442 | "with tf.Session() as sess:\n",
443 | " predict_top_5 = tf.nn.top_k(scores, k=5)\n",
444 | " sess.run(tf.global_variables_initializer())\n",
445 | " i = 0\n",
446 | " saver = tf.train.Saver()\n",
447 | " saver.restore(sess, checkpoint_file)\n",
448 | "\n",
449 | " # Generate batches\n",
450 | " batches = batch_iter(list(x), 1000, 1)\n",
451 | " \n",
452 | " for x_batch in batches:\n",
453 | " i = i + 1\n",
454 | " predict_5 = sess.run(predict_top_5,feed_dict={input_x:x_batch,dropout_keep_prob:1.0})\n",
455 | " if i == 1:\n",
456 | " predict = predict_5[1]\n",
457 | " else:\n",
458 | " predict = np.concatenate((predict,predict_5[1]))\n",
459 | " if (i%5==0):\n",
460 | " print (\"Evaluation:step\",i)\n",
461 | "\n",
462 | " np.savetxt(\"predict.txt\",predict,fmt='%d')"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {
469 | "collapsed": true
470 | },
471 | "outputs": [],
472 | "source": []
473 | }
474 | ],
475 | "metadata": {
476 | "anaconda-cloud": {},
477 | "kernelspec": {
478 | "display_name": "Python [default]",
479 | "language": "python",
480 | "name": "python3"
481 | },
482 | "language_info": {
483 | "codemirror_mode": {
484 | "name": "ipython",
485 | "version": 3
486 | },
487 | "file_extension": ".py",
488 | "mimetype": "text/x-python",
489 | "name": "python",
490 | "nbconvert_exporter": "python",
491 | "pygments_lexer": "ipython3",
492 | "version": "3.5.2"
493 | }
494 | },
495 | "nbformat": 4,
496 | "nbformat_minor": 2
497 | }
498 |
--------------------------------------------------------------------------------
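
A final note on the prediction loop in zhihu_eval above: growing `predict` with np.concatenate on every batch re-copies the whole accumulated array each step. Collecting the per-batch results in a list and concatenating once keeps the work linear; a sketch under the same names as the session block above:

    chunks = []
    for x_batch in batches:
        predict_5 = sess.run(predict_top_5,
                             feed_dict={input_x: x_batch, dropout_keep_prob: 1.0})
        chunks.append(predict_5[1])  # indices of the top-5 predicted labels
    predict = np.concatenate(chunks)
    np.savetxt("predict.txt", predict, fmt='%d')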