├── Tensorflow基础使用与图像识别应用
│   ├── Tensorflow的基础使用与图像识别应用.pdf
│   └── 程序
│       ├── 1创建图,启动图.py
│       ├── 3Fetch_and_Feed.py
│       ├── 2变量.py
│       ├── 4MNIST分类.py
│       ├── 5下载google图像识别网络inception-v3.py
│       ├── 3Fetch_and_Feed.ipynb
│       ├── 1创建图,启动图.ipynb
│       ├── 2变量.ipynb
│       ├── 5下载google图像识别网络inception-v3.ipynb
│       ├── 6使用inception-v3做各种图像的识别.py
│       └── 4MNIST分类.ipynb
├── Tensorflow基础使用与文本分类应用
│   ├── Tensorflow的基础使用与文本分类应用.pdf
│   └── 程序
│       ├── MNIST分类.py
│       ├── zhihu_predict.py
│       ├── data_handle.py
│       ├── MNIST分类.ipynb
│       ├── zhihu_eval.py
│       ├── zhihu_predict.ipynb
│       ├── cnn.py
│       ├── data_handle.ipynb
│       ├── cnn.ipynb
│       └── zhihu_eval.ipynb
└── README.md
/Tensorflow基础使用与图像识别应用/Tensorflow的基础使用与图像识别应用.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Qinbf/Tensorflow/HEAD/Tensorflow基础使用与图像识别应用/Tensorflow的基础使用与图像识别应用.pdf
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/Tensorflow的基础使用与文本分类应用.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Qinbf/Tensorflow/HEAD/Tensorflow基础使用与文本分类应用/Tensorflow的基础使用与文本分类应用.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tensorflow
2 |
3 |
4 | ## Follow the WeChat official account
5 | 
6 |
7 |
8 | ## Get in touch
9 | My WeChat ID: **sdxxqbf**
10 | The WeChat QR code is below:
11 |
12 | 
13 |
14 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/1创建图,启动图.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 |
12 |
13 | # In[2]:
14 |
15 | # Create a constant op
16 | m1 = tf.constant([[3,3]])
17 | # Create another constant op
18 | m2 = tf.constant([[2],[3]])
19 | # Create a matrix-multiplication op, taking m1 and m2 as inputs
20 | product = tf.matmul(m1,m2)
21 | # Printing product here only shows the tensor's attributes; its value has not been computed yet
22 | print(product)
23 |
24 |
25 | # In[3]:
26 |
27 | # First way to define a session:
28 | # Define a session, which launches the default graph
29 | sess = tf.Session()
30 | # Call sess.run to execute the matmul op
31 | # run(product) triggers all 3 ops in the graph
32 | result = sess.run(product)
33 | print(result)
34 | sess.close()
35 |
36 |
37 | # In[4]:
38 |
39 | # Second way to define a session (closed automatically):
40 | with tf.Session() as sess:
41 |     # Call sess.run to execute the matmul op
42 |     # run(product) triggers all 3 ops in the graph
43 |     result = sess.run(product)
44 |     print(result)
45 |
46 |
47 | # In[ ]:
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
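
The script above relies on TensorFlow's implicit default graph. A minimal sketch of the same computation with an explicitly created graph (a hypothetical companion snippet, TF 1.x API; not part of the repo):

import tensorflow as tf

# Build the ops inside an explicitly created graph instead of the default one
graph = tf.Graph()
with graph.as_default():
    m1 = tf.constant([[3, 3]])
    m2 = tf.constant([[2], [3]])
    product = tf.matmul(m1, m2)

# Bind the session to that graph to evaluate the op
with tf.Session(graph=graph) as sess:
    print(sess.run(product))  # [[15]]
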
/Tensorflow基础使用与图像识别应用/程序/3Fetch_and_Feed.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 |
12 |
13 | # In[2]:
14 |
15 | # Fetch: evaluate multiple ops in a single session.run call
16 | # Define three constants
17 | input1 = tf.constant(3.0)
18 | input2 = tf.constant(2.0)
19 | input3 = tf.constant(5.0)
20 | # Define an addition op
21 | add = tf.add(input2,input3)
22 | # Define a multiplication op
23 | mul = tf.multiply(input1,add)
24 |
25 | with tf.Session() as sess:
26 |     # Run the multiplication op and the addition op together
27 |     result = sess.run([mul,add])
28 |     print(result)
29 |
30 |
31 | # In[4]:
32 |
33 | # Feed: define placeholders first, then supply data when needed
34 | # Create placeholders
35 | input1 = tf.placeholder(tf.float32)
36 | input2 = tf.placeholder(tf.float32)
37 | # Define a multiplication op
38 | output = tf.multiply(input1,input2)
39 |
40 | with tf.Session() as sess:
41 |     # Fed data is passed in as a dictionary
42 |     print(sess.run(output,feed_dict={input1:[8.],input2:[2.]}))
43 |
44 |
45 | # In[ ]:
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
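
feed_dict is not limited to placeholders: in TF 1.x most feedable tensors in the graph, constants included, can be overridden at run time. A minimal sketch (hypothetical, not part of the repo):

import tensorflow as tf

a = tf.constant(3.0)
b = tf.constant(2.0)
c = tf.multiply(a, b)

with tf.Session() as sess:
    print(sess.run(c))                      # 6.0, computed from the constants
    print(sess.run(c, feed_dict={a: 5.0}))  # 10.0, the fed value overrides a
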
/Tensorflow基础使用与图像识别应用/程序/2变量.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 |
12 |
13 | # In[3]:
14 |
15 | # Define a variable
16 | x = tf.Variable([1,2])
17 | # Define a constant
18 | a = tf.constant([3,3])
19 | # Add a subtraction op
20 | sub = tf.subtract(x,a)
21 | # Add an addition op
22 | add = tf.add(x,sub)
23 |
24 | # Initializer for all variables
25 | init = tf.global_variables_initializer()
26 |
27 | with tf.Session() as sess:
28 |     # Run the variable initializer
29 |     sess.run(init)
30 |     print(sess.run(sub))
31 |     print(sess.run(add))
32 |
33 |
34 | # In[4]:
35 |
36 | # Create a variable initialized to 0
37 | state = tf.Variable(0,name='counter')
38 | # Create an op that adds 1 to state
39 | new_value = tf.add(state,1)
40 | # Assignment op
41 | update = tf.assign(state,new_value)
42 | # Initializer for all variables
43 | init = tf.global_variables_initializer()
44 |
45 | with tf.Session() as sess:
46 |     # Run the variable initializer
47 |     sess.run(init)
48 |     print(sess.run(state))
49 |     for _ in range(5):
50 |         sess.run(update)
51 |         print(sess.run(state))
52 |
53 |
54 | # In[ ]:
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
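
The add-then-assign pair above can be collapsed into a single op with tf.assign_add; a minimal sketch (hypothetical, TF 1.x; not part of the repo):

import tensorflow as tf

state = tf.Variable(0, name='counter')
# assign_add increments the variable and returns the updated value
update = tf.assign_add(state, 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        print(sess.run(update))  # prints 1, 2, 3, 4, 5
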
/Tensorflow基础使用与图像识别应用/程序/4MNIST分类.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 | from tensorflow.examples.tutorials.mnist import input_data
12 |
13 |
14 | # In[2]:
15 |
16 | # Load the dataset
17 | mnist = input_data.read_data_sets("MNIST_data",one_hot=True)
18 |
19 | # 100 images per batch
20 | batch_size = 100
21 | # Compute the total number of batches
22 | n_batch = mnist.train.num_examples // batch_size
23 |
24 | # Define two placeholders
25 | x = tf.placeholder(tf.float32,[None,784])
26 | y = tf.placeholder(tf.float32,[None,10])
27 |
28 | # Build a simple network: 784 input neurons, 10 output neurons
29 | W = tf.Variable(tf.zeros([784,10]))
30 | b = tf.Variable(tf.zeros([10]))
31 | prediction = tf.nn.softmax(tf.matmul(x,W)+b)
32 |
33 | # Quadratic cost function
34 | # square computes the element-wise square
35 | # reduce_mean computes the mean
36 | loss = tf.reduce_mean(tf.square(y-prediction))
37 |
38 | # Minimize loss with gradient descent, learning rate 0.2
39 | train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
40 |
41 | # Variable initializer
42 | init = tf.global_variables_initializer()
43 |
44 | # Store the results in a list of booleans
45 | correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))  # argmax returns the index of the largest value in a 1-D tensor
46 | # Compute the accuracy
47 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))  # cast converts booleans to float32
48 |
49 | with tf.Session() as sess:
50 |     # Run the initializer
51 |     sess.run(init)
52 |     # Train for 21 epochs
53 |     for epoch in range(21):
54 |         # Each epoch iterates over n_batch batches of 100 images
55 |         for batch in range(n_batch):
56 |             # Fetch one batch of images and labels
57 |             batch_xs,batch_ys = mnist.train.next_batch(batch_size)
58 |             # Feed the batch into the model for training
59 |             sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})
60 |
61 |         # Compute the test accuracy
62 |         acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})
63 |         print("Iter " + str(epoch) + ",Testing Accuracy " + str(acc))
64 |
65 |
66 | # In[ ]:
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
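
The quadratic cost above pairs poorly with a softmax output; for classification, a cross-entropy loss usually converges faster and reaches higher accuracy. A minimal sketch of the swap (hypothetical, TF 1.x; only the loss changes, and it is fed the raw logits):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
logits = tf.matmul(x, W) + b  # unnormalized scores; no softmax here

# softmax_cross_entropy_with_logits applies the softmax internally
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
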
/Tensorflow基础使用与文本分类应用/程序/MNIST分类.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[2]:
10 |
11 | import tensorflow as tf
12 | from tensorflow.examples.tutorials.mnist import input_data
13 |
14 |
15 | # In[3]:
16 |
17 | # Load the dataset
18 | mnist = input_data.read_data_sets("MNIST_data",one_hot=True)
19 |
20 | # 100 images per batch
21 | batch_size = 100
22 | # Compute the total number of batches
23 | n_batch = mnist.train.num_examples // batch_size
24 |
25 | # Define two placeholders
26 | x = tf.placeholder(tf.float32,[None,784])
27 | y = tf.placeholder(tf.float32,[None,10])
28 |
29 | # Build a simple network: 784 input neurons, 10 output neurons
30 | W = tf.Variable(tf.zeros([784,10]))
31 | b = tf.Variable(tf.zeros([10]))
32 | prediction = tf.nn.softmax(tf.matmul(x,W)+b)
33 |
34 | # Quadratic cost function
35 | # square computes the element-wise square
36 | # reduce_mean computes the mean
37 | loss = tf.reduce_mean(tf.square(y-prediction))
38 |
39 | # Minimize loss with gradient descent, learning rate 0.2
40 | train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
41 |
42 | # Variable initializer
43 | init = tf.global_variables_initializer()
44 |
45 | # Store the results in a list of booleans
46 | correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))  # argmax returns the index of the largest value in a 1-D tensor
47 | # Compute the accuracy
48 | accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))  # cast converts booleans to float32
49 |
50 | with tf.Session() as sess:
51 |     # Run the initializer
52 |     sess.run(init)
53 |     # Train for 21 epochs
54 |     for epoch in range(21):
55 |         # Each epoch iterates over n_batch batches of 100 images
56 |         for batch in range(n_batch):
57 |             # Fetch one batch of images and labels
58 |             batch_xs,batch_ys = mnist.train.next_batch(batch_size)
59 |             # Feed the batch into the model for training
60 |             sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})
61 |
62 |         # Compute the test accuracy
63 |         acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})
64 |         print("Iter " + str(epoch) + ",Testing Accuracy " + str(acc))
65 |
66 |
67 | # In[ ]:
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
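
To see what the accuracy graph computes, here is the same argmax/equal/cast chain in NumPy on a toy batch of two samples (illustrative values only):

import numpy as np

labels = np.array([[0, 1, 0], [1, 0, 0]])             # one-hot targets
preds = np.array([[0.1, 0.7, 0.2], [0.2, 0.5, 0.3]])  # softmax outputs

correct = np.argmax(labels, 1) == np.argmax(preds, 1)  # [True, False]
print(correct.astype(np.float32).mean())               # 0.5
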
/Tensorflow基础使用与图像识别应用/程序/5下载google图像识别网络inception-v3.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[1]:
9 |
10 | import tensorflow as tf
11 | import os
12 | import tarfile
13 | import requests
14 |
15 |
16 | # In[ ]:
17 |
18 | # Download URL for the Inception model
19 | inception_pretrain_model_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
20 |
21 | # The model is stored in the inception_model folder under the current directory
22 | inception_pretrain_model_dir = "inception_model"
23 | if not os.path.exists(inception_pretrain_model_dir):
24 |     os.makedirs(inception_pretrain_model_dir)
25 |
26 | # Get the file name and file path
27 | filename = inception_pretrain_model_url.split('/')[-1]
28 | filepath = os.path.join(inception_pretrain_model_dir, filename)
29 |
30 | # Download the model
31 | if not os.path.exists(filepath):
32 |     print("download: ", filename)
33 |     r = requests.get(inception_pretrain_model_url, stream=True)
34 |     with open(filepath, 'wb') as f:
35 |         for chunk in r.iter_content(chunk_size=1024):
36 |             if chunk:
37 |                 f.write(chunk)
38 | print("finish: ", filename)
39 |
40 | # Extract the archive
41 | tarfile.open(filepath, 'r:gz').extractall(inception_pretrain_model_dir)
42 |
43 | # Folder for the graph-structure log
44 | log_dir = 'inception_log'
45 | if not os.path.exists(log_dir):
46 |     os.makedirs(log_dir)
47 |
48 | # classify_image_graph_def.pb is the model pretrained by Google
49 | inception_graph_def_file = os.path.join(inception_pretrain_model_dir, 'classify_image_graph_def.pb')
50 | with tf.Session() as sess:
51 |     # Create a graph to hold Google's pretrained model
52 |     with tf.gfile.FastGFile(inception_graph_def_file, 'rb') as f:
53 |         graph_def = tf.GraphDef()
54 |         graph_def.ParseFromString(f.read())
55 |         tf.import_graph_def(graph_def, name='')
56 |     # Save the graph structure
57 |     writer = tf.summary.FileWriter(log_dir, sess.graph)
58 |     writer.close()
59 |
60 |
61 | # In[ ]:
62 |
63 |
64 |
65 |
66 | # In[ ]:
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
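
Once the graph is imported, its node names can be listed directly; that is how tensor names such as 'softmax:0', used in the next script, can be discovered. A minimal sketch (hypothetical, TF 1.x; not part of the repo):

import tensorflow as tf

with tf.Session() as sess:
    with tf.gfile.FastGFile('inception_model/classify_image_graph_def.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')
    # Print the first 10 operation names of the imported graph
    for op in sess.graph.get_operations()[:10]:
        print(op.name)
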
/Tensorflow基础使用与文本分类应用/程序/zhihu_predict.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[1]:
10 |
11 | import pandas as pd
12 | from tqdm import tqdm
13 | import re
14 | import numpy as np
15 | from six.moves import xrange
16 |
17 |
18 | # In[2]:
19 |
20 | topic_info = pd.read_table("./ieee_zhihu_cup/topic_info.txt",sep='\t',header=None)
21 | print(topic_info.iloc[0:5])
22 |
23 |
24 | # In[3]:
25 |
26 | # Topic dictionary: row index -> original topic id
27 | topic_dict = {}
28 | for i in xrange(topic_info.shape[0]):
29 |     topic_dict[i] = topic_info.iloc[i][0]
30 |
31 |
32 | # In[4]:
33 |
34 | predict = open('predict.txt', "r")
35 | examples = predict.readlines()
36 | text = np.array([line.split(" ") for line in examples])
37 |
38 |
39 | # In[5]:
40 |
41 | label = []
42 | for line in tqdm(text):
43 |     num2label = []
44 |     for i in xrange(5):
45 |         num2label.append(topic_dict[int(line[i])])  # map the 0-1998 index back to the original topic id
46 |     label.append(num2label)
47 | label = np.array(label)
48 |
49 |
50 | # In[6]:
51 |
52 | np.savetxt("temp.txt",label,fmt='%d')
53 |
54 |
55 | # In[7]:
56 |
57 | def clean_str(string):
58 |     string = re.sub(r" ", ",", string)
59 |     return string
60 |
61 | file1 = open('temp.txt', "r")
62 | examples = file1.readlines()
63 | examples = [clean_str(line) for line in examples]
64 | file1.close()
65 |
66 | file1 = open('temp.txt', "w")
67 | file1.writelines(examples)
68 | file1.close()
69 |
70 |
71 | # In[8]:
72 |
73 | # Load the predict file
74 | predict_file = 'temp.txt'
75 | predict_reader = pd.read_table(predict_file,sep=' ',header=None)
76 | print(predict_reader.iloc[0:5])
77 |
78 |
79 | # In[9]:
80 |
81 | # Load question_eval_set
82 | eval_reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\t',header=None)
83 | print(eval_reader.iloc[0:3])
84 |
85 |
86 | # In[10]:
87 |
88 | final_predict = pd.concat([eval_reader.iloc[:,0],predict_reader],axis=1)
89 | print(final_predict.iloc[0:5])
90 |
91 |
92 | # In[11]:
93 |
94 | final_predict.to_csv('temp.txt', header=None, index=None, sep=',')
95 |
96 | final_file = open('temp.txt', "r")
97 | final_examples = final_file.readlines()
98 | final_examples = [re.sub(r'"',"",line) for line in final_examples]
99 | final_file.close()
100 |
101 | final_file = open('final_predict.csv', "w")
102 | final_file.writelines(final_examples)
103 | final_file.close()
104 |
105 |
--------------------------------------------------------------------------------
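
The write-then-rewrite pass through clean_str could likely be avoided altogether: np.savetxt accepts a delimiter argument, so the labels can be written comma-separated in one step (a sketch, assuming label is the (N, 5) array built above):

import numpy as np

np.savetxt("temp.txt", label, fmt='%d', delimiter=',')
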
/Tensorflow基础使用与图像识别应用/程序/3Fetch_and_Feed.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "[21.0, 7.0]\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "#Fetch:可以在session中同时计算多个op\n",
45 | "#定义三个常量\n",
46 | "input1 = tf.constant(3.0)\n",
47 | "input2 = tf.constant(2.0)\n",
48 | "input3 = tf.constant(5.0)\n",
49 | "#定义一个加法op\n",
50 | "add = tf.add(input2,input3)\n",
51 | "#定义一个乘法op\n",
52 | "mul = tf.multiply(input1,add)\n",
53 | "\n",
54 | "with tf.Session() as sess:\n",
55 | " #同时执行乘法op和加法op\n",
56 | " result = sess.run([mul,add])\n",
57 | " print(result)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "[ 16.]\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "#Feed:先定义占位符,等需要的时候再传入数据\n",
77 | "#创建占位符\n",
78 | "input1 = tf.placeholder(tf.float32)\n",
79 | "input2 = tf.placeholder(tf.float32)\n",
80 | "#定义乘法op\n",
81 | "output = tf.multiply(input1,input2)\n",
82 | "\n",
83 | "with tf.Session() as sess:\n",
84 | " #feed的数据以字典的形式传入\n",
85 | " print(sess.run(output,feed_dict={input1:[8.],input2:[2.]}))"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": true
93 | },
94 | "outputs": [],
95 | "source": []
96 | }
97 | ],
98 | "metadata": {
99 | "anaconda-cloud": {},
100 | "kernelspec": {
101 | "display_name": "Python [default]",
102 | "language": "python",
103 | "name": "python3"
104 | },
105 | "language_info": {
106 | "codemirror_mode": {
107 | "name": "ipython",
108 | "version": 3
109 | },
110 | "file_extension": ".py",
111 | "mimetype": "text/x-python",
112 | "name": "python",
113 | "nbconvert_exporter": "python",
114 | "pygments_lexer": "ipython3",
115 | "version": "3.5.2"
116 | }
117 | },
118 | "nbformat": 4,
119 | "nbformat_minor": 1
120 | }
121 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/1创建图,启动图.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Tensor(\"MatMul:0\", shape=(1, 1), dtype=int32)\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "#创建一个常量op\n",
45 | "m1 = tf.constant([[3,3]])\n",
46 | "#创建一个常量op\n",
47 | "m2 = tf.constant([[2],[3]])\n",
48 | "#创建一个矩阵乘法op,把m1和m2传入\n",
49 | "product = tf.matmul(m1,m2)\n",
50 | "#这个时候打印product,只能看到product的属性,不能计算它的值\n",
51 | "print(product)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "metadata": {
58 | "collapsed": false
59 | },
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "[[15]]\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#第一种定义会话的方式:\n",
71 | "#定义一个会话,启动默认图\n",
72 | "sess = tf.Session()\n",
73 | "#调用sess的run方法来执行矩阵乘法op\n",
74 | "#run(product)触发了图中3个op\n",
75 | "result = sess.run(product)\n",
76 | "print(result)\n",
77 | "sess.close()"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "[[15]]\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "#第二种定义会话的方式:\n",
97 | "with tf.Session() as sess:\n",
98 | " #调用sess的run方法来执行矩阵乘法op\n",
99 | " #run(product)触发了图中3个op\n",
100 | " result = sess.run(product)\n",
101 | " print(result)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {
108 | "collapsed": true
109 | },
110 | "outputs": [],
111 | "source": []
112 | }
113 | ],
114 | "metadata": {
115 | "anaconda-cloud": {},
116 | "kernelspec": {
117 | "display_name": "Python [default]",
118 | "language": "python",
119 | "name": "python3"
120 | },
121 | "language_info": {
122 | "codemirror_mode": {
123 | "name": "ipython",
124 | "version": 3
125 | },
126 | "file_extension": ".py",
127 | "mimetype": "text/x-python",
128 | "name": "python",
129 | "nbconvert_exporter": "python",
130 | "pygments_lexer": "ipython3",
131 | "version": "3.5.2"
132 | }
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 1
136 | }
137 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/程序/data_handle.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 | #
9 | # question_train_set.txt:
10 | # Column 1: question id;
11 | # Column 2: character-id sequence of the title;
12 | # Column 3: word-id sequence of the title;
13 | # Column 4: character-id sequence of the description;
14 | # Column 5: word-id sequence of the description.
15 | #
16 | # question_topic_train_set.txt:
17 | # Column 1: question id;
18 | # Column 2: topic ids.
19 | #
20 | # topic_info.txt:
21 | # Column 1: topic id;
22 | # Column 2: parent-topic ids; topics form a directed acyclic graph, and a topic may have 0 or more parents;
23 | # Column 3: character-id sequence of the topic name;
24 | # Column 4: word-id sequence of the topic name;
25 | # Column 5: character-id sequence of the topic description;
26 | # Column 6: word-id sequence of the topic description.
27 | #
28 | # 1. The title usually carries the most important information, so from question_train_set.txt we keep only column 3, the word-id sequence of the title.
29 | # 2. From topic_info.txt we ignore columns 2-6 for simplicity and just extract the topic ids, mapping them to the integers 0-1998 (there are 1999 topics in total).
30 | # 3. Finally, the pieces above are merged into the processed dataset.
31 |
32 | # In[1]:
33 |
34 | import pandas as pd
35 | from tqdm import tqdm # pip install tqdm
36 | from six.moves import xrange
37 |
38 |
39 | # In[2]:
40 |
41 | # Load question_train_set
42 | reader = pd.read_table('./ieee_zhihu_cup/question_train_set.txt',sep='\t',header=None)
43 | print(reader.iloc[0:5])
44 |
45 |
46 | # In[3]:
47 |
48 | # Load question_topic_train_set
49 | topic_reader = pd.read_table('./ieee_zhihu_cup/question_topic_train_set.txt',sep='\t',header=None)
50 | print(topic_reader.iloc[0:5])
51 |
52 |
53 | # In[4]:
54 |
55 | # Merge the title word-id sequence with the topic ids
56 | data_topic = pd.concat([reader.iloc[:,2], topic_reader.iloc[:,1]], axis=1, ignore_index=True)
57 | print(data_topic.iloc[0:5])
58 |
59 |
60 | # In[5]:
61 |
62 | # Load topic_info
63 | label_reader = pd.read_table('./ieee_zhihu_cup/topic_info.txt',sep='\t',header=None)
64 | print(label_reader.iloc[0:5])
65 |
66 |
67 | # In[6]:
68 |
69 | # Map the topic ids to indices 0-1998
70 | labels = list(label_reader.iloc[:,0])
71 | my_labels = []
72 | for label in labels:
73 |     my_labels.append(label)
74 |
75 | # Build the topic dictionary: topic id -> index
76 | topic_dict = {}
77 | for i,label in enumerate(my_labels):
78 |     topic_dict[label] = i
79 |
80 | print(topic_dict[7739004195693774975])
81 |
82 |
83 | # In[7]:
84 |
85 | for i in tqdm(xrange(data_topic.shape[0])):
86 |     new_label = ''
87 |     # Split the topic ids on ","
88 |     temp_topic = data_topic.iloc[i][1].split(',')
89 |     for topic in temp_topic:
90 |         # Look up this topic id in the dictionary built from topic_info
91 |         label_num = topic_dict[int(topic)]
92 |         new_label = new_label + str(label_num) + ','
93 |     data_topic.iloc[i][1] = new_label[:-1]
94 | print(data_topic.iloc[:5])
95 |
96 |
97 | # In[8]:
98 |
99 | # Save the processed file
100 | data_topic.to_csv("./ieee_zhihu_cup/data_topic.txt", header=None, index=None, sep='\t')
101 |
102 | # Save in 10 blocks of 300000 rows each
103 | for i in xrange(10):
104 |     data_topic_filename = './ieee_zhihu_cup/data_topic_block_' + str(i) + '.txt'
105 |     if (i+1)*300000 < data_topic.shape[0]:
106 |         data_topic.iloc[i*300000:(i+1)*300000].to_csv(
107 |             data_topic_filename, header=None, index=None, sep='\t')
108 |     else:
109 |         data_topic.iloc[i*300000:data_topic.shape[0]].to_csv(
110 |             data_topic_filename, header=None, index=None, sep='\t')
111 |
112 |
113 | # In[ ]:
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
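
data_handle.py encodes each topic id as its row index 0-1998, and zhihu_predict.py later inverts that mapping; a minimal round-trip sketch of the two dictionaries (topic ids taken from the topic_info sample printed in zhihu_predict.ipynb):

# Topic ids as they appear in column 0 of topic_info.txt
topic_ids = [738845194850773558, 3738968195649774859, 4738849194894773882]

# Encoding (data_handle.py): topic id -> index
topic_dict = {label: i for i, label in enumerate(topic_ids)}
# Decoding (zhihu_predict.py): index -> topic id
inverse_dict = {i: label for i, label in enumerate(topic_ids)}

assert inverse_dict[topic_dict[3738968195649774859]] == 3738968195649774859
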
/Tensorflow基础使用与图像识别应用/程序/2变量.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "[-2 -1]\n",
40 | "[-1 1]\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "#定义一个变量\n",
46 | "x = tf.Variable([1,2])\n",
47 | "#定义一个常量\n",
48 | "a = tf.constant([3,3])\n",
49 | "#增加一个减法op\n",
50 | "sub = tf.subtract(x,a)\n",
51 | "#增加一个加法op\n",
52 | "add = tf.add(x,sub)\n",
53 | "\n",
54 | "#所有变量初始化\n",
55 | "init = tf.global_variables_initializer()\n",
56 | "\n",
57 | "with tf.Session() as sess:\n",
58 | " #执行变量初始化\n",
59 | " sess.run(init)\n",
60 | " print(sess.run(sub))\n",
61 | " print(sess.run(add))"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "0\n",
76 | "1\n",
77 | "2\n",
78 | "3\n",
79 | "4\n",
80 | "5\n"
81 | ]
82 | }
83 | ],
84 | "source": [
85 | "#创建一个变量初始化为0\n",
86 | "state = tf.Variable(0,name='counter')\n",
87 | "#创建一个op,作用是使state加1\n",
88 | "new_value = tf.add(state,1)\n",
89 | "#赋值op\n",
90 | "update = tf.assign(state,new_value)\n",
91 | "#所有变量初始化\n",
92 | "init = tf.global_variables_initializer()\n",
93 | "\n",
94 | "with tf.Session() as sess:\n",
95 | " #执行变量初始化\n",
96 | " sess.run(init)\n",
97 | " print(sess.run(state))\n",
98 | " for _ in range(5):\n",
99 | " sess.run(update)\n",
100 | " print(sess.run(state))"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": []
111 | }
112 | ],
113 | "metadata": {
114 | "anaconda-cloud": {},
115 | "kernelspec": {
116 | "display_name": "Python [default]",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.5.2"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 1
135 | }
136 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/5下载google图像识别网络inception-v3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf\n",
26 | "import os\n",
27 | "import tarfile\n",
28 | "import requests"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "finish: inception-2015-12-05.tgz\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "#inception模型下载地址\n",
48 | "inception_pretrain_model_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'\n",
49 | "\n",
50 | "#模型存放地址,存放在当前目录下inception_model文件夹下\n",
51 | "inception_pretrain_model_dir = \"inception_model\"\n",
52 | "if not os.path.exists(inception_pretrain_model_dir):\n",
53 | " os.makedirs(inception_pretrain_model_dir)\n",
54 | " \n",
55 | "#获取文件名,以及文件路径\n",
56 | "filename = inception_pretrain_model_url.split('/')[-1]\n",
57 | "filepath = os.path.join(inception_pretrain_model_dir, filename)\n",
58 | "\n",
59 | "#下载模型\n",
60 | "if not os.path.exists(filepath):\n",
61 | " print(\"download: \", filename)\n",
62 | " r = requests.get(inception_pretrain_model_url, stream=True)\n",
63 | " with open(filepath, 'wb') as f:\n",
64 | " for chunk in r.iter_content(chunk_size=1024):\n",
65 | " if chunk:\n",
66 | " f.write(chunk)\n",
67 | "print(\"finish: \", filename)\n",
68 | "\n",
69 | "#解压文件\n",
70 | "tarfile.open(filepath, 'r:gz').extractall(inception_pretrain_model_dir)\n",
71 | " \n",
72 | "#模型结构存放文件\n",
73 | "log_dir = 'inception_log'\n",
74 | "if not os.path.exists(log_dir):\n",
75 | " os.makedirs(log_dir)\n",
76 | "\n",
77 | "#classify_image_graph_def.pb为google训练好的模型\n",
78 | "inception_graph_def_file = os.path.join(inception_pretrain_model_dir, 'classify_image_graph_def.pb')\n",
79 | "with tf.Session() as sess:\n",
80 | " #创建一个图来存放google训练好的模型\n",
81 | " with tf.gfile.FastGFile(inception_graph_def_file, 'rb') as f:\n",
82 | " graph_def = tf.GraphDef()\n",
83 | " graph_def.ParseFromString(f.read())\n",
84 | " tf.import_graph_def(graph_def, name='')\n",
85 | " #保存图的结构\n",
86 | " writer = tf.summary.FileWriter(log_dir, sess.graph)\n",
87 | " writer.close()"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": true
95 | },
96 | "outputs": [],
97 | "source": []
98 | }
99 | ],
100 | "metadata": {
101 | "anaconda-cloud": {},
102 | "kernelspec": {
103 | "display_name": "Python [default]",
104 | "language": "python",
105 | "name": "python3"
106 | },
107 | "language_info": {
108 | "codemirror_mode": {
109 | "name": "ipython",
110 | "version": 3
111 | },
112 | "file_extension": ".py",
113 | "mimetype": "text/x-python",
114 | "name": "python",
115 | "nbconvert_exporter": "python",
116 | "pygments_lexer": "ipython3",
117 | "version": "3.5.2"
118 | }
119 | },
120 | "nbformat": 4,
121 | "nbformat_minor": 1
122 | }
123 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/6使用inception-v3做各种图像的识别.py:
--------------------------------------------------------------------------------
1 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
2 | # Youku channel: http://i.youku.com/sdxxqbf
3 | # WeChat official account: 深度学习与神经网络
4 | # Github: https://github.com/Qinbf
5 |
6 | # coding: utf-8
7 |
8 | # In[4]:
9 |
10 | import tensorflow as tf
11 | import os
12 | import numpy as np
13 | import re
14 | from PIL import Image
15 | import matplotlib.pyplot as plt
16 |
17 |
18 | # In[5]:
19 |
20 | class NodeLookup(object):
21 |     def __init__(self):
22 |         label_lookup_path = 'inception_model/imagenet_2012_challenge_label_map_proto.pbtxt'
23 |         uid_lookup_path = 'inception_model/imagenet_synset_to_human_label_map.txt'
24 |         self.node_lookup = self.load(label_lookup_path, uid_lookup_path)
25 |
26 |     def load(self, label_lookup_path, uid_lookup_path):
27 |         # Load the file mapping uid strings n******** to human-readable class names
28 |         proto_as_ascii_lines = tf.gfile.GFile(uid_lookup_path).readlines()
29 |         uid_to_human = {}
30 |         # Match 0 or more of n/digits, then 0 or more of spaces, non-whitespace and commas
31 |         p = re.compile(r'[n\d]*[ \S,]*')
32 |         for line in proto_as_ascii_lines:
33 |             parsed_items = p.findall(line)
34 |             # Get the uid string n********
35 |             uid = parsed_items[0]
36 |             # Get the class name
37 |             human_string = parsed_items[2]
38 |             # Store the uid string -> class name mapping
39 |             uid_to_human[uid] = human_string
40 |
41 |         # Load the file mapping uid strings n******** to class ids 1-1000
42 |         proto_as_ascii = tf.gfile.GFile(label_lookup_path).readlines()
43 |         node_id_to_uid = {}
44 |         for line in proto_as_ascii:
45 |             if line.startswith('  target_class:'):
46 |                 # Get the class id (1-1000)
47 |                 target_class = int(line.split(': ')[1])
48 |             if line.startswith('  target_class_string:'):
49 |                 # Get the uid string n********
50 |                 target_class_string = line.split(': ')[1]
51 |                 # Store the class id -> uid string mapping
52 |                 node_id_to_uid[target_class] = target_class_string[1:-2]
53 |
54 |         # Build the mapping from class ids 1-1000 to class names
55 |         node_id_to_name = {}
56 |         for key, val in node_id_to_uid.items():
57 |             # Get the class name
58 |             name = uid_to_human[val]
59 |             # Map the class id to the class name
60 |             node_id_to_name[key] = name
61 |         return node_id_to_name
62 |
63 |     # Given a class id 1-1000, return the class name
64 |     def id_to_string(self, node_id):
65 |         if node_id not in self.node_lookup:
66 |             return ''
67 |         return self.node_lookup[node_id]
68 |
69 |
70 | # Create a graph to hold Google's pretrained model
71 | with tf.gfile.FastGFile('inception_model/classify_image_graph_def.pb', 'rb') as f:
72 |     graph_def = tf.GraphDef()
73 |     graph_def.ParseFromString(f.read())
74 |     tf.import_graph_def(graph_def, name='')
75 |
76 |
77 | with tf.Session() as sess:
78 |     softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
79 |     # Walk the directory
80 |     for root,dirs,files in os.walk('images/'):
81 |         for file in files:
82 |             # Load the image
83 |             image_data = tf.gfile.FastGFile(os.path.join(root,file), 'rb').read()
84 |             predictions = sess.run(softmax_tensor,{'DecodeJpeg/contents:0': image_data})  # the image must be in jpg format
85 |             predictions = np.squeeze(predictions)  # flatten the result to 1-D
86 |
87 |             # Print the image path and name
88 |             image_path = os.path.join(root,file)
89 |             print(image_path)
90 |             # Display the image
91 |             img=Image.open(image_path)
92 |             plt.imshow(img)
93 |             plt.axis('off')
94 |             plt.show()
95 |
96 |             # Sort to get the top-5 predictions
97 |             top_k = predictions.argsort()[-5:][::-1]
98 |             node_lookup = NodeLookup()
99 |             for node_id in top_k:
100 |                 # Get the class name
101 |                 human_string = node_lookup.id_to_string(node_id)
102 |                 # Get the confidence score for this class
103 |                 score = predictions[node_id]
104 |                 print('%s (score = %.5f)' % (human_string, score))
105 |             print()
106 |
107 |
108 | # In[ ]:
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
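
The top-5 extraction via argsort is the one subtle line above; the same pattern on a toy array (illustrative values only):

import numpy as np

predictions = np.array([0.1, 0.4, 0.05, 0.3, 0.15])
# argsort sorts ascending: [2, 0, 4, 3, 1]
# [-3:] keeps the indices of the three largest values: [4, 3, 1]
# [::-1] reverses them into descending order: [1, 3, 4]
top_k = predictions.argsort()[-3:][::-1]
print(top_k)  # [1 3 4]
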
/Tensorflow基础使用与文本分类应用/程序/MNIST分类.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
10 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
11 | "微信公众号:深度学习与神经网络
\n",
12 | "Github:https://github.com/Qinbf
"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import tensorflow as tf\n",
24 | "from tensorflow.examples.tutorials.mnist import input_data"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": false
32 | },
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | "Extracting MNIST_data\\train-images-idx3-ubyte.gz\n",
39 | "Extracting MNIST_data\\train-labels-idx1-ubyte.gz\n",
40 | "Extracting MNIST_data\\t10k-images-idx3-ubyte.gz\n",
41 | "Extracting MNIST_data\\t10k-labels-idx1-ubyte.gz\n",
42 | "Iter 0,Testing Accuracy 0.8304\n",
43 | "Iter 1,Testing Accuracy 0.8702\n",
44 | "Iter 2,Testing Accuracy 0.8821\n",
45 | "Iter 3,Testing Accuracy 0.8884\n",
46 | "Iter 4,Testing Accuracy 0.894\n",
47 | "Iter 5,Testing Accuracy 0.8968\n",
48 | "Iter 6,Testing Accuracy 0.9011\n",
49 | "Iter 7,Testing Accuracy 0.9019\n",
50 | "Iter 8,Testing Accuracy 0.9034\n",
51 | "Iter 9,Testing Accuracy 0.9049\n",
52 | "Iter 10,Testing Accuracy 0.9057\n",
53 | "Iter 11,Testing Accuracy 0.9073\n",
54 | "Iter 12,Testing Accuracy 0.9081\n",
55 | "Iter 13,Testing Accuracy 0.9088\n",
56 | "Iter 14,Testing Accuracy 0.9098\n",
57 | "Iter 15,Testing Accuracy 0.9108\n",
58 | "Iter 16,Testing Accuracy 0.9118\n",
59 | "Iter 17,Testing Accuracy 0.9123\n",
60 | "Iter 18,Testing Accuracy 0.9127\n",
61 | "Iter 19,Testing Accuracy 0.9137\n",
62 | "Iter 20,Testing Accuracy 0.9138\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "#载入数据集\n",
68 | "mnist = input_data.read_data_sets(\"MNIST_data\",one_hot=True)\n",
69 | "\n",
70 | "#每个批次100张照片\n",
71 | "batch_size = 100\n",
72 | "#计算一共有多少个批次\n",
73 | "n_batch = mnist.train.num_examples // batch_size\n",
74 | "\n",
75 | "#定义两个placeholder\n",
76 | "x = tf.placeholder(tf.float32,[None,784])\n",
77 | "y = tf.placeholder(tf.float32,[None,10])\n",
78 | "\n",
79 | "#创建一个简单的神经网络,输入层784个神经元,输出层10个神经元\n",
80 | "W = tf.Variable(tf.zeros([784,10]))\n",
81 | "b = tf.Variable(tf.zeros([10]))\n",
82 | "prediction = tf.nn.softmax(tf.matmul(x,W)+b)\n",
83 | "\n",
84 | "#二次代价函数\n",
85 | "#square是求平方\n",
86 | "#reduce_mean是求平均值\n",
87 | "loss = tf.reduce_mean(tf.square(y-prediction))\n",
88 | "\n",
89 | "#使用梯度下降法来最小化loss,学习率是0.2\n",
90 | "train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n",
91 | "\n",
92 | "#初始化变量\n",
93 | "init = tf.global_variables_initializer()\n",
94 | "\n",
95 | "#结果存放在一个布尔型列表中\n",
96 | "correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置\n",
97 | "#求准确率\n",
98 | "accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型\n",
99 | "\n",
100 | "with tf.Session() as sess:\n",
101 | " #执行初始化\n",
102 | " sess.run(init)\n",
103 | " #迭代21个周期\n",
104 | " for epoch in range(21):\n",
105 | " #每个周期迭代n_batch个batch,每个batch为100\n",
106 | " for batch in range(n_batch):\n",
107 | " #获得一个batch的数据和标签\n",
108 | " batch_xs,batch_ys = mnist.train.next_batch(batch_size)\n",
109 | " #通过feed喂到模型中进行训练\n",
110 | " sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})\n",
111 | " \n",
112 | " #计算准确率\n",
113 | " acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})\n",
114 | " print(\"Iter \" + str(epoch) + \",Testing Accuracy \" + str(acc))"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": []
125 | }
126 | ],
127 | "metadata": {
128 | "anaconda-cloud": {},
129 | "kernelspec": {
130 | "display_name": "Python [default]",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.5.2"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 1
149 | }
150 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与图像识别应用/程序/4MNIST分类.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html\n",
12 | "#优酷频道:http://i.youku.com/sdxxqbf\n",
13 | "#微信公众号:深度学习与神经网络\n",
14 | "#Github:https://github.com/Qinbf"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import tensorflow as tf\n",
26 | "from tensorflow.examples.tutorials.mnist import input_data"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Extracting MNIST_data\\train-images-idx3-ubyte.gz\n",
41 | "Extracting MNIST_data\\train-labels-idx1-ubyte.gz\n",
42 | "Extracting MNIST_data\\t10k-images-idx3-ubyte.gz\n",
43 | "Extracting MNIST_data\\t10k-labels-idx1-ubyte.gz\n",
44 | "Iter 0,Testing Accuracy 0.8304\n",
45 | "Iter 1,Testing Accuracy 0.8702\n",
46 | "Iter 2,Testing Accuracy 0.8821\n",
47 | "Iter 3,Testing Accuracy 0.8884\n",
48 | "Iter 4,Testing Accuracy 0.894\n",
49 | "Iter 5,Testing Accuracy 0.8968\n",
50 | "Iter 6,Testing Accuracy 0.9011\n",
51 | "Iter 7,Testing Accuracy 0.9019\n",
52 | "Iter 8,Testing Accuracy 0.9034\n",
53 | "Iter 9,Testing Accuracy 0.9049\n",
54 | "Iter 10,Testing Accuracy 0.9057\n",
55 | "Iter 11,Testing Accuracy 0.9073\n",
56 | "Iter 12,Testing Accuracy 0.9081\n",
57 | "Iter 13,Testing Accuracy 0.9088\n",
58 | "Iter 14,Testing Accuracy 0.9098\n",
59 | "Iter 15,Testing Accuracy 0.9108\n",
60 | "Iter 16,Testing Accuracy 0.9118\n",
61 | "Iter 17,Testing Accuracy 0.9123\n",
62 | "Iter 18,Testing Accuracy 0.9127\n",
63 | "Iter 19,Testing Accuracy 0.9137\n",
64 | "Iter 20,Testing Accuracy 0.9138\n"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "#载入数据集\n",
70 | "mnist = input_data.read_data_sets(\"MNIST_data\",one_hot=True)\n",
71 | "\n",
72 | "#每个批次100张照片\n",
73 | "batch_size = 100\n",
74 | "#计算一共有多少个批次\n",
75 | "n_batch = mnist.train.num_examples // batch_size\n",
76 | "\n",
77 | "#定义两个placeholder\n",
78 | "x = tf.placeholder(tf.float32,[None,784])\n",
79 | "y = tf.placeholder(tf.float32,[None,10])\n",
80 | "\n",
81 | "#创建一个简单的神经网络,输入层784个神经元,输出层10个神经元\n",
82 | "W = tf.Variable(tf.zeros([784,10]))\n",
83 | "b = tf.Variable(tf.zeros([10]))\n",
84 | "prediction = tf.nn.softmax(tf.matmul(x,W)+b)\n",
85 | "\n",
86 | "#二次代价函数\n",
87 | "#square是求平方\n",
88 | "#reduce_mean是求平均值\n",
89 | "loss = tf.reduce_mean(tf.square(y-prediction))\n",
90 | "\n",
91 | "#使用梯度下降法来最小化loss,学习率是0.2\n",
92 | "train_step = tf.train.GradientDescentOptimizer(0.2).minimize(loss)\n",
93 | "\n",
94 | "#初始化变量\n",
95 | "init = tf.global_variables_initializer()\n",
96 | "\n",
97 | "#结果存放在一个布尔型列表中\n",
98 | "correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1))#argmax返回一维张量中最大的值所在的位置\n",
99 | "#求准确率\n",
100 | "accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))#cast是进行数据格式转换,把布尔型转为float32类型\n",
101 | "\n",
102 | "with tf.Session() as sess:\n",
103 | " #执行初始化\n",
104 | " sess.run(init)\n",
105 | " #迭代21个周期\n",
106 | " for epoch in range(21):\n",
107 | " #每个周期迭代n_batch个batch,每个batch为100\n",
108 | " for batch in range(n_batch):\n",
109 | " #获得一个batch的数据和标签\n",
110 | " batch_xs,batch_ys = mnist.train.next_batch(batch_size)\n",
111 | " #通过feed喂到模型中进行训练\n",
112 | " sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})\n",
113 | " \n",
114 | " #计算准确率\n",
115 | " acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})\n",
116 | " print(\"Iter \" + str(epoch) + \",Testing Accuracy \" + str(acc))"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": []
127 | }
128 | ],
129 | "metadata": {
130 | "anaconda-cloud": {},
131 | "kernelspec": {
132 | "display_name": "Python [default]",
133 | "language": "python",
134 | "name": "python3"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 3
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython3",
146 | "version": "3.5.2"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 1
151 | }
152 |
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/程序/zhihu_eval.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[1]:
10 |
11 |
12 | import numpy as np
13 | import pandas as pd
14 | from tqdm import tqdm
15 | import tensorflow as tf
16 | import pickle
17 | import math
18 | from six.moves import xrange
19 |
20 |
21 | # In[2]:
22 |
23 | # Load question_eval_set
24 | reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\t',header=None)
25 | print(reader.iloc[0:5])
26 |
27 |
28 | # In[3]:
29 |
30 | # Compute the maximum number of words in any title
31 | x_text = reader.iloc[:,2]
32 | max_document_length = 0
33 | for i,line in enumerate(x_text):
34 |     try:
35 |         temp = line.split(',')
36 |         max_document_length = max(max_document_length,len(temp))
37 |     except:
38 |         # One row in the data is empty
39 |         pass
40 |         # x_text[i] = " "
41 |
42 | print("max_document_length:",max_document_length)
43 |
44 | # Load the saved vocabulary
45 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore("vocab_dict")
46 |
47 |
48 | # In[4]:
49 |
50 | # Split the data on ','
51 | text = []
52 | for line in x_text:
53 |     try:
54 |         text.append(line.split(','))
55 |     except:
56 |         # One row in the data is empty
57 |         text.append(' ')
58 |
59 |
60 | # In[5]:
61 |
62 | # Convert the dataset into word ids
63 | x = []
64 | for line in tqdm(text):
65 |     line_len = len(line)
66 |     text2num = []
67 |     for i in xrange(max_document_length):
68 |         if(i < line_len):
69 |             try:
70 |                 text2num.append(vocab_processor.vocabulary_.get(line[i]))  # map the word to its id
71 |             except:
72 |                 text2num.append(0)  # no matching word
73 |         else:
74 |             text2num.append(0)  # pad with 0
75 |     x.append(text2num)
76 | x = np.array(x)
77 | x[:5]
78 |
79 |
80 | # In[6]:
81 |
82 | def batch_iter(data, batch_size, num_epochs, shuffle=False):
83 |     """
84 |     Generates a batch iterator for a dataset.
85 |     """
86 |     data = np.array(data)
87 |     data_size = len(data)
88 |     # Number of batches per epoch
89 |     num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
90 |     print("num_batches_per_epoch:",num_batches_per_epoch)
91 |     for epoch in range(num_epochs):
92 |         # Shuffle the data at each epoch
93 |         if shuffle:
94 |             shuffle_indices = np.random.permutation(np.arange(data_size))
95 |             shuffled_data = data[shuffle_indices]
96 |         else:
97 |             shuffled_data = data
98 |         for batch_num in range(num_batches_per_epoch):
99 |             start_index = batch_num * batch_size
100 |             end_index = min((batch_num + 1) * batch_size, data_size)
101 |             yield shuffled_data[start_index:end_index]
102 |
103 |
104 | # In[7]:
105 |
106 | def eval(predict_label_and_marked_label_list):
107 |     """
108 |     :param predict_label_and_marked_label_list: a list of tuples, e.g.
109 |     [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),
110 |       ([3, 2, 1, 4, 7], [5, 7, 3])
111 |     ]
112 |     Note that predict_label has been deduplicated, e.g. [1,2,3,2,4,1,6] becomes [1,2,3,4,6].
113 |
114 |     marked_label_list itself is unordered, but the submitted predictions are ordered; for the example above the hits per position are
115 |     [0,0,0,1,1] (4 and 5 hit)
116 |     [1,0,0,0,1] (3 and 7 hit)
117 |
118 |     """
119 |     right_label_num = 0  # total number of hit labels
120 |     right_label_at_pos_num = [0, 0, 0, 0, 0]  # total hits at each position
121 |     sample_num = 0  # total number of questions
122 |     all_marked_label_num = 0  # total number of marked labels
123 |     for predict_labels, marked_labels in predict_label_and_marked_label_list:
124 |         sample_num += 1
125 |         marked_label_set = set(marked_labels)
126 |         all_marked_label_num += len(marked_label_set)
127 |         for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):
128 |             if label in marked_label_set:  # hit
129 |                 right_label_num += 1
130 |                 right_label_at_pos_num[pos] += 1
131 |
132 |     precision = 0.0
133 |     for pos, right_num in zip(range(0, 5), right_label_at_pos_num):
134 |         precision += ((right_num / float(sample_num))) / math.log(2.0 + pos)  # index 0-4 maps to position 1-5, hence log(pos + 2)
135 |     recall = float(right_label_num) / all_marked_label_num
136 |
137 |     return 2*(precision * recall) / (precision + recall)
138 |
139 |
140 | # In[8]:
141 |
142 | # Define the placeholders
143 | input_x = tf.placeholder(tf.int32, [None, x.shape[1]], name="input_x")
144 | dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
145 |
146 | # sequence_length - maximum number of words
147 | sequence_length=x.shape[1]
148 | # num_classes - number of classes
149 | num_classes=1999
150 | # vocab_size - vocabulary size
151 | vocab_size=len(vocab_processor.vocabulary_)
152 | # embedding_size - word-vector length
153 | embedding_size=256
154 | # filter_sizes - convolution kernel sizes 3, 4 and 5
155 | filter_sizes=list(map(int, [3,4,5]))
156 | # num_filters - number of filters per size
157 | num_filters=1024
158 |
159 | Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="Weights")
160 | # [None, sequence_length, embedding_size]
161 | embedded_chars = tf.nn.embedding_lookup(Weights, input_x)
162 | # Add a channel dimension: [None, sequence_length, embedding_size, 1]
163 | embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
164 | # Create a convolution + maxpool layer for each filter size
165 | pooled_outputs = []
166 | for i, filter_size in enumerate(filter_sizes):
167 |     with tf.name_scope("conv-maxpool-%s" % filter_size):
168 |         # Convolution Layer
169 |         filter_shape = [filter_size, embedding_size, 1, num_filters]
170 |         W = tf.Variable(
171 |             tf.truncated_normal(filter_shape, stddev=0.1), name="W")
172 |         b = tf.Variable(
173 |             tf.constant(0.1, shape=[num_filters]), name="b")
174 |         conv = tf.nn.conv2d(
175 |             embedded_chars_expanded,
176 |             W,
177 |             strides=[1, 1, 1, 1],
178 |             padding="VALID",
179 |             name="conv")
180 |         # Apply nonlinearity
181 |         h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
182 |         # Maxpooling over the outputs
183 |         pooled = tf.nn.max_pool(
184 |             h,
185 |             ksize=[1, sequence_length - filter_size + 1, 1, 1],
186 |             strides=[1, 1, 1, 1],
187 |             padding='VALID',
188 |             name="pool")
189 |         pooled_outputs.append(pooled)
190 |
191 | # Combine all the pooled features
192 | num_filters_total = num_filters * len(filter_sizes)
193 | print("num_filters_total:", num_filters_total)
194 | h_pool = tf.concat(pooled_outputs, 3)
195 | h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
196 |
197 | # Add dropout
198 | with tf.name_scope("dropout"): h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
199 |
200 | # Final (unnormalized) scores and predictions
201 | with tf.name_scope("output"):
202 |     W = tf.get_variable(
203 |         "W",
204 |         shape=[num_filters_total, num_classes],
205 |         initializer=tf.contrib.layers.xavier_initializer())
206 |     b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
207 |     scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
208 |
209 |
210 | # In[9]:
211 |
212 | # Choose the model checkpoint
213 | checkpoint_file = "./models/model-10000"
214 |
215 | with tf.Session() as sess:
216 |     predict_top_5 = tf.nn.top_k(scores, k=5)
217 |     sess.run(tf.global_variables_initializer())
218 |     i = 0
219 |     saver = tf.train.Saver()
220 |     saver.restore(sess, checkpoint_file)
221 |
222 |     # Generate batches
223 |     batches = batch_iter(list(x), 1000, 1)
224 |
225 |     for x_batch in batches:
226 |         i = i + 1
227 |         predict_5 = sess.run(predict_top_5,feed_dict={input_x:x_batch,dropout_keep_prob:1.0})
228 |         if i == 1:
229 |             predict = predict_5[1]
230 |         else:
231 |             predict = np.concatenate((predict,predict_5[1]))
232 |         if (i%5==0):
233 |             print ("Evaluation:step",i)
234 |
235 | np.savetxt("predict.txt",predict,fmt='%d')
236 |
237 |
238 | # In[ ]:
239 |
240 |
241 |
242 |
--------------------------------------------------------------------------------
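
Working the docstring example through eval() by hand: right_label_at_pos_num = [1, 0, 0, 1, 2], sample_num = 2, right_label_num = 4 and all_marked_label_num = 7, so precision = 0.5/ln(2) + 0.5/ln(5) + 1.0/ln(6) ≈ 1.590, recall = 4/7 ≈ 0.571, and the returned score 2pr/(p+r) ≈ 0.84. A quick check with the function defined above (note that this eval shadows Python's builtin of the same name):

print(eval([([1, 2, 3, 4, 5], [4, 5, 6, 7]),
            ([3, 2, 1, 4, 7], [5, 7, 3])]))  # ≈ 0.8407
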
/Tensorflow基础使用与文本分类应用/程序/zhihu_predict.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "from tqdm import tqdm\n",
23 | "import re\n",
24 | "import numpy as np\n",
25 | "from six.moves import xrange"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | " 0 1 \\\n",
40 | "0 738845194850773558 -5833678375673307423 \n",
41 | "1 3738968195649774859 2027693463582123305 \n",
42 | "2 4738849194894773882 1127459907694805235 \n",
43 | "3 7739004195693774975 2904932941037075699,1160326435131345730,725917... \n",
44 | "4 -7261194805221226386 -5833678375673307423 \n",
45 | "\n",
46 | " 2 3 4 \\\n",
47 | "0 c0,c1 w0 c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1... \n",
48 | "1 c39,c40 w24 c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49... \n",
49 | "2 c172,c31,c0,c1 w102 NaN \n",
50 | "3 c39,c40,c5,c173 w103 c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c... \n",
51 | "4 c36,c31,c45,c237 w148 c238,c239 \n",
52 | "\n",
53 | " 5 \n",
54 | "0 w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,... \n",
55 | "1 w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33... \n",
56 | "2 NaN \n",
57 | "3 w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107... \n",
58 | "4 w149,w150 \n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "topic_info = pd.read_table(\"./ieee_zhihu_cup/topic_info.txt\",sep='\\t',header=None)\n",
64 | "print(topic_info.iloc[0:5])"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "# 话题字典\n",
76 | "topic_dict = {}\n",
77 | "for i in xrange(topic_info.shape[0]):\n",
78 | " topic_dict[i] = topic_info.iloc[i][0]"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {
85 | "collapsed": true
86 | },
87 | "outputs": [],
88 | "source": [
89 | "predict = open('predict.txt', \"r\")\n",
90 | "examples = predict.readlines()\n",
91 | "text = np.array([line.split(\" \") for line in examples])"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {
98 | "collapsed": false
99 | },
100 | "outputs": [
101 | {
102 | "name": "stderr",
103 | "output_type": "stream",
104 | "text": [
105 | "100%|██████████| 217360/217360 [00:01<00:00, 160389.86it/s]\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "label = []\n",
111 | "for line in tqdm(text):\n",
112 | " num2label = []\n",
113 | " for i in xrange(5):\n",
114 | " num2label.append(topic_dict[int(line[i])]) # 把0-1999编号转成原来的id\n",
115 | " label.append(num2label)\n",
116 | "label = np.array(label)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 6,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [],
126 | "source": [
127 | "np.savetxt(\"temp.txt\",label,fmt='%d')"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 7,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [],
137 | "source": [
138 | "def clean_str(string):\n",
139 | " string = re.sub(r\" \", \",\", string)\n",
140 | " return string\n",
141 | "\n",
142 | "file1 = open('temp.txt', \"r\")\n",
143 | "examples = file1.readlines()\n",
144 | "examples = [clean_str(line) for line in examples]\n",
145 | "file1.close()\n",
146 | "\n",
147 | "file1 = open('temp.txt', \"w\")\n",
148 | "file1.writelines(examples)\n",
149 | "file1.close()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 8,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | " 0\n",
164 | "0 -3517637179126242000,-4653836020042332281,4715...\n",
165 | "1 3418451812342379591,2858911571784840089,238291...\n",
166 | "2 -7358589937244777363,-5265476641576484497,7477...\n",
167 | "3 -7046289575185911002,-4653836020042332281,-587...\n",
168 | "4 4715442001886462944,-8963554618409314978,11274...\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "# predict文件导入\n",
174 | "predict_file = 'temp.txt'\n",
175 | "predict_reader = pd.read_table(predict_file,sep=' ',header=None)\n",
176 | "print(predict_reader.iloc[0:5])"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 9,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | " 0 1 \\\n",
191 | "0 6215603645409872328 c924,c531,c102,c284,c188,c104,c98,c107,c11,c11... \n",
192 | "1 6649324930261961840 c346,c1549,c413,c294,c675,c504,c183,c74,c541,c... \n",
193 | "2 -4251899610700378615 c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10... \n",
194 | "\n",
195 | " 2 \\\n",
196 | "0 w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47... \n",
197 | "1 w40132,w1357,w1556,w1380,w2464,w33,w16791,w109... \n",
198 | "2 w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w... \n",
199 | "\n",
200 | " 3 \\\n",
201 | "0 c1128,c529,c636,c572,c1321,c139,c540,c223,c510... \n",
202 | "1 NaN \n",
203 | "2 c149,c148,c148,c42,c185,c95,c95,c186,c186,c186... \n",
204 | "\n",
205 | " 4 \n",
206 | "0 w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2... \n",
207 | "1 NaN \n",
208 | "2 NaN \n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "# 导入question_train_set\n",
214 | "eval_reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\\t',header=None)\n",
215 | "print(eval_reader.iloc[0:3])"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {
222 | "collapsed": false
223 | },
224 | "outputs": [
225 | {
226 | "name": "stdout",
227 | "output_type": "stream",
228 | "text": [
229 | " 0 0\n",
230 | "0 6215603645409872328 -3517637179126242000,-4653836020042332281,4715...\n",
231 | "1 6649324930261961840 3418451812342379591,2858911571784840089,238291...\n",
232 | "2 -4251899610700378615 -7358589937244777363,-5265476641576484497,7477...\n",
233 | "3 6213817087034420233 -7046289575185911002,-4653836020042332281,-587...\n",
234 | "4 -8930652370334418373 4715442001886462944,-8963554618409314978,11274...\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "final_predict = pd.concat([eval_reader.ix[:,0],predict_reader],axis=1)\n",
240 | "print(final_predict.iloc[0:5])"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 11,
246 | "metadata": {
247 | "collapsed": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "final_predict.to_csv('temp.txt', header=None, index=None, sep=',')\n",
252 | "\n",
253 | "final_file = open('temp.txt', \"r\")\n",
254 | "final_examples = final_file.readlines()\n",
255 | "final_examples = [re.sub(r'\"',\"\",line) for line in final_examples]\n",
256 | "final_file.close()\n",
257 | "\n",
258 | "final_file = open('final_predict.csv', \"w\")\n",
259 | "final_file.writelines(final_examples)\n",
260 | "final_file.close()"
261 | ]
262 | }
263 | ],
264 | "metadata": {
265 | "anaconda-cloud": {},
266 | "kernelspec": {
267 | "display_name": "Python [default]",
268 | "language": "python",
269 | "name": "python3"
270 | },
271 | "language_info": {
272 | "codemirror_mode": {
273 | "name": "ipython",
274 | "version": 3
275 | },
276 | "file_extension": ".py",
277 | "mimetype": "text/x-python",
278 | "name": "python",
279 | "nbconvert_exporter": "python",
280 | "pygments_lexer": "ipython3",
281 | "version": "3.5.2"
282 | }
283 | },
284 | "nbformat": 4,
285 | "nbformat_minor": 2
286 | }
287 |
--------------------------------------------------------------------------------
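
A note on the quote-stripping pass at the end of zhihu_predict above: when a field itself contains the separator, pandas' to_csv wraps that field in double quotes, which the submission format does not accept; hence the second pass that rewrites the file with re.sub. A minimal standalone sketch (illustrative data only):

    import pandas as pd

    df = pd.DataFrame([["q1", "1,2,3"]])
    print(df.to_csv(header=None, index=None, sep=','))
    # q1,"1,2,3"   <- the label list gets quoted because it contains the
    #                 separator, which is why the script strips the quotes
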
/Tensorflow基础使用与文本分类应用/程序/cnn.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # 51CTO course channel: http://edu.51cto.com/lecturer/index/user_id-12330098.html
5 | # Youku channel: http://i.youku.com/sdxxqbf
6 | # WeChat official account: 深度学习与神经网络
7 | # Github: https://github.com/Qinbf
8 |
9 | # In[1]:
10 |
11 | import tensorflow as tf
12 | import numpy as np
13 | import os
14 | import time
15 | import numpy as np
16 | import pandas as pd
17 | import math
18 | from tqdm import tqdm
19 | from six.moves import xrange
20 |
21 |
22 | # In[2]:
23 |
24 | # Parameters
25 | # ==================================================
26 |
27 | # Data loading params
28 | # Fraction of the training data held out for validation
29 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
30 | # Data file
31 | tf.flags.DEFINE_string("data_file", "./ieee_zhihu_cup/data_topic_block_0.txt", "Data source for the positive data.")
32 |
33 | # Model Hyperparameters
34 | # Word-embedding length
35 | tf.flags.DEFINE_integer("embedding_dim", 256, "Dimensionality of character embedding (default: 256)")
36 | # Filter sizes
37 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
38 | # Number of filters per filter size
39 | tf.flags.DEFINE_integer("num_filters", 1024, "Number of filters per filter size (default: 1024)")
40 | # Dropout keep probability
41 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
42 | # L2 regularization strength (defined here but not used in the loss below)
43 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0005, "L2 regularization lambda (default: 0.0005)")
44 |
45 | # Training parameters
46 | # Batch size
47 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
48 | # Number of training epochs
49 | tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)")
50 | # How many steps between evaluations
51 | tf.flags.DEFINE_integer("evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)")
52 | # How many steps between checkpoints
53 | tf.flags.DEFINE_integer("checkpoint_every", 200, "Save model after this many steps (default: 200)")
54 | # How many checkpoints to keep
55 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
56 |
57 | # Parse the flags
58 | FLAGS = tf.flags.FLAGS
59 | FLAGS._parse_flags()
60 |
61 | # Print all parameters
62 | print("\nParameters:")
63 | for attr, value in sorted(FLAGS.__flags.items()):
64 | print("{}={}".format(attr.upper(), value))
65 | print("")
66 |
67 |
68 | # In[3]:
69 |
70 | y = []
71 | x_text = []
72 |
73 | # Read the training data and labels
74 | reader = pd.read_table(FLAGS.data_file,sep='\t',header=None)
75 | for i in tqdm(xrange(reader.shape[0])):
76 |     # Split the labels on ','
77 |     temp = reader.iloc[i][1].split(',')
78 |     # If there are more than 5 labels, keep only the first 5
79 |     if (len(temp)>5):
80 |         temp = temp[0:5]
81 |     # Multi-hot encoding: set the label positions to 1, the rest to 0
82 | label = np.zeros(1999)
83 | for temp_label in temp:
84 | label[int(temp_label)] = 1
85 | y.append(label)
86 | x_text.append(reader.iloc[i][0])
87 |
88 |
89 | # In[4]:
90 |
91 | # Print the first 5 rows of x_text and y
92 | print(x_text[0:5])
93 | y = np.array(y, dtype = np.float32)
94 | print(y[0:5])
95 |
96 |
97 | # In[5]:
98 |
99 | # Build vocabulary
100 | # Maximum number of tokens in any one document
101 | max_document_length = max([len(x.split(",")) for x in x_text])
102 | vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)
103 |
104 | x = np.array(list(vocab_processor.fit_transform(x_text)))
105 | print("x_shape:",x.shape)
106 | print("y_shape:",y.shape)
107 |
108 | # Save the vocabulary
109 | vocab_processor.save("vocab_dict")
110 |
111 | # Split train/test set
112 | # Split the data into a training set and a validation set
113 | dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
114 | x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]
115 | y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]
116 |
117 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
118 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
119 | print("x:",x_train[0:5])
120 | print("y:",y_train[0:5])
121 |
122 |
123 | # In[6]:
124 |
125 | # Define three placeholders
126 | input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name="input_x")
127 | input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name="input_y")
128 | dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
129 |
130 | # sequence_length - maximum document length
131 | sequence_length=x_train.shape[1]
132 | # num_classes - number of classes
133 | num_classes=y_train.shape[1]
134 | # vocab_size - vocabulary size
135 | vocab_size=len(vocab_processor.vocabulary_)
136 | # embedding_size - word-embedding length
137 | embedding_size=FLAGS.embedding_dim
138 | # filter_sizes - filter sizes 3, 4 and 5
139 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(",")))
140 | # num_filters - number of filters per size
141 | num_filters=FLAGS.num_filters
142 |
143 | Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="Weights")
144 | # shape: [None, sequence_length, embedding_size]
145 | embedded_chars = tf.nn.embedding_lookup(Weights, input_x)
146 | # Add a channel dimension, shape: [None, sequence_length, embedding_size, 1]
147 | embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
148 |
149 | # Create a convolution + maxpool layer for each filter size
150 | pooled_outputs = []
151 | for i, filter_size in enumerate(filter_sizes):
152 | with tf.name_scope("conv-maxpool-%s" % filter_size):
153 | # Convolution Layer
154 | filter_shape = [filter_size, embedding_size, 1, num_filters]
155 | W = tf.Variable(
156 | tf.truncated_normal(filter_shape, stddev=0.1), name="W")
157 | b = tf.Variable(
158 | tf.constant(0.1, shape=[num_filters]), name="b")
159 | conv = tf.nn.conv2d(
160 | embedded_chars_expanded,
161 | W,
162 | strides=[1, 1, 1, 1],
163 | padding="VALID",
164 | name="conv")
165 | # Apply nonlinearity
166 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
167 | # Maxpooling over the outputs
168 | pooled = tf.nn.max_pool(
169 | h,
170 | ksize=[1, sequence_length - filter_size + 1, 1, 1],
171 | strides=[1, 1, 1, 1],
172 | padding='VALID',
173 | name="pool")
174 | pooled_outputs.append(pooled)
175 |
176 | # Combine all the pooled features
177 | num_filters_total = num_filters * len(filter_sizes)
178 | print("num_filters_total:", num_filters_total)
179 | h_pool = tf.concat(pooled_outputs, 3)
180 | h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
181 |
182 | # Add dropout
183 | with tf.name_scope("dropout"): h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
184 |
185 | # Final (unnormalized) scores and predictions
186 | with tf.name_scope("output"):
187 | W = tf.get_variable(
188 | "W",
189 | shape=[num_filters_total, num_classes],
190 | initializer=tf.contrib.layers.xavier_initializer())
191 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
192 | scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
193 |
194 | # Define the loss
195 | with tf.name_scope("loss"):
196 |     loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y))
197 |
198 | # Define the optimizer
199 | with tf.name_scope("optimizer"):
200 | optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
201 |
202 |
203 | # In[7]:
204 |
205 | # Generate batches of data
206 | def batch_iter(data, batch_size, num_epochs, shuffle=False):
207 | """
208 | Generates a batch iterator for a dataset.
209 | """
210 | data = np.array(data)
211 | data_size = len(data)
212 |     # Number of batches per epoch
213 | num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
214 | print("num_batches_per_epoch:",num_batches_per_epoch)
215 | for epoch in range(num_epochs):
216 | # Shuffle the data at each epoch
217 | if shuffle:
218 | shuffle_indices = np.random.permutation(np.arange(data_size))
219 | shuffled_data = data[shuffle_indices]
220 | else:
221 | shuffled_data = data
222 | for batch_num in range(num_batches_per_epoch):
223 | start_index = batch_num * batch_size
224 | end_index = min((batch_num + 1) * batch_size, data_size)
225 | yield shuffled_data[start_index:end_index]
226 |
227 |
228 | # In[ ]:
229 |
230 | # The evaluation metric provided by Zhihu
231 | def eval(predict_label_and_marked_label_list):
232 |     """
233 |     :param predict_label_and_marked_label_list: a list of tuples, e.g.
234 |        [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),
235 |          ([3, 2, 1, 4, 7], [5, 7, 3])
236 |         ]
237 |     Note that predict_label here must be deduplicated: e.g. [1,2,3,2,4,1,6] becomes [1,2,3,4,6]
238 |
239 |     marked_label_list itself has no ordering, but the submitted predictions do; for the example above the hits are
240 |     [0,0,0,1,1] (4 and 5 hit)
241 |     [1,0,0,0,1] (3 and 7 hit)
242 |
243 |     """
244 |     right_label_num = 0  # total number of hit labels
245 |     right_label_at_pos_num = [0, 0, 0, 0, 0]  # number of hits at each position
246 |     sample_num = 0  # total number of questions
247 |     all_marked_label_num = 0  # total number of marked labels
248 |     for predict_labels, marked_labels in predict_label_and_marked_label_list:
249 |         sample_num += 1
250 |         marked_label_set = set(marked_labels)
251 |         all_marked_label_num += len(marked_label_set)
252 |         for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):
253 |             if label in marked_label_set:  # hit
254 |                 right_label_num += 1
255 |                 right_label_at_pos_num[pos] += 1
256 |
257 |     precision = 0.0
258 |     for pos, right_num in zip(range(0, 5), right_label_at_pos_num):
259 |         precision += ((right_num / float(sample_num))) / math.log(2.0 + pos)  # indices 0-4 map to positions 1-5, plus 1 more, hence the 2.0 + pos
260 |     recall = float(right_label_num) / all_marked_label_num
261 |
262 |     return 2*(precision * recall) / (precision + recall)
263 |
264 |
265 | # In[ ]:
266 |
267 | # Define the saver; keep only the 5 most recent checkpoints
268 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
269 |
270 | with tf.Session() as sess:
271 | predict_top_5 = tf.nn.top_k(scores, k=5)
272 | label_top_5 = tf.nn.top_k(input_y, k=5)
273 | sess.run(tf.global_variables_initializer())
274 | i = 0
275 |     # Generate the batches
276 |     batches = batch_iter(
277 |         list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
278 |     for batch in batches:
279 |         i = i + 1
280 |         # Get one batch of data
281 |         x_batch, y_batch = zip(*batch)
282 |         # Run one optimization step
283 |         sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob})
284 |
285 |         # Evaluate once every 50 training steps
286 | if (i % FLAGS.evaluate_every == 0):
287 | print ("Evaluation:step",i)
288 | predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch,
289 | input_y:y_batch,
290 | dropout_keep_prob:1.0})
291 | print ("label:",label_5[1][:5])
292 | print ("predict:",predict_5[1][:5])
293 | print ("predict:",predict_5[0][:5])
294 | print ("loss:",_loss)
295 | predict_label_and_marked_label_list = []
296 | for predict,label in zip(predict_5[1],label_5[1]):
297 | predict_label_and_marked_label_list.append((list(predict),list(label)))
298 | score = eval(predict_label_and_marked_label_list)
299 | print("score:",score)
300 |
301 |         # Save a checkpoint every 200 training steps
302 | if (i % FLAGS.checkpoint_every == 0):
303 | path = saver.save(sess, "models/model", global_step=i)
304 | print("Saved model checkpoint to {}".format(path))
305 |
306 |
307 | # In[ ]:
308 |
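309 | # A quick, optional sanity check of eval() on the example from its docstring
310 | # (illustrative only, independent of the training run above): the prediction
311 | # [1, 2, 3, 4, 5] against marked labels [4, 5, 6, 7] hits at positions 4 and 5,
312 | # so precision = 1/log(5) + 1/log(6) ~ 1.18 and recall = 2/4 = 0.5.
313 | print(eval([([1, 2, 3, 4, 5], [4, 5, 6, 7])]))  # ~ 0.7023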
--------------------------------------------------------------------------------
/Tensorflow基础使用与文本分类应用/程序/data_handle.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
\n",
11 | "\n",
12 | "question_train_set.txt: \n",
13 | " 第一列为 问题id; \n",
14 | " 第二列为 title 的字符编号序列; \n",
15 | " 第三列为 title 的词语编号序列; \n",
16 | " 第四列为描述的字符编号序列; \n",
17 | " 第五列为描述的词语标号序列。 \n",
18 | " \n",
19 | "question_topic_train_set.txt: \n",
20 | " 第一列 问题 id; \n",
21 | " 第二列 话题 id。 \n",
22 | "\n",
23 | "topic_info.txt: \n",
24 | " 第一列为话题 id \n",
25 | " 第二列为话题的父话题 id。话题之间是有向无环图结构,一个话题可能有 0 到多个父话题; \n",
26 | " 第三列为话题名字的字符编号序列; \n",
27 | " 第四列为话题名字的词语编号序列; \n",
28 | " 第五列为话题描述的字符编号序列; \n",
29 | " 第六列为话题描述的词语编号序列。 \n",
30 | "\n",
31 | "1.title通常来说包含的信息最重要。对于question_train_set.txt文件,为了简单起见,我们只取第三列,title的词语编号序列。 \n",
32 | "2.对于topic_info.txt,为了简单起见,我们不考虑2,3,4,5,6列。只是简单的提取话题id,然后转为0-1998的数字(一共有1999个话题) \n",
33 | "3.然后合并以上一些数据,得到最后处理后的数据。 "
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {
40 | "collapsed": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "import pandas as pd\n",
45 | "from tqdm import tqdm # pip install tqdm\n",
46 | "from six.moves import xrange"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | " 0 1 \\\n",
61 | "0 6555699376639805223 c324,c39,c40,c155,c180,c180,c181,c17,c4,c1153,... \n",
62 | "1 2887834264226772863 c44,c110,c101,c286,c106,c150,c101,c892,c632,c1... \n",
63 | "2 -2687466858632038806 c15,c768,c769,c1363,c650,c1218,c2361,c11,c90,c... \n",
64 | "3 -5698296155734268 c473,c1528,c528,c428,c295,c15,c101,c188,c146,c... \n",
65 | "4 -6719100304248915192 c190,c147,c105,c219,c220,c101,c647,c219,c220,c... \n",
66 | "\n",
67 | " 2 \\\n",
68 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... \n",
69 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... \n",
70 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... \n",
71 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... \n",
72 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... \n",
73 | "\n",
74 | " 3 \\\n",
75 | "0 c335,c101,c611,c189,c97,c144,c147,c101,c15,c76... \n",
76 | "1 c1265,c518,c74,c131,c274,c57,c768,c769,c368,c3... \n",
77 | "2 c693,c100,c279,c99,c189,c532,c101,c189,c145,c1... \n",
78 | "3 NaN \n",
79 | "4 c644,c1212,c253,c199,c431,c452,c424,c207,c2,c1... \n",
80 | "\n",
81 | " 4 \n",
82 | "0 w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1... \n",
83 | "1 w12508,w1380,w72,w27045,w276,w111 \n",
84 | "2 w140340,w54,w48398,w54,w140341,w54,w12856,w54,... \n",
85 | "3 NaN \n",
86 | "4 w4821,w1301,w16003,w928,w1961,w2565,w50803,w11... \n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "# 导入question_train_set\n",
92 | "reader = pd.read_table('./ieee_zhihu_cup/question_train_set.txt',sep='\\t',header=None)\n",
93 | "print(reader.iloc[0:5])"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 3,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | " 0 1\n",
108 | "0 6555699376639805223 7739004195693774975,3738968195649774859\n",
109 | "1 2887834264226772863 -3149765934180654494\n",
110 | "2 -2687466858632038806 -760432988437306018\n",
111 | "3 -5698296155734268 -6758942141122113907,3195914392210930723\n",
112 | "4 -6719100304248915192 3804601920633030746,4797226510592237555,435133...\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "# 导入question_topic_eval_set\n",
118 | "topic_reader = pd.read_table('./ieee_zhihu_cup/question_topic_train_set.txt',sep='\\t',header=None)\n",
119 | "print(topic_reader.iloc[0:5])"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 4,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | " 0 \\\n",
134 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... \n",
135 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... \n",
136 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... \n",
137 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... \n",
138 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... \n",
139 | "\n",
140 | " 1 \n",
141 | "0 7739004195693774975,3738968195649774859 \n",
142 | "1 -3149765934180654494 \n",
143 | "2 -760432988437306018 \n",
144 | "3 -6758942141122113907,3195914392210930723 \n",
145 | "4 3804601920633030746,4797226510592237555,435133... \n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "# 合并title 的词语编号序列和话题 id\n",
151 | "data_topic = pd.concat([reader.ix[:,2], topic_reader.ix[:,1]], axis=1, ignore_index=True)\n",
152 | "print(data_topic.iloc[0:5])"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 5,
158 | "metadata": {
159 | "collapsed": false
160 | },
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | " 0 1 \\\n",
167 | "0 738845194850773558 -5833678375673307423 \n",
168 | "1 3738968195649774859 2027693463582123305 \n",
169 | "2 4738849194894773882 1127459907694805235 \n",
170 | "3 7739004195693774975 2904932941037075699,1160326435131345730,725917... \n",
171 | "4 -7261194805221226386 -5833678375673307423 \n",
172 | "\n",
173 | " 2 3 4 \\\n",
174 | "0 c0,c1 w0 c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1... \n",
175 | "1 c39,c40 w24 c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49... \n",
176 | "2 c172,c31,c0,c1 w102 NaN \n",
177 | "3 c39,c40,c5,c173 w103 c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c... \n",
178 | "4 c36,c31,c45,c237 w148 c238,c239 \n",
179 | "\n",
180 | " 5 \n",
181 | "0 w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,... \n",
182 | "1 w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33... \n",
183 | "2 NaN \n",
184 | "3 w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107... \n",
185 | "4 w149,w150 \n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "# 导入topic_info\n",
191 | "label_reader = pd.read_table('./ieee_zhihu_cup/topic_info.txt',sep='\\t',header=None)\n",
192 | "print(label_reader.iloc[0:5])"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 6,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "3\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "# 把标签转为0-1998的编号\n",
212 | "labels = list(label_reader.iloc[:,0])\n",
213 | "my_labels = []\n",
214 | "for label in labels:\n",
215 | " my_labels.append(label)\n",
216 | " \n",
217 | "# 建立topic字典\n",
218 | "topic_dict = {}\n",
219 | "for i,label in enumerate(my_labels):\n",
220 | " topic_dict[label] = i\n",
221 | "\n",
222 | "print(topic_dict[7739004195693774975])"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 7,
228 | "metadata": {
229 | "collapsed": false
230 | },
231 | "outputs": [
232 | {
233 | "name": "stderr",
234 | "output_type": "stream",
235 | "text": [
236 | "100%|██████████████████████████████████████████████████████████████████████| 2999967/2999967 [12:15<00:00, 4076.87it/s]\n"
237 | ]
238 | },
239 | {
240 | "name": "stdout",
241 | "output_type": "stream",
242 | "text": [
243 | " 0 1\n",
244 | "0 w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w... 3,1\n",
245 | "1 w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w... 769\n",
246 | "2 w875,w15450,w42394,w15863,w6,w95421,w25,w803,w... 342\n",
247 | "3 w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14... 1842,12\n",
248 | "4 w380,w54,w674,w133,w54,w134,w614,w54,w929,w307... 155,150,110,7,6\n"
249 | ]
250 | }
251 | ],
252 | "source": [
253 | "for i in tqdm(xrange(data_topic.shape[0])):\n",
254 | " new_label = ''\n",
255 | " # 根据“,”切分话题id\n",
256 | " temp_topic = data_topic.iloc[i][1].split(',')\n",
257 | " for topic in temp_topic:\n",
258 | " # 判断该label是否在label文件中,并得到该行\n",
259 | " label_num = topic_dict[int(topic)]\n",
260 | " new_label = new_label + str(label_num) + ','\n",
261 | " data_topic.iloc[i][1] = new_label[:-1]\n",
262 | "print(data_topic.iloc[:5])"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 8,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "# 保存处理过后的文件\n",
274 | "data_topic.to_csv(\"./ieee_zhihu_cup/data_topic.txt\", header=None, index=None, sep='\\t')\n",
275 | "\n",
276 | "# 切分成10块保存\n",
277 | "for i in xrange(10):\n",
278 | " data_topic_filename = './ieee_zhihu_cup/data_topic_block_' + str(i) + '.txt'\n",
279 | " if (i+1)*300000 < data_topic.shape[0]:\n",
280 | " data_topic.iloc[i*300000:(i+1)*300000].to_csv(\n",
281 | " data_topic_filename, header=None, index=None, sep='\\t')\n",
282 | " else:\n",
283 | " data_topic.iloc[i*300000:data_topic.shape[0]].to_csv(\n",
284 | " data_topic_filename, header=None, index=None, sep='\\t')"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {
291 | "collapsed": true
292 | },
293 | "outputs": [],
294 | "source": []
295 | }
296 | ],
297 | "metadata": {
298 | "anaconda-cloud": {},
299 | "kernelspec": {
300 | "display_name": "Python [default]",
301 | "language": "python",
302 | "name": "python3"
303 | },
304 | "language_info": {
305 | "codemirror_mode": {
306 | "name": "ipython",
307 | "version": 3
308 | },
309 | "file_extension": ".py",
310 | "mimetype": "text/x-python",
311 | "name": "python",
312 | "nbconvert_exporter": "python",
313 | "pygments_lexer": "ipython3",
314 | "version": "3.5.2"
315 | }
316 | },
317 | "nbformat": 4,
318 | "nbformat_minor": 2
319 | }
320 |
--------------------------------------------------------------------------------
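
The per-row loop in data_handle above (a dict lookup plus an .iloc assignment for each of the ~3M rows) is what takes the ~12 minutes shown in the tqdm output. A vectorized sketch of the same id-to-number mapping, assuming the label_reader and data_topic frames built above are in scope:

    # Hypothetical faster variant of the mapping loop in data_handle.ipynb
    topic_dict = {label: i for i, label in enumerate(label_reader.iloc[:, 0])}
    data_topic[1] = data_topic[1].map(
        lambda s: ','.join(str(topic_dict[int(t)]) for t in s.split(',')))
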
/Tensorflow基础使用与文本分类应用/程序/cnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import tensorflow as tf\n",
22 | "import numpy as np\n",
23 | "import os\n",
24 | "import time\n",
25 | "import numpy as np\n",
26 | "import pandas as pd\n",
27 | "import math\n",
28 | "from tqdm import tqdm\n",
29 | "from six.moves import xrange"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {
36 | "collapsed": false
37 | },
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "\n",
44 | "Parameters:\n",
45 | "BATCH_SIZE=64\n",
46 | "CHECKPOINT_EVERY=200\n",
47 | "DATA_FILE=./ieee_zhihu_cup/data_topic_block_0.txt\n",
48 | "DEV_SAMPLE_PERCENTAGE=0.1\n",
49 | "DROPOUT_KEEP_PROB=0.5\n",
50 | "EMBEDDING_DIM=256\n",
51 | "EVALUATE_EVERY=50\n",
52 | "FILTER_SIZES=3,4,5\n",
53 | "L2_REG_LAMBDA=0.0005\n",
54 | "NUM_CHECKPOINTS=5\n",
55 | "NUM_EPOCHS=10\n",
56 | "NUM_FILTERS=1024\n",
57 | "\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "# Parameters\n",
63 | "# ==================================================\n",
64 | "\n",
65 | "# Data loading params\n",
66 | "# validation数据集占比\n",
67 | "tf.flags.DEFINE_float(\"dev_sample_percentage\", .1, \"Percentage of the training data to use for validation\")\n",
68 | "# 数据集\n",
69 | "tf.flags.DEFINE_string(\"data_file\", \"./ieee_zhihu_cup/data_topic_block_0.txt\", \"Data source for the positive data.\")\n",
70 | "\n",
71 | "# Model Hyperparameters\n",
72 | "# 词向量长度\n",
73 | "tf.flags.DEFINE_integer(\"embedding_dim\", 256, \"Dimensionality of character embedding (default: 256)\")\n",
74 | "# 卷积核大小\n",
75 | "tf.flags.DEFINE_string(\"filter_sizes\", \"3,4,5\", \"Comma-separated filter sizes (default: '3,4,5')\")\n",
76 | "# 每一种卷积核个数\n",
77 | "tf.flags.DEFINE_integer(\"num_filters\", 1024, \"Number of filters per filter size (default: 1024)\")\n",
78 | "# dropout参数\n",
79 | "tf.flags.DEFINE_float(\"dropout_keep_prob\", 0.5, \"Dropout keep probability (default: 0.5)\")\n",
80 | "# l2正则化参数\n",
81 | "tf.flags.DEFINE_float(\"l2_reg_lambda\", 0.0005, \"L2 regularization lambda (default: 0.0005)\")\n",
82 | "\n",
83 | "# Training parameters\n",
84 | "# 批次大小\n",
85 | "tf.flags.DEFINE_integer(\"batch_size\", 64, \"Batch Size (default: 64)\")\n",
86 | "# 迭代周期\n",
87 | "tf.flags.DEFINE_integer(\"num_epochs\", 10, \"Number of training epochs (default: 10)\")\n",
88 | "# 多少step测试一次\n",
89 | "tf.flags.DEFINE_integer(\"evaluate_every\", 50, \"Evaluate model on dev set after this many steps (default: 50)\")\n",
90 | "# 多少step保存一次模型\n",
91 | "tf.flags.DEFINE_integer(\"checkpoint_every\", 200, \"Save model after this many steps (default: 200)\")\n",
92 | "# 保存多少个模型\n",
93 | "tf.flags.DEFINE_integer(\"num_checkpoints\", 5, \"Number of checkpoints to store (default: 5)\")\n",
94 | "\n",
95 | "# flags解析\n",
96 | "FLAGS = tf.flags.FLAGS\n",
97 | "FLAGS._parse_flags()\n",
98 | "\n",
99 | "# 打印所有参数\n",
100 | "print(\"\\nParameters:\")\n",
101 | "for attr, value in sorted(FLAGS.__flags.items()):\n",
102 | " print(\"{}={}\".format(attr.upper(), value))\n",
103 | "print(\"\")"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 3,
109 | "metadata": {
110 | "collapsed": false
111 | },
112 | "outputs": [
113 | {
114 | "name": "stderr",
115 | "output_type": "stream",
116 | "text": [
117 | "100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [01:15<00:00, 3959.17it/s]\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "y = []\n",
123 | "x_text = []\n",
124 | "\n",
125 | "# 读取训练数据和标签\n",
126 | "reader = pd.read_table(FLAGS.data_file,sep='\\t',header=None)\n",
127 | "for i in tqdm(xrange(reader.shape[0])):\n",
128 | " # 按','切分标签\n",
129 | " temp = reader.iloc[i][1].split(',')\n",
130 | " # 如果分类数大于5,只取前5个分类\n",
131 | " if (len(temp)>5):\n",
132 | " temp = temp[0:5]\n",
133 | " # 设置标签的对应位置为1,其余位置为0\n",
134 | " label = np.zeros(1999)\n",
135 | " for temp_label in temp:\n",
136 | " label[int(temp_label)] = 1\n",
137 | " y.append(label)\n",
138 | " x_text.append(reader.iloc[i][0])"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 4,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "['w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w31389,w6,w1019,w69288,w111,w3332,w109,w11,w25,w1110,w111', 'w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w875,w3352,w500,w21790,w12144,w111', 'w875,w15450,w42394,w15863,w6,w95421,w25,w803,w346,w6,w3763,w347,w88,w111', 'w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w140344,w111,w112,w49270,w2129,w6,w6978,w359,w10147,w111', 'w380,w54,w674,w133,w54,w134,w614,w54,w929,w307,w109,w110,w19045,w6,w5830,w111']\n",
153 | "[[ 0. 1. 0. ..., 0. 0. 0.]\n",
154 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
155 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
156 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
157 | " [ 0. 0. 0. ..., 0. 0. 0.]]\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "# 打印x_text和y的前5行\n",
163 | "print(x_text[0:5])\n",
164 | "y = np.array(y, dtype = np.float32)\n",
165 | "print(y[0:5])"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 5,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "x_shape: (300000, 72)\n",
180 | "y_shape: (300000, 1999)\n",
181 | "Vocabulary Size: 131900\n",
182 | "Train/Dev split: 270000/30000\n",
183 | "x: [[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 4 16 17 13 0 0 0 0 0\n",
184 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
185 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
186 | " [18 19 20 21 22 19 23 10 24 25 26 27 28 29 13 0 0 0 0 0 0 0 0 0\n",
187 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
188 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
189 | " [25 30 31 32 10 33 16 34 35 10 36 37 38 13 0 0 0 0 0 0 0 0 0 0\n",
190 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
191 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
192 | " [39 40 41 42 19 43 19 44 45 46 13 47 48 49 10 50 51 52 13 0 0 0 0 0\n",
193 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
194 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
195 | " [53 19 54 55 19 56 57 19 58 59 15 45 60 10 61 13 0 0 0 0 0 0 0 0\n",
196 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
197 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]\n",
198 | "y: [[ 0. 1. 0. ..., 0. 0. 0.]\n",
199 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
200 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
201 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
202 | " [ 0. 0. 0. ..., 0. 0. 0.]]\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "# Build vocabulary\n",
208 | "# 计算一段文本中最多的词汇数\n",
209 | "max_document_length = max([len(x.split(\",\")) for x in x_text])\n",
210 | "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)\n",
211 | "\n",
212 | "x = np.array(list(vocab_processor.fit_transform(x_text)))\n",
213 | "print(\"x_shape:\",x.shape)\n",
214 | "print(\"y_shape:\",y.shape)\n",
215 | "\n",
216 | "# 保存字典\n",
217 | "vocab_processor.save(\"vocab_dict\")\n",
218 | "\n",
219 | "# Split train/test set\n",
220 | "# 数据集切分为两部分,训练集和验证集\n",
221 | "dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))\n",
222 | "x_train, x_dev = x[:dev_sample_index], x[dev_sample_index:]\n",
223 | "y_train, y_dev = y[:dev_sample_index], y[dev_sample_index:]\n",
224 | "\n",
225 | "print(\"Vocabulary Size: {:d}\".format(len(vocab_processor.vocabulary_)))\n",
226 | "print(\"Train/Dev split: {:d}/{:d}\".format(len(y_train), len(y_dev)))\n",
227 | "print(\"x:\",x_train[0:5])\n",
228 | "print(\"y:\",y_train[0:5])"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 6,
234 | "metadata": {
235 | "collapsed": false
236 | },
237 | "outputs": [
238 | {
239 | "name": "stdout",
240 | "output_type": "stream",
241 | "text": [
242 | "num_filters_total: 3072\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "# 定义三个placeholder\n",
248 | "input_x = tf.placeholder(tf.int32, [None, x_train.shape[1]], name=\"input_x\")\n",
249 | "input_y = tf.placeholder(tf.float32, [None, y_train.shape[1]], name=\"input_y\")\n",
250 | "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n",
251 | "\n",
252 | "# sequence_length-最长词汇数\n",
253 | "sequence_length=x_train.shape[1]\n",
254 | "# num_classes-分类数\n",
255 | "num_classes=y_train.shape[1]\n",
256 | "# vocab_size-总词汇数\n",
257 | "vocab_size=len(vocab_processor.vocabulary_)\n",
258 | "# embedding_size-词向量长度\n",
259 | "embedding_size=FLAGS.embedding_dim\n",
260 | "# filter_sizes-卷积核尺寸3,4,5\n",
261 | "filter_sizes=list(map(int, FLAGS.filter_sizes.split(\",\")))\n",
262 | "# num_filters-卷积核数量\n",
263 | "num_filters=FLAGS.num_filters\n",
264 | " \n",
265 | "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n",
266 | "# shape:[None, sequence_length, embedding_size]\n",
267 | "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n",
268 | "# 添加一个维度,shape:[None, sequence_length, embedding_size, 1]\n",
269 | "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n",
270 | "\n",
271 | "# Create a convolution + maxpool layer for each filter size\n",
272 | "pooled_outputs = []\n",
273 | "for i, filter_size in enumerate(filter_sizes):\n",
274 | " with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n",
275 | " # Convolution Layer\n",
276 | " filter_shape = [filter_size, embedding_size, 1, num_filters]\n",
277 | " W = tf.Variable(\n",
278 | " tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n",
279 | " b = tf.Variable(\n",
280 | " tf.constant(0.1, shape=[num_filters]), name=\"b\")\n",
281 | " conv = tf.nn.conv2d(\n",
282 | " embedded_chars_expanded,\n",
283 | " W,\n",
284 | " strides=[1, 1, 1, 1],\n",
285 | " padding=\"VALID\",\n",
286 | " name=\"conv\")\n",
287 | " # Apply nonlinearity\n",
288 | " h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n",
289 | " # Maxpooling over the outputs\n",
290 | " pooled = tf.nn.max_pool(\n",
291 | " h,\n",
292 | " ksize=[1, sequence_length - filter_size + 1, 1, 1],\n",
293 | " strides=[1, 1, 1, 1],\n",
294 | " padding='VALID',\n",
295 | " name=\"pool\")\n",
296 | " pooled_outputs.append(pooled)\n",
297 | "\n",
298 | "# Combine all the pooled features\n",
299 | "num_filters_total = num_filters * len(filter_sizes)\n",
300 | "print(\"num_filters_total:\", num_filters_total)\n",
301 | "h_pool = tf.concat(pooled_outputs, 3)\n",
302 | "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n",
303 | "\n",
304 | "# Add dropout\n",
305 | "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n",
306 | "\n",
307 | "# Final (unnormalized) scores and predictions\n",
308 | "with tf.name_scope(\"output\"):\n",
309 | " W = tf.get_variable(\n",
310 | " \"W\",\n",
311 | " shape=[num_filters_total, num_classes],\n",
312 | " initializer=tf.contrib.layers.xavier_initializer())\n",
313 | " b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n",
314 | " scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")\n",
315 | " \n",
316 | "# 定义loss\n",
317 | "with tf.name_scope(\"loss\"):\n",
318 | " loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=scores, labels=input_y))\n",
319 | "\n",
320 | "# 定义优化器\n",
321 | "with tf.name_scope(\"optimizer\"):\n",
322 | " optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 7,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "# 生成批次数据\n",
334 | "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n",
335 | " \"\"\"\n",
336 | " Generates a batch iterator for a dataset.\n",
337 | " \"\"\"\n",
338 | " data = np.array(data)\n",
339 | " data_size = len(data)\n",
340 | " # 每个epoch的num_batch\n",
341 | " num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n",
342 | " print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n",
343 | " for epoch in range(num_epochs):\n",
344 | " # Shuffle the data at each epoch\n",
345 | " if shuffle:\n",
346 | " shuffle_indices = np.random.permutation(np.arange(data_size))\n",
347 | " shuffled_data = data[shuffle_indices]\n",
348 | " else:\n",
349 | " shuffled_data = data\n",
350 | " for batch_num in range(num_batches_per_epoch):\n",
351 | " start_index = batch_num * batch_size\n",
352 | " end_index = min((batch_num + 1) * batch_size, data_size)\n",
353 | " yield shuffled_data[start_index:end_index]"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {
360 | "collapsed": true
361 | },
362 | "outputs": [],
363 | "source": [
364 | "# 知乎提供的评测方案\n",
365 | "def eval(predict_label_and_marked_label_list):\n",
366 | " \"\"\"\n",
367 | " :param predict_label_and_marked_label_list: 一个元组列表。例如\n",
368 | " [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n",
369 | " ([3, 2, 1, 4, 7], [5, 7, 3])\n",
370 | " ]\n",
371 | " 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6]\n",
372 | " \n",
373 | " marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为\n",
374 | " [0,0,0,1,1] (4,5命中)\n",
375 | " [1,0,0,0,1] (3,7命中)\n",
376 | "\n",
377 | " \"\"\"\n",
378 | " right_label_num = 0 #总命中标签数量\n",
379 | " right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量\n",
380 | " sample_num = 0 #总问题数量\n",
381 | " all_marked_label_num = 0 #总标签数量\n",
382 | " for predict_labels, marked_labels in predict_label_and_marked_label_list:\n",
383 | " sample_num += 1\n",
384 | " marked_label_set = set(marked_labels)\n",
385 | " all_marked_label_num += len(marked_label_set)\n",
386 | " for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):\n",
387 | " if label in marked_label_set: #命中\n",
388 | " right_label_num += 1\n",
389 | " right_label_at_pos_num[pos] += 1\n",
390 | "\n",
391 | " precision = 0.0\n",
392 | " for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n",
393 | " precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2\n",
394 | " recall = float(right_label_num) / all_marked_label_num\n",
395 | "\n",
396 | " return 2*(precision * recall) / (precision + recall )"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {
403 | "collapsed": false
404 | },
405 | "outputs": [
406 | {
407 | "name": "stdout",
408 | "output_type": "stream",
409 | "text": [
410 | "num_batches_per_epoch: 4219\n"
411 | ]
412 | }
413 | ],
414 | "source": [
415 | "# 定义saver,只保存最新的5个模型\n",
416 | "saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)\n",
417 | "\n",
418 | "with tf.Session() as sess:\n",
419 | " predict_top_5 = tf.nn.top_k(scores, k=5)\n",
420 | " label_top_5 = tf.nn.top_k(input_y, k=5) \n",
421 | " sess.run(tf.global_variables_initializer())\n",
422 | " i = 0\n",
423 | " # 生成数据\n",
424 | " batches = batch_iter(\n",
425 | " list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)\n",
426 | " for batch in batches:\n",
427 | " i = i + 1\n",
428 | " # 得到一个batch的数据\n",
429 | " x_batch, y_batch = zip(*batch)\n",
430 | " # 优化模型\n",
431 | " sess.run([optimizer],feed_dict={input_x:x_batch, input_y:y_batch, dropout_keep_prob:FLAGS.dropout_keep_prob})\n",
432 | "\n",
433 | " # 每训练50次测试1次\n",
434 | " if (i % FLAGS.evaluate_every == 0):\n",
435 | " print (\"Evaluation:step\",i)\n",
436 | " predict_5, label_5, _loss = sess.run([predict_top_5,label_top_5,loss],feed_dict={input_x:x_batch,\n",
437 | " input_y:y_batch,\n",
438 | " dropout_keep_prob:1.0})\n",
439 | " print (\"label:\",label_5[1][:5])\n",
440 | " print (\"predict:\",predict_5[1][:5])\n",
441 | " print (\"predict:\",predict_5[0][:5])\n",
442 | " print (\"loss:\",_loss)\n",
443 | " predict_label_and_marked_label_list = []\n",
444 | " for predict,label in zip(predict_5[1],label_5[1]):\n",
445 | " predict_label_and_marked_label_list.append((list(predict),list(label)))\n",
446 | " score = eval(predict_label_and_marked_label_list)\n",
447 | " print(\"score:\",score)\n",
448 | "\n",
449 | " # 每训练200次保存1次模型\n",
450 | " if (i % FLAGS.checkpoint_every == 0):\n",
451 | " path = saver.save(sess, \"models/model\", global_step=i)\n",
452 | " print(\"Saved model checkpoint to {}\".format(path))"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "metadata": {
459 | "collapsed": false
460 | },
461 | "outputs": [],
462 | "source": []
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {
468 | "collapsed": false
469 | },
470 | "outputs": [],
471 | "source": []
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": []
481 | }
482 | ],
483 | "metadata": {
484 | "anaconda-cloud": {},
485 | "kernelspec": {
486 | "display_name": "Python [default]",
487 | "language": "python",
488 | "name": "python3"
489 | },
490 | "language_info": {
491 | "codemirror_mode": {
492 | "name": "ipython",
493 | "version": 3
494 | },
495 | "file_extension": ".py",
496 | "mimetype": "text/x-python",
497 | "name": "python",
498 | "nbconvert_exporter": "python",
499 | "pygments_lexer": "ipython3",
500 | "version": "3.5.2"
501 | }
502 | },
503 | "nbformat": 4,
504 | "nbformat_minor": 2
505 | }
506 |
--------------------------------------------------------------------------------
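
Both cnn.py and cnn.ipynb read their predictions out of tf.nn.top_k, which returns a (values, indices) pair; the code uses predict_5[1] for the predicted label positions and predict_5[0] for their scores. A minimal illustration in the same TF 1.x style as the scripts:

    import tensorflow as tf

    scores = tf.constant([[0.1, 0.9, 0.3]])
    with tf.Session() as sess:
        values, indices = sess.run(tf.nn.top_k(scores, k=2))
    print(indices)  # [[1 2]]
    print(values)   # approximately [[0.9 0.3]]
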
/Tensorflow基础使用与文本分类应用/程序/zhihu_eval.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "51CTO课程频道:http://edu.51cto.com/lecturer/index/user_id-12330098.html
\n",
8 | "优酷频道:http://i.youku.com/sdxxqbf
\n",
9 | "微信公众号:深度学习与神经网络
\n",
10 | "Github:https://github.com/Qinbf
"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "#coding:utf-8\n",
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "from tqdm import tqdm\n",
25 | "import tensorflow as tf\n",
26 | "import pickle\n",
27 | "import math\n",
28 | "from six.moves import xrange"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | " 0 1 \\\n",
43 | "0 6215603645409872328 c924,c531,c102,c284,c188,c104,c98,c107,c11,c11... \n",
44 | "1 6649324930261961840 c346,c1549,c413,c294,c675,c504,c183,c74,c541,c... \n",
45 | "2 -4251899610700378615 c96,c97,c97,c98,c99,c100,c101,c141,c42,c42,c10... \n",
46 | "3 6213817087034420233 c504,c157,c221,c221,c633,c468,c469,c1637,c1072... \n",
47 | "4 -8930652370334418373 c0,c310,c35,c122,c123,c11,c317,c91,c175,c476,c... \n",
48 | "\n",
49 | " 2 \\\n",
50 | "0 w1340,w1341,w55,w1344,w58,w6,w24178,w26959,w47... \n",
51 | "1 w40132,w1357,w1556,w1380,w2464,w33,w16791,w109... \n",
52 | "2 w53,w54,w1779,w54,w1309,w54,w369,w949,w65587,w... \n",
53 | "3 w5083,w12537,w10427,w29724,w6,w2566,w11,w18476... \n",
54 | "4 w33792,w21,w83,w6,w21542,w21,w140670,w25,w1110... \n",
55 | "\n",
56 | " 3 \\\n",
57 | "0 c1128,c529,c636,c572,c1321,c139,c540,c223,c510... \n",
58 | "1 NaN \n",
59 | "2 c149,c148,c148,c42,c185,c95,c95,c186,c186,c186... \n",
60 | "3 c15,c131,c39,c40,c85,c166,c969,c2456,c17,c636,... \n",
61 | "4 NaN \n",
62 | "\n",
63 | " 4 \n",
64 | "0 w4094,w1618,w20104,w19234,w1097,w1005,w4228,w2... \n",
65 | "1 NaN \n",
66 | "2 NaN \n",
67 | "3 w2550,w24,w239,w98,w19456,w11,w108710,w3483,w2... \n",
68 | "4 NaN \n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "# 导入question_train_set\n",
74 | "reader = pd.read_table('./ieee_zhihu_cup/question_eval_set.txt',sep='\\t',header=None)\n",
75 | "print(reader.iloc[0:5])"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "('max_document_length:', 76)\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "# 计算一段文本中最大词汇数\n",
95 | "x_text = reader.iloc[:,2]\n",
96 | "max_document_length = 0\n",
97 | "for i,line in enumerate(x_text):\n",
98 | " try:\n",
99 | " temp = line.split(',')\n",
100 | " max_document_length = max(max_document_length,len(temp))\n",
101 | " except:\n",
102 | " # 其中有一行数据为空\n",
103 | " pass\n",
104 | "# x_text[i] = \" \"\n",
105 | "\n",
106 | "print(\"max_document_length:\",max_document_length)\n",
107 | "\n",
108 | "# 载入字典\n",
109 | "vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(\"vocab_dict\")"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 4,
115 | "metadata": {
116 | "collapsed": true
117 | },
118 | "outputs": [],
119 | "source": [
120 | "# 按','切分数据\n",
121 | "text = []\n",
122 | "for line in x_text:\n",
123 | " try:\n",
124 | " text.append(line.split(','))\n",
125 | " except:\n",
126 | " # 其中有一行数据为空\n",
127 | " text.append(' ')"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 5,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [
137 | {
138 | "name": "stderr",
139 | "output_type": "stream",
140 | "text": [
141 | "100%|██████████| 217360/217360 [00:05<00:00, 40820.07it/s]\n"
142 | ]
143 | },
144 | {
145 | "data": {
146 | "text/plain": [
147 | "array([[ 4507, 2664, 423, 3387, 425, 10, 84669, 1744,\n",
148 | " 152, 13, 90, 152, 1556, 403, 17192, 10,\n",
149 | " 3686, 13, 0, 0, 0, 0, 0, 0,\n",
150 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
151 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
152 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
153 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
154 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
155 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
156 | " 0, 0, 0, 0],\n",
157 | " [ 18531, 861, 1538, 490, 16758, 197, 4225, 658,\n",
158 | " 18551, 10, 4100, 15, 1929, 52, 13, 0,\n",
159 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
160 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
161 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
162 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
163 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
164 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
165 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
166 | " 0, 0, 0, 0],\n",
167 | " [ 1207, 19, 810, 19, 126081, 19, 501, 2249,\n",
168 | " 85078, 35, 218, 308, 99, 105, 313, 13,\n",
169 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
170 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
171 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
172 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
173 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
174 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
175 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
176 | " 0, 0, 0, 0],\n",
177 | " [ 1040, 11856, 360, 23102, 10, 4100, 4, 432,\n",
178 | " 17, 1424, 0, 13, 0, 0, 0, 0,\n",
179 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
180 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
181 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
182 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
183 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
184 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
185 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
186 | " 0, 0, 0, 0],\n",
187 | " [ 3538, 137, 1628, 10, 8450, 137, 0, 16,\n",
188 | " 17, 13, 0, 0, 0, 0, 0, 0,\n",
189 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
190 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
191 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
192 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
193 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
194 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
195 | " 0, 0, 0, 0, 0, 0, 0, 0,\n",
196 | " 0, 0, 0, 0]])"
197 | ]
198 | },
199 | "execution_count": 5,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "# 把数据集变成编号的形式\n",
206 | "x = []\n",
207 | "for line in tqdm(text):\n",
208 | " line_len = len(line)\n",
209 | " text2num = []\n",
210 | " for i in xrange(max_document_length):\n",
211 | " if(i < line_len):\n",
212 | " try:\n",
213 | " text2num.append(vocab_processor.vocabulary_.get(line[i])) # 把词转为数字\n",
214 | " except:\n",
215 | " text2num.append(0) # 没有对应的词\n",
216 | " else:\n",
217 | " text2num.append(0) # 填充0\n",
218 | " x.append(text2num)\n",
219 | "x = np.array(x)\n",
220 | "x[:5]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 6,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [],
230 | "source": [
231 | "def batch_iter(data, batch_size, num_epochs, shuffle=False):\n",
232 | " \"\"\"\n",
233 | " Generates a batch iterator for a dataset.\n",
234 | " \"\"\"\n",
235 | " data = np.array(data)\n",
236 | " data_size = len(data)\n",
237 | " # 每个epoch的num_batch\n",
238 | " num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1\n",
239 | " print(\"num_batches_per_epoch:\",num_batches_per_epoch)\n",
240 | " for epoch in range(num_epochs):\n",
241 | " # Shuffle the data at each epoch\n",
242 | " if shuffle:\n",
243 | " shuffle_indices = np.random.permutation(np.arange(data_size))\n",
244 | " shuffled_data = data[shuffle_indices]\n",
245 | " else:\n",
246 | " shuffled_data = data\n",
247 | " for batch_num in range(num_batches_per_epoch):\n",
248 | " start_index = batch_num * batch_size\n",
249 | " end_index = min((batch_num + 1) * batch_size, data_size)\n",
250 | " yield shuffled_data[start_index:end_index]"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 7,
256 | "metadata": {
257 | "collapsed": false
258 | },
259 | "outputs": [],
260 | "source": [
261 | "def eval(predict_label_and_marked_label_list):\n",
262 | " \"\"\"\n",
263 | " :param predict_label_and_marked_label_list: 一个元组列表。例如\n",
264 | " [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),\n",
265 | " ([3, 2, 1, 4, 7], [5, 7, 3])\n",
266 | " ]\n",
267 | " 需要注意这里 predict_label 是去重复的,例如 [1,2,3,2,4,1,6],去重后变成[1,2,3,4,6]\n",
268 | " \n",
269 | " marked_label_list 本身没有顺序性,但提交结果有,例如上例的命中情况分别为\n",
270 | " [0,0,0,1,1] (4,5命中)\n",
271 | " [1,0,0,0,1] (3,7命中)\n",
272 | "\n",
273 | " \"\"\"\n",
274 | " right_label_num = 0 #总命中标签数量\n",
275 | " right_label_at_pos_num = [0, 0, 0, 0, 0] #在各个位置上总命中数量\n",
276 | " sample_num = 0 #总问题数量\n",
277 | " all_marked_label_num = 0 #总标签数量\n",
278 | " for predict_labels, marked_labels in predict_label_and_marked_label_list:\n",
279 | " sample_num += 1\n",
280 | " marked_label_set = set(marked_labels)\n",
281 | " all_marked_label_num += len(marked_label_set)\n",
282 | " for pos, label in zip(range(0, min(len(predict_labels), 5)), predict_labels):\n",
283 | " if label in marked_label_set: #命中\n",
284 | " right_label_num += 1\n",
285 | " right_label_at_pos_num[pos] += 1\n",
286 | "\n",
287 | " precision = 0.0\n",
288 | " for pos, right_num in zip(range(0, 5), right_label_at_pos_num):\n",
289 | " precision += ((right_num / float(sample_num))) / math.log(2.0 + pos) # 下标0-4 映射到 pos1-5 + 1,所以最终+2\n",
290 | " recall = float(right_label_num) / all_marked_label_num\n",
291 | "\n",
292 | " return 2*(precision * recall) / (precision + recall )"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 8,
298 | "metadata": {
299 | "collapsed": false
300 | },
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "('num_filters_total:', 3072)\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "# 定义三个placeholder\n",
312 | "input_x = tf.placeholder(tf.int32, [None, x.shape[1]], name=\"input_x\")\n",
313 | "dropout_keep_prob = tf.placeholder(tf.float32, name=\"dropout_keep_prob\")\n",
314 | "\n",
315 | "# sequence_length-最长词汇数\n",
316 | "sequence_length=x.shape[1]\n",
317 | "# num_classes-分类数\n",
318 | "num_classes=1999\n",
319 | "# vocab_size-总词汇数\n",
320 | "vocab_size=len(vocab_processor.vocabulary_)\n",
321 | "# embedding_size-词向量长度\n",
322 | "embedding_size=256\n",
323 | "# filter_sizes-卷积核尺寸3,4,5\n",
324 | "filter_sizes=list(map(int, [3,4,5]))\n",
325 | "# num_filters-卷积核数量\n",
326 | "num_filters=1024\n",
327 | "\n",
328 | "Weights = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name=\"Weights\")\n",
329 | "# [None, sequence_length, embedding_size]\n",
330 | "embedded_chars = tf.nn.embedding_lookup(Weights, input_x)\n",
331 | "# 添加一个维度,[None, sequence_length, embedding_size, 1]\n",
332 | "embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)\n",
333 | "# Create a convolution + maxpool layer for each filter size\n",
334 | "pooled_outputs = []\n",
335 | "for i, filter_size in enumerate(filter_sizes):\n",
336 | " with tf.name_scope(\"conv-maxpool-%s\" % filter_size):\n",
337 | " # Convolution Layer\n",
338 | " filter_shape = [filter_size, embedding_size, 1, num_filters]\n",
339 | " W = tf.Variable(\n",
340 | " tf.truncated_normal(filter_shape, stddev=0.1), name=\"W\")\n",
341 | " b = tf.Variable(\n",
342 | " tf.constant(0.1, shape=[num_filters]), name=\"b\")\n",
343 | " conv = tf.nn.conv2d(\n",
344 | " embedded_chars_expanded,\n",
345 | " W,\n",
346 | " strides=[1, 1, 1, 1],\n",
347 | " padding=\"VALID\",\n",
348 | " name=\"conv\")\n",
349 | " # Apply nonlinearity\n",
350 | " h = tf.nn.relu(tf.nn.bias_add(conv, b), name=\"relu\")\n",
351 | " # Maxpooling over the outputs\n",
352 | " pooled = tf.nn.max_pool(\n",
353 | " h,\n",
354 | " ksize=[1, sequence_length - filter_size + 1, 1, 1],\n",
355 | " strides=[1, 1, 1, 1],\n",
356 | " padding='VALID',\n",
357 | " name=\"pool\")\n",
358 | " pooled_outputs.append(pooled)\n",
359 | "\n",
360 | "# Combine all the pooled features\n",
361 | "num_filters_total = num_filters * len(filter_sizes)\n",
362 | "print(\"num_filters_total:\", num_filters_total)\n",
363 | "h_pool = tf.concat(pooled_outputs, 3)\n",
364 | "h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])\n",
365 | "\n",
366 | "# Add dropout\n",
367 | "with tf.name_scope(\"dropout\"):h_drop = tf.nn.dropout(h_pool_flat,dropout_keep_prob)\n",
368 | "\n",
369 | "# Final (unnormalized) scores and predictions\n",
370 | "with tf.name_scope(\"output\"):\n",
371 | " W = tf.get_variable(\n",
372 | " \"W\",\n",
373 | " shape=[num_filters_total, num_classes],\n",
374 | " initializer=tf.contrib.layers.xavier_initializer())\n",
375 | " b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name=\"b\")\n",
376 | " scores = tf.nn.xw_plus_b(h_drop, W, b, name=\"scores\")"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 9,
382 | "metadata": {
383 | "collapsed": false
384 | },
385 | "outputs": [
386 | {
387 | "name": "stdout",
388 | "output_type": "stream",
389 | "text": [
390 | "INFO:tensorflow:Restoring parameters from ./models/model_-7200\n",
391 | "('num_batches_per_epoch:', 218)\n",
392 | "('Evaluation:step', 5)\n",
393 | "('Evaluation:step', 10)\n",
394 | "('Evaluation:step', 15)\n",
395 | "('Evaluation:step', 20)\n",
396 | "('Evaluation:step', 25)\n",
397 | "('Evaluation:step', 30)\n",
398 | "('Evaluation:step', 35)\n",
399 | "('Evaluation:step', 40)\n",
400 | "('Evaluation:step', 45)\n",
401 | "('Evaluation:step', 50)\n",
402 | "('Evaluation:step', 55)\n",
403 | "('Evaluation:step', 60)\n",
404 | "('Evaluation:step', 65)\n",
405 | "('Evaluation:step', 70)\n",
406 | "('Evaluation:step', 75)\n",
407 | "('Evaluation:step', 80)\n",
408 | "('Evaluation:step', 85)\n",
409 | "('Evaluation:step', 90)\n",
410 | "('Evaluation:step', 95)\n",
411 | "('Evaluation:step', 100)\n",
412 | "('Evaluation:step', 105)\n",
413 | "('Evaluation:step', 110)\n",
414 | "('Evaluation:step', 115)\n",
415 | "('Evaluation:step', 120)\n",
416 | "('Evaluation:step', 125)\n",
417 | "('Evaluation:step', 130)\n",
418 | "('Evaluation:step', 135)\n",
419 | "('Evaluation:step', 140)\n",
420 | "('Evaluation:step', 145)\n",
421 | "('Evaluation:step', 150)\n",
422 | "('Evaluation:step', 155)\n",
423 | "('Evaluation:step', 160)\n",
424 | "('Evaluation:step', 165)\n",
425 | "('Evaluation:step', 170)\n",
426 | "('Evaluation:step', 175)\n",
427 | "('Evaluation:step', 180)\n",
428 | "('Evaluation:step', 185)\n",
429 | "('Evaluation:step', 190)\n",
430 | "('Evaluation:step', 195)\n",
431 | "('Evaluation:step', 200)\n",
432 | "('Evaluation:step', 205)\n",
433 | "('Evaluation:step', 210)\n",
434 | "('Evaluation:step', 215)\n"
435 | ]
436 | }
437 | ],
438 | "source": [
439 | "# 选择模型\n",
440 | "checkpoint_file = \"./models/model-10000\"\n",
441 | " \n",
442 | "with tf.Session() as sess:\n",
443 | " predict_top_5 = tf.nn.top_k(scores, k=5)\n",
444 | " sess.run(tf.global_variables_initializer())\n",
445 | " i = 0\n",
446 | " saver = tf.train.Saver()\n",
447 | " saver.restore(sess, checkpoint_file)\n",
448 | "\n",
449 | " # Generate batches\n",
450 | " batches = batch_iter(list(x), 1000, 1)\n",
451 | " \n",
452 | " for x_batch in batches:\n",
453 | " i = i + 1\n",
454 | " predict_5 = sess.run(predict_top_5,feed_dict={input_x:x_batch,dropout_keep_prob:1.0})\n",
455 | " if i == 1:\n",
456 | " predict = predict_5[1]\n",
457 | " else:\n",
458 | " predict = np.concatenate((predict,predict_5[1]))\n",
459 | " if (i%5==0):\n",
460 | " print (\"Evaluation:step\",i)\n",
461 | "\n",
462 | " np.savetxt(\"predict.txt\",predict,fmt='%d')"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {
469 | "collapsed": true
470 | },
471 | "outputs": [],
472 | "source": []
473 | }
474 | ],
475 | "metadata": {
476 | "anaconda-cloud": {},
477 | "kernelspec": {
478 | "display_name": "Python [default]",
479 | "language": "python",
480 | "name": "python3"
481 | },
482 | "language_info": {
483 | "codemirror_mode": {
484 | "name": "ipython",
485 | "version": 3
486 | },
487 | "file_extension": ".py",
488 | "mimetype": "text/x-python",
489 | "name": "python",
490 | "nbconvert_exporter": "python",
491 | "pygments_lexer": "ipython3",
492 | "version": "3.5.2"
493 | }
494 | },
495 | "nbformat": 4,
496 | "nbformat_minor": 2
497 | }
498 |
--------------------------------------------------------------------------------
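
A final note on the prediction loop in zhihu_eval above: growing `predict` with np.concatenate on every batch re-copies the whole accumulated array each step. Collecting the per-batch results in a list and concatenating once keeps the work linear; a sketch under the same names as the session block above:

    chunks = []
    for x_batch in batches:
        predict_5 = sess.run(predict_top_5,
                             feed_dict={input_x: x_batch, dropout_keep_prob: 1.0})
        chunks.append(predict_5[1])  # indices of the top-5 predicted labels
    predict = np.concatenate(chunks)
    np.savetxt("predict.txt", predict, fmt='%d')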