├── main.py ├── word2vec.py ├── visual.py ├── README.md ├── cnn_test.py ├── lstm_test.py ├── Cnn_Model.py ├── mixed_cnn_lstm_test.py ├── readdata.py ├── lstm_train.py ├── cnn_train.py ├── Lstm_Model.py └── weibo.py /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from weibo import start as spider 3 | from cnn_test import get_cnn_result 4 | from lstm_test import get_lstm_result 5 | from mixed_cnn_lstm_test import get_mixed_result 6 | from visual import show_emtion 7 | 8 | 9 | 10 | if __name__=="__main__": 11 | prediction = np.array([]) 12 | print("********************欢迎使用微博舆情分析工具***********************") 13 | url = input("请输入需要分析的微博url:\n") 14 | #调用weibo.py接口开始爬取相关微博评论 15 | spider(url) 16 | #选择模型 17 | model_index=int(input("请输入你想选择的AI模型:\n1.CNN\n2.LSTM\n3.CNN & LSTM融合模型\n")) 18 | #调用AI模型接口返回结果 19 | if model_index == 1: 20 | prediction=get_cnn_result() 21 | elif model_index == 2: 22 | prediction=get_lstm_result() 23 | elif model_index == 3: 24 | prediction=get_mixed_result() 25 | prediction=[i for i in prediction[0]] 26 | else: 27 | print("输入信息错误") 28 | #移交可视化模块完成数据视化 29 | print(prediction) 30 | show_emtion(prediction) -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import numpy as np 3 | 4 | 5 | 6 | def get_embedding_vector(sentences,embedding_model_path): 7 | print("loading word2vec model now...........") 8 | model=gensim.models.KeyedVectors.load_word2vec_format(embedding_model_path,binary=True) 9 | print("loading word2vec finished") 10 | all_sample_vector_lists=[] 11 | padding_embedding=np.array([0] * model.vector_size,dtype=np.float32) 12 | print("transform word to vector now.......") 13 | for sentence in sentences: 14 | sentence_vector = [] 15 | for word in sentence: 16 | if word in model.vocab: 17 | sentence_vector.append(model[word]) 18 | else: 19 | sentence_vector.append(padding_embedding) 20 | all_sample_vector_lists.append(sentence_vector) 21 | del sentence_vector 22 | print("transform word to vector finished") 23 | del sentences 24 | del model 25 | return all_sample_vector_lists -------------------------------------------------------------------------------- /visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def show_emtion(prediction): 5 | positive=0 6 | negative=0 7 | for i, num in enumerate(prediction): 8 | if num == 0: 9 | positive+=1 10 | else: 11 | negative+=1 12 | #用来正常显示中文标签 13 | plt.rcParams['font.sans-serif']=['SimHei'] 14 | #用来正常显示负号 15 | plt.rcParams['axes.unicode_minus']=False 16 | #调节图形大小(宽,高) 17 | plt.figure(figsize=(12,8)) 18 | #定义饼状图的标签,标签是列表 19 | labels = [u'喜悦',u'低落'] 20 | #每个标签的占比,不一定要和为100% 21 | sizes = [positive,negative] 22 | colors = ['lightskyblue','FireBrick'] 23 | explode = (0.05,0) 24 | patches,l_text,p_text = plt.pie(sizes,explode=explode,labels=labels,colors=colors,labeldistance = 1.1,autopct = '%3.1f%%',shadow = False,startangle = 90,pctdistance = 0.5) 25 | 26 | #改变文本的大小 27 | for t in l_text: 28 | t.set_size(20) 29 | for t in p_text: 30 | t.set_size(20) 31 | #圆 32 | plt.axis('equal') 33 | plt.legend() 34 | plt.show() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于微博评论的数据挖掘与情感分析 2 | **!!!大学作业不再更新!!!** 3 | 4 | ## 项目简介 
5 | 学习卷积神经网络,循环神经网络在实际环境下的应用,提升实践能力,了解深度学习在自然语言处理方面的进展 6 | 7 | ## cnn_for_text_classify 8 | 具备较强的自动关键词提取能力,在酒店评论测试集上达到95%的准确率 9 | 采用l2正则和dropout来控制过拟合现象 10 | 4种卷积核使其能提取局部高效的短特征 11 | 12 | ## lstm_for_text_classify 13 | 具有较强的对长难句,反问句,阴阳怪气句的判断能力,在在酒店评论测试集上达到97%的准确率 14 | 采用双向LSTM网络 15 | 对输入数据进行dropout,模拟增大样本空间 16 | LSTM层与层之间进行dropout 17 | 对LSTM网络权重,偏置进行l2正则,抗过拟合 18 | 网络采用正交初始化,加快收敛速度,提升训练集上的正确率,大幅提升测试集上的正确率 19 | 采用Clipping Gradients,防止梯度爆炸,提升测试集上的正确率 20 | 21 | ## word2vec: 22 | 项目使用的词向量:embedding_64.bin(1.5G) 23 | 训练语料:百度百科800w条 20G+搜狐新闻400w条 12G+小说:90G左右 24 | 模型参数:window=5 min_count=5 size=64 25 | 下载链接:[百度网盘链接](https://pan.baidu.com/s/19bDbZsFzLggx7q9iFn83Nw) 26 | 27 | 28 | ## 文件功能介绍 29 | ./ 30 | weibo.py:微博评论爬虫 31 | readdata.py:为情感分析模型提供多种数据加载相关API 32 | word2vec.py:为情感分析模型提供多种词向量的相关API 33 | cnn_model.py:CNN文本分类模型图结构 34 | cnn_train.py:CNN文本分类训练代码 35 | cnn_test.py: CNN文本分类测试代码 36 | lstm_model.py:lstm文本分类模型图结构 37 | lstm_train.py:lstm文本分类训练代码 38 | lstm_test.py: lstm文本分类测试代码 39 | mixed_cnn_lstm_test.py:采用模型融合方式将cnn与lstm的结果进行融合投票绝对最终结果 40 | visual.py:结果可视化 41 | main.py:项目主文件,调用各文件API,自动提取,分析,显示 42 | 43 | ./data 44 | happy.txt:开心评价 45 | angry.txt:愤怒评价 46 | unhappy:低落评价数据集 47 | embedding_64.bin:训练好的词向量模型 48 | ---/cnn:cnn模型训练完成的相关数据参数 49 | ---/lstm:lstm模型训练完成的相关数据参数 50 | 51 | ./summary 52 | ---/cnn:cnn的log和图结构 53 | -------/test:测试集上的log 54 | -------/test:训练集上的log 55 | ---/lstm:lstm的log和图结构 56 | -------/test:测试集上的log 57 | -------/test:训练集上的log 58 | 59 | ## 推荐运行环境 60 | python 3.6 61 | tensorflow-gpu 1.4 62 | gensim 3.3 63 | Ubuntu 64 Bit / windows10 64 Bit 64 | 65 | ## 使用模型注意事项 66 | 1.文本TXT文件必须采用UTF-8编码格式,非UTF-8格式的,去记事本中另存为的时候选择UTF-8 67 | 2.pos.txt、neg.txt、test.txt 文件一行为一条评论,长度不限,可以有英文和标点(反正都会去除的),不要词性标注信息 68 | 3.词向量模型一定要用我放的那个64维度的bin文件 69 | 4.模型代码在windows上测试过基本没bug,linux平台没测试过,不过肯定需要自行修改文件路径 70 | 5.测试集比率根据你的样本数量自行调整,太大容易造成显存不够导致失败 71 | 6.根据文件夹结构自行建立 72 | -------------------------------------------------------------------------------- /cnn_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import readdata 4 | import word2vec 5 | import os 6 | import cnn_model 7 | 8 | 9 | test_file_path="./data//test.txt" 10 | train_data_path="./data//cnn//training_params.pickle" 11 | embedding_model_path="./data//embedding_64.bin" 12 | 13 | class config(): 14 | test_sample_percentage=0.03 15 | num_labels=2 16 | embedding_size=64 17 | filter_sizes=[2,3,4] 18 | num_filters=128 19 | dropout_keep_prob=1 20 | l2_reg_lambda=0.1 21 | batch_size=32 22 | num_epochs=15 23 | max_sentences_length=0 24 | lr_rate=1e-3 25 | 26 | def get_cnn_result(): 27 | if not os.path.exists(embedding_model_path): 28 | print("word2vec model is not found") 29 | 30 | if not os.path.exists(train_data_path): 31 | print("train params is not found") 32 | 33 | params = readdata.loadDict(train_data_path) 34 | train_length = int(params['max_sentences_length']) 35 | 36 | 37 | 38 | test_sample_lists = readdata.get_cleaned_list(test_file_path) 39 | test_sample_lists,max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=train_length) 40 | test_sample_arrays=np.array(word2vec.get_embedding_vector(test_sample_lists,embedding_model_path)) 41 | testconfig=config() 42 | testconfig.max_sentences_length=max_sentences_length 43 | 44 | sess=tf.InteractiveSession() 45 | cnn=cnn_model.TextCNN(config=testconfig) 46 | 47 | #加载参数 48 | saver = tf.train.Saver() 49 | saver.restore(sess, "./data/cnn/text_model") 50 | 51 | #定义测试函数 
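# test_step below feeds one batch of already-embedded sentences (shape [batch, max_sentences_length, 64])
# into the restored graph with dropout disabled (keep_prob = 1.0). It returns the argmax class index per
# sample (1 = positive, 0 = negative, matching the [0,1]/[1,0] one-hot labels built in readdata.py)
# together with the softmax probabilities.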
52 | def test_step(x_batch): 53 | feed_dict={ 54 | cnn.input_x:x_batch, 55 | cnn.dropout_keep_prob:1.0 56 | } 57 | predictions,scores=sess.run( 58 | [cnn.predictions,cnn.softmax_result], 59 | feed_dict=feed_dict 60 | ) 61 | return (predictions,scores) 62 | 63 | 64 | #拿到结果 65 | predictions,scores=test_step(test_sample_arrays) 66 | return np.array(predictions) 67 | #print("(0->neg & 1->pos)the result is:") 68 | #print(predictions) 69 | #print("********************************") 70 | #print("the scores is:") 71 | #print(scores) 72 | #print(scores.shape) 73 | -------------------------------------------------------------------------------- /lstm_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import lstm_model 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | 10 | #文件路径 11 | current_path=os.path.abspath(os.curdir) 12 | test_file_path="./data//test.txt" 13 | embedding_model_path="./data//embedding_64.bin" 14 | train_data_path="./data//lstm//training_params.pickle" 15 | 16 | 17 | #模型超参 18 | class config(): 19 | test_sample_percentage=0.03 20 | num_labels=2 21 | embedding_size=64 22 | dropout_keep_prob=1 23 | batch_size=64 24 | num_epochs=80 25 | max_sentences_length=40 26 | num_layers=2 27 | max_grad_norm=5 28 | l2_rate=0.0001 29 | 30 | 31 | def get_lstm_result(): 32 | if not os.path.exists(embedding_model_path): 33 | print("word2vec model is not found") 34 | 35 | if not os.path.exists(train_data_path): 36 | print("train params is not found") 37 | 38 | params = readdata.loadDict(train_data_path) 39 | train_length = int(params['max_sentences_length']) 40 | 41 | 42 | 43 | test_sample_lists = readdata.get_cleaned_list(test_file_path) 44 | test_sample_lists,max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=train_length) 45 | test_sample_arrays=np.array(word2vec.get_embedding_vector(test_sample_lists,embedding_model_path)) 46 | testconfig=config() 47 | testconfig.max_sentences_length=max_sentences_length 48 | 49 | 50 | sess=tf.InteractiveSession() 51 | lstm=lstm_model.TextLSTM(config=testconfig) 52 | 53 | saver = tf.train.Saver() 54 | saver.restore(sess, "./data/lstm/text_model") 55 | 56 | #定义测试函数 57 | def test_step(x_batch): 58 | feed_dict={ 59 | lstm.input_x:x_batch, 60 | lstm.dropout_keep_prob:testconfig.dropout_keep_prob 61 | } 62 | predictions,scores=sess.run( 63 | [lstm.predictions,lstm.softmax_result], 64 | feed_dict=feed_dict 65 | ) 66 | return (predictions,scores) 67 | 68 | predictions, scores=test_step(test_sample_arrays) 69 | return np.array(predictions) 70 | #print("(0->neg & 1->pos)the result is:") 71 | #print(predictions) 72 | #print("********************************") 73 | #print("the scores is:") 74 | #print(scores) 75 | -------------------------------------------------------------------------------- /Cnn_Model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TextCNN(object): 5 | def __init__(self, config): 6 | sequence_length = config.max_sentences_length 7 | num_classes = config.num_labels 8 | embedding_size = config.embedding_size 9 | filter_sizes = config.filter_sizes 10 | num_filters = config.num_filters 11 | l2_reg_lambda = config.l2_reg_lambda 12 | l2_loss = tf.constant(0.0) 13 | pooled_outputs = [] 14 | 15 | 16 | self.input_x=tf.placeholder(tf.float32,[None,sequence_length,embedding_size],name="input_x") 17 | 
self.input_y=tf.placeholder(tf.float32,[None,num_classes],name="input_y") 18 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_rate") 19 | self.learning_rate=tf.placeholder(tf.float32,name="lr") 20 | 21 | 22 | self.input_x_expended=tf.expand_dims(self.input_x,-1) 23 | 24 | 25 | for filter_size in filter_sizes: 26 | with tf.name_scope("conv-maxpool-%s" % filter_size): 27 | #[filter_height, filter_width, in_channels, out_channels] 28 | filter_shape=[filter_size,embedding_size,1,num_filters] 29 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 30 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 31 | 32 | 33 | #添加卷积层 34 | conv=tf.nn.conv2d( 35 | self.input_x_expended, 36 | W, 37 | strides=[1,1,1,1], 38 | padding="VALID", 39 | name="conv" 40 | ) 41 | 42 | 43 | #添加偏置 & relu激活函数 44 | h=tf.nn.relu(tf.nn.bias_add(conv,b),name="relu") 45 | 46 | 47 | #添加最大池化层 48 | pooled=tf.nn.max_pool( 49 | h, 50 | ksize=[1,sequence_length-filter_size+1,1,1], #[对1个句子 卷积值hight 卷积值width 1个channel] 51 | strides=[1,1,1,1], 52 | padding="VALID", 53 | name="pool" 54 | ) 55 | pooled_outputs.append(pooled) 56 | 57 | num_filters_total = num_filters * len(filter_sizes) 58 | self.h_pooled=tf.concat(pooled_outputs, 3) 59 | self.h_pooled_flat=tf.reshape(self.h_pooled,[-1,num_filters_total]) 60 | 61 | 62 | #添加dropout层 63 | with tf.name_scope("dropout"): 64 | self.h_drop=tf.nn.dropout(self.h_pooled_flat, self.dropout_keep_prob) 65 | 66 | 67 | #添加分类层 68 | with tf.name_scope("output"): 69 | self.Weight = tf.get_variable( 70 | "Weight", 71 | shape=[num_filters_total, num_classes], 72 | initializer=tf.contrib.layers.xavier_initializer()) 73 | self.bias = tf.Variable(tf.constant(0.1, shape=[num_classes], name="bias")) 74 | l2_loss += tf.nn.l2_loss(self.Weight) 75 | l2_loss += tf.nn.l2_loss(self.bias) 76 | self.result=tf.matmul(self.h_drop,self.Weight)+self.bias 77 | self.predictions=tf.argmax(self.result,1,name="predictions") 78 | tf.summary.histogram("weight",self.Weight) 79 | tf.summary.histogram("bias",self.bias) 80 | self.softmax_result = tf.nn.softmax(self.result) 81 | 82 | 83 | #计算损失 84 | with tf.name_scope("loss"): 85 | losses=tf.nn.softmax_cross_entropy_with_logits(logits=self.result, labels=self.input_y) 86 | self.loss=tf.reduce_mean(losses)+l2_reg_lambda*l2_loss 87 | tf.summary.scalar("loss",self.loss) 88 | 89 | #计算正确率 90 | with tf.name_scope("accuracy"): 91 | correct_predictions=tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 92 | self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 93 | tf.summary.scalar("accuracy",self.accuracy) 94 | 95 | #训练操作 96 | with tf.name_scope("train_operation"): 97 | self.train_op=tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 98 | 99 | with tf.name_scope("summary"): 100 | self.merged=tf.summary.merge_all() -------------------------------------------------------------------------------- /mixed_cnn_lstm_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import lstm_model 5 | import cnn_model 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | 11 | #文件路径 12 | current_path=os.path.abspath(os.curdir) 13 | test_file_path="./data//test.txt" 14 | embedding_model_path="./data//embedding_64.bin" 15 | lstm_train_data_path="./data//lstm//training_params.pickle" 16 | cnn_train_data_path="./data//cnn//training_params.pickle" 17 | 18 | 19 | #模型超参 20 | class lstmconfig(): 21 | 
test_sample_percentage=0.03 22 | num_labels=2 23 | embedding_size=64 24 | dropout_keep_prob=1 25 | batch_size=64 26 | num_epochs=80 27 | max_sentences_length=40 28 | num_layers=2 29 | max_grad_norm=5 30 | l2_rate=0.0001 31 | 32 | class cnnconfig(): 33 | test_sample_percentage=0.03 34 | num_labels=2 35 | embedding_size=64 36 | filter_sizes=[2,3,4] 37 | num_filters=128 38 | dropout_keep_prob=1 39 | l2_reg_lambda=0.1 40 | batch_size=32 41 | num_epochs=15 42 | max_sentences_length=0 43 | lr_rate=1e-3 44 | 45 | def get_mixed_result(): 46 | if not os.path.exists(embedding_model_path): 47 | print("word2vec model is not found") 48 | 49 | if not os.path.exists(lstm_train_data_path): 50 | print("lstm train params is not found") 51 | 52 | lstm_params = readdata.loadDict(lstm_train_data_path) 53 | lstm_train_length = int(lstm_params['max_sentences_length']) 54 | 55 | if not os.path.exists(cnn_train_data_path): 56 | print("cnn train params is not found") 57 | 58 | cnn_params = readdata.loadDict(cnn_train_data_path) 59 | cnn_train_length = int(cnn_params['max_sentences_length']) 60 | 61 | 62 | test_sample_lists = readdata.get_cleaned_list(test_file_path) 63 | lstm_test_sample_lists,lstm_max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=lstm_train_length) 64 | cnn_test_sample_lists,cnn_max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=cnn_train_length) 65 | lstm_test_sample_arrays=np.array(word2vec.get_embedding_vector(lstm_test_sample_lists,embedding_model_path)) 66 | cnn_test_sample_arrays=np.array(word2vec.get_embedding_vector(cnn_test_sample_lists,embedding_model_path)) 67 | lstm_config=lstmconfig() 68 | cnn_config=cnnconfig() 69 | lstm_config.max_sentences_length=lstm_max_sentences_length 70 | cnn_config.max_sentences_length=cnn_max_sentences_length 71 | 72 | 73 | 74 | lstm_graph=tf.Graph() 75 | cnn_graph=tf.Graph() 76 | lstm_sess=tf.Session(graph=lstm_graph) 77 | cnn_sess=tf.Session(graph=cnn_graph) 78 | 79 | 80 | with lstm_sess.as_default(): 81 | with lstm_graph.as_default(): 82 | lstm = lstm_model.TextLSTM(config=lstm_config) 83 | lstm_saver = tf.train.Saver() 84 | lstm_saver.restore(lstm_sess, "./data/lstm/text_model") 85 | def lstm_test_step(x_batch): 86 | feed_dict={ 87 | lstm.input_x:x_batch, 88 | lstm.dropout_keep_prob:lstm_config.dropout_keep_prob 89 | } 90 | scores=lstm_sess.run( 91 | [lstm.softmax_result], 92 | feed_dict=feed_dict 93 | ) 94 | return scores 95 | 96 | 97 | lstm_scores = lstm_test_step(lstm_test_sample_arrays) 98 | 99 | 100 | with cnn_sess.as_default(): 101 | with cnn_graph.as_default(): 102 | cnn = cnn_model.TextCNN(config=cnn_config) 103 | cnn_saver = tf.train.Saver() 104 | cnn_saver.restore(cnn_sess, "./data/cnn/text_model") 105 | def cnn_test_step(x_batch): 106 | feed_dict={ 107 | cnn.input_x:x_batch, 108 | cnn.dropout_keep_prob:cnn_config.dropout_keep_prob 109 | } 110 | scores=cnn_sess.run( 111 | [cnn.softmax_result], 112 | feed_dict=feed_dict 113 | ) 114 | return scores 115 | 116 | 117 | cnn_scores = cnn_test_step(cnn_test_sample_arrays) 118 | 119 | lstm_sess.close() 120 | cnn_sess.close() 121 | mixed_scores=np.sum([lstm_scores,cnn_scores],axis=0) 122 | predictions=np.argmax(mixed_scores,axis=2) 123 | return np.array(predictions) -------------------------------------------------------------------------------- /readdata.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | import numpy as np 3 | 
import re 4 | import os 5 | import pickle 6 | import jieba 7 | 8 | 9 | def save(content,path): 10 | ''' 11 | 把content用pickle方式存到path里 12 | ''' 13 | f=open(path,'wb') 14 | pickle.dump(content,f) 15 | f.close() 16 | print("file has been saved") 17 | 18 | 19 | def clean_str(string): 20 | ''' 21 | 接收string,返回去除各种符号的string 22 | ''' 23 | string=re.sub("[^\u4e00-\u9fff]"," ",string) 24 | string = re.sub(r"\s{2,}", " ", string) 25 | return string 26 | 27 | 28 | def split_str(string): 29 | ''' 30 | 接收string,返回各个词间用空格隔开的string 31 | ''' 32 | return " ".join([word for word in jieba.cut(string,HMM=True)]) 33 | 34 | 35 | def get_cleaned_list(file_path): 36 | ''' 37 | 接收文件全路径,返回次txt文件的分词好的列表 38 | ''' 39 | print("read txt now..............") 40 | f=open(file_path,'r',encoding="utf8") 41 | lines=list(f.readlines()) 42 | lines=[clean_str(split_str(line)) for line in lines] 43 | f.close() 44 | print("read txt finished") 45 | return lines 46 | 47 | 48 | def padding_sentences(no_padding_lists, padding_token='',padding_sentence_length = None): 49 | ''' 50 | 接收句子列表,将所有句子填充为一样长 51 | ''' 52 | print("padding sentences now..............") 53 | all_sample_lists=[sentence.split(' ') for sentence in no_padding_lists] 54 | if padding_sentence_length != None: 55 | max_sentence_length=padding_sentence_length 56 | else: 57 | max_sentence_length=max([len(sentence) for sentence in all_sample_lists]) 58 | for i,sentence in enumerate(all_sample_lists): 59 | if len(sentence) > max_sentence_length: 60 | all_sample_lists[i]=sentence[:max_sentence_length] 61 | else: 62 | sentence.extend([padding_token] * (max_sentence_length - len(sentence))) 63 | print("padding sentences finished") 64 | return (all_sample_lists,max_sentence_length) 65 | 66 | 67 | def get_all_data_from_file(positive_file_path,negative_file_path,force_len=None): 68 | ''' 69 | positive_file_path:正评价txt全路径 70 | negative_file_path:负评价txt全路径 71 | ''' 72 | positive_sample_lists=get_cleaned_list(positive_file_path) 73 | negative_sample_lists=get_cleaned_list(negative_file_path) 74 | positive_label_lists=[[0,1] for _ in positive_sample_lists] 75 | negative_label_lists=[[1,0] for _ in negative_sample_lists] 76 | 77 | all_sample_lists = positive_sample_lists + negative_sample_lists #样本为list类型!! 78 | if force_len == None: 79 | all_sample_lists, max_sentences_length = padding_sentences(all_sample_lists) #样本为list类型!! 80 | else: 81 | all_sample_lists, max_sentences_length = padding_sentences(all_sample_lists,padding_token='',padding_sentence_length = force_len) # 样本为list类型!! 
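# Labels are one-hot: [0,1] marks a positive sample and [1,0] a negative one (built above).
# Concatenating them in the same positive-then-negative order as the samples keeps each
# sentence aligned with its label.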
82 | all_label_arrays=np.concatenate([positive_label_lists,negative_label_lists], 0) #标签为array类型 83 | 84 | return (all_sample_lists,all_label_arrays,max_sentences_length) 85 | 86 | 87 | def batch_iter(data, batch_size, num_epochs, shuffle=False): 88 | ''' 89 | 生成batches迭代对象 90 | ''' 91 | data = np.array(data) 92 | data_size = len(data) 93 | num_batches_per_epoch = int((data_size - 1) / batch_size) + 1 94 | for epoch in range(num_epochs): 95 | if shuffle: 96 | #顺序打乱 97 | shuffle_indices = np.random.permutation(np.arange(data_size)) 98 | shuffled_data = data[shuffle_indices] 99 | else: 100 | shuffled_data = data 101 | for batch_num in range(num_batches_per_epoch): 102 | start_idx = batch_num * batch_size 103 | end_idx = min((batch_num + 1) * batch_size, data_size) 104 | yield shuffled_data[start_idx : end_idx] 105 | 106 | 107 | def batch_iter_test(data, batch_size, num_epochs, shuffle=False): 108 | ''' 109 | 生成batches迭代对象 110 | ''' 111 | data = np.array(data) 112 | data_size = len(data) 113 | num_batches_per_epoch = int((data_size - 1) / batch_size) + 1 114 | for epoch in range(num_epochs): 115 | if shuffle: 116 | #顺序打乱 117 | shuffle_indices = np.random.permutation(np.arange(data_size)) 118 | shuffled_data = data[shuffle_indices] 119 | else: 120 | shuffled_data = data 121 | for batch_num in range(num_batches_per_epoch): 122 | start_idx = batch_num * batch_size 123 | end_idx = min((batch_num + 1) * batch_size, data_size) 124 | yield shuffled_data[start_idx : end_idx] 125 | 126 | 127 | def loadDict(train_data_path): 128 | f=open(train_data_path,'rb') 129 | params=pickle.load(f) 130 | return params -------------------------------------------------------------------------------- /lstm_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import lstm_model 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | 10 | #文件路径 11 | current_path=os.path.abspath(os.curdir) 12 | data_path="./data" 13 | positive_file_path="./data//pos.txt" 14 | negative_file_path="./data//neg.txt" 15 | embedding_model_path="./data//embedding_64.bin" 16 | train_data_path="./data//lstm//training_params.pickle" 17 | log_path="./summary//lstm" 18 | 19 | 20 | #模型超参 21 | class config(): 22 | test_sample_percentage=0.03 23 | num_labels=2 24 | embedding_size=64 25 | dropout_keep_prob=0.6 26 | batch_size=64 27 | num_epochs=80 28 | max_sentences_length=40 29 | num_layers=2 30 | max_grad_norm=5 31 | l2_rate=0.0001 32 | 33 | #加载数据 34 | all_sample_lists,all_label_arrays,max_sentences_length=readdata.get_all_data_from_file(positive_file_path,negative_file_path,force_len=40) 35 | all_sample_arrays=np.array(word2vec.get_embedding_vector(all_sample_lists,embedding_model_path)) 36 | del all_sample_lists 37 | print("sample.shape = {}".format(all_sample_arrays.shape)) 38 | print("label.shape = {}".format(all_label_arrays.shape)) 39 | trainconfig=config() 40 | trainconfig.max_sentences_length=max_sentences_length 41 | testconfig=config() 42 | testconfig.max_sentences_length=max_sentences_length 43 | testconfig.dropout_keep_prob=1.0 44 | 45 | #存储训练参数 46 | params={"num_labels":trainconfig.num_labels,"max_sentences_length":max_sentences_length} 47 | readdata.save(params,train_data_path) 48 | 49 | #打乱样本顺序 50 | np.random.seed(10) 51 | random_index=np.random.permutation(np.arange(len(all_label_arrays))) 52 | random_sample_arrays=all_sample_arrays[random_index] 53 | del all_sample_arrays 54 | random_label_arrays=all_label_arrays[random_index] 55 | 
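The block above shuffles samples and labels with one shared permutation, and the block below then slices off the first `test_sample_percentage` of the shuffled data as the held-out test set. A minimal standalone sketch of that pattern on toy arrays (the shapes and the 30% ratio are illustrative, not the real embedded data):

```python
import numpy as np

# Toy stand-ins for the embedded samples and one-hot labels.
samples = np.arange(20).reshape(10, 2)          # 10 "sentences", 2 features each
labels = np.eye(2)[[0, 1] * 5]                  # alternating one-hot labels

np.random.seed(10)
perm = np.random.permutation(len(labels))       # one permutation shared by both arrays
samples, labels = samples[perm], labels[perm]   # rows stay paired after shuffling

num_tests = int(0.3 * len(labels))              # illustrative 30% test split
test_x, train_x = samples[:num_tests], samples[num_tests:]
test_y, train_y = labels[:num_tests], labels[num_tests:]
print("Train/Test split: {:d}/{:d}".format(len(train_y), len(test_y)))  # -> 7/3
```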
56 | #按比例抽取测试样本 57 | num_tests=int(trainconfig.test_sample_percentage*len(all_label_arrays)) 58 | del all_label_arrays 59 | test_sample_arrays=random_sample_arrays[:num_tests] 60 | train_sample_arrays=random_sample_arrays[num_tests:] 61 | del random_sample_arrays 62 | train_label_arrays=random_label_arrays[num_tests:] 63 | test_label_arrays=random_label_arrays[:num_tests] 64 | del random_label_arrays 65 | print("Train/Test split: {:d}/{:d}".format(len(train_label_arrays), len(test_label_arrays))) 66 | 67 | 68 | #开始训练 69 | with tf.Graph().as_default(): 70 | sess=tf.Session() 71 | with sess.as_default(): 72 | lstm=lstm_model.TextLSTM(config=trainconfig) 73 | 74 | #初始化参数 75 | train_writer = tf.summary.FileWriter(log_path + '/train', sess.graph) 76 | test_writer = tf.summary.FileWriter(log_path + '/test') 77 | step_num=0 78 | sess.run(tf.global_variables_initializer()) 79 | saver=tf.train.Saver() 80 | 81 | 82 | #定义训练函数 83 | def train_step(x_batch,y_batch): 84 | feed_dict={ 85 | lstm.input_x:x_batch, 86 | lstm.input_y:y_batch, 87 | lstm.dropout_keep_prob:config.dropout_keep_prob, 88 | } 89 | merged,loss,accuracy,_=sess.run( 90 | [lstm.summary_op,lstm.loss,lstm.accuracy,lstm.train_op], 91 | feed_dict=feed_dict 92 | ) 93 | return (merged,loss,accuracy) 94 | 95 | #定义测试函数 96 | def test_step(x_batch,y_batch): 97 | feed_dict={ 98 | lstm.input_x:x_batch, 99 | lstm.input_y:y_batch, 100 | lstm.dropout_keep_prob:testconfig.dropout_keep_prob 101 | } 102 | merged,loss, accuracy,_=sess.run( 103 | [lstm.summary_op,lstm.loss,lstm.accuracy,lstm.train_op], 104 | feed_dict=feed_dict 105 | ) 106 | return (merged,loss,accuracy) 107 | 108 | #生成批数据 109 | batches=readdata.batch_iter( 110 | list(zip(train_sample_arrays,train_label_arrays)),trainconfig.batch_size,trainconfig.num_epochs) 111 | 112 | #正式开始训练啦 113 | for batch in batches: 114 | step_num += 1 115 | x_batch,y_batch=zip(*batch) 116 | merged,loss,accuracy=train_step(x_batch,y_batch) 117 | if step_num % 100 == 0: 118 | train_writer.add_summary(merged, step_num) 119 | print("For train_samples: step %d, loss %g, accuracy %g" % (step_num, loss, accuracy)) 120 | if step_num % 200 ==0: 121 | merged,loss,accuracy = test_step(test_sample_arrays, test_label_arrays) 122 | test_writer.add_summary(merged, step_num) 123 | print("For test_samples: step %d, loss %g, accuracy %g" % (step_num, loss, accuracy)) 124 | 125 | 126 | saver.save(sess,"data/lstm/text_model") -------------------------------------------------------------------------------- /cnn_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import cnn_model 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | 10 | #文件路径 11 | current_path=os.path.abspath(os.curdir) 12 | data_path="./data" 13 | positive_file_path="./data//pos.txt" 14 | negative_file_path="./data//neg.txt" 15 | embedding_model_path="./data//embedding_64.bin" 16 | train_data_path="./data//cnn//training_params.pickle" 17 | log_path="./summary//cnn" 18 | 19 | 20 | #模型超参 21 | class config(): 22 | test_sample_percentage=0.03 23 | num_labels=2 24 | embedding_size=64 25 | filter_sizes=[2,3,4] 26 | num_filters=128 27 | dropout_keep_prob=0.5 28 | l2_reg_lambda=0.1 29 | batch_size=32 30 | num_epochs=15 31 | max_sentences_length=0 32 | lr_rate=1e-3 33 | 34 | 35 | #加载数据 36 | all_sample_lists,all_label_arrays,max_sentences_length=readdata.get_all_data_from_file(positive_file_path,negative_file_path,force_len=40) 37 | 
all_sample_arrays=np.array(word2vec.get_embedding_vector(all_sample_lists,embedding_model_path)) 38 | del all_sample_lists 39 | print("sample.shape = {}".format(all_sample_arrays.shape)) 40 | print("label.shape = {}".format(all_label_arrays.shape)) 41 | trainconfig=config() 42 | trainconfig.max_sentences_length=max_sentences_length 43 | testconfig=config() 44 | testconfig.max_sentences_length=max_sentences_length 45 | testconfig.dropout_keep_prob=1.0 46 | 47 | 48 | #存储训练参数 49 | params={"num_labels":trainconfig.num_labels,"max_sentences_length":max_sentences_length} 50 | readdata.save(params,train_data_path) 51 | 52 | #打乱样本顺序 53 | np.random.seed(10) 54 | random_index=np.random.permutation(np.arange(len(all_label_arrays))) 55 | random_sample_arrays=all_sample_arrays[random_index] 56 | del all_sample_arrays 57 | random_label_arrays=all_label_arrays[random_index] 58 | #按比例抽取测试样本 59 | num_tests=int(trainconfig.test_sample_percentage*len(all_label_arrays)) 60 | del all_label_arrays,random_index 61 | test_sample_arrays=random_sample_arrays[:num_tests] 62 | train_sample_arrays=random_sample_arrays[num_tests:] 63 | del random_sample_arrays 64 | test_label_arrays=random_label_arrays[:num_tests] 65 | train_label_arrays=random_label_arrays[num_tests:] 66 | del random_label_arrays 67 | print("Train/Test split: {:d}/{:d}".format(len(train_label_arrays), len(test_label_arrays))) 68 | 69 | #开始训练 70 | with tf.Graph().as_default(): 71 | sess=tf.Session() 72 | with sess.as_default(): 73 | cnn=cnn_model.TextCNN(config=trainconfig) 74 | 75 | #初始化参数 76 | train_writer = tf.summary.FileWriter(log_path + '/train', sess.graph) 77 | test_writer = tf.summary.FileWriter(log_path + '/test') 78 | step_num=0 79 | sess.run(tf.global_variables_initializer()) 80 | saver=tf.train.Saver() 81 | 82 | 83 | #定义训练函数 84 | def train_step(x_batch,y_batch,lr_rate): 85 | feed_dict={ 86 | cnn.input_x:x_batch, 87 | cnn.input_y:y_batch, 88 | cnn.dropout_keep_prob:trainconfig.dropout_keep_prob, 89 | cnn.learning_rate:lr_rate 90 | } 91 | summary,loss,accuracy,_=sess.run( 92 | [cnn.merged,cnn.loss,cnn.accuracy,cnn.train_op], 93 | feed_dict=feed_dict 94 | ) 95 | return (summary,loss,accuracy) 96 | 97 | #定义测试函数 98 | def test_step(x_batch,y_batch): 99 | feed_dict={ 100 | cnn.input_x:x_batch, 101 | cnn.input_y:y_batch, 102 | cnn.dropout_keep_prob:testconfig.dropout_keep_prob 103 | } 104 | summary,loss,accuracy=sess.run( 105 | [cnn.merged,cnn.loss,cnn.accuracy], 106 | feed_dict=feed_dict 107 | ) 108 | 109 | return (summary,loss,accuracy) 110 | 111 | #生成批数据 112 | batches=readdata.batch_iter( 113 | list(zip(train_sample_arrays, train_label_arrays)),trainconfig.batch_size,trainconfig.num_epochs) 114 | 115 | 116 | #正式开始训练啦 117 | for batch in batches: 118 | step_num += 1 119 | x_batch,y_batch=zip(*batch) 120 | summary,loss, accuracy=train_step(x_batch,y_batch,config.lr_rate) 121 | if step_num % 20 == 0: 122 | train_writer.add_summary(summary,step_num) 123 | #print("For train_samples: step %d, loss %g, accuracy %g" % (step_num,loss,accuracy)) 124 | summary,loss, accuracy = test_step(test_sample_arrays, test_label_arrays) 125 | #print("Testing loss: %g,Testing accuracy: %g" % (loss, accuracy)) 126 | test_writer.add_summary(summary, step_num) 127 | 128 | #_,loss, accuracy = test_step(test_sample_arrays, test_label_arrays) 129 | #print("Testing loss: %g,Testing accuracy: %g" % (loss, accuracy)) 130 | 131 | saver.save(sess,"data/cnn/text_model") 132 | train_writer.close() 133 | test_writer.close() 
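For orientation, the tensor shapes implied by the CNN hyper-parameters used above (embedding_size=64, filter_sizes=[2,3,4], num_filters=128, padded length forced to 40) can be traced with plain arithmetic. The sketch below only recomputes those shapes for illustration; it is not part of the training script:

```python
# Illustrative shape trace for TextCNN (Cnn_Model.py) under the hyper-parameters above.
embedding_size = 64
filter_sizes = [2, 3, 4]
num_filters = 128
sequence_length = 40      # force_len passed to get_all_data_from_file above
batch = 32                # config.batch_size

print("input_x:", (batch, sequence_length, embedding_size))
print("after expand_dims:", (batch, sequence_length, embedding_size, 1))
for f in filter_sizes:
    conv = (batch, sequence_length - f + 1, 1, num_filters)   # VALID conv: height shrinks, width collapses to 1
    pooled = (batch, 1, 1, num_filters)                        # max-pool over the remaining time axis
    print("filter size", f, "-> conv", conv, "-> pooled", pooled)
num_filters_total = num_filters * len(filter_sizes)
print("h_pooled_flat:", (batch, num_filters_total))            # (32, 384) fed into the softmax layer
```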
-------------------------------------------------------------------------------- /Lstm_Model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TextLSTM(object): 5 | def __init__(self,config): 6 | self.num_steps=config.max_sentences_length 7 | self.hidden_size=config.embedding_size 8 | self.num_classes=config.num_labels 9 | self.num_layers=config.num_layers 10 | self.batch_size=config.batch_size 11 | self.l2_rate=config.l2_rate 12 | self.input_x=tf.placeholder(tf.float32,[None,self.num_steps,self.hidden_size],name="input_x") 13 | self.input_y=tf.placeholder(tf.float32,[None,self.num_classes],name="input_y") 14 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 15 | 16 | 17 | 18 | with tf.variable_scope("Net",initializer=tf.orthogonal_initializer()): 19 | def lstm_cell(): 20 | return tf.contrib.rnn.BasicLSTMCell(self.hidden_size,forget_bias=1.0,state_is_tuple=True) 21 | 22 | 23 | attn_cell = lstm_cell 24 | if self.dropout_keep_prob is not None: 25 | def attn_cell(): 26 | return tf.contrib.rnn.DropoutWrapper(lstm_cell(),output_keep_prob=self.dropout_keep_prob) 27 | 28 | 29 | self.cell_fw=tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)], 30 | state_is_tuple=True) 31 | self.cell_bw=tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)], 32 | state_is_tuple=True) 33 | 34 | if self.dropout_keep_prob is not None: 35 | inputs=tf.nn.dropout(self.input_x,self.dropout_keep_prob) 36 | else: 37 | inputs=self.input_x 38 | 39 | #shape: (batch_size, num_steps,hidden_size) => (num_steps,batch_size,hidden_size) 40 | inputs= tf.transpose(inputs, [1,0,2]) 41 | outputs,state = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.cell_fw, 42 | cell_bw=self.cell_bw, 43 | dtype="float32", 44 | inputs=inputs, 45 | swap_memory=True, 46 | time_major=True) 47 | outputs_fw,outputs_bw=outputs 48 | output_fw=outputs_fw[-1] 49 | output_bw=outputs_bw[-1] 50 | finial_output=tf.concat([output_fw,output_bw],1) 51 | with tf.name_scope("output"): 52 | softmax_w=tf.get_variable("softmax_w",[self.hidden_size*2,self.num_classes],dtype=tf.float32) 53 | softmax_b=tf.get_variable("softmax_b",[self.num_classes],dtype=tf.float32,initializer=tf.random_normal_initializer(stddev=0.01)) 54 | self.result=tf.matmul(finial_output,softmax_w)+softmax_b 55 | self.final_state=state 56 | self.predictions=tf.argmax(self.result,1,name="predictions") 57 | tf.summary.histogram("softmax_w",softmax_w) 58 | tf.summary.histogram("softmax_b",softmax_b) 59 | self.softmax_result=tf.nn.softmax(self.result) 60 | 61 | 62 | #计算损失 63 | with tf.name_scope("loss"): 64 | losses=tf.nn.softmax_cross_entropy_with_logits(logits=self.result, labels=self.input_y) 65 | self.loss = tf.reduce_mean(losses) 66 | tf.summary.scalar("loss",self.loss) 67 | 68 | 69 | #计算正确率 70 | with tf.name_scope("accuracy"): 71 | correct_predictions=tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 72 | self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 73 | tf.summary.scalar("accuracy",self.accuracy) 74 | 75 | with tf.name_scope("train_op"): 76 | tvars = tf.trainable_variables() 77 | self.l2_loss = 0.001 * tf.reduce_sum([tf.nn.l2_loss(v) for v in tvars]) # 0.001是lambda超参数 78 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss+self.l2_loss, tvars), config.max_grad_norm) 79 | optimizer = tf.train.AdamOptimizer() 80 | optimizer.apply_gradients(zip(grads, tvars)) 81 | self.train_op = 
optimizer.apply_gradients(zip(grads, tvars)) 82 | 83 | with tf.name_scope("summary"): 84 | self.summary_op=tf.summary.merge_all() 85 | 86 | ''' 87 | class TextLSTM(object): 88 | def __init__(self,config): 89 | self.num_steps=config.max_sentences_length 90 | self.hidden_size=config.embedding_size 91 | self.num_classes=config.num_labels 92 | self.num_layers=config.num_layers 93 | self.batch_size=config.batch_size 94 | self.l2_rate=config.l2_rate 95 | self.input_x=tf.placeholder(tf.float32,[None,self.num_steps,self.hidden_size],name="input_x") 96 | self.input_y=tf.placeholder(tf.float32,[None,self.num_classes],name="input_y") 97 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 98 | 99 | 100 | 101 | with tf.variable_scope("Net",initializer=tf.orthogonal_initializer()): 102 | def lstm_cell(): 103 | return tf.contrib.rnn.BasicLSTMCell(self.hidden_size,forget_bias=1.0,state_is_tuple=True) 104 | 105 | 106 | attn_cell = lstm_cell 107 | if self.dropout_keep_prob is not None: 108 | def attn_cell(): 109 | return tf.contrib.rnn.DropoutWrapper(lstm_cell(),output_keep_prob=self.dropout_keep_prob) 110 | 111 | 112 | self.cell=tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)],state_is_tuple=True) 113 | if self.dropout_keep_prob is not None: 114 | inputs=tf.nn.dropout(self.input_x,self.dropout_keep_prob) 115 | else: 116 | inputs=self.input_x 117 | 118 | #shape: (batch_size, num_steps,hidden_size) => (num_steps,batch_size,hidden_size) 119 | inputs= tf.transpose(inputs, [1,0,2]) 120 | outputs,state = tf.nn.dynamic_rnn(cell=self.cell,dtype="float32",inputs=inputs,swap_memory=True,time_major=True) 121 | output=outputs[-1] 122 | with tf.name_scope("output"): 123 | softmax_w=tf.get_variable("softmax_w",[self.hidden_size,self.num_classes],dtype=tf.float32) 124 | softmax_b=tf.get_variable("softmax_b",[self.num_classes],dtype=tf.float32,initializer=tf.random_normal_initializer(stddev=0.01)) 125 | self.result=tf.matmul(output,softmax_w)+softmax_b 126 | self.final_state=state 127 | self.predictions=tf.argmax(self.result,1,name="predictions") 128 | tf.summary.histogram("softmax_w",softmax_w) 129 | tf.summary.histogram("softmax_b",softmax_b) 130 | self.softmax_result=tf.nn.softmax(self.result) 131 | 132 | 133 | #计算损失 134 | with tf.name_scope("loss"): 135 | losses=tf.nn.softmax_cross_entropy_with_logits(logits=self.result, labels=self.input_y) 136 | self.loss = tf.reduce_mean(losses) 137 | tf.summary.scalar("loss",self.loss) 138 | 139 | 140 | #计算正确率 141 | with tf.name_scope("accuracy"): 142 | correct_predictions=tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 143 | self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 144 | tf.summary.scalar("accuracy",self.accuracy) 145 | 146 | with tf.name_scope("train_op"): 147 | tvars = tf.trainable_variables() 148 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.max_grad_norm) 149 | optimizer = tf.train.AdamOptimizer() 150 | optimizer.apply_gradients(zip(grads, tvars)) 151 | self.train_op = optimizer.apply_gradients(zip(grads, tvars)) 152 | 153 | with tf.name_scope("summary"): 154 | self.summary_op=tf.summary.merge_all() 155 | ''' -------------------------------------------------------------------------------- /weibo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/3/1 19:54 4 | # @Author : 孑曦曦孑 5 | # @File : visitor_weibo_login.py 6 | 7 | import requests 8 | 
import re 9 | import json 10 | from lxml import etree 11 | import time 12 | import os 13 | 14 | # url="https://weibo.com/2447680824/G5nMd0MBJ?type=comment#_rnd1519906057635" 15 | #模拟游客登录获取cookies 16 | class visitor(): 17 | def __init__(self,url): 18 | try: 19 | self.cookies,self.headers=self.get_cookies() 20 | self.id=self.weibo_spider(url) 21 | self.fail=False 22 | except: 23 | print("模拟失败-。-") 24 | self.fail=True 25 | 26 | 27 | def get_cookies(self): 28 | # 获取dict_data 29 | print("正在模拟游客登录") 30 | S = requests.session() 31 | url = "https://passport.weibo.com/visitor/genvisitor" 32 | S.headers.update( 33 | { 34 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0" 35 | } 36 | ) 37 | 38 | data = { 39 | "cb": "gen_callback", 40 | "fp": '{"os":"1","browser":"Gecko60,0,0,0","fonts":"undefined","screenInfo":"1536*864*24","plugins":""}' 41 | } 42 | response = S.post(url, data=data) 43 | pattren=re.compile(r"\((.*)\)") 44 | data=pattren.findall(response.text)[0] 45 | dict_data=json.loads(data)["data"] 46 | tid=dict_data["tid"] 47 | confidence=dict_data["confidence"] 48 | where=dict_data["new_tid"] 49 | if where: 50 | where=3 51 | else: 52 | where=2 53 | while(len(str(confidence))<3): 54 | confidence="0"+str(confidence) 55 | #tid="KCEsfUfkAmyXExt9tiPN61f32Vvh4wViWQaeHptBZLc=" 56 | #手动编码格式转换 57 | tid=tid.replace("+","%2b") 58 | tid=tid.replace("=","%3d") 59 | 60 | url="https://passport.weibo.com/visitor/visitor?a=incarnate"\ 61 | "&t="+str(tid)+ \ 62 | "&w=" + str(where) + \ 63 | "&c="+str(confidence)+\ 64 | "&gc="\ 65 | "&cb=cross_domain" \ 66 | "&from=weibo" 67 | response=S.get(url) 68 | data=pattren.findall(response.text)[0] 69 | #https://passport.weibo.com/visitor/visitor?a=incarnate&t=hVEmh0nd32++OFXP3wiB6b05C9A5L38fmq7ArFKTNq8=&w=2&c=095&gc=&cb=cross_domain&from=weibo' 70 | #https://passport.weibo.com/visitor/visitor?a=incarnate&t=+A1gVsii+zY9OI9v/e+o1lfhlTPQ20U3Fkuz8nn/7rU=&w=2&c=095&gc=&cb=cross_domain&from=weibo&_rand=0.42337865580692513 71 | dict_data = json.loads(data) 72 | if "succ" not in dict_data["msg"]: 73 | printf("tid不合法") 74 | printf("dict_data:",dict_data) 75 | self.fail=True 76 | return None,None 77 | dict_data=dict_data["data"] 78 | sub=dict_data["sub"] #没有 79 | subp=dict_data["subp"] 80 | # print(sub,subp) 81 | url="https://login.sina.com.cn/visitor/visitor?a=crossdomain&cb=return_back"\ 82 | +"&s="+str(sub)\ 83 | +"&sp="+str(subp)\ 84 | +"&from=weibo" 85 | response=S.get(url) 86 | print("成功获取游客Cookies") 87 | return S.cookies,S.headers 88 | 89 | #base62解码 90 | def base62(self,string): 91 | alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 92 | base = len(alphabet) 93 | strlen = len(string) 94 | num = 0 95 | idx = 0 96 | for char in string: 97 | power = (strlen - (idx + 1)) 98 | num += alphabet.index(char) * (base ** power) 99 | idx += 1 100 | return num 101 | 102 | def weibo_spider(self,url): 103 | S=requests.session() 104 | S.cookies=self.cookies 105 | S.headers=self.headers 106 | response=S.get(url) 107 | #获取微博大致内容 108 | selector = etree.HTML(response.text) 109 | weibo=selector.xpath('/title/text()') 110 | page = etree.HTML(response.text) 111 | content = page.xpath('//title/text()')[0] 112 | content=str(content).replace("\n"," ") 113 | print("爬取ing\n\t",content) 114 | # #获取异步加载url中的id 115 | pattern=re.compile(r"\d\/(.*)\?.*type") 116 | # https://weibo.com/5678693647/GaERmaQ33?filter=hot&root_comment_id=0&type=comment 117 | # https://weibo.com/5678693647/GaERmaQ33?type=comment 118 | 
content=pattern.search(url).group(1) 119 | string1=str(self.base62(content[0])) 120 | string2=str(self.base62(content[1:5])) 121 | while(len(string2)<7): 122 | string2="0"+string2 123 | string3=str(self.base62(content[5:])) 124 | while (len(string3) < 7): 125 | string3 = "0" + string3 126 | id=string1+string2+string3 127 | return id 128 | 129 | def catch_comments(self,page=1,past=None): 130 | #模拟异步加载 131 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4213083327566698&filter=hot&page=1 132 | path="./data" 133 | if(page==1): 134 | print("开始爬取~") 135 | #判断是否存在该文件夹 136 | if not os.path.exists(path): 137 | os.mkdir(path) 138 | S=requests.session() 139 | S.cookies=self.cookies 140 | S.headers=self.headers 141 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4228711421054898 142 | # &root_comment_max_id=259085778882534&root_comment_max_id_type=0& 143 | # root_comment_ext_param=&page=2&filter=hot 144 | # &sum_comment_number=7215&filter_tips_before=1 145 | # &from=singleWeiBo&__rnd=1525703793076 146 | 147 | url="https://weibo.com/aj/v6/comment/big?ajwvr=6"\ 148 | +"&id="+str(self.id)\ 149 | +"&page="+str(page) \ 150 | + "&filter=hot" \ 151 | +"&from=singleWeiBo" 152 | # print(url) 153 | response=S.get(url) 154 | html=json.loads(response.text)["data"]["html"] 155 | #如果两次相同表示结束了 -。- 156 | # if past_html==html: 157 | # print("爬取结束") 158 | # # print(self.id) 159 | # print("共",page,"页") 160 | # return 161 | #搜索评论 162 | text=etree.HTML(html) 163 | #print(html) 164 | #评论数-xpath 165 | # comments=text.xpath('//div[@class="list_li S_line1 clearfix"]//div[@class="WB_text"]//text()') 166 | comments = text.xpath('//div[@class="list_li S_line1 clearfix"]/*/div[@class="WB_text"]') 167 | # 评论数 168 | points = text.xpath('//div[@class="list_li S_line1 clearfix"]//*/span[@node-type="like_status"]/child::*[2]//text()') 169 | #点赞数-xpath 170 | # points 171 | pattern = re.compile(r'\:(.*)') 172 | # try: 173 | if page==1: 174 | wa="w" 175 | else: 176 | wa="a" 177 | f=open("./data/test.txt", wa,encoding='utf-8') 178 | # f2=open("weibo_points.txt",wa,encoding='utf-8') 179 | for i in range(len(comments)): 180 | comment = comments[i].xpath("text()") 181 | comment = ",".join(comment[1:])[1:].strip() 182 | point = points[i] 183 | if i==0: 184 | now = {"len": len(comments), "comment":comment} 185 | if now==past: 186 | print("爬取结束") 187 | return 188 | else: 189 | past=now 190 | if point == "赞": 191 | point = "0" 192 | #点赞数为权重0.2 193 | weights=int(0.2*int(point)) 194 | #写入评论 195 | comment=comment+"\n" 196 | f.write(comment) 197 | for i in range(weights): 198 | f.write(comment) 199 | #写入点赞数 200 | print("已写入", page, "页") 201 | f.close() 202 | # except : 203 | # print("写入文件失败") 204 | 205 | page+=1 206 | self.catch_comments(page,past) 207 | 208 | #获取图片 209 | def catch_pictures(self,page=1,past_html=None): 210 | # 模拟异步加载 211 | # https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4213083327566698&filter=hot&page=1 212 | path = "./weibo-pic" 213 | if (page == 1): 214 | print("开始爬取~") 215 | #创建文件夹 216 | if not os.path.exists(path): 217 | os.makedirs(path) 218 | S = requests.session() 219 | S.cookies = self.cookies 220 | S.headers = self.headers 221 | url = "https://weibo.com/aj/v6/comment/big?ajwvr=6" \ 222 | + "&id=" + str(self.id) \ 223 | + "&page=" + str(page) \ 224 | + "&filter=hot" \ 225 | + "&from=singleWeiBo" 226 | response = S.get(url) 227 | html = json.loads(response.text)["data"]["html"] 228 | # 如果两次相同表示结束了 -。- 229 | if past_html == html: 230 | print("爬取结束") 231 | # print(self.id) 232 | print("共", page, "页") 233 | return 234 | # 
搜索图片链接 235 | text = etree.HTML(html) 236 | ids = text.xpath('//li[@action-type="comment_media_img"]/attribute::action-data') 237 | # 写入图片 238 | try: 239 | for id in ids: 240 | id = id.split("&") 241 | id = id[0][4:] 242 | url = "https://wx3.sinaimg.cn/bmiddle/" + id + ".jpg" 243 | filename = path + "/" + str(id) + ".jpg" 244 | response = requests.get(url, stream=True) 245 | with open(filename, "wb") as f: 246 | for chunk in response.iter_content(128): 247 | f.write(chunk) 248 | except: 249 | print("写入失败") 250 | page += 1 251 | print(page) 252 | self.catch_pictures(page, html) 253 | 254 | #模块调用 255 | def start(url=None): 256 | if url==None: 257 | print("请输入正确的url") 258 | return 259 | else: 260 | spider=visitor(url) 261 | spider.catch_comments() 262 | 263 | if __name__=="__main__": 264 | url="https://weibo.com/2387903701/G5bn7s5CS?type=comment" 265 | url = input("输入需要爬取的微博url:\n") 266 | spider=visitor(url) 267 | if spider.fail==False: 268 | spider.catch_comments() 270 | # spider.catch_pictures() 273 | #https://weibo.com/1840483562/G48Ajgfhq?type=comment 274 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4209863153871988&filter=hot&page=1 275 | 276 | #https://weibo.com/2387903701/G5bn7s5CS?type=comment 277 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4212353576694814&filter=hot&page=12 278 | 279 | 280 | #------------------ 281 | # ~。~ nice 282 | #------------------ 284 | --------------------------------------------------------------------------------
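For reference, the id construction in weibo.visitor.weibo_spider works by base62-decoding the short code at the end of a post URL in chunks of 1 + 4 + 4 characters and zero-padding the last two chunks to 7 digits. A standalone sketch of that conversion (the 9-character code below is taken from the example URLs in the comments, purely for illustration):

```python
# Standalone version of the short-code -> numeric id conversion used in weibo.py.
ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

def base62_decode(s):
    num = 0
    for ch in s:                      # same digit-by-digit accumulation as visitor.base62
        num = num * len(ALPHABET) + ALPHABET.index(ch)
    return num

def mid_to_id(code):
    # Decode chunks of 1, 4 and 4 characters, pad the trailing chunks to 7 digits,
    # then concatenate the decimal strings (mirrors visitor.weibo_spider).
    parts = [code[0], code[1:5], code[5:]]
    first, middle, last = (str(base62_decode(p)) for p in parts)
    return first + middle.zfill(7) + last.zfill(7)

print(mid_to_id("GaERmaQ33"))   # illustrative code from the commented example URL above
```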