├── main.py ├── word2vec.py ├── visual.py ├── README.md ├── cnn_test.py ├── lstm_test.py ├── Cnn_Model.py ├── mixed_cnn_lstm_test.py ├── readdata.py ├── lstm_train.py ├── cnn_train.py ├── Lstm_Model.py └── weibo.py /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from weibo import start as spider 3 | from cnn_test import get_cnn_result 4 | from lstm_test import get_lstm_result 5 | from mixed_cnn_lstm_test import get_mixed_result 6 | from visual import show_emtion 7 | 8 | 9 | 10 | if __name__=="__main__": 11 | prediction = np.array([]) 12 | print("********************欢迎使用微博舆情分析工具***********************") 13 | url = input("请输入需要分析的微博url:\n") 14 | #调用weibo.py接口开始爬取相关微博评论 15 | spider(url) 16 | #选择模型 17 | model_index=int(input("请输入你想选择的AI模型:\n1.CNN\n2.LSTM\n3.CNN & LSTM融合模型\n")) 18 | #调用AI模型接口返回结果 19 | if model_index == 1: 20 | prediction=get_cnn_result() 21 | elif model_index == 2: 22 | prediction=get_lstm_result() 23 | elif model_index == 3: 24 | prediction=get_mixed_result() 25 | prediction=[i for i in prediction[0]] 26 | else: 27 | print("输入信息错误") 28 | #移交可视化模块完成数据视化 29 | print(prediction) 30 | show_emtion(prediction) -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import numpy as np 3 | 4 | 5 | 6 | def get_embedding_vector(sentences,embedding_model_path): 7 | print("loading word2vec model now...........") 8 | model=gensim.models.KeyedVectors.load_word2vec_format(embedding_model_path,binary=True) 9 | print("loading word2vec finished") 10 | all_sample_vector_lists=[] 11 | padding_embedding=np.array([0] * model.vector_size,dtype=np.float32) 12 | print("transform word to vector now.......") 13 | for sentence in sentences: 14 | sentence_vector = [] 15 | for word in sentence: 16 | if word in model.vocab: 17 | sentence_vector.append(model[word]) 18 | else: 19 | sentence_vector.append(padding_embedding) 20 | all_sample_vector_lists.append(sentence_vector) 21 | del sentence_vector 22 | print("transform word to vector finished") 23 | del sentences 24 | del model 25 | return all_sample_vector_lists -------------------------------------------------------------------------------- /visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def show_emtion(prediction): 5 | positive=0 6 | negative=0 7 | for i, num in enumerate(prediction): 8 | if num == 0: 9 | positive+=1 10 | else: 11 | negative+=1 12 | #用来正常显示中文标签 13 | plt.rcParams['font.sans-serif']=['SimHei'] 14 | #用来正常显示负号 15 | plt.rcParams['axes.unicode_minus']=False 16 | #调节图形大小(宽,高) 17 | plt.figure(figsize=(12,8)) 18 | #定义饼状图的标签,标签是列表 19 | labels = [u'喜悦',u'低落'] 20 | #每个标签的占比,不一定要和为100% 21 | sizes = [positive,negative] 22 | colors = ['lightskyblue','FireBrick'] 23 | explode = (0.05,0) 24 | patches,l_text,p_text = plt.pie(sizes,explode=explode,labels=labels,colors=colors,labeldistance = 1.1,autopct = '%3.1f%%',shadow = False,startangle = 90,pctdistance = 0.5) 25 | 26 | #改变文本的大小 27 | for t in l_text: 28 | t.set_size(20) 29 | for t in p_text: 30 | t.set_size(20) 31 | #圆 32 | plt.axis('equal') 33 | plt.legend() 34 | plt.show() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于微博评论的数据挖掘与情感分析 2 | **!!!大学作业不再更新!!!** 3 | 4 | ## 项目简介 
5 | 学习卷积神经网络,循环神经网络在实际环境下的应用,提升实践能力,了解深度学习在自然语言处理方面的进展 6 | 7 | ## cnn_for_text_classify 8 | 具备较强的自动关键词提取能力,在酒店评论测试集上达到95%的准确率 9 | 采用l2正则和dropout来控制过拟合现象 10 | 4种卷积核使其能提取局部高效的短特征 11 | 12 | ## lstm_for_text_classify 13 | 具有较强的对长难句,反问句,阴阳怪气句的判断能力,在在酒店评论测试集上达到97%的准确率 14 | 采用双向LSTM网络 15 | 对输入数据进行dropout,模拟增大样本空间 16 | LSTM层与层之间进行dropout 17 | 对LSTM网络权重,偏置进行l2正则,抗过拟合 18 | 网络采用正交初始化,加快收敛速度,提升训练集上的正确率,大幅提升测试集上的正确率 19 | 采用Clipping Gradients,防止梯度爆炸,提升测试集上的正确率 20 | 21 | ## word2vec: 22 | 项目使用的词向量:embedding_64.bin(1.5G) 23 | 训练语料:百度百科800w条 20G+搜狐新闻400w条 12G+小说:90G左右 24 | 模型参数:window=5 min_count=5 size=64 25 | 下载链接:[百度网盘链接](https://pan.baidu.com/s/19bDbZsFzLggx7q9iFn83Nw) 26 | 27 | 28 | ## 文件功能介绍 29 | ./ 30 | weibo.py:微博评论爬虫 31 | readdata.py:为情感分析模型提供多种数据加载相关API 32 | word2vec.py:为情感分析模型提供多种词向量的相关API 33 | cnn_model.py:CNN文本分类模型图结构 34 | cnn_train.py:CNN文本分类训练代码 35 | cnn_test.py: CNN文本分类测试代码 36 | lstm_model.py:lstm文本分类模型图结构 37 | lstm_train.py:lstm文本分类训练代码 38 | lstm_test.py: lstm文本分类测试代码 39 | mixed_cnn_lstm_test.py:采用模型融合方式将cnn与lstm的结果进行融合投票绝对最终结果 40 | visual.py:结果可视化 41 | main.py:项目主文件,调用各文件API,自动提取,分析,显示 42 | 43 | ./data 44 | happy.txt:开心评价 45 | angry.txt:愤怒评价 46 | unhappy:低落评价数据集 47 | embedding_64.bin:训练好的词向量模型 48 | ---/cnn:cnn模型训练完成的相关数据参数 49 | ---/lstm:lstm模型训练完成的相关数据参数 50 | 51 | ./summary 52 | ---/cnn:cnn的log和图结构 53 | -------/test:测试集上的log 54 | -------/test:训练集上的log 55 | ---/lstm:lstm的log和图结构 56 | -------/test:测试集上的log 57 | -------/test:训练集上的log 58 | 59 | ## 推荐运行环境 60 | python 3.6 61 | tensorflow-gpu 1.4 62 | gensim 3.3 63 | Ubuntu 64 Bit / windows10 64 Bit 64 | 65 | ## 使用模型注意事项 66 | 1.文本TXT文件必须采用UTF-8编码格式,非UTF-8格式的,去记事本中另存为的时候选择UTF-8 67 | 2.pos.txt、neg.txt、test.txt 文件一行为一条评论,长度不限,可以有英文和标点(反正都会去除的),不要词性标注信息 68 | 3.词向量模型一定要用我放的那个64维度的bin文件 69 | 4.模型代码在windows上测试过基本没bug,linux平台没测试过,不过肯定需要自行修改文件路径 70 | 5.测试集比率根据你的样本数量自行调整,太大容易造成显存不够导致失败 71 | 6.根据文件夹结构自行建立 72 | -------------------------------------------------------------------------------- /cnn_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import readdata 4 | import word2vec 5 | import os 6 | import cnn_model 7 | 8 | 9 | test_file_path="./data//test.txt" 10 | train_data_path="./data//cnn//training_params.pickle" 11 | embedding_model_path="./data//embedding_64.bin" 12 | 13 | class config(): 14 | test_sample_percentage=0.03 15 | num_labels=2 16 | embedding_size=64 17 | filter_sizes=[2,3,4] 18 | num_filters=128 19 | dropout_keep_prob=1 20 | l2_reg_lambda=0.1 21 | batch_size=32 22 | num_epochs=15 23 | max_sentences_length=0 24 | lr_rate=1e-3 25 | 26 | def get_cnn_result(): 27 | if not os.path.exists(embedding_model_path): 28 | print("word2vec model is not found") 29 | 30 | if not os.path.exists(train_data_path): 31 | print("train params is not found") 32 | 33 | params = readdata.loadDict(train_data_path) 34 | train_length = int(params['max_sentences_length']) 35 | 36 | 37 | 38 | test_sample_lists = readdata.get_cleaned_list(test_file_path) 39 | test_sample_lists,max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=train_length) 40 | test_sample_arrays=np.array(word2vec.get_embedding_vector(test_sample_lists,embedding_model_path)) 41 | testconfig=config() 42 | testconfig.max_sentences_length=max_sentences_length 43 | 44 | sess=tf.InteractiveSession() 45 | cnn=cnn_model.TextCNN(config=testconfig) 46 | 47 | #加载参数 48 | saver = tf.train.Saver() 49 | saver.restore(sess, "./data/cnn/text_model") 50 | 51 | #定义测试函数 
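# test_step below feeds one batch of already-embedded sentences (shape [batch, max_sentences_length, 64])
# into the restored graph with dropout disabled (keep_prob = 1.0). It returns the argmax class index per
# sample (1 = positive, 0 = negative, matching the [0,1]/[1,0] one-hot labels built in readdata.py)
# together with the softmax probabilities.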
52 | def test_step(x_batch): 53 | feed_dict={ 54 | cnn.input_x:x_batch, 55 | cnn.dropout_keep_prob:1.0 56 | } 57 | predictions,scores=sess.run( 58 | [cnn.predictions,cnn.softmax_result], 59 | feed_dict=feed_dict 60 | ) 61 | return (predictions,scores) 62 | 63 | 64 | #拿到结果 65 | predictions,scores=test_step(test_sample_arrays) 66 | return np.array(predictions) 67 | #print("(0->neg & 1->pos)the result is:") 68 | #print(predictions) 69 | #print("********************************") 70 | #print("the scores is:") 71 | #print(scores) 72 | #print(scores.shape) 73 | -------------------------------------------------------------------------------- /lstm_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import lstm_model 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | 10 | #文件路径 11 | current_path=os.path.abspath(os.curdir) 12 | test_file_path="./data//test.txt" 13 | embedding_model_path="./data//embedding_64.bin" 14 | train_data_path="./data//lstm//training_params.pickle" 15 | 16 | 17 | #模型超参 18 | class config(): 19 | test_sample_percentage=0.03 20 | num_labels=2 21 | embedding_size=64 22 | dropout_keep_prob=1 23 | batch_size=64 24 | num_epochs=80 25 | max_sentences_length=40 26 | num_layers=2 27 | max_grad_norm=5 28 | l2_rate=0.0001 29 | 30 | 31 | def get_lstm_result(): 32 | if not os.path.exists(embedding_model_path): 33 | print("word2vec model is not found") 34 | 35 | if not os.path.exists(train_data_path): 36 | print("train params is not found") 37 | 38 | params = readdata.loadDict(train_data_path) 39 | train_length = int(params['max_sentences_length']) 40 | 41 | 42 | 43 | test_sample_lists = readdata.get_cleaned_list(test_file_path) 44 | test_sample_lists,max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=train_length) 45 | test_sample_arrays=np.array(word2vec.get_embedding_vector(test_sample_lists,embedding_model_path)) 46 | testconfig=config() 47 | testconfig.max_sentences_length=max_sentences_length 48 | 49 | 50 | sess=tf.InteractiveSession() 51 | lstm=lstm_model.TextLSTM(config=testconfig) 52 | 53 | saver = tf.train.Saver() 54 | saver.restore(sess, "./data/lstm/text_model") 55 | 56 | #定义测试函数 57 | def test_step(x_batch): 58 | feed_dict={ 59 | lstm.input_x:x_batch, 60 | lstm.dropout_keep_prob:testconfig.dropout_keep_prob 61 | } 62 | predictions,scores=sess.run( 63 | [lstm.predictions,lstm.softmax_result], 64 | feed_dict=feed_dict 65 | ) 66 | return (predictions,scores) 67 | 68 | predictions, scores=test_step(test_sample_arrays) 69 | return np.array(predictions) 70 | #print("(0->neg & 1->pos)the result is:") 71 | #print(predictions) 72 | #print("********************************") 73 | #print("the scores is:") 74 | #print(scores) 75 | -------------------------------------------------------------------------------- /Cnn_Model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TextCNN(object): 5 | def __init__(self, config): 6 | sequence_length = config.max_sentences_length 7 | num_classes = config.num_labels 8 | embedding_size = config.embedding_size 9 | filter_sizes = config.filter_sizes 10 | num_filters = config.num_filters 11 | l2_reg_lambda = config.l2_reg_lambda 12 | l2_loss = tf.constant(0.0) 13 | pooled_outputs = [] 14 | 15 | 16 | self.input_x=tf.placeholder(tf.float32,[None,sequence_length,embedding_size],name="input_x") 17 | 
self.input_y=tf.placeholder(tf.float32,[None,num_classes],name="input_y") 18 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_rate") 19 | self.learning_rate=tf.placeholder(tf.float32,name="lr") 20 | 21 | 22 | self.input_x_expended=tf.expand_dims(self.input_x,-1) 23 | 24 | 25 | for filter_size in filter_sizes: 26 | with tf.name_scope("conv-maxpool-%s" % filter_size): 27 | #[filter_height, filter_width, in_channels, out_channels] 28 | filter_shape=[filter_size,embedding_size,1,num_filters] 29 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 30 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 31 | 32 | 33 | #添加卷积层 34 | conv=tf.nn.conv2d( 35 | self.input_x_expended, 36 | W, 37 | strides=[1,1,1,1], 38 | padding="VALID", 39 | name="conv" 40 | ) 41 | 42 | 43 | #添加偏置 & relu激活函数 44 | h=tf.nn.relu(tf.nn.bias_add(conv,b),name="relu") 45 | 46 | 47 | #添加最大池化层 48 | pooled=tf.nn.max_pool( 49 | h, 50 | ksize=[1,sequence_length-filter_size+1,1,1], #[对1个句子 卷积值hight 卷积值width 1个channel] 51 | strides=[1,1,1,1], 52 | padding="VALID", 53 | name="pool" 54 | ) 55 | pooled_outputs.append(pooled) 56 | 57 | num_filters_total = num_filters * len(filter_sizes) 58 | self.h_pooled=tf.concat(pooled_outputs, 3) 59 | self.h_pooled_flat=tf.reshape(self.h_pooled,[-1,num_filters_total]) 60 | 61 | 62 | #添加dropout层 63 | with tf.name_scope("dropout"): 64 | self.h_drop=tf.nn.dropout(self.h_pooled_flat, self.dropout_keep_prob) 65 | 66 | 67 | #添加分类层 68 | with tf.name_scope("output"): 69 | self.Weight = tf.get_variable( 70 | "Weight", 71 | shape=[num_filters_total, num_classes], 72 | initializer=tf.contrib.layers.xavier_initializer()) 73 | self.bias = tf.Variable(tf.constant(0.1, shape=[num_classes], name="bias")) 74 | l2_loss += tf.nn.l2_loss(self.Weight) 75 | l2_loss += tf.nn.l2_loss(self.bias) 76 | self.result=tf.matmul(self.h_drop,self.Weight)+self.bias 77 | self.predictions=tf.argmax(self.result,1,name="predictions") 78 | tf.summary.histogram("weight",self.Weight) 79 | tf.summary.histogram("bias",self.bias) 80 | self.softmax_result = tf.nn.softmax(self.result) 81 | 82 | 83 | #计算损失 84 | with tf.name_scope("loss"): 85 | losses=tf.nn.softmax_cross_entropy_with_logits(logits=self.result, labels=self.input_y) 86 | self.loss=tf.reduce_mean(losses)+l2_reg_lambda*l2_loss 87 | tf.summary.scalar("loss",self.loss) 88 | 89 | #计算正确率 90 | with tf.name_scope("accuracy"): 91 | correct_predictions=tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 92 | self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 93 | tf.summary.scalar("accuracy",self.accuracy) 94 | 95 | #训练操作 96 | with tf.name_scope("train_operation"): 97 | self.train_op=tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 98 | 99 | with tf.name_scope("summary"): 100 | self.merged=tf.summary.merge_all() -------------------------------------------------------------------------------- /mixed_cnn_lstm_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import lstm_model 5 | import cnn_model 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | 11 | #文件路径 12 | current_path=os.path.abspath(os.curdir) 13 | test_file_path="./data//test.txt" 14 | embedding_model_path="./data//embedding_64.bin" 15 | lstm_train_data_path="./data//lstm//training_params.pickle" 16 | cnn_train_data_path="./data//cnn//training_params.pickle" 17 | 18 | 19 | #模型超参 20 | class lstmconfig(): 21 | 
test_sample_percentage=0.03 22 | num_labels=2 23 | embedding_size=64 24 | dropout_keep_prob=1 25 | batch_size=64 26 | num_epochs=80 27 | max_sentences_length=40 28 | num_layers=2 29 | max_grad_norm=5 30 | l2_rate=0.0001 31 | 32 | class cnnconfig(): 33 | test_sample_percentage=0.03 34 | num_labels=2 35 | embedding_size=64 36 | filter_sizes=[2,3,4] 37 | num_filters=128 38 | dropout_keep_prob=1 39 | l2_reg_lambda=0.1 40 | batch_size=32 41 | num_epochs=15 42 | max_sentences_length=0 43 | lr_rate=1e-3 44 | 45 | def get_mixed_result(): 46 | if not os.path.exists(embedding_model_path): 47 | print("word2vec model is not found") 48 | 49 | if not os.path.exists(lstm_train_data_path): 50 | print("lstm train params is not found") 51 | 52 | lstm_params = readdata.loadDict(lstm_train_data_path) 53 | lstm_train_length = int(lstm_params['max_sentences_length']) 54 | 55 | if not os.path.exists(cnn_train_data_path): 56 | print("cnn train params is not found") 57 | 58 | cnn_params = readdata.loadDict(cnn_train_data_path) 59 | cnn_train_length = int(cnn_params['max_sentences_length']) 60 | 61 | 62 | test_sample_lists = readdata.get_cleaned_list(test_file_path) 63 | lstm_test_sample_lists,lstm_max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=lstm_train_length) 64 | cnn_test_sample_lists,cnn_max_sentences_length = readdata.padding_sentences(test_sample_lists,padding_token='',padding_sentence_length=cnn_train_length) 65 | lstm_test_sample_arrays=np.array(word2vec.get_embedding_vector(lstm_test_sample_lists,embedding_model_path)) 66 | cnn_test_sample_arrays=np.array(word2vec.get_embedding_vector(cnn_test_sample_lists,embedding_model_path)) 67 | lstm_config=lstmconfig() 68 | cnn_config=cnnconfig() 69 | lstm_config.max_sentences_length=lstm_max_sentences_length 70 | cnn_config.max_sentences_length=cnn_max_sentences_length 71 | 72 | 73 | 74 | lstm_graph=tf.Graph() 75 | cnn_graph=tf.Graph() 76 | lstm_sess=tf.Session(graph=lstm_graph) 77 | cnn_sess=tf.Session(graph=cnn_graph) 78 | 79 | 80 | with lstm_sess.as_default(): 81 | with lstm_graph.as_default(): 82 | lstm = lstm_model.TextLSTM(config=lstm_config) 83 | lstm_saver = tf.train.Saver() 84 | lstm_saver.restore(lstm_sess, "./data/lstm/text_model") 85 | def lstm_test_step(x_batch): 86 | feed_dict={ 87 | lstm.input_x:x_batch, 88 | lstm.dropout_keep_prob:lstm_config.dropout_keep_prob 89 | } 90 | scores=lstm_sess.run( 91 | [lstm.softmax_result], 92 | feed_dict=feed_dict 93 | ) 94 | return scores 95 | 96 | 97 | lstm_scores = lstm_test_step(lstm_test_sample_arrays) 98 | 99 | 100 | with cnn_sess.as_default(): 101 | with cnn_graph.as_default(): 102 | cnn = cnn_model.TextCNN(config=cnn_config) 103 | cnn_saver = tf.train.Saver() 104 | cnn_saver.restore(cnn_sess, "./data/cnn/text_model") 105 | def cnn_test_step(x_batch): 106 | feed_dict={ 107 | cnn.input_x:x_batch, 108 | cnn.dropout_keep_prob:cnn_config.dropout_keep_prob 109 | } 110 | scores=cnn_sess.run( 111 | [cnn.softmax_result], 112 | feed_dict=feed_dict 113 | ) 114 | return scores 115 | 116 | 117 | cnn_scores = cnn_test_step(cnn_test_sample_arrays) 118 | 119 | lstm_sess.close() 120 | cnn_sess.close() 121 | mixed_scores=np.sum([lstm_scores,cnn_scores],axis=0) 122 | predictions=np.argmax(mixed_scores,axis=2) 123 | return np.array(predictions) -------------------------------------------------------------------------------- /readdata.py: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | import numpy as np 3 | 
import re 4 | import os 5 | import pickle 6 | import jieba 7 | 8 | 9 | def save(content,path): 10 | ''' 11 | 把content用pickle方式存到path里 12 | ''' 13 | f=open(path,'wb') 14 | pickle.dump(content,f) 15 | f.close() 16 | print("file has been saved") 17 | 18 | 19 | def clean_str(string): 20 | ''' 21 | 接收string,返回去除各种符号的string 22 | ''' 23 | string=re.sub("[^\u4e00-\u9fff]"," ",string) 24 | string = re.sub(r"\s{2,}", " ", string) 25 | return string 26 | 27 | 28 | def split_str(string): 29 | ''' 30 | 接收string,返回各个词间用空格隔开的string 31 | ''' 32 | return " ".join([word for word in jieba.cut(string,HMM=True)]) 33 | 34 | 35 | def get_cleaned_list(file_path): 36 | ''' 37 | 接收文件全路径,返回次txt文件的分词好的列表 38 | ''' 39 | print("read txt now..............") 40 | f=open(file_path,'r',encoding="utf8") 41 | lines=list(f.readlines()) 42 | lines=[clean_str(split_str(line)) for line in lines] 43 | f.close() 44 | print("read txt finished") 45 | return lines 46 | 47 | 48 | def padding_sentences(no_padding_lists, padding_token='',padding_sentence_length = None): 49 | ''' 50 | 接收句子列表,将所有句子填充为一样长 51 | ''' 52 | print("padding sentences now..............") 53 | all_sample_lists=[sentence.split(' ') for sentence in no_padding_lists] 54 | if padding_sentence_length != None: 55 | max_sentence_length=padding_sentence_length 56 | else: 57 | max_sentence_length=max([len(sentence) for sentence in all_sample_lists]) 58 | for i,sentence in enumerate(all_sample_lists): 59 | if len(sentence) > max_sentence_length: 60 | all_sample_lists[i]=sentence[:max_sentence_length] 61 | else: 62 | sentence.extend([padding_token] * (max_sentence_length - len(sentence))) 63 | print("padding sentences finished") 64 | return (all_sample_lists,max_sentence_length) 65 | 66 | 67 | def get_all_data_from_file(positive_file_path,negative_file_path,force_len=None): 68 | ''' 69 | positive_file_path:正评价txt全路径 70 | negative_file_path:负评价txt全路径 71 | ''' 72 | positive_sample_lists=get_cleaned_list(positive_file_path) 73 | negative_sample_lists=get_cleaned_list(negative_file_path) 74 | positive_label_lists=[[0,1] for _ in positive_sample_lists] 75 | negative_label_lists=[[1,0] for _ in negative_sample_lists] 76 | 77 | all_sample_lists = positive_sample_lists + negative_sample_lists #样本为list类型!! 78 | if force_len == None: 79 | all_sample_lists, max_sentences_length = padding_sentences(all_sample_lists) #样本为list类型!! 80 | else: 81 | all_sample_lists, max_sentences_length = padding_sentences(all_sample_lists,padding_token='',padding_sentence_length = force_len) # 样本为list类型!! 
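# Labels are one-hot: [0,1] marks a positive sample and [1,0] a negative one (built above).
# Concatenating them in the same positive-then-negative order as the samples keeps each
# sentence aligned with its label.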
82 | all_label_arrays=np.concatenate([positive_label_lists,negative_label_lists], 0) #标签为array类型 83 | 84 | return (all_sample_lists,all_label_arrays,max_sentences_length) 85 | 86 | 87 | def batch_iter(data, batch_size, num_epochs, shuffle=False): 88 | ''' 89 | 生成batches迭代对象 90 | ''' 91 | data = np.array(data) 92 | data_size = len(data) 93 | num_batches_per_epoch = int((data_size - 1) / batch_size) + 1 94 | for epoch in range(num_epochs): 95 | if shuffle: 96 | #顺序打乱 97 | shuffle_indices = np.random.permutation(np.arange(data_size)) 98 | shuffled_data = data[shuffle_indices] 99 | else: 100 | shuffled_data = data 101 | for batch_num in range(num_batches_per_epoch): 102 | start_idx = batch_num * batch_size 103 | end_idx = min((batch_num + 1) * batch_size, data_size) 104 | yield shuffled_data[start_idx : end_idx] 105 | 106 | 107 | def batch_iter_test(data, batch_size, num_epochs, shuffle=False): 108 | ''' 109 | 生成batches迭代对象 110 | ''' 111 | data = np.array(data) 112 | data_size = len(data) 113 | num_batches_per_epoch = int((data_size - 1) / batch_size) + 1 114 | for epoch in range(num_epochs): 115 | if shuffle: 116 | #顺序打乱 117 | shuffle_indices = np.random.permutation(np.arange(data_size)) 118 | shuffled_data = data[shuffle_indices] 119 | else: 120 | shuffled_data = data 121 | for batch_num in range(num_batches_per_epoch): 122 | start_idx = batch_num * batch_size 123 | end_idx = min((batch_num + 1) * batch_size, data_size) 124 | yield shuffled_data[start_idx : end_idx] 125 | 126 | 127 | def loadDict(train_data_path): 128 | f=open(train_data_path,'rb') 129 | params=pickle.load(f) 130 | return params -------------------------------------------------------------------------------- /lstm_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import lstm_model 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | 10 | #文件路径 11 | current_path=os.path.abspath(os.curdir) 12 | data_path="./data" 13 | positive_file_path="./data//pos.txt" 14 | negative_file_path="./data//neg.txt" 15 | embedding_model_path="./data//embedding_64.bin" 16 | train_data_path="./data//lstm//training_params.pickle" 17 | log_path="./summary//lstm" 18 | 19 | 20 | #模型超参 21 | class config(): 22 | test_sample_percentage=0.03 23 | num_labels=2 24 | embedding_size=64 25 | dropout_keep_prob=0.6 26 | batch_size=64 27 | num_epochs=80 28 | max_sentences_length=40 29 | num_layers=2 30 | max_grad_norm=5 31 | l2_rate=0.0001 32 | 33 | #加载数据 34 | all_sample_lists,all_label_arrays,max_sentences_length=readdata.get_all_data_from_file(positive_file_path,negative_file_path,force_len=40) 35 | all_sample_arrays=np.array(word2vec.get_embedding_vector(all_sample_lists,embedding_model_path)) 36 | del all_sample_lists 37 | print("sample.shape = {}".format(all_sample_arrays.shape)) 38 | print("label.shape = {}".format(all_label_arrays.shape)) 39 | trainconfig=config() 40 | trainconfig.max_sentences_length=max_sentences_length 41 | testconfig=config() 42 | testconfig.max_sentences_length=max_sentences_length 43 | testconfig.dropout_keep_prob=1.0 44 | 45 | #存储训练参数 46 | params={"num_labels":trainconfig.num_labels,"max_sentences_length":max_sentences_length} 47 | readdata.save(params,train_data_path) 48 | 49 | #打乱样本顺序 50 | np.random.seed(10) 51 | random_index=np.random.permutation(np.arange(len(all_label_arrays))) 52 | random_sample_arrays=all_sample_arrays[random_index] 53 | del all_sample_arrays 54 | random_label_arrays=all_label_arrays[random_index] 55 | 
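The block above shuffles samples and labels with one shared permutation, and the block below then slices off the first `test_sample_percentage` of the shuffled data as the held-out test set. A minimal standalone sketch of that pattern on toy arrays (the shapes and the 30% ratio are illustrative, not the real embedded data):

```python
import numpy as np

# Toy stand-ins for the embedded samples and one-hot labels.
samples = np.arange(20).reshape(10, 2)          # 10 "sentences", 2 features each
labels = np.eye(2)[[0, 1] * 5]                  # alternating one-hot labels

np.random.seed(10)
perm = np.random.permutation(len(labels))       # one permutation shared by both arrays
samples, labels = samples[perm], labels[perm]   # rows stay paired after shuffling

num_tests = int(0.3 * len(labels))              # illustrative 30% test split
test_x, train_x = samples[:num_tests], samples[num_tests:]
test_y, train_y = labels[:num_tests], labels[num_tests:]
print("Train/Test split: {:d}/{:d}".format(len(train_y), len(test_y)))  # -> 7/3
```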
56 | #按比例抽取测试样本 57 | num_tests=int(trainconfig.test_sample_percentage*len(all_label_arrays)) 58 | del all_label_arrays 59 | test_sample_arrays=random_sample_arrays[:num_tests] 60 | train_sample_arrays=random_sample_arrays[num_tests:] 61 | del random_sample_arrays 62 | train_label_arrays=random_label_arrays[num_tests:] 63 | test_label_arrays=random_label_arrays[:num_tests] 64 | del random_label_arrays 65 | print("Train/Test split: {:d}/{:d}".format(len(train_label_arrays), len(test_label_arrays))) 66 | 67 | 68 | #开始训练 69 | with tf.Graph().as_default(): 70 | sess=tf.Session() 71 | with sess.as_default(): 72 | lstm=lstm_model.TextLSTM(config=trainconfig) 73 | 74 | #初始化参数 75 | train_writer = tf.summary.FileWriter(log_path + '/train', sess.graph) 76 | test_writer = tf.summary.FileWriter(log_path + '/test') 77 | step_num=0 78 | sess.run(tf.global_variables_initializer()) 79 | saver=tf.train.Saver() 80 | 81 | 82 | #定义训练函数 83 | def train_step(x_batch,y_batch): 84 | feed_dict={ 85 | lstm.input_x:x_batch, 86 | lstm.input_y:y_batch, 87 | lstm.dropout_keep_prob:config.dropout_keep_prob, 88 | } 89 | merged,loss,accuracy,_=sess.run( 90 | [lstm.summary_op,lstm.loss,lstm.accuracy,lstm.train_op], 91 | feed_dict=feed_dict 92 | ) 93 | return (merged,loss,accuracy) 94 | 95 | #定义测试函数 96 | def test_step(x_batch,y_batch): 97 | feed_dict={ 98 | lstm.input_x:x_batch, 99 | lstm.input_y:y_batch, 100 | lstm.dropout_keep_prob:testconfig.dropout_keep_prob 101 | } 102 | merged,loss, accuracy,_=sess.run( 103 | [lstm.summary_op,lstm.loss,lstm.accuracy,lstm.train_op], 104 | feed_dict=feed_dict 105 | ) 106 | return (merged,loss,accuracy) 107 | 108 | #生成批数据 109 | batches=readdata.batch_iter( 110 | list(zip(train_sample_arrays,train_label_arrays)),trainconfig.batch_size,trainconfig.num_epochs) 111 | 112 | #正式开始训练啦 113 | for batch in batches: 114 | step_num += 1 115 | x_batch,y_batch=zip(*batch) 116 | merged,loss,accuracy=train_step(x_batch,y_batch) 117 | if step_num % 100 == 0: 118 | train_writer.add_summary(merged, step_num) 119 | print("For train_samples: step %d, loss %g, accuracy %g" % (step_num, loss, accuracy)) 120 | if step_num % 200 ==0: 121 | merged,loss,accuracy = test_step(test_sample_arrays, test_label_arrays) 122 | test_writer.add_summary(merged, step_num) 123 | print("For test_samples: step %d, loss %g, accuracy %g" % (step_num, loss, accuracy)) 124 | 125 | 126 | saver.save(sess,"data/lstm/text_model") -------------------------------------------------------------------------------- /cnn_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import readdata 3 | import word2vec 4 | import cnn_model 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | 10 | #文件路径 11 | current_path=os.path.abspath(os.curdir) 12 | data_path="./data" 13 | positive_file_path="./data//pos.txt" 14 | negative_file_path="./data//neg.txt" 15 | embedding_model_path="./data//embedding_64.bin" 16 | train_data_path="./data//cnn//training_params.pickle" 17 | log_path="./summary//cnn" 18 | 19 | 20 | #模型超参 21 | class config(): 22 | test_sample_percentage=0.03 23 | num_labels=2 24 | embedding_size=64 25 | filter_sizes=[2,3,4] 26 | num_filters=128 27 | dropout_keep_prob=0.5 28 | l2_reg_lambda=0.1 29 | batch_size=32 30 | num_epochs=15 31 | max_sentences_length=0 32 | lr_rate=1e-3 33 | 34 | 35 | #加载数据 36 | all_sample_lists,all_label_arrays,max_sentences_length=readdata.get_all_data_from_file(positive_file_path,negative_file_path,force_len=40) 37 | 
all_sample_arrays=np.array(word2vec.get_embedding_vector(all_sample_lists,embedding_model_path)) 38 | del all_sample_lists 39 | print("sample.shape = {}".format(all_sample_arrays.shape)) 40 | print("label.shape = {}".format(all_label_arrays.shape)) 41 | trainconfig=config() 42 | trainconfig.max_sentences_length=max_sentences_length 43 | testconfig=config() 44 | testconfig.max_sentences_length=max_sentences_length 45 | testconfig.dropout_keep_prob=1.0 46 | 47 | 48 | #存储训练参数 49 | params={"num_labels":trainconfig.num_labels,"max_sentences_length":max_sentences_length} 50 | readdata.save(params,train_data_path) 51 | 52 | #打乱样本顺序 53 | np.random.seed(10) 54 | random_index=np.random.permutation(np.arange(len(all_label_arrays))) 55 | random_sample_arrays=all_sample_arrays[random_index] 56 | del all_sample_arrays 57 | random_label_arrays=all_label_arrays[random_index] 58 | #按比例抽取测试样本 59 | num_tests=int(trainconfig.test_sample_percentage*len(all_label_arrays)) 60 | del all_label_arrays,random_index 61 | test_sample_arrays=random_sample_arrays[:num_tests] 62 | train_sample_arrays=random_sample_arrays[num_tests:] 63 | del random_sample_arrays 64 | test_label_arrays=random_label_arrays[:num_tests] 65 | train_label_arrays=random_label_arrays[num_tests:] 66 | del random_label_arrays 67 | print("Train/Test split: {:d}/{:d}".format(len(train_label_arrays), len(test_label_arrays))) 68 | 69 | #开始训练 70 | with tf.Graph().as_default(): 71 | sess=tf.Session() 72 | with sess.as_default(): 73 | cnn=cnn_model.TextCNN(config=trainconfig) 74 | 75 | #初始化参数 76 | train_writer = tf.summary.FileWriter(log_path + '/train', sess.graph) 77 | test_writer = tf.summary.FileWriter(log_path + '/test') 78 | step_num=0 79 | sess.run(tf.global_variables_initializer()) 80 | saver=tf.train.Saver() 81 | 82 | 83 | #定义训练函数 84 | def train_step(x_batch,y_batch,lr_rate): 85 | feed_dict={ 86 | cnn.input_x:x_batch, 87 | cnn.input_y:y_batch, 88 | cnn.dropout_keep_prob:trainconfig.dropout_keep_prob, 89 | cnn.learning_rate:lr_rate 90 | } 91 | summary,loss,accuracy,_=sess.run( 92 | [cnn.merged,cnn.loss,cnn.accuracy,cnn.train_op], 93 | feed_dict=feed_dict 94 | ) 95 | return (summary,loss,accuracy) 96 | 97 | #定义测试函数 98 | def test_step(x_batch,y_batch): 99 | feed_dict={ 100 | cnn.input_x:x_batch, 101 | cnn.input_y:y_batch, 102 | cnn.dropout_keep_prob:testconfig.dropout_keep_prob 103 | } 104 | summary,loss,accuracy=sess.run( 105 | [cnn.merged,cnn.loss,cnn.accuracy], 106 | feed_dict=feed_dict 107 | ) 108 | 109 | return (summary,loss,accuracy) 110 | 111 | #生成批数据 112 | batches=readdata.batch_iter( 113 | list(zip(train_sample_arrays, train_label_arrays)),trainconfig.batch_size,trainconfig.num_epochs) 114 | 115 | 116 | #正式开始训练啦 117 | for batch in batches: 118 | step_num += 1 119 | x_batch,y_batch=zip(*batch) 120 | summary,loss, accuracy=train_step(x_batch,y_batch,config.lr_rate) 121 | if step_num % 20 == 0: 122 | train_writer.add_summary(summary,step_num) 123 | #print("For train_samples: step %d, loss %g, accuracy %g" % (step_num,loss,accuracy)) 124 | summary,loss, accuracy = test_step(test_sample_arrays, test_label_arrays) 125 | #print("Testing loss: %g,Testing accuracy: %g" % (loss, accuracy)) 126 | test_writer.add_summary(summary, step_num) 127 | 128 | #_,loss, accuracy = test_step(test_sample_arrays, test_label_arrays) 129 | #print("Testing loss: %g,Testing accuracy: %g" % (loss, accuracy)) 130 | 131 | saver.save(sess,"data/cnn/text_model") 132 | train_writer.close() 133 | test_writer.close() 
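For orientation, the tensor shapes implied by the CNN hyper-parameters used above (embedding_size=64, filter_sizes=[2,3,4], num_filters=128, padded length forced to 40) can be traced with plain arithmetic. The sketch below only recomputes those shapes for illustration; it is not part of the training script:

```python
# Illustrative shape trace for TextCNN (Cnn_Model.py) under the hyper-parameters above.
embedding_size = 64
filter_sizes = [2, 3, 4]
num_filters = 128
sequence_length = 40      # force_len passed to get_all_data_from_file above
batch = 32                # config.batch_size

print("input_x:", (batch, sequence_length, embedding_size))
print("after expand_dims:", (batch, sequence_length, embedding_size, 1))
for f in filter_sizes:
    conv = (batch, sequence_length - f + 1, 1, num_filters)   # VALID conv: height shrinks, width collapses to 1
    pooled = (batch, 1, 1, num_filters)                        # max-pool over the remaining time axis
    print("filter size", f, "-> conv", conv, "-> pooled", pooled)
num_filters_total = num_filters * len(filter_sizes)
print("h_pooled_flat:", (batch, num_filters_total))            # (32, 384) fed into the softmax layer
```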
-------------------------------------------------------------------------------- /Lstm_Model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TextLSTM(object): 5 | def __init__(self,config): 6 | self.num_steps=config.max_sentences_length 7 | self.hidden_size=config.embedding_size 8 | self.num_classes=config.num_labels 9 | self.num_layers=config.num_layers 10 | self.batch_size=config.batch_size 11 | self.l2_rate=config.l2_rate 12 | self.input_x=tf.placeholder(tf.float32,[None,self.num_steps,self.hidden_size],name="input_x") 13 | self.input_y=tf.placeholder(tf.float32,[None,self.num_classes],name="input_y") 14 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 15 | 16 | 17 | 18 | with tf.variable_scope("Net",initializer=tf.orthogonal_initializer()): 19 | def lstm_cell(): 20 | return tf.contrib.rnn.BasicLSTMCell(self.hidden_size,forget_bias=1.0,state_is_tuple=True) 21 | 22 | 23 | attn_cell = lstm_cell 24 | if self.dropout_keep_prob is not None: 25 | def attn_cell(): 26 | return tf.contrib.rnn.DropoutWrapper(lstm_cell(),output_keep_prob=self.dropout_keep_prob) 27 | 28 | 29 | self.cell_fw=tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)], 30 | state_is_tuple=True) 31 | self.cell_bw=tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)], 32 | state_is_tuple=True) 33 | 34 | if self.dropout_keep_prob is not None: 35 | inputs=tf.nn.dropout(self.input_x,self.dropout_keep_prob) 36 | else: 37 | inputs=self.input_x 38 | 39 | #shape: (batch_size, num_steps,hidden_size) => (num_steps,batch_size,hidden_size) 40 | inputs= tf.transpose(inputs, [1,0,2]) 41 | outputs,state = tf.nn.bidirectional_dynamic_rnn(cell_fw=self.cell_fw, 42 | cell_bw=self.cell_bw, 43 | dtype="float32", 44 | inputs=inputs, 45 | swap_memory=True, 46 | time_major=True) 47 | outputs_fw,outputs_bw=outputs 48 | output_fw=outputs_fw[-1] 49 | output_bw=outputs_bw[-1] 50 | finial_output=tf.concat([output_fw,output_bw],1) 51 | with tf.name_scope("output"): 52 | softmax_w=tf.get_variable("softmax_w",[self.hidden_size*2,self.num_classes],dtype=tf.float32) 53 | softmax_b=tf.get_variable("softmax_b",[self.num_classes],dtype=tf.float32,initializer=tf.random_normal_initializer(stddev=0.01)) 54 | self.result=tf.matmul(finial_output,softmax_w)+softmax_b 55 | self.final_state=state 56 | self.predictions=tf.argmax(self.result,1,name="predictions") 57 | tf.summary.histogram("softmax_w",softmax_w) 58 | tf.summary.histogram("softmax_b",softmax_b) 59 | self.softmax_result=tf.nn.softmax(self.result) 60 | 61 | 62 | #计算损失 63 | with tf.name_scope("loss"): 64 | losses=tf.nn.softmax_cross_entropy_with_logits(logits=self.result, labels=self.input_y) 65 | self.loss = tf.reduce_mean(losses) 66 | tf.summary.scalar("loss",self.loss) 67 | 68 | 69 | #计算正确率 70 | with tf.name_scope("accuracy"): 71 | correct_predictions=tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 72 | self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 73 | tf.summary.scalar("accuracy",self.accuracy) 74 | 75 | with tf.name_scope("train_op"): 76 | tvars = tf.trainable_variables() 77 | self.l2_loss = 0.001 * tf.reduce_sum([tf.nn.l2_loss(v) for v in tvars]) # 0.001是lambda超参数 78 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss+self.l2_loss, tvars), config.max_grad_norm) 79 | optimizer = tf.train.AdamOptimizer() 80 | optimizer.apply_gradients(zip(grads, tvars)) 81 | self.train_op = 
optimizer.apply_gradients(zip(grads, tvars)) 82 | 83 | with tf.name_scope("summary"): 84 | self.summary_op=tf.summary.merge_all() 85 | 86 | ''' 87 | class TextLSTM(object): 88 | def __init__(self,config): 89 | self.num_steps=config.max_sentences_length 90 | self.hidden_size=config.embedding_size 91 | self.num_classes=config.num_labels 92 | self.num_layers=config.num_layers 93 | self.batch_size=config.batch_size 94 | self.l2_rate=config.l2_rate 95 | self.input_x=tf.placeholder(tf.float32,[None,self.num_steps,self.hidden_size],name="input_x") 96 | self.input_y=tf.placeholder(tf.float32,[None,self.num_classes],name="input_y") 97 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 98 | 99 | 100 | 101 | with tf.variable_scope("Net",initializer=tf.orthogonal_initializer()): 102 | def lstm_cell(): 103 | return tf.contrib.rnn.BasicLSTMCell(self.hidden_size,forget_bias=1.0,state_is_tuple=True) 104 | 105 | 106 | attn_cell = lstm_cell 107 | if self.dropout_keep_prob is not None: 108 | def attn_cell(): 109 | return tf.contrib.rnn.DropoutWrapper(lstm_cell(),output_keep_prob=self.dropout_keep_prob) 110 | 111 | 112 | self.cell=tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)],state_is_tuple=True) 113 | if self.dropout_keep_prob is not None: 114 | inputs=tf.nn.dropout(self.input_x,self.dropout_keep_prob) 115 | else: 116 | inputs=self.input_x 117 | 118 | #shape: (batch_size, num_steps,hidden_size) => (num_steps,batch_size,hidden_size) 119 | inputs= tf.transpose(inputs, [1,0,2]) 120 | outputs,state = tf.nn.dynamic_rnn(cell=self.cell,dtype="float32",inputs=inputs,swap_memory=True,time_major=True) 121 | output=outputs[-1] 122 | with tf.name_scope("output"): 123 | softmax_w=tf.get_variable("softmax_w",[self.hidden_size,self.num_classes],dtype=tf.float32) 124 | softmax_b=tf.get_variable("softmax_b",[self.num_classes],dtype=tf.float32,initializer=tf.random_normal_initializer(stddev=0.01)) 125 | self.result=tf.matmul(output,softmax_w)+softmax_b 126 | self.final_state=state 127 | self.predictions=tf.argmax(self.result,1,name="predictions") 128 | tf.summary.histogram("softmax_w",softmax_w) 129 | tf.summary.histogram("softmax_b",softmax_b) 130 | self.softmax_result=tf.nn.softmax(self.result) 131 | 132 | 133 | #计算损失 134 | with tf.name_scope("loss"): 135 | losses=tf.nn.softmax_cross_entropy_with_logits(logits=self.result, labels=self.input_y) 136 | self.loss = tf.reduce_mean(losses) 137 | tf.summary.scalar("loss",self.loss) 138 | 139 | 140 | #计算正确率 141 | with tf.name_scope("accuracy"): 142 | correct_predictions=tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 143 | self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") 144 | tf.summary.scalar("accuracy",self.accuracy) 145 | 146 | with tf.name_scope("train_op"): 147 | tvars = tf.trainable_variables() 148 | grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), config.max_grad_norm) 149 | optimizer = tf.train.AdamOptimizer() 150 | optimizer.apply_gradients(zip(grads, tvars)) 151 | self.train_op = optimizer.apply_gradients(zip(grads, tvars)) 152 | 153 | with tf.name_scope("summary"): 154 | self.summary_op=tf.summary.merge_all() 155 | ''' -------------------------------------------------------------------------------- /weibo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/3/1 19:54 4 | # @Author : 孑曦曦孑 5 | # @File : visitor_weibo_login.py 6 | 7 | import requests 8 | 
import re 9 | import json 10 | from lxml import etree 11 | import time 12 | import os 13 | 14 | # url="https://weibo.com/2447680824/G5nMd0MBJ?type=comment#_rnd1519906057635" 15 | #模拟游客登录获取cookies 16 | class visitor(): 17 | def __init__(self,url): 18 | try: 19 | self.cookies,self.headers=self.get_cookies() 20 | self.id=self.weibo_spider(url) 21 | self.fail=False 22 | except: 23 | print("模拟失败-。-") 24 | self.fail=True 25 | 26 | 27 | def get_cookies(self): 28 | # 获取dict_data 29 | print("正在模拟游客登录") 30 | S = requests.session() 31 | url = "https://passport.weibo.com/visitor/genvisitor" 32 | S.headers.update( 33 | { 34 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0" 35 | } 36 | ) 37 | 38 | data = { 39 | "cb": "gen_callback", 40 | "fp": '{"os":"1","browser":"Gecko60,0,0,0","fonts":"undefined","screenInfo":"1536*864*24","plugins":""}' 41 | } 42 | response = S.post(url, data=data) 43 | pattren=re.compile(r"\((.*)\)") 44 | data=pattren.findall(response.text)[0] 45 | dict_data=json.loads(data)["data"] 46 | tid=dict_data["tid"] 47 | confidence=dict_data["confidence"] 48 | where=dict_data["new_tid"] 49 | if where: 50 | where=3 51 | else: 52 | where=2 53 | while(len(str(confidence))<3): 54 | confidence="0"+str(confidence) 55 | #tid="KCEsfUfkAmyXExt9tiPN61f32Vvh4wViWQaeHptBZLc=" 56 | #手动编码格式转换 57 | tid=tid.replace("+","%2b") 58 | tid=tid.replace("=","%3d") 59 | 60 | url="https://passport.weibo.com/visitor/visitor?a=incarnate"\ 61 | "&t="+str(tid)+ \ 62 | "&w=" + str(where) + \ 63 | "&c="+str(confidence)+\ 64 | "&gc="\ 65 | "&cb=cross_domain" \ 66 | "&from=weibo" 67 | response=S.get(url) 68 | data=pattren.findall(response.text)[0] 69 | #https://passport.weibo.com/visitor/visitor?a=incarnate&t=hVEmh0nd32++OFXP3wiB6b05C9A5L38fmq7ArFKTNq8=&w=2&c=095&gc=&cb=cross_domain&from=weibo' 70 | #https://passport.weibo.com/visitor/visitor?a=incarnate&t=+A1gVsii+zY9OI9v/e+o1lfhlTPQ20U3Fkuz8nn/7rU=&w=2&c=095&gc=&cb=cross_domain&from=weibo&_rand=0.42337865580692513 71 | dict_data = json.loads(data) 72 | if "succ" not in dict_data["msg"]: 73 | printf("tid不合法") 74 | printf("dict_data:",dict_data) 75 | self.fail=True 76 | return None,None 77 | dict_data=dict_data["data"] 78 | sub=dict_data["sub"] #没有 79 | subp=dict_data["subp"] 80 | # print(sub,subp) 81 | url="https://login.sina.com.cn/visitor/visitor?a=crossdomain&cb=return_back"\ 82 | +"&s="+str(sub)\ 83 | +"&sp="+str(subp)\ 84 | +"&from=weibo" 85 | response=S.get(url) 86 | print("成功获取游客Cookies") 87 | return S.cookies,S.headers 88 | 89 | #base62解码 90 | def base62(self,string): 91 | alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 92 | base = len(alphabet) 93 | strlen = len(string) 94 | num = 0 95 | idx = 0 96 | for char in string: 97 | power = (strlen - (idx + 1)) 98 | num += alphabet.index(char) * (base ** power) 99 | idx += 1 100 | return num 101 | 102 | def weibo_spider(self,url): 103 | S=requests.session() 104 | S.cookies=self.cookies 105 | S.headers=self.headers 106 | response=S.get(url) 107 | #获取微博大致内容 108 | selector = etree.HTML(response.text) 109 | weibo=selector.xpath('/title/text()') 110 | page = etree.HTML(response.text) 111 | content = page.xpath('//title/text()')[0] 112 | content=str(content).replace("\n"," ") 113 | print("爬取ing\n\t",content) 114 | # #获取异步加载url中的id 115 | pattern=re.compile(r"\d\/(.*)\?.*type") 116 | # https://weibo.com/5678693647/GaERmaQ33?filter=hot&root_comment_id=0&type=comment 117 | # https://weibo.com/5678693647/GaERmaQ33?type=comment 118 | 
content=pattern.search(url).group(1) 119 | string1=str(self.base62(content[0])) 120 | string2=str(self.base62(content[1:5])) 121 | while(len(string2)<7): 122 | string2="0"+string2 123 | string3=str(self.base62(content[5:])) 124 | while (len(string3) < 7): 125 | string3 = "0" + string3 126 | id=string1+string2+string3 127 | return id 128 | 129 | def catch_comments(self,page=1,past=None): 130 | #模拟异步加载 131 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4213083327566698&filter=hot&page=1 132 | path="./data" 133 | if(page==1): 134 | print("开始爬取~") 135 | #判断是否存在该文件夹 136 | if not os.path.exists(path): 137 | os.mkdir(path) 138 | S=requests.session() 139 | S.cookies=self.cookies 140 | S.headers=self.headers 141 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4228711421054898 142 | # &root_comment_max_id=259085778882534&root_comment_max_id_type=0& 143 | # root_comment_ext_param=&page=2&filter=hot 144 | # &sum_comment_number=7215&filter_tips_before=1 145 | # &from=singleWeiBo&__rnd=1525703793076 146 | 147 | url="https://weibo.com/aj/v6/comment/big?ajwvr=6"\ 148 | +"&id="+str(self.id)\ 149 | +"&page="+str(page) \ 150 | + "&filter=hot" \ 151 | +"&from=singleWeiBo" 152 | # print(url) 153 | response=S.get(url) 154 | html=json.loads(response.text)["data"]["html"] 155 | #如果两次相同表示结束了 -。- 156 | # if past_html==html: 157 | # print("爬取结束") 158 | # # print(self.id) 159 | # print("共",page,"页") 160 | # return 161 | #搜索评论 162 | text=etree.HTML(html) 163 | #print(html) 164 | #评论数-xpath 165 | # comments=text.xpath('//div[@class="list_li S_line1 clearfix"]//div[@class="WB_text"]//text()') 166 | comments = text.xpath('//div[@class="list_li S_line1 clearfix"]/*/div[@class="WB_text"]') 167 | # 评论数 168 | points = text.xpath('//div[@class="list_li S_line1 clearfix"]//*/span[@node-type="like_status"]/child::*[2]//text()') 169 | #点赞数-xpath 170 | # points 171 | pattern = re.compile(r'\:(.*)') 172 | # try: 173 | if page==1: 174 | wa="w" 175 | else: 176 | wa="a" 177 | f=open("./data/test.txt", wa,encoding='utf-8') 178 | # f2=open("weibo_points.txt",wa,encoding='utf-8') 179 | for i in range(len(comments)): 180 | comment = comments[i].xpath("text()") 181 | comment = ",".join(comment[1:])[1:].strip() 182 | point = points[i] 183 | if i==0: 184 | now = {"len": len(comments), "comment":comment} 185 | if now==past: 186 | print("爬取结束") 187 | return 188 | else: 189 | past=now 190 | if point == "赞": 191 | point = "0" 192 | #点赞数为权重0.2 193 | weights=int(0.2*int(point)) 194 | #写入评论 195 | comment=comment+"\n" 196 | f.write(comment) 197 | for i in range(weights): 198 | f.write(comment) 199 | #写入点赞数 200 | print("已写入", page, "页") 201 | f.close() 202 | # except : 203 | # print("写入文件失败") 204 | 205 | page+=1 206 | self.catch_comments(page,past) 207 | 208 | #获取图片 209 | def catch_pictures(self,page=1,past_html=None): 210 | # 模拟异步加载 211 | # https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4213083327566698&filter=hot&page=1 212 | path = "./weibo-pic" 213 | if (page == 1): 214 | print("开始爬取~") 215 | #创建文件夹 216 | if not os.path.exists(path): 217 | os.makedirs(path) 218 | S = requests.session() 219 | S.cookies = self.cookies 220 | S.headers = self.headers 221 | url = "https://weibo.com/aj/v6/comment/big?ajwvr=6" \ 222 | + "&id=" + str(self.id) \ 223 | + "&page=" + str(page) \ 224 | + "&filter=hot" \ 225 | + "&from=singleWeiBo" 226 | response = S.get(url) 227 | html = json.loads(response.text)["data"]["html"] 228 | # 如果两次相同表示结束了 -。- 229 | if past_html == html: 230 | print("爬取结束") 231 | # print(self.id) 232 | print("共", page, "页") 233 | return 234 | # 
搜索图片链接 235 | text = etree.HTML(html) 236 | ids = text.xpath('//li[@action-type="comment_media_img"]/attribute::action-data') 237 | # 写入图片 238 | try: 239 | for id in ids: 240 | id = id.split("&") 241 | id = id[0][4:] 242 | url = "https://wx3.sinaimg.cn/bmiddle/" + id + ".jpg" 243 | filename = path + "/" + str(id) + ".jpg" 244 | response = requests.get(url, stream=True) 245 | with open(filename, "wb") as f: 246 | for chunk in response.iter_content(128): 247 | f.write(chunk) 248 | except: 249 | print("写入失败") 250 | page += 1 251 | print(page) 252 | self.catch_pictures(page, html) 253 | 254 | #模块调用 255 | def start(url=None): 256 | if url==None: 257 | print("请输入正确的url") 258 | return 259 | else: 260 | spider=visitor(url) 261 | spider.catch_comments() 262 | 263 | if __name__=="__main__": 264 | url="https://weibo.com/2387903701/G5bn7s5CS?type=comment" 265 | url = input("输入需要爬取的微博url:\n") 266 | spider=visitor(url) 267 | if spider.fail==False: 268 | spider.catch_comments() 270 | # spider.catch_pictures() 273 | #https://weibo.com/1840483562/G48Ajgfhq?type=comment 274 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4209863153871988&filter=hot&page=1 275 | 276 | #https://weibo.com/2387903701/G5bn7s5CS?type=comment 277 | #https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4212353576694814&filter=hot&page=12 278 | 279 | 280 | #------------------ 281 | # ~。~ nice 282 | #------------------ 284 | --------------------------------------------------------------------------------
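For reference, the id construction in weibo.visitor.weibo_spider works by base62-decoding the short code at the end of a post URL in chunks of 1 + 4 + 4 characters and zero-padding the last two chunks to 7 digits. A standalone sketch of that conversion (the 9-character code below is taken from the example URLs in the comments, purely for illustration):

```python
# Standalone version of the short-code -> numeric id conversion used in weibo.py.
ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

def base62_decode(s):
    num = 0
    for ch in s:                      # same digit-by-digit accumulation as visitor.base62
        num = num * len(ALPHABET) + ALPHABET.index(ch)
    return num

def mid_to_id(code):
    # Decode chunks of 1, 4 and 4 characters, pad the trailing chunks to 7 digits,
    # then concatenate the decimal strings (mirrors visitor.weibo_spider).
    parts = [code[0], code[1:5], code[5:]]
    first, middle, last = (str(base62_decode(p)) for p in parts)
    return first + middle.zfill(7) + last.zfill(7)

print(mid_to_id("GaERmaQ33"))   # illustrative code from the commented example URL above
```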