├── GetDict.py
├── README.md
├── SentimentAnalysis_LSTM.py
├── content.bin
└── data
    ├── train_sentiment.csv
    └── train_word.csv

/GetDict.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
import csv
import codecs
import pandas as pd

# Helpers for building and reading nested dictionaries


# Return the nested dictionary we need
# Input: a file name; output: a dictionary
def getDict(filename):
    read = open(filename, 'r')   # open the file
    lists = read.readlines()     # read all lines
    row_words = {}               # dictionary holding the data of a single row
    words_dict = {}              # all data stored in dictionary form
    row_words_length = []        # length of each row
    linelist = []                # one row split into a list
    row_length = len(lists)      # total number of rows
    # print row_length
    # get the length of each row and store it in a list
    for length in lists:
        row_words_length.append(len(length.strip('\n').split(';')) - 1)
    # print(arrays)
    i = 0  # running row counter
    # convert to a dictionary
    for line in lists:
        if i
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
> 1. So far only the part that automatically reads the csv file and converts it to a list is finished
> 2. When the model runs, that list is passed in; once training is done, the csv file to be predicted is converted to a list, predictions are made, and the results are written back to csv format, matched one-to-one with the original words
> 3. The later steps described above are not finished yet
> 4. The code for converting to a nested dictionary and a list lives here, with a detailed walkthrough: https://github.com/xs-L/NestedDict
- Feel free to star the repo


--------------------------------------------------------------------------------
/SentimentAnalysis_LSTM.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-

import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup

import sys
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Masking
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Sequential, Model

from keras.preprocessing.text import one_hot

import GetDict

#SENTENCE_NUM = 44000    # total number of movie-review samples
MAX_SEQUENCE_LENGTH = 3  # every sentence is padded/truncated to this length
MAX_NB_WORDS = 50000     # maximum number of words to keep
EMBEDDING_DIM = 100      # dimension of the word vectors
VALIDATION_SPLIT = 0.2   # fraction of the data held out for validation (split after shuffling)


# read the movie reviews
data_texts = GetDict.readDict(GetDict.getDict('/SentimentAnalysis_LSTM/data/train_word.csv')).values()
data_labels = GetDict.readDict(GetDict.getDict('/SentimentAnalysis_LSTM/data/train_sentiment.csv')).values()
#print data_texts
#print data_labels

DIR = "/SentimentAnalysis_LSTM"  # change this to your own path
# word vectors for the corresponding words
embeddings_index = {}
f = open(os.path.join(DIR, 'content.bin'))  # pre-trained word vectors
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))
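# --- Illustrative addition, not part of the original repo --------------------------
# The loop above assumes content.bin is a *text* file whose lines look like
# "<word> <v1> <v2> ... <v100>", i.e. a token followed by EMBEDDING_DIM floats
# (for example a word2vec export saved with binary=False). The helper below is only
# a hedged sanity check of that assumption; the name check_embeddings is made up here.
def check_embeddings(index, dim=EMBEDDING_DIM):
    # collect words whose vector does not have the expected number of dimensions
    bad = [w for w, v in index.items() if v.shape[0] != dim]
    if bad:
        print('%d vectors do not have %d dimensions, e.g. %s' % (len(bad), dim, bad[0]))
    return len(bad) == 0

check_embeddings(embeddings_index)
# ------------------------------------------------------------------------------------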
labels = to_categorical(np.asarray(data_labels))
texts = data_texts
print('%d texts, %d labels' % (len(texts), len(labels)))

# Tokenizer vectorises text: it turns each document into a sequence of word indices
# (indices into the dictionary, counted from 1)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# build the token dictionary from the training texts
tokenizer.fit_on_texts(texts)
# list of sequences, one per input text: each document as a vector of word indices
sequences = tokenizer.texts_to_sequences(texts)
# word_index maps every word to its id, starting from 1
word_index = tokenizer.word_index
# sequences shorter than MAX_SEQUENCE_LENGTH are zero-padded at the front (pad_sequences default)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


# sample indices, used to shuffle texts and labels together
indices = np.arange(data.shape[0])
# shuffle the indices
np.random.shuffle(indices)
# reorder data and labels with the shuffled indices
data = data[indices]
labels = labels[indices]

#print data
#print labels


# number of samples held out for validation
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]   # training set: first 80% of the data
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]     # validation set: last 20% of the data
y_val = labels[-nb_validation_samples:]

# random floats in (0, 1), shape (vocabulary size + 1, EMBEDDING_DIM)
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index keep their random initialisation
        embedding_matrix[i] = embedding_vector
print('Length of embedding_matrix: %d' % embedding_matrix.shape[0])

# len(word_index) + 1: vocabulary size, i.e. largest input index + 1
# EMBEDDING_DIM: dimension of the dense embedding
# weights=[embedding_matrix]: list of numpy arrays used to initialise the layer weights
# input_length=MAX_SEQUENCE_LENGTH: length of the (fixed-length) input sequences
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training and validation set number of positive and negative reviews')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))


# Functional-API version of the same model, kept for reference:
# input tensor, one word index per position
#sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# feed the tensor through the embedding layer
#embedded_sequences = embedding_layer(sequence_input)
# Bidirectional is a wrapper for bidirectional RNNs; the LSTM has 100 units
#l_gru = Bidirectional(LSTM(100, return_sequences=False))(embedded_sequences)
# Dense: fully connected layer with 100 units; activation: the activation function
#dense_1 = Dense(100, activation='tanh')(l_gru)
#dense_2 = Dense(2, activation='softmax')(dense_1)

# assemble the components defined above into a model
#model = Model(sequence_input, dense_2)
# compile the model: loss function, optimizer, list of metrics
#model.compile(loss='categorical_crossentropy',
#              optimizer='rmsprop',
#              metrics=['acc'])
# print a model summary
#model.summary()
# train
#model.fit(x_train, y_train, validation_data=(x_val, y_val),
#          epochs=1, batch_size=1000)

model = Sequential()
#model.add(Dense(input_dim=4, init='uniform', activation='relu'))
#model.add(Dense(4, input_dim=))
model.add(embedding_layer)
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(100, activation='tanh'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()
model.fit(x_train, y_train,
          batch_size=1000, epochs=10, verbose=1,
          validation_data=(x_val, y_val))
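# --- Hedged addition, not in the original repo --------------------------------------
# Evaluate on the held-out split and persist the trained model. model.save needs the
# h5py package and a reasonably recent Keras; the file name below is illustrative only.
score = model.evaluate(x_val, y_val, verbose=0)
print('Validation loss: %.4f, validation accuracy: %.4f' % (score[0], score[1]))
model.save(os.path.join(DIR, 'sentiment_lstm.h5'))  # reload later with keras.models.load_model
# --------------------------------------------------------------------------------------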
def predict_proba(texts):
    # texts = GetDict.readDict(GetDict.getDict(texts)).values()
    # Reuse the tokenizer that was fitted on the training texts, so the word indices
    # match the embedding matrix; fitting a fresh Tokenizer here would remap them.
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    # data = one_hot(texts, 4)
    return model.predict_proba(data, verbose=0)


list1 = ['很不好', '你大爷', '够光滑', '后悔', '有问题', '挺坑的', '非常不值', '差不多', '特别好', '还不错', '还可以']
#list2 = GetDict.readDict(GetDict.getDict('test_word.csv')).values()
print(predict_proba(list1))
#print(predict_proba(list2))
--------------------------------------------------------------------------------
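One way to read the output above: predict_proba returns one row of softmax probabilities per phrase in list1. Assuming the labels in data/train_sentiment.csv are coded 0 = negative and 1 = positive (the dump does not confirm this, so the column order may differ), the predicted class and its probability can be recovered roughly like this:

    probs = predict_proba(list1)
    preds = np.argmax(probs, axis=1)                 # most probable class for each phrase
    for phrase, p, cls in zip(list1, probs, preds):
        print('%s -> class %d (p=%.3f)' % (phrase, cls, p[cls]))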