├── GetDict.py
├── README.md
├── SentimentAnalysis_LSTM.py
├── content.bin
└── data
    ├── train_sentiment.csv
    └── train_word.csv

/GetDict.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
import csv
import codecs
import pandas as pd

# Helpers for building and reading nested dictionaries


# Return the nested dictionary we need
# Input: a file name; output: a dictionary
def getDict(filename):
    read = open(filename, 'r')   # open the file
    lists = read.readlines()     # read all lines
    row_words = {}               # dictionary holding the data of a single row
    words_dict = {}              # all data stored in dictionary form
    row_words_length = []        # length of each row
    linelist = []                # one row split into a list
    row_length = len(lists)      # total number of rows
    # print row_length
    # get the length of each row and store it in a list
    for length in lists:
        row_words_length.append(len(length.strip('\n').split(';')) - 1)
    # print(arrays)
    i = 0  # running row counter
    # convert to a dictionary
    for line in lists:
        if i
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
> 1. So far only the part that automatically reads the csv file and converts it to a list is finished
> 2. When the model runs, that list is passed in; once training is done, the csv file to be predicted is converted to a list, predictions are made, and the results are written back to csv format, matched one-to-one with the original words
> 3. The later steps described above are not finished yet
> 4. The code for converting to a nested dictionary and a list lives here, with a detailed walkthrough: https://github.com/xs-L/NestedDict
- Feel free to star the repo


--------------------------------------------------------------------------------
/SentimentAnalysis_LSTM.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-

import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup

import sys
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Masking
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Sequential, Model

from keras.preprocessing.text import one_hot

import GetDict

#SENTENCE_NUM = 44000    # total number of movie-review samples
MAX_SEQUENCE_LENGTH = 3  # every sentence is padded/truncated to this length
MAX_NB_WORDS = 50000     # maximum number of words to keep
EMBEDDING_DIM = 100      # dimension of the word vectors
VALIDATION_SPLIT = 0.2   # fraction of the data held out for validation (split after shuffling)


# read the movie reviews
data_texts = GetDict.readDict(GetDict.getDict('/SentimentAnalysis_LSTM/data/train_word.csv')).values()
data_labels = GetDict.readDict(GetDict.getDict('/SentimentAnalysis_LSTM/data/train_sentiment.csv')).values()
#print data_texts
#print data_labels

DIR = "/SentimentAnalysis_LSTM"  # change this to your own path
# word vectors for the corresponding words
embeddings_index = {}
f = open(os.path.join(DIR, 'content.bin'))  # pre-trained word vectors
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))
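# --- Illustrative addition, not part of the original repo --------------------------
# The loop above assumes content.bin is a *text* file whose lines look like
# "<word> <v1> <v2> ... <v100>", i.e. a token followed by EMBEDDING_DIM floats
# (for example a word2vec export saved with binary=False). The helper below is only
# a hedged sanity check of that assumption; the name check_embeddings is made up here.
def check_embeddings(index, dim=EMBEDDING_DIM):
    # collect words whose vector does not have the expected number of dimensions
    bad = [w for w, v in index.items() if v.shape[0] != dim]
    if bad:
        print('%d vectors do not have %d dimensions, e.g. %s' % (len(bad), dim, bad[0]))
    return len(bad) == 0

check_embeddings(embeddings_index)
# ------------------------------------------------------------------------------------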
labels = to_categorical(np.asarray(data_labels))
texts = data_texts
print('%d texts, %d labels' % (len(texts), len(labels)))

# Tokenizer vectorises text: it turns each document into a sequence of word indices
# (indices into the dictionary, counted from 1)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# build the token dictionary from the training texts
tokenizer.fit_on_texts(texts)
# list of sequences, one per input text: each document as a vector of word indices
sequences = tokenizer.texts_to_sequences(texts)
# word_index maps every word to its id, starting from 1
word_index = tokenizer.word_index
# sequences shorter than MAX_SEQUENCE_LENGTH are zero-padded at the front (pad_sequences default)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


# sample indices, used to shuffle texts and labels together
indices = np.arange(data.shape[0])
# shuffle the indices
np.random.shuffle(indices)
# reorder data and labels with the shuffled indices
data = data[indices]
labels = labels[indices]

#print data
#print labels


# number of samples held out for validation
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]   # training set: first 80% of the data
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]     # validation set: last 20% of the data
y_val = labels[-nb_validation_samples:]

# random floats in (0, 1), shape (vocabulary size + 1, EMBEDDING_DIM)
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index keep their random initialisation
        embedding_matrix[i] = embedding_vector
print('Length of embedding_matrix: %d' % embedding_matrix.shape[0])

# len(word_index) + 1: vocabulary size, i.e. largest input index + 1
# EMBEDDING_DIM: dimension of the dense embedding
# weights=[embedding_matrix]: list of numpy arrays used to initialise the layer weights
# input_length=MAX_SEQUENCE_LENGTH: length of the (fixed-length) input sequences
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training and validation set number of positive and negative reviews')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))


# Functional-API version of the same model, kept for reference:
# input tensor, one word index per position
#sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# feed the tensor through the embedding layer
#embedded_sequences = embedding_layer(sequence_input)
# Bidirectional is a wrapper for bidirectional RNNs; the LSTM has 100 units
#l_gru = Bidirectional(LSTM(100, return_sequences=False))(embedded_sequences)
# Dense: fully connected layer with 100 units; activation: the activation function
#dense_1 = Dense(100, activation='tanh')(l_gru)
#dense_2 = Dense(2, activation='softmax')(dense_1)

# assemble the components defined above into a model
#model = Model(sequence_input, dense_2)
# compile the model: loss function, optimizer, list of metrics
#model.compile(loss='categorical_crossentropy',
#              optimizer='rmsprop',
#              metrics=['acc'])
# print a model summary
#model.summary()
# train
#model.fit(x_train, y_train, validation_data=(x_val, y_val),
#          epochs=1, batch_size=1000)

model = Sequential()
#model.add(Dense(input_dim=4, init='uniform', activation='relu'))
#model.add(Dense(4, input_dim=))
model.add(embedding_layer)
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(100, activation='tanh'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()
model.fit(x_train, y_train,
          batch_size=1000, epochs=10, verbose=1,
          validation_data=(x_val, y_val))
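# --- Hedged addition, not in the original repo --------------------------------------
# Evaluate on the held-out split and persist the trained model. model.save needs the
# h5py package and a reasonably recent Keras; the file name below is illustrative only.
score = model.evaluate(x_val, y_val, verbose=0)
print('Validation loss: %.4f, validation accuracy: %.4f' % (score[0], score[1]))
model.save(os.path.join(DIR, 'sentiment_lstm.h5'))  # reload later with keras.models.load_model
# --------------------------------------------------------------------------------------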
def predict_proba(texts):
    # texts = GetDict.readDict(GetDict.getDict(texts)).values()
    # Reuse the tokenizer that was fitted on the training texts, so the word indices
    # match the embedding matrix; fitting a fresh Tokenizer here would remap them.
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    # data = one_hot(texts, 4)
    return model.predict_proba(data, verbose=0)


list1 = ['很不好', '你大爷', '够光滑', '后悔', '有问题', '挺坑的', '非常不值', '差不多', '特别好', '还不错', '还可以']
#list2 = GetDict.readDict(GetDict.getDict('test_word.csv')).values()
print(predict_proba(list1))
#print(predict_proba(list2))
--------------------------------------------------------------------------------
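One way to read the output above: predict_proba returns one row of softmax probabilities per phrase in list1. Assuming the labels in data/train_sentiment.csv are coded 0 = negative and 1 = positive (the dump does not confirm this, so the column order may differ), the predicted class and its probability can be recovered roughly like this:

    probs = predict_proba(list1)
    preds = np.argmax(probs, axis=1)                 # most probable class for each phrase
    for phrase, p, cls in zip(list1, probs, preds):
        print('%s -> class %d (p=%.3f)' % (phrase, cls, p[cls]))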