├── .gitattributes
├── .gitignore
├── README.md
├── embedding_refine.py
├── glove_twitter_refine.py
└── lexicon
    └── eanew_seed.txt

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Windows thumbnail cache files
Thumbs.db
ehthumbs.db
ehthumbs_vista.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# =========================
# Operating System Files
# =========================

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Refining Word Embeddings for Sentiment Analysis
Code for the paper "Refining Word Embeddings for Sentiment Analysis".

## Abstract
Word embeddings that can capture semantic and syntactic information from contexts have been extensively used for various natural language processing tasks. However, existing methods for learning context-based word embeddings typically fail to capture sufficient sentiment information. This may result in words with similar vector representations having an opposite sentiment polarity (e.g., good and bad), thus degrading sentiment analysis performance. Therefore, this study proposes a word vector refinement model that can be applied to any pre-trained word vectors (e.g., Word2vec and GloVe). The refinement model is based on adjusting the vector representations of words such that they can be closer to both semantically and sentimentally similar words and further away from sentimentally dissimilar words. Experimental results show that the proposed method can improve conventional word embeddings and outperform previously proposed sentiment embeddings for both binary and fine-grained classification on the Stanford Sentiment Treebank (SST).
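## Refinement update

For reference, the update rule below is read directly off `embedding_refine.py` (the paper's own notation may differ slightly). Each lexicon word $i$ starts from its pre-trained vector and is repeatedly replaced by a weighted average of its previous vector and its sentiment-re-ranked nearest neighbors $j$, using reciprocal-rank weights $w_{ij}$:

$$
v_i^{(t+1)} = \frac{\gamma\, v_i^{(t)} + \beta \sum_j w_{ij}\, v_j^{(t)}}{\gamma + \beta \sum_j w_{ij}}
$$

Here $\beta$ controls the pull toward sentimentally similar neighbors and $\gamma$ controls how strongly a vector is anchored to its previous position. Each iteration logs, as a convergence signal, how much the squared Frobenius norm of the update changed relative to the previous iteration.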
## Parameters
|Parameter|Description|
|:---|:---|
|--filename|pre-trained word embeddings file (word2vec text format)|
|--lexicon|lexicon that provides sentiment intensity (valence) ratings|
|--top|number of nearest neighbors (top-k)|
|--iter|number of refinement iterations|
|--beta|parameter beta (weight of the neighbor term)|
|--gamma|parameter gamma (weight of the previous vector)|
|--valence|maximum value of the valence scale|
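## Usage

A typical invocation might look like the following (the paths are illustrative; the commented-out defaults in the scripts point at `vector/glove.twitter.27B.50d.gensim.txt` and `lexicon/eanew_seed.txt`):

```bash
python embedding_refine.py \
    --filename vector/glove.twitter.27B.50d.gensim.txt \
    --lexicon lexicon/eanew_seed.txt \
    --top 10 --iter 100 --beta 0.1 --gamma 0.1 --valence 9.0
```

The embeddings file must be in word2vec text format (it is loaded via `gensim.models.KeyedVectors.load_word2vec_format(..., binary=False)`), and the lexicon is read as headerless tab-separated `word<TAB>valence` rows, for example (illustrative ratings on a 1–9 scale):

```
good	7.5
terrible	2.0
```

The refined vectors are written next to the input file as `<filename>.refine`.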
--------------------------------------------------------------------------------
/embedding_refine.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from __future__ import absolute_import

import os
import sys
import logging
import gensim
import codecs
import argparse

import numpy as np
import pandas as pd


def update_dict(target_dict, key_a, key_b, val):
    # nested dict assignment: target_dict[key_a][key_b] = val
    if key_a in target_dict:
        target_dict[key_a].update({key_b: val})
    else:
        target_dict.update({key_a: {key_b: val}})


def write_vector(file_name, w2v_model, vertex_matrix, anew_dict):
    # relies on anew_dict's stable key order matching the row order of
    # vertex_matrix (guaranteed for dicts in Python 3.7+)
    with codecs.open(file_name, 'w', 'utf8') as my_file:
        # refined vectors for the lexicon words
        for i, word in enumerate(anew_dict.keys()):
            vec = vertex_matrix[i]
            my_file.write('%s %s\n' % (word, ' '.join('%f' % val for val in vec)))

        # original vectors for every other word in the vocabulary
        # (w2v_model.vocab is the gensim 3.x API; gensim >= 4 uses key_to_index)
        for word in w2v_model.vocab:
            if word not in anew_dict:
                vec = w2v_model[word]
                my_file.write('%s %s\n' % (word, ' '.join('%f' % val for val in vec)))


def most_similar(word, w2v_model, anew_dict, weight_dict, top=10):
    sim_array = []
    word_array = []

    # top-k nearest neighbors by cosine similarity in the original space
    similar_words = w2v_model.most_similar(word, topn=top)

    for i, (similar_word, _) in enumerate(similar_words):
        try:
            # sentiment weight between the query word and this neighbor
            sim_array.append([i, weight_dict[word][similar_word]])
        except KeyError:
            # neighbor is not in the lexicon: no sentiment weight
            sim_array.append([i, 0.0])
        word_array.append(similar_word)

    # re-rank the neighbors by sentiment similarity, descending
    sim_array = np.array(sim_array)
    sort_index = sim_array[:, 1].argsort()
    new_array = sim_array[sort_index][::-1]

    # assign reciprocal-rank weights: 1, 1/2, 1/3, ...
    ret_dict = {}
    for i in range(top):
        word = word_array[int(new_array[i][0])]
        ret_dict[word] = 1. / float(i + 1.)

    return ret_dict


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info('running %s' % ' '.join(sys.argv))

    parser = argparse.ArgumentParser()

    parser.add_argument("--filename", "-f", help="pre-trained word embeddings file")
    parser.add_argument("--lexicon", "-l", help="lexicon that provides sentiment intensity")
    parser.add_argument("--top", "-t", type=int, help="top-k nearest neighbors", default=10)
    parser.add_argument("--iter", "-i", type=int, help="refinement iterations", default=100)
    parser.add_argument("--beta", "-b", type=float, help="parameter beta", default=0.1)
    parser.add_argument("--gamma", "-g", type=float, help="parameter gamma", default=0.1)
    parser.add_argument("--valence", "-v", type=float, help="max value of valence", default=9.0)

    args = parser.parse_args()

    valence_max = args.valence
    gamma = args.gamma
    beta = args.beta
    top = args.top
    max_iter = args.iter

    logging.info('loading w2v_model...')
    # w2v_model_file = os.path.join('vector', 'glove.twitter.27B.50d.gensim.txt')
    w2v_model_file = args.filename

    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_file, binary=False)
    embedding_dim = w2v_model.vector_size
    logging.info('w2v_model loaded!')

    # load lexicon: headerless tab-separated "word<TAB>valence" rows
    logging.info('loading lexicon...')
    # anew_file = os.path.join('lexicon', 'eanew_seed.txt')
    anew_file = args.lexicon
    anew = pd.read_table(anew_file, header=None, sep='\t', quoting=3)
    logging.info('lexicon loaded!')

    # keep only lexicon words that are in the embedding vocabulary
    logging.info('preparing data...')
    anew_dict = {}
    vector_dict = {}
    for i in range(len(anew[0])):
        try:
            vector_dict[anew[0][i]] = w2v_model[anew[0][i]]
            anew_dict[anew[0][i]] = anew[1][i]
        except KeyError:
            continue

    # pairwise sentiment similarity: the closer two valence ratings,
    # the larger the weight
    logging.info('building weight_dict')
    weight_dict = {}
    for i in anew_dict.keys():
        for j in anew_dict.keys():
            weight = valence_max - abs(anew_dict[i] - anew_dict[j])
            update_dict(weight_dict, i, j, weight)

    # symmetric weight matrix holding the reciprocal-rank weights of each
    # word's sentiment-re-ranked nearest neighbors
    logging.info('building weight_matrix')
    words = list(anew_dict.keys())
    num_word = len(words)
    weight_matrix = np.zeros((num_word, num_word))
    for i in range(num_word):
        sim_dict = most_similar(words[i], w2v_model, anew_dict, weight_dict, top=top)
        for j in range(num_word):
            if words[j] in sim_dict:
                weight_matrix[i][j] = sim_dict[words[j]]
                weight_matrix[j][i] = sim_dict[words[j]]

    # vertex matrix: one row per lexicon word, initialized with the
    # pre-trained vectors
    logging.info('building vertex_matrix')
    vertex_matrix = np.zeros((num_word, embedding_dim))
    for i in range(num_word):
        vertex_matrix[i] = vector_dict[words[i]]

    logging.info('weight_matrix shape: ' + str(weight_matrix.shape))
    logging.info('vertex_matrix shape: ' + str(vertex_matrix.shape))

    logging.info('starting refinement')
    origin_vertex_matrix = vertex_matrix.copy()  # kept for reference; unused below
    pre_distance = 0.0

    for iteration in range(max_iter):
        pre_vertex_matrix = vertex_matrix.copy()
        for i in range(num_word):
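            # closed-form per-word update (see the README): new vector =
            # (gamma * previous vector + beta * weighted neighbor sum)
            #   / (gamma + beta * total neighbor weight)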
            tmp_vertex = np.zeros((embedding_dim,))
            weight_sum = 0.0
            for j in range(num_word):
                tmp_vertex = tmp_vertex + weight_matrix[i, j] * pre_vertex_matrix[j]
                weight_sum = weight_sum + weight_matrix[i, j]

            numerator = gamma * pre_vertex_matrix[i] + beta * tmp_vertex
            denominator = gamma + beta * weight_sum
            vertex_matrix[i] = numerator / denominator

        # squared Frobenius norm of this iteration's update,
        # i.e. trace((V - V_prev)(V - V_prev)^T)
        distance = vertex_matrix - pre_vertex_matrix
        ec_distance = float(np.sum(distance * distance))

        diff = abs(ec_distance - pre_distance)
        logging.info('cost: %f' % diff)
        pre_distance = ec_distance

    refine_vector_file = w2v_model_file + '.refine'
    write_vector(refine_vector_file, w2v_model, vertex_matrix, anew_dict)
--------------------------------------------------------------------------------
/glove_twitter_refine.py:
--------------------------------------------------------------------------------
from __future__ import print_function
from __future__ import absolute_import

import os
import sys
import logging
import gensim
import codecs
import argparse

import numpy as np
import pandas as pd


def update_dict(target_dict, key_a, key_b, val):
    # nested dict assignment: target_dict[key_a][key_b] = val
    if key_a in target_dict:
        target_dict[key_a].update({key_b: val})
    else:
        target_dict.update({key_a: {key_b: val}})


def write_vector(file_name, w2v_model, vertex_matrix, anew_dict):
    # relies on anew_dict's stable key order matching the row order of
    # vertex_matrix (guaranteed for dicts in Python 3.7+)
    with codecs.open(file_name, 'w', 'utf8') as my_file:
        # refined vectors for the lexicon words
        for i, word in enumerate(anew_dict.keys()):
            vec = vertex_matrix[i]
            my_file.write('%s %s\n' % (word, ' '.join('%f' % val for val in vec)))

        # original vectors for every other word in the vocabulary
        # (w2v_model.vocab is the gensim 3.x API; gensim >= 4 uses key_to_index)
        for word in w2v_model.vocab:
            if word not in anew_dict:
                vec = w2v_model[word]
                my_file.write('%s %s\n' % (word, ' '.join('%f' % val for val in vec)))


def most_similar(word, w2v_model, anew_dict, weight_dict, top=10):
    sim_array = []
    word_array = []

    # top-k nearest neighbors by cosine similarity in the original space
    similar_words = w2v_model.most_similar(word, topn=top)

    for i, (similar_word, _) in enumerate(similar_words):
        try:
            # sentiment weight between the query word and this neighbor
            sim_array.append([i, weight_dict[word][similar_word]])
        except KeyError:
            # neighbor is not in the lexicon: no sentiment weight
            sim_array.append([i, 0.0])
        word_array.append(similar_word)

    # re-rank the neighbors by sentiment similarity, descending
    sim_array = np.array(sim_array)
    sort_index = sim_array[:, 1].argsort()
    new_array = sim_array[sort_index][::-1]

    # assign reciprocal-rank weights: 1, 1/2, 1/3, ...
    ret_dict = {}
    for i in range(top):
        word = word_array[int(new_array[i][0])]
        ret_dict[word] = 1. / float(i + 1.)

    return ret_dict


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info('running %s' % ' '.join(sys.argv))

    parser = argparse.ArgumentParser()

    parser.add_argument("--filename", "-f", help="pre-trained word embeddings file")
    parser.add_argument("--lexicon", "-l", help="lexicon that provides sentiment intensity")
    parser.add_argument("--top", "-t", type=int, help="top-k nearest neighbors", default=10)
    parser.add_argument("--iter", "-i", type=int, help="refinement iterations", default=100)
    parser.add_argument("--beta", "-b", type=float, help="parameter beta", default=0.1)
    parser.add_argument("--gamma", "-g", type=float, help="parameter gamma", default=0.1)
    parser.add_argument("--valence", "-v", type=float, help="max value of valence", default=9.0)

    args = parser.parse_args()

    valence_max = args.valence
    gamma = args.gamma
    beta = args.beta
    top = args.top
    max_iter = args.iter

    logging.info('loading w2v_model...')
    # w2v_model_file = os.path.join('vector', 'glove.twitter.27B.50d.gensim.txt')
    w2v_model_file = args.filename

    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_file, binary=False)
    embedding_dim = w2v_model.vector_size
    logging.info('w2v_model loaded!')

    # load lexicon: headerless tab-separated "word<TAB>valence" rows
    logging.info('loading lexicon...')
    # anew_file = os.path.join('lexicon', 'eanew_seed.txt')
    anew_file = args.lexicon
    anew = pd.read_table(anew_file, header=None, sep='\t', quoting=3)
    logging.info('lexicon loaded!')

    # keep only lexicon words that are in the embedding vocabulary
    logging.info('preparing data...')
    anew_dict = {}
    vector_dict = {}
    for i in range(len(anew[0])):
        try:
            vector_dict[anew[0][i]] = w2v_model[anew[0][i]]
            anew_dict[anew[0][i]] = anew[1][i]
        except KeyError:
            continue

    # pairwise sentiment similarity: the closer two valence ratings,
    # the larger the weight
    logging.info('building weight_dict')
    weight_dict = {}
    for i in anew_dict.keys():
        for j in anew_dict.keys():
            weight = valence_max - abs(anew_dict[i] - anew_dict[j])
            update_dict(weight_dict, i, j, weight)

    # symmetric weight matrix holding the reciprocal-rank weights of each
    # word's sentiment-re-ranked nearest neighbors
    logging.info('building weight_matrix')
    words = list(anew_dict.keys())
    num_word = len(words)
    weight_matrix = np.zeros((num_word, num_word))
    for i in range(num_word):
        sim_dict = most_similar(words[i], w2v_model, anew_dict, weight_dict, top=top)
        for j in range(num_word):
            if words[j] in sim_dict:
                weight_matrix[i][j] = sim_dict[words[j]]
                weight_matrix[j][i] = sim_dict[words[j]]

    # vertex matrix: one row per lexicon word, initialized with the
    # pre-trained vectors
    logging.info('building vertex_matrix')
    vertex_matrix = np.zeros((num_word, embedding_dim))
    for i in range(num_word):
        vertex_matrix[i] = vector_dict[words[i]]

    logging.info('weight_matrix shape: ' + str(weight_matrix.shape))
    logging.info('vertex_matrix shape: ' + str(vertex_matrix.shape))

    logging.info('starting refinement')
    origin_vertex_matrix = vertex_matrix.copy()  # kept for reference; unused below
    pre_distance = 0.0

    for iteration in range(max_iter):
        pre_vertex_matrix = vertex_matrix.copy()
        for i in range(num_word):
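            # closed-form per-word update, identical to embedding_refine.py:
            # (gamma * previous vector + beta * weighted neighbor sum)
            #   / (gamma + beta * total neighbor weight)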
            tmp_vertex = np.zeros((embedding_dim,))
            weight_sum = 0.0
            for j in range(num_word):
                tmp_vertex = tmp_vertex + weight_matrix[i, j] * pre_vertex_matrix[j]
                weight_sum = weight_sum + weight_matrix[i, j]

            numerator = gamma * pre_vertex_matrix[i] + beta * tmp_vertex
            denominator = gamma + beta * weight_sum
            vertex_matrix[i] = numerator / denominator

        # squared Frobenius norm of this iteration's update,
        # i.e. trace((V - V_prev)(V - V_prev)^T)
        distance = vertex_matrix - pre_vertex_matrix
        ec_distance = float(np.sum(distance * distance))

        diff = abs(ec_distance - pre_distance)
        logging.info('cost: %f' % diff)
        pre_distance = ec_distance

    refine_vector_file = w2v_model_file + '.refine'
    write_vector(refine_vector_file, w2v_model, vertex_matrix, anew_dict)
--------------------------------------------------------------------------------