├── README.md
├── s2v-python2.py
└── s2v-python3.py

/README.md:
--------------------------------------------------------------------------------
# sentence2vec
A Python implementation of the paper "A Simple but Tough-to-Beat Baseline for Sentence Embeddings".

Word frequencies are estimated from roughly 13 GB of English Wikipedia text.
For the word vectors, word2vec, GloVe, and PSL vectors are used as the pre-trained embedding models.
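
The core of the method is small. Below is a minimal, self-contained sketch of the two steps the scripts implement: the smooth inverse frequency (SIF) weighted average and the removal of the first principal component. The toy word vectors, the unigram probabilities, and the function name `sif_embeddings` are made up for illustration only; the actual scripts load the pre-trained models and the Wikipedia-based frequency table instead.

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD

a = 1e-3  # the weighting parameter from the paper

# Toy word vectors and unigram probabilities p(w) -- purely illustrative.
vectors = {'cat': np.array([1.0, 0.0, 0.2]),
           'dog': np.array([0.9, 0.1, 0.3]),
           'sat': np.array([0.0, 1.0, 0.5]),
           'ran': np.array([0.1, 0.9, 0.4])}
probs = {'cat': 0.01, 'dog': 0.01, 'sat': 0.02, 'ran': 0.02}


def sif_embeddings(sentences):
    # Step 1: v_s = (1 / |s|) * sum_{w in s} a / (a + p(w)) * v_w
    embs = []
    for words in sentences:
        weighted = [a / (a + probs.get(w, 0.0)) * vectors.get(w, np.zeros(3))
                    for w in words]
        embs.append(np.sum(weighted, axis=0) / len(words))
    X = np.array(embs)
    # Step 2: subtract the projection onto the first singular vector,
    # mirroring compute_pc / remove_pc in s2v-python2.py.
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    u = svd.components_  # shape (1, dim)
    return X - X.dot(u.T) * u


print(sif_embeddings([['cat', 'sat'], ['dog', 'ran'], ['cat', 'ran']]))
```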
--------------------------------------------------------------------------------

/s2v-python2.py:
--------------------------------------------------------------------------------
# -*- encoding:utf8 -*-
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from gensim.models import KeyedVectors
from sklearn.decomposition import TruncatedSVD

single_data_path = './data/images2014'
vocab_path = './others/enwiki_vocab_min200.txt'
embed_domins = 300
rmpc = True

# whether to run in test mode with the small toy vectors defined below
isTest = False


# build the word -> SIF weight table a / (a + p(w)) from a word-count file
def getWordsFrequency(weightfile, a=1e-3):
    if a <= 0:  # when the parameter makes no sense, use unweighted
        a = 1.0
    # word weight dictionary: key is the word, value is its weight
    word2weight = {}
    # N is the total token count
    N = 0.0
    with open(weightfile, 'r') as f:
        for line in f:
            line = line.strip()
            if len(line) > 0:
                line = line.split()
                if len(line) == 2:
                    word2weight[line[0]] = float(line[1])
                    N += float(line[1])
                else:
                    print line

    for key, value in word2weight.iteritems():
        word2weight[key] = a / (a + value / N)
    return word2weight


# return the matrix of sentence embeddings for a list of input sentences
def getSenteceEmbedding(list_sentences, words_dict, weights_dict, istest):
    embeds = []
    for s in list_sentences:
        semb = []
        s = s.strip()
        words = s.split()
        for word in words:
            if istest:
                if word in words_dict:
                    embed = words_dict[word]
                else:
                    embed = np.zeros(embed_domins, dtype=float)
            else:
                if word in words_dict.vocab:
                    embed = words_dict[word]
                else:
                    embed = np.zeros(embed_domins, dtype=float)
            if word in weights_dict:
                w = weights_dict[word]
            else:
                w = 1.0
            semb.append(embed * w)
        emb = np.sum(semb, axis=0) / len(semb)
        embeds.append(emb)
    return np.array(embeds, dtype=float)


def getCosineSimilarities(emb1, emb2):
    return cosine_similarity(emb1, emb2)


# estimate the first npc principal components (top singular vectors) of X
def compute_pc(X, npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_


# remove the projection onto those components from every row of X
def remove_pc(X, npc=1):
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


if isTest:
    words_dict = {'I': np.array([1., 1., 1., 1.], dtype=float),
                  'am': np.array([2., 2., 2., 2.], dtype=float),
                  'You': np.array([0.9, 0.9, 0.9, 0.9], dtype=float),
                  'Today': np.array([10.0, 9.0, 8.0, 7.0], dtype=float),
                  'boy': np.array([4.5, 3., 5., 6.], dtype=float),
                  'girl': np.array([4.4, 3.1, 5.2, 6.1], dtype=float)
                  }
else:
    print 'Loading word vectors...'
    # word2vec embedding model
    words_dict = KeyedVectors.load_word2vec_format('./others/GoogleNews-vectors-negative300.bin.gz', binary=True)

print 'Word vectors loaded!'

if isTest:
    embed_domins = 4
    p1 = ['I am a boy . ', 'You are a girl . ', 'I like playing basketball . ']
    p2 = ['Today is a nice day .', 'Something will happen today . ', 'Do you love me ? ']
    scores = [0.5, 0.4, 0.3]
    weights_dict = {'am': 0.5}
else:
    p1 = []
    p2 = []
    scores = []
    with open(single_data_path, 'r') as f:
        for line in f:
            lines = line.split('\t')
            p1.append(lines[0])
            p2.append(lines[1])
            scores.append(float(lines[2]))
    weights_dict = getWordsFrequency(vocab_path)
sentenceEmbed1 = getSenteceEmbedding(p1, words_dict, weights_dict, isTest)
sentenceEmbed2 = getSenteceEmbedding(p2, words_dict, weights_dict, isTest)
print 'Type of s1: ', type(sentenceEmbed1)
# print sentenceEmbed1
print 'Type of s2: ', type(sentenceEmbed2)
# print sentenceEmbed2
if rmpc:
    sentenceEmbed1 = remove_pc(sentenceEmbed1, npc=1)
    sentenceEmbed2 = remove_pc(sentenceEmbed2, npc=1)
    print 'Removed the first principal component...'
# print sentenceEmbed1
# print sentenceEmbed2
sims = []
for i in range(len(sentenceEmbed1)):
    sims.append(cosine_similarity([sentenceEmbed1[i]], [sentenceEmbed2[i]])[0][0])
# print sims
r, p = pearsonr(sims, scores)
# r1, p1 = pearsonr(scores, sims)
print r
--------------------------------------------------------------------------------

/s2v-python3.py:
--------------------------------------------------------------------------------
# -*- coding:utf8 -*-
from gensim.models import KeyedVectors
import pickle as pkl
import numpy as np
from typing import List
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import os
import PSLvec as psl
from nltk.tokenize import StanfordTokenizer

word2vec_path = './GoogleNews-vectors-negative300.bin.gz'
glove_path = './glove_model.txt'
psl_path = './PSL_model.txt'
# traindata = './datasets/sts2013.OnWN.pkl'
freq_table = './mydictionary'
embedding_size = 300

pslemb = psl.PSL()

# load the pre-trained embedding model (word2vec, GloVe, or PSL)
# model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# model = KeyedVectors.load_word2vec_format(glove_path, binary=False)
# model = KeyedVectors.load_word2vec_format(psl_path, binary=False)
model = pslemb.w
print('Finished loading the embedding model.')

tokenizer = StanfordTokenizer(path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar")


# print(type(model))
# print(model['sdfsfsdfsadfs'])

class Word:
    def __init__(self, text, vector):
        self.text = text
        self.vector = vector


class Sentence:
    def __init__(self, word_list):
        self.word_list = word_list

    def len(self) -> int:
        return len(self.word_list)


def get_word_frequency(word_text, looktable):
    if word_text in looktable:
        return looktable[word_text]
    else:
        return 1.0


def sentence_to_vec(sentence_list: List[Sentence], embedding_size, looktable, a=1e-3):
    sentence_set = []
    for sentence in sentence_list:
        vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
        sentence_length = sentence.len()
        for word in sentence.word_list:
            a_value = a / (a + get_word_frequency(word.text, looktable))  # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, word.vector))  # vs += sif * word_vector

        vs = np.divide(vs, sentence_length)  # weighted average
        sentence_set.append(vs)  # add to our existing re-calculated set of sentences

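    # At this point each entry of sentence_set holds the SIF-weighted average
    #     v_s = (1 / |s|) * sum_{w in s} a / (a + p(w)) * v_w
    # where p(w) is the value stored in `looktable` for the word (1.0 if missing)
    # and a defaults to 1e-3. The block below estimates the first principal
    # component of the whole sentence set and removes it from every sentence
    # vector (the common-component removal step of the SIF baseline).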
    # calculate PCA of this sentence set
    pca = PCA(n_components=embedding_size)
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector? (occurs if we have less sentences than embedding_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs - u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u, vs)
        sentence_vecs.append(np.subtract(vs, sub))

    return sentence_vecs


with open(freq_table, 'rb') as f:
    mydict = pkl.load(f)
print('Finished loading the word-frequency dictionary.')

paths = ['./datasets/data']
for path in paths:
    files = []
    for file in os.listdir(path=path):
        if os.path.isfile(path + '/' + file):
            files.append(path + '/' + file)

    for traindata in files:
        with open(traindata, 'rb') as f:
            train = pkl.load(f)

        print('Finished reading ' + traindata)

        gs = []
        pred = []
        allsent = []
        for each in train:
            # sent1, sent2, label = each.split('\t')
            if len(train[0]) == 3:
                sent1, sent2, label = each
            else:
                sent1, sent2, label, _ = each
            gs.append(float(label))
            s1 = []
            s2 = []
            # sw1 = sent1.split()
            # sw2 = sent2.split()
            for word in sent1:
                try:
                    vec = model[word]
                except KeyError:
                    vec = np.zeros(embedding_size)
                s1.append(Word(word, vec))
            for word in sent2:
                try:
                    vec = model[word]
                except KeyError:
                    vec = np.zeros(embedding_size)
                s2.append(Word(word, vec))

            ss1 = Sentence(s1)
            ss2 = Sentence(s2)
            allsent.append(ss1)
            allsent.append(ss2)

        sentence_vectors = sentence_to_vec(allsent, embedding_size, looktable=mydict)
        len_sentences = len(sentence_vectors)
        for i in range(len_sentences):
            if i % 2 == 0:
                sim = cosine_similarity([sentence_vectors[i]], [sentence_vectors[i + 1]])
                pred.append(sim[0][0])

        print('len of pred: ', len(pred))
        print('len of gs: ', len(gs))

        r, p = pearsonr(pred, gs)
        print(traindata + ' Pearson correlation:', r)


# sentence_vectors = sentence_to_vec([ss1, ss2], embedding_size, looktable=mydict)
# sim = cosine_similarity([sentence_vectors[0]], [sentence_vectors[1]])
# pred.append(sim[0][0])

# r, p = pearsonr(pred, gs)
# print(traindata + ' Pearson correlation:', r)
# print(sentence_vectors[0])
# print(sentence_vectors[1])
--------------------------------------------------------------------------------