├── README.md
├── s2v-python2.py
└── s2v-python3.py

/README.md:
--------------------------------------------------------------------------------
# sentence2vec
A Python implementation of the paper "A Simple but Tough-to-Beat Baseline for Sentence Embeddings".

Word frequencies are estimated from roughly 13 GB of English Wikipedia text.
For the word vectors, word2vec, GloVe, and PSL vectors are used as the pre-trained embedding models.
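
The core of the method is small. Below is a minimal, self-contained sketch of the two steps the scripts implement: the smooth inverse frequency (SIF) weighted average and the removal of the first principal component. The toy word vectors, the unigram probabilities, and the function name `sif_embeddings` are made up for illustration only; the actual scripts load the pre-trained models and the Wikipedia-based frequency table instead.

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD

a = 1e-3  # the weighting parameter from the paper

# Toy word vectors and unigram probabilities p(w) -- purely illustrative.
vectors = {'cat': np.array([1.0, 0.0, 0.2]),
           'dog': np.array([0.9, 0.1, 0.3]),
           'sat': np.array([0.0, 1.0, 0.5]),
           'ran': np.array([0.1, 0.9, 0.4])}
probs = {'cat': 0.01, 'dog': 0.01, 'sat': 0.02, 'ran': 0.02}


def sif_embeddings(sentences):
    # Step 1: v_s = (1 / |s|) * sum_{w in s} a / (a + p(w)) * v_w
    embs = []
    for words in sentences:
        weighted = [a / (a + probs.get(w, 0.0)) * vectors.get(w, np.zeros(3))
                    for w in words]
        embs.append(np.sum(weighted, axis=0) / len(words))
    X = np.array(embs)
    # Step 2: subtract the projection onto the first singular vector,
    # mirroring compute_pc / remove_pc in s2v-python2.py.
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    u = svd.components_  # shape (1, dim)
    return X - X.dot(u.T) * u


print(sif_embeddings([['cat', 'sat'], ['dog', 'ran'], ['cat', 'ran']]))
```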
--------------------------------------------------------------------------------

/s2v-python2.py:
--------------------------------------------------------------------------------
# -*- encoding:utf8 -*-
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from gensim.models import KeyedVectors
from sklearn.decomposition import TruncatedSVD

single_data_path = './data/images2014'
vocab_path = './others/enwiki_vocab_min200.txt'
embed_domins = 300
rmpc = True

# whether to run in test mode with the small toy vectors defined below
isTest = False


# build the word -> SIF weight table a / (a + p(w)) from a word-count file
def getWordsFrequency(weightfile, a=1e-3):
    if a <= 0:  # when the parameter makes no sense, use unweighted
        a = 1.0
    # word weight dictionary: key is the word, value is its weight
    word2weight = {}
    # N is the total token count
    N = 0.0
    with open(weightfile, 'r') as f:
        for line in f:
            line = line.strip()
            if len(line) > 0:
                line = line.split()
                if len(line) == 2:
                    word2weight[line[0]] = float(line[1])
                    N += float(line[1])
                else:
                    print line

    for key, value in word2weight.iteritems():
        word2weight[key] = a / (a + value / N)
    return word2weight


# return the matrix of sentence embeddings for a list of input sentences
def getSenteceEmbedding(list_sentences, words_dict, weights_dict, istest):
    embeds = []
    for s in list_sentences:
        semb = []
        s = s.strip()
        words = s.split()
        for word in words:
            if istest:
                if word in words_dict:
                    embed = words_dict[word]
                else:
                    embed = np.zeros(embed_domins, dtype=float)
            else:
                if word in words_dict.vocab:
                    embed = words_dict[word]
                else:
                    embed = np.zeros(embed_domins, dtype=float)
            if word in weights_dict:
                w = weights_dict[word]
            else:
                w = 1.0
            semb.append(embed * w)
        emb = np.sum(semb, axis=0) / len(semb)
        embeds.append(emb)
    return np.array(embeds, dtype=float)


def getCosineSimilarities(emb1, emb2):
    return cosine_similarity(emb1, emb2)


# estimate the first npc principal components (top singular vectors) of X
def compute_pc(X, npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_


# remove the projection onto those components from every row of X
def remove_pc(X, npc=1):
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


if isTest:
    words_dict = {'I': np.array([1., 1., 1., 1.], dtype=float),
                  'am': np.array([2., 2., 2., 2.], dtype=float),
                  'You': np.array([0.9, 0.9, 0.9, 0.9], dtype=float),
                  'Today': np.array([10.0, 9.0, 8.0, 7.0], dtype=float),
                  'boy': np.array([4.5, 3., 5., 6.], dtype=float),
                  'girl': np.array([4.4, 3.1, 5.2, 6.1], dtype=float)
                  }
else:
    print 'Loading word vectors...'
    # word2vec embedding model
    words_dict = KeyedVectors.load_word2vec_format('./others/GoogleNews-vectors-negative300.bin.gz', binary=True)

print 'Word vectors loaded!'

if isTest:
    embed_domins = 4
    p1 = ['I am a boy . ', 'You are a girl . ', 'I like playing basketball . ']
    p2 = ['Today is a nice day .', 'Something will happen today . ', 'Do you love me ? ']
    scores = [0.5, 0.4, 0.3]
    weights_dict = {'am': 0.5}
else:
    p1 = []
    p2 = []
    scores = []
    with open(single_data_path, 'r') as f:
        for line in f:
            lines = line.split('\t')
            p1.append(lines[0])
            p2.append(lines[1])
            scores.append(float(lines[2]))
    weights_dict = getWordsFrequency(vocab_path)
sentenceEmbed1 = getSenteceEmbedding(p1, words_dict, weights_dict, isTest)
sentenceEmbed2 = getSenteceEmbedding(p2, words_dict, weights_dict, isTest)
print 'Type of s1: ', type(sentenceEmbed1)
# print sentenceEmbed1
print 'Type of s2: ', type(sentenceEmbed2)
# print sentenceEmbed2
if rmpc:
    sentenceEmbed1 = remove_pc(sentenceEmbed1, npc=1)
    sentenceEmbed2 = remove_pc(sentenceEmbed2, npc=1)
    print 'Removed the first principal component...'
# print sentenceEmbed1
# print sentenceEmbed2
sims = []
for i in range(len(sentenceEmbed1)):
    sims.append(cosine_similarity([sentenceEmbed1[i]], [sentenceEmbed2[i]])[0][0])
# print sims
r, p = pearsonr(sims, scores)
# r1, p1 = pearsonr(scores, sims)
print r
--------------------------------------------------------------------------------

/s2v-python3.py:
--------------------------------------------------------------------------------
# -*- coding:utf8 -*-
from gensim.models import KeyedVectors
import pickle as pkl
import numpy as np
from typing import List
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import os
import PSLvec as psl
from nltk.tokenize import StanfordTokenizer

word2vec_path = './GoogleNews-vectors-negative300.bin.gz'
glove_path = './glove_model.txt'
psl_path = './PSL_model.txt'
# traindata = './datasets/sts2013.OnWN.pkl'
freq_table = './mydictionary'
embedding_size = 300

pslemb = psl.PSL()

# load the pre-trained embedding model (word2vec, GloVe, or PSL)
# model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# model = KeyedVectors.load_word2vec_format(glove_path, binary=False)
# model = KeyedVectors.load_word2vec_format(psl_path, binary=False)
model = pslemb.w
print('Finished loading the embedding model.')

tokenizer = StanfordTokenizer(path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar")


# print(type(model))
# print(model['sdfsfsdfsadfs'])

class Word:
    def __init__(self, text, vector):
        self.text = text
        self.vector = vector


class Sentence:
    def __init__(self, word_list):
        self.word_list = word_list

    def len(self) -> int:
        return len(self.word_list)


def get_word_frequency(word_text, looktable):
    if word_text in looktable:
        return looktable[word_text]
    else:
        return 1.0


def sentence_to_vec(sentence_list: List[Sentence], embedding_size, looktable, a=1e-3):
    sentence_set = []
    for sentence in sentence_list:
        vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
        sentence_length = sentence.len()
        for word in sentence.word_list:
            a_value = a / (a + get_word_frequency(word.text, looktable))  # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, word.vector))  # vs += sif * word_vector

        vs = np.divide(vs, sentence_length)  # weighted average
        sentence_set.append(vs)  # add to our existing re-calculated set of sentences

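    # At this point each entry of sentence_set holds the SIF-weighted average
    #     v_s = (1 / |s|) * sum_{w in s} a / (a + p(w)) * v_w
    # where p(w) is the value stored in `looktable` for the word (1.0 if missing)
    # and a defaults to 1e-3. The block below estimates the first principal
    # component of the whole sentence set and removes it from every sentence
    # vector (the common-component removal step of the SIF baseline).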
    # calculate PCA of this sentence set
    pca = PCA(n_components=embedding_size)
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector? (occurs if we have less sentences than embedding_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs - u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u, vs)
        sentence_vecs.append(np.subtract(vs, sub))

    return sentence_vecs


with open(freq_table, 'rb') as f:
    mydict = pkl.load(f)
print('Finished loading the word-frequency dictionary.')

paths = ['./datasets/data']
for path in paths:
    files = []
    for file in os.listdir(path=path):
        if os.path.isfile(path + '/' + file):
            files.append(path + '/' + file)

    for traindata in files:
        with open(traindata, 'rb') as f:
            train = pkl.load(f)

        print('Finished reading ' + traindata)

        gs = []
        pred = []
        allsent = []
        for each in train:
            # sent1, sent2, label = each.split('\t')
            if len(train[0]) == 3:
                sent1, sent2, label = each
            else:
                sent1, sent2, label, _ = each
            gs.append(float(label))
            s1 = []
            s2 = []
            # sw1 = sent1.split()
            # sw2 = sent2.split()
            for word in sent1:
                try:
                    vec = model[word]
                except KeyError:
                    vec = np.zeros(embedding_size)
                s1.append(Word(word, vec))
            for word in sent2:
                try:
                    vec = model[word]
                except KeyError:
                    vec = np.zeros(embedding_size)
                s2.append(Word(word, vec))

            ss1 = Sentence(s1)
            ss2 = Sentence(s2)
            allsent.append(ss1)
            allsent.append(ss2)

        sentence_vectors = sentence_to_vec(allsent, embedding_size, looktable=mydict)
        len_sentences = len(sentence_vectors)
        for i in range(len_sentences):
            if i % 2 == 0:
                sim = cosine_similarity([sentence_vectors[i]], [sentence_vectors[i + 1]])
                pred.append(sim[0][0])

        print('len of pred: ', len(pred))
        print('len of gs: ', len(gs))

        r, p = pearsonr(pred, gs)
        print(traindata + ' Pearson correlation:', r)


# sentence_vectors = sentence_to_vec([ss1, ss2], embedding_size, looktable=mydict)
# sim = cosine_similarity([sentence_vectors[0]], [sentence_vectors[1]])
# pred.append(sim[0][0])

# r, p = pearsonr(pred, gs)
# print(traindata + ' Pearson correlation:', r)
# print(sentence_vectors[0])
# print(sentence_vectors[1])
--------------------------------------------------------------------------------