├── figure2.png
├── figure3.png
├── s3e.py
├── README.md
└── utils.py

/figure2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BinWang28/Sentence-Embedding-S3E/HEAD/figure2.png
--------------------------------------------------------------------------------
/figure3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BinWang28/Sentence-Embedding-S3E/HEAD/figure3.png
--------------------------------------------------------------------------------
/s3e.py:
--------------------------------------------------------------------------------
import numpy as np
import argparse

from utils import load_file, create_dictionary, load_wordvec, load_word_weight
from utils import semantic_construction, compute_embedding


PATH_TO_VEC = [#'./word_embedding/glove.840B.300d.txt',                  # GloVe vectors
               './word_embedding/crawl-300d-2M.vec',                     # fastText vectors
               './word_embedding/lexvec.commoncrawl.300d.W.pos.vectors', # LexVec vectors
               './word_embedding/paragram_300_sl999.txt',                # PSL vectors
               ]
PATH_TO_WORD_WEIGHTS = './word_embedding/enwiki_vocab_min200.txt'        # word-frequency file used for weighting
PATH_TO_SENTENCE = './custrev.pos'

if __name__ == "__main__":

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Settings
    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_num", default=10, type=int,
                        help="number of semantic groups to construct")
    parser.add_argument("--postprocessing", default=1, type=int,
                        help="number of principal components to remove from the sentence embeddings (0 disables it)")
    args = parser.parse_args()

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Load the sentence file
    sentences = load_file(PATH_TO_SENTENCE)

    # Build the dictionary
    args.id2word, args.word2id = create_dictionary(sentences)

    # Load word vectors
    args.word_vec_np = load_wordvec(PATH_TO_VEC, args.word2id)
    args.wvec_dim = args.word_vec_np.shape[1]

    # Load word weights
    args.word_weight = load_word_weight(PATH_TO_WORD_WEIGHTS, args.word2id, a=1e-3)

    # Construct semantic groups
    semantic_construction(args)

    # Generate the sentence embeddings
    sentence_emb = compute_embedding(args, sentences)

    # Example: cosine similarity between two sentences chosen by index
    index1 = int(input("\nThe index for the first sentence: "))
    print("The first sentence is:", " ".join(sentences[index1]))
    index2 = int(input("The index for the second sentence: "))
    print("The second sentence is:", " ".join(sentences[index2]))

    similarity = (sentence_emb[index1].dot(sentence_emb[index2])
                  / np.linalg.norm(sentence_emb[index1])
                  / np.linalg.norm(sentence_emb[index2]))
    print("The similarity between them is:", similarity)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# S3E: Efficient Sentence Embedding via Semantic Subspace Analysis

![figure2](figure2.png)

S3E generates sentence embeddings from static word embeddings. The method consists of three steps: semantic group construction, an intra-group descriptor, and an inter-group descriptor. We provide the source code for generating your own sentence embeddings. More details can be found in the [S3E paper](https://arxiv.org/abs/2002.09620).


| Section | Description |
|-|-|
| [Installation](#installation) | How to set up the environment |
| [Quick Usage Guide](#quick-usage-guide) | A quick guide |
| [Citation](#citation) | Reference link |


## Installation
We use Python 3.7.

**Create a new environment**
```
conda create -n S3E python=3.7
conda activate S3E
```

**Install the dependencies**

```
conda install numpy
conda install -c anaconda scikit-learn
```

**Download the pretrained word embeddings**

The files can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1aGhxhXtW9dGYbu85e3Z77L-RDTJe5kAr?usp=sharing).

Place the word embedding files in a folder named `word_embedding` so that the paths at the top of `s3e.py` (`PATH_TO_VEC` and `PATH_TO_WORD_WEIGHTS`) resolve correctly.


## Quick Usage Guide

To run the example code, simply run:
```
python s3e.py --cluster_num 10
```

The script does the following:
- Loads the word embeddings and the sentence text file.
- The sentence file is `custrev.pos`, with one sentence per line.
- Runs the S3E method and generates the sentence embeddings.
- Lets you test the similarity of two sentences by entering their indices in the text file.
- Example (a programmatic sketch follows the figure below):

![figure3](figure3.png)
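
If you want to embed your own corpus instead of the bundled `custrev.pos`, the helper functions in `utils.py` can be called directly. The sketch below is not part of the repository; it simply mirrors `s3e.py`, assumes the pretrained embeddings from the Installation step sit in `word_embedding/`, and uses a hypothetical file `my_sentences.txt` with one sentence per line.

```
from types import SimpleNamespace
import numpy as np

from utils import (load_file, create_dictionary, load_wordvec,
                   load_word_weight, semantic_construction, compute_embedding)

# The helpers read their settings from attributes of a single args object.
args = SimpleNamespace(cluster_num=10, postprocessing=1)

sentences = load_file('my_sentences.txt')   # hypothetical input file, one sentence per line
args.id2word, args.word2id = create_dictionary(sentences)
args.word_vec_np = load_wordvec(['./word_embedding/crawl-300d-2M.vec'], args.word2id)
args.wvec_dim = args.word_vec_np.shape[1]
args.word_weight = load_word_weight('./word_embedding/enwiki_vocab_min200.txt',
                                    args.word2id, a=1e-3)

semantic_construction(args)                 # weighted k-means over the vocabulary
emb = compute_embedding(args, sentences)    # one row per sentence

# Cosine similarity between the first two sentences
sim = emb[0].dot(emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
print(sim)
```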

## Citation

If you find our model useful in your research, please consider citing our paper, [Efficient Sentence Embedding via Semantic Subspace Analysis](https://arxiv.org/abs/2002.09620):

```
@article{S3E,
  title   = {Efficient Sentence Embedding via Semantic Subspace Analysis},
  author  = {Wang, Bin and Chen, Fenxiao and Wang, Yuncheng and Kuo, C-C Jay},
  journal = {arXiv preprint arXiv:2002.09620},
  year    = {2020}
}
```

Contact person: Bin Wang, bwang28c@gmail.com

http://mcl.usc.edu/

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import io
import logging
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load the sentence file (one sentence per line)
def load_file(path):

    with open(path, 'r') as f:
        lines = f.readlines()
    sentences = [line.split(' ')[:-1] for line in lines]
    sentences = sentences[:-1]

    return sentences

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create the dictionary (word <-> id mappings)
def create_dictionary(sentences, threshold=0):
    words = {}
    for s in sentences:
        for word in s:
            words[word] = words.get(word, 0) + 1

    if threshold > 0:
        newwords = {}
        for word in words:
            if words[word] >= threshold:
                newwords[word] = words[word]
        words = newwords
    # Special tokens are ranked highest
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2

    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # sort by decreasing frequency
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Get word vectors for the vocabulary and store them in a numpy array
def load_wordvec(path_to_vec, word2id):
    N = len(word2id)
    dim = len(path_to_vec) * 300
    word_vec_np = np.zeros((N, dim))

    # Known words
    counts = []
    for i in range(len(path_to_vec)):
        count = 0
        with io.open(path_to_vec[i], 'r', encoding='utf-8') as f:
            # for word2vec / fastText format files, skip the header line with "next(f)"
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word2id:
                    count = count + 1
                    word_vec_np[word2id[word], i*300:(i+1)*300] = np.fromstring(vec, sep=' ')
        counts.append(count)

        print(path_to_vec[i])
        logging.info('Found {0} words with word vectors, out of {1} words'.format(count, len(word2id)))
        mean_vec = word_vec_np[:, i*300:(i+1)*300].sum(0) / count
        for j in range(N):
            if word_vec_np[j, i*300] == 0:
                word_vec_np[j, i*300:(i+1)*300] = mean_vec

        print('Unknowns are represented by the mean vector')

    # Pre-process the word embeddings (PVN): https://arxiv.org/pdf/1808.06305.pdf
    print('Pre-processing word embeddings following https://arxiv.org/pdf/1808.06305.pdf')
    word_vec_np = word_vec_np - np.mean(word_vec_np, 0)
    pca = PCA(n_components=300)
    pca.fit(word_vec_np)

    U1 = pca.components_
    explained_variance = pca.explained_variance_

    # Remove projections onto the top principal components
    PVN_dims = 10
    z = []
    for i, x in enumerate(word_vec_np):
        for j, u in enumerate(U1[0:PVN_dims]):
            ratio = (explained_variance[j] - explained_variance[PVN_dims]) / explained_variance[j]
            x = x - ratio * np.dot(u.transpose(), x) * u
        z.append(x)
    word_vec_np = np.asarray(z)

    return word_vec_np


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Get word weights based on frequency.
def load_word_weight(weightfile, word2id, a=1e-3):

    print('get_word_weights')
    if a <= 0:  # when the parameter makes no sense, use uniform weights
        a = 1.0

    word2weight = {}
    with open(weightfile) as f:
        lines = f.readlines()
    N = 0
    for i in lines:
        i = i.strip()
        if len(i) > 0:
            i = i.split()
            if len(i) == 2:
                word2weight[i[0]] = float(i[1])
                N += float(i[1])
            else:
                print(i)
    for key, value in word2weight.items():
        word2weight[key] = a / (a + value / N)

    # Update for the current vocabulary
    weight4ind = {}
    for word, ind in word2id.items():
        if word in word2weight:
            weight4ind[ind] = word2weight[word]
        else:
            weight4ind[ind] = 1.0  # weight given to unknown words

    return weight4ind


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Construct semantic groups with weighted k-means
def semantic_construction(args):

    weight_list = list(args.word_weight.values())
    weight_list = np.array(weight_list)
    print('perform weighted k-means')
    kmeans = KMeans(n_clusters=args.cluster_num).fit(args.word_vec_np, sample_weight=weight_list)

    args.word_labels = kmeans.labels_
    args.centroids = kmeans.cluster_centers_

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Compute the sentence embeddings
def compute_embedding(args, sentences):

    # Replace empty sentences with a single period
    samples = [sent if sent != [] else ['.'] for sent in sentences]

    sentences_by_id = []
    for sent in samples:
        sentences_by_id.append([args.word2id[word] for word in sent])

    embeddings = []

    # Process one sentence at a time
    for sent_id in sentences_by_id:

        stage_vec = [{}]

        # Original word vectors
        for word_id in sent_id:
            stage_vec[-1][word_id] = args.word_vec_np[word_id, :]

        # Group the weighted word vectors by their semantic cluster
        stage_vec.append({})
        for k, v in stage_vec[-2].items():
            index = args.word_labels[k]
            stage_vec[-1].setdefault(index, []).append(stage_vec[-2][k] * args.word_weight[k])

        # VLAD-style aggregation for each cluster: sum of residuals to the centroid
        for k, v in stage_vec[-1].items():
            centroid_vec = args.centroids[k]
            v = [wv - centroid_vec for wv in v]
            stage_vec[-1][k] = np.sum(v, 0)

        # Weighted average of the original word vectors
        sentvec = []
        vec = np.zeros((args.wvec_dim))
        for key, value in stage_vec[0].items():
            vec = vec + value * args.word_weight[key]
        sentvec.append(vec / len(stage_vec[0].keys()))

        # Covariance descriptor over the group vectors
        matrix = np.zeros((args.cluster_num, args.wvec_dim))
        for j in range(args.cluster_num):
            if j in stage_vec[-1]:
                matrix[j, :] = stage_vec[-1][j]
        matrix_no_mean = matrix - matrix.mean(1)[:, np.newaxis]
        cov = matrix_no_mean.dot(matrix_no_mean.T)

        # Flatten the upper triangle; scaling the off-diagonal entries by sqrt(2)
        # keeps the Euclidean norm of the flattened vector equal to the
        # Frobenius norm of the full covariance matrix
        iu1 = np.triu_indices(cov.shape[0])
        iu2 = np.triu_indices(cov.shape[0], 1)
        cov[iu2] = cov[iu2] * np.sqrt(2)
        vec = cov[iu1]

        vec = vec / np.linalg.norm(vec)

        sentvec.append(vec)

        sentvec = np.concatenate(sentvec)

        embeddings.append(sentvec)

    embeddings = np.vstack(embeddings)

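    # The optional post-processing below removes from each sentence embedding its
    # projection onto the top `args.postprocessing` principal directions of the
    # embedding matrix, so that directions shared by all sentences do not
    # dominate similarity comparisons.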
    # Post-processing
    if args.postprocessing:
        # Principal component removal
        print('Post-processing sentence embeddings using principal component removal')
        svd = TruncatedSVD(n_components=args.postprocessing, n_iter=7, random_state=0)
        svd.fit(embeddings)
        args.svd_comp = svd.components_

        if args.postprocessing == 1:
            embeddings = embeddings - embeddings.dot(args.svd_comp.transpose()) * args.svd_comp
        else:
            embeddings = embeddings - embeddings.dot(args.svd_comp.transpose()).dot(args.svd_comp)

    return embeddings
--------------------------------------------------------------------------------
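
A quick way to see why `compute_embedding` scales the off-diagonal entries of the covariance descriptor by `sqrt(2)` before flattening: with that scaling, the Euclidean norm of the flattened upper triangle equals the Frobenius norm of the full symmetric matrix, so distances between flattened descriptors behave like distances between the matrices themselves. The snippet below is a standalone sanity check, not part of the repository.

```
import numpy as np

# Toy symmetric matrix standing in for one sentence's covariance descriptor
C = np.array([[ 2.0, 0.5, -1.0],
              [ 0.5, 3.0,  0.2],
              [-1.0, 0.2,  1.5]])

iu1 = np.triu_indices(C.shape[0])      # upper triangle, diagonal included
iu2 = np.triu_indices(C.shape[0], 1)   # strictly above the diagonal

flat = C.copy()
flat[iu2] *= np.sqrt(2)                # same scaling as in compute_embedding
vec = flat[iu1]

# The flattened vector preserves the Frobenius norm of the full matrix
assert np.isclose(np.linalg.norm(vec), np.linalg.norm(C, 'fro'))
```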