├── figure2.png
├── figure3.png
├── s3e.py
├── README.md
└── utils.py

/figure2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BinWang28/Sentence-Embedding-S3E/HEAD/figure2.png
--------------------------------------------------------------------------------
/figure3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BinWang28/Sentence-Embedding-S3E/HEAD/figure3.png
--------------------------------------------------------------------------------
/s3e.py:
--------------------------------------------------------------------------------
import numpy as np
import argparse

from utils import load_file, create_dictionary, load_wordvec, load_word_weight
from utils import semantic_construction, compute_embedding


PATH_TO_VEC = [#'./word_embedding/glove.840B.300d.txt',                  # GloVe vectors
               './word_embedding/crawl-300d-2M.vec',                     # fastText vectors
               './word_embedding/lexvec.commoncrawl.300d.W.pos.vectors', # LexVec vectors
               './word_embedding/paragram_300_sl999.txt',                # PSL vectors
               ]
PATH_TO_WORD_WEIGHTS = './word_embedding/enwiki_vocab_min200.txt'        # word-frequency file used for weighting
PATH_TO_SENTENCE = './custrev.pos'

if __name__ == "__main__":

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Settings
    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_num", default=10, type=int,
                        help="number of semantic groups to construct")
    parser.add_argument("--postprocessing", default=1, type=int,
                        help="number of principal components to remove from the sentence embeddings (0 disables it)")
    args = parser.parse_args()

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Load the sentence file
    sentences = load_file(PATH_TO_SENTENCE)

    # Build the dictionary
    args.id2word, args.word2id = create_dictionary(sentences)

    # Load word vectors
    args.word_vec_np = load_wordvec(PATH_TO_VEC, args.word2id)
    args.wvec_dim = args.word_vec_np.shape[1]

    # Load word weights
    args.word_weight = load_word_weight(PATH_TO_WORD_WEIGHTS, args.word2id, a=1e-3)

    # Construct semantic groups
    semantic_construction(args)

    # Generate the sentence embeddings
    sentence_emb = compute_embedding(args, sentences)

    # Example: cosine similarity between two sentences chosen by index
    index1 = int(input("\nThe index for the first sentence: "))
    print("The first sentence is:", " ".join(sentences[index1]))
    index2 = int(input("The index for the second sentence: "))
    print("The second sentence is:", " ".join(sentences[index2]))

    similarity = (sentence_emb[index1].dot(sentence_emb[index2])
                  / np.linalg.norm(sentence_emb[index1])
                  / np.linalg.norm(sentence_emb[index2]))
    print("The similarity between them is:", similarity)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# S3E: Efficient Sentence Embedding via Semantic Subspace Analysis

![figure2](figure2.png)

S3E generates sentence embeddings from static word embeddings. The method consists of three steps: semantic group construction, an intra-group descriptor, and an inter-group descriptor. We provide the source code for generating your own sentence embeddings. More details can be found in the [S3E paper](https://arxiv.org/abs/2002.09620).


| Section | Description |
|-|-|
| [Installation](#installation) | How to set up the environment |
| [Quick Usage Guide](#quick-usage-guide) | A quick guide |
| [Citation](#citation) | Reference link |


## Installation
We use Python 3.7.

**Create a new environment**
```
conda create -n S3E python=3.7
conda activate S3E
```

**Install the dependencies**

```
conda install numpy
conda install -c anaconda scikit-learn
```

**Download the pretrained word embeddings**

The files can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1aGhxhXtW9dGYbu85e3Z77L-RDTJe5kAr?usp=sharing).

Place the word embedding files in a folder named `word_embedding` so that the paths at the top of `s3e.py` (`PATH_TO_VEC` and `PATH_TO_WORD_WEIGHTS`) resolve correctly.


## Quick Usage Guide

To run the example code, simply run:
```
python s3e.py --cluster_num 10
```

The script does the following:
- Loads the word embeddings and the sentence text file.
- The sentence file is `custrev.pos`, with one sentence per line.
- Runs the S3E method and generates the sentence embeddings.
- Lets you test the similarity of two sentences by entering their indices in the text file.
- Example (a programmatic sketch follows the figure below):

![figure3](figure3.png)
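
If you want to embed your own corpus instead of the bundled `custrev.pos`, the helper functions in `utils.py` can be called directly. The sketch below is not part of the repository; it simply mirrors `s3e.py`, assumes the pretrained embeddings from the Installation step sit in `word_embedding/`, and uses a hypothetical file `my_sentences.txt` with one sentence per line.

```
from types import SimpleNamespace
import numpy as np

from utils import (load_file, create_dictionary, load_wordvec,
                   load_word_weight, semantic_construction, compute_embedding)

# The helpers read their settings from attributes of a single args object.
args = SimpleNamespace(cluster_num=10, postprocessing=1)

sentences = load_file('my_sentences.txt')   # hypothetical input file, one sentence per line
args.id2word, args.word2id = create_dictionary(sentences)
args.word_vec_np = load_wordvec(['./word_embedding/crawl-300d-2M.vec'], args.word2id)
args.wvec_dim = args.word_vec_np.shape[1]
args.word_weight = load_word_weight('./word_embedding/enwiki_vocab_min200.txt',
                                    args.word2id, a=1e-3)

semantic_construction(args)                 # weighted k-means over the vocabulary
emb = compute_embedding(args, sentences)    # one row per sentence

# Cosine similarity between the first two sentences
sim = emb[0].dot(emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
print(sim)
```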

## Citation

If you find our model useful in your research, please consider citing our paper, [Efficient Sentence Embedding via Semantic Subspace Analysis](https://arxiv.org/abs/2002.09620):

```
@article{S3E,
  title   = {Efficient Sentence Embedding via Semantic Subspace Analysis},
  author  = {Wang, Bin and Chen, Fenxiao and Wang, Yuncheng and Kuo, C-C Jay},
  journal = {arXiv preprint arXiv:2002.09620},
  year    = {2020}
}
```

Contact person: Bin Wang, bwang28c@gmail.com

http://mcl.usc.edu/

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import io
import logging
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load the sentence file (one sentence per line)
def load_file(path):

    with open(path, 'r') as f:
        lines = f.readlines()
    sentences = [line.split(' ')[:-1] for line in lines]
    sentences = sentences[:-1]

    return sentences

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create the dictionary (word <-> id mappings)
def create_dictionary(sentences, threshold=0):
    words = {}
    for s in sentences:
        for word in s:
            words[word] = words.get(word, 0) + 1

    if threshold > 0:
        newwords = {}
        for word in words:
            if words[word] >= threshold:
                newwords[word] = words[word]
        words = newwords
    # Special tokens are ranked highest
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2

    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # sort by decreasing frequency
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Get word vectors for the vocabulary and store them in a numpy array
def load_wordvec(path_to_vec, word2id):
    N = len(word2id)
    dim = len(path_to_vec) * 300
    word_vec_np = np.zeros((N, dim))

    # Known words
    counts = []
    for i in range(len(path_to_vec)):
        count = 0
        with io.open(path_to_vec[i], 'r', encoding='utf-8') as f:
            # for word2vec / fastText format files, skip the header line with "next(f)"
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word2id:
                    count = count + 1
                    word_vec_np[word2id[word], i*300:(i+1)*300] = np.fromstring(vec, sep=' ')
        counts.append(count)

        print(path_to_vec[i])
        logging.info('Found {0} words with word vectors, out of {1} words'.format(count, len(word2id)))
        mean_vec = word_vec_np[:, i*300:(i+1)*300].sum(0) / count
        for j in range(N):
            if word_vec_np[j, i*300] == 0:
                word_vec_np[j, i*300:(i+1)*300] = mean_vec

        print('Unknowns are represented by the mean vector')

    # Pre-process the word embeddings (PVN): https://arxiv.org/pdf/1808.06305.pdf
    print('Pre-processing word embeddings following https://arxiv.org/pdf/1808.06305.pdf')
    word_vec_np = word_vec_np - np.mean(word_vec_np, 0)
    pca = PCA(n_components=300)
    pca.fit(word_vec_np)

    U1 = pca.components_
    explained_variance = pca.explained_variance_

    # Remove projections onto the top principal components
    PVN_dims = 10
    z = []
    for i, x in enumerate(word_vec_np):
        for j, u in enumerate(U1[0:PVN_dims]):
            ratio = (explained_variance[j] - explained_variance[PVN_dims]) / explained_variance[j]
            x = x - ratio * np.dot(u.transpose(), x) * u
        z.append(x)
    word_vec_np = np.asarray(z)

    return word_vec_np


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Get word weights based on frequency.
def load_word_weight(weightfile, word2id, a=1e-3):

    print('get_word_weights')
    if a <= 0:  # when the parameter makes no sense, use uniform weights
        a = 1.0

    word2weight = {}
    with open(weightfile) as f:
        lines = f.readlines()
    N = 0
    for i in lines:
        i = i.strip()
        if len(i) > 0:
            i = i.split()
            if len(i) == 2:
                word2weight[i[0]] = float(i[1])
                N += float(i[1])
            else:
                print(i)
    for key, value in word2weight.items():
        word2weight[key] = a / (a + value / N)

    # Update for the current vocabulary
    weight4ind = {}
    for word, ind in word2id.items():
        if word in word2weight:
            weight4ind[ind] = word2weight[word]
        else:
            weight4ind[ind] = 1.0  # weight given to unknown words

    return weight4ind


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Construct semantic groups with weighted k-means
def semantic_construction(args):

    weight_list = list(args.word_weight.values())
    weight_list = np.array(weight_list)
    print('perform weighted k-means')
    kmeans = KMeans(n_clusters=args.cluster_num).fit(args.word_vec_np, sample_weight=weight_list)

    args.word_labels = kmeans.labels_
    args.centroids = kmeans.cluster_centers_

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Compute the sentence embeddings
def compute_embedding(args, sentences):

    # Replace empty sentences with a single period
    samples = [sent if sent != [] else ['.'] for sent in sentences]

    sentences_by_id = []
    for sent in samples:
        sentences_by_id.append([args.word2id[word] for word in sent])

    embeddings = []

    # Process one sentence at a time
    for sent_id in sentences_by_id:

        stage_vec = [{}]

        # Original word vectors
        for word_id in sent_id:
            stage_vec[-1][word_id] = args.word_vec_np[word_id, :]

        # Group the weighted word vectors by their semantic cluster
        stage_vec.append({})
        for k, v in stage_vec[-2].items():
            index = args.word_labels[k]
            stage_vec[-1].setdefault(index, []).append(stage_vec[-2][k] * args.word_weight[k])

        # VLAD-style aggregation for each cluster: sum of residuals to the centroid
        for k, v in stage_vec[-1].items():
            centroid_vec = args.centroids[k]
            v = [wv - centroid_vec for wv in v]
            stage_vec[-1][k] = np.sum(v, 0)

        # Weighted average of the original word vectors
        sentvec = []
        vec = np.zeros((args.wvec_dim))
        for key, value in stage_vec[0].items():
            vec = vec + value * args.word_weight[key]
        sentvec.append(vec / len(stage_vec[0].keys()))

        # Covariance descriptor over the group vectors
        matrix = np.zeros((args.cluster_num, args.wvec_dim))
        for j in range(args.cluster_num):
            if j in stage_vec[-1]:
                matrix[j, :] = stage_vec[-1][j]
        matrix_no_mean = matrix - matrix.mean(1)[:, np.newaxis]
        cov = matrix_no_mean.dot(matrix_no_mean.T)

        # Flatten the upper triangle; scaling the off-diagonal entries by sqrt(2)
        # keeps the Euclidean norm of the flattened vector equal to the
        # Frobenius norm of the full covariance matrix
        iu1 = np.triu_indices(cov.shape[0])
        iu2 = np.triu_indices(cov.shape[0], 1)
        cov[iu2] = cov[iu2] * np.sqrt(2)
        vec = cov[iu1]

        vec = vec / np.linalg.norm(vec)

        sentvec.append(vec)

        sentvec = np.concatenate(sentvec)

        embeddings.append(sentvec)

    embeddings = np.vstack(embeddings)

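    # The optional post-processing below removes from each sentence embedding its
    # projection onto the top `args.postprocessing` principal directions of the
    # embedding matrix, so that directions shared by all sentences do not
    # dominate similarity comparisons.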
    # Post-processing
    if args.postprocessing:
        # Principal component removal
        print('Post-processing sentence embeddings using principal component removal')
        svd = TruncatedSVD(n_components=args.postprocessing, n_iter=7, random_state=0)
        svd.fit(embeddings)
        args.svd_comp = svd.components_

        if args.postprocessing == 1:
            embeddings = embeddings - embeddings.dot(args.svd_comp.transpose()) * args.svd_comp
        else:
            embeddings = embeddings - embeddings.dot(args.svd_comp.transpose()).dot(args.svd_comp)

    return embeddings
--------------------------------------------------------------------------------
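
A quick way to see why `compute_embedding` scales the off-diagonal entries of the covariance descriptor by `sqrt(2)` before flattening: with that scaling, the Euclidean norm of the flattened upper triangle equals the Frobenius norm of the full symmetric matrix, so distances between flattened descriptors behave like distances between the matrices themselves. The snippet below is a standalone sanity check, not part of the repository.

```
import numpy as np

# Toy symmetric matrix standing in for one sentence's covariance descriptor
C = np.array([[ 2.0, 0.5, -1.0],
              [ 0.5, 3.0,  0.2],
              [-1.0, 0.2,  1.5]])

iu1 = np.triu_indices(C.shape[0])      # upper triangle, diagonal included
iu2 = np.triu_indices(C.shape[0], 1)   # strictly above the diagonal

flat = C.copy()
flat[iu2] *= np.sqrt(2)                # same scaling as in compute_embedding
vec = flat[iu1]

# The flattened vector preserves the Frobenius norm of the full matrix
assert np.isclose(np.linalg.norm(vec), np.linalg.norm(C, 'fro'))
```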