├── Dynamic node2vec ├── dynamic node2vec.py └── main.py ├── README.md ├── node2vec ├── main.py └── node2vec.py └── word2vec ├── dictionary.py ├── keyedvectors.py ├── matutils.py ├── setup.py ├── utils.py ├── voidptr.h ├── word2vec.py ├── word2vec_inner.c ├── word2vec_inner.pxd └── word2vec_inner.pyx /Dynamic node2vec/dynamic node2vec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import threadpool 4 | import multiprocessing 5 | import random 6 | 7 | 8 | def worker(worker_no, walk_length, nodes, graph): 9 | walks = [] 10 | print "worker %s start" % str(worker_no) 11 | random.shuffle(nodes) 12 | for node in nodes: 13 | walks.append(graph.node2vec_walk(walk_length=walk_length, start_node=node)) 14 | print "worker %s finish" % str(worker_no) 15 | return walks 16 | 17 | 18 | class Graph(): 19 | def __init__(self, nx_G, is_directed, p, q): 20 | self.G = nx_G 21 | self.is_directed = is_directed 22 | self.p = p 23 | self.q = q 24 | 25 | def node2vec_walk(self, walk_length, start_node): 26 | ''' 27 | Simulate a random walk starting from start node. 28 | ''' 29 | G = self.G 30 | alias_nodes = self.alias_nodes 31 | alias_edges = self.alias_edges 32 | 33 | walk = [start_node] 34 | 35 | while len(walk) < walk_length: 36 | cur = walk[-1] 37 | cur_nbrs = sorted(G.neighbors(cur)) 38 | if len(cur_nbrs) > 0: 39 | if len(walk) == 1: 40 | walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 41 | else: 42 | prev = walk[-2] 43 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 44 | alias_edges[(prev, cur)][1])] 45 | walk.append(next) 46 | else: 47 | break 48 | 49 | return walk 50 | 51 | def simulate_walks(self, num_walks, walk_length, nodes=None): 52 | ''' 53 | Repeatedly simulate random walks from each node. 54 | ''' 55 | G = self.G 56 | if nodes is None: 57 | nodes = list(G.nodes()) 58 | print "nodes count", len(nodes) 59 | 60 | walks = [] 61 | for i in range(num_walks): 62 | for node in nodes: 63 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) 64 | # result = [] 65 | # jobs = [] 66 | # pool = multiprocessing.Pool(processes=num_walks) 67 | # for i in range(num_walks): 68 | # jobs.append((i, walk_length, nodes, self)) 69 | # # result.append(pool.apply_async(worker, args=(i, walk_length, nodes, self))) 70 | # pool.imap(worker, jobs) 71 | # pool.close() 72 | # pool.join() 73 | # 74 | # walks = [] 75 | # for walk in result: 76 | # walks.extend(walk.get()) 77 | return walks 78 | 79 | def get_alias_edge(self, src, dst): 80 | ''' 81 | Get the alias edge setup lists for a given edge. 82 | ''' 83 | G = self.G 84 | p = self.p 85 | q = self.q 86 | 87 | unnormalized_probs = [] 88 | for dst_nbr in sorted(G.neighbors(dst)): 89 | if dst_nbr == src: 90 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 91 | elif G.has_edge(dst_nbr, src): 92 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 93 | else: 94 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 95 | norm_const = sum(unnormalized_probs) 96 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 97 | 98 | return alias_setup(normalized_probs) 99 | 100 | def preprocess_transition_probs(self): 101 | ''' 102 | Preprocessing of transition probabilities for guiding the random walks. 
103 | ''' 104 | G = self.G 105 | is_directed = self.is_directed 106 | 107 | alias_nodes = {} 108 | for node in G.nodes(): 109 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] 110 | norm_const = sum(unnormalized_probs) 111 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 112 | alias_nodes[node] = alias_setup(normalized_probs) 113 | 114 | alias_edges = {} 115 | triads = {} 116 | 117 | if is_directed: 118 | for edge in G.edges(): 119 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 120 | else: 121 | for edge in G.edges(): 122 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 123 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 124 | 125 | self.alias_nodes = alias_nodes 126 | self.alias_edges = alias_edges 127 | 128 | return 129 | 130 | 131 | def alias_setup(probs): 132 | ''' 133 | Compute utility lists for non-uniform sampling from discrete distributions. 134 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 135 | for details 136 | ''' 137 | K = len(probs) 138 | q = np.zeros(K) 139 | J = np.zeros(K, dtype=np.int) 140 | 141 | smaller = [] 142 | larger = [] 143 | for kk, prob in enumerate(probs): 144 | q[kk] = K * prob 145 | if q[kk] < 1.0: 146 | smaller.append(kk) 147 | else: 148 | larger.append(kk) 149 | 150 | while len(smaller) > 0 and len(larger) > 0: 151 | small = smaller.pop() 152 | large = larger.pop() 153 | 154 | J[small] = large 155 | q[large] = q[large] + q[small] - 1.0 156 | if q[large] < 1.0: 157 | smaller.append(large) 158 | else: 159 | larger.append(large) 160 | 161 | return J, q 162 | 163 | 164 | def alias_draw(J, q): 165 | ''' 166 | Draw sample from a non-uniform discrete distribution using alias sampling. 167 | ''' 168 | K = len(J) 169 | 170 | kk = int(np.floor(np.random.rand() * K)) 171 | if np.random.rand() < q[kk]: 172 | return kk 173 | else: 174 | return J[kk] 175 | -------------------------------------------------------------------------------- /Dynamic node2vec/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference implementation of node2vec. 3 | 4 | Author: Aditya Grover 5 | 6 | For more details, refer to the paper: 7 | node2vec: Scalable Feature Learning for Networks 8 | Aditya Grover and Jure Leskovec 9 | Knowledge Discovery and Data Mining (KDD), 2016 10 | ''' 11 | 12 | import argparse 13 | import numpy as np 14 | import networkx as nx 15 | import node2vec 16 | import logging 17 | import time 18 | from word2vec.word2vec import Word2Vec 19 | 20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, 21 | filename="node2vec.log", filemode="a") 22 | 23 | 24 | def parse_args(): 25 | ''' 26 | Parses the node2vec arguments. 27 | ''' 28 | parser = argparse.ArgumentParser(description="Run node2vec.") 29 | 30 | parser.add_argument('--input', nargs='?', default='../output/bn.edgelist', 31 | help='Input graph path') 32 | 33 | parser.add_argument('--output', nargs='?', default='b.emb', 34 | help='Embeddings path') 35 | 36 | parser.add_argument('--dimensions', type=int, default=128, 37 | help='Number of dimensions. Default is 128.') 38 | 39 | parser.add_argument('--walk-length', type=int, default=80, 40 | help='Length of walk per source. Default is 80.') 41 | 42 | parser.add_argument('--num-walks', type=int, default=10, 43 | help='Number of walks per source. 
Default is 10.')
44 | 
45 |     parser.add_argument('--window-size', type=int, default=10,
46 |                         help='Context size for optimization. Default is 10.')
47 | 
48 |     parser.add_argument('--iter', default=1, type=int,
49 |                         help='Number of epochs in SGD. Default is 1.')
50 | 
51 |     parser.add_argument('--workers', type=int, default=32,
52 |                         help='Number of parallel workers. Default is 32.')
53 | 
54 |     parser.add_argument('--p', type=float, default=1,
55 |                         help='Return hyperparameter. Default is 1.')
56 | 
57 |     parser.add_argument('--q', type=float, default=1,
58 |                         help='In-out hyperparameter. Default is 1.')
59 | 
60 |     parser.add_argument('--weighted', dest='weighted', action='store_true',
61 |                         help='Boolean specifying (un)weighted. Default is unweighted.')
62 |     parser.add_argument('--unweighted', dest='unweighted', action='store_false')
63 |     parser.set_defaults(weighted=False)
64 | 
65 |     parser.add_argument('--directed', dest='directed', action='store_true',
66 |                         help='Graph is (un)directed. Default is undirected.')
67 |     parser.add_argument('--undirected', dest='undirected', action='store_false')
68 |     parser.add_argument('--dynamic', dest='dynamic', default=True)
69 |     parser.add_argument('--old_input', dest='oldinput', default="../output/b.edgelist")
70 |     parser.add_argument('--old_emb', dest='oldemb', default="b.emb")
71 |     parser.set_defaults(directed=False)
72 | 
73 |     return parser.parse_args()
74 | 
75 | 
76 | def read_graph(input):
77 |     '''
78 |     Reads the input network into a networkx graph.
79 |     '''
80 |     if args.weighted:
81 |         G = nx.read_edgelist(input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph())
82 |     else:
83 |         G = nx.read_edgelist(input, nodetype=int, create_using=nx.DiGraph())
84 |         for edge in G.edges():
85 |             G[edge[0]][edge[1]]['weight'] = 1
86 | 
87 |     if not args.directed:
88 |         G = G.to_undirected()
89 | 
90 |     return G
91 | 
92 | 
93 | def learn_embeddings(walks):
94 |     '''
95 |     Learn embeddings by optimizing the Skipgram objective using SGD.
96 | ''' 97 | walks = [map(str, walk) for walk in walks] 98 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers) 99 | model.init_sims(replace=True) 100 | model.wv.save_word2vec_format(args.output) 101 | return 102 | 103 | 104 | def find_changed_edge(old_g, new_g): 105 | old_edge = set([(u, old_g.G.edge[u].keys()[0]) for u in old_g.G.edge]) 106 | new_edge = set([(u, new_g.G.edge[u].keys()[0]) for u in new_g.G.edge]) 107 | 108 | vanish = old_edge - new_edge 109 | add = new_edge - old_edge 110 | 111 | print "-:", len(vanish), "+:", len(add) 112 | 113 | return vanish, add 114 | 115 | 116 | def find_near_node(pair, G, deep=1): 117 | node_set = set([]) 118 | new_node_set = set([]) 119 | for i in range(len(pair)): 120 | node_set.add(pair[i]) 121 | 122 | for node in node_set: 123 | for n in G.G.adj[node].keys(): 124 | if n not in node_set: 125 | new_node_set.add(n) 126 | node_set |= new_node_set 127 | 128 | for i in range(deep): 129 | temp_set = set([]) 130 | for node in new_node_set: 131 | for n in G.G.adj[node].keys(): 132 | if n not in node_set: 133 | node_set.add(n) 134 | temp_set.add(n) 135 | new_node_set = temp_set 136 | return node_set 137 | 138 | 139 | def train_vanish(walks, sent_edge_dict): 140 | walks = [map(str, walk) for walk in walks] 141 | if len(walks) > 0: 142 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, sent_edge_dict=sent_edge_dict) 143 | model.init_sims(replace=True) 144 | vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab} 145 | return vec 146 | else: 147 | return {} 148 | 149 | 150 | def train_add(walks): 151 | walks = [map(str, walk) for walk in walks] 152 | if len(walks) > 0: 153 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers) 154 | model.init_sims(replace=True) 155 | vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab} 156 | return vec 157 | else: 158 | return {} 159 | 160 | 161 | def main(args): 162 | ''' 163 | Pipeline for representational learning for all nodes in a graph. 
164 | ''' 165 | if args.dynamic == True: 166 | print "dynamic" 167 | nx_G_old = read_graph(args.oldinput) 168 | G_old = node2vec.Graph(nx_G_old, args.directed, args.p, args.q) 169 | G_old.preprocess_transition_probs() 170 | 171 | nx_G_new = read_graph(args.input) 172 | G_new = node2vec.Graph(nx_G_new, args.directed, args.p, args.q) 173 | G_new.preprocess_transition_probs() 174 | 175 | print "load graph finish" 176 | print "old graph: nodes:", len(G_old.G.nodes()), "edges:", len(G_old.G.edges()) 177 | print "new graph: nodes:", len(G_new.G.nodes()), "edges:", len(G_new.G.edges()) 178 | 179 | vanish_edge, add_edge = find_changed_edge(G_old, G_new) 180 | 181 | vec = {} 182 | f = open(args.oldemb, "r") 183 | for line in f: 184 | node_vec = line.strip().split(" ") 185 | if len(node_vec) == args.dimensions + 1: 186 | vec[node_vec[0]] = np.array(map(str, node_vec[1:])) 187 | print "load vec finish" 188 | walk_vanish = [] 189 | edge_count = 0 190 | sent_edge_dict = {} 191 | vanish_dict = {} 192 | 193 | for pair in vanish_edge: 194 | if pair[0] < pair[1]: 195 | if pair[0] in vanish_dict: 196 | vanish_dict[pair[0]].add(pair[1]) 197 | else: 198 | vanish_dict[pair[0]] = {pair[1]} 199 | else: 200 | if pair[1] in vanish_dict: 201 | vanish_dict[pair[1]].add(pair[0]) 202 | else: 203 | vanish_dict[pair[1]] = {pair[0]} 204 | near_node = set([]) 205 | for pair in vanish_edge: 206 | near_node |= find_near_node(pair, G_old) 207 | print "near_node:", len(near_node) 208 | walks = G_old.simulate_walks(50, 5, nodes=list(near_node)) 209 | print "gen corpus:", len(walks) 210 | for l in walks: 211 | p_idx = 0 212 | flag = 0 213 | for index in range(len(l) - 1): 214 | if l[index] < l[index + 1]: 215 | k = l[index] 216 | v = l[index + 1] 217 | else: 218 | k = l[index + 1] 219 | v = l[index] 220 | 221 | if k in vanish_dict and v in vanish_dict[k]: 222 | if flag == 0: 223 | flag = 1 224 | p_idx = index 225 | elif flag == 1: 226 | edge = [l[p_idx], l[p_idx + 1]] 227 | if k not in edge or v not in edge: 228 | flag = 2 229 | break 230 | if flag == 1: 231 | walk_vanish.append(l) 232 | sent_edge_dict[edge_count] = p_idx 233 | edge_count += 1 234 | 235 | print "vanish corpus:", len(walk_vanish) 236 | update_vec = train_vanish(walk_vanish, sent_edge_dict) 237 | 238 | for node in update_vec: 239 | if node in G_new.G.node: 240 | vec[node] = update_vec[node] 241 | else: 242 | del vec[node] 243 | print "update vec" 244 | 245 | near_node = set([]) 246 | for pair in add_edge: 247 | near_node |= find_near_node(pair, G_new) 248 | walks = G_new.simulate_walks(50, 5, nodes=list(near_node)) 249 | print "gen add corpus:", len(walks) 250 | update_vec = train_add(walks) 251 | for node in update_vec: 252 | vec[node] = update_vec[node] 253 | print "update vec" 254 | 255 | f = open(args.output, "a") 256 | f.truncate() 257 | for k in vec: 258 | f.write(k + " " + " ".join(map(str, vec[k])) + "\n") 259 | else: 260 | nx_G = read_graph(args.input) 261 | G = node2vec.Graph(nx_G, args.directed, args.p, args.q) 262 | print "load graph finish" 263 | G.preprocess_transition_probs() 264 | walks = G.simulate_walks(args.num_walks, args.walk_length) 265 | print "gen corpus", len(walks) 266 | learn_embeddings(walks) 267 | print "finish" 268 | 269 | if __name__ == "__main__": 270 | args = parse_args() 271 | # args.dynamic = eval(args.dynamic) 272 | print type(args.dynamic), args.input, args.oldemb, args.output 273 | a = time.time() 274 | main(args) 275 | b = time.time() 276 | logging.info(str(b - a)) 277 | 278 | 
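Note on the diff step above: find_changed_edge() compares only the first listed neighbor of each node (G.edge[u].keys()[0]), so changes involving a node's other neighbors go undetected. If a full symmetric difference of the edge sets is what is intended, a minimal sketch is shown below. This is an illustration only, assuming undirected networkx graphs; edge_diff, old_G and new_G are hypothetical names that do not appear in this repository.

import networkx as nx


def edge_diff(old_G, new_G):
    # Treat each undirected edge as a sorted tuple so (u, v) and (v, u) compare equal.
    old_edges = set(tuple(sorted(e)) for e in old_G.edges())
    new_edges = set(tuple(sorted(e)) for e in new_G.edges())
    vanished = old_edges - new_edges   # edges present only in the old snapshot
    added = new_edges - old_edges      # edges present only in the new snapshot
    return vanished, added


# Tiny example: vanished == {(1, 2)}, added == {(3, 4)}
g_old = nx.Graph([(1, 2), (2, 3)])
g_new = nx.Graph([(2, 3), (3, 4)])
vanished, added = edge_diff(g_old, g_new)

The sorted-tuple normalization matches the (smaller node, larger node) keying that main() uses when it builds vanish_dict.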
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dynamic_network_embedding -------------------------------------------------------------------------------- /node2vec/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference implementation of node2vec. 3 | 4 | Author: Aditya Grover 5 | 6 | For more details, refer to the paper: 7 | node2vec: Scalable Feature Learning for Networks 8 | Aditya Grover and Jure Leskovec 9 | Knowledge Discovery and Data Mining (KDD), 2016 10 | ''' 11 | 12 | import argparse 13 | import numpy as np 14 | import networkx as nx 15 | import node2vec 16 | import logging 17 | import time 18 | from word2vec.word2vec import Word2Vec 19 | 20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, 21 | filename="node2vec.log", filemode="a") 22 | 23 | 24 | def parse_args(): 25 | ''' 26 | Parses the node2vec arguments. 27 | ''' 28 | parser = argparse.ArgumentParser(description="Run node2vec.") 29 | 30 | parser.add_argument('--input', nargs='?', default='../output/bn.edgelist', 31 | help='Input graph path') 32 | 33 | parser.add_argument('--output', nargs='?', default='b.emb', 34 | help='Embeddings path') 35 | 36 | parser.add_argument('--dimensions', type=int, default=128, 37 | help='Number of dimensions. Default is 128.') 38 | 39 | parser.add_argument('--walk-length', type=int, default=80, 40 | help='Length of walk per source. Default is 80.') 41 | 42 | parser.add_argument('--num-walks', type=int, default=10, 43 | help='Number of walks per source. Default is 10.') 44 | 45 | parser.add_argument('--window-size', type=int, default=10, 46 | help='Context size for optimization. Default is 10.') 47 | 48 | parser.add_argument('--iter', default=1, type=int, 49 | help='Number of epochs in SGD') 50 | 51 | parser.add_argument('--workers', type=int, default=32, 52 | help='Number of parallel workers. Default is 8.') 53 | 54 | parser.add_argument('--p', type=float, default=1, 55 | help='Return hyperparameter. Default is 1.') 56 | 57 | parser.add_argument('--q', type=float, default=1, 58 | help='Inout hyperparameter. Default is 1.') 59 | 60 | parser.add_argument('--weighted', dest='weighted', action='store_true', 61 | help='Boolean specifying (un)weighted. Default is unweighted.') 62 | parser.add_argument('--unweighted', dest='unweighted', action='store_false') 63 | parser.set_defaults(weighted=False) 64 | 65 | parser.add_argument('--directed', dest='directed', action='store_true', 66 | help='Graph is (un)directed. Default is undirected.') 67 | parser.add_argument('--undirected', dest='undirected', action='store_false') 68 | parser.add_argument('--dynamic', dest='dynamic', default=True) 69 | parser.add_argument('--old_input)', dest='oldinput', default="../output/b.edgelist") 70 | parser.add_argument('--old_emb)', dest='oldemb', default="b.emb") 71 | parser.set_defaults(directed=False) 72 | 73 | return parser.parse_args() 74 | 75 | 76 | def read_graph(input): 77 | ''' 78 | Reads the input network in networkx. 
79 | ''' 80 | if args.weighted: 81 | G = nx.read_edgelist(input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph()) 82 | else: 83 | G = nx.read_edgelist(input, nodetype=int, create_using=nx.DiGraph()) 84 | for edge in G.edges(): 85 | G[edge[0]][edge[1]]['weight'] = 1 86 | 87 | if not args.directed: 88 | G = G.to_undirected() 89 | 90 | return G 91 | 92 | 93 | def learn_embeddings(walks): 94 | ''' 95 | Learn embeddings by optimizing the Skipgram objective using SGD. 96 | ''' 97 | walks = [map(str, walk) for walk in walks] 98 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers) 99 | model.init_sims(replace=True) 100 | model.wv.save_word2vec_format(args.output) 101 | return 102 | 103 | 104 | def find_changed_edge(old_g, new_g): 105 | old_edge = set([(u, old_g.G.edge[u].keys()[0]) for u in old_g.G.edge]) 106 | new_edge = set([(u, new_g.G.edge[u].keys()[0]) for u in new_g.G.edge]) 107 | 108 | vanish = old_edge - new_edge 109 | add = new_edge - old_edge 110 | 111 | print "-:", len(vanish), "+:", len(add) 112 | 113 | return vanish, add 114 | 115 | 116 | def find_near_node(pair, G, deep=1): 117 | node_set = set([]) 118 | new_node_set = set([]) 119 | for i in range(len(pair)): 120 | node_set.add(pair[i]) 121 | 122 | for node in node_set: 123 | for n in G.G.adj[node].keys(): 124 | if n not in node_set: 125 | new_node_set.add(n) 126 | node_set |= new_node_set 127 | 128 | for i in range(deep): 129 | temp_set = set([]) 130 | for node in new_node_set: 131 | for n in G.G.adj[node].keys(): 132 | if n not in node_set: 133 | node_set.add(n) 134 | temp_set.add(n) 135 | new_node_set = temp_set 136 | return node_set 137 | 138 | 139 | def train_vanish(walks, sent_edge_dict): 140 | walks = [map(str, walk) for walk in walks] 141 | if len(walks) > 0: 142 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, sent_edge_dict=sent_edge_dict) 143 | model.init_sims(replace=True) 144 | vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab} 145 | return vec 146 | else: 147 | return {} 148 | 149 | 150 | def train_add(walks): 151 | walks = [map(str, walk) for walk in walks] 152 | if len(walks) > 0: 153 | model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers) 154 | model.init_sims(replace=True) 155 | vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab} 156 | return vec 157 | else: 158 | return {} 159 | 160 | 161 | def main(args): 162 | ''' 163 | Pipeline for representational learning for all nodes in a graph. 
164 | ''' 165 | if args.dynamic == True: 166 | print "dynamic" 167 | nx_G_old = read_graph(args.oldinput) 168 | G_old = node2vec.Graph(nx_G_old, args.directed, args.p, args.q) 169 | G_old.preprocess_transition_probs() 170 | 171 | nx_G_new = read_graph(args.input) 172 | G_new = node2vec.Graph(nx_G_new, args.directed, args.p, args.q) 173 | G_new.preprocess_transition_probs() 174 | 175 | print "load graph finish" 176 | print "old graph: nodes:", len(G_old.G.nodes()), "edges:", len(G_old.G.edges()) 177 | print "new graph: nodes:", len(G_new.G.nodes()), "edges:", len(G_new.G.edges()) 178 | 179 | vanish_edge, add_edge = find_changed_edge(G_old, G_new) 180 | 181 | vec = {} 182 | f = open(args.oldemb, "r") 183 | for line in f: 184 | node_vec = line.strip().split(" ") 185 | if len(node_vec) == args.dimensions + 1: 186 | vec[node_vec[0]] = np.array(map(str, node_vec[1:])) 187 | print "load vec finish" 188 | walk_vanish = [] 189 | edge_count = 0 190 | sent_edge_dict = {} 191 | vanish_dict = {} 192 | 193 | for pair in vanish_edge: 194 | if pair[0] < pair[1]: 195 | if pair[0] in vanish_dict: 196 | vanish_dict[pair[0]].add(pair[1]) 197 | else: 198 | vanish_dict[pair[0]] = {pair[1]} 199 | else: 200 | if pair[1] in vanish_dict: 201 | vanish_dict[pair[1]].add(pair[0]) 202 | else: 203 | vanish_dict[pair[1]] = {pair[0]} 204 | near_node = set([]) 205 | for pair in vanish_edge: 206 | near_node |= find_near_node(pair, G_old) 207 | print "near_node:", len(near_node) 208 | walks = G_old.simulate_walks(50, 5, nodes=list(near_node)) 209 | print "gen corpus:", len(walks) 210 | for l in walks: 211 | p_idx = 0 212 | flag = 0 213 | for index in range(len(l) - 1): 214 | if l[index] < l[index + 1]: 215 | k = l[index] 216 | v = l[index + 1] 217 | else: 218 | k = l[index + 1] 219 | v = l[index] 220 | 221 | if k in vanish_dict and v in vanish_dict[k]: 222 | if flag == 0: 223 | flag = 1 224 | p_idx = index 225 | elif flag == 1: 226 | edge = [l[p_idx], l[p_idx + 1]] 227 | if k not in edge or v not in edge: 228 | flag = 2 229 | break 230 | if flag == 1: 231 | walk_vanish.append(l) 232 | sent_edge_dict[edge_count] = p_idx 233 | edge_count += 1 234 | 235 | print "vanish corpus:", len(walk_vanish) 236 | update_vec = train_vanish(walk_vanish, sent_edge_dict) 237 | 238 | for node in update_vec: 239 | if node in G_new.G.node: 240 | vec[node] = update_vec[node] 241 | else: 242 | del vec[node] 243 | print "update vec" 244 | 245 | near_node = set([]) 246 | for pair in add_edge: 247 | near_node |= find_near_node(pair, G_new) 248 | walks = G_new.simulate_walks(50, 5, nodes=list(near_node)) 249 | print "gen add corpus:", len(walks) 250 | update_vec = train_add(walks) 251 | for node in update_vec: 252 | vec[node] = update_vec[node] 253 | print "update vec" 254 | 255 | f = open(args.output, "a") 256 | f.truncate() 257 | for k in vec: 258 | f.write(k + " " + " ".join(map(str, vec[k])) + "\n") 259 | else: 260 | nx_G = read_graph(args.input) 261 | G = node2vec.Graph(nx_G, args.directed, args.p, args.q) 262 | print "load graph finish" 263 | G.preprocess_transition_probs() 264 | walks = G.simulate_walks(args.num_walks, args.walk_length) 265 | print "gen corpus", len(walks) 266 | learn_embeddings(walks) 267 | print "finish" 268 | 269 | if __name__ == "__main__": 270 | args = parse_args() 271 | # args.dynamic = eval(args.dynamic) 272 | print type(args.dynamic), args.input, args.oldemb, args.output 273 | a = time.time() 274 | main(args) 275 | b = time.time() 276 | logging.info(str(b - a)) 277 | 278 | 
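For reference, the static (non-dynamic) branch of main() above condenses to the short driver below. This is a sketch, not part of the repository: it assumes the node2vec and word2vec modules shown in this dump are importable from the working directory, and it uses networkx's built-in karate-club graph purely as a stand-in for the edge list that read_graph() would normally load.

import networkx as nx
import node2vec
from word2vec.word2vec import Word2Vec

nx_G = nx.karate_club_graph()                   # stand-in for nx.read_edgelist(args.input, ...)
for u, v in nx_G.edges():
    nx_G[u][v]['weight'] = 1                    # unweighted input gets unit weights, as in read_graph()

G = node2vec.Graph(nx_G, is_directed=False, p=1, q=1)
G.preprocess_transition_probs()                 # build alias tables for nodes and edges
walks = G.simulate_walks(num_walks=10, walk_length=80)

walks = [map(str, walk) for walk in walks]      # Word2Vec expects string tokens (Python 2 map -> list)
model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, workers=4)
model.init_sims(replace=True)
model.wv.save_word2vec_format('example.emb')

The dynamic branch differs only in that it restricts simulate_walks() to nodes near changed edges and merges the resulting vectors into the previously saved embedding file.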
-------------------------------------------------------------------------------- /node2vec/node2vec.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import networkx as nx 3 | import threadpool 4 | import multiprocessing 5 | import random 6 | 7 | 8 | def worker(worker_no, walk_length, nodes, graph): 9 | walks = [] 10 | print "worker %s start" % str(worker_no) 11 | random.shuffle(nodes) 12 | for node in nodes: 13 | walks.append(graph.node2vec_walk(walk_length=walk_length, start_node=node)) 14 | print "worker %s finish" % str(worker_no) 15 | return walks 16 | 17 | 18 | class Graph(): 19 | def __init__(self, nx_G, is_directed, p, q): 20 | self.G = nx_G 21 | self.is_directed = is_directed 22 | self.p = p 23 | self.q = q 24 | 25 | def node2vec_walk(self, walk_length, start_node): 26 | ''' 27 | Simulate a random walk starting from start node. 28 | ''' 29 | G = self.G 30 | alias_nodes = self.alias_nodes 31 | alias_edges = self.alias_edges 32 | 33 | walk = [start_node] 34 | 35 | while len(walk) < walk_length: 36 | cur = walk[-1] 37 | cur_nbrs = sorted(G.neighbors(cur)) 38 | if len(cur_nbrs) > 0: 39 | if len(walk) == 1: 40 | walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 41 | else: 42 | prev = walk[-2] 43 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 44 | alias_edges[(prev, cur)][1])] 45 | walk.append(next) 46 | else: 47 | break 48 | 49 | return walk 50 | 51 | def simulate_walks(self, num_walks, walk_length, nodes=None): 52 | ''' 53 | Repeatedly simulate random walks from each node. 54 | ''' 55 | G = self.G 56 | if nodes is None: 57 | nodes = list(G.nodes()) 58 | print "nodes count", len(nodes) 59 | 60 | walks = [] 61 | for i in range(num_walks): 62 | for node in nodes: 63 | walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) 64 | # result = [] 65 | # jobs = [] 66 | # pool = multiprocessing.Pool(processes=num_walks) 67 | # for i in range(num_walks): 68 | # jobs.append((i, walk_length, nodes, self)) 69 | # # result.append(pool.apply_async(worker, args=(i, walk_length, nodes, self))) 70 | # pool.imap(worker, jobs) 71 | # pool.close() 72 | # pool.join() 73 | # 74 | # walks = [] 75 | # for walk in result: 76 | # walks.extend(walk.get()) 77 | return walks 78 | 79 | def get_alias_edge(self, src, dst): 80 | ''' 81 | Get the alias edge setup lists for a given edge. 82 | ''' 83 | G = self.G 84 | p = self.p 85 | q = self.q 86 | 87 | unnormalized_probs = [] 88 | for dst_nbr in sorted(G.neighbors(dst)): 89 | if dst_nbr == src: 90 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 91 | elif G.has_edge(dst_nbr, src): 92 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 93 | else: 94 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 95 | norm_const = sum(unnormalized_probs) 96 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 97 | 98 | return alias_setup(normalized_probs) 99 | 100 | def preprocess_transition_probs(self): 101 | ''' 102 | Preprocessing of transition probabilities for guiding the random walks. 
103 | ''' 104 | G = self.G 105 | is_directed = self.is_directed 106 | 107 | alias_nodes = {} 108 | for node in G.nodes(): 109 | unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] 110 | norm_const = sum(unnormalized_probs) 111 | normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs] 112 | alias_nodes[node] = alias_setup(normalized_probs) 113 | 114 | alias_edges = {} 115 | triads = {} 116 | 117 | if is_directed: 118 | for edge in G.edges(): 119 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 120 | else: 121 | for edge in G.edges(): 122 | alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) 123 | alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) 124 | 125 | self.alias_nodes = alias_nodes 126 | self.alias_edges = alias_edges 127 | 128 | return 129 | 130 | 131 | def alias_setup(probs): 132 | ''' 133 | Compute utility lists for non-uniform sampling from discrete distributions. 134 | Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ 135 | for details 136 | ''' 137 | K = len(probs) 138 | q = np.zeros(K) 139 | J = np.zeros(K, dtype=np.int) 140 | 141 | smaller = [] 142 | larger = [] 143 | for kk, prob in enumerate(probs): 144 | q[kk] = K * prob 145 | if q[kk] < 1.0: 146 | smaller.append(kk) 147 | else: 148 | larger.append(kk) 149 | 150 | while len(smaller) > 0 and len(larger) > 0: 151 | small = smaller.pop() 152 | large = larger.pop() 153 | 154 | J[small] = large 155 | q[large] = q[large] + q[small] - 1.0 156 | if q[large] < 1.0: 157 | smaller.append(large) 158 | else: 159 | larger.append(large) 160 | 161 | return J, q 162 | 163 | 164 | def alias_draw(J, q): 165 | ''' 166 | Draw sample from a non-uniform discrete distribution using alias sampling. 167 | ''' 168 | K = len(J) 169 | 170 | kk = int(np.floor(np.random.rand() * K)) 171 | if np.random.rand() < q[kk]: 172 | return kk 173 | else: 174 | return J[kk] 175 | -------------------------------------------------------------------------------- /word2vec/dictionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | 8 | """ 9 | This module implements the concept of Dictionary -- a mapping between words and 10 | their integer ids. 11 | 12 | Dictionaries can be created from a corpus and can later be pruned according to 13 | document frequency (removing (un)common words via the :func:`Dictionary.filter_extremes` method), 14 | save/loaded from disk (via :func:`Dictionary.save` and :func:`Dictionary.load` methods), merged 15 | with other dictionary (:func:`Dictionary.merge_with`) etc. 16 | """ 17 | 18 | from __future__ import with_statement 19 | 20 | from collections import Mapping, defaultdict 21 | import sys 22 | import logging 23 | import itertools 24 | 25 | import utils 26 | 27 | if sys.version_info[0] >= 3: 28 | unicode = str 29 | 30 | from six import PY3, iteritems, iterkeys, itervalues, string_types 31 | from six.moves import xrange 32 | from six.moves import zip as izip 33 | 34 | 35 | logger = logging.getLogger('gensim.corpora.dictionary') 36 | 37 | 38 | class Dictionary(utils.SaveLoad, Mapping): 39 | """ 40 | Dictionary encapsulates the mapping between normalized words and their integer ids. 
41 | 42 | The main function is `doc2bow`, which converts a collection of words to its 43 | bag-of-words representation: a list of (word_id, word_frequency) 2-tuples. 44 | """ 45 | def __init__(self, documents=None, prune_at=2000000): 46 | """ 47 | If `documents` are given, use them to initialize Dictionary (see `add_documents()`). 48 | """ 49 | self.token2id = {} # token -> tokenId 50 | self.id2token = {} # reverse mapping for token2id; only formed on request, to save memory 51 | self.dfs = {} # document frequencies: tokenId -> in how many documents this token appeared 52 | 53 | self.num_docs = 0 # number of documents processed 54 | self.num_pos = 0 # total number of corpus positions 55 | self.num_nnz = 0 # total number of non-zeroes in the BOW matrix 56 | 57 | if documents is not None: 58 | self.add_documents(documents, prune_at=prune_at) 59 | 60 | def __getitem__(self, tokenid): 61 | if len(self.id2token) != len(self.token2id): 62 | # the word->id mapping has changed (presumably via add_documents); 63 | # recompute id->word accordingly 64 | self.id2token = dict((v, k) for k, v in iteritems(self.token2id)) 65 | return self.id2token[tokenid] # will throw for non-existent ids 66 | 67 | def __iter__(self): 68 | return iter(self.keys()) 69 | 70 | if PY3: 71 | # restore Py2-style dict API 72 | iterkeys = __iter__ 73 | 74 | def iteritems(self): 75 | return self.items() 76 | 77 | def itervalues(self): 78 | return self.values() 79 | 80 | def keys(self): 81 | """Return a list of all token ids.""" 82 | return list(self.token2id.values()) 83 | 84 | def __len__(self): 85 | """ 86 | Return the number of token->id mappings in the dictionary. 87 | """ 88 | return len(self.token2id) 89 | 90 | def __str__(self): 91 | some_keys = list(itertools.islice(iterkeys(self.token2id), 5)) 92 | return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') 93 | 94 | @staticmethod 95 | def from_documents(documents): 96 | return Dictionary(documents=documents) 97 | 98 | def add_documents(self, documents, prune_at=2000000): 99 | """ 100 | Update dictionary from a collection of documents. Each document is a list 101 | of tokens = **tokenized and normalized** strings (either utf8 or unicode). 102 | 103 | This is a convenience wrapper for calling `doc2bow` on each document 104 | with `allow_update=True`, which also prunes infrequent words, keeping the 105 | total number of unique words <= `prune_at`. This is to save memory on very 106 | large inputs. To disable this pruning, set `prune_at=None`. 107 | 108 | >>> print(Dictionary(["máma mele maso".split(), "ema má máma".split()])) 109 | Dictionary(5 unique tokens) 110 | """ 111 | for docno, document in enumerate(documents): 112 | # log progress & run a regular check for pruning, once every 10k docs 113 | if docno % 10000 == 0: 114 | if prune_at is not None and len(self) > prune_at: 115 | self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at) 116 | logger.info("adding document #%i to %s", docno, self) 117 | 118 | # update Dictionary with the document 119 | self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids 120 | 121 | logger.info( 122 | "built %s from %i documents (total %i corpus positions)", 123 | self, self.num_docs, self.num_pos) 124 | 125 | def doc2bow(self, document, allow_update=False, return_missing=False): 126 | """ 127 | Convert `document` (a list of words) into the bag-of-words format = list 128 | of `(token_id, token_count)` 2-tuples. 
Each word is assumed to be a 129 | **tokenized and normalized** string (either unicode or utf8-encoded). No further preprocessing 130 | is done on the words in `document`; apply tokenization, stemming etc. before 131 | calling this method. 132 | 133 | If `allow_update` is set, then also update dictionary in the process: create 134 | ids for new words. At the same time, update document frequencies -- for 135 | each word appearing in this document, increase its document frequency (`self.dfs`) 136 | by one. 137 | 138 | If `allow_update` is **not** set, this function is `const`, aka read-only. 139 | """ 140 | if isinstance(document, string_types): 141 | raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string") 142 | 143 | # Construct (word, frequency) mapping. 144 | counter = defaultdict(int) 145 | for w in document: 146 | counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1 147 | 148 | token2id = self.token2id 149 | if allow_update or return_missing: 150 | missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id) 151 | if allow_update: 152 | for w in missing: 153 | # new id = number of ids made so far; 154 | # NOTE this assumes there are no gaps in the id sequence! 155 | token2id[w] = len(token2id) 156 | 157 | result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id) 158 | 159 | if allow_update: 160 | self.num_docs += 1 161 | self.num_pos += sum(itervalues(counter)) 162 | self.num_nnz += len(result) 163 | # increase document count for each unique token that appeared in the document 164 | dfs = self.dfs 165 | for tokenid in iterkeys(result): 166 | dfs[tokenid] = dfs.get(tokenid, 0) + 1 167 | 168 | # return tokenids, in ascending id order 169 | result = sorted(iteritems(result)) 170 | if return_missing: 171 | return result, missing 172 | else: 173 | return result 174 | 175 | def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): 176 | """ 177 | Filter out tokens that appear in 178 | 179 | 1. less than `no_below` documents (absolute number) or 180 | 2. more than `no_above` documents (fraction of total corpus size, *not* 181 | absolute number). 182 | 3. if tokens are given in keep_tokens (list of strings), they will be kept regardless of 183 | the `no_below` and `no_above` settings 184 | 4. after (1), (2) and (3), keep only the first `keep_n` most frequent tokens (or 185 | keep all if `None`). 186 | 187 | After the pruning, shrink resulting gaps in word ids. 188 | 189 | **Note**: Due to the gap shrinking, the same word may have a different 190 | word id before and after the call to this function! 
191 | """ 192 | no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold 193 | 194 | # determine which tokens to keep 195 | if keep_tokens: 196 | keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id] 197 | good_ids = ( 198 | v for v in itervalues(self.token2id) 199 | if no_below <= self.dfs.get(v, 0) <= no_above_abs 200 | or v in keep_ids 201 | ) 202 | else: 203 | good_ids = ( 204 | v for v in itervalues(self.token2id) 205 | if no_below <= self.dfs.get(v, 0) <= no_above_abs) 206 | good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) 207 | if keep_n is not None: 208 | good_ids = good_ids[:keep_n] 209 | bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)] 210 | logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10]) 211 | logger.info( 212 | "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents", 213 | len(good_ids), no_below, no_above_abs, 100.0 * no_above) 214 | 215 | # do the actual filtering, then rebuild dictionary to remove gaps in ids 216 | self.filter_tokens(good_ids=good_ids) 217 | logger.info("resulting dictionary: %s", self) 218 | 219 | def filter_n_most_frequent(self, remove_n): 220 | """ 221 | Filter out the 'remove_n' most frequent tokens that appear in the documents. 222 | 223 | After the pruning, shrink resulting gaps in word ids. 224 | 225 | **Note**: Due to the gap shrinking, the same word may have a different 226 | word id before and after the call to this function! 227 | """ 228 | # determine which tokens to keep 229 | most_frequent_ids = (v for v in itervalues(self.token2id)) 230 | most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True) 231 | most_frequent_ids = most_frequent_ids[:remove_n] 232 | # do the actual filtering, then rebuild dictionary to remove gaps in ids 233 | most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids] 234 | logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10]) 235 | 236 | self.filter_tokens(bad_ids=most_frequent_ids) 237 | logger.info("resulting dictionary: %s" % self) 238 | 239 | def filter_tokens(self, bad_ids=None, good_ids=None): 240 | """ 241 | Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep 242 | selected `good_ids` in the mapping and remove the rest. 243 | 244 | `bad_ids` and `good_ids` are collections of word ids to be removed. 245 | """ 246 | if bad_ids is not None: 247 | bad_ids = set(bad_ids) 248 | self.token2id = dict((token, tokenid) 249 | for token, tokenid in iteritems(self.token2id) 250 | if tokenid not in bad_ids) 251 | self.dfs = dict((tokenid, freq) 252 | for tokenid, freq in iteritems(self.dfs) 253 | if tokenid not in bad_ids) 254 | if good_ids is not None: 255 | good_ids = set(good_ids) 256 | self.token2id = dict((token, tokenid) 257 | for token, tokenid in iteritems(self.token2id) 258 | if tokenid in good_ids) 259 | self.dfs = dict((tokenid, freq) 260 | for tokenid, freq in iteritems(self.dfs) 261 | if tokenid in good_ids) 262 | self.compactify() 263 | 264 | def compactify(self): 265 | """ 266 | Assign new word ids to all words. 267 | 268 | This is done to make the ids more compact, e.g. after some tokens have 269 | been removed via :func:`filter_tokens` and there are gaps in the id series. 270 | Calling this method will remove the gaps. 
271 | """ 272 | logger.debug("rebuilding dictionary, shrinking gaps") 273 | 274 | # build mapping from old id -> new id 275 | idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id)))) 276 | 277 | # reassign mappings to new ids 278 | self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id)) 279 | self.id2token = {} 280 | self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs)) 281 | 282 | def save_as_text(self, fname, sort_by_word=True): 283 | """ 284 | Save this Dictionary to a text file, in format: 285 | `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word, 286 | or by decreasing word frequency. 287 | 288 | Note: text format should be use for corpus inspection. Use `save`/`load` 289 | to store in binary format (pickle) for improved performance. 290 | """ 291 | logger.info("saving dictionary mapping to %s", fname) 292 | with utils.smart_open(fname, 'wb') as fout: 293 | if sort_by_word: 294 | for token, tokenid in sorted(iteritems(self.token2id)): 295 | line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0)) 296 | fout.write(utils.to_utf8(line)) 297 | else: 298 | for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]): 299 | line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq) 300 | fout.write(utils.to_utf8(line)) 301 | 302 | def merge_with(self, other): 303 | """ 304 | Merge another dictionary into this dictionary, mapping same tokens to the 305 | same ids and new tokens to new ids. The purpose is to merge two corpora 306 | created using two different dictionaries, one from `self` and one from `other`. 307 | 308 | `other` can be any id=>word mapping (a dict, a Dictionary object, ...). 309 | 310 | Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, 311 | will convert documents from a corpus built using the `other` dictionary 312 | into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`). 313 | 314 | Example: 315 | 316 | >>> dict1 = Dictionary(some_documents) 317 | >>> dict2 = Dictionary(other_documents) # ids not compatible with dict1! 318 | >>> dict2_to_dict1 = dict1.merge_with(dict2) 319 | >>> # now we can merge corpora from the two incompatible dictionaries into one 320 | >>> merged_corpus = itertools.chain(some_corpus_from_dict1, dict2_to_dict1[some_corpus_from_dict2]) 321 | 322 | """ 323 | old2new = {} 324 | for other_id, other_token in iteritems(other): 325 | if other_token in self.token2id: 326 | new_id = self.token2id[other_token] 327 | else: 328 | new_id = len(self.token2id) 329 | self.token2id[other_token] = new_id 330 | self.dfs[new_id] = 0 331 | old2new[other_id] = new_id 332 | try: 333 | self.dfs[new_id] += other.dfs[other_id] 334 | except: 335 | # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going 336 | pass 337 | try: 338 | self.num_docs += other.num_docs 339 | self.num_nnz += other.num_nnz 340 | self.num_pos += other.num_pos 341 | except: 342 | pass 343 | 344 | import gensim.models 345 | return gensim.models.VocabTransform(old2new) 346 | 347 | @staticmethod 348 | def load_from_text(fname): 349 | """ 350 | Load a previously stored Dictionary from a text file. 351 | Mirror function to `save_as_text`. 
352 | """ 353 | result = Dictionary() 354 | with utils.smart_open(fname) as f: 355 | for lineno, line in enumerate(f): 356 | line = utils.to_unicode(line) 357 | try: 358 | wordid, word, docfreq = line[:-1].split('\t') 359 | except Exception: 360 | raise ValueError("invalid line in dictionary file %s: %s" 361 | % (fname, line.strip())) 362 | wordid = int(wordid) 363 | if word in result.token2id: 364 | raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word])) 365 | result.token2id[word] = wordid 366 | result.dfs[wordid] = int(docfreq) 367 | return result 368 | 369 | @staticmethod 370 | def from_corpus(corpus, id2word=None): 371 | """ 372 | Create Dictionary from an existing corpus. This can be useful if you only 373 | have a term-document BOW matrix (represented by `corpus`), but not the 374 | original text corpus. 375 | 376 | This will scan the term-document count matrix for all word ids that 377 | appear in it, then construct and return Dictionary which maps each 378 | `word_id -> id2word[word_id]`. 379 | 380 | `id2word` is an optional dictionary that maps the `word_id` to a token. In 381 | case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` 382 | will be used. 383 | """ 384 | 385 | result = Dictionary() 386 | max_id = -1 387 | for docno, document in enumerate(corpus): 388 | if docno % 10000 == 0: 389 | logger.info("adding document #%i to %s", docno, result) 390 | result.num_docs += 1 391 | result.num_nnz += len(document) 392 | for wordid, word_freq in document: 393 | max_id = max(wordid, max_id) 394 | result.num_pos += word_freq 395 | result.dfs[wordid] = result.dfs.get(wordid, 0) + 1 396 | 397 | if id2word is None: 398 | # make sure length(result) == get_max_id(corpus) + 1 399 | result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1)) 400 | else: 401 | # id=>word mapping given: simply copy it 402 | result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word)) 403 | for id in itervalues(result.token2id): 404 | # make sure all token ids have a valid `dfs` entry 405 | result.dfs[id] = result.dfs.get(id, 0) 406 | 407 | logger.info( 408 | "built %s from %i documents (total %i corpus positions)", 409 | result, result.num_docs, result.num_pos) 410 | return result 411 | -------------------------------------------------------------------------------- /word2vec/keyedvectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2016 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | Word vector storage and similarity look-ups. Common model independent of the way the vectors are trained(Word2Vec, FastText, WordRank, VarEmbed etc) 9 | 10 | The word vectors are considered read-only in this class. 11 | 12 | Initialize the vectors by training e.g. 
Word2Vec:: 13 | 14 | >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) 15 | >>> word_vectors = model.wv 16 | 17 | Persist the word vectors to disk with:: 18 | 19 | >>> word_vectors.save(fname) 20 | >>> word_vectors = KeyedVectors.load(fname) 21 | 22 | The vectors can also be instantiated from an existing file on disk in the original Google's word2vec C format as a KeyedVectors instance:: 23 | 24 | >>> from gensim.models.keyedvectors import KeyedVectors 25 | >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format 26 | >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format 27 | 28 | You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them 29 | are already built-in:: 30 | 31 | >>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man']) 32 | [('queen', 0.50882536), ...] 33 | 34 | >>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) 35 | [('queen', 0.71382287), ...] 36 | 37 | >>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split()) 38 | 'cereal' 39 | 40 | >>> word_vectors.similarity('woman', 'man') 41 | 0.73723527 42 | 43 | Correlation with human opinion on word similarity:: 44 | 45 | >>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) 46 | 0.51, 0.62, 0.13 47 | 48 | And on analogies:: 49 | 50 | >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) 51 | 52 | and so on. 53 | 54 | """ 55 | from __future__ import division # py3 "true division" 56 | 57 | import logging 58 | 59 | try: 60 | from queue import Queue, Empty 61 | except ImportError: 62 | from Queue import Queue, Empty 63 | 64 | # If pyemd C extension is available, import it. 65 | # If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance 66 | try: 67 | from pyemd import emd 68 | PYEMD_EXT = True 69 | except ImportError: 70 | PYEMD_EXT = False 71 | 72 | from numpy import dot, zeros, dtype, float32 as REAL,\ 73 | double, array, vstack, fromstring, sqrt, newaxis,\ 74 | ndarray, sum as np_sum, prod, ascontiguousarray 75 | 76 | import utils, matutils # utility fnc for pickling, common scipy operations etc 77 | from dictionary import Dictionary 78 | from six import string_types, iteritems 79 | from six.moves import xrange 80 | from scipy import stats 81 | 82 | 83 | logger = logging.getLogger(__name__) 84 | 85 | 86 | class Vocab(object): 87 | """ 88 | A single vocabulary item, used internally for collecting per-word frequency/sampling info, 89 | and for constructing binary trees (incl. both word leaves and inner nodes). 
90 | 91 | """ 92 | def __init__(self, **kwargs): 93 | self.count = 0 94 | self.__dict__.update(kwargs) 95 | 96 | def __lt__(self, other): # used for sorting in a priority queue 97 | return self.count < other.count 98 | 99 | def __str__(self): 100 | vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] 101 | return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) 102 | 103 | 104 | class KeyedVectors(utils.SaveLoad): 105 | """ 106 | Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly 107 | involved in training such as most_similar() 108 | """ 109 | def __init__(self): 110 | self.syn0 = [] 111 | self.syn0norm = None 112 | self.vocab = {} 113 | self.index2word = [] 114 | self.vector_size = None 115 | 116 | def save(self, *args, **kwargs): 117 | # don't bother storing the cached normalized vectors 118 | kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) 119 | super(KeyedVectors, self).save(*args, **kwargs) 120 | 121 | def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): 122 | """ 123 | Store the input-hidden weight matrix in the same format used by the original 124 | C word2vec-tool, for compatibility. 125 | 126 | `fname` is the file used to save the vectors in 127 | `fvocab` is an optional file used to save the vocabulary 128 | `binary` is an optional boolean indicating whether the data is to be saved 129 | in binary word2vec format (default: False) 130 | `total_vec` is an optional parameter to explicitly specify total no. of vectors 131 | (in case word vectors are appended with document vectors afterwards) 132 | 133 | """ 134 | if total_vec is None: 135 | total_vec = len(self.vocab) 136 | vector_size = self.syn0.shape[1] 137 | if fvocab is not None: 138 | logger.info("storing vocabulary in %s" % (fvocab)) 139 | with utils.smart_open(fvocab, 'wb') as vout: 140 | for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): 141 | vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) 142 | logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname)) 143 | assert (len(self.vocab), vector_size) == self.syn0.shape 144 | with utils.smart_open(fname, 'wb') as fout: 145 | fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) 146 | # store in sorted order: most frequent words at the top 147 | for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): 148 | row = self.syn0[vocab.index] 149 | if binary: 150 | fout.write(utils.to_utf8(word) + b" " + row.tostring()) 151 | else: 152 | fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row)))) 153 | 154 | 155 | @classmethod 156 | def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', 157 | limit=None, datatype=REAL): 158 | """ 159 | Load the input-hidden weight matrix from the original C word2vec-tool format. 160 | 161 | Note that the information stored in the file is incomplete (the binary tree is missing), 162 | so while you can query for word similarity etc., you cannot continue training 163 | with a model loaded this way. 164 | 165 | `binary` is a boolean indicating whether the data is in binary word2vec format. 166 | `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory. 167 | Word counts are read from `fvocab` filename, if set (this is the file generated 168 | by `-save-vocab` flag of the original C tool). 
169 | 170 | If you trained the C model using non-utf8 encoding for words, specify that 171 | encoding in `encoding`. 172 | 173 | `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors` 174 | argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source 175 | file may include word tokens truncated in the middle of a multibyte unicode character 176 | (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help. 177 | 178 | `limit` sets a maximum number of word-vectors to read from the file. The default, 179 | None, means read all. 180 | 181 | `datatype` (experimental) can coerce dimensions to a non-default float type (such 182 | as np.float16) to save memory. (Such types may result in much slower bulk operations 183 | or incompatibility with optimized routines.) 184 | 185 | """ 186 | counts = None 187 | if fvocab is not None: 188 | logger.info("loading word counts from %s", fvocab) 189 | counts = {} 190 | with utils.smart_open(fvocab) as fin: 191 | for line in fin: 192 | word, count = utils.to_unicode(line).strip().split() 193 | counts[word] = int(count) 194 | 195 | logger.info("loading projection weights from %s", fname) 196 | with utils.smart_open(fname) as fin: 197 | header = utils.to_unicode(fin.readline(), encoding=encoding) 198 | vocab_size, vector_size = map(int, header.split()) # throws for invalid file format 199 | if limit: 200 | vocab_size = min(vocab_size, limit) 201 | result = cls() 202 | result.vector_size = vector_size 203 | result.syn0 = zeros((vocab_size, vector_size), dtype=datatype) 204 | 205 | def add_word(word, weights): 206 | word_id = len(result.vocab) 207 | if word in result.vocab: 208 | logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname) 209 | return 210 | if counts is None: 211 | # most common scenario: no vocab file given. just make up some bogus counts, in descending order 212 | result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id) 213 | elif word in counts: 214 | # use count from the vocab file 215 | result.vocab[word] = Vocab(index=word_id, count=counts[word]) 216 | else: 217 | # vocab file given, but word is missing -- set count to None (TODO: or raise?) 
218 | logger.warning("vocabulary file is incomplete: '%s' is missing", word) 219 | result.vocab[word] = Vocab(index=word_id, count=None) 220 | result.syn0[word_id] = weights 221 | result.index2word.append(word) 222 | 223 | if binary: 224 | binary_len = dtype(REAL).itemsize * vector_size 225 | for line_no in xrange(vocab_size): 226 | # mixed text and binary: read text first, then binary 227 | word = [] 228 | while True: 229 | ch = fin.read(1) 230 | if ch == b' ': 231 | break 232 | if ch == b'': 233 | raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") 234 | if ch != b'\n': # ignore newlines in front of words (some binary files have) 235 | word.append(ch) 236 | word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) 237 | weights = fromstring(fin.read(binary_len), dtype=REAL) 238 | add_word(word, weights) 239 | else: 240 | for line_no in xrange(vocab_size): 241 | line = fin.readline() 242 | if line == b'': 243 | raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") 244 | parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") 245 | if len(parts) != vector_size + 1: 246 | raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no)) 247 | word, weights = parts[0], list(map(REAL, parts[1:])) 248 | add_word(word, weights) 249 | if result.syn0.shape[0] != len(result.vocab): 250 | logger.info( 251 | "duplicate words detected, shrinking matrix size from %i to %i", 252 | result.syn0.shape[0], len(result.vocab) 253 | ) 254 | result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)]) 255 | assert (len(result.vocab), vector_size) == result.syn0.shape 256 | 257 | logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname)) 258 | return result 259 | 260 | def word_vec(self, word, use_norm=False): 261 | """ 262 | Accept a single word as input. 263 | Returns the word's representations in vector space, as a 1D numpy array. 264 | 265 | If `use_norm` is True, returns the normalized word vector. 266 | 267 | Example:: 268 | 269 | >>> trained_model['office'] 270 | array([ -1.40128313e-02, ...]) 271 | 272 | """ 273 | if word in self.vocab: 274 | if use_norm: 275 | return self.syn0norm[self.vocab[word].index] 276 | else: 277 | return self.syn0[self.vocab[word].index] 278 | else: 279 | raise KeyError("word '%s' not in vocabulary" % word) 280 | 281 | def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None): 282 | """ 283 | Find the top-N most similar words. Positive words contribute positively towards the 284 | similarity, negative words negatively. 285 | 286 | This method computes cosine similarity between a simple mean of the projection 287 | weight vectors of the given words and the vectors for each word in the model. 288 | The method corresponds to the `word-analogy` and `distance` scripts in the original 289 | word2vec implementation. 290 | 291 | If topn is False, most_similar returns the vector of similarity scores. 292 | 293 | `restrict_vocab` is an optional integer which limits the range of vectors which 294 | are searched for most-similar values. For example, restrict_vocab=10000 would 295 | only check the first 10000 word vectors in the vocabulary order. (This may be 296 | meaningful if you've sorted the vocabulary by descending frequency.) 297 | 298 | Example:: 299 | 300 | >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) 301 | [('queen', 0.50882536), ...] 
302 | 303 | """ 304 | self.init_sims() 305 | 306 | if isinstance(positive, string_types) and not negative: 307 | # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) 308 | positive = [positive] 309 | 310 | # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words 311 | positive = [ 312 | (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word 313 | for word in positive 314 | ] 315 | negative = [ 316 | (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word 317 | for word in negative 318 | ] 319 | 320 | # compute the weighted average of all words 321 | all_words, mean = set(), [] 322 | for word, weight in positive + negative: 323 | if isinstance(word, ndarray): 324 | mean.append(weight * word) 325 | else: 326 | mean.append(weight * self.word_vec(word, use_norm=True)) 327 | if word in self.vocab: 328 | all_words.add(self.vocab[word].index) 329 | if not mean: 330 | raise ValueError("cannot compute similarity with no input") 331 | mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) 332 | 333 | if indexer is not None: 334 | return indexer.most_similar(mean, topn) 335 | 336 | limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] 337 | dists = dot(limited, mean) 338 | if not topn: 339 | return dists 340 | best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) 341 | # ignore (don't return) words from the input 342 | result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] 343 | return result[:topn] 344 | 345 | def wmdistance(self, document1, document2): 346 | """ 347 | Compute the Word Mover's Distance between two documents. When using this 348 | model, please consider citing the following papers: 349 | 350 | .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching". 351 | .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances". 352 | .. Matt Kusner et al. "From Word Embeddings To Document Distances". 353 | 354 | Note that if one of the documents have no words that exist in the 355 | Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned. 356 | 357 | This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler). 358 | 359 | Example: 360 | >>> # Train word2vec model. 361 | >>> model = Word2Vec(sentences) 362 | 363 | >>> # Some sentences to test. 364 | >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() 365 | >>> sentence_president = 'The president greets the press in Chicago'.lower().split() 366 | 367 | >>> # Remove their stopwords. 368 | >>> from nltk.corpus import stopwords 369 | >>> stopwords = nltk.corpus.stopwords.words('english') 370 | >>> sentence_obama = [w for w in sentence_obama if w not in stopwords] 371 | >>> sentence_president = [w for w in sentence_president if w not in stopwords] 372 | 373 | >>> # Compute WMD. 374 | >>> distance = model.wmdistance(sentence_obama, sentence_president) 375 | """ 376 | 377 | if not PYEMD_EXT: 378 | raise ImportError("Please install pyemd Python package to compute WMD.") 379 | 380 | # Remove out-of-vocabulary words. 
381 | len_pre_oov1 = len(document1) 382 | len_pre_oov2 = len(document2) 383 | document1 = [token for token in document1 if token in self] 384 | document2 = [token for token in document2 if token in self] 385 | diff1 = len_pre_oov1 - len(document1) 386 | diff2 = len_pre_oov2 - len(document2) 387 | if diff1 > 0 or diff2 > 0: 388 | logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', 389 | diff1, diff2) 390 | 391 | if len(document1) == 0 or len(document2) == 0: 392 | logger.info('At least one of the documents had no words that were' 393 | 'in the vocabulary. Aborting (returning inf).') 394 | return float('inf') 395 | 396 | dictionary = Dictionary(documents=[document1, document2]) 397 | vocab_len = len(dictionary) 398 | 399 | if vocab_len == 1: 400 | # Both documents are composed by a single unique token 401 | return 0.0 402 | 403 | # Sets for faster look-up. 404 | docset1 = set(document1) 405 | docset2 = set(document2) 406 | 407 | # Compute distance matrix. 408 | distance_matrix = zeros((vocab_len, vocab_len), dtype=double) 409 | for i, t1 in dictionary.items(): 410 | for j, t2 in dictionary.items(): 411 | if not t1 in docset1 or not t2 in docset2: 412 | continue 413 | # Compute Euclidean distance between word vectors. 414 | distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2)) 415 | 416 | if np_sum(distance_matrix) == 0.0: 417 | # `emd` gets stuck if the distance matrix contains only zeros. 418 | logger.info('The distance matrix is all zeros. Aborting (returning inf).') 419 | return float('inf') 420 | 421 | def nbow(document): 422 | d = zeros(vocab_len, dtype=double) 423 | nbow = dictionary.doc2bow(document) # Word frequencies. 424 | doc_len = len(document) 425 | for idx, freq in nbow: 426 | d[idx] = freq / float(doc_len) # Normalized word frequencies. 427 | return d 428 | 429 | # Compute nBOW representation of documents. 430 | d1 = nbow(document1) 431 | d2 = nbow(document2) 432 | 433 | # Compute WMD. 434 | return emd(d1, d2, distance_matrix) 435 | 436 | def most_similar_cosmul(self, positive=[], negative=[], topn=10): 437 | """ 438 | Find the top-N most similar words, using the multiplicative combination objective 439 | proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute 440 | positively towards the similarity, negative words negatively, but with less 441 | susceptibility to one large distance dominating the calculation. 442 | 443 | In the common analogy-solving case, of two positive and one negative examples, 444 | this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg. 445 | 446 | Additional positive or negative examples contribute to the numerator or denominator, 447 | respectively – a potentially sensible but untested extension of the method. (With 448 | a single positive example, rankings will be the same as in the default most_similar.) 449 | 450 | Example:: 451 | 452 | >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) 453 | [(u'iraq', 0.8488819003105164), ...] 454 | 455 | .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014. 
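        In terms of the computation below, every candidate word w is scored as

            prod((1 + cos(w, p)) / 2 for p in positive) / (prod((1 + cos(w, n)) / 2 for n in negative) + 0.000001)

        i.e. cosine similarities are shifted into [0, 1] and combined multiplicatively, with a small
        epsilon guarding against division by zero.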
456 | 457 | """ 458 | self.init_sims() 459 | 460 | if isinstance(positive, string_types) and not negative: 461 | # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) 462 | positive = [positive] 463 | 464 | all_words = set([self.vocab[word].index for word in positive+negative 465 | if not isinstance(word, ndarray) and word in self.vocab]) 466 | 467 | positive = [ 468 | self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word 469 | for word in positive 470 | ] 471 | negative = [ 472 | self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word 473 | for word in negative 474 | ] 475 | 476 | if not positive: 477 | raise ValueError("cannot compute similarity with no input") 478 | 479 | # equation (4) of Levy & Goldberg "Linguistic Regularities...", 480 | # with distances shifted to [0,1] per footnote (7) 481 | pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive] 482 | neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative] 483 | dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001) 484 | 485 | if not topn: 486 | return dists 487 | best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) 488 | # ignore (don't return) words from the input 489 | result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] 490 | return result[:topn] 491 | 492 | def similar_by_word(self, word, topn=10, restrict_vocab=None): 493 | """ 494 | Find the top-N most similar words. 495 | 496 | If topn is False, similar_by_word returns the vector of similarity scores. 497 | 498 | `restrict_vocab` is an optional integer which limits the range of vectors which 499 | are searched for most-similar values. For example, restrict_vocab=10000 would 500 | only check the first 10000 word vectors in the vocabulary order. (This may be 501 | meaningful if you've sorted the vocabulary by descending frequency.) 502 | 503 | Example:: 504 | 505 | >>> trained_model.similar_by_word('graph') 506 | [('user', 0.9999163150787354), ...] 507 | 508 | """ 509 | 510 | return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab) 511 | 512 | def similar_by_vector(self, vector, topn=10, restrict_vocab=None): 513 | """ 514 | Find the top-N most similar words by vector. 515 | 516 | If topn is False, similar_by_vector returns the vector of similarity scores. 517 | 518 | `restrict_vocab` is an optional integer which limits the range of vectors which 519 | are searched for most-similar values. For example, restrict_vocab=10000 would 520 | only check the first 10000 word vectors in the vocabulary order. (This may be 521 | meaningful if you've sorted the vocabulary by descending frequency.) 522 | 523 | Example:: 524 | 525 | >>> trained_model.similar_by_vector([1,2]) 526 | [('survey', 0.9942699074745178), ...] 527 | 528 | """ 529 | 530 | return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) 531 | 532 | def doesnt_match(self, words): 533 | """ 534 | Which word from the given list doesn't go with the others? 
535 | 536 | Example:: 537 | 538 | >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split()) 539 | 'cereal' 540 | 541 | """ 542 | self.init_sims() 543 | 544 | used_words = [word for word in words if word in self] 545 | if len(used_words) != len(words): 546 | ignored_words = set(words) - set(used_words) 547 | logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words) 548 | if not used_words: 549 | raise ValueError("cannot select a word from an empty list") 550 | vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) 551 | mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) 552 | dists = dot(vectors, mean) 553 | return sorted(zip(dists, used_words))[0][1] 554 | 555 | def __getitem__(self, words): 556 | 557 | """ 558 | Accept a single word or a list of words as input. 559 | 560 | If a single word: returns the word's representations in vector space, as 561 | a 1D numpy array. 562 | 563 | Multiple words: return the words' representations in vector space, as a 564 | 2d numpy array: #words x #vector_size. Matrix rows are in the same order 565 | as in input. 566 | 567 | Example:: 568 | 569 | >>> trained_model['office'] 570 | array([ -1.40128313e-02, ...]) 571 | 572 | >>> trained_model[['office', 'products']] 573 | array([ -1.40128313e-02, ...] 574 | [ -1.70425311e-03, ...] 575 | ...) 576 | 577 | """ 578 | if isinstance(words, string_types): 579 | # allow calls like trained_model['office'], as a shorthand for trained_model[['office']] 580 | return self.word_vec(words) 581 | 582 | return vstack([self.word_vec(word) for word in words]) 583 | 584 | def __contains__(self, word): 585 | return word in self.vocab 586 | 587 | def similarity(self, w1, w2): 588 | """ 589 | Compute cosine similarity between two words. 590 | 591 | Example:: 592 | 593 | >>> trained_model.similarity('woman', 'man') 594 | 0.73723527 595 | 596 | >>> trained_model.similarity('woman', 'woman') 597 | 1.0 598 | 599 | """ 600 | return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2])) 601 | 602 | def n_similarity(self, ws1, ws2): 603 | """ 604 | Compute cosine similarity between two sets of words. 605 | 606 | Example:: 607 | 608 | >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']) 609 | 0.61540466561049689 610 | 611 | >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant']) 612 | 1.0000000000000004 613 | 614 | >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant') 615 | True 616 | 617 | """ 618 | if not(len(ws1) and len(ws2)): 619 | raise ZeroDivisionError('Atleast one of the passed list is empty.') 620 | v1 = [self[word] for word in ws1] 621 | v2 = [self[word] for word in ws2] 622 | return dot(matutils.unitvec(array(v1).mean(axis=0)), 623 | matutils.unitvec(array(v2).mean(axis=0))) 624 | 625 | @staticmethod 626 | def log_accuracy(section): 627 | correct, incorrect = len(section['correct']), len(section['incorrect']) 628 | if correct + incorrect > 0: 629 | logger.info("%s: %.1f%% (%i/%i)" % 630 | (section['section'], 100.0 * correct / (correct + incorrect), 631 | correct, correct + incorrect)) 632 | 633 | def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): 634 | """ 635 | Compute accuracy of the model. `questions` is a filename where lines are 636 | 4-tuples of words, split into sections by ": SECTION NAME" lines. 
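        An illustrative excerpt of such a file (section header followed by analogy 4-tuples):

            : capital-common-countries
            Athens Greece Baghdad Iraq
            Athens Greece Berlin Germany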
637 | See questions-words.txt in https://storage.googleapis.com/google-model-archive-source/v2/model.google.com/word2vec/source-archive.zip for an example. 638 | 639 | The accuracy is reported (=printed to log and returned as a list) for each 640 | section separately, plus there's one aggregate summary at the end. 641 | 642 | Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab` 643 | words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency. 644 | In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then 645 | case normalization is performed. 646 | 647 | Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before 648 | evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens 649 | and question words. In case of multiple case variants of a single word, the vector for the first 650 | occurrence (also the most frequent if vocabulary is sorted) is taken. 651 | 652 | This method corresponds to the `compute-accuracy` script of the original C word2vec. 653 | 654 | """ 655 | ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] 656 | ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab) 657 | 658 | sections, section = [], None 659 | for line_no, line in enumerate(utils.smart_open(questions)): 660 | # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed 661 | line = utils.to_unicode(line) 662 | if line.startswith(': '): 663 | # a new section starts => store the old section 664 | if section: 665 | sections.append(section) 666 | self.log_accuracy(section) 667 | section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} 668 | else: 669 | if not section: 670 | raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) 671 | try: 672 | if case_insensitive: 673 | a, b, c, expected = [word.upper() for word in line.split()] 674 | else: 675 | a, b, c, expected = [word for word in line.split()] 676 | except: 677 | logger.info("skipping invalid line #%i in %s" % (line_no, questions)) 678 | continue 679 | if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: 680 | logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip())) 681 | continue 682 | 683 | original_vocab = self.vocab 684 | self.vocab = ok_vocab 685 | ignore = set([a, b, c]) # input words to be ignored 686 | predicted = None 687 | # find the most likely prediction, ignoring OOV words and input words 688 | sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) 689 | self.vocab = original_vocab 690 | for index in matutils.argsort(sims, reverse=True): 691 | predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] 692 | if predicted in ok_vocab and predicted not in ignore: 693 | if predicted != expected: 694 | logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) 695 | break 696 | if predicted == expected: 697 | section['correct'].append((a, b, c, expected)) 698 | else: 699 | section['incorrect'].append((a, b, c, expected)) 700 | if section: 701 | # store the last section, too 702 | sections.append(section) 703 | self.log_accuracy(section) 704 | 705 | total = { 706 | 'section': 'total', 707 | 'correct': sum((s['correct'] for s in sections), []), 708 | 
'incorrect': sum((s['incorrect'] for s in sections), []), 709 | } 710 | self.log_accuracy(total) 711 | sections.append(total) 712 | return sections 713 | 714 | @staticmethod 715 | def log_evaluate_word_pairs(pearson, spearman, oov, pairs): 716 | logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0]) 717 | logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0]) 718 | logger.info('Pairs with unknown words ratio: %.1f%%', oov) 719 | 720 | def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True, 721 | dummy4unknown=False): 722 | """ 723 | Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where 724 | lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter'. 725 | An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at 726 | http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html. 727 | 728 | The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient 729 | between the similarities from the dataset and the similarities produced by the model itself. 730 | The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words). 731 | 732 | Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab` 733 | words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency. 734 | If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization 735 | is performed. 736 | 737 | Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before 738 | evaluating the model (default True). Useful when you expect case-mismatch between training tokens 739 | and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first 740 | occurrence (also the most frequent if vocabulary is sorted) is taken. 741 | 742 | Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words. 743 | Otherwise (default False), these pairs are skipped entirely. 
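        Illustrative sketch (the path is an assumption; the returned statistics depend on the model)::

            >>> pearson, spearman, oov_ratio = model.evaluate_word_pairs('test/test_data/wordsim353.tsv')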
744 | """ 745 | ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] 746 | ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab) 747 | 748 | similarity_gold = [] 749 | similarity_model = [] 750 | oov = 0 751 | 752 | original_vocab = self.vocab 753 | self.vocab = ok_vocab 754 | 755 | for line_no, line in enumerate(utils.smart_open(pairs)): 756 | line = utils.to_unicode(line) 757 | if line.startswith('#'): 758 | # May be a comment 759 | continue 760 | else: 761 | try: 762 | if case_insensitive: 763 | a, b, sim = [word.upper() for word in line.split(delimiter)] 764 | else: 765 | a, b, sim = [word for word in line.split(delimiter)] 766 | sim = float(sim) 767 | except: 768 | logger.info('skipping invalid line #%d in %s', line_no, pairs) 769 | continue 770 | if a not in ok_vocab or b not in ok_vocab: 771 | oov += 1 772 | if dummy4unknown: 773 | similarity_model.append(0.0) 774 | similarity_gold.append(sim) 775 | continue 776 | else: 777 | logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip()) 778 | continue 779 | similarity_gold.append(sim) # Similarity from the dataset 780 | similarity_model.append(self.similarity(a, b)) # Similarity from the model 781 | self.vocab = original_vocab 782 | spearman = stats.spearmanr(similarity_gold, similarity_model) 783 | pearson = stats.pearsonr(similarity_gold, similarity_model) 784 | oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100 785 | 786 | logger.debug( 787 | 'Pearson correlation coefficient against %s: %f with p-value %f', 788 | pairs, pearson[0], pearson[1] 789 | ) 790 | logger.debug( 791 | 'Spearman rank-order correlation coefficient against %s: %f with p-value %f', 792 | pairs, spearman[0], spearman[1] 793 | ) 794 | logger.debug('Pairs with unknown words: %d' % oov) 795 | self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) 796 | return pearson, spearman, oov_ratio 797 | 798 | 799 | def init_sims(self, replace=False): 800 | """ 801 | Precompute L2-normalized vectors. 802 | 803 | If `replace` is set, forget the original vectors and only keep the normalized 804 | ones = saves lots of memory! 805 | 806 | Note that you **cannot continue training** after doing a replace. The model becomes 807 | effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`. 808 | 809 | """ 810 | if getattr(self, 'syn0norm', None) is None or replace: 811 | logger.info("precomputing L2-norms of word weight vectors") 812 | if replace: 813 | for i in xrange(self.syn0.shape[0]): 814 | self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1)) 815 | self.syn0norm = self.syn0 816 | else: 817 | self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL) 818 | -------------------------------------------------------------------------------- /word2vec/matutils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2011 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | This module contains math helper functions. 
9 | """ 10 | 11 | from __future__ import with_statement 12 | 13 | import logging 14 | import math 15 | 16 | import numpy as np 17 | import scipy.linalg 18 | import scipy.sparse 19 | from scipy.linalg.lapack import get_lapack_funcs 20 | from scipy.special import psi # gamma function utils 21 | from scipy.stats import entropy 22 | from six import iteritems, itervalues, string_types 23 | from six.moves import xrange, zip as izip 24 | 25 | import utils 26 | 27 | # scipy is not a stable package yet, locations change, so try to work 28 | # around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8) 29 | try: 30 | from scipy.linalg.basic import triu 31 | except ImportError: 32 | from scipy.linalg.special_matrices import triu 33 | 34 | try: 35 | from np import triu_indices 36 | except ImportError: 37 | # np < 1.4 38 | def triu_indices(n, k=0): 39 | m = np.ones((n, n), int) 40 | a = triu(m, k) 41 | return np.where(a != 0) 42 | 43 | blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | 48 | def argsort(x, topn=None, reverse=False): 49 | """ 50 | Return indices of the `topn` smallest elements in array `x`, in ascending order. 51 | 52 | If reverse is True, return the greatest elements instead, in descending order. 53 | 54 | """ 55 | x = np.asarray(x) # unify model path for when `x` is not a np array (list, tuple...) 56 | if topn is None: 57 | topn = x.size 58 | if topn <= 0: 59 | return [] 60 | if reverse: 61 | x = -x 62 | if topn >= x.size or not hasattr(np, 'argpartition'): 63 | return np.argsort(x)[:topn] 64 | # np >= 1.8 has a fast partial argsort, use that! 65 | most_extreme = np.argpartition(x, topn)[:topn] 66 | return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order 67 | 68 | 69 | def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0): 70 | """ 71 | Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format, 72 | with documents as columns. 73 | 74 | If the number of terms, documents and non-zero elements is known, you can pass 75 | them here as parameters and a more memory efficient model path will be taken. 76 | 77 | The input corpus may be a non-repeatable stream (generator). 78 | 79 | This is the mirror function to `Sparse2Corpus`. 80 | 81 | """ 82 | try: 83 | # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes 84 | # (as is the case with MmCorpus for example), we can use a more efficient model path 85 | if num_terms is None: 86 | num_terms = corpus.num_terms 87 | if num_docs is None: 88 | num_docs = corpus.num_docs 89 | if num_nnz is None: 90 | num_nnz = corpus.num_nnz 91 | except AttributeError: 92 | pass # not a MmCorpus... 
93 | if printprogress: 94 | logger.info("creating sparse matrix from corpus") 95 | if num_terms is not None and num_docs is not None and num_nnz is not None: 96 | # faster and much more memory-friendly version of creating the sparse csc 97 | posnow, indptr = 0, [0] 98 | indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer 99 | data = np.empty((num_nnz,), dtype=dtype) 100 | for docno, doc in enumerate(corpus): 101 | if printprogress and docno % printprogress == 0: 102 | logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs)) 103 | posnext = posnow + len(doc) 104 | indices[posnow: posnext] = [feature_id for feature_id, _ in doc] 105 | data[posnow: posnext] = [feature_weight for _, feature_weight in doc] 106 | indptr.append(posnext) 107 | posnow = posnext 108 | assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros" 109 | result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) 110 | else: 111 | # slower version; determine the sparse matrix parameters during iteration 112 | num_nnz, data, indices, indptr = 0, [], [], [0] 113 | for docno, doc in enumerate(corpus): 114 | if printprogress and docno % printprogress == 0: 115 | logger.info("PROGRESS: at document #%i" % (docno)) 116 | indices.extend([feature_id for feature_id, _ in doc]) 117 | data.extend([feature_weight for _, feature_weight in doc]) 118 | num_nnz += len(doc) 119 | indptr.append(num_nnz) 120 | if num_terms is None: 121 | num_terms = max(indices) + 1 if indices else 0 122 | num_docs = len(indptr) - 1 123 | # now num_docs, num_terms and num_nnz contain the correct values 124 | data = np.asarray(data, dtype=dtype) 125 | indices = np.asarray(indices) 126 | result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) 127 | return result 128 | 129 | 130 | def pad(mat, padrow, padcol): 131 | """ 132 | Add additional rows/columns to a np.matrix `mat`. The new rows/columns 133 | will be initialized with zeros. 
134 | """ 135 | if padrow < 0: 136 | padrow = 0 137 | if padcol < 0: 138 | padcol = 0 139 | rows, cols = mat.shape 140 | return np.bmat([ 141 | [mat, np.matrix(np.zeros((rows, padcol)))], 142 | [np.matrix(np.zeros((padrow, cols + padcol)))], 143 | ]) 144 | 145 | 146 | def zeros_aligned(shape, dtype, order='C', align=128): 147 | """Like `np.zeros()`, but the array will be aligned at `align` byte boundary.""" 148 | nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize 149 | buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded") 150 | start_index = -buffer.ctypes.data % align 151 | return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order) 152 | 153 | 154 | def ismatrix(m): 155 | return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) 156 | 157 | 158 | def any2sparse(vec, eps=1e-9): 159 | """Convert a np/scipy vector into gensim document format (=list of 2-tuples).""" 160 | if isinstance(vec, np.ndarray): 161 | return dense2vec(vec, eps) 162 | if scipy.sparse.issparse(vec): 163 | return scipy2sparse(vec, eps) 164 | return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps] 165 | 166 | 167 | def scipy2sparse(vec, eps=1e-9): 168 | """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).""" 169 | vec = vec.tocsr() 170 | assert vec.shape[0] == 1 171 | return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if np.abs(val) > eps] 172 | 173 | 174 | class Scipy2Corpus(object): 175 | """ 176 | Convert a sequence of dense/sparse vectors into a streamed gensim corpus object. 177 | 178 | This is the mirror function to `corpus2csc`. 179 | 180 | """ 181 | def __init__(self, vecs): 182 | """ 183 | `vecs` is a sequence of dense and/or sparse vectors, such as a 2d np array, 184 | or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d np/scipy vectors. 185 | 186 | """ 187 | self.vecs = vecs 188 | 189 | def __iter__(self): 190 | for vec in self.vecs: 191 | if isinstance(vec, np.ndarray): 192 | yield full2sparse(vec) 193 | else: 194 | yield scipy2sparse(vec) 195 | 196 | def __len__(self): 197 | return len(self.vecs) 198 | 199 | 200 | def sparse2full(doc, length): 201 | """ 202 | Convert a document in sparse document format (=sequence of 2-tuples) into a dense 203 | np array (of size `length`). 204 | 205 | This is the mirror function to `full2sparse`. 206 | 207 | """ 208 | result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value) 209 | # convert indices to int as numpy 1.12 no longer indexes by floats 210 | doc = ((int(id_), float(val_)) for (id_, val_) in doc) 211 | 212 | doc = dict(doc) 213 | # overwrite some of the zeroes with explicit values 214 | result[list(doc)] = list(itervalues(doc)) 215 | return result 216 | 217 | 218 | def full2sparse(vec, eps=1e-9): 219 | """ 220 | Convert a dense np array into the sparse document format (sequence of 2-tuples). 221 | 222 | Values of magnitude < `eps` are treated as zero (ignored). 223 | 224 | This is the mirror function to `sparse2full`. 225 | 226 | """ 227 | vec = np.asarray(vec, dtype=float) 228 | nnz = np.nonzero(abs(vec) > eps)[0] 229 | return list(zip(nnz, vec.take(nnz))) 230 | 231 | dense2vec = full2sparse 232 | 233 | 234 | def full2sparse_clipped(vec, topn, eps=1e-9): 235 | """ 236 | Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs). 
237 | 238 | """ 239 | # use np.argpartition/argsort and only form tuples that are actually returned. 240 | # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on. 241 | if topn <= 0: 242 | return [] 243 | vec = np.asarray(vec, dtype=float) 244 | nnz = np.nonzero(abs(vec) > eps)[0] 245 | biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True)) 246 | return list(zip(biggest, vec.take(biggest))) 247 | 248 | 249 | def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32): 250 | """ 251 | Convert corpus into a dense np array (documents will be columns). You 252 | must supply the number of features `num_terms`, because dimensionality 253 | cannot be deduced from the sparse vectors alone. 254 | 255 | You can optionally supply `num_docs` (=the corpus length) as well, so that 256 | a more memory-efficient model path is taken. 257 | 258 | This is the mirror function to `Dense2Corpus`. 259 | 260 | """ 261 | if num_docs is not None: 262 | # we know the number of documents => don't bother column_stacking 263 | docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype) 264 | for docno, doc in enumerate(corpus): 265 | result[:, docno] = sparse2full(doc, num_terms) 266 | assert docno + 1 == num_docs 267 | else: 268 | result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus) 269 | return result.astype(dtype) 270 | 271 | 272 | class Dense2Corpus(object): 273 | """ 274 | Treat dense np array as a sparse, streamed gensim corpus. 275 | 276 | No data copy is made (changes to the underlying matrix imply changes in the 277 | corpus). 278 | 279 | This is the mirror function to `corpus2dense`. 280 | 281 | """ 282 | def __init__(self, dense, documents_columns=True): 283 | if documents_columns: 284 | self.dense = dense.T 285 | else: 286 | self.dense = dense 287 | 288 | def __iter__(self): 289 | for doc in self.dense: 290 | yield full2sparse(doc.flat) 291 | 292 | def __len__(self): 293 | return len(self.dense) 294 | #endclass DenseCorpus 295 | 296 | 297 | class Sparse2Corpus(object): 298 | """ 299 | Convert a matrix in scipy.sparse format into a streaming gensim corpus. 300 | 301 | This is the mirror function to `corpus2csc`. 
302 | 303 | """ 304 | def __init__(self, sparse, documents_columns=True): 305 | if documents_columns: 306 | self.sparse = sparse.tocsc() 307 | else: 308 | self.sparse = sparse.tocsr().T # make sure shape[1]=number of docs (needed in len()) 309 | 310 | def __iter__(self): 311 | for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]): 312 | yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])) 313 | 314 | def __len__(self): 315 | return self.sparse.shape[1] 316 | #endclass Sparse2Corpus 317 | 318 | 319 | def veclen(vec): 320 | if len(vec) == 0: 321 | return 0.0 322 | length = 1.0 * math.sqrt(sum(val**2 for _, val in vec)) 323 | assert length > 0.0, "sparse documents must not contain any explicit zero entries" 324 | return length 325 | 326 | 327 | def ret_normalized_vec(vec, length): 328 | if length != 1.0: 329 | return [(termid, val / length) for termid, val in vec] 330 | else: 331 | return list(vec) 332 | 333 | 334 | def ret_log_normalize_vec(vec, axis=1): 335 | log_max = 100.0 336 | if len(vec.shape) == 1: 337 | max_val = np.max(vec) 338 | log_shift = log_max - np.log(len(vec) + 1.0) - max_val 339 | tot = np.sum(np.exp(vec + log_shift)) 340 | log_norm = np.log(tot) - log_shift 341 | vec = vec - log_norm 342 | else: 343 | if axis == 1: # independently normalize each sample 344 | max_val = np.max(vec, 1) 345 | log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val 346 | tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1) 347 | log_norm = np.log(tot) - log_shift 348 | vec = vec - log_norm[:, np.newaxis] 349 | elif axis == 0: # normalize each feature 350 | k = ret_log_normalize_vec(vec.T) 351 | return (k[0].T, k[1]) 352 | else: 353 | raise ValueError("'%s' is not a supported axis" % axis) 354 | return (vec, log_norm) 355 | 356 | 357 | blas_nrm2 = blas('nrm2', np.array([], dtype=float)) 358 | blas_scal = blas('scal', np.array([], dtype=float)) 359 | 360 | 361 | def unitvec(vec, norm='l2'): 362 | """ 363 | Scale a vector to unit length. The only exception is the zero vector, which 364 | is returned back unchanged. 365 | 366 | Output will be in the same format as input (i.e., gensim vector=>gensim vector, 367 | or np array=>np array, scipy.sparse=>scipy.sparse). 368 | """ 369 | if norm not in ('l1', 'l2'): 370 | raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm) 371 | if scipy.sparse.issparse(vec): 372 | vec = vec.tocsr() 373 | if norm == 'l1': 374 | veclen = np.sum(np.abs(vec.data)) 375 | if norm == 'l2': 376 | veclen = np.sqrt(np.sum(vec.data ** 2)) 377 | if veclen > 0.0: 378 | return vec / veclen 379 | else: 380 | return vec 381 | 382 | if isinstance(vec, np.ndarray): 383 | vec = np.asarray(vec, dtype=float) 384 | if norm == 'l1': 385 | veclen = np.sum(np.abs(vec)) 386 | if norm == 'l2': 387 | veclen = blas_nrm2(vec) 388 | if veclen > 0.0: 389 | return blas_scal(1.0 / veclen, vec) 390 | else: 391 | return vec 392 | 393 | try: 394 | first = next(iter(vec)) # is there at least one element? 
395 | except: 396 | return vec 397 | 398 | if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format 399 | if norm == 'l1': 400 | length = float(sum(abs(val) for _, val in vec)) 401 | if norm == 'l2': 402 | length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec)) 403 | assert length > 0.0, "sparse documents must not contain any explicit zero entries" 404 | return ret_normalized_vec(vec, length) 405 | else: 406 | raise ValueError("unknown input type") 407 | 408 | 409 | def cossim(vec1, vec2): 410 | """ 411 | Return cosine similarity between two sparse vectors. 412 | The similarity is a number between <-1.0, 1.0>, higher is more similar. 413 | """ 414 | vec1, vec2 = dict(vec1), dict(vec2) 415 | if not vec1 or not vec2: 416 | return 0.0 417 | vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1))) 418 | vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2))) 419 | assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries" 420 | if len(vec2) < len(vec1): 421 | vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector 422 | result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1)) 423 | result /= vec1len * vec2len # rescale by vector lengths 424 | return result 425 | 426 | 427 | def isbow(vec): 428 | """ 429 | Checks if vector passed is in bag of words representation or not. 430 | Vec is considered to be in bag of words format if it is 2-tuple format. 431 | """ 432 | if scipy.sparse.issparse(vec): 433 | vec = vec.todense().tolist() 434 | try: 435 | id_, val_ = vec[0] # checking first value to see if it is in bag of words format by unpacking 436 | id_, val_ = int(id_), float(val_) 437 | except IndexError: 438 | return True # this is to handle the empty input case 439 | except Exception: 440 | return False 441 | return True 442 | 443 | 444 | def kullback_leibler(vec1, vec2, num_features=None): 445 | """ 446 | A distance metric between two probability distributions. 447 | Returns a distance value in range <0,1> where values closer to 0 mean less distance (and a higher similarity) 448 | Uses the scipy.stats.entropy method to identify kullback_leibler convergence value. 449 | If the distribution draws from a certain number of docs, that value must be passed. 450 | """ 451 | if scipy.sparse.issparse(vec1): 452 | vec1 = vec1.toarray() 453 | if scipy.sparse.issparse(vec2): 454 | vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix 455 | if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense 456 | if num_features is not None: # if not None, make as large as the documents drawing from 457 | dense1 = sparse2full(vec1, num_features) 458 | dense2 = sparse2full(vec2, num_features) 459 | return entropy(dense1, dense2) 460 | else: 461 | max_len = max(len(vec1), len(vec2)) 462 | dense1 = sparse2full(vec1, max_len) 463 | dense2 = sparse2full(vec2, max_len) 464 | return entropy(dense1, dense2) 465 | else: 466 | # this conversion is made because if it is not in bow format, it might be a list within a list after conversion 467 | # the scipy implementation of Kullback fails in such a case so we pick up only the nested list. 
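        # (For reference: KL(p || q) = sum_i p_i * log(p_i / q_i). It is asymmetric in its arguments,
        # and scipy.stats.entropy() normalizes both inputs to sum to 1 before computing it.)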
468 | if len(vec1) == 1: 469 | vec1 = vec1[0] 470 | if len(vec2) == 1: 471 | vec2 = vec2[0] 472 | return scipy.stats.entropy(vec1, vec2) 473 | 474 | 475 | def hellinger(vec1, vec2): 476 | """ 477 | Hellinger distance is a distance metric to quantify the similarity between two probability distributions. 478 | Distance between distributions will be a number between <0,1>, where 0 is minimum distance (maximum similarity) and 1 is maximum distance (minimum similarity). 479 | """ 480 | if scipy.sparse.issparse(vec1): 481 | vec1 = vec1.toarray() 482 | if scipy.sparse.issparse(vec2): 483 | vec2 = vec2.toarray() 484 | if isbow(vec1) and isbow(vec2): 485 | # if it is a bag of words format, instead of converting to dense we use dictionaries to calculate appropriate distance 486 | vec1, vec2 = dict(vec1), dict(vec2) 487 | if len(vec2) < len(vec1): 488 | vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector 489 | sim = np.sqrt(0.5*sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))) 490 | return sim 491 | else: 492 | sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum()) 493 | return sim 494 | 495 | 496 | def jaccard(vec1, vec2): 497 | """ 498 | A distance metric between bags of words representation. 499 | Returns 1 minus the intersection divided by union, where union is the sum of the size of the two bags. 500 | If it is not a bag of words representation, the union and intersection is calculated in the traditional manner. 501 | Returns a value in range <0,1> where values closer to 0 mean less distance and thus higher similarity. 502 | 503 | """ 504 | 505 | # converting from sparse for easier manipulation 506 | if scipy.sparse.issparse(vec1): 507 | vec1 = vec1.toarray() 508 | if scipy.sparse.issparse(vec2): 509 | vec2 = vec2.toarray() 510 | if isbow(vec1) and isbow(vec2): 511 | # if it's in bow format, we use the following definitions: 512 | # union = sum of the 'weights' of both the bags 513 | # intersection = lowest weight for a particular id; basically the number of common words or items 514 | union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2) 515 | vec1, vec2 = dict(vec1), dict(vec2) 516 | intersection = 0.0 517 | for feature_id, feature_weight in iteritems(vec1): 518 | intersection += min(feature_weight, vec2.get(feature_id, 0.0)) 519 | return 1 - float(intersection) / float(union) 520 | else: 521 | # if it isn't in bag of words format, we can use sets to calculate intersection and union 522 | if isinstance(vec1, np.ndarray): 523 | vec1 = vec1.tolist() 524 | if isinstance(vec2, np.ndarray): 525 | vec2 = vec2.tolist() 526 | vec1 = set(vec1) 527 | vec2 = set(vec2) 528 | intersection = vec1 & vec2 529 | union = vec1 | vec2 530 | return 1 - float(len(intersection)) / float(len(union)) 531 | 532 | 533 | def jaccard_distance(set1, set2): 534 | """ 535 | Calculate a distance between set representation (1 minus the intersection divided by union). 536 | Return a value in range <0, 1> where values closer to 0 mean smaller distance and thus higher similarity. 537 | """ 538 | 539 | union_cardinality = len(set1 | set2) 540 | if union_cardinality == 0: # Both sets are empty 541 | return 1. 542 | 543 | return 1. - float(len(set1 & set2)) / float(union_cardinality) 544 | 545 | 546 | def dirichlet_expectation(alpha): 547 | """ 548 | For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`. 
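    Uses the digamma identity E[log(theta_i)] = psi(alpha_i) - psi(sum_j alpha_j), applied row-wise
    when `alpha` is a matrix.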
549 | 550 | """ 551 | if (len(alpha.shape) == 1): 552 | result = psi(alpha) - psi(np.sum(alpha)) 553 | else: 554 | result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] 555 | return result.astype(alpha.dtype) # keep the same precision as input 556 | 557 | 558 | def qr_destroy(la): 559 | """ 560 | Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process. 561 | 562 | Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`, 563 | because the memory used in `la[0]` is reclaimed earlier. 564 | """ 565 | a = np.asfortranarray(la[0]) 566 | del la[0], la # now `a` is the only reference to the input matrix 567 | m, n = a.shape 568 | # perform q, r = QR(a); model hacked out of scipy.linalg.qr 569 | logger.debug("computing QR of %s dense matrix" % str(a.shape)) 570 | geqrf, = get_lapack_funcs(('geqrf',), (a,)) 571 | qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True) 572 | qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True) 573 | del a # free up mem 574 | assert info >= 0 575 | r = triu(qr[:n, :n]) 576 | if m < n: # rare case, #features < #topics 577 | qr = qr[:, :m] # retains fortran order 578 | gorgqr, = get_lapack_funcs(('orgqr',), (qr,)) 579 | q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True) 580 | q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True) 581 | assert info >= 0, "qr failed" 582 | assert q.flags.f_contiguous 583 | return q, r 584 | 585 | 586 | class MmWriter(object): 587 | """ 588 | Store a corpus in Matrix Market format. 589 | 590 | Note that the output is written one document at a time, not the whole 591 | matrix at once (unlike scipy.io.mmread). This allows us to process corpora 592 | which are larger than the available RAM. 593 | 594 | NOTE: the output file is created in a single pass through the input corpus, so 595 | that the input can be a once-only stream (iterator). 596 | To achieve this, a fake MM header is written first, statistics are collected 597 | during the pass (shape of the matrix, number of non-zeroes), followed by a seek 598 | back to the beginning of the file, rewriting the fake header with proper values. 
599 | 600 | """ 601 | 602 | HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n' # the only supported MM format 603 | 604 | def __init__(self, fname): 605 | self.fname = fname 606 | if fname.endswith(".gz") or fname.endswith('.bz2'): 607 | raise NotImplementedError("compressed output not supported with MmWriter") 608 | self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing 609 | self.headers_written = False 610 | 611 | def write_headers(self, num_docs, num_terms, num_nnz): 612 | self.fout.write(MmWriter.HEADER_LINE) 613 | 614 | if num_nnz < 0: 615 | # we don't know the matrix shape/density yet, so only log a general line 616 | logger.info("saving sparse matrix to %s" % self.fname) 617 | self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody 618 | else: 619 | logger.info( 620 | "saving sparse %sx%s matrix with %i non-zero entries to %s", 621 | num_docs, num_terms, num_nnz, self.fname) 622 | self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz))) 623 | self.last_docno = -1 624 | self.headers_written = True 625 | 626 | def fake_headers(self, num_docs, num_terms, num_nnz): 627 | stats = '%i %i %i' % (num_docs, num_terms, num_nnz) 628 | if len(stats) > 50: 629 | raise ValueError('Invalid stats: matrix too large!') 630 | self.fout.seek(len(MmWriter.HEADER_LINE)) 631 | self.fout.write(utils.to_utf8(stats)) 632 | 633 | def write_vector(self, docno, vector): 634 | """ 635 | Write a single sparse vector to the file. 636 | 637 | Sparse vector is any iterable yielding (field id, field value) pairs. 638 | """ 639 | assert self.headers_written, "must write Matrix Market file headers before writing data!" 640 | assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno) 641 | vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12) # ignore near-zero entries 642 | for termid, weight in vector: # write term ids in sorted order 643 | self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight))) # +1 because MM format starts counting from 1 644 | self.last_docno = docno 645 | return (vector[-1][0], len(vector)) if vector else (-1, 0) 646 | 647 | @staticmethod 648 | def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False): 649 | """ 650 | Save the vector space representation of an entire corpus to disk. 651 | 652 | Note that the documents are processed one at a time, so the whole corpus 653 | is allowed to be larger than the available RAM. 
654 | """ 655 | mw = MmWriter(fname) 656 | 657 | # write empty headers to the file (with enough space to be overwritten later) 658 | mw.write_headers(-1, -1, -1) # will print 50 spaces followed by newline on the stats line 659 | 660 | # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors 661 | _num_terms, num_nnz = 0, 0 662 | docno, poslast = -1, -1 663 | offsets = [] 664 | if hasattr(corpus, 'metadata'): 665 | orig_metadata = corpus.metadata 666 | corpus.metadata = metadata 667 | if metadata: 668 | docno2metadata = {} 669 | else: 670 | metadata = False 671 | for docno, doc in enumerate(corpus): 672 | if metadata: 673 | bow, data = doc 674 | docno2metadata[docno] = data 675 | else: 676 | bow = doc 677 | if docno % progress_cnt == 0: 678 | logger.info("PROGRESS: saving document #%i" % docno) 679 | if index: 680 | posnow = mw.fout.tell() 681 | if posnow == poslast: 682 | offsets[-1] = -1 683 | offsets.append(posnow) 684 | poslast = posnow 685 | max_id, veclen = mw.write_vector(docno, bow) 686 | _num_terms = max(_num_terms, 1 + max_id) 687 | num_nnz += veclen 688 | if metadata: 689 | utils.pickle(docno2metadata, fname + '.metadata.cpickle') 690 | corpus.metadata = orig_metadata 691 | 692 | num_docs = docno + 1 693 | num_terms = num_terms or _num_terms 694 | 695 | if num_docs * num_terms != 0: 696 | logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % ( 697 | num_docs, num_terms, 698 | 100.0 * num_nnz / (num_docs * num_terms), 699 | num_nnz, 700 | num_docs * num_terms)) 701 | 702 | # now write proper headers, by seeking and overwriting the spaces written earlier 703 | mw.fake_headers(num_docs, num_terms, num_nnz) 704 | 705 | mw.close() 706 | if index: 707 | return offsets 708 | 709 | def __del__(self): 710 | """ 711 | Automatic destructor which closes the underlying file. 712 | 713 | There must be no circular references contained in the object for __del__ 714 | to work! Closing the file explicitly via the close() method is preferred 715 | and safer. 716 | """ 717 | self.close() # does nothing if called twice (on an already closed file), so no worries 718 | 719 | def close(self): 720 | logger.debug("closing %s" % self.fname) 721 | if hasattr(self, 'fout'): 722 | self.fout.close() 723 | #endclass MmWriter 724 | 725 | 726 | class MmReader(object): 727 | """ 728 | Wrap a term-document matrix on disk (in matrix-market format), and present it 729 | as an object which supports iteration over the rows (~documents). 730 | 731 | Note that the file is read into memory one document at a time, not the whole 732 | matrix at once (unlike scipy.io.mmread). This allows us to process corpora 733 | which are larger than the available RAM. 734 | """ 735 | def __init__(self, input, transposed=True): 736 | """ 737 | Initialize the matrix reader. 738 | 739 | The `input` refers to a file on local filesystem, which is expected to 740 | be in the sparse (coordinate) Matrix Market format. Documents are assumed 741 | to be rows of the matrix (and document features are columns). 742 | 743 | `input` is either a string (file path) or a file-like object that supports 744 | `seek()` (e.g. gzip.GzipFile, bz2.BZ2File). 
745 | """ 746 | logger.info("initializing corpus reader from %s" % input) 747 | self.input, self.transposed = input, transposed 748 | with utils.file_or_filename(self.input) as lines: 749 | try: 750 | header = utils.to_unicode(next(lines)).strip() 751 | if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): 752 | raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % 753 | (self.input, header)) 754 | except StopIteration: 755 | pass 756 | 757 | self.num_docs = self.num_terms = self.num_nnz = 0 758 | for lineno, line in enumerate(lines): 759 | line = utils.to_unicode(line) 760 | if not line.startswith('%'): 761 | self.num_docs, self.num_terms, self.num_nnz = map(int, line.split()) 762 | if not self.transposed: 763 | self.num_docs, self.num_terms = self.num_terms, self.num_docs 764 | break 765 | 766 | logger.info( 767 | "accepted corpus with %i documents, %i features, %i non-zero entries", 768 | self.num_docs, self.num_terms, self.num_nnz) 769 | 770 | def __len__(self): 771 | return self.num_docs 772 | 773 | def __str__(self): 774 | return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % 775 | (self.num_docs, self.num_terms, self.num_nnz)) 776 | 777 | def skip_headers(self, input_file): 778 | """ 779 | Skip file headers that appear before the first document. 780 | """ 781 | for line in input_file: 782 | if line.startswith(b'%'): 783 | continue 784 | break 785 | 786 | def __iter__(self): 787 | """ 788 | Iteratively yield vectors from the underlying file, in the format (row_no, vector), 789 | where vector is a list of (col_no, value) 2-tuples. 790 | 791 | Note that the total number of vectors returned is always equal to the 792 | number of rows specified in the header; empty documents are inserted and 793 | yielded where appropriate, even if they are not explicitly stored in the 794 | Matrix Market file. 
795 | """ 796 | with utils.file_or_filename(self.input) as lines: 797 | self.skip_headers(lines) 798 | 799 | previd = -1 800 | for line in lines: 801 | docid, termid, val = utils.to_unicode(line).split() # needed for python3 802 | if not self.transposed: 803 | termid, docid = docid, termid 804 | docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based 805 | assert previd <= docid, "matrix columns must come in ascending order" 806 | if docid != previd: 807 | # change of document: return the document read so far (its id is prevId) 808 | if previd >= 0: 809 | yield previd, document 810 | 811 | # return implicit (empty) documents between previous id and new id 812 | # too, to keep consistent document numbering and corpus length 813 | for previd in xrange(previd + 1, docid): 814 | yield previd, [] 815 | 816 | # from now on start adding fields to a new document, with a new id 817 | previd = docid 818 | document = [] 819 | 820 | document.append((termid, val,)) # add another field to the current document 821 | 822 | # handle the last document, as a special case 823 | if previd >= 0: 824 | yield previd, document 825 | 826 | # return empty documents between the last explicit document and the number 827 | # of documents as specified in the header 828 | for previd in xrange(previd + 1, self.num_docs): 829 | yield previd, [] 830 | 831 | def docbyoffset(self, offset): 832 | """Return document at file offset `offset` (in bytes)""" 833 | # empty documents are not stored explicitly in MM format, so the index marks 834 | # them with a special offset, -1. 835 | if offset == -1: 836 | return [] 837 | if isinstance(self.input, string_types): 838 | fin = utils.smart_open(self.input) 839 | else: 840 | fin = self.input 841 | 842 | fin.seek(offset) # works for gzip/bz2 input, too 843 | previd, document = -1, [] 844 | for line in fin: 845 | docid, termid, val = line.split() 846 | if not self.transposed: 847 | termid, docid = docid, termid 848 | docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based 849 | assert previd <= docid, "matrix columns must come in ascending order" 850 | if docid != previd: 851 | if previd >= 0: 852 | return document 853 | previd = docid 854 | 855 | document.append((termid, val,)) # add another field to the current document 856 | return document 857 | #endclass MmReader 858 | -------------------------------------------------------------------------------- /word2vec/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | setup(name="mWord2vec", ext_modules=cythonize("word2vec_inner.pyx")) 4 | -------------------------------------------------------------------------------- /word2vec/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | This module contains various general utility functions. 
9 | """ 10 | 11 | from __future__ import with_statement 12 | 13 | import logging 14 | import warnings 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | try: 19 | from html.entities import name2codepoint as n2cp 20 | except ImportError: 21 | from htmlentitydefs import name2codepoint as n2cp 22 | try: 23 | import cPickle as _pickle 24 | except ImportError: 25 | import pickle as _pickle 26 | 27 | import re 28 | import unicodedata 29 | import os 30 | import random 31 | import itertools 32 | import tempfile 33 | from functools import wraps # for `synchronous` function lock 34 | import multiprocessing 35 | import shutil 36 | import sys 37 | from contextlib import contextmanager 38 | import subprocess 39 | 40 | import numpy as np 41 | import numbers 42 | import scipy.sparse 43 | 44 | if sys.version_info[0] >= 3: 45 | unicode = str 46 | 47 | from six import iterkeys, iteritems, u, string_types, unichr 48 | from six.moves import xrange 49 | 50 | try: 51 | from smart_open import smart_open 52 | except ImportError: 53 | logger.info("smart_open library not found; falling back to local-filesystem-only") 54 | 55 | def make_closing(base, **attrs): 56 | """ 57 | Add support for `with Base(attrs) as fout:` to the base class if it's missing. 58 | The base class' `close()` method will be called on context exit, to always close the file properly. 59 | 60 | This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise 61 | raise "AttributeError: GzipFile instance has no attribute '__exit__'". 62 | 63 | """ 64 | if not hasattr(base, '__enter__'): 65 | attrs['__enter__'] = lambda self: self 66 | if not hasattr(base, '__exit__'): 67 | attrs['__exit__'] = lambda self, type, value, traceback: self.close() 68 | return type('Closing' + base.__name__, (base, object), attrs) 69 | 70 | def smart_open(fname, mode='rb'): 71 | _, ext = os.path.splitext(fname) 72 | if ext == '.bz2': 73 | from bz2 import BZ2File 74 | return make_closing(BZ2File)(fname, mode) 75 | if ext == '.gz': 76 | from gzip import GzipFile 77 | return make_closing(GzipFile)(fname, mode) 78 | return open(fname, mode) 79 | 80 | 81 | PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) 82 | RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) 83 | 84 | 85 | def get_random_state(seed): 86 | """ 87 | Turn seed into a np.random.RandomState instance. 88 | Method originally from maciejkula/glove-python, and written by @joshloyal. 89 | """ 90 | if seed is None or seed is np.random: 91 | return np.random.mtrand._rand 92 | if isinstance(seed, (numbers.Integral, np.integer)): 93 | return np.random.RandomState(seed) 94 | if isinstance(seed, np.random.RandomState): 95 | return seed 96 | raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed) 97 | 98 | 99 | def synchronous(tlockname): 100 | """ 101 | A decorator to place an instance-based lock around a method. 
102 | 103 | Adapted from http://model.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ 104 | """ 105 | def _synched(func): 106 | @wraps(func) 107 | def _synchronizer(self, *args, **kwargs): 108 | tlock = getattr(self, tlockname) 109 | logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__)) 110 | 111 | with tlock: # use lock as a context manager to perform safe acquire/release pairs 112 | logger.debug("acquired lock %r for %s" % (tlockname, func.__name__)) 113 | result = func(self, *args, **kwargs) 114 | logger.debug("releasing lock %r for %s" % (tlockname, func.__name__)) 115 | return result 116 | return _synchronizer 117 | return _synched 118 | 119 | 120 | class NoCM(object): 121 | def acquire(self): 122 | pass 123 | 124 | def release(self): 125 | pass 126 | 127 | def __enter__(self): 128 | pass 129 | 130 | def __exit__(self, type, value, traceback): 131 | pass 132 | nocm = NoCM() 133 | 134 | 135 | @contextmanager 136 | def file_or_filename(input): 137 | """ 138 | Return a file-like object ready to be read from the beginning. `input` is either 139 | a filename (gz/bz2 also supported) or a file-like object supporting seek. 140 | 141 | """ 142 | if isinstance(input, string_types): 143 | # input was a filename: open as file 144 | yield smart_open(input) 145 | else: 146 | # input already a file-like object; just reset to the beginning 147 | input.seek(0) 148 | yield input 149 | 150 | 151 | def deaccent(text): 152 | """ 153 | Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring. 154 | 155 | Return input string with accents removed, as unicode. 156 | 157 | >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek") 158 | u'Sef chomutovskych komunistu dostal postou bily prasek' 159 | 160 | """ 161 | if not isinstance(text, unicode): 162 | # assume utf8 for byte strings, use default (strict) error handling 163 | text = text.decode('utf8') 164 | norm = unicodedata.normalize("NFD", text) 165 | result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn') 166 | return unicodedata.normalize("NFC", result) 167 | 168 | 169 | def copytree_hardlink(source, dest): 170 | """ 171 | Recursively copy a directory ala shutils.copytree, but hardlink files 172 | instead of copying. Available on UNIX systems only. 173 | """ 174 | copy2 = shutil.copy2 175 | try: 176 | shutil.copy2 = os.link 177 | shutil.copytree(source, dest) 178 | finally: 179 | shutil.copy2 = copy2 180 | 181 | 182 | def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False): 183 | """ 184 | Iteratively yield tokens as unicode strings, removing accent marks 185 | and optionally lowercasing the unidoce string by assigning True 186 | to one of the parameters, lowercase, to_lower, or lower. 187 | 188 | Input text may be either unicode or utf8-encoded byte string. 189 | 190 | The tokens on output are maximal contiguous sequences of alphabetic 191 | characters (no digits!). 
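# Illustrative sketch, not part of the original utils.py: typical use of the
# file_or_filename() context manager defined in this module. It accepts either
# a path (plain, .gz or .bz2) or an already-open, seekable file object and
# yields a file-like object positioned at the beginning. The path
# 'corpus.txt' is hypothetical.
with file_or_filename('corpus.txt') as fin:
    first_line = fin.readline()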
192 | 193 | >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True)) 194 | [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu'] 195 | 196 | """ 197 | lowercase = lowercase or to_lower or lower 198 | text = to_unicode(text, errors=errors) 199 | if lowercase: 200 | text = text.lower() 201 | if deacc: 202 | text = deaccent(text) 203 | for match in PAT_ALPHABETIC.finditer(text): 204 | yield match.group() 205 | 206 | 207 | def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): 208 | """ 209 | Convert a document into a list of tokens. 210 | 211 | This lowercases, tokenizes, de-accents (optional). -- the output are final 212 | tokens = unicode strings, that won't be processed any further. 213 | 214 | """ 215 | tokens = [ 216 | token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore') 217 | if min_len <= len(token) <= max_len and not token.startswith('_') 218 | ] 219 | return tokens 220 | 221 | 222 | def any2utf8(text, errors='strict', encoding='utf8'): 223 | """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8.""" 224 | if isinstance(text, unicode): 225 | return text.encode('utf8') 226 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 227 | return unicode(text, encoding, errors=errors).encode('utf8') 228 | to_utf8 = any2utf8 229 | 230 | 231 | def any2unicode(text, encoding='utf8', errors='strict'): 232 | """Convert a string (bytestring in `encoding` or unicode), to unicode.""" 233 | if isinstance(text, unicode): 234 | return text 235 | return unicode(text, encoding, errors=errors) 236 | to_unicode = any2unicode 237 | 238 | 239 | def call_on_class_only(*args, **kwargs): 240 | """Raise exception when load methods are called on instance""" 241 | raise AttributeError('This method should be called on a class object.') 242 | 243 | 244 | class SaveLoad(object): 245 | """ 246 | Objects which inherit from this class have save/load functions, which un/pickle 247 | them to disk. 248 | 249 | This uses pickle for de/serializing, so objects must not contain 250 | unpicklable attributes, such as lambda functions etc. 251 | 252 | """ 253 | @classmethod 254 | def load(cls, fname, mmap=None): 255 | """ 256 | Load a previously saved object from file (also see `save`). 257 | 258 | If the object was saved with large arrays stored separately, you can load 259 | these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use 260 | mmap, load large arrays as normal objects. 261 | 262 | If the file being loaded is compressed (either '.gz' or '.bz2'), then 263 | `mmap=None` must be set. Load will raise an `IOError` if this condition 264 | is encountered. 265 | 266 | """ 267 | logger.info("loading %s object from %s" % (cls.__name__, fname)) 268 | 269 | compress, subname = SaveLoad._adapt_by_suffix(fname) 270 | 271 | obj = unpickle(fname) 272 | obj._load_specials(fname, mmap, compress, subname) 273 | logger.info("loaded %s", fname) 274 | return obj 275 | 276 | def _load_specials(self, fname, mmap, compress, subname): 277 | """ 278 | Loads any attributes that were stored specially, and gives the same 279 | opportunity to recursively included SaveLoad instances. 280 | 281 | """ 282 | mmap_error = lambda x, y: IOError( 283 | 'Cannot mmap compressed object %s in file %s. 
' % (x, y) + 284 | 'Use `load(fname, mmap=None)` or uncompress files manually.') 285 | 286 | for attrib in getattr(self, '__recursive_saveloads', []): 287 | cfname = '.'.join((fname, attrib)) 288 | logger.info("loading %s recursively from %s.* with mmap=%s" % ( 289 | attrib, cfname, mmap)) 290 | getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) 291 | 292 | for attrib in getattr(self, '__numpys', []): 293 | logger.info("loading %s from %s with mmap=%s" % ( 294 | attrib, subname(fname, attrib), mmap)) 295 | 296 | if compress: 297 | if mmap: 298 | raise mmap_error(attrib, subname(fname, attrib)) 299 | 300 | val = np.load(subname(fname, attrib))['val'] 301 | else: 302 | val = np.load(subname(fname, attrib), mmap_mode=mmap) 303 | 304 | setattr(self, attrib, val) 305 | 306 | for attrib in getattr(self, '__scipys', []): 307 | logger.info("loading %s from %s with mmap=%s" % ( 308 | attrib, subname(fname, attrib), mmap)) 309 | sparse = unpickle(subname(fname, attrib)) 310 | if compress: 311 | if mmap: 312 | raise mmap_error(attrib, subname(fname, attrib)) 313 | 314 | with np.load(subname(fname, attrib, 'sparse')) as f: 315 | sparse.data = f['data'] 316 | sparse.indptr = f['indptr'] 317 | sparse.indices = f['indices'] 318 | else: 319 | sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap) 320 | sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) 321 | sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) 322 | 323 | setattr(self, attrib, sparse) 324 | 325 | for attrib in getattr(self, '__ignoreds', []): 326 | logger.info("setting ignored attribute %s to None" % (attrib)) 327 | setattr(self, attrib, None) 328 | 329 | @staticmethod 330 | def _adapt_by_suffix(fname): 331 | """Give appropriate compress setting and filename formula""" 332 | if fname.endswith('.gz') or fname.endswith('.bz2'): 333 | compress = True 334 | subname = lambda *args: '.'.join(list(args) + ['npz']) 335 | else: 336 | compress = False 337 | subname = lambda *args: '.'.join(list(args) + ['npy']) 338 | return (compress, subname) 339 | 340 | def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, 341 | ignore=frozenset(), pickle_protocol=2): 342 | """ 343 | Save the object to file (also see `load`). 344 | 345 | If `separately` is None, automatically detect large 346 | numpy/scipy.sparse arrays in the object being stored, and store 347 | them into separate files. This avoids pickle memory errors and 348 | allows mmap'ing large arrays back on load efficiently. 349 | 350 | You can also set `separately` manually, in which case it must be 351 | a list of attribute names to be stored in separate files. The 352 | automatic check is not performed in this case. 353 | 354 | `ignore` is a set of attribute names to *not* serialize (file 355 | handles, caches etc). On subsequent load() these attributes will 356 | be set to None. 357 | 358 | `pickle_protocol` defaults to 2 so the pickled object can be imported 359 | in both Python 2 and 3. 
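# Illustrative sketch, not part of the original utils.py: the save/load round
# trip that _smart_save() implements. Attributes larger than `sep_limit` are
# written as separate .npy/.npz files and can be memory-mapped on load. The
# class `_Embeddings`, its attribute and the '/tmp/emb' path are hypothetical.
import numpy as np

class _Embeddings(SaveLoad):
    def __init__(self):
        # ~50 MB, above the 10 MB default sep_limit, so stored separately
        self.vectors = np.zeros((100000, 128), dtype=np.float32)

emb = _Embeddings()
emb.save('/tmp/emb')                           # big array goes to a file such as /tmp/emb.vectors.npy
emb2 = _Embeddings.load('/tmp/emb', mmap='r')  # the big array comes back memory-mapped, read-only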
360 | 361 | """ 362 | logger.info( 363 | "saving %s object under %s, separately %s" % ( 364 | self.__class__.__name__, fname, separately)) 365 | 366 | compress, subname = SaveLoad._adapt_by_suffix(fname) 367 | 368 | restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, 369 | compress, subname) 370 | try: 371 | pickle(self, fname, protocol=pickle_protocol) 372 | finally: 373 | # restore attribs handled specially 374 | for obj, asides in restores: 375 | for attrib, val in iteritems(asides): 376 | setattr(obj, attrib, val) 377 | logger.info("saved %s", fname) 378 | 379 | def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): 380 | """ 381 | Save aside any attributes that need to be handled separately, including 382 | by recursion any attributes that are themselves SaveLoad instances. 383 | 384 | Returns a list of (obj, {attrib: value, ...}) settings that the caller 385 | should use to restore each object's attributes that were set aside 386 | during the default pickle(). 387 | 388 | """ 389 | asides = {} 390 | sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix) 391 | if separately is None: 392 | separately = [] 393 | for attrib, val in iteritems(self.__dict__): 394 | if isinstance(val, np.ndarray) and val.size >= sep_limit: 395 | separately.append(attrib) 396 | elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: 397 | separately.append(attrib) 398 | 399 | # whatever's in `separately` or `ignore` at this point won't get pickled 400 | for attrib in separately + list(ignore): 401 | if hasattr(self, attrib): 402 | asides[attrib] = getattr(self, attrib) 403 | delattr(self, attrib) 404 | 405 | recursive_saveloads = [] 406 | restores = [] 407 | for attrib, val in iteritems(self.__dict__): 408 | if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading 409 | recursive_saveloads.append(attrib) 410 | cfname = '.'.join((fname, attrib)) 411 | restores.extend(val._save_specials( 412 | cfname, None, sep_limit, ignore, 413 | pickle_protocol, compress, subname)) 414 | 415 | try: 416 | numpys, scipys, ignoreds = [], [], [] 417 | for attrib, val in iteritems(asides): 418 | if isinstance(val, np.ndarray) and attrib not in ignore: 419 | numpys.append(attrib) 420 | logger.info("storing np array '%s' to %s" % ( 421 | attrib, subname(fname, attrib))) 422 | 423 | if compress: 424 | np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val)) 425 | else: 426 | np.save(subname(fname, attrib), np.ascontiguousarray(val)) 427 | 428 | elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: 429 | scipys.append(attrib) 430 | logger.info("storing scipy.sparse array '%s' under %s" % ( 431 | attrib, subname(fname, attrib))) 432 | 433 | if compress: 434 | np.savez_compressed( 435 | subname(fname, attrib, 'sparse'), 436 | data=val.data, 437 | indptr=val.indptr, 438 | indices=val.indices) 439 | else: 440 | np.save(subname(fname, attrib, 'data'), val.data) 441 | np.save(subname(fname, attrib, 'indptr'), val.indptr) 442 | np.save(subname(fname, attrib, 'indices'), val.indices) 443 | 444 | data, indptr, indices = val.data, val.indptr, val.indices 445 | val.data, val.indptr, val.indices = None, None, None 446 | 447 | try: 448 | # store array-less object 449 | pickle(val, subname(fname, attrib), protocol=pickle_protocol) 450 | finally: 451 | val.data, val.indptr, val.indices = data, indptr, indices 452 | else: 453 | logger.info("not 
storing attribute %s" % (attrib)) 454 | ignoreds.append(attrib) 455 | 456 | self.__dict__['__numpys'] = numpys 457 | self.__dict__['__scipys'] = scipys 458 | self.__dict__['__ignoreds'] = ignoreds 459 | self.__dict__['__recursive_saveloads'] = recursive_saveloads 460 | except: 461 | # restore the attributes if exception-interrupted 462 | for attrib, val in iteritems(asides): 463 | setattr(self, attrib, val) 464 | raise 465 | return restores + [(self, asides)] 466 | 467 | def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, 468 | ignore=frozenset(), pickle_protocol=2): 469 | """ 470 | Save the object to file (also see `load`). 471 | 472 | `fname_or_handle` is either a string specifying the file name to 473 | save to, or an open file-like object which can be written to. If 474 | the object is a file handle, no special array handling will be 475 | performed; all attributes will be saved to the same file. 476 | 477 | If `separately` is None, automatically detect large 478 | numpy/scipy.sparse arrays in the object being stored, and store 479 | them into separate files. This avoids pickle memory errors and 480 | allows mmap'ing large arrays back on load efficiently. 481 | 482 | You can also set `separately` manually, in which case it must be 483 | a list of attribute names to be stored in separate files. The 484 | automatic check is not performed in this case. 485 | 486 | `ignore` is a set of attribute names to *not* serialize (file 487 | handles, caches etc). On subsequent load() these attributes will 488 | be set to None. 489 | 490 | `pickle_protocol` defaults to 2 so the pickled object can be imported 491 | in both Python 2 and 3. 492 | 493 | """ 494 | try: 495 | _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) 496 | logger.info("saved %s object" % self.__class__.__name__) 497 | except TypeError: # `fname_or_handle` does not have write attribute 498 | self._smart_save(fname_or_handle, separately, sep_limit, ignore, 499 | pickle_protocol=pickle_protocol) 500 | #endclass SaveLoad 501 | 502 | 503 | def identity(p): 504 | """Identity fnc, for flows that don't accept lambda (pickling etc).""" 505 | return p 506 | 507 | 508 | def get_max_id(corpus): 509 | """ 510 | Return the highest feature id that appears in the corpus. 511 | 512 | For empty corpora (no features at all), return -1. 513 | 514 | """ 515 | maxid = -1 516 | for document in corpus: 517 | maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty) 518 | return maxid 519 | 520 | 521 | class FakeDict(object): 522 | """ 523 | Objects of this class act as dictionaries that map integer->str(integer), for 524 | a specified range of integers <0, num_terms). 525 | 526 | This is meant to avoid allocating real dictionaries when `num_terms` is huge, which 527 | is a waste of memory. 528 | 529 | """ 530 | def __init__(self, num_terms): 531 | self.num_terms = num_terms 532 | 533 | def __str__(self): 534 | return "FakeDict(num_terms=%s)" % self.num_terms 535 | 536 | def __getitem__(self, val): 537 | if 0 <= val < self.num_terms: 538 | return str(val) 539 | raise ValueError("internal id out of bounds (%s, expected <0..%s))" % 540 | (val, self.num_terms)) 541 | 542 | def iteritems(self): 543 | for i in xrange(self.num_terms): 544 | yield i, str(i) 545 | 546 | def keys(self): 547 | """ 548 | Override the dict.keys() function, which is used to determine the maximum 549 | internal id of a corpus = the vocabulary dimensionality. 
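# Illustrative sketch, not part of the original utils.py: FakeDict behaves like
# {0: '0', 1: '1', ..., num_terms-1: str(num_terms-1)} without ever allocating
# that mapping.
d = FakeDict(3)
print(d[1])                  # -> '1'
print(d.get(5, 'missing'))   # -> 'missing'
print(len(d))                # -> 3
print(d.keys())              # -> [2]; only the highest id, by design (see the note that follows)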
550 | 551 | HACK: To avoid materializing the whole `range(0, self.num_terms)`, this returns 552 | the highest id = `[self.num_terms - 1]` only. 553 | 554 | """ 555 | return [self.num_terms - 1] 556 | 557 | def __len__(self): 558 | return self.num_terms 559 | 560 | def get(self, val, default=None): 561 | if 0 <= val < self.num_terms: 562 | return str(val) 563 | return default 564 | 565 | 566 | def dict_from_corpus(corpus): 567 | """ 568 | Scan corpus for all word ids that appear in it, then construct and return a mapping 569 | which maps each `wordId -> str(wordId)`. 570 | 571 | This function is used whenever *words* need to be displayed (as opposed to just 572 | their ids) but no wordId->word mapping was provided. The resulting mapping 573 | only covers words actually used in the corpus, up to the highest wordId found. 574 | 575 | """ 576 | num_terms = 1 + get_max_id(corpus) 577 | id2word = FakeDict(num_terms) 578 | return id2word 579 | 580 | 581 | def is_corpus(obj): 582 | """ 583 | Check whether `obj` is a corpus. Return (is_corpus, new) 2-tuple, where 584 | `new is obj` if `obj` was an iterable, or `new` yields the same sequence as 585 | `obj` if it was an iterator. 586 | 587 | `obj` is a corpus if it supports iteration over documents, where a document 588 | is in turn anything that acts as a sequence of 2-tuples (int, float). 589 | 590 | Note: An "empty" corpus (empty input sequence) is ambiguous, so in this case the 591 | result is forcefully defined as `is_corpus=False`. 592 | 593 | """ 594 | try: 595 | if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack 596 | return True, obj 597 | except: 598 | pass 599 | try: 600 | if hasattr(obj, 'next') or hasattr(obj, '__next__'): 601 | # the input is an iterator object, meaning once we call next() 602 | # that element could be gone forever. we must be careful to put 603 | # whatever we retrieve back again 604 | doc1 = next(obj) 605 | obj = itertools.chain([doc1], obj) 606 | else: 607 | doc1 = next(iter(obj)) # empty corpus is resolved to False here 608 | if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...) 609 | return True, obj # the first document is empty=>assume this is a corpus 610 | id1, val1 = next(iter(doc1)) # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here 611 | id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float) 612 | except Exception: 613 | return False, obj 614 | return True, obj 615 | 616 | 617 | def get_my_ip(): 618 | """ 619 | Try to obtain our external ip (from the pyro nameserver's point of view) 620 | 621 | This tries to sidestep the issue of bogus `/etc/hosts` entries and other 622 | local misconfigurations, which often mess up hostname resolution. 623 | 624 | If all else fails, fall back to simple `socket.gethostbyname()` lookup. 
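# Illustrative sketch, not part of the original utils.py: the "corpus" format
# that get_max_id(), dict_from_corpus() and is_corpus() above operate on, an
# iterable of documents where each document is a sequence of
# (feature_id, weight) 2-tuples.
corpus = [[(0, 1.0), (3, 2.0)], [], [(2, 0.5)]]
print(get_max_id(corpus))        # -> 3
print(dict_from_corpus(corpus))  # -> FakeDict(num_terms=4)
print(is_corpus(corpus)[0])      # -> True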
625 | 626 | """ 627 | import socket 628 | try: 629 | import Pyro4 630 | # we know the nameserver must exist, so use it as our anchor point 631 | ns = Pyro4.naming.locateNS() 632 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 633 | s.connect((ns._pyroUri.host, ns._pyroUri.port)) 634 | result, port = s.getsockname() 635 | except: 636 | try: 637 | # see what ifconfig says about our default interface 638 | import commands 639 | result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:] 640 | if len(result.split('.')) != 4: 641 | raise Exception() 642 | except: 643 | # give up, leave the resolution to gethostbyname 644 | result = socket.gethostbyname(socket.gethostname()) 645 | return result 646 | 647 | 648 | class RepeatCorpus(SaveLoad): 649 | """ 650 | Used in the tutorial on distributed computing and likely not useful anywhere else. 651 | 652 | """ 653 | def __init__(self, corpus, reps): 654 | """ 655 | Wrap a `corpus` as another corpus of length `reps`. This is achieved by 656 | repeating documents from `corpus` over and over again, until the requested 657 | length `len(result)==reps` is reached. Repetition is done 658 | on-the-fly=efficiently, via `itertools`. 659 | 660 | >>> corpus = [[(1, 0.5)], []] # 2 documents 661 | >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents 662 | [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]] 663 | 664 | """ 665 | self.corpus = corpus 666 | self.reps = reps 667 | 668 | def __iter__(self): 669 | return itertools.islice(itertools.cycle(self.corpus), self.reps) 670 | 671 | 672 | class RepeatCorpusNTimes(SaveLoad): 673 | 674 | def __init__(self, corpus, n): 675 | """ 676 | Repeat a `corpus` `n` times. 677 | 678 | >>> corpus = [[(1, 0.5)], []] 679 | >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times 680 | [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []] 681 | """ 682 | self.corpus = corpus 683 | self.n = n 684 | 685 | def __iter__(self): 686 | for _ in xrange(self.n): 687 | for document in self.corpus: 688 | yield document 689 | 690 | 691 | class ClippedCorpus(SaveLoad): 692 | def __init__(self, corpus, max_docs=None): 693 | """ 694 | Return a corpus that is the "head" of input iterable `corpus`. 695 | 696 | Any documents after `max_docs` are ignored. This effectively limits the 697 | length of the returned corpus to <= `max_docs`. Set `max_docs=None` for 698 | "no limit", effectively wrapping the entire input corpus. 699 | 700 | """ 701 | self.corpus = corpus 702 | self.max_docs = max_docs 703 | 704 | def __iter__(self): 705 | return itertools.islice(self.corpus, self.max_docs) 706 | 707 | def __len__(self): 708 | return min(self.max_docs, len(self.corpus)) 709 | 710 | 711 | class SlicedCorpus(SaveLoad): 712 | def __init__(self, corpus, slice_): 713 | """ 714 | Return a corpus that is the slice of input iterable `corpus`. 715 | 716 | Negative slicing can only be used if the corpus is indexable. 717 | Otherwise, the corpus will be iterated over. 718 | 719 | Slice can also be a np.ndarray to support fancy indexing. 720 | 721 | NOTE: calculating the size of a SlicedCorpus is expensive 722 | when using a slice as the corpus has to be iterated over once. 723 | Using a list or np.ndarray does not have this drawback, but 724 | consumes more memory. 
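# Illustrative sketch, not part of the original utils.py: the corpus wrappers
# defined here are lazy, re-iterating the wrapped corpus on demand instead of
# materializing it.
corpus = [[(1, 0.5)], [(2, 1.0)], [(3, 1.5)]]
print(list(ClippedCorpus(corpus, max_docs=2)))        # -> [[(1, 0.5)], [(2, 1.0)]]
print(sum(1 for _ in RepeatCorpusNTimes(corpus, 2)))  # -> 6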
725 | """ 726 | self.corpus = corpus 727 | self.slice_ = slice_ 728 | self.length = None 729 | 730 | def __iter__(self): 731 | if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0: 732 | return (self.corpus.docbyoffset(i) for i in 733 | self.corpus.index[self.slice_]) 734 | else: 735 | return itertools.islice(self.corpus, self.slice_.start, 736 | self.slice_.stop, self.slice_.step) 737 | 738 | def __len__(self): 739 | # check cached length, calculate if needed 740 | if self.length is None: 741 | if isinstance(self.slice_, (list, np.ndarray)): 742 | self.length = len(self.slice_) 743 | else: 744 | self.length = sum(1 for x in self) 745 | 746 | return self.length 747 | 748 | 749 | def safe_unichr(intval): 750 | try: 751 | return unichr(intval) 752 | except ValueError: 753 | # ValueError: unichr() arg not in range(0x10000) (narrow Python build) 754 | s = "\\U%08x" % intval 755 | # return UTF16 surrogate pair 756 | return s.decode('unicode-escape') 757 | 758 | 759 | def decode_htmlentities(text): 760 | """ 761 | Decode HTML entities in text, coded as hex, decimal or named. 762 | 763 | Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py 764 | 765 | >>> u = u'E tu vivrai nel terrore - L'aldilà (1981)' 766 | >>> print(decode_htmlentities(u).encode('UTF-8')) 767 | E tu vivrai nel terrore - L'aldilà (1981) 768 | >>> print(decode_htmlentities("l'eau")) 769 | l'eau 770 | >>> print(decode_htmlentities("foo < bar")) 771 | foo < bar 772 | 773 | """ 774 | def substitute_entity(match): 775 | try: 776 | ent = match.group(3) 777 | if match.group(1) == "#": 778 | # decoding by number 779 | if match.group(2) == '': 780 | # number is in decimal 781 | return safe_unichr(int(ent)) 782 | elif match.group(2) in ['x', 'X']: 783 | # number is in hex 784 | return safe_unichr(int(ent, 16)) 785 | else: 786 | # they were using a name 787 | cp = n2cp.get(ent) 788 | if cp: 789 | return safe_unichr(cp) 790 | else: 791 | return match.group() 792 | except: 793 | # in case of errors, return original input 794 | return match.group() 795 | 796 | return RE_HTML_ENTITY.sub(substitute_entity, text) 797 | 798 | 799 | def chunkize_serial(iterable, chunksize, as_numpy=False): 800 | """ 801 | Return elements from the iterable in `chunksize`-ed lists. The last returned 802 | element may be smaller (if length of collection is not divisible by `chunksize`). 
803 | 804 | >>> print(list(grouper(range(10), 3))) 805 | [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] 806 | 807 | """ 808 | it = iter(iterable) 809 | while True: 810 | if as_numpy: 811 | # convert each document to a 2d numpy array (~6x faster when transmitting 812 | # chunk data over the wire, in Pyro) 813 | wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]] 814 | else: 815 | wrapped_chunk = [list(itertools.islice(it, int(chunksize)))] 816 | if not wrapped_chunk[0]: 817 | break 818 | # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference 819 | yield wrapped_chunk.pop() 820 | 821 | grouper = chunkize_serial 822 | 823 | 824 | class InputQueue(multiprocessing.Process): 825 | def __init__(self, q, corpus, chunksize, maxsize, as_numpy): 826 | super(InputQueue, self).__init__() 827 | self.q = q 828 | self.maxsize = maxsize 829 | self.corpus = corpus 830 | self.chunksize = chunksize 831 | self.as_numpy = as_numpy 832 | 833 | def run(self): 834 | it = iter(self.corpus) 835 | while True: 836 | chunk = itertools.islice(it, self.chunksize) 837 | if self.as_numpy: 838 | # HACK XXX convert documents to numpy arrays, to save memory. 839 | # This also gives a scipy warning at runtime: 840 | # "UserWarning: indices array has non-integer dtype (float64)" 841 | wrapped_chunk = [[np.asarray(doc) for doc in chunk]] 842 | else: 843 | wrapped_chunk = [list(chunk)] 844 | 845 | if not wrapped_chunk[0]: 846 | self.q.put(None, block=True) 847 | break 848 | 849 | try: 850 | qsize = self.q.qsize() 851 | except NotImplementedError: 852 | qsize = '?' 853 | logger.debug("prepared another chunk of %i documents (qsize=%s)" % 854 | (len(wrapped_chunk[0]), qsize)) 855 | self.q.put(wrapped_chunk.pop(), block=True) 856 | #endclass InputQueue 857 | 858 | 859 | if os.name == 'nt': 860 | warnings.warn("detected Windows; aliasing chunkize to chunkize_serial") 861 | 862 | def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): 863 | for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): 864 | yield chunk 865 | else: 866 | def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): 867 | """ 868 | Split a stream of values into smaller chunks. 869 | Each chunk is of length `chunksize`, except the last one which may be smaller. 870 | A once-only input stream (`corpus` from a generator) is ok, chunking is done 871 | efficiently via itertools. 872 | 873 | If `maxsize > 1`, don't wait idly in between successive chunk `yields`, but 874 | rather keep filling a short queue (of size at most `maxsize`) with forthcoming 875 | chunks in advance. This is realized by starting a separate process, and is 876 | meant to reduce I/O delays, which can be significant when `corpus` comes 877 | from a slow medium (like harddisk). 878 | 879 | If `maxsize==0`, don't fool around with parallelism and simply yield the chunksize 880 | via `chunkize_serial()` (no I/O optimizations). 
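# Illustrative sketch, not part of the original utils.py: with maxsize > 0 (and
# not on Windows), chunkize() hands the corpus to a background InputQueue
# process that keeps up to `maxsize` chunks ready, so slow corpus iteration
# overlaps with whatever the caller does per chunk.
corpus = [[(i, 1.0)] for i in range(10)]
if __name__ == '__main__':   # guard needed because a helper process is started
    for chunk in chunkize(corpus, chunksize=4, maxsize=2):
        print(len(chunk))    # -> 4, then 4, then 2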
881 | 882 | >>> for chunk in chunkize(range(10), 4): print(chunk) 883 | [0, 1, 2, 3] 884 | [4, 5, 6, 7] 885 | [8, 9] 886 | 887 | """ 888 | assert chunksize > 0 889 | 890 | if maxsize > 0: 891 | q = multiprocessing.Queue(maxsize=maxsize) 892 | worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy) 893 | worker.daemon = True 894 | worker.start() 895 | while True: 896 | chunk = [q.get(block=True)] 897 | if chunk[0] is None: 898 | break 899 | yield chunk.pop() 900 | else: 901 | for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): 902 | yield chunk 903 | 904 | 905 | def smart_extension(fname, ext): 906 | fname, oext = os.path.splitext(fname) 907 | if oext.endswith('.bz2'): 908 | fname = fname + oext[:-4] + ext + '.bz2' 909 | elif oext.endswith('.gz'): 910 | fname = fname + oext[:-3] + ext + '.gz' 911 | else: 912 | fname = fname + oext + ext 913 | 914 | return fname 915 | 916 | 917 | def pickle(obj, fname, protocol=2): 918 | """Pickle object `obj` to file `fname`. 919 | 920 | `protocol` defaults to 2 so pickled objects are compatible across 921 | Python 2.x and 3.x. 922 | 923 | """ 924 | with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows 925 | _pickle.dump(obj, fout, protocol=protocol) 926 | 927 | 928 | def unpickle(fname): 929 | """Load pickled object from `fname`""" 930 | with smart_open(fname, 'rb') as f: 931 | # Because of loading from S3 load can't be used (missing readline in smart_open) 932 | if sys.version_info > (3, 0): 933 | return _pickle.load(f, encoding='latin1') 934 | else: 935 | return _pickle.loads(f.read()) 936 | 937 | 938 | def revdict(d): 939 | """ 940 | Reverse a dictionary mapping. 941 | 942 | When two keys map to the same value, only one of them will be kept in the 943 | result (which one is kept is arbitrary). 944 | 945 | """ 946 | return dict((v, k) for (k, v) in iteritems(dict(d))) 947 | 948 | 949 | def toptexts(query, texts, index, n=10): 950 | """ 951 | Debug fnc to help inspect the top `n` most similar documents (according to a 952 | similarity index `index`), to see if they are actually related to the query. 953 | 954 | `texts` is any object that can return something insightful for each document 955 | via `texts[docid]`, such as its fulltext or snippet. 956 | 957 | Return a list of 3-tuples (docid, doc's similarity to the query, texts[docid]). 958 | 959 | """ 960 | sims = index[query] # perform a similarity query against the corpus 961 | sims = sorted(enumerate(sims), key=lambda item: -item[1]) 962 | 963 | result = [] 964 | for topid, topcosine in sims[:n]: # only consider top-n most similar docs 965 | result.append((topid, topcosine, texts[topid])) 966 | return result 967 | 968 | 969 | def randfname(prefix='gensim'): 970 | randpart = hex(random.randint(0, 0xffffff))[2:] 971 | return os.path.join(tempfile.gettempdir(), prefix + randpart) 972 | 973 | 974 | def upload_chunked(server, docs, chunksize=1000, preprocess=None): 975 | """ 976 | Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy). 977 | 978 | Use this function to train or index large collections -- avoid sending the 979 | entire corpus over the wire as a single Pyro in-memory object. The documents 980 | will be sent in smaller chunks, of `chunksize` documents each. 
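# Illustrative sketch, not part of the original utils.py: smart_extension()
# (defined in this module) inserts an extra extension while keeping a trailing
# .gz/.bz2 compression suffix in place. The file names are hypothetical.
print(smart_extension('corpus.mm.gz', '.index'))   # -> corpus.mm.index.gz
print(smart_extension('corpus.mm', '.index'))      # -> corpus.mm.index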
981 | 982 | """ 983 | start = 0 984 | for chunk in grouper(docs, chunksize): 985 | end = start + len(chunk) 986 | logger.info("uploading documents %i-%i" % (start, end - 1)) 987 | if preprocess is not None: 988 | pchunk = [] 989 | for doc in chunk: 990 | doc['tokens'] = preprocess(doc['text']) 991 | del doc['text'] 992 | pchunk.append(doc) 993 | chunk = pchunk 994 | server.buffer(chunk) 995 | start = end 996 | 997 | 998 | def getNS(host=None, port=None, broadcast=True, hmac_key=None): 999 | """ 1000 | Return a Pyro name server proxy. 1001 | """ 1002 | import Pyro4 1003 | try: 1004 | return Pyro4.locateNS(host, port, broadcast, hmac_key) 1005 | except Pyro4.errors.NamingError: 1006 | raise RuntimeError("Pyro name server not found") 1007 | 1008 | 1009 | def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}): 1010 | """ 1011 | Register object with name server (starting the name server if not running 1012 | yet) and block until the daemon is terminated. The object is registered under 1013 | `name`, or `name`+ some random suffix if `random_suffix` is set. 1014 | 1015 | """ 1016 | if random_suffix: 1017 | name += '.' + hex(random.randint(0, 0xffffff))[2:] 1018 | import Pyro4 1019 | with getNS(**ns_conf) as ns: 1020 | with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon: 1021 | # register server for remote access 1022 | uri = daemon.register(obj, name) 1023 | ns.remove(name) 1024 | ns.register(name, uri) 1025 | logger.info("%s registered with nameserver (URI '%s')" % (name, uri)) 1026 | daemon.requestLoop() 1027 | 1028 | 1029 | def has_pattern(): 1030 | """ 1031 | Function which returns a flag indicating whether pattern is installed or not 1032 | """ 1033 | try: 1034 | from pattern.en import parse 1035 | return True 1036 | except ImportError: 1037 | return False 1038 | 1039 | 1040 | def lemmatize( 1041 | content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False, 1042 | stopwords=frozenset(), min_length=2, max_length=15): 1043 | """ 1044 | This function is only available when the optional 'pattern' package is installed. 1045 | 1046 | Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in 1047 | their base form=lemma, e.g. "are, is, being" -> "be" etc. 1048 | This is a smarter version of stemming, taking word context into account. 1049 | 1050 | Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). 1051 | 1052 | >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') 1053 | ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] 1054 | 1055 | >>> lemmatize('The study ranks high.') 1056 | ['study/NN', 'rank/VB', 'high/JJ'] 1057 | 1058 | >>> lemmatize('The ranks study hard.') 1059 | ['rank/NN', 'study/VB', 'hard/RB'] 1060 | 1061 | """ 1062 | if not has_pattern(): 1063 | raise ImportError("Pattern library is not installed. Pattern library is needed in order to use lemmatize function") 1064 | from pattern.en import parse 1065 | 1066 | if light: 1067 | import warnings 1068 | warnings.warn("The light flag is no longer supported by pattern.") 1069 | 1070 | # tokenization in `pattern` is weird; it gets thrown off by non-letters, 1071 | # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little 1072 | # FIXME this throws away all fancy parsing cues, including sentence structure, 1073 | # abbreviations etc. 
1074 | content = u(' ').join(tokenize(content, lower=True, errors='ignore')) 1075 | 1076 | parsed = parse(content, lemmata=True, collapse=False) 1077 | result = [] 1078 | for sentence in parsed: 1079 | for token, tag, _, _, lemma in sentence: 1080 | if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords: 1081 | if allowed_tags.match(tag): 1082 | lemma += "/" + tag[:2] 1083 | result.append(lemma.encode('utf8')) 1084 | return result 1085 | 1086 | 1087 | def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0): 1088 | """ 1089 | Create a random gensim sparse vector. Each coordinate is nonzero with 1090 | probability `prob_nnz`, each non-zero coordinate value is drawn from 1091 | a Poisson distribution with parameter lambda equal to `lam`. 1092 | 1093 | """ 1094 | nnz = np.random.uniform(size=(dim,)) 1095 | data = [(i, float(np.random.poisson(lam=lam) + 1.0)) 1096 | for i in xrange(dim) if nnz[i] < prob_nnz] 1097 | return data 1098 | 1099 | 1100 | def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0): 1101 | """ 1102 | Create a random gensim-style corpus, as a list of lists of (int, float) tuples, 1103 | to be used as a mock corpus. 1104 | 1105 | """ 1106 | data = [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) 1107 | for _ in xrange(n_items)] 1108 | return data 1109 | 1110 | 1111 | def prune_vocab(vocab, min_reduce, trim_rule=None): 1112 | """ 1113 | Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`. 1114 | 1115 | Modifies `vocab` in place, returns the sum of all counts that were pruned. 1116 | 1117 | """ 1118 | result = 0 1119 | old_len = len(vocab) 1120 | for w in list(vocab): # make a copy of dict's keys 1121 | if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: 1122 | result += vocab[w] 1123 | del vocab[w] 1124 | logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", 1125 | old_len - len(vocab), min_reduce, old_len, len(vocab)) 1126 | return result 1127 | 1128 | 1129 | def qsize(queue): 1130 | """Return the (approximate) queue size where available; -1 where not (OS X).""" 1131 | try: 1132 | return queue.qsize() 1133 | except NotImplementedError: 1134 | # OS X doesn't support qsize 1135 | return -1 1136 | 1137 | RULE_DEFAULT = 0 1138 | RULE_DISCARD = 1 1139 | RULE_KEEP = 2 1140 | 1141 | 1142 | def keep_vocab_item(word, count, min_count, trim_rule=None): 1143 | default_res = count >= min_count 1144 | 1145 | if trim_rule is None: 1146 | return default_res 1147 | else: 1148 | rule_res = trim_rule(word, count, min_count) 1149 | if rule_res == RULE_KEEP: 1150 | return True 1151 | elif rule_res == RULE_DISCARD: 1152 | return False 1153 | else: 1154 | return default_res 1155 | 1156 | 1157 | def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): 1158 | """ 1159 | Run command with arguments and return its output as a byte string. 1160 | Backported from Python 2.7 as it's implemented as pure python on stdlib. 
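# Illustrative sketch, not part of the original utils.py: a custom trim_rule as
# consumed by keep_vocab_item() and prune_vocab(). A rule may force-keep or
# force-discard a word, or fall back to the `count >= min_count` test by
# returning RULE_DEFAULT. The policy below is hypothetical.
def _my_trim_rule(word, count, min_count):
    if word.startswith('#'):
        return RULE_DISCARD        # always drop hashtag-like tokens
    return RULE_DEFAULT            # otherwise defer to the min_count test

print(keep_vocab_item('#tag', 100, 5, trim_rule=_my_trim_rule))   # -> False
print(keep_vocab_item('word', 100, 5, trim_rule=_my_trim_rule))   # -> True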
1161 | 1162 | >>> check_output(args=['/usr/bin/python', '--version']) 1163 | Python 2.6.2 1164 | Added extra KeyboardInterrupt handling 1165 | """ 1166 | try: 1167 | logger.debug("COMMAND: %s %s", popenargs, kwargs) 1168 | process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs) 1169 | output, unused_err = process.communicate() 1170 | retcode = process.poll() 1171 | if retcode: 1172 | cmd = kwargs.get("args") 1173 | if cmd is None: 1174 | cmd = popenargs[0] 1175 | error = subprocess.CalledProcessError(retcode, cmd) 1176 | error.output = output 1177 | raise error 1178 | return output 1179 | except KeyboardInterrupt: 1180 | process.terminate() 1181 | raise 1182 | 1183 | 1184 | def sample_dict(d, n=10, use_random=True): 1185 | """ 1186 | Pick `n` items from dictionary `d` and return them as a list. 1187 | The items are picked randomly if `use_random` is True, otherwise picked 1188 | according to natural dict iteration. 1189 | """ 1190 | selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) 1191 | return [(key, d[key]) for key in selected_keys] 1192 | -------------------------------------------------------------------------------- /word2vec/voidptr.h: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | 3 | #if PY_VERSION_HEX >= 0x03020000 4 | 5 | /* 6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore 7 | */ 8 | static void * PyCObject_AsVoidPtr(PyObject *obj) 9 | { 10 | void *ret = PyCapsule_GetPointer(obj, NULL); 11 | if (ret == NULL) { 12 | PyErr_Clear(); 13 | } 14 | return ret; 15 | } 16 | 17 | #endif -------------------------------------------------------------------------------- /word2vec/word2vec_inner.pxd: -------------------------------------------------------------------------------- 1 | # 2 | # shared type definitions for word2vec_inner 3 | # used by both word2vec_inner.pyx (automatically) and doc2vec_inner.pyx (by explicit cimport) 4 | # 5 | # Copyright (C) 2013 Radim Rehurek 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 7 | 8 | cdef extern from "voidptr.h": 9 | void* PyCObject_AsVoidPtr(object obj) 10 | 11 | cimport numpy as np 12 | ctypedef np.float32_t REAL_t 13 | 14 | # BLAS routine signatures 15 | ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil 16 | ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 17 | ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 18 | ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 19 | ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil 20 | ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil 21 | 22 | cdef scopy_ptr scopy 23 | cdef saxpy_ptr saxpy 24 | cdef sdot_ptr sdot 25 | cdef dsdot_ptr dsdot 26 | cdef snrm2_ptr snrm2 27 | cdef sscal_ptr sscal 28 | 29 | # precalculated sigmoid table 30 | DEF EXP_TABLE_SIZE = 1000 31 | DEF MAX_EXP = 6 32 | cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE 33 | 34 | # function implementations swapped based on BLAS detected in word2vec_inner.pyx init() 35 | ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 36 | ctypedef void (*our_saxpy_ptr)
(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 37 | 38 | cdef our_dot_ptr our_dot 39 | cdef our_saxpy_ptr our_saxpy 40 | 41 | # for when fblas.sdot returns a double 42 | cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 43 | 44 | # for when fblas.sdot returns a float 45 | cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 46 | 47 | # for when no blas available 48 | cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil 49 | cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil 50 | 51 | # to support random draws from negative-sampling cum_table 52 | cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil 53 | 54 | cdef unsigned long long random_int32(unsigned long long *next_random) nogil 55 | -------------------------------------------------------------------------------- /word2vec/word2vec_inner.pyx: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cython 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | # cython: cdivision=True 5 | # coding: utf-8 6 | # 7 | # Copyright (C) 2013 Radim Rehurek 8 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 9 | 10 | import cython 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | from libc.math cimport exp 15 | from libc.math cimport log 16 | from libc.string cimport memset 17 | 18 | # scipy <= 0.15 19 | try: 20 | from scipy.linalg.blas import fblas 21 | except ImportError: 22 | # in scipy > 0.15, fblas function has been removed 23 | import scipy.linalg.blas as fblas 24 | 25 | REAL = np.float32 26 | 27 | DEF MAX_SENTENCE_LEN = 10000 28 | 29 | cdef scopy_ptr scopy=PyCObject_AsVoidPtr(fblas.scopy._cpointer) # y = x 30 | cdef saxpy_ptr saxpy=PyCObject_AsVoidPtr(fblas.saxpy._cpointer) # y += alpha * x 31 | cdef sdot_ptr sdot=PyCObject_AsVoidPtr(fblas.sdot._cpointer) # float = dot(x, y) 32 | cdef dsdot_ptr dsdot=PyCObject_AsVoidPtr(fblas.sdot._cpointer) # double = dot(x, y) 33 | cdef snrm2_ptr snrm2=PyCObject_AsVoidPtr(fblas.snrm2._cpointer) # sqrt(x^2) 34 | cdef sscal_ptr sscal=PyCObject_AsVoidPtr(fblas.sscal._cpointer) # x = alpha * x 35 | 36 | DEF EXP_TABLE_SIZE = 1000 37 | DEF MAX_EXP = 6 38 | 39 | cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE 40 | cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE 41 | 42 | cdef int ONE = 1 43 | cdef REAL_t ONEF = 1.0 44 | 45 | # for when fblas.sdot returns a double 46 | cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: 47 | return dsdot(N, X, incX, Y, incY) 48 | 49 | # for when fblas.sdot returns a float 50 | cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: 51 | return sdot(N, X, incX, Y, incY) 52 | 53 | # for when no blas available 54 | cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: 55 | # not a true full dot()-implementation: just enough for our cases 56 | cdef int i 57 | cdef REAL_t a 58 | a = 0.0 59 | for i from 0 <= i < N[0] by 1: 60 | a += X[i] * Y[i] 61 | return a 62 | 63 | # for when no blas available 64 | cdef void our_saxpy_noblas(const int *N, const float *alpha, 
const float *X, const int *incX, float *Y, const int *incY) nogil: 65 | cdef int i 66 | for i from 0 <= i < N[0] by 1: 67 | Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])] 68 | 69 | 70 | cdef void fast_sentence_sg_hs( 71 | const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, 72 | REAL_t *syn0, REAL_t *syn1, const int size, 73 | const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *word_locks) nogil: 74 | 75 | cdef long long a, b 76 | cdef long long row1 = word2_index * size, row2 77 | cdef REAL_t f, g 78 | 79 | memset(work, 0, size * cython.sizeof(REAL_t)) 80 | for b in range(codelen): 81 | row2 = word_point[b] * size 82 | f = our_dot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE) 83 | if f <= -MAX_EXP or f >= MAX_EXP: 84 | continue 85 | f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] 86 | g = (1 - word_code[b] - f) * alpha 87 | our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) 88 | our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE) 89 | our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE) 90 | 91 | 92 | # to support random draws from negative-sampling cum_table 93 | cdef inline unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil: 94 | cdef unsigned long long mid 95 | while hi > lo: 96 | mid = (lo + hi) >> 1 97 | if a[mid] >= x: 98 | hi = mid 99 | else: 100 | lo = mid + 1 101 | return lo 102 | 103 | # this quick & dirty RNG apparently matches Java's (non-Secure)Random 104 | # note this function side-effects next_random to set up the next number 105 | cdef inline unsigned long long random_int32(unsigned long long *next_random) nogil: 106 | cdef unsigned long long this_random = next_random[0] >> 16 107 | next_random[0] = (next_random[0] * 25214903917ULL + 11) & 281474976710655ULL 108 | return this_random 109 | 110 | cdef unsigned long long fast_sentence_sg_neg( 111 | const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, 112 | REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index, 113 | const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, 114 | unsigned long long next_random, REAL_t *word_locks) nogil: 115 | 116 | cdef long long a 117 | cdef long long row1 = word2_index * size, row2 118 | cdef unsigned long long modulo = 281474976710655ULL 119 | cdef REAL_t f, g, label 120 | cdef np.uint32_t target_index 121 | cdef int d 122 | 123 | memset(work, 0, size * cython.sizeof(REAL_t)) 124 | 125 | for d in range(negative+1): 126 | if d == 0: 127 | target_index = word_index 128 | label = ONEF 129 | else: 130 | target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) 131 | next_random = (next_random * 25214903917ULL + 11) & modulo 132 | if target_index == word_index: 133 | continue 134 | label = 0.0 135 | 136 | row2 = target_index * size 137 | f = our_dot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE) 138 | if f <= -MAX_EXP or f >= MAX_EXP: 139 | continue 140 | f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] 141 | g = (label - f) * alpha 142 | our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) 143 | our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE) 144 | 145 | our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE) 146 | 147 | return next_random 148 | 149 | cdef unsigned long long fast_sentence_sg_neg_dynamic( 150 | const int negative, np.uint32_t *cum_table, unsigned 
long long cum_table_len, 151 | REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index, 152 | const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, 153 | unsigned long long next_random, REAL_t *word_locks, int eliminate) nogil: 154 | 155 | cdef long long a 156 | cdef long long row1 = word2_index * size, row2 157 | cdef unsigned long long modulo = 281474976710655ULL 158 | cdef REAL_t f, g, label 159 | cdef np.uint32_t target_index 160 | cdef int d 161 | cdef REAL_t neg=-1 162 | 163 | memset(work, 0, size * cython.sizeof(REAL_t)) 164 | 165 | for d in range(negative+1): 166 | if d == 0: 167 | target_index = word_index 168 | label = ONEF 169 | else: 170 | target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) 171 | next_random = (next_random * 25214903917ULL + 11) & modulo 172 | if target_index == word_index: 173 | continue 174 | label = 0.0 175 | 176 | row2 = target_index * size 177 | f = our_dot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE) 178 | if f <= -MAX_EXP or f >= MAX_EXP: 179 | continue 180 | f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] 181 | g = (label - f) * alpha 182 | our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) 183 | our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE) 184 | 185 | if eliminate == 0: 186 | our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE) 187 | else: 188 | sscal(&size, &neg, work, &ONE) 189 | our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE) 190 | 191 | return next_random 192 | 193 | 194 | cdef void fast_sentence_cbow_hs( 195 | const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN], 196 | REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size, 197 | const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work, 198 | int i, int j, int k, int cbow_mean, REAL_t *word_locks) nogil: 199 | 200 | cdef long long a, b 201 | cdef long long row2 202 | cdef REAL_t f, g, count, inv_count = 1.0 203 | cdef int m 204 | 205 | memset(neu1, 0, size * cython.sizeof(REAL_t)) 206 | count = 0.0 207 | for m in range(j, k): 208 | if m == i: 209 | continue 210 | else: 211 | count += ONEF 212 | our_saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE) 213 | if count > (0.5): 214 | inv_count = ONEF/count 215 | if cbow_mean: 216 | sscal(&size, &inv_count, neu1, &ONE) # (does this need BLAS-variants like saxpy?) 217 | 218 | memset(work, 0, size * cython.sizeof(REAL_t)) 219 | for b in range(codelens[i]): 220 | row2 = word_point[b] * size 221 | f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) 222 | if f <= -MAX_EXP or f >= MAX_EXP: 223 | continue 224 | f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] 225 | g = (1 - word_code[b] - f) * alpha 226 | our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) 227 | our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE) 228 | 229 | if not cbow_mean: # divide error over summed window vectors 230 | sscal(&size, &inv_count, work, &ONE) # (does this need BLAS-variants like saxpy?) 
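# (Descriptive comment added for clarity.) The loop below distributes the error
# accumulated in `work` back onto every context word vector in the window
# (rows of syn0), scaled by that word's entry in word_locks; the centre word at
# position i is skipped.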
231 | 232 | for m in range(j, k): 233 | if m == i: 234 | continue 235 | else: 236 | our_saxpy(&size, &word_locks[indexes[m]], work, &ONE, &syn0[indexes[m] * size], &ONE) 237 | 238 | 239 | cdef unsigned long long fast_sentence_cbow_neg( 240 | const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, int codelens[MAX_SENTENCE_LEN], 241 | REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size, 242 | const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work, 243 | int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *word_locks) nogil: 244 | 245 | cdef long long a 246 | cdef long long row2 247 | cdef unsigned long long modulo = 281474976710655ULL 248 | cdef REAL_t f, g, count, inv_count = 1.0, label 249 | cdef np.uint32_t target_index, word_index 250 | cdef int d, m 251 | 252 | word_index = indexes[i] 253 | 254 | memset(neu1, 0, size * cython.sizeof(REAL_t)) 255 | count = 0.0 256 | for m in range(j, k): 257 | if m == i: 258 | continue 259 | else: 260 | count += ONEF 261 | our_saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE) 262 | if count > (0.5): 263 | inv_count = ONEF/count 264 | if cbow_mean: 265 | sscal(&size, &inv_count, neu1, &ONE) # (does this need BLAS-variants like saxpy?) 266 | 267 | memset(work, 0, size * cython.sizeof(REAL_t)) 268 | 269 | for d in range(negative+1): 270 | if d == 0: 271 | target_index = word_index 272 | label = ONEF 273 | else: 274 | target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len) 275 | next_random = (next_random * 25214903917ULL + 11) & modulo 276 | if target_index == word_index: 277 | continue 278 | label = 0.0 279 | 280 | row2 = target_index * size 281 | f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) 282 | if f <= -MAX_EXP or f >= MAX_EXP: 283 | continue 284 | f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] 285 | g = (label - f) * alpha 286 | our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) 287 | our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE) 288 | 289 | if not cbow_mean: # divide error over summed window vectors 290 | sscal(&size, &inv_count, work, &ONE) # (does this need BLAS-variants like saxpy?) 
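# (Descriptive comment added for clarity.) Same pattern as the hierarchical
# softmax routine above: the gradient accumulated in `work` over the positive
# word and the `negative` sampled words is added back into each context word's
# syn0 row, and the updated RNG state is returned so the caller can continue
# the sampling stream.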
291 | 292 | for m in range(j,k): 293 | if m == i: 294 | continue 295 | else: 296 | our_saxpy(&size, &word_locks[indexes[m]], work, &ONE, &syn0[indexes[m]*size], &ONE) 297 | 298 | return next_random 299 | 300 | 301 | def train_batch_sg(model, sentences, alpha, _work): 302 | cdef int hs = model.hs 303 | cdef int negative = model.negative 304 | cdef int sample = (model.sample != 0) 305 | 306 | cdef REAL_t *syn0 = (np.PyArray_DATA(model.wv.syn0)) 307 | cdef REAL_t *word_locks = (np.PyArray_DATA(model.syn0_lockf)) 308 | cdef REAL_t *work 309 | cdef REAL_t _alpha = alpha 310 | cdef int size = model.layer1_size 311 | 312 | cdef int codelens[MAX_SENTENCE_LEN] 313 | cdef np.uint32_t indexes[MAX_SENTENCE_LEN] 314 | cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN] 315 | cdef int sentence_idx[MAX_SENTENCE_LEN + 1] 316 | cdef int sentence_pair_idx[MAX_SENTENCE_LEN + 1] 317 | cdef int window = model.window 318 | 319 | cdef int i, j, k 320 | cdef int effective_words = 0, effective_sentences = 0 321 | cdef int sent_idx, idx_start, idx_end 322 | 323 | # For hierarchical softmax 324 | cdef REAL_t *syn1 325 | cdef np.uint32_t *points[MAX_SENTENCE_LEN] 326 | cdef np.uint8_t *codes[MAX_SENTENCE_LEN] 327 | 328 | # For negative sampling 329 | cdef REAL_t *syn1neg 330 | cdef np.uint32_t *cum_table 331 | cdef unsigned long long cum_table_len 332 | # for sampling (negative and frequent-word downsampling) 333 | cdef unsigned long long next_random 334 | 335 | cdef int w1 336 | cdef int w2 337 | cdef int w_next 338 | cdef int pair_1 339 | cdef int pair_2 340 | cdef int wi 341 | cdef int w_idx 342 | cdef int eliminate 343 | cdef int dynamic 344 | if model.dynamic: 345 | dynamic = 1 346 | else: 347 | dynamic = 0 348 | 349 | if hs: 350 | syn1 = (np.PyArray_DATA(model.syn1)) 351 | 352 | if negative: 353 | syn1neg = (np.PyArray_DATA(model.syn1neg)) 354 | cum_table = (np.PyArray_DATA(model.cum_table)) 355 | cum_table_len = len(model.cum_table) 356 | if negative or sample: 357 | next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) 358 | 359 | # convert Python structures to primitive types, so we can release the GIL 360 | work = np.PyArray_DATA(_work) 361 | 362 | # prepare C structures so we can go "full C" and release the Python GIL 363 | vlookup = model.wv.vocab 364 | sentence_idx[0] = 0 # indices of the first sentence always start at 0 365 | for s in sentences: 366 | sent = model.sentences[s] 367 | if not sent: 368 | continue # ignore empty sentences; leave effective_sentences unchanged 369 | 370 | for token in sent: 371 | word = vlookup[token] if token in vlookup else None 372 | if word is None: 373 | continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window 374 | if sample and word.sample_int < random_int32(&next_random): 375 | continue 376 | indexes[effective_words] = word.index 377 | if hs: 378 | codelens[effective_words] = len(word.code) 379 | codes[effective_words] = np.PyArray_DATA(word.code) 380 | points[effective_words] = np.PyArray_DATA(word.point) 381 | effective_words += 1 382 | if effective_words == MAX_SENTENCE_LEN: 383 | break # TODO: log warning, tally overflow? 384 | 385 | # keep track of which words go into which sentence, so we don't train 386 | # across sentence boundaries. 
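# (Descriptive comment added for clarity.) In the dynamic training branch
# further below, each (centre word i, context word j) position pair is scanned
# for the consecutive word-index pair held in (pair_1, pair_2) occurring
# between positions j and i; when it is found, `eliminate` is set to 1 and
# fast_sentence_sg_neg_dynamic negates the accumulated gradient before applying
# it, so that pair's contribution is unlearned rather than reinforced.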
        # indices of sentence number X are between <sentence_idx[X], sentence_idx[X + 1])
        effective_sentences += 1
        sentence_idx[effective_sentences] = effective_words

        if effective_words == MAX_SENTENCE_LEN:
            break  # TODO: log warning, tally overflow?

    # precompute "reduced window" offsets in a single randint() call
    for i, item in enumerate(model.random.randint(0, window, effective_words)):
        reduced_windows[i] = item

    # release GIL & train on all sentences
    with nogil:
        for sent_idx in range(effective_sentences):
            idx_start = sentence_idx[sent_idx]
            idx_end = sentence_idx[sent_idx + 1]
            for i in range(idx_start, idx_end):
                j = i - window + reduced_windows[i]
                if j < idx_start:
                    j = idx_start
                k = i + window + 1 - reduced_windows[i]
                if k > idx_end:
                    k = idx_end
                if dynamic:
                    # flag a (center, context) pair with `eliminate` when the walk
                    # segment between the two positions traverses (pair_1, pair_2)
                    for j in range(j, k):
                        eliminate = 0
                        if j < i:
                            for wi in range(j, i):
                                w_next = wi + 1
                                w1 = indexes[wi]
                                w2 = indexes[w_next]
                                if w1 == pair_1 and w2 == pair_2:
                                    eliminate = 1
                                    break
                        elif j > i:
                            for wi in range(i, j):
                                w_next = wi + 1
                                w1 = indexes[wi]
                                w2 = indexes[w_next]
                                if w1 == pair_1 and w2 == pair_2:
                                    eliminate = 1
                                    break
                        elif j == i:
                            continue
                        if hs:
                            fast_sentence_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work, word_locks)
                        if negative:
                            next_random = fast_sentence_sg_neg_dynamic(negative, cum_table, cum_table_len, syn0, syn1neg, size, indexes[i], indexes[j], _alpha, work, next_random, word_locks, eliminate)
                else:
                    for j in range(j, k):
                        if j == i:
                            continue
                        if hs:
                            fast_sentence_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work, word_locks)
                        if negative:
                            next_random = fast_sentence_sg_neg(negative, cum_table, cum_table_len, syn0, syn1neg, size, indexes[i], indexes[j], _alpha, work, next_random, word_locks)

    return effective_words


def train_batch_cbow(model, sentences, alpha, _work, _neu1):
    cdef int hs = model.hs
    cdef int negative = model.negative
    cdef int sample = (model.sample != 0)
    cdef int cbow_mean = model.cbow_mean

    cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
    cdef REAL_t *word_locks = <REAL_t *>(np.PyArray_DATA(model.syn0_lockf))
    cdef REAL_t *work
    cdef REAL_t *neu1
    cdef REAL_t _alpha = alpha
    cdef int size = model.layer1_size

    cdef int codelens[MAX_SENTENCE_LEN]
    cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
    cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN]
    cdef int sentence_idx[MAX_SENTENCE_LEN + 1]
    cdef int window = model.window

    cdef int i, j, k
    cdef int effective_words = 0, effective_sentences = 0
    cdef int sent_idx, idx_start, idx_end

    # For hierarchical softmax
    cdef REAL_t *syn1
    cdef np.uint32_t *points[MAX_SENTENCE_LEN]
    cdef np.uint8_t *codes[MAX_SENTENCE_LEN]

    # For negative sampling
    cdef REAL_t *syn1neg
    cdef np.uint32_t *cum_table
    cdef unsigned long long cum_table_len
    # for sampling (negative and frequent-word downsampling)
    cdef unsigned long long next_random

    if hs:
        syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))

    if negative:
        syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
        cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
        cum_table_len = len(model.cum_table)
    if negative or sample:
        next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

    # convert Python structures to primitive types, so we can release the GIL
    work = <REAL_t *>np.PyArray_DATA(_work)
    neu1 = <REAL_t *>np.PyArray_DATA(_neu1)

    # prepare C structures so we can go "full C" and release the Python GIL
    vlookup = model.wv.vocab
    sentence_idx[0] = 0  # indices of the first sentence always start at 0
    for sent in sentences:
        if not sent:
            continue  # ignore empty sentences; leave effective_sentences unchanged
        for token in sent:
            word = vlookup[token] if token in vlookup else None
            if word is None:
                continue  # leaving `effective_words` unchanged = shortening the sentence = expanding the window
            if sample and word.sample_int < random_int32(&next_random):
                continue
            indexes[effective_words] = word.index
            if hs:
                codelens[effective_words] = len(word.code)
                codes[effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
                points[effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
            effective_words += 1
            if effective_words == MAX_SENTENCE_LEN:
                break  # TODO: log warning, tally overflow?

        # keep track of which words go into which sentence, so we don't train
        # across sentence boundaries.
        # indices of sentence number X are between <sentence_idx[X], sentence_idx[X + 1])
        effective_sentences += 1
        sentence_idx[effective_sentences] = effective_words

        if effective_words == MAX_SENTENCE_LEN:
            break  # TODO: log warning, tally overflow?

    # precompute "reduced window" offsets in a single randint() call
    for i, item in enumerate(model.random.randint(0, window, effective_words)):
        reduced_windows[i] = item

    # release GIL & train on all sentences
    with nogil:
        for sent_idx in range(effective_sentences):
            idx_start = sentence_idx[sent_idx]
            idx_end = sentence_idx[sent_idx + 1]
            for i in range(idx_start, idx_end):
                j = i - window + reduced_windows[i]
                if j < idx_start:
                    j = idx_start
                k = i + window + 1 - reduced_windows[i]
                if k > idx_end:
                    k = idx_end
                if hs:
                    fast_sentence_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, _alpha, work, i, j, k, cbow_mean, word_locks)
                if negative:
                    next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks)

    return effective_words


# Score is only implemented for hierarchical softmax
def score_sentence_sg(model, sentence, _work):

    cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
    cdef REAL_t *work
    cdef int size = model.layer1_size

    cdef int codelens[MAX_SENTENCE_LEN]
    cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
    cdef int sentence_len
    cdef int window = model.window

    cdef int i, j, k
    cdef long result = 0

    cdef REAL_t *syn1
    cdef np.uint32_t *points[MAX_SENTENCE_LEN]
    cdef np.uint8_t *codes[MAX_SENTENCE_LEN]

    syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))

    # convert Python structures to primitive types, so we can release the GIL
    work = <REAL_t *>np.PyArray_DATA(_work)

    vlookup = model.wv.vocab
    i = 0
    for token in sentence:
        word = vlookup[token] if token in vlookup else None
        if word is None:
            continue  # should drop the
        indexes[i] = word.index
        codelens[i] = len(word.code)
        codes[i] = <np.uint8_t *>np.PyArray_DATA(word.code)
        points[i] = <np.uint32_t *>np.PyArray_DATA(word.point)
        result += 1
        i += 1
        if i == MAX_SENTENCE_LEN:
            break  # TODO: log warning, tally overflow?
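
    # `i` now counts the in-vocabulary tokens copied into the C-side arrays;
    # the scoring loop below only looks at these entries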
    sentence_len = i

    # release GIL & train on the sentence
    work[0] = 0.0

    with nogil:
        for i in range(sentence_len):
            if codelens[i] == 0:
                continue
            j = i - window
            if j < 0:
                j = 0
            k = i + window + 1
            if k > sentence_len:
                k = sentence_len
            for j in range(j, k):
                if j == i or codelens[j] == 0:
                    continue
                score_pair_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], work)

    return work[0]

cdef void score_pair_sg_hs(
    const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
    REAL_t *syn0, REAL_t *syn1, const int size,
    const np.uint32_t word2_index, REAL_t *work) nogil:

    cdef long long b
    cdef long long row1 = word2_index * size, row2, sgn
    cdef REAL_t f

    for b in range(codelen):
        row2 = word_point[b] * size
        f = our_dot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
        sgn = (-1)**word_code[b]  # ch function: 0 -> 1, 1 -> -1
        f = sgn * f
        if f <= -MAX_EXP or f >= MAX_EXP:
            continue
        f = LOG_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
        work[0] += f

def score_sentence_cbow(model, sentence, _work, _neu1):

    cdef int cbow_mean = model.cbow_mean

    cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
    cdef REAL_t *work
    cdef REAL_t *neu1
    cdef int size = model.layer1_size

    cdef int codelens[MAX_SENTENCE_LEN]
    cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
    cdef int sentence_len
    cdef int window = model.window

    cdef int i, j, k
    cdef long result = 0

    # For hierarchical softmax
    cdef REAL_t *syn1
    cdef np.uint32_t *points[MAX_SENTENCE_LEN]
    cdef np.uint8_t *codes[MAX_SENTENCE_LEN]

    syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))

    # convert Python structures to primitive types, so we can release the GIL
    work = <REAL_t *>np.PyArray_DATA(_work)
    neu1 = <REAL_t *>np.PyArray_DATA(_neu1)

    vlookup = model.wv.vocab
    i = 0
    for token in sentence:
        word = vlookup[token] if token in vlookup else None
        if word is None:
            continue  # for score, should this be a default negative value?
        indexes[i] = word.index
        codelens[i] = len(word.code)
        codes[i] = <np.uint8_t *>np.PyArray_DATA(word.code)
        points[i] = <np.uint32_t *>np.PyArray_DATA(word.point)
        result += 1
        i += 1
        if i == MAX_SENTENCE_LEN:
            break  # TODO: log warning, tally overflow?
    sentence_len = i

    # release GIL & train on the sentence
    work[0] = 0.0
    with nogil:
        for i in range(sentence_len):
            if codelens[i] == 0:
                continue
            j = i - window
            if j < 0:
                j = 0
            k = i + window + 1
            if k > sentence_len:
                k = sentence_len
            score_pair_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, work, i, j, k, cbow_mean)

    return work[0]

cdef void score_pair_cbow_hs(
    const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
    REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
    const np.uint32_t indexes[MAX_SENTENCE_LEN], REAL_t *work,
    int i, int j, int k, int cbow_mean) nogil:

    cdef long long a, b
    cdef long long row2
    cdef REAL_t f, g, count, inv_count, sgn
    cdef int m

    memset(neu1, 0, size * cython.sizeof(REAL_t))
    count = 0.0
    for m in range(j, k):
        if m == i or codelens[m] == 0:
            continue
        else:
            count += ONEF
            our_saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
    if count > (<REAL_t>0.5):
        inv_count = ONEF/count
    if cbow_mean:
        sscal(&size, &inv_count, neu1, &ONE)

    for b in range(codelens[i]):
        row2 = word_point[b] * size
        f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
        sgn = (-1)**word_code[b]  # ch function: 0 -> 1, 1 -> -1
        f = sgn * f
        if f <= -MAX_EXP or f >= MAX_EXP:
            continue
        f = LOG_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
        work[0] += f


def init():
    """
    Precompute function `sigmoid(x) = 1 / (1 + exp(-x))`, for x values discretized
    into table EXP_TABLE.  Also calculate log(sigmoid(x)) into LOG_TABLE.

    """
    global our_dot
    global our_saxpy

    cdef int i
    cdef float *x = [<float>10.0]
    cdef float *y = [<float>0.01]
    cdef float expected = <float>0.1
    cdef int size = 1
    cdef double d_res
    cdef float *p_res

    # build the sigmoid table
    for i in range(EXP_TABLE_SIZE):
        EXP_TABLE[i] = <REAL_t>exp((i / <REAL_t>EXP_TABLE_SIZE * 2 - 1) * MAX_EXP)
        EXP_TABLE[i] = <REAL_t>(EXP_TABLE[i] / (EXP_TABLE[i] + 1))
        LOG_TABLE[i] = <REAL_t>log(EXP_TABLE[i])

    # check whether sdot returns double or float
    d_res = dsdot(&size, x, &ONE, y, &ONE)
    p_res = <float *>&d_res
    if (abs(d_res - expected) < 0.0001):
        our_dot = our_dot_double
        our_saxpy = saxpy
        return 0  # double
    elif (abs(p_res[0] - expected) < 0.0001):
        our_dot = our_dot_float
        our_saxpy = saxpy
        return 1  # float
    else:
        # neither => use cython loops, no BLAS
        # actually, the BLAS is so messed up we'll probably have segfaulted above and never even reach here
        our_dot = our_dot_noblas
        our_saxpy = our_saxpy_noblas
        return 2

FAST_VERSION = init()  # initialize the module
MAX_WORDS_IN_BATCH = MAX_SENTENCE_LEN
--------------------------------------------------------------------------------
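
The `dynamic` branch in `train_batch_sg` above is the main departure from stock gensim: before training a (center, context) skip-gram pair it scans the consecutive node pairs of the random walk lying between the two positions and raises the `eliminate` flag (passed on to `fast_sentence_sg_neg_dynamic`) whenever one of those pairs equals (`pair_1`, `pair_2`). The sketch below restates that check in plain Python; it is illustrative only, and the names `walk`, `center`, `context`, `pair` and `should_eliminate` are hypothetical rather than part of the repository.

def should_eliminate(walk, center, context, pair):
    """Return True if the walk segment between positions `context` and `center`
    traverses the ordered node pair `pair` -- a plain-Python restatement of the
    `eliminate` check in train_batch_sg's dynamic branch."""
    lo, hi = (context, center) if context < center else (center, context)
    for wi in range(lo, hi):                      # consecutive steps of the walk
        if (walk[wi], walk[wi + 1]) == pair:
            return True
    return False

# hypothetical usage: the skip-gram pair at positions (center=4, context=1) is
# flagged because the walk steps through the node pair (7, 5) between them
walk = [3, 7, 7, 5, 2, 9]          # one random walk, as node indexes
print(should_eliminate(walk, center=4, context=1, pair=(7, 5)))   # True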