├── Dynamic node2vec
    ├── dynamic node2vec.py
    └── main.py
├── README.md
├── node2vec
    ├── main.py
    └── node2vec.py
└── word2vec
    ├── dictionary.py
    ├── keyedvectors.py
    ├── matutils.py
    ├── setup.py
    ├── utils.py
    ├── voidptr.h
    ├── word2vec.py
    ├── word2vec_inner.c
    ├── word2vec_inner.pxd
    └── word2vec_inner.pyx


/Dynamic node2vec/dynamic node2vec.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import networkx as nx
  3 | import threadpool
  4 | import multiprocessing
  5 | import random
  6 | 
  7 | 
  8 | def worker(worker_no, walk_length, nodes, graph):
  9 |     walks = []
 10 |     print "worker %s start" % str(worker_no)
 11 |     random.shuffle(nodes)
 12 |     for node in nodes:
 13 |         walks.append(graph.node2vec_walk(walk_length=walk_length, start_node=node))
 14 |     print "worker %s finish" % str(worker_no)
 15 |     return walks
 16 | 
 17 | 
 18 | class Graph():
 19 |     def __init__(self, nx_G, is_directed, p, q):
 20 |         self.G = nx_G
 21 |         self.is_directed = is_directed
 22 |         self.p = p
 23 |         self.q = q
 24 | 
 25 |     def node2vec_walk(self, walk_length, start_node):
 26 |         '''
 27 |         Simulate a random walk starting from start node.
 28 |         '''
 29 |         G = self.G
 30 |         alias_nodes = self.alias_nodes
 31 |         alias_edges = self.alias_edges
 32 | 
 33 |         walk = [start_node]
 34 | 
 35 |         while len(walk) < walk_length:
 36 |             cur = walk[-1]
 37 |             cur_nbrs = sorted(G.neighbors(cur))
 38 |             if len(cur_nbrs) > 0:
 39 |                 if len(walk) == 1:
 40 |                     walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
 41 |                 else:
 42 |                     prev = walk[-2]
 43 |                     next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
 44 |                                                alias_edges[(prev, cur)][1])]
 45 |                     walk.append(next)
 46 |             else:
 47 |                 break
 48 | 
 49 |         return walk
 50 | 
 51 |     def simulate_walks(self, num_walks, walk_length, nodes=None):
 52 |         '''
 53 |         Repeatedly simulate random walks from each node.
 54 |         '''
 55 |         G = self.G
 56 |         if nodes is None:
 57 |             nodes = list(G.nodes())
 58 |             print "nodes count", len(nodes)
 59 | 
 60 |         walks = []
 61 |         for i in range(num_walks):
 62 |             for node in nodes:
 63 |                 walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
 64 |         # result = []
 65 |         # jobs = []
 66 |         # pool = multiprocessing.Pool(processes=num_walks)
 67 |         # for i in range(num_walks):
 68 |         #     jobs.append((i, walk_length, nodes, self))
 69 |         #     # result.append(pool.apply_async(worker, args=(i, walk_length, nodes, self)))
 70 |         # pool.imap(worker, jobs)
 71 |         # pool.close()
 72 |         # pool.join()
 73 |         #
 74 |         # walks = []
 75 |         # for walk in result:
 76 |         #     walks.extend(walk.get())
 77 |         return walks
 78 | 
 79 |     def get_alias_edge(self, src, dst):
 80 |         '''
 81 |         Get the alias edge setup lists for a given edge.
 82 |         '''
 83 |         G = self.G
 84 |         p = self.p
 85 |         q = self.q
 86 | 
 87 |         unnormalized_probs = []
 88 |         for dst_nbr in sorted(G.neighbors(dst)):
 89 |             if dst_nbr == src:
 90 |                 unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
 91 |             elif G.has_edge(dst_nbr, src):
 92 |                 unnormalized_probs.append(G[dst][dst_nbr]['weight'])
 93 |             else:
 94 |                 unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
 95 |         norm_const = sum(unnormalized_probs)
 96 |         normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
 97 | 
 98 |         return alias_setup(normalized_probs)
 99 | 
100 |     def preprocess_transition_probs(self):
101 |         '''
102 |         Preprocessing of transition probabilities for guiding the random walks.
103 |         '''
104 |         G = self.G
105 |         is_directed = self.is_directed
106 | 
107 |         alias_nodes = {}
108 |         for node in G.nodes():
109 |             unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
110 |             norm_const = sum(unnormalized_probs)
111 |             normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
112 |             alias_nodes[node] = alias_setup(normalized_probs)
113 | 
114 |         alias_edges = {}
115 |         triads = {}
116 | 
117 |         if is_directed:
118 |             for edge in G.edges():
119 |                 alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
120 |         else:
121 |             for edge in G.edges():
122 |                 alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
123 |                 alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
124 | 
125 |         self.alias_nodes = alias_nodes
126 |         self.alias_edges = alias_edges
127 | 
128 |         return
129 | 
130 | 
131 | def alias_setup(probs):
132 |     '''
133 |     Compute utility lists for non-uniform sampling from discrete distributions.
134 |     Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
135 |     for details
136 |     '''
137 |     K = len(probs)
138 |     q = np.zeros(K)
139 |     J = np.zeros(K, dtype=np.int)
140 | 
141 |     smaller = []
142 |     larger = []
143 |     for kk, prob in enumerate(probs):
144 |         q[kk] = K * prob
145 |         if q[kk] < 1.0:
146 |             smaller.append(kk)
147 |         else:
148 |             larger.append(kk)
149 | 
150 |     while len(smaller) > 0 and len(larger) > 0:
151 |         small = smaller.pop()
152 |         large = larger.pop()
153 | 
154 |         J[small] = large
155 |         q[large] = q[large] + q[small] - 1.0
156 |         if q[large] < 1.0:
157 |             smaller.append(large)
158 |         else:
159 |             larger.append(large)
160 | 
161 |     return J, q
162 | 
163 | 
164 | def alias_draw(J, q):
165 |     '''
166 |     Draw sample from a non-uniform discrete distribution using alias sampling.
167 |     '''
168 |     K = len(J)
169 | 
170 |     kk = int(np.floor(np.random.rand() * K))
171 |     if np.random.rand() < q[kk]:
172 |         return kk
173 |     else:
174 |         return J[kk]
175 | 


--------------------------------------------------------------------------------
/Dynamic node2vec/main.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Reference implementation of node2vec. 
  3 | 
  4 | Author: Aditya Grover
  5 | 
  6 | For more details, refer to the paper:
  7 | node2vec: Scalable Feature Learning for Networks
  8 | Aditya Grover and Jure Leskovec 
  9 | Knowledge Discovery and Data Mining (KDD), 2016
 10 | '''
 11 | 
 12 | import argparse
 13 | import numpy as np
 14 | import networkx as nx
 15 | import node2vec
 16 | import logging
 17 | import time
 18 | from word2vec.word2vec import Word2Vec
 19 | 
 20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
 21 |                     filename="node2vec.log", filemode="a")
 22 | 
 23 | 
 24 | def parse_args():
 25 |     '''
 26 |     Parses the node2vec arguments.
 27 |     '''
 28 |     parser = argparse.ArgumentParser(description="Run node2vec.")
 29 | 
 30 |     parser.add_argument('--input', nargs='?', default='../output/bn.edgelist',
 31 |                         help='Input graph path')
 32 | 
 33 |     parser.add_argument('--output', nargs='?', default='b.emb',
 34 |                         help='Embeddings path')
 35 | 
 36 |     parser.add_argument('--dimensions', type=int, default=128,
 37 |                         help='Number of dimensions. Default is 128.')
 38 | 
 39 |     parser.add_argument('--walk-length', type=int, default=80,
 40 |                         help='Length of walk per source. Default is 80.')
 41 | 
 42 |     parser.add_argument('--num-walks', type=int, default=10,
 43 |                         help='Number of walks per source. Default is 10.')
 44 | 
 45 |     parser.add_argument('--window-size', type=int, default=10,
 46 |                         help='Context size for optimization. Default is 10.')
 47 | 
 48 |     parser.add_argument('--iter', default=1, type=int,
 49 |                         help='Number of epochs in SGD')
 50 | 
 51 |     parser.add_argument('--workers', type=int, default=32,
 52 |                         help='Number of parallel workers. Default is 8.')
 53 | 
 54 |     parser.add_argument('--p', type=float, default=1,
 55 |                         help='Return hyperparameter. Default is 1.')
 56 | 
 57 |     parser.add_argument('--q', type=float, default=1,
 58 |                         help='Inout hyperparameter. Default is 1.')
 59 | 
 60 |     parser.add_argument('--weighted', dest='weighted', action='store_true',
 61 |                         help='Boolean specifying (un)weighted. Default is unweighted.')
 62 |     parser.add_argument('--unweighted', dest='unweighted', action='store_false')
 63 |     parser.set_defaults(weighted=False)
 64 | 
 65 |     parser.add_argument('--directed', dest='directed', action='store_true',
 66 |                         help='Graph is (un)directed. Default is undirected.')
 67 |     parser.add_argument('--undirected', dest='undirected', action='store_false')
 68 |     parser.add_argument('--dynamic', dest='dynamic', default=True)
 69 |     parser.add_argument('--old_input)', dest='oldinput', default="../output/b.edgelist")
 70 |     parser.add_argument('--old_emb)', dest='oldemb', default="b.emb")
 71 |     parser.set_defaults(directed=False)
 72 | 
 73 |     return parser.parse_args()
 74 | 
 75 | 
 76 | def read_graph(input):
 77 |     '''
 78 |     Reads the input network in networkx.
 79 |     '''
 80 |     if args.weighted:
 81 |         G = nx.read_edgelist(input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph())
 82 |     else:
 83 |         G = nx.read_edgelist(input, nodetype=int, create_using=nx.DiGraph())
 84 |         for edge in G.edges():
 85 |             G[edge[0]][edge[1]]['weight'] = 1
 86 | 
 87 |     if not args.directed:
 88 |         G = G.to_undirected()
 89 | 
 90 |     return G
 91 | 
 92 | 
 93 | def learn_embeddings(walks):
 94 |     '''
 95 |     Learn embeddings by optimizing the Skipgram objective using SGD.
 96 |     '''
 97 |     walks = [map(str, walk) for walk in walks]
 98 |     model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers)
 99 |     model.init_sims(replace=True)
100 |     model.wv.save_word2vec_format(args.output)
101 |     return
102 | 
103 | 
104 | def find_changed_edge(old_g, new_g):
105 |     old_edge = set([(u, old_g.G.edge[u].keys()[0]) for u in old_g.G.edge])
106 |     new_edge = set([(u, new_g.G.edge[u].keys()[0]) for u in new_g.G.edge])
107 | 
108 |     vanish = old_edge - new_edge
109 |     add = new_edge - old_edge
110 | 
111 |     print "-:", len(vanish), "+:", len(add)
112 | 
113 |     return vanish, add
114 | 
115 | 
116 | def find_near_node(pair, G, deep=1):
117 |     node_set = set([])
118 |     new_node_set = set([])
119 |     for i in range(len(pair)):
120 |         node_set.add(pair[i])
121 | 
122 |     for node in node_set:
123 |         for n in G.G.adj[node].keys():
124 |             if n not in node_set:
125 |                 new_node_set.add(n)
126 |     node_set |= new_node_set
127 | 
128 |     for i in range(deep):
129 |         temp_set = set([])
130 |         for node in new_node_set:
131 |             for n in G.G.adj[node].keys():
132 |                 if n not in node_set:
133 |                     node_set.add(n)
134 |                     temp_set.add(n)
135 |         new_node_set = temp_set
136 |     return node_set
137 | 
138 | 
139 | def train_vanish(walks, sent_edge_dict):
140 |     walks = [map(str, walk) for walk in walks]
141 |     if len(walks) > 0:
142 |         model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, sent_edge_dict=sent_edge_dict)
143 |         model.init_sims(replace=True)
144 |         vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab}
145 |         return vec
146 |     else:
147 |         return {}
148 | 
149 | 
150 | def train_add(walks):
151 |     walks = [map(str, walk) for walk in walks]
152 |     if len(walks) > 0:
153 |         model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers)
154 |         model.init_sims(replace=True)
155 |         vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab}
156 |         return vec
157 |     else:
158 |         return {}
159 | 
160 | 
161 | def main(args):
162 |     '''
163 |     Pipeline for representational learning for all nodes in a graph.
164 |     '''
165 |     if args.dynamic == True:
166 |         print "dynamic"
167 |         nx_G_old = read_graph(args.oldinput)
168 |         G_old = node2vec.Graph(nx_G_old, args.directed, args.p, args.q)
169 |         G_old.preprocess_transition_probs()
170 | 
171 |         nx_G_new = read_graph(args.input)
172 |         G_new = node2vec.Graph(nx_G_new, args.directed, args.p, args.q)
173 |         G_new.preprocess_transition_probs()
174 | 
175 |         print "load graph finish"
176 |         print "old graph: nodes:", len(G_old.G.nodes()), "edges:", len(G_old.G.edges())
177 |         print "new graph: nodes:", len(G_new.G.nodes()), "edges:", len(G_new.G.edges())
178 | 
179 |         vanish_edge, add_edge = find_changed_edge(G_old, G_new)
180 | 
181 |         vec = {}
182 |         f = open(args.oldemb, "r")
183 |         for line in f:
184 |             node_vec = line.strip().split(" ")
185 |             if len(node_vec) == args.dimensions + 1:
186 |                 vec[node_vec[0]] = np.array(map(str, node_vec[1:]))
187 |         print "load vec finish"
188 |         walk_vanish = []
189 |         edge_count = 0
190 |         sent_edge_dict = {}
191 |         vanish_dict = {}
192 | 
193 |         for pair in vanish_edge:
194 |             if pair[0] < pair[1]:
195 |                 if pair[0] in vanish_dict:
196 |                     vanish_dict[pair[0]].add(pair[1])
197 |                 else:
198 |                     vanish_dict[pair[0]] = {pair[1]}
199 |             else:
200 |                 if pair[1] in vanish_dict:
201 |                     vanish_dict[pair[1]].add(pair[0])
202 |                 else:
203 |                     vanish_dict[pair[1]] = {pair[0]}
204 |         near_node = set([])
205 |         for pair in vanish_edge:
206 |             near_node |= find_near_node(pair, G_old)
207 |         print "near_node:", len(near_node)
208 |         walks = G_old.simulate_walks(50, 5, nodes=list(near_node))
209 |         print "gen corpus:", len(walks)
210 |         for l in walks:
211 |             p_idx = 0
212 |             flag = 0
213 |             for index in range(len(l) - 1):
214 |                 if l[index] < l[index + 1]:
215 |                     k = l[index]
216 |                     v = l[index + 1]
217 |                 else:
218 |                     k = l[index + 1]
219 |                     v = l[index]
220 | 
221 |                 if k in vanish_dict and v in vanish_dict[k]:
222 |                     if flag == 0:
223 |                         flag = 1
224 |                         p_idx = index
225 |                     elif flag == 1:
226 |                         edge = [l[p_idx], l[p_idx + 1]]
227 |                         if k not in edge or v not in edge:
228 |                             flag = 2
229 |                             break
230 |             if flag == 1:
231 |                 walk_vanish.append(l)
232 |                 sent_edge_dict[edge_count] = p_idx
233 |                 edge_count += 1
234 | 
235 |         print "vanish corpus:", len(walk_vanish)
236 |         update_vec = train_vanish(walk_vanish, sent_edge_dict)
237 | 
238 |         for node in update_vec:
239 |             if node in G_new.G.node:
240 |                 vec[node] = update_vec[node]
241 |             else:
242 |                 del vec[node]
243 |         print "update vec"
244 | 
245 |         near_node = set([])
246 |         for pair in add_edge:
247 |             near_node |= find_near_node(pair, G_new)
248 |         walks = G_new.simulate_walks(50, 5, nodes=list(near_node))
249 |         print "gen add corpus:", len(walks)
250 |         update_vec = train_add(walks)
251 |         for node in update_vec:
252 |             vec[node] = update_vec[node]
253 |         print "update vec"
254 | 
255 |         f = open(args.output, "a")
256 |         f.truncate()
257 |         for k in vec:
258 |             f.write(k + " " + " ".join(map(str, vec[k])) + "\n")
259 |     else:
260 |         nx_G = read_graph(args.input)
261 |         G = node2vec.Graph(nx_G, args.directed, args.p, args.q)
262 |         print "load graph finish"
263 |         G.preprocess_transition_probs()
264 |         walks = G.simulate_walks(args.num_walks, args.walk_length)
265 |         print "gen corpus", len(walks)
266 |         learn_embeddings(walks)
267 |         print "finish"
268 | 
269 | if __name__ == "__main__":
270 |     args = parse_args()
271 |     # args.dynamic = eval(args.dynamic)
272 |     print type(args.dynamic), args.input, args.oldemb, args.output
273 |     a = time.time()
274 |     main(args)
275 |     b = time.time()
276 |     logging.info(str(b - a))
277 | 
278 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dynamic_network_embedding


--------------------------------------------------------------------------------
/node2vec/main.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Reference implementation of node2vec. 
  3 | 
  4 | Author: Aditya Grover
  5 | 
  6 | For more details, refer to the paper:
  7 | node2vec: Scalable Feature Learning for Networks
  8 | Aditya Grover and Jure Leskovec 
  9 | Knowledge Discovery and Data Mining (KDD), 2016
 10 | '''
 11 | 
 12 | import argparse
 13 | import numpy as np
 14 | import networkx as nx
 15 | import node2vec
 16 | import logging
 17 | import time
 18 | from word2vec.word2vec import Word2Vec
 19 | 
 20 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
 21 |                     filename="node2vec.log", filemode="a")
 22 | 
 23 | 
 24 | def parse_args():
 25 |     '''
 26 |     Parses the node2vec arguments.
 27 |     '''
 28 |     parser = argparse.ArgumentParser(description="Run node2vec.")
 29 | 
 30 |     parser.add_argument('--input', nargs='?', default='../output/bn.edgelist',
 31 |                         help='Input graph path')
 32 | 
 33 |     parser.add_argument('--output', nargs='?', default='b.emb',
 34 |                         help='Embeddings path')
 35 | 
 36 |     parser.add_argument('--dimensions', type=int, default=128,
 37 |                         help='Number of dimensions. Default is 128.')
 38 | 
 39 |     parser.add_argument('--walk-length', type=int, default=80,
 40 |                         help='Length of walk per source. Default is 80.')
 41 | 
 42 |     parser.add_argument('--num-walks', type=int, default=10,
 43 |                         help='Number of walks per source. Default is 10.')
 44 | 
 45 |     parser.add_argument('--window-size', type=int, default=10,
 46 |                         help='Context size for optimization. Default is 10.')
 47 | 
 48 |     parser.add_argument('--iter', default=1, type=int,
 49 |                         help='Number of epochs in SGD')
 50 | 
 51 |     parser.add_argument('--workers', type=int, default=32,
 52 |                         help='Number of parallel workers. Default is 8.')
 53 | 
 54 |     parser.add_argument('--p', type=float, default=1,
 55 |                         help='Return hyperparameter. Default is 1.')
 56 | 
 57 |     parser.add_argument('--q', type=float, default=1,
 58 |                         help='Inout hyperparameter. Default is 1.')
 59 | 
 60 |     parser.add_argument('--weighted', dest='weighted', action='store_true',
 61 |                         help='Boolean specifying (un)weighted. Default is unweighted.')
 62 |     parser.add_argument('--unweighted', dest='unweighted', action='store_false')
 63 |     parser.set_defaults(weighted=False)
 64 | 
 65 |     parser.add_argument('--directed', dest='directed', action='store_true',
 66 |                         help='Graph is (un)directed. Default is undirected.')
 67 |     parser.add_argument('--undirected', dest='undirected', action='store_false')
 68 |     parser.add_argument('--dynamic', dest='dynamic', default=True)
 69 |     parser.add_argument('--old_input)', dest='oldinput', default="../output/b.edgelist")
 70 |     parser.add_argument('--old_emb)', dest='oldemb', default="b.emb")
 71 |     parser.set_defaults(directed=False)
 72 | 
 73 |     return parser.parse_args()
 74 | 
 75 | 
 76 | def read_graph(input):
 77 |     '''
 78 |     Reads the input network in networkx.
 79 |     '''
 80 |     if args.weighted:
 81 |         G = nx.read_edgelist(input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph())
 82 |     else:
 83 |         G = nx.read_edgelist(input, nodetype=int, create_using=nx.DiGraph())
 84 |         for edge in G.edges():
 85 |             G[edge[0]][edge[1]]['weight'] = 1
 86 | 
 87 |     if not args.directed:
 88 |         G = G.to_undirected()
 89 | 
 90 |     return G
 91 | 
 92 | 
 93 | def learn_embeddings(walks):
 94 |     '''
 95 |     Learn embeddings by optimizing the Skipgram objective using SGD.
 96 |     '''
 97 |     walks = [map(str, walk) for walk in walks]
 98 |     model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers)
 99 |     model.init_sims(replace=True)
100 |     model.wv.save_word2vec_format(args.output)
101 |     return
102 | 
103 | 
104 | def find_changed_edge(old_g, new_g):
105 |     old_edge = set([(u, old_g.G.edge[u].keys()[0]) for u in old_g.G.edge])
106 |     new_edge = set([(u, new_g.G.edge[u].keys()[0]) for u in new_g.G.edge])
107 | 
108 |     vanish = old_edge - new_edge
109 |     add = new_edge - old_edge
110 | 
111 |     print "-:", len(vanish), "+:", len(add)
112 | 
113 |     return vanish, add
114 | 
115 | 
116 | def find_near_node(pair, G, deep=1):
117 |     node_set = set([])
118 |     new_node_set = set([])
119 |     for i in range(len(pair)):
120 |         node_set.add(pair[i])
121 | 
122 |     for node in node_set:
123 |         for n in G.G.adj[node].keys():
124 |             if n not in node_set:
125 |                 new_node_set.add(n)
126 |     node_set |= new_node_set
127 | 
128 |     for i in range(deep):
129 |         temp_set = set([])
130 |         for node in new_node_set:
131 |             for n in G.G.adj[node].keys():
132 |                 if n not in node_set:
133 |                     node_set.add(n)
134 |                     temp_set.add(n)
135 |         new_node_set = temp_set
136 |     return node_set
137 | 
138 | 
139 | def train_vanish(walks, sent_edge_dict):
140 |     walks = [map(str, walk) for walk in walks]
141 |     if len(walks) > 0:
142 |         model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, sent_edge_dict=sent_edge_dict)
143 |         model.init_sims(replace=True)
144 |         vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab}
145 |         return vec
146 |     else:
147 |         return {}
148 | 
149 | 
150 | def train_add(walks):
151 |     walks = [map(str, walk) for walk in walks]
152 |     if len(walks) > 0:
153 |         model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers)
154 |         model.init_sims(replace=True)
155 |         vec = {word: model.wv.syn0[model.wv.vocab[word].index] for word in model.wv.vocab}
156 |         return vec
157 |     else:
158 |         return {}
159 | 
160 | 
161 | def main(args):
162 |     '''
163 |     Pipeline for representational learning for all nodes in a graph.
164 |     '''
165 |     if args.dynamic == True:
166 |         print "dynamic"
167 |         nx_G_old = read_graph(args.oldinput)
168 |         G_old = node2vec.Graph(nx_G_old, args.directed, args.p, args.q)
169 |         G_old.preprocess_transition_probs()
170 | 
171 |         nx_G_new = read_graph(args.input)
172 |         G_new = node2vec.Graph(nx_G_new, args.directed, args.p, args.q)
173 |         G_new.preprocess_transition_probs()
174 | 
175 |         print "load graph finish"
176 |         print "old graph: nodes:", len(G_old.G.nodes()), "edges:", len(G_old.G.edges())
177 |         print "new graph: nodes:", len(G_new.G.nodes()), "edges:", len(G_new.G.edges())
178 | 
179 |         vanish_edge, add_edge = find_changed_edge(G_old, G_new)
180 | 
181 |         vec = {}
182 |         f = open(args.oldemb, "r")
183 |         for line in f:
184 |             node_vec = line.strip().split(" ")
185 |             if len(node_vec) == args.dimensions + 1:
186 |                 vec[node_vec[0]] = np.array(map(str, node_vec[1:]))
187 |         print "load vec finish"
188 |         walk_vanish = []
189 |         edge_count = 0
190 |         sent_edge_dict = {}
191 |         vanish_dict = {}
192 | 
193 |         for pair in vanish_edge:
194 |             if pair[0] < pair[1]:
195 |                 if pair[0] in vanish_dict:
196 |                     vanish_dict[pair[0]].add(pair[1])
197 |                 else:
198 |                     vanish_dict[pair[0]] = {pair[1]}
199 |             else:
200 |                 if pair[1] in vanish_dict:
201 |                     vanish_dict[pair[1]].add(pair[0])
202 |                 else:
203 |                     vanish_dict[pair[1]] = {pair[0]}
204 |         near_node = set([])
205 |         for pair in vanish_edge:
206 |             near_node |= find_near_node(pair, G_old)
207 |         print "near_node:", len(near_node)
208 |         walks = G_old.simulate_walks(50, 5, nodes=list(near_node))
209 |         print "gen corpus:", len(walks)
210 |         for l in walks:
211 |             p_idx = 0
212 |             flag = 0
213 |             for index in range(len(l) - 1):
214 |                 if l[index] < l[index + 1]:
215 |                     k = l[index]
216 |                     v = l[index + 1]
217 |                 else:
218 |                     k = l[index + 1]
219 |                     v = l[index]
220 | 
221 |                 if k in vanish_dict and v in vanish_dict[k]:
222 |                     if flag == 0:
223 |                         flag = 1
224 |                         p_idx = index
225 |                     elif flag == 1:
226 |                         edge = [l[p_idx], l[p_idx + 1]]
227 |                         if k not in edge or v not in edge:
228 |                             flag = 2
229 |                             break
230 |             if flag == 1:
231 |                 walk_vanish.append(l)
232 |                 sent_edge_dict[edge_count] = p_idx
233 |                 edge_count += 1
234 | 
235 |         print "vanish corpus:", len(walk_vanish)
236 |         update_vec = train_vanish(walk_vanish, sent_edge_dict)
237 | 
238 |         for node in update_vec:
239 |             if node in G_new.G.node:
240 |                 vec[node] = update_vec[node]
241 |             else:
242 |                 del vec[node]
243 |         print "update vec"
244 | 
245 |         near_node = set([])
246 |         for pair in add_edge:
247 |             near_node |= find_near_node(pair, G_new)
248 |         walks = G_new.simulate_walks(50, 5, nodes=list(near_node))
249 |         print "gen add corpus:", len(walks)
250 |         update_vec = train_add(walks)
251 |         for node in update_vec:
252 |             vec[node] = update_vec[node]
253 |         print "update vec"
254 | 
255 |         f = open(args.output, "a")
256 |         f.truncate()
257 |         for k in vec:
258 |             f.write(k + " " + " ".join(map(str, vec[k])) + "\n")
259 |     else:
260 |         nx_G = read_graph(args.input)
261 |         G = node2vec.Graph(nx_G, args.directed, args.p, args.q)
262 |         print "load graph finish"
263 |         G.preprocess_transition_probs()
264 |         walks = G.simulate_walks(args.num_walks, args.walk_length)
265 |         print "gen corpus", len(walks)
266 |         learn_embeddings(walks)
267 |         print "finish"
268 | 
269 | if __name__ == "__main__":
270 |     args = parse_args()
271 |     # args.dynamic = eval(args.dynamic)
272 |     print type(args.dynamic), args.input, args.oldemb, args.output
273 |     a = time.time()
274 |     main(args)
275 |     b = time.time()
276 |     logging.info(str(b - a))
277 | 
278 | 


--------------------------------------------------------------------------------
/node2vec/node2vec.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import networkx as nx
  3 | import threadpool
  4 | import multiprocessing
  5 | import random
  6 | 
  7 | 
  8 | def worker(worker_no, walk_length, nodes, graph):
  9 |     walks = []
 10 |     print "worker %s start" % str(worker_no)
 11 |     random.shuffle(nodes)
 12 |     for node in nodes:
 13 |         walks.append(graph.node2vec_walk(walk_length=walk_length, start_node=node))
 14 |     print "worker %s finish" % str(worker_no)
 15 |     return walks
 16 | 
 17 | 
 18 | class Graph():
 19 |     def __init__(self, nx_G, is_directed, p, q):
 20 |         self.G = nx_G
 21 |         self.is_directed = is_directed
 22 |         self.p = p
 23 |         self.q = q
 24 | 
 25 |     def node2vec_walk(self, walk_length, start_node):
 26 |         '''
 27 |         Simulate a random walk starting from start node.
 28 |         '''
 29 |         G = self.G
 30 |         alias_nodes = self.alias_nodes
 31 |         alias_edges = self.alias_edges
 32 | 
 33 |         walk = [start_node]
 34 | 
 35 |         while len(walk) < walk_length:
 36 |             cur = walk[-1]
 37 |             cur_nbrs = sorted(G.neighbors(cur))
 38 |             if len(cur_nbrs) > 0:
 39 |                 if len(walk) == 1:
 40 |                     walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
 41 |                 else:
 42 |                     prev = walk[-2]
 43 |                     next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0],
 44 |                                                alias_edges[(prev, cur)][1])]
 45 |                     walk.append(next)
 46 |             else:
 47 |                 break
 48 | 
 49 |         return walk
 50 | 
 51 |     def simulate_walks(self, num_walks, walk_length, nodes=None):
 52 |         '''
 53 |         Repeatedly simulate random walks from each node.
 54 |         '''
 55 |         G = self.G
 56 |         if nodes is None:
 57 |             nodes = list(G.nodes())
 58 |             print "nodes count", len(nodes)
 59 | 
 60 |         walks = []
 61 |         for i in range(num_walks):
 62 |             for node in nodes:
 63 |                 walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
 64 |         # result = []
 65 |         # jobs = []
 66 |         # pool = multiprocessing.Pool(processes=num_walks)
 67 |         # for i in range(num_walks):
 68 |         #     jobs.append((i, walk_length, nodes, self))
 69 |         #     # result.append(pool.apply_async(worker, args=(i, walk_length, nodes, self)))
 70 |         # pool.imap(worker, jobs)
 71 |         # pool.close()
 72 |         # pool.join()
 73 |         #
 74 |         # walks = []
 75 |         # for walk in result:
 76 |         #     walks.extend(walk.get())
 77 |         return walks
 78 | 
 79 |     def get_alias_edge(self, src, dst):
 80 |         '''
 81 |         Get the alias edge setup lists for a given edge.
 82 |         '''
 83 |         G = self.G
 84 |         p = self.p
 85 |         q = self.q
 86 | 
 87 |         unnormalized_probs = []
 88 |         for dst_nbr in sorted(G.neighbors(dst)):
 89 |             if dst_nbr == src:
 90 |                 unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p)
 91 |             elif G.has_edge(dst_nbr, src):
 92 |                 unnormalized_probs.append(G[dst][dst_nbr]['weight'])
 93 |             else:
 94 |                 unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q)
 95 |         norm_const = sum(unnormalized_probs)
 96 |         normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
 97 | 
 98 |         return alias_setup(normalized_probs)
 99 | 
100 |     def preprocess_transition_probs(self):
101 |         '''
102 |         Preprocessing of transition probabilities for guiding the random walks.
103 |         '''
104 |         G = self.G
105 |         is_directed = self.is_directed
106 | 
107 |         alias_nodes = {}
108 |         for node in G.nodes():
109 |             unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
110 |             norm_const = sum(unnormalized_probs)
111 |             normalized_probs = [float(u_prob) / norm_const for u_prob in unnormalized_probs]
112 |             alias_nodes[node] = alias_setup(normalized_probs)
113 | 
114 |         alias_edges = {}
115 |         triads = {}
116 | 
117 |         if is_directed:
118 |             for edge in G.edges():
119 |                 alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
120 |         else:
121 |             for edge in G.edges():
122 |                 alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
123 |                 alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
124 | 
125 |         self.alias_nodes = alias_nodes
126 |         self.alias_edges = alias_edges
127 | 
128 |         return
129 | 
130 | 
131 | def alias_setup(probs):
132 |     '''
133 |     Compute utility lists for non-uniform sampling from discrete distributions.
134 |     Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
135 |     for details
136 |     '''
137 |     K = len(probs)
138 |     q = np.zeros(K)
139 |     J = np.zeros(K, dtype=np.int)
140 | 
141 |     smaller = []
142 |     larger = []
143 |     for kk, prob in enumerate(probs):
144 |         q[kk] = K * prob
145 |         if q[kk] < 1.0:
146 |             smaller.append(kk)
147 |         else:
148 |             larger.append(kk)
149 | 
150 |     while len(smaller) > 0 and len(larger) > 0:
151 |         small = smaller.pop()
152 |         large = larger.pop()
153 | 
154 |         J[small] = large
155 |         q[large] = q[large] + q[small] - 1.0
156 |         if q[large] < 1.0:
157 |             smaller.append(large)
158 |         else:
159 |             larger.append(large)
160 | 
161 |     return J, q
162 | 
163 | 
164 | def alias_draw(J, q):
165 |     '''
166 |     Draw sample from a non-uniform discrete distribution using alias sampling.
167 |     '''
168 |     K = len(J)
169 | 
170 |     kk = int(np.floor(np.random.rand() * K))
171 |     if np.random.rand() < q[kk]:
172 |         return kk
173 |     else:
174 |         return J[kk]
175 | 


--------------------------------------------------------------------------------
/word2vec/dictionary.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
  5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
  6 | 
  7 | 
  8 | """
  9 | This module implements the concept of Dictionary -- a mapping between words and
 10 | their integer ids.
 11 | 
 12 | Dictionaries can be created from a corpus and can later be pruned according to
 13 | document frequency (removing (un)common words via the :func:`Dictionary.filter_extremes` method),
 14 | save/loaded from disk (via :func:`Dictionary.save` and :func:`Dictionary.load` methods), merged
 15 | with other dictionary (:func:`Dictionary.merge_with`) etc.
 16 | """
 17 | 
 18 | from __future__ import with_statement
 19 | 
 20 | from collections import Mapping, defaultdict
 21 | import sys
 22 | import logging
 23 | import itertools
 24 | 
 25 | import utils
 26 | 
 27 | if sys.version_info[0] >= 3:
 28 |     unicode = str
 29 | 
 30 | from six import PY3, iteritems, iterkeys, itervalues, string_types
 31 | from six.moves import xrange
 32 | from six.moves import zip as izip
 33 | 
 34 | 
 35 | logger = logging.getLogger('gensim.corpora.dictionary')
 36 | 
 37 | 
 38 | class Dictionary(utils.SaveLoad, Mapping):
 39 |     """
 40 |     Dictionary encapsulates the mapping between normalized words and their integer ids.
 41 | 
 42 |     The main function is `doc2bow`, which converts a collection of words to its
 43 |     bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
 44 |     """
 45 |     def __init__(self, documents=None, prune_at=2000000):
 46 |         """
 47 |         If `documents` are given, use them to initialize Dictionary (see `add_documents()`).
 48 |         """
 49 |         self.token2id = {}  # token -> tokenId
 50 |         self.id2token = {}  # reverse mapping for token2id; only formed on request, to save memory
 51 |         self.dfs = {}  # document frequencies: tokenId -> in how many documents this token appeared
 52 | 
 53 |         self.num_docs = 0  # number of documents processed
 54 |         self.num_pos = 0  # total number of corpus positions
 55 |         self.num_nnz = 0  # total number of non-zeroes in the BOW matrix
 56 | 
 57 |         if documents is not None:
 58 |             self.add_documents(documents, prune_at=prune_at)
 59 | 
 60 |     def __getitem__(self, tokenid):
 61 |         if len(self.id2token) != len(self.token2id):
 62 |             # the word->id mapping has changed (presumably via add_documents);
 63 |             # recompute id->word accordingly
 64 |             self.id2token = dict((v, k) for k, v in iteritems(self.token2id))
 65 |         return self.id2token[tokenid]  # will throw for non-existent ids
 66 | 
 67 |     def __iter__(self):
 68 |         return iter(self.keys())
 69 | 
 70 |     if PY3:
 71 |         # restore Py2-style dict API
 72 |         iterkeys = __iter__
 73 | 
 74 |         def iteritems(self):
 75 |             return self.items()
 76 | 
 77 |         def itervalues(self):
 78 |             return self.values()
 79 | 
 80 |     def keys(self):
 81 |         """Return a list of all token ids."""
 82 |         return list(self.token2id.values())
 83 | 
 84 |     def __len__(self):
 85 |         """
 86 |         Return the number of token->id mappings in the dictionary.
 87 |         """
 88 |         return len(self.token2id)
 89 | 
 90 |     def __str__(self):
 91 |         some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
 92 |         return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')
 93 | 
 94 |     @staticmethod
 95 |     def from_documents(documents):
 96 |         return Dictionary(documents=documents)
 97 | 
 98 |     def add_documents(self, documents, prune_at=2000000):
 99 |         """
100 |         Update dictionary from a collection of documents. Each document is a list
101 |         of tokens = **tokenized and normalized** strings (either utf8 or unicode).
102 | 
103 |         This is a convenience wrapper for calling `doc2bow` on each document
104 |         with `allow_update=True`, which also prunes infrequent words, keeping the
105 |         total number of unique words <= `prune_at`. This is to save memory on very
106 |         large inputs. To disable this pruning, set `prune_at=None`.
107 | 
108 |         >>> print(Dictionary(["máma mele maso".split(), "ema má máma".split()]))
109 |         Dictionary(5 unique tokens)
110 |         """
111 |         for docno, document in enumerate(documents):
112 |             # log progress & run a regular check for pruning, once every 10k docs
113 |             if docno % 10000 == 0:
114 |                 if prune_at is not None and len(self) > prune_at:
115 |                     self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
116 |                 logger.info("adding document #%i to %s", docno, self)
117 | 
118 |             # update Dictionary with the document
119 |             self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids
120 | 
121 |         logger.info(
122 |             "built %s from %i documents (total %i corpus positions)",
123 |             self, self.num_docs, self.num_pos)
124 | 
125 |     def doc2bow(self, document, allow_update=False, return_missing=False):
126 |         """
127 |         Convert `document` (a list of words) into the bag-of-words format = list
128 |         of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
129 |         **tokenized and normalized** string (either unicode or utf8-encoded). No further preprocessing
130 |         is done on the words in `document`; apply tokenization, stemming etc. before
131 |         calling this method.
132 | 
133 |         If `allow_update` is set, then also update dictionary in the process: create
134 |         ids for new words. At the same time, update document frequencies -- for
135 |         each word appearing in this document, increase its document frequency (`self.dfs`)
136 |         by one.
137 | 
138 |         If `allow_update` is **not** set, this function is `const`, aka read-only.
139 |         """
140 |         if isinstance(document, string_types):
141 |             raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
142 | 
143 |         # Construct (word, frequency) mapping.
144 |         counter = defaultdict(int)
145 |         for w in document:
146 |             counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
147 | 
148 |         token2id = self.token2id
149 |         if allow_update or return_missing:
150 |             missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id)
151 |             if allow_update:
152 |                 for w in missing:
153 |                     # new id = number of ids made so far;
154 |                     # NOTE this assumes there are no gaps in the id sequence!
155 |                     token2id[w] = len(token2id)
156 | 
157 |         result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id)
158 | 
159 |         if allow_update:
160 |             self.num_docs += 1
161 |             self.num_pos += sum(itervalues(counter))
162 |             self.num_nnz += len(result)
163 |             # increase document count for each unique token that appeared in the document
164 |             dfs = self.dfs
165 |             for tokenid in iterkeys(result):
166 |                 dfs[tokenid] = dfs.get(tokenid, 0) + 1
167 | 
168 |         # return tokenids, in ascending id order
169 |         result = sorted(iteritems(result))
170 |         if return_missing:
171 |             return result, missing
172 |         else:
173 |             return result
174 | 
175 |     def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
176 |         """
177 |         Filter out tokens that appear in
178 | 
179 |         1. less than `no_below` documents (absolute number) or
180 |         2. more than `no_above` documents (fraction of total corpus size, *not*
181 |            absolute number).
182 |         3. if tokens are given in keep_tokens (list of strings), they will be kept regardless of
183 |            the `no_below` and `no_above` settings           
184 |         4. after (1), (2) and (3), keep only the first `keep_n` most frequent tokens (or
185 |            keep all if `None`).
186 | 
187 |         After the pruning, shrink resulting gaps in word ids.
188 | 
189 |         **Note**: Due to the gap shrinking, the same word may have a different
190 |         word id before and after the call to this function!
191 |         """
192 |         no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
193 | 
194 |         # determine which tokens to keep
195 |         if keep_tokens:
196 |             keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id]
197 |             good_ids = (
198 |                 v for v in itervalues(self.token2id)
199 |                 if no_below <= self.dfs.get(v, 0) <= no_above_abs
200 |                 or v in keep_ids
201 |             )
202 |         else:
203 |             good_ids = (
204 |                 v for v in itervalues(self.token2id)
205 |                 if no_below <= self.dfs.get(v, 0) <= no_above_abs)
206 |         good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
207 |         if keep_n is not None:
208 |             good_ids = good_ids[:keep_n]
209 |         bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
210 |         logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
211 |         logger.info(
212 |             "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
213 |             len(good_ids), no_below, no_above_abs, 100.0 * no_above)
214 | 
215 |         # do the actual filtering, then rebuild dictionary to remove gaps in ids
216 |         self.filter_tokens(good_ids=good_ids)
217 |         logger.info("resulting dictionary: %s", self)
218 | 
219 |     def filter_n_most_frequent(self, remove_n):
220 |         """
221 |         Filter out the 'remove_n' most frequent tokens that appear in the documents.
222 | 
223 |         After the pruning, shrink resulting gaps in word ids.
224 | 
225 |         **Note**: Due to the gap shrinking, the same word may have a different
226 |         word id before and after the call to this function!
227 |         """
228 |         # determine which tokens to keep
229 |         most_frequent_ids = (v for v in itervalues(self.token2id))
230 |         most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
231 |         most_frequent_ids = most_frequent_ids[:remove_n]
232 |         # do the actual filtering, then rebuild dictionary to remove gaps in ids
233 |         most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids]
234 |         logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])
235 |         
236 |         self.filter_tokens(bad_ids=most_frequent_ids)
237 |         logger.info("resulting dictionary: %s" % self)
238 | 
239 |     def filter_tokens(self, bad_ids=None, good_ids=None):
240 |         """
241 |         Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep
242 |         selected `good_ids` in the mapping and remove the rest.
243 | 
244 |         `bad_ids` and `good_ids` are collections of word ids to be removed.
245 |         """
246 |         if bad_ids is not None:
247 |             bad_ids = set(bad_ids)
248 |             self.token2id = dict((token, tokenid)
249 |                                  for token, tokenid in iteritems(self.token2id)
250 |                                  if tokenid not in bad_ids)
251 |             self.dfs = dict((tokenid, freq)
252 |                             for tokenid, freq in iteritems(self.dfs)
253 |                             if tokenid not in bad_ids)
254 |         if good_ids is not None:
255 |             good_ids = set(good_ids)
256 |             self.token2id = dict((token, tokenid)
257 |                                  for token, tokenid in iteritems(self.token2id)
258 |                                  if tokenid in good_ids)
259 |             self.dfs = dict((tokenid, freq)
260 |                             for tokenid, freq in iteritems(self.dfs)
261 |                             if tokenid in good_ids)
262 |         self.compactify()
263 | 
264 |     def compactify(self):
265 |         """
266 |         Assign new word ids to all words.
267 | 
268 |         This is done to make the ids more compact, e.g. after some tokens have
269 |         been removed via :func:`filter_tokens` and there are gaps in the id series.
270 |         Calling this method will remove the gaps.
271 |         """
272 |         logger.debug("rebuilding dictionary, shrinking gaps")
273 | 
274 |         # build mapping from old id -> new id
275 |         idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))
276 | 
277 |         # reassign mappings to new ids
278 |         self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
279 |         self.id2token = {}
280 |         self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
281 | 
282 |     def save_as_text(self, fname, sort_by_word=True):
283 |         """
284 |         Save this Dictionary to a text file, in format:
285 |         `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
286 |         or by decreasing word frequency.
287 | 
288 |         Note: text format should be use for corpus inspection. Use `save`/`load`
289 |         to store in binary format (pickle) for improved performance.
290 |         """
291 |         logger.info("saving dictionary mapping to %s", fname)
292 |         with utils.smart_open(fname, 'wb') as fout:
293 |             if sort_by_word:
294 |                 for token, tokenid in sorted(iteritems(self.token2id)):
295 |                     line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
296 |                     fout.write(utils.to_utf8(line))
297 |             else:
298 |                 for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
299 |                     line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
300 |                     fout.write(utils.to_utf8(line))
301 | 
302 |     def merge_with(self, other):
303 |         """
304 |         Merge another dictionary into this dictionary, mapping same tokens to the
305 |         same ids and new tokens to new ids. The purpose is to merge two corpora
306 |         created using two different dictionaries, one from `self` and one from `other`.
307 | 
308 |         `other` can be any id=>word mapping (a dict, a Dictionary object, ...).
309 | 
310 |         Return a transformation object which, when accessed as `result[doc_from_other_corpus]`,
311 |         will convert documents from a corpus built using the `other` dictionary
312 |         into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`).
313 | 
314 |         Example:
315 | 
316 |         >>> dict1 = Dictionary(some_documents)
317 |         >>> dict2 = Dictionary(other_documents)  # ids not compatible with dict1!
318 |         >>> dict2_to_dict1 = dict1.merge_with(dict2)
319 |         >>> # now we can merge corpora from the two incompatible dictionaries into one
320 |         >>> merged_corpus = itertools.chain(some_corpus_from_dict1, dict2_to_dict1[some_corpus_from_dict2])
321 | 
322 |         """
323 |         old2new = {}
324 |         for other_id, other_token in iteritems(other):
325 |             if other_token in self.token2id:
326 |                 new_id = self.token2id[other_token]
327 |             else:
328 |                 new_id = len(self.token2id)
329 |                 self.token2id[other_token] = new_id
330 |                 self.dfs[new_id] = 0
331 |             old2new[other_id] = new_id
332 |             try:
333 |                 self.dfs[new_id] += other.dfs[other_id]
334 |             except:
335 |                 # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
336 |                 pass
337 |         try:
338 |             self.num_docs += other.num_docs
339 |             self.num_nnz += other.num_nnz
340 |             self.num_pos += other.num_pos
341 |         except:
342 |             pass
343 | 
344 |         import gensim.models
345 |         return gensim.models.VocabTransform(old2new)
346 | 
347 |     @staticmethod
348 |     def load_from_text(fname):
349 |         """
350 |         Load a previously stored Dictionary from a text file.
351 |         Mirror function to `save_as_text`.
352 |         """
353 |         result = Dictionary()
354 |         with utils.smart_open(fname) as f:
355 |             for lineno, line in enumerate(f):
356 |                 line = utils.to_unicode(line)
357 |                 try:
358 |                     wordid, word, docfreq = line[:-1].split('\t')
359 |                 except Exception:
360 |                     raise ValueError("invalid line in dictionary file %s: %s"
361 |                                      % (fname, line.strip()))
362 |                 wordid = int(wordid)
363 |                 if word in result.token2id:
364 |                     raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
365 |                 result.token2id[word] = wordid
366 |                 result.dfs[wordid] = int(docfreq)
367 |         return result
368 | 
369 |     @staticmethod
370 |     def from_corpus(corpus, id2word=None):
371 |         """
372 |         Create Dictionary from an existing corpus. This can be useful if you only
373 |         have a term-document BOW matrix (represented by `corpus`), but not the
374 |         original text corpus.
375 | 
376 |         This will scan the term-document count matrix for all word ids that
377 |         appear in it, then construct and return Dictionary which maps each
378 |         `word_id -> id2word[word_id]`.
379 | 
380 |         `id2word` is an optional dictionary that maps the `word_id` to a token. In
381 |         case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)`
382 |         will be used.
383 |         """
384 | 
385 |         result = Dictionary()
386 |         max_id = -1
387 |         for docno, document in enumerate(corpus):
388 |             if docno % 10000 == 0:
389 |                 logger.info("adding document #%i to %s", docno, result)
390 |             result.num_docs += 1
391 |             result.num_nnz += len(document)
392 |             for wordid, word_freq in document:
393 |                 max_id = max(wordid, max_id)
394 |                 result.num_pos += word_freq
395 |                 result.dfs[wordid] = result.dfs.get(wordid, 0) + 1
396 | 
397 |         if id2word is None:
398 |             # make sure length(result) == get_max_id(corpus) + 1
399 |             result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1))
400 |         else:
401 |             # id=>word mapping given: simply copy it
402 |             result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word))
403 |         for id in itervalues(result.token2id):
404 |             # make sure all token ids have a valid `dfs` entry
405 |             result.dfs[id] = result.dfs.get(id, 0)
406 | 
407 |         logger.info(
408 |             "built %s from %i documents (total %i corpus positions)",
409 |             result, result.num_docs, result.num_pos)
410 |         return result
411 | 


--------------------------------------------------------------------------------
/word2vec/keyedvectors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright (C) 2016 Radim Rehurek <me@radimrehurek.com>
  5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
  6 | 
  7 | """
  8 | Word vector storage and similarity look-ups. Common model independent of the way the vectors are trained(Word2Vec, FastText, WordRank, VarEmbed etc)
  9 | 
 10 | The word vectors are considered read-only in this class.
 11 | 
 12 | Initialize the vectors by training e.g. Word2Vec::
 13 | 
 14 | >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
 15 | >>> word_vectors = model.wv
 16 | 
 17 | Persist the word vectors to disk with::
 18 | 
 19 | >>> word_vectors.save(fname)
 20 | >>> word_vectors = KeyedVectors.load(fname)
 21 | 
 22 | The vectors can also be instantiated from an existing file on disk in the original Google's word2vec C format as a KeyedVectors instance::
 23 | 
 24 |   >>> from gensim.models.keyedvectors import KeyedVectors
 25 |   >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format
 26 |   >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format
 27 | 
 28 | You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them
 29 | are already built-in::
 30 | 
 31 |   >>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
 32 |   [('queen', 0.50882536), ...]
 33 | 
 34 |   >>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
 35 |   [('queen', 0.71382287), ...]
 36 | 
 37 |   >>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split())
 38 |   'cereal'
 39 | 
 40 |   >>> word_vectors.similarity('woman', 'man')
 41 |   0.73723527
 42 | 
 43 | Correlation with human opinion on word similarity::
 44 | 
 45 |   >>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv'))
 46 |   0.51, 0.62, 0.13
 47 | 
 48 | And on analogies::
 49 | 
 50 |   >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
 51 | 
 52 | and so on.
 53 | 
 54 | """
 55 | from __future__ import division  # py3 "true division"
 56 | 
 57 | import logging
 58 | 
 59 | try:
 60 |     from queue import Queue, Empty
 61 | except ImportError:
 62 |     from Queue import Queue, Empty
 63 | 
 64 | # If pyemd C extension is available, import it.
 65 | # If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance
 66 | try:
 67 |     from pyemd import emd
 68 |     PYEMD_EXT = True
 69 | except ImportError:
 70 |     PYEMD_EXT = False
 71 | 
 72 | from numpy import dot, zeros, dtype, float32 as REAL,\
 73 |     double, array, vstack, fromstring, sqrt, newaxis,\
 74 |     ndarray, sum as np_sum, prod, ascontiguousarray
 75 | 
 76 | import utils, matutils  # utility fnc for pickling, common scipy operations etc
 77 | from dictionary import Dictionary
 78 | from six import string_types, iteritems
 79 | from six.moves import xrange
 80 | from scipy import stats
 81 | 
 82 | 
 83 | logger = logging.getLogger(__name__)
 84 | 
 85 | 
 86 | class Vocab(object):
 87 |     """
 88 |     A single vocabulary item, used internally for collecting per-word frequency/sampling info,
 89 |     and for constructing binary trees (incl. both word leaves and inner nodes).
 90 | 
 91 |     """
 92 |     def __init__(self, **kwargs):
 93 |         self.count = 0
 94 |         self.__dict__.update(kwargs)
 95 | 
 96 |     def __lt__(self, other):  # used for sorting in a priority queue
 97 |         return self.count < other.count
 98 | 
 99 |     def __str__(self):
100 |         vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
101 |         return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))
102 | 
103 | 
104 | class KeyedVectors(utils.SaveLoad):
105 |     """
106 |     Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
107 |     involved in training such as most_similar()
108 |     """
109 |     def __init__(self):
110 |         self.syn0 = []
111 |         self.syn0norm = None
112 |         self.vocab = {}
113 |         self.index2word = []
114 |         self.vector_size = None
115 | 
116 |     def save(self, *args, **kwargs):
117 |         # don't bother storing the cached normalized vectors
118 |         kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
119 |         super(KeyedVectors, self).save(*args, **kwargs)
120 | 
121 |     def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
122 |         """
123 |         Store the input-hidden weight matrix in the same format used by the original
124 |         C word2vec-tool, for compatibility.
125 | 
126 |          `fname` is the file used to save the vectors in
127 |          `fvocab` is an optional file used to save the vocabulary
128 |          `binary` is an optional boolean indicating whether the data is to be saved
129 |          in binary word2vec format (default: False)
130 |          `total_vec` is an optional parameter to explicitly specify total no. of vectors
131 |          (in case word vectors are appended with document vectors afterwards)
132 | 
133 |         """
134 |         if total_vec is None:
135 |             total_vec = len(self.vocab)
136 |         vector_size = self.syn0.shape[1]
137 |         if fvocab is not None:
138 |             logger.info("storing vocabulary in %s" % (fvocab))
139 |             with utils.smart_open(fvocab, 'wb') as vout:
140 |                 for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
141 |                     vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
142 |         logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
143 |         assert (len(self.vocab), vector_size) == self.syn0.shape
144 |         with utils.smart_open(fname, 'wb') as fout:
145 |             fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
146 |             # store in sorted order: most frequent words at the top
147 |             for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
148 |                 row = self.syn0[vocab.index]
149 |                 if binary:
150 |                     fout.write(utils.to_utf8(word) + b" " + row.tostring())
151 |                 else:
152 |                     fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
153 | 
154 | 
155 |     @classmethod
156 |     def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
157 |                              limit=None, datatype=REAL):
158 |         """
159 |         Load the input-hidden weight matrix from the original C word2vec-tool format.
160 | 
161 |         Note that the information stored in the file is incomplete (the binary tree is missing),
162 |         so while you can query for word similarity etc., you cannot continue training
163 |         with a model loaded this way.
164 | 
165 |         `binary` is a boolean indicating whether the data is in binary word2vec format.
166 |         `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
167 |         Word counts are read from `fvocab` filename, if set (this is the file generated
168 |         by `-save-vocab` flag of the original C tool).
169 | 
170 |         If you trained the C model using non-utf8 encoding for words, specify that
171 |         encoding in `encoding`.
172 | 
173 |         `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
174 |         argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
175 |         file may include word tokens truncated in the middle of a multibyte unicode character
176 |         (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
177 | 
178 |         `limit` sets a maximum number of word-vectors to read from the file. The default,
179 |         None, means read all.
180 | 
181 |         `datatype` (experimental) can coerce dimensions to a non-default float type (such
182 |         as np.float16) to save memory. (Such types may result in much slower bulk operations
183 |         or incompatibility with optimized routines.)
184 | 
185 |         """
186 |         counts = None
187 |         if fvocab is not None:
188 |             logger.info("loading word counts from %s", fvocab)
189 |             counts = {}
190 |             with utils.smart_open(fvocab) as fin:
191 |                 for line in fin:
192 |                     word, count = utils.to_unicode(line).strip().split()
193 |                     counts[word] = int(count)
194 | 
195 |         logger.info("loading projection weights from %s", fname)
196 |         with utils.smart_open(fname) as fin:
197 |             header = utils.to_unicode(fin.readline(), encoding=encoding)
198 |             vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
199 |             if limit:
200 |                 vocab_size = min(vocab_size, limit)
201 |             result = cls()
202 |             result.vector_size = vector_size
203 |             result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)
204 | 
205 |             def add_word(word, weights):
206 |                 word_id = len(result.vocab)
207 |                 if word in result.vocab:
208 |                     logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
209 |                     return
210 |                 if counts is None:
211 |                     # most common scenario: no vocab file given. just make up some bogus counts, in descending order
212 |                     result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
213 |                 elif word in counts:
214 |                     # use count from the vocab file
215 |                     result.vocab[word] = Vocab(index=word_id, count=counts[word])
216 |                 else:
217 |                     # vocab file given, but word is missing -- set count to None (TODO: or raise?)
218 |                     logger.warning("vocabulary file is incomplete: '%s' is missing", word)
219 |                     result.vocab[word] = Vocab(index=word_id, count=None)
220 |                 result.syn0[word_id] = weights
221 |                 result.index2word.append(word)
222 | 
223 |             if binary:
224 |                 binary_len = dtype(REAL).itemsize * vector_size
225 |                 for line_no in xrange(vocab_size):
226 |                     # mixed text and binary: read text first, then binary
227 |                     word = []
228 |                     while True:
229 |                         ch = fin.read(1)
230 |                         if ch == b' ':
231 |                             break
232 |                         if ch == b'':
233 |                             raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
234 |                         if ch != b'\n':  # ignore newlines in front of words (some binary files have)
235 |                             word.append(ch)
236 |                     word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
237 |                     weights = fromstring(fin.read(binary_len), dtype=REAL)
238 |                     add_word(word, weights)
239 |             else:
240 |                 for line_no in xrange(vocab_size):
241 |                     line = fin.readline()
242 |                     if line == b'':
243 |                         raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
244 |                     parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
245 |                     if len(parts) != vector_size + 1:
246 |                         raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
247 |                     word, weights = parts[0], list(map(REAL, parts[1:]))
248 |                     add_word(word, weights)
249 |         if result.syn0.shape[0] != len(result.vocab):
250 |             logger.info(
251 |                 "duplicate words detected, shrinking matrix size from %i to %i",
252 |                 result.syn0.shape[0], len(result.vocab)
253 |             )
254 |             result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
255 |         assert (len(result.vocab), vector_size) == result.syn0.shape
256 | 
257 |         logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
258 |         return result
259 | 
260 |     def word_vec(self, word, use_norm=False):
261 |         """
262 |         Accept a single word as input.
263 |         Returns the word's representations in vector space, as a 1D numpy array.
264 | 
265 |         If `use_norm` is True, returns the normalized word vector.
266 | 
267 |         Example::
268 | 
269 |           >>> trained_model['office']
270 |           array([ -1.40128313e-02, ...])
271 | 
272 |         """
273 |         if word in self.vocab:
274 |             if use_norm:
275 |                 return self.syn0norm[self.vocab[word].index]
276 |             else:
277 |                 return self.syn0[self.vocab[word].index]
278 |         else:
279 |             raise KeyError("word '%s' not in vocabulary" % word)
280 | 
281 |     def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):
282 |         """
283 |         Find the top-N most similar words. Positive words contribute positively towards the
284 |         similarity, negative words negatively.
285 | 
286 |         This method computes cosine similarity between a simple mean of the projection
287 |         weight vectors of the given words and the vectors for each word in the model.
288 |         The method corresponds to the `word-analogy` and `distance` scripts in the original
289 |         word2vec implementation.
290 | 
291 |         If topn is False, most_similar returns the vector of similarity scores.
292 | 
293 |         `restrict_vocab` is an optional integer which limits the range of vectors which
294 |         are searched for most-similar values. For example, restrict_vocab=10000 would
295 |         only check the first 10000 word vectors in the vocabulary order. (This may be
296 |         meaningful if you've sorted the vocabulary by descending frequency.)
297 | 
298 |         Example::
299 | 
300 |           >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
301 |           [('queen', 0.50882536), ...]
302 | 
303 |         """
304 |         self.init_sims()
305 | 
306 |         if isinstance(positive, string_types) and not negative:
307 |             # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
308 |             positive = [positive]
309 | 
310 |         # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
311 |         positive = [
312 |             (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
313 |             for word in positive
314 |         ]
315 |         negative = [
316 |             (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
317 |             for word in negative
318 |         ]
319 | 
320 |         # compute the weighted average of all words
321 |         all_words, mean = set(), []
322 |         for word, weight in positive + negative:
323 |             if isinstance(word, ndarray):
324 |                 mean.append(weight * word)
325 |             else:
326 |                 mean.append(weight * self.word_vec(word, use_norm=True))
327 |                 if word in self.vocab:
328 |                     all_words.add(self.vocab[word].index)
329 |         if not mean:
330 |             raise ValueError("cannot compute similarity with no input")
331 |         mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
332 | 
333 |         if indexer is not None:
334 |             return indexer.most_similar(mean, topn)
335 | 
336 |         limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
337 |         dists = dot(limited, mean)
338 |         if not topn:
339 |             return dists
340 |         best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
341 |         # ignore (don't return) words from the input
342 |         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
343 |         return result[:topn]
344 | 
345 |     def wmdistance(self, document1, document2):
346 |         """
347 |         Compute the Word Mover's Distance between two documents. When using this
348 |         model, please consider citing the following papers:
349 | 
350 |         .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
351 |         .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
352 |         .. Matt Kusner et al. "From Word Embeddings To Document Distances".
353 | 
354 |         Note that if one of the documents have no words that exist in the
355 |         Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.
356 | 
357 |         This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).
358 | 
359 |         Example:
360 |             >>> # Train word2vec model.
361 |             >>> model = Word2Vec(sentences)
362 | 
363 |             >>> # Some sentences to test.
364 |             >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
365 |             >>> sentence_president = 'The president greets the press in Chicago'.lower().split()
366 | 
367 |             >>> # Remove their stopwords.
368 |             >>> from nltk.corpus import stopwords
369 |             >>> stopwords = nltk.corpus.stopwords.words('english')
370 |             >>> sentence_obama = [w for w in sentence_obama if w not in stopwords]
371 |             >>> sentence_president = [w for w in sentence_president if w not in stopwords]
372 | 
373 |             >>> # Compute WMD.
374 |             >>> distance = model.wmdistance(sentence_obama, sentence_president)
375 |         """
376 | 
377 |         if not PYEMD_EXT:
378 |             raise ImportError("Please install pyemd Python package to compute WMD.")
379 | 
380 |         # Remove out-of-vocabulary words.
381 |         len_pre_oov1 = len(document1)
382 |         len_pre_oov2 = len(document2)
383 |         document1 = [token for token in document1 if token in self]
384 |         document2 = [token for token in document2 if token in self]
385 |         diff1 = len_pre_oov1 - len(document1)
386 |         diff2 = len_pre_oov2 - len(document2)
387 |         if diff1 > 0 or diff2 > 0:
388 |             logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).',
389 |                         diff1, diff2)
390 | 
391 |         if len(document1) == 0 or len(document2) == 0:
392 |             logger.info('At least one of the documents had no words that were'
393 |                         'in the vocabulary. Aborting (returning inf).')
394 |             return float('inf')
395 | 
396 |         dictionary = Dictionary(documents=[document1, document2])
397 |         vocab_len = len(dictionary)
398 | 
399 |         if vocab_len == 1:
400 |             # Both documents are composed by a single unique token
401 |             return 0.0
402 | 
403 |         # Sets for faster look-up.
404 |         docset1 = set(document1)
405 |         docset2 = set(document2)
406 | 
407 |         # Compute distance matrix.
408 |         distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
409 |         for i, t1 in dictionary.items():
410 |             for j, t2 in dictionary.items():
411 |                 if not t1 in docset1 or not t2 in docset2:
412 |                     continue
413 |                 # Compute Euclidean distance between word vectors.
414 |                 distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2))
415 | 
416 |         if np_sum(distance_matrix) == 0.0:
417 |             # `emd` gets stuck if the distance matrix contains only zeros.
418 |             logger.info('The distance matrix is all zeros. Aborting (returning inf).')
419 |             return float('inf')
420 | 
421 |         def nbow(document):
422 |             d = zeros(vocab_len, dtype=double)
423 |             nbow = dictionary.doc2bow(document)  # Word frequencies.
424 |             doc_len = len(document)
425 |             for idx, freq in nbow:
426 |                 d[idx] = freq / float(doc_len)  # Normalized word frequencies.
427 |             return d
428 | 
429 |         # Compute nBOW representation of documents.
430 |         d1 = nbow(document1)
431 |         d2 = nbow(document2)
432 | 
433 |         # Compute WMD.
434 |         return emd(d1, d2, distance_matrix)
435 | 
436 |     def most_similar_cosmul(self, positive=[], negative=[], topn=10):
437 |         """
438 |         Find the top-N most similar words, using the multiplicative combination objective
439 |         proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
440 |         positively towards the similarity, negative words negatively, but with less
441 |         susceptibility to one large distance dominating the calculation.
442 | 
443 |         In the common analogy-solving case, of two positive and one negative examples,
444 |         this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.
445 | 
446 |         Additional positive or negative examples contribute to the numerator or denominator,
447 |         respectively – a potentially sensible but untested extension of the method. (With
448 |         a single positive example, rankings will be the same as in the default most_similar.)
449 | 
450 |         Example::
451 | 
452 |           >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
453 |           [(u'iraq', 0.8488819003105164), ...]
454 | 
455 |         .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.
456 | 
457 |         """
458 |         self.init_sims()
459 | 
460 |         if isinstance(positive, string_types) and not negative:
461 |             # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
462 |             positive = [positive]
463 | 
464 |         all_words = set([self.vocab[word].index for word in positive+negative
465 |             if not isinstance(word, ndarray) and word in self.vocab])
466 | 
467 |         positive = [
468 |             self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
469 |             for word in positive
470 |         ]
471 |         negative = [
472 |             self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
473 |             for word in negative
474 |         ]
475 | 
476 |         if not positive:
477 |             raise ValueError("cannot compute similarity with no input")
478 | 
479 |         # equation (4) of Levy & Goldberg "Linguistic Regularities...",
480 |         # with distances shifted to [0,1] per footnote (7)
481 |         pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
482 |         neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
483 |         dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)
484 | 
485 |         if not topn:
486 |             return dists
487 |         best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
488 |         # ignore (don't return) words from the input
489 |         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
490 |         return result[:topn]
491 | 
492 |     def similar_by_word(self, word, topn=10, restrict_vocab=None):
493 |         """
494 |         Find the top-N most similar words.
495 | 
496 |         If topn is False, similar_by_word returns the vector of similarity scores.
497 | 
498 |         `restrict_vocab` is an optional integer which limits the range of vectors which
499 |         are searched for most-similar values. For example, restrict_vocab=10000 would
500 |         only check the first 10000 word vectors in the vocabulary order. (This may be
501 |         meaningful if you've sorted the vocabulary by descending frequency.)
502 | 
503 |         Example::
504 | 
505 |           >>> trained_model.similar_by_word('graph')
506 |           [('user', 0.9999163150787354), ...]
507 | 
508 |         """
509 | 
510 |         return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
511 | 
512 |     def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
513 |         """
514 |         Find the top-N most similar words by vector.
515 | 
516 |         If topn is False, similar_by_vector returns the vector of similarity scores.
517 | 
518 |         `restrict_vocab` is an optional integer which limits the range of vectors which
519 |         are searched for most-similar values. For example, restrict_vocab=10000 would
520 |         only check the first 10000 word vectors in the vocabulary order. (This may be
521 |         meaningful if you've sorted the vocabulary by descending frequency.)
522 | 
523 |         Example::
524 | 
525 |           >>> trained_model.similar_by_vector([1,2])
526 |           [('survey', 0.9942699074745178), ...]
527 | 
528 |         """
529 | 
530 |         return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
531 | 
532 |     def doesnt_match(self, words):
533 |         """
534 |         Which word from the given list doesn't go with the others?
535 | 
536 |         Example::
537 | 
538 |           >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split())
539 |           'cereal'
540 | 
541 |         """
542 |         self.init_sims()
543 | 
544 |         used_words = [word for word in words if word in self]
545 |         if len(used_words) != len(words):
546 |             ignored_words = set(words) - set(used_words)
547 |             logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words)
548 |         if not used_words:
549 |             raise ValueError("cannot select a word from an empty list")
550 |         vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
551 |         mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
552 |         dists = dot(vectors, mean)
553 |         return sorted(zip(dists, used_words))[0][1]
554 | 
555 |     def __getitem__(self, words):
556 | 
557 |         """
558 |         Accept a single word or a list of words as input.
559 | 
560 |         If a single word: returns the word's representations in vector space, as
561 |         a 1D numpy array.
562 | 
563 |         Multiple words: return the words' representations in vector space, as a
564 |         2d numpy array: #words x #vector_size. Matrix rows are in the same order
565 |         as in input.
566 | 
567 |         Example::
568 | 
569 |           >>> trained_model['office']
570 |           array([ -1.40128313e-02, ...])
571 | 
572 |           >>> trained_model[['office', 'products']]
573 |           array([ -1.40128313e-02, ...]
574 |                 [ -1.70425311e-03, ...]
575 |                  ...)
576 | 
577 |         """
578 |         if isinstance(words, string_types):
579 |             # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
580 |             return self.word_vec(words)
581 | 
582 |         return vstack([self.word_vec(word) for word in words])
583 | 
584 |     def __contains__(self, word):
585 |         return word in self.vocab
586 | 
587 |     def similarity(self, w1, w2):
588 |         """
589 |         Compute cosine similarity between two words.
590 | 
591 |         Example::
592 | 
593 |           >>> trained_model.similarity('woman', 'man')
594 |           0.73723527
595 | 
596 |           >>> trained_model.similarity('woman', 'woman')
597 |           1.0
598 | 
599 |         """
600 |         return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))
601 | 
602 |     def n_similarity(self, ws1, ws2):
603 |         """
604 |         Compute cosine similarity between two sets of words.
605 | 
606 |         Example::
607 | 
608 |           >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
609 |           0.61540466561049689
610 | 
611 |           >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])
612 |           1.0000000000000004
613 | 
614 |           >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')
615 |           True
616 | 
617 |         """
618 |         if not(len(ws1) and len(ws2)):
619 |             raise ZeroDivisionError('Atleast one of the passed list is empty.')
620 |         v1 = [self[word] for word in ws1]
621 |         v2 = [self[word] for word in ws2]
622 |         return dot(matutils.unitvec(array(v1).mean(axis=0)),
623 |                    matutils.unitvec(array(v2).mean(axis=0)))
624 | 
625 |     @staticmethod
626 |     def log_accuracy(section):
627 |         correct, incorrect = len(section['correct']), len(section['incorrect'])
628 |         if correct + incorrect > 0:
629 |             logger.info("%s: %.1f%% (%i/%i)" %
630 |                         (section['section'], 100.0 * correct / (correct + incorrect),
631 |                          correct, correct + incorrect))
632 | 
633 |     def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
634 |         """
635 |         Compute accuracy of the model. `questions` is a filename where lines are
636 |         4-tuples of words, split into sections by ": SECTION NAME" lines.
637 |         See questions-words.txt in https://storage.googleapis.com/google-model-archive-source/v2/model.google.com/word2vec/source-archive.zip for an example.
638 | 
639 |         The accuracy is reported (=printed to log and returned as a list) for each
640 |         section separately, plus there's one aggregate summary at the end.
641 | 
642 |         Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
643 |         words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
644 |         In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
645 |         case normalization is performed.
646 | 
647 |         Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
648 |         evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
649 |         and question words. In case of multiple case variants of a single word, the vector for the first
650 |         occurrence (also the most frequent if vocabulary is sorted) is taken.
651 | 
652 |         This method corresponds to the `compute-accuracy` script of the original C word2vec.
653 | 
654 |         """
655 |         ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
656 |         ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)
657 | 
658 |         sections, section = [], None
659 |         for line_no, line in enumerate(utils.smart_open(questions)):
660 |             # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
661 |             line = utils.to_unicode(line)
662 |             if line.startswith(': '):
663 |                 # a new section starts => store the old section
664 |                 if section:
665 |                     sections.append(section)
666 |                     self.log_accuracy(section)
667 |                 section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
668 |             else:
669 |                 if not section:
670 |                     raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
671 |                 try:
672 |                     if case_insensitive:
673 |                         a, b, c, expected = [word.upper() for word in line.split()]
674 |                     else:
675 |                         a, b, c, expected = [word for word in line.split()]
676 |                 except:
677 |                     logger.info("skipping invalid line #%i in %s" % (line_no, questions))
678 |                     continue
679 |                 if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
680 |                     logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
681 |                     continue
682 | 
683 |                 original_vocab = self.vocab
684 |                 self.vocab = ok_vocab
685 |                 ignore = set([a, b, c])  # input words to be ignored
686 |                 predicted = None
687 |                 # find the most likely prediction, ignoring OOV words and input words
688 |                 sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
689 |                 self.vocab = original_vocab
690 |                 for index in matutils.argsort(sims, reverse=True):
691 |                     predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
692 |                     if predicted in ok_vocab and predicted not in ignore:
693 |                         if predicted != expected:
694 |                             logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
695 |                         break
696 |                 if predicted == expected:
697 |                     section['correct'].append((a, b, c, expected))
698 |                 else:
699 |                     section['incorrect'].append((a, b, c, expected))
700 |         if section:
701 |             # store the last section, too
702 |             sections.append(section)
703 |             self.log_accuracy(section)
704 | 
705 |         total = {
706 |             'section': 'total',
707 |             'correct': sum((s['correct'] for s in sections), []),
708 |             'incorrect': sum((s['incorrect'] for s in sections), []),
709 |         }
710 |         self.log_accuracy(total)
711 |         sections.append(total)
712 |         return sections
713 | 
714 |     @staticmethod
715 |     def log_evaluate_word_pairs(pearson, spearman, oov, pairs):
716 |         logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0])
717 |         logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0])
718 |         logger.info('Pairs with unknown words ratio: %.1f%%', oov)
719 | 
720 |     def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=True,
721 |                             dummy4unknown=False):
722 |         """
723 |         Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
724 |         lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter'.
725 |         An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
726 |         http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.
727 | 
728 |         The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
729 |         between the similarities from the dataset and the similarities produced by the model itself.
730 |         The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).
731 | 
732 |         Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
733 |         words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
734 |         If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
735 |         is performed.
736 | 
737 |         Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
738 |         evaluating the model (default True). Useful when you expect case-mismatch between training tokens
739 |         and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
740 |         occurrence (also the most frequent if vocabulary is sorted) is taken.
741 | 
742 |         Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words.
743 |         Otherwise (default False), these pairs are skipped entirely.
744 |         """
745 |         ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
746 |         ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)
747 | 
748 |         similarity_gold = []
749 |         similarity_model = []
750 |         oov = 0
751 | 
752 |         original_vocab = self.vocab
753 |         self.vocab = ok_vocab
754 | 
755 |         for line_no, line in enumerate(utils.smart_open(pairs)):
756 |             line = utils.to_unicode(line)
757 |             if line.startswith('#'):
758 |                 # May be a comment
759 |                 continue
760 |             else:
761 |                 try:
762 |                     if case_insensitive:
763 |                         a, b, sim = [word.upper() for word in line.split(delimiter)]
764 |                     else:
765 |                         a, b, sim = [word for word in line.split(delimiter)]
766 |                     sim = float(sim)
767 |                 except:
768 |                     logger.info('skipping invalid line #%d in %s', line_no, pairs)
769 |                     continue
770 |                 if a not in ok_vocab or b not in ok_vocab:
771 |                     oov += 1
772 |                     if dummy4unknown:
773 |                         similarity_model.append(0.0)
774 |                         similarity_gold.append(sim)
775 |                         continue
776 |                     else:
777 |                         logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
778 |                         continue
779 |                 similarity_gold.append(sim)  # Similarity from the dataset
780 |                 similarity_model.append(self.similarity(a, b))  # Similarity from the model
781 |         self.vocab = original_vocab
782 |         spearman = stats.spearmanr(similarity_gold, similarity_model)
783 |         pearson = stats.pearsonr(similarity_gold, similarity_model)
784 |         oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100
785 | 
786 |         logger.debug(
787 |             'Pearson correlation coefficient against %s: %f with p-value %f',
788 |             pairs, pearson[0], pearson[1]
789 |         )
790 |         logger.debug(
791 |             'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
792 |             pairs, spearman[0], spearman[1]
793 |         )
794 |         logger.debug('Pairs with unknown words: %d' % oov)
795 |         self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
796 |         return pearson, spearman, oov_ratio
797 | 
798 | 
799 |     def init_sims(self, replace=False):
800 |         """
801 |         Precompute L2-normalized vectors.
802 | 
803 |         If `replace` is set, forget the original vectors and only keep the normalized
804 |         ones = saves lots of memory!
805 | 
806 |         Note that you **cannot continue training** after doing a replace. The model becomes
807 |         effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.
808 | 
809 |         """
810 |         if getattr(self, 'syn0norm', None) is None or replace:
811 |             logger.info("precomputing L2-norms of word weight vectors")
812 |             if replace:
813 |                 for i in xrange(self.syn0.shape[0]):
814 |                     self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))
815 |                 self.syn0norm = self.syn0
816 |             else:
817 |                 self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
818 | 


--------------------------------------------------------------------------------
/word2vec/matutils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
  5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
  6 | 
  7 | """
  8 | This module contains math helper functions.
  9 | """
 10 | 
 11 | from __future__ import with_statement
 12 | 
 13 | import logging
 14 | import math
 15 | 
 16 | import numpy as np
 17 | import scipy.linalg
 18 | import scipy.sparse
 19 | from scipy.linalg.lapack import get_lapack_funcs
 20 | from scipy.special import psi  # gamma function utils
 21 | from scipy.stats import entropy
 22 | from six import iteritems, itervalues, string_types
 23 | from six.moves import xrange, zip as izip
 24 | 
 25 | import utils
 26 | 
 27 | # scipy is not a stable package yet, locations change, so try to work
 28 | # around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8)
 29 | try:
 30 |     from scipy.linalg.basic import triu
 31 | except ImportError:
 32 |     from scipy.linalg.special_matrices import triu
 33 | 
 34 | try:
 35 |     from np import triu_indices
 36 | except ImportError:
 37 |     # np < 1.4
 38 |     def triu_indices(n, k=0):
 39 |         m = np.ones((n, n), int)
 40 |         a = triu(m, k)
 41 |         return np.where(a != 0)
 42 | 
 43 | blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
 44 | 
 45 | logger = logging.getLogger(__name__)
 46 | 
 47 | 
 48 | def argsort(x, topn=None, reverse=False):
 49 |     """
 50 |     Return indices of the `topn` smallest elements in array `x`, in ascending order.
 51 | 
 52 |     If reverse is True, return the greatest elements instead, in descending order.
 53 | 
 54 |     """
 55 |     x = np.asarray(x)  # unify model path for when `x` is not a np array (list, tuple...)
 56 |     if topn is None:
 57 |         topn = x.size
 58 |     if topn <= 0:
 59 |         return []
 60 |     if reverse:
 61 |         x = -x
 62 |     if topn >= x.size or not hasattr(np, 'argpartition'):
 63 |         return np.argsort(x)[:topn]
 64 |     # np >= 1.8 has a fast partial argsort, use that!
 65 |     most_extreme = np.argpartition(x, topn)[:topn]
 66 |     return most_extreme.take(np.argsort(x.take(most_extreme)))  # resort topn into order
 67 | 
 68 | 
 69 | def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
 70 |     """
 71 |     Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
 72 |     with documents as columns.
 73 | 
 74 |     If the number of terms, documents and non-zero elements is known, you can pass
 75 |     them here as parameters and a more memory efficient model path will be taken.
 76 | 
 77 |     The input corpus may be a non-repeatable stream (generator).
 78 | 
 79 |     This is the mirror function to `Sparse2Corpus`.
 80 | 
 81 |     """
 82 |     try:
 83 |         # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
 84 |         # (as is the case with MmCorpus for example), we can use a more efficient model path
 85 |         if num_terms is None:
 86 |             num_terms = corpus.num_terms
 87 |         if num_docs is None:
 88 |             num_docs = corpus.num_docs
 89 |         if num_nnz is None:
 90 |             num_nnz = corpus.num_nnz
 91 |     except AttributeError:
 92 |         pass  # not a MmCorpus...
 93 |     if printprogress:
 94 |         logger.info("creating sparse matrix from corpus")
 95 |     if num_terms is not None and num_docs is not None and num_nnz is not None:
 96 |         # faster and much more memory-friendly version of creating the sparse csc
 97 |         posnow, indptr = 0, [0]
 98 |         indices = np.empty((num_nnz,), dtype=np.int32)  # HACK assume feature ids fit in 32bit integer
 99 |         data = np.empty((num_nnz,), dtype=dtype)
100 |         for docno, doc in enumerate(corpus):
101 |             if printprogress and docno % printprogress == 0:
102 |                 logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs))
103 |             posnext = posnow + len(doc)
104 |             indices[posnow: posnext] = [feature_id for feature_id, _ in doc]
105 |             data[posnow: posnext] = [feature_weight for _, feature_weight in doc]
106 |             indptr.append(posnext)
107 |             posnow = posnext
108 |         assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros"
109 |         result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
110 |     else:
111 |         # slower version; determine the sparse matrix parameters during iteration
112 |         num_nnz, data, indices, indptr = 0, [], [], [0]
113 |         for docno, doc in enumerate(corpus):
114 |             if printprogress and docno % printprogress == 0:
115 |                 logger.info("PROGRESS: at document #%i" % (docno))
116 |             indices.extend([feature_id for feature_id, _ in doc])
117 |             data.extend([feature_weight for _, feature_weight in doc])
118 |             num_nnz += len(doc)
119 |             indptr.append(num_nnz)
120 |         if num_terms is None:
121 |             num_terms = max(indices) + 1 if indices else 0
122 |         num_docs = len(indptr) - 1
123 |         # now num_docs, num_terms and num_nnz contain the correct values
124 |         data = np.asarray(data, dtype=dtype)
125 |         indices = np.asarray(indices)
126 |         result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
127 |     return result
128 | 
129 | 
130 | def pad(mat, padrow, padcol):
131 |     """
132 |     Add additional rows/columns to a np.matrix `mat`. The new rows/columns
133 |     will be initialized with zeros.
134 |     """
135 |     if padrow < 0:
136 |         padrow = 0
137 |     if padcol < 0:
138 |         padcol = 0
139 |     rows, cols = mat.shape
140 |     return np.bmat([
141 |         [mat, np.matrix(np.zeros((rows, padcol)))],
142 |         [np.matrix(np.zeros((padrow, cols + padcol)))],
143 |     ])
144 | 
145 | 
146 | def zeros_aligned(shape, dtype, order='C', align=128):
147 |     """Like `np.zeros()`, but the array will be aligned at `align` byte boundary."""
148 |     nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize
149 |     buffer = np.zeros(nbytes + align, dtype=np.uint8)  # problematic on win64 ("maximum allowed dimension exceeded")
150 |     start_index = -buffer.ctypes.data % align
151 |     return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order)
152 | 
153 | 
154 | def ismatrix(m):
155 |     return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m)
156 | 
157 | 
158 | def any2sparse(vec, eps=1e-9):
159 |     """Convert a np/scipy vector into gensim document format (=list of 2-tuples)."""
160 |     if isinstance(vec, np.ndarray):
161 |         return dense2vec(vec, eps)
162 |     if scipy.sparse.issparse(vec):
163 |         return scipy2sparse(vec, eps)
164 |     return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]
165 | 
166 | 
167 | def scipy2sparse(vec, eps=1e-9):
168 |     """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
169 |     vec = vec.tocsr()
170 |     assert vec.shape[0] == 1
171 |     return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if np.abs(val) > eps]
172 | 
173 | 
174 | class Scipy2Corpus(object):
175 |     """
176 |     Convert a sequence of dense/sparse vectors into a streamed gensim corpus object.
177 | 
178 |     This is the mirror function to `corpus2csc`.
179 | 
180 |     """
181 |     def __init__(self, vecs):
182 |         """
183 |         `vecs` is a sequence of dense and/or sparse vectors, such as a 2d np array,
184 |         or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d np/scipy vectors.
185 | 
186 |         """
187 |         self.vecs = vecs
188 | 
189 |     def __iter__(self):
190 |         for vec in self.vecs:
191 |             if isinstance(vec, np.ndarray):
192 |                 yield full2sparse(vec)
193 |             else:
194 |                 yield scipy2sparse(vec)
195 | 
196 |     def __len__(self):
197 |         return len(self.vecs)
198 | 
199 | 
200 | def sparse2full(doc, length):
201 |     """
202 |     Convert a document in sparse document format (=sequence of 2-tuples) into a dense
203 |     np array (of size `length`).
204 | 
205 |     This is the mirror function to `full2sparse`.
206 | 
207 |     """
208 |     result = np.zeros(length, dtype=np.float32)  # fill with zeroes (default value)
209 |     # convert indices to int as numpy 1.12 no longer indexes by floats
210 |     doc = ((int(id_), float(val_)) for (id_, val_) in doc)
211 | 
212 |     doc = dict(doc)
213 |     # overwrite some of the zeroes with explicit values
214 |     result[list(doc)] = list(itervalues(doc))
215 |     return result
216 | 
217 | 
218 | def full2sparse(vec, eps=1e-9):
219 |     """
220 |     Convert a dense np array into the sparse document format (sequence of 2-tuples).
221 | 
222 |     Values of magnitude < `eps` are treated as zero (ignored).
223 | 
224 |     This is the mirror function to `sparse2full`.
225 | 
226 |     """
227 |     vec = np.asarray(vec, dtype=float)
228 |     nnz = np.nonzero(abs(vec) > eps)[0]
229 |     return list(zip(nnz, vec.take(nnz)))
230 | 
231 | dense2vec = full2sparse
232 | 
233 | 
234 | def full2sparse_clipped(vec, topn, eps=1e-9):
235 |     """
236 |     Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
237 | 
238 |     """
239 |     # use np.argpartition/argsort and only form tuples that are actually returned.
240 |     # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
241 |     if topn <= 0:
242 |         return []
243 |     vec = np.asarray(vec, dtype=float)
244 |     nnz = np.nonzero(abs(vec) > eps)[0]
245 |     biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True))
246 |     return list(zip(biggest, vec.take(biggest)))
247 | 
248 | 
249 | def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
250 |     """
251 |     Convert corpus into a dense np array (documents will be columns). You
252 |     must supply the number of features `num_terms`, because dimensionality
253 |     cannot be deduced from the sparse vectors alone.
254 | 
255 |     You can optionally supply `num_docs` (=the corpus length) as well, so that
256 |     a more memory-efficient model path is taken.
257 | 
258 |     This is the mirror function to `Dense2Corpus`.
259 | 
260 |     """
261 |     if num_docs is not None:
262 |         # we know the number of documents => don't bother column_stacking
263 |         docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype)
264 |         for docno, doc in enumerate(corpus):
265 |             result[:, docno] = sparse2full(doc, num_terms)
266 |         assert docno + 1 == num_docs
267 |     else:
268 |         result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)
269 |     return result.astype(dtype)
270 | 
271 | 
272 | class Dense2Corpus(object):
273 |     """
274 |     Treat dense np array as a sparse, streamed gensim corpus.
275 | 
276 |     No data copy is made (changes to the underlying matrix imply changes in the
277 |     corpus).
278 | 
279 |     This is the mirror function to `corpus2dense`.
280 | 
281 |     """
282 |     def __init__(self, dense, documents_columns=True):
283 |         if documents_columns:
284 |             self.dense = dense.T
285 |         else:
286 |             self.dense = dense
287 | 
288 |     def __iter__(self):
289 |         for doc in self.dense:
290 |             yield full2sparse(doc.flat)
291 | 
292 |     def __len__(self):
293 |         return len(self.dense)
294 | #endclass DenseCorpus
295 | 
296 | 
297 | class Sparse2Corpus(object):
298 |     """
299 |     Convert a matrix in scipy.sparse format into a streaming gensim corpus.
300 | 
301 |     This is the mirror function to `corpus2csc`.
302 | 
303 |     """
304 |     def __init__(self, sparse, documents_columns=True):
305 |         if documents_columns:
306 |             self.sparse = sparse.tocsc()
307 |         else:
308 |             self.sparse = sparse.tocsr().T  # make sure shape[1]=number of docs (needed in len())
309 | 
310 |     def __iter__(self):
311 |         for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
312 |             yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
313 | 
314 |     def __len__(self):
315 |         return self.sparse.shape[1]
316 | #endclass Sparse2Corpus
317 | 
318 | 
319 | def veclen(vec):
320 |     if len(vec) == 0:
321 |         return 0.0
322 |     length = 1.0 * math.sqrt(sum(val**2 for _, val in vec))
323 |     assert length > 0.0, "sparse documents must not contain any explicit zero entries"
324 |     return length
325 | 
326 | 
327 | def ret_normalized_vec(vec, length):
328 |     if length != 1.0:
329 |         return [(termid, val / length) for termid, val in vec]
330 |     else:
331 |         return list(vec)
332 | 
333 | 
334 | def ret_log_normalize_vec(vec, axis=1):
335 |     log_max = 100.0
336 |     if len(vec.shape) == 1:
337 |         max_val = np.max(vec)
338 |         log_shift = log_max - np.log(len(vec) + 1.0) - max_val
339 |         tot = np.sum(np.exp(vec + log_shift))
340 |         log_norm = np.log(tot) - log_shift
341 |         vec = vec - log_norm
342 |     else:
343 |         if axis == 1:  # independently normalize each sample
344 |             max_val = np.max(vec, 1)
345 |             log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val
346 |             tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1)
347 |             log_norm = np.log(tot) - log_shift
348 |             vec = vec - log_norm[:, np.newaxis]
349 |         elif axis == 0:  # normalize each feature
350 |             k = ret_log_normalize_vec(vec.T)
351 |             return (k[0].T, k[1])
352 |         else:
353 |             raise ValueError("'%s' is not a supported axis" % axis)
354 |     return (vec, log_norm)
355 | 
356 | 
357 | blas_nrm2 = blas('nrm2', np.array([], dtype=float))
358 | blas_scal = blas('scal', np.array([], dtype=float))
359 | 
360 | 
361 | def unitvec(vec, norm='l2'):
362 |     """
363 |     Scale a vector to unit length. The only exception is the zero vector, which
364 |     is returned back unchanged.
365 | 
366 |     Output will be in the same format as input (i.e., gensim vector=>gensim vector,
367 |     or np array=>np array, scipy.sparse=>scipy.sparse).
368 |     """
369 |     if norm not in ('l1', 'l2'):
370 |         raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)
371 |     if scipy.sparse.issparse(vec):
372 |         vec = vec.tocsr()
373 |         if norm == 'l1':
374 |             veclen = np.sum(np.abs(vec.data))
375 |         if norm == 'l2':
376 |             veclen = np.sqrt(np.sum(vec.data ** 2))
377 |         if veclen > 0.0:
378 |             return vec / veclen
379 |         else:
380 |             return vec
381 | 
382 |     if isinstance(vec, np.ndarray):
383 |         vec = np.asarray(vec, dtype=float)
384 |         if norm == 'l1':
385 |             veclen = np.sum(np.abs(vec))
386 |         if norm == 'l2':
387 |             veclen = blas_nrm2(vec)
388 |         if veclen > 0.0:
389 |             return blas_scal(1.0 / veclen, vec)
390 |         else:
391 |             return vec
392 | 
393 |     try:
394 |         first = next(iter(vec))  # is there at least one element?
395 |     except:
396 |         return vec
397 | 
398 |     if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format
399 |         if norm == 'l1':
400 |             length = float(sum(abs(val) for _, val in vec))
401 |         if norm == 'l2':
402 |             length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
403 |         assert length > 0.0, "sparse documents must not contain any explicit zero entries"
404 |         return ret_normalized_vec(vec, length)
405 |     else:
406 |         raise ValueError("unknown input type")
407 | 
408 | 
409 | def cossim(vec1, vec2):
410 |     """
411 |     Return cosine similarity between two sparse vectors.
412 |     The similarity is a number between <-1.0, 1.0>, higher is more similar.
413 |     """
414 |     vec1, vec2 = dict(vec1), dict(vec2)
415 |     if not vec1 or not vec2:
416 |         return 0.0
417 |     vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1)))
418 |     vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2)))
419 |     assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries"
420 |     if len(vec2) < len(vec1):
421 |         vec1, vec2 = vec2, vec1  # swap references so that we iterate over the shorter vector
422 |     result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1))
423 |     result /= vec1len * vec2len  # rescale by vector lengths
424 |     return result
425 | 
426 | 
427 | def isbow(vec):
428 |     """
429 |     Checks if vector passed is in bag of words representation or not.
430 |     Vec is considered to be in bag of words format if it is 2-tuple format.
431 |     """
432 |     if scipy.sparse.issparse(vec):
433 |         vec = vec.todense().tolist()
434 |     try:
435 |         id_, val_ = vec[0]  # checking first value to see if it is in bag of words format by unpacking
436 |         id_, val_ = int(id_), float(val_)
437 |     except IndexError:
438 |         return True  # this is to handle the empty input case
439 |     except Exception:
440 |         return False
441 |     return True
442 | 
443 | 
444 | def kullback_leibler(vec1, vec2, num_features=None):
445 |     """
446 |     A distance metric between two probability distributions.
447 |     Returns a distance value in range <0,1> where values closer to 0 mean less distance (and a higher similarity)
448 |     Uses the scipy.stats.entropy method to identify kullback_leibler convergence value.
449 |     If the distribution draws from a certain number of docs, that value must be passed.
450 |     """
451 |     if scipy.sparse.issparse(vec1):
452 |         vec1 = vec1.toarray()
453 |     if scipy.sparse.issparse(vec2):
454 |         vec2 = vec2.toarray()  # converted both the vectors to dense in case they were in sparse matrix
455 |     if isbow(vec1) and isbow(vec2):  # if they are in bag of words format we make it dense
456 |         if num_features is not None:  # if not None, make as large as the documents drawing from
457 |             dense1 = sparse2full(vec1, num_features)
458 |             dense2 = sparse2full(vec2, num_features)
459 |             return entropy(dense1, dense2)
460 |         else:
461 |             max_len = max(len(vec1), len(vec2))
462 |             dense1 = sparse2full(vec1, max_len)
463 |             dense2 = sparse2full(vec2, max_len)
464 |             return entropy(dense1, dense2)
465 |     else:
466 |         # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
467 |         # the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
468 |         if len(vec1) == 1:
469 |             vec1 = vec1[0]
470 |         if len(vec2) == 1:
471 |             vec2 = vec2[0]
472 |         return scipy.stats.entropy(vec1, vec2)
473 | 
474 | 
475 | def hellinger(vec1, vec2):
476 |     """
477 |     Hellinger distance is a distance metric to quantify the similarity between two probability distributions.
478 |     Distance between distributions will be a number between <0,1>, where 0 is minimum distance (maximum similarity) and 1 is maximum distance (minimum similarity).
479 |     """
480 |     if scipy.sparse.issparse(vec1):
481 |         vec1 = vec1.toarray()
482 |     if scipy.sparse.issparse(vec2):
483 |         vec2 = vec2.toarray()
484 |     if isbow(vec1) and isbow(vec2):
485 |         # if it is a bag of words format, instead of converting to dense we use dictionaries to calculate appropriate distance
486 |         vec1, vec2 = dict(vec1), dict(vec2)
487 |         if len(vec2) < len(vec1):
488 |             vec1, vec2 = vec2, vec1  # swap references so that we iterate over the shorter vector
489 |         sim = np.sqrt(0.5*sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1)))
490 |         return sim
491 |     else:
492 |         sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum())
493 |         return sim
494 | 
495 | 
496 | def jaccard(vec1, vec2):
497 |     """
498 |     A distance metric between bags of words representation.
499 |     Returns 1 minus the intersection divided by union, where union is the sum of the size of the two bags.
500 |     If it is not a bag of words representation, the union and intersection is calculated in the traditional manner.
501 |     Returns a value in range <0,1> where values closer to 0 mean less distance and thus higher similarity.
502 | 
503 |     """
504 | 
505 |     # converting from sparse for easier manipulation
506 |     if scipy.sparse.issparse(vec1):
507 |         vec1 = vec1.toarray()
508 |     if scipy.sparse.issparse(vec2):
509 |         vec2 = vec2.toarray()
510 |     if isbow(vec1) and isbow(vec2):
511 |         # if it's in bow format, we use the following definitions:
512 |         # union = sum of the 'weights' of both the bags
513 |         # intersection = lowest weight for a particular id; basically the number of common words or items
514 |         union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2)
515 |         vec1, vec2 = dict(vec1), dict(vec2)
516 |         intersection = 0.0
517 |         for feature_id, feature_weight in iteritems(vec1):
518 |             intersection += min(feature_weight, vec2.get(feature_id, 0.0))
519 |         return 1 - float(intersection) / float(union)
520 |     else:
521 |         # if it isn't in bag of words format, we can use sets to calculate intersection and union
522 |         if isinstance(vec1, np.ndarray):
523 |             vec1 = vec1.tolist()
524 |         if isinstance(vec2, np.ndarray):
525 |             vec2 = vec2.tolist()
526 |         vec1 = set(vec1)
527 |         vec2 = set(vec2)
528 |         intersection = vec1 & vec2
529 |         union = vec1 | vec2
530 |         return 1 - float(len(intersection)) / float(len(union))
531 | 
532 | 
533 | def jaccard_distance(set1, set2):
534 |     """
535 |     Calculate a distance between set representation (1 minus the intersection divided by union).
536 |     Return a value in range <0, 1> where values closer to 0 mean smaller distance and thus higher similarity.
537 |     """
538 | 
539 |     union_cardinality = len(set1 | set2)
540 |     if union_cardinality == 0:  # Both sets are empty
541 |         return 1.
542 | 
543 |     return 1. - float(len(set1 & set2)) / float(union_cardinality)
544 | 
545 | 
546 | def dirichlet_expectation(alpha):
547 |     """
548 |     For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.
549 | 
550 |     """
551 |     if (len(alpha.shape) == 1):
552 |         result = psi(alpha) - psi(np.sum(alpha))
553 |     else:
554 |         result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
555 |     return result.astype(alpha.dtype)  # keep the same precision as input
556 | 
557 | 
558 | def qr_destroy(la):
559 |     """
560 |     Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process.
561 | 
562 |     Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`,
563 |     because the memory used in `la[0]` is reclaimed earlier.
564 |     """
565 |     a = np.asfortranarray(la[0])
566 |     del la[0], la  # now `a` is the only reference to the input matrix
567 |     m, n = a.shape
568 |     # perform q, r = QR(a); model hacked out of scipy.linalg.qr
569 |     logger.debug("computing QR of %s dense matrix" % str(a.shape))
570 |     geqrf, = get_lapack_funcs(('geqrf',), (a,))
571 |     qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
572 |     qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
573 |     del a  # free up mem
574 |     assert info >= 0
575 |     r = triu(qr[:n, :n])
576 |     if m < n:  # rare case, #features < #topics
577 |         qr = qr[:, :m]  # retains fortran order
578 |     gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
579 |     q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
580 |     q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
581 |     assert info >= 0, "qr failed"
582 |     assert q.flags.f_contiguous
583 |     return q, r
584 | 
585 | 
586 | class MmWriter(object):
587 |     """
588 |     Store a corpus in Matrix Market format.
589 | 
590 |     Note that the output is written one document at a time, not the whole
591 |     matrix at once (unlike scipy.io.mmread). This allows us to process corpora
592 |     which are larger than the available RAM.
593 | 
594 |     NOTE: the output file is created in a single pass through the input corpus, so
595 |     that the input can be a once-only stream (iterator).
596 |     To achieve this, a fake MM header is written first, statistics are collected
597 |     during the pass (shape of the matrix, number of non-zeroes), followed by a seek
598 |     back to the beginning of the file, rewriting the fake header with proper values.
599 | 
600 |     """
601 | 
602 |     HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n'  # the only supported MM format
603 | 
604 |     def __init__(self, fname):
605 |         self.fname = fname
606 |         if fname.endswith(".gz") or fname.endswith('.bz2'):
607 |             raise NotImplementedError("compressed output not supported with MmWriter")
608 |         self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
609 |         self.headers_written = False
610 | 
611 |     def write_headers(self, num_docs, num_terms, num_nnz):
612 |         self.fout.write(MmWriter.HEADER_LINE)
613 | 
614 |         if num_nnz < 0:
615 |             # we don't know the matrix shape/density yet, so only log a general line
616 |             logger.info("saving sparse matrix to %s" % self.fname)
617 |             self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
618 |         else:
619 |             logger.info(
620 |                 "saving sparse %sx%s matrix with %i non-zero entries to %s",
621 |                 num_docs, num_terms, num_nnz, self.fname)
622 |             self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
623 |         self.last_docno = -1
624 |         self.headers_written = True
625 | 
626 |     def fake_headers(self, num_docs, num_terms, num_nnz):
627 |         stats = '%i %i %i' % (num_docs, num_terms, num_nnz)
628 |         if len(stats) > 50:
629 |             raise ValueError('Invalid stats: matrix too large!')
630 |         self.fout.seek(len(MmWriter.HEADER_LINE))
631 |         self.fout.write(utils.to_utf8(stats))
632 | 
633 |     def write_vector(self, docno, vector):
634 |         """
635 |         Write a single sparse vector to the file.
636 | 
637 |         Sparse vector is any iterable yielding (field id, field value) pairs.
638 |         """
639 |         assert self.headers_written, "must write Matrix Market file headers before writing data!"
640 |         assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
641 |         vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
642 |         for termid, weight in vector:  # write term ids in sorted order
643 |             self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))  # +1 because MM format starts counting from 1
644 |         self.last_docno = docno
645 |         return (vector[-1][0], len(vector)) if vector else (-1, 0)
646 | 
647 |     @staticmethod
648 |     def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
649 |         """
650 |         Save the vector space representation of an entire corpus to disk.
651 | 
652 |         Note that the documents are processed one at a time, so the whole corpus
653 |         is allowed to be larger than the available RAM.
654 |         """
655 |         mw = MmWriter(fname)
656 | 
657 |         # write empty headers to the file (with enough space to be overwritten later)
658 |         mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line
659 | 
660 |         # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
661 |         _num_terms, num_nnz = 0, 0
662 |         docno, poslast = -1, -1
663 |         offsets = []
664 |         if hasattr(corpus, 'metadata'):
665 |             orig_metadata = corpus.metadata
666 |             corpus.metadata = metadata
667 |             if metadata:
668 |                 docno2metadata = {}
669 |         else:
670 |             metadata = False
671 |         for docno, doc in enumerate(corpus):
672 |             if metadata:
673 |                 bow, data = doc
674 |                 docno2metadata[docno] = data
675 |             else:
676 |                 bow = doc
677 |             if docno % progress_cnt == 0:
678 |                 logger.info("PROGRESS: saving document #%i" % docno)
679 |             if index:
680 |                 posnow = mw.fout.tell()
681 |                 if posnow == poslast:
682 |                     offsets[-1] = -1
683 |                 offsets.append(posnow)
684 |                 poslast = posnow
685 |             max_id, veclen = mw.write_vector(docno, bow)
686 |             _num_terms = max(_num_terms, 1 + max_id)
687 |             num_nnz += veclen
688 |         if metadata:
689 |             utils.pickle(docno2metadata, fname + '.metadata.cpickle')
690 |             corpus.metadata = orig_metadata
691 | 
692 |         num_docs = docno + 1
693 |         num_terms = num_terms or _num_terms
694 | 
695 |         if num_docs * num_terms != 0:
696 |             logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % (
697 |                 num_docs, num_terms,
698 |                 100.0 * num_nnz / (num_docs * num_terms),
699 |                 num_nnz,
700 |                 num_docs * num_terms))
701 | 
702 |         # now write proper headers, by seeking and overwriting the spaces written earlier
703 |         mw.fake_headers(num_docs, num_terms, num_nnz)
704 | 
705 |         mw.close()
706 |         if index:
707 |             return offsets
708 | 
709 |     def __del__(self):
710 |         """
711 |         Automatic destructor which closes the underlying file.
712 | 
713 |         There must be no circular references contained in the object for __del__
714 |         to work! Closing the file explicitly via the close() method is preferred
715 |         and safer.
716 |         """
717 |         self.close()  # does nothing if called twice (on an already closed file), so no worries
718 | 
719 |     def close(self):
720 |         logger.debug("closing %s" % self.fname)
721 |         if hasattr(self, 'fout'):
722 |             self.fout.close()
723 | #endclass MmWriter
724 | 
725 | 
726 | class MmReader(object):
727 |     """
728 |     Wrap a term-document matrix on disk (in matrix-market format), and present it
729 |     as an object which supports iteration over the rows (~documents).
730 | 
731 |     Note that the file is read into memory one document at a time, not the whole
732 |     matrix at once (unlike scipy.io.mmread). This allows us to process corpora
733 |     which are larger than the available RAM.
734 |     """
735 |     def __init__(self, input, transposed=True):
736 |         """
737 |         Initialize the matrix reader.
738 | 
739 |         The `input` refers to a file on local filesystem, which is expected to
740 |         be in the sparse (coordinate) Matrix Market format. Documents are assumed
741 |         to be rows of the matrix (and document features are columns).
742 | 
743 |         `input` is either a string (file path) or a file-like object that supports
744 |         `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
745 |         """
746 |         logger.info("initializing corpus reader from %s" % input)
747 |         self.input, self.transposed = input, transposed
748 |         with utils.file_or_filename(self.input) as lines:
749 |             try:
750 |                 header = utils.to_unicode(next(lines)).strip()
751 |                 if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
752 |                     raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
753 |                                     (self.input, header))
754 |             except StopIteration:
755 |                 pass
756 | 
757 |             self.num_docs = self.num_terms = self.num_nnz = 0
758 |             for lineno, line in enumerate(lines):
759 |                 line = utils.to_unicode(line)
760 |                 if not line.startswith('%'):
761 |                     self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
762 |                     if not self.transposed:
763 |                         self.num_docs, self.num_terms = self.num_terms, self.num_docs
764 |                     break
765 | 
766 |         logger.info(
767 |             "accepted corpus with %i documents, %i features, %i non-zero entries",
768 |             self.num_docs, self.num_terms, self.num_nnz)
769 | 
770 |     def __len__(self):
771 |         return self.num_docs
772 | 
773 |     def __str__(self):
774 |         return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
775 |                 (self.num_docs, self.num_terms, self.num_nnz))
776 | 
777 |     def skip_headers(self, input_file):
778 |         """
779 |         Skip file headers that appear before the first document.
780 |         """
781 |         for line in input_file:
782 |             if line.startswith(b'%'):
783 |                 continue
784 |             break
785 | 
786 |     def __iter__(self):
787 |         """
788 |         Iteratively yield vectors from the underlying file, in the format (row_no, vector),
789 |         where vector is a list of (col_no, value) 2-tuples.
790 | 
791 |         Note that the total number of vectors returned is always equal to the
792 |         number of rows specified in the header; empty documents are inserted and
793 |         yielded where appropriate, even if they are not explicitly stored in the
794 |         Matrix Market file.
795 |         """
796 |         with utils.file_or_filename(self.input) as lines:
797 |             self.skip_headers(lines)
798 | 
799 |             previd = -1
800 |             for line in lines:
801 |                 docid, termid, val = utils.to_unicode(line).split()  # needed for python3
802 |                 if not self.transposed:
803 |                     termid, docid = docid, termid
804 |                 docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
805 |                 assert previd <= docid, "matrix columns must come in ascending order"
806 |                 if docid != previd:
807 |                     # change of document: return the document read so far (its id is prevId)
808 |                     if previd >= 0:
809 |                         yield previd, document
810 | 
811 |                     # return implicit (empty) documents between previous id and new id
812 |                     # too, to keep consistent document numbering and corpus length
813 |                     for previd in xrange(previd + 1, docid):
814 |                         yield previd, []
815 | 
816 |                     # from now on start adding fields to a new document, with a new id
817 |                     previd = docid
818 |                     document = []
819 | 
820 |                 document.append((termid, val,))  # add another field to the current document
821 | 
822 |         # handle the last document, as a special case
823 |         if previd >= 0:
824 |             yield previd, document
825 | 
826 |         # return empty documents between the last explicit document and the number
827 |         # of documents as specified in the header
828 |         for previd in xrange(previd + 1, self.num_docs):
829 |             yield previd, []
830 | 
831 |     def docbyoffset(self, offset):
832 |         """Return document at file offset `offset` (in bytes)"""
833 |         # empty documents are not stored explicitly in MM format, so the index marks
834 |         # them with a special offset, -1.
835 |         if offset == -1:
836 |             return []
837 |         if isinstance(self.input, string_types):
838 |             fin = utils.smart_open(self.input)
839 |         else:
840 |             fin = self.input
841 | 
842 |         fin.seek(offset)  # works for gzip/bz2 input, too
843 |         previd, document = -1, []
844 |         for line in fin:
845 |             docid, termid, val = line.split()
846 |             if not self.transposed:
847 |                 termid, docid = docid, termid
848 |             docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
849 |             assert previd <= docid, "matrix columns must come in ascending order"
850 |             if docid != previd:
851 |                 if previd >= 0:
852 |                     return document
853 |                 previd = docid
854 | 
855 |             document.append((termid, val,))  # add another field to the current document
856 |         return document
857 | #endclass MmReader
858 | 


--------------------------------------------------------------------------------
/word2vec/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | setup(name="mWord2vec", ext_modules=cythonize("word2vec_inner.pyx"))
4 | 


--------------------------------------------------------------------------------
/word2vec/utils.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python
   2 | # -*- coding: utf-8 -*-
   3 | #
   4 | # Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
   5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
   6 | 
   7 | """
   8 | This module contains various general utility functions.
   9 | """
  10 | 
  11 | from __future__ import with_statement
  12 | 
  13 | import logging
  14 | import warnings
  15 | 
  16 | logger = logging.getLogger(__name__)
  17 | 
  18 | try:
  19 |     from html.entities import name2codepoint as n2cp
  20 | except ImportError:
  21 |     from htmlentitydefs import name2codepoint as n2cp
  22 | try:
  23 |     import cPickle as _pickle
  24 | except ImportError:
  25 |     import pickle as _pickle
  26 | 
  27 | import re
  28 | import unicodedata
  29 | import os
  30 | import random
  31 | import itertools
  32 | import tempfile
  33 | from functools import wraps  # for `synchronous` function lock
  34 | import multiprocessing
  35 | import shutil
  36 | import sys
  37 | from contextlib import contextmanager
  38 | import subprocess
  39 | 
  40 | import numpy as np
  41 | import numbers
  42 | import scipy.sparse
  43 | 
  44 | if sys.version_info[0] >= 3:
  45 |     unicode = str
  46 | 
  47 | from six import iterkeys, iteritems, u, string_types, unichr
  48 | from six.moves import xrange
  49 | 
  50 | try:
  51 |     from smart_open import smart_open
  52 | except ImportError:
  53 |     logger.info("smart_open library not found; falling back to local-filesystem-only")
  54 | 
  55 |     def make_closing(base, **attrs):
  56 |         """
  57 |         Add support for `with Base(attrs) as fout:` to the base class if it's missing.
  58 |         The base class' `close()` method will be called on context exit, to always close the file properly.
  59 | 
  60 |         This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise
  61 |         raise "AttributeError: GzipFile instance has no attribute '__exit__'".
  62 | 
  63 |         """
  64 |         if not hasattr(base, '__enter__'):
  65 |             attrs['__enter__'] = lambda self: self
  66 |         if not hasattr(base, '__exit__'):
  67 |             attrs['__exit__'] = lambda self, type, value, traceback: self.close()
  68 |         return type('Closing' + base.__name__, (base, object), attrs)
  69 | 
  70 |     def smart_open(fname, mode='rb'):
  71 |         _, ext = os.path.splitext(fname)
  72 |         if ext == '.bz2':
  73 |             from bz2 import BZ2File
  74 |             return make_closing(BZ2File)(fname, mode)
  75 |         if ext == '.gz':
  76 |             from gzip import GzipFile
  77 |             return make_closing(GzipFile)(fname, mode)
  78 |         return open(fname, mode)
  79 | 
  80 | 
  81 | PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
  82 | RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
  83 | 
  84 | 
  85 | def get_random_state(seed):
  86 |     """
  87 |     Turn seed into a np.random.RandomState instance.
  88 |     Method originally from maciejkula/glove-python, and written by @joshloyal.
  89 |     """
  90 |     if seed is None or seed is np.random:
  91 |         return np.random.mtrand._rand
  92 |     if isinstance(seed, (numbers.Integral, np.integer)):
  93 |         return np.random.RandomState(seed)
  94 |     if isinstance(seed, np.random.RandomState):
  95 |         return seed
  96 |     raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
  97 | 
  98 | 
  99 | def synchronous(tlockname):
 100 |     """
 101 |     A decorator to place an instance-based lock around a method.
 102 | 
 103 |     Adapted from http://model.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/
 104 |     """
 105 |     def _synched(func):
 106 |         @wraps(func)
 107 |         def _synchronizer(self, *args, **kwargs):
 108 |             tlock = getattr(self, tlockname)
 109 |             logger.debug("acquiring lock %r for %s" % (tlockname, func.__name__))
 110 | 
 111 |             with tlock:  # use lock as a context manager to perform safe acquire/release pairs
 112 |                 logger.debug("acquired lock %r for %s" % (tlockname, func.__name__))
 113 |                 result = func(self, *args, **kwargs)
 114 |                 logger.debug("releasing lock %r for %s" % (tlockname, func.__name__))
 115 |                 return result
 116 |         return _synchronizer
 117 |     return _synched
 118 | 
 119 | 
 120 | class NoCM(object):
 121 |     def acquire(self):
 122 |         pass
 123 | 
 124 |     def release(self):
 125 |         pass
 126 | 
 127 |     def __enter__(self):
 128 |         pass
 129 | 
 130 |     def __exit__(self, type, value, traceback):
 131 |         pass
 132 | nocm = NoCM()
 133 | 
 134 | 
 135 | @contextmanager
 136 | def file_or_filename(input):
 137 |     """
 138 |     Return a file-like object ready to be read from the beginning. `input` is either
 139 |     a filename (gz/bz2 also supported) or a file-like object supporting seek.
 140 | 
 141 |     """
 142 |     if isinstance(input, string_types):
 143 |         # input was a filename: open as file
 144 |         yield smart_open(input)
 145 |     else:
 146 |         # input already a file-like object; just reset to the beginning
 147 |         input.seek(0)
 148 |         yield input
 149 | 
 150 | 
 151 | def deaccent(text):
 152 |     """
 153 |     Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.
 154 | 
 155 |     Return input string with accents removed, as unicode.
 156 | 
 157 |     >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
 158 |     u'Sef chomutovskych komunistu dostal postou bily prasek'
 159 | 
 160 |     """
 161 |     if not isinstance(text, unicode):
 162 |         # assume utf8 for byte strings, use default (strict) error handling
 163 |         text = text.decode('utf8')
 164 |     norm = unicodedata.normalize("NFD", text)
 165 |     result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
 166 |     return unicodedata.normalize("NFC", result)
 167 | 
 168 | 
 169 | def copytree_hardlink(source, dest):
 170 |     """
 171 |     Recursively copy a directory ala shutils.copytree, but hardlink files
 172 |     instead of copying. Available on UNIX systems only.
 173 |     """
 174 |     copy2 = shutil.copy2
 175 |     try:
 176 |         shutil.copy2 = os.link
 177 |         shutil.copytree(source, dest)
 178 |     finally:
 179 |         shutil.copy2 = copy2
 180 | 
 181 | 
 182 | def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False):
 183 |     """
 184 |     Iteratively yield tokens as unicode strings, removing accent marks
 185 |     and optionally lowercasing the unidoce string by assigning True
 186 |     to one of the parameters, lowercase, to_lower, or lower.
 187 | 
 188 |     Input text may be either unicode or utf8-encoded byte string.
 189 | 
 190 |     The tokens on output are maximal contiguous sequences of alphabetic
 191 |     characters (no digits!).
 192 | 
 193 |     >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
 194 |     [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
 195 | 
 196 |     """
 197 |     lowercase = lowercase or to_lower or lower
 198 |     text = to_unicode(text, errors=errors)
 199 |     if lowercase:
 200 |         text = text.lower()
 201 |     if deacc:
 202 |         text = deaccent(text)
 203 |     for match in PAT_ALPHABETIC.finditer(text):
 204 |         yield match.group()
 205 | 
 206 | 
 207 | def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
 208 |     """
 209 |     Convert a document into a list of tokens.
 210 | 
 211 |     This lowercases, tokenizes, de-accents (optional). -- the output are final
 212 |     tokens = unicode strings, that won't be processed any further.
 213 | 
 214 |     """
 215 |     tokens = [
 216 |         token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore')
 217 |         if min_len <= len(token) <= max_len and not token.startswith('_')
 218 |     ]
 219 |     return tokens
 220 | 
 221 | 
 222 | def any2utf8(text, errors='strict', encoding='utf8'):
 223 |     """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
 224 |     if isinstance(text, unicode):
 225 |         return text.encode('utf8')
 226 |     # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
 227 |     return unicode(text, encoding, errors=errors).encode('utf8')
 228 | to_utf8 = any2utf8
 229 | 
 230 | 
 231 | def any2unicode(text, encoding='utf8', errors='strict'):
 232 |     """Convert a string (bytestring in `encoding` or unicode), to unicode."""
 233 |     if isinstance(text, unicode):
 234 |         return text
 235 |     return unicode(text, encoding, errors=errors)
 236 | to_unicode = any2unicode
 237 | 
 238 | 
 239 | def call_on_class_only(*args, **kwargs):
 240 |     """Raise exception when load methods are called on instance"""
 241 |     raise AttributeError('This method should be called on a class object.')
 242 | 
 243 | 
 244 | class SaveLoad(object):
 245 |     """
 246 |     Objects which inherit from this class have save/load functions, which un/pickle
 247 |     them to disk.
 248 | 
 249 |     This uses pickle for de/serializing, so objects must not contain
 250 |     unpicklable attributes, such as lambda functions etc.
 251 | 
 252 |     """
 253 |     @classmethod
 254 |     def load(cls, fname, mmap=None):
 255 |         """
 256 |         Load a previously saved object from file (also see `save`).
 257 | 
 258 |         If the object was saved with large arrays stored separately, you can load
 259 |         these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use
 260 |         mmap, load large arrays as normal objects.
 261 | 
 262 |         If the file being loaded is compressed (either '.gz' or '.bz2'), then
 263 |         `mmap=None` must be set.  Load will raise an `IOError` if this condition
 264 |         is encountered.
 265 | 
 266 |         """
 267 |         logger.info("loading %s object from %s" % (cls.__name__, fname))
 268 | 
 269 |         compress, subname = SaveLoad._adapt_by_suffix(fname)
 270 | 
 271 |         obj = unpickle(fname)
 272 |         obj._load_specials(fname, mmap, compress, subname)
 273 |         logger.info("loaded %s", fname)
 274 |         return obj
 275 | 
 276 |     def _load_specials(self, fname, mmap, compress, subname):
 277 |         """
 278 |         Loads any attributes that were stored specially, and gives the same
 279 |         opportunity to recursively included SaveLoad instances.
 280 | 
 281 |         """
 282 |         mmap_error = lambda x, y: IOError(
 283 |             'Cannot mmap compressed object %s in file %s. ' % (x, y) +
 284 |             'Use `load(fname, mmap=None)` or uncompress files manually.')
 285 | 
 286 |         for attrib in getattr(self, '__recursive_saveloads', []):
 287 |             cfname = '.'.join((fname, attrib))
 288 |             logger.info("loading %s recursively from %s.* with mmap=%s" % (
 289 |                 attrib, cfname, mmap))
 290 |             getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)
 291 | 
 292 |         for attrib in getattr(self, '__numpys', []):
 293 |             logger.info("loading %s from %s with mmap=%s" % (
 294 |                 attrib, subname(fname, attrib), mmap))
 295 | 
 296 |             if compress:
 297 |                 if mmap:
 298 |                     raise mmap_error(attrib, subname(fname, attrib))
 299 | 
 300 |                 val = np.load(subname(fname, attrib))['val']
 301 |             else:
 302 |                 val = np.load(subname(fname, attrib), mmap_mode=mmap)
 303 | 
 304 |             setattr(self, attrib, val)
 305 | 
 306 |         for attrib in getattr(self, '__scipys', []):
 307 |             logger.info("loading %s from %s with mmap=%s" % (
 308 |                 attrib, subname(fname, attrib), mmap))
 309 |             sparse = unpickle(subname(fname, attrib))
 310 |             if compress:
 311 |                 if mmap:
 312 |                     raise mmap_error(attrib, subname(fname, attrib))
 313 | 
 314 |                 with np.load(subname(fname, attrib, 'sparse')) as f:
 315 |                     sparse.data = f['data']
 316 |                     sparse.indptr = f['indptr']
 317 |                     sparse.indices = f['indices']
 318 |             else:
 319 |                 sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
 320 |                 sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
 321 |                 sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)
 322 | 
 323 |             setattr(self, attrib, sparse)
 324 | 
 325 |         for attrib in getattr(self, '__ignoreds', []):
 326 |             logger.info("setting ignored attribute %s to None" % (attrib))
 327 |             setattr(self, attrib, None)
 328 | 
 329 |     @staticmethod
 330 |     def _adapt_by_suffix(fname):
 331 |         """Give appropriate compress setting and filename formula"""
 332 |         if fname.endswith('.gz') or fname.endswith('.bz2'):
 333 |             compress = True
 334 |             subname = lambda *args: '.'.join(list(args) + ['npz'])
 335 |         else:
 336 |             compress = False
 337 |             subname = lambda *args: '.'.join(list(args) + ['npy'])
 338 |         return (compress, subname)
 339 | 
 340 |     def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2,
 341 |                     ignore=frozenset(), pickle_protocol=2):
 342 |         """
 343 |         Save the object to file (also see `load`).
 344 | 
 345 |         If `separately` is None, automatically detect large
 346 |         numpy/scipy.sparse arrays in the object being stored, and store
 347 |         them into separate files. This avoids pickle memory errors and
 348 |         allows mmap'ing large arrays back on load efficiently.
 349 | 
 350 |         You can also set `separately` manually, in which case it must be
 351 |         a list of attribute names to be stored in separate files. The
 352 |         automatic check is not performed in this case.
 353 | 
 354 |         `ignore` is a set of attribute names to *not* serialize (file
 355 |         handles, caches etc). On subsequent load() these attributes will
 356 |         be set to None.
 357 | 
 358 |         `pickle_protocol` defaults to 2 so the pickled object can be imported
 359 |         in both Python 2 and 3.
 360 | 
 361 |         """
 362 |         logger.info(
 363 |             "saving %s object under %s, separately %s" % (
 364 |                 self.__class__.__name__, fname, separately))
 365 | 
 366 |         compress, subname = SaveLoad._adapt_by_suffix(fname)
 367 | 
 368 |         restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,
 369 |                                        compress, subname)
 370 |         try:
 371 |             pickle(self, fname, protocol=pickle_protocol)
 372 |         finally:
 373 |             # restore attribs handled specially
 374 |             for obj, asides in restores:
 375 |                 for attrib, val in iteritems(asides):
 376 |                     setattr(obj, attrib, val)
 377 |         logger.info("saved %s", fname)
 378 | 
 379 |     def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
 380 |         """
 381 |         Save aside any attributes that need to be handled separately, including
 382 |         by recursion any attributes that are themselves SaveLoad instances.
 383 | 
 384 |         Returns a list of (obj, {attrib: value, ...}) settings that the caller
 385 |         should use to restore each object's attributes that were set aside
 386 |         during the default pickle().
 387 | 
 388 |         """
 389 |         asides = {}
 390 |         sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
 391 |         if separately is None:
 392 |             separately = []
 393 |             for attrib, val in iteritems(self.__dict__):
 394 |                 if isinstance(val, np.ndarray) and val.size >= sep_limit:
 395 |                     separately.append(attrib)
 396 |                 elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
 397 |                     separately.append(attrib)
 398 | 
 399 |         # whatever's in `separately` or `ignore` at this point won't get pickled
 400 |         for attrib in separately + list(ignore):
 401 |             if hasattr(self, attrib):
 402 |                 asides[attrib] = getattr(self, attrib)
 403 |                 delattr(self, attrib)
 404 | 
 405 |         recursive_saveloads = []
 406 |         restores = []
 407 |         for attrib, val in iteritems(self.__dict__):
 408 |             if hasattr(val, '_save_specials'):  # better than 'isinstance(val, SaveLoad)' if IPython reloading
 409 |                 recursive_saveloads.append(attrib)
 410 |                 cfname = '.'.join((fname, attrib))
 411 |                 restores.extend(val._save_specials(
 412 |                     cfname, None, sep_limit, ignore,
 413 |                     pickle_protocol, compress, subname))
 414 | 
 415 |         try:
 416 |             numpys, scipys, ignoreds = [], [], []
 417 |             for attrib, val in iteritems(asides):
 418 |                 if isinstance(val, np.ndarray) and attrib not in ignore:
 419 |                     numpys.append(attrib)
 420 |                     logger.info("storing np array '%s' to %s" % (
 421 |                         attrib, subname(fname, attrib)))
 422 | 
 423 |                     if compress:
 424 |                         np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
 425 |                     else:
 426 |                         np.save(subname(fname, attrib), np.ascontiguousarray(val))
 427 | 
 428 |                 elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
 429 |                     scipys.append(attrib)
 430 |                     logger.info("storing scipy.sparse array '%s' under %s" % (
 431 |                         attrib, subname(fname, attrib)))
 432 | 
 433 |                     if compress:
 434 |                         np.savez_compressed(
 435 |                             subname(fname, attrib, 'sparse'),
 436 |                             data=val.data,
 437 |                             indptr=val.indptr,
 438 |                             indices=val.indices)
 439 |                     else:
 440 |                         np.save(subname(fname, attrib, 'data'), val.data)
 441 |                         np.save(subname(fname, attrib, 'indptr'), val.indptr)
 442 |                         np.save(subname(fname, attrib, 'indices'), val.indices)
 443 | 
 444 |                     data, indptr, indices = val.data, val.indptr, val.indices
 445 |                     val.data, val.indptr, val.indices = None, None, None
 446 | 
 447 |                     try:
 448 |                         # store array-less object
 449 |                         pickle(val, subname(fname, attrib), protocol=pickle_protocol)
 450 |                     finally:
 451 |                         val.data, val.indptr, val.indices = data, indptr, indices
 452 |                 else:
 453 |                     logger.info("not storing attribute %s" % (attrib))
 454 |                     ignoreds.append(attrib)
 455 | 
 456 |             self.__dict__['__numpys'] = numpys
 457 |             self.__dict__['__scipys'] = scipys
 458 |             self.__dict__['__ignoreds'] = ignoreds
 459 |             self.__dict__['__recursive_saveloads'] = recursive_saveloads
 460 |         except:
 461 |             # restore the attributes if exception-interrupted
 462 |             for attrib, val in iteritems(asides):
 463 |                 setattr(self, attrib, val)
 464 |             raise
 465 |         return restores + [(self, asides)]
 466 | 
 467 |     def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2,
 468 |              ignore=frozenset(), pickle_protocol=2):
 469 |         """
 470 |         Save the object to file (also see `load`).
 471 | 
 472 |         `fname_or_handle` is either a string specifying the file name to
 473 |         save to, or an open file-like object which can be written to. If
 474 |         the object is a file handle, no special array handling will be
 475 |         performed; all attributes will be saved to the same file.
 476 | 
 477 |         If `separately` is None, automatically detect large
 478 |         numpy/scipy.sparse arrays in the object being stored, and store
 479 |         them into separate files. This avoids pickle memory errors and
 480 |         allows mmap'ing large arrays back on load efficiently.
 481 | 
 482 |         You can also set `separately` manually, in which case it must be
 483 |         a list of attribute names to be stored in separate files. The
 484 |         automatic check is not performed in this case.
 485 | 
 486 |         `ignore` is a set of attribute names to *not* serialize (file
 487 |         handles, caches etc). On subsequent load() these attributes will
 488 |         be set to None.
 489 | 
 490 |         `pickle_protocol` defaults to 2 so the pickled object can be imported
 491 |         in both Python 2 and 3.
 492 | 
 493 |         """
 494 |         try:
 495 |             _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
 496 |             logger.info("saved %s object" % self.__class__.__name__)
 497 |         except TypeError:  # `fname_or_handle` does not have write attribute
 498 |             self._smart_save(fname_or_handle, separately, sep_limit, ignore,
 499 |                              pickle_protocol=pickle_protocol)
 500 | #endclass SaveLoad
 501 | 
 502 | 
 503 | def identity(p):
 504 |     """Identity fnc, for flows that don't accept lambda (pickling etc)."""
 505 |     return p
 506 | 
 507 | 
 508 | def get_max_id(corpus):
 509 |     """
 510 |     Return the highest feature id that appears in the corpus.
 511 | 
 512 |     For empty corpora (no features at all), return -1.
 513 | 
 514 |     """
 515 |     maxid = -1
 516 |     for document in corpus:
 517 |         maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document]))  # [-1] to avoid exceptions from max(empty)
 518 |     return maxid
 519 | 
 520 | 
 521 | class FakeDict(object):
 522 |     """
 523 |     Objects of this class act as dictionaries that map integer->str(integer), for
 524 |     a specified range of integers <0, num_terms).
 525 | 
 526 |     This is meant to avoid allocating real dictionaries when `num_terms` is huge, which
 527 |     is a waste of memory.
 528 | 
 529 |     """
 530 |     def __init__(self, num_terms):
 531 |         self.num_terms = num_terms
 532 | 
 533 |     def __str__(self):
 534 |         return "FakeDict(num_terms=%s)" % self.num_terms
 535 | 
 536 |     def __getitem__(self, val):
 537 |         if 0 <= val < self.num_terms:
 538 |             return str(val)
 539 |         raise ValueError("internal id out of bounds (%s, expected <0..%s))" %
 540 |                          (val, self.num_terms))
 541 | 
 542 |     def iteritems(self):
 543 |         for i in xrange(self.num_terms):
 544 |             yield i, str(i)
 545 | 
 546 |     def keys(self):
 547 |         """
 548 |         Override the dict.keys() function, which is used to determine the maximum
 549 |         internal id of a corpus = the vocabulary dimensionality.
 550 | 
 551 |         HACK: To avoid materializing the whole `range(0, self.num_terms)`, this returns
 552 |         the highest id = `[self.num_terms - 1]` only.
 553 | 
 554 |         """
 555 |         return [self.num_terms - 1]
 556 | 
 557 |     def __len__(self):
 558 |         return self.num_terms
 559 | 
 560 |     def get(self, val, default=None):
 561 |         if 0 <= val < self.num_terms:
 562 |             return str(val)
 563 |         return default
 564 | 
 565 | 
 566 | def dict_from_corpus(corpus):
 567 |     """
 568 |     Scan corpus for all word ids that appear in it, then construct and return a mapping
 569 |     which maps each `wordId -> str(wordId)`.
 570 | 
 571 |     This function is used whenever *words* need to be displayed (as opposed to just
 572 |     their ids) but no wordId->word mapping was provided. The resulting mapping
 573 |     only covers words actually used in the corpus, up to the highest wordId found.
 574 | 
 575 |     """
 576 |     num_terms = 1 + get_max_id(corpus)
 577 |     id2word = FakeDict(num_terms)
 578 |     return id2word
 579 | 
 580 | 
 581 | def is_corpus(obj):
 582 |     """
 583 |     Check whether `obj` is a corpus. Return (is_corpus, new) 2-tuple, where
 584 |     `new is obj` if `obj` was an iterable, or `new` yields the same sequence as
 585 |     `obj` if it was an iterator.
 586 | 
 587 |     `obj` is a corpus if it supports iteration over documents, where a document
 588 |     is in turn anything that acts as a sequence of 2-tuples (int, float).
 589 | 
 590 |     Note: An "empty" corpus (empty input sequence) is ambiguous, so in this case the
 591 |     result is forcefully defined as `is_corpus=False`.
 592 | 
 593 |     """
 594 |     try:
 595 |         if 'Corpus' in obj.__class__.__name__:  # the most common case, quick hack
 596 |             return True, obj
 597 |     except:
 598 |         pass
 599 |     try:
 600 |         if hasattr(obj, 'next') or hasattr(obj, '__next__'):
 601 |             # the input is an iterator object, meaning once we call next()
 602 |             # that element could be gone forever. we must be careful to put
 603 |             # whatever we retrieve back again
 604 |             doc1 = next(obj)
 605 |             obj = itertools.chain([doc1], obj)
 606 |         else:
 607 |             doc1 = next(iter(obj))  # empty corpus is resolved to False here
 608 |         if len(doc1) == 0:  # sparse documents must have a __len__ function (list, tuple...)
 609 |             return True, obj  # the first document is empty=>assume this is a corpus
 610 |         id1, val1 = next(iter(doc1))  # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here
 611 |         id1, val1 = int(id1), float(val1)  # must be a 2-tuple (integer, float)
 612 |     except Exception:
 613 |         return False, obj
 614 |     return True, obj
 615 | 
 616 | 
 617 | def get_my_ip():
 618 |     """
 619 |     Try to obtain our external ip (from the pyro nameserver's point of view)
 620 | 
 621 |     This tries to sidestep the issue of bogus `/etc/hosts` entries and other
 622 |     local misconfigurations, which often mess up hostname resolution.
 623 | 
 624 |     If all else fails, fall back to simple `socket.gethostbyname()` lookup.
 625 | 
 626 |     """
 627 |     import socket
 628 |     try:
 629 |         import Pyro4
 630 |         # we know the nameserver must exist, so use it as our anchor point
 631 |         ns = Pyro4.naming.locateNS()
 632 |         s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
 633 |         s.connect((ns._pyroUri.host, ns._pyroUri.port))
 634 |         result, port = s.getsockname()
 635 |     except:
 636 |         try:
 637 |             # see what ifconfig says about our default interface
 638 |             import commands
 639 |             result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:]
 640 |             if len(result.split('.')) != 4:
 641 |                 raise Exception()
 642 |         except:
 643 |             # give up, leave the resolution to gethostbyname
 644 |             result = socket.gethostbyname(socket.gethostname())
 645 |     return result
 646 | 
 647 | 
 648 | class RepeatCorpus(SaveLoad):
 649 |     """
 650 |     Used in the tutorial on distributed computing and likely not useful anywhere else.
 651 | 
 652 |     """
 653 |     def __init__(self, corpus, reps):
 654 |         """
 655 |         Wrap a `corpus` as another corpus of length `reps`. This is achieved by
 656 |         repeating documents from `corpus` over and over again, until the requested
 657 |         length `len(result)==reps` is reached. Repetition is done
 658 |         on-the-fly=efficiently, via `itertools`.
 659 | 
 660 |         >>> corpus = [[(1, 0.5)], []] # 2 documents
 661 |         >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents
 662 |         [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]]
 663 | 
 664 |         """
 665 |         self.corpus = corpus
 666 |         self.reps = reps
 667 | 
 668 |     def __iter__(self):
 669 |         return itertools.islice(itertools.cycle(self.corpus), self.reps)
 670 | 
 671 | 
 672 | class RepeatCorpusNTimes(SaveLoad):
 673 | 
 674 |     def __init__(self, corpus, n):
 675 |         """
 676 |         Repeat a `corpus` `n` times.
 677 | 
 678 |         >>> corpus = [[(1, 0.5)], []]
 679 |         >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times
 680 |         [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]
 681 |         """
 682 |         self.corpus = corpus
 683 |         self.n = n
 684 | 
 685 |     def __iter__(self):
 686 |         for _ in xrange(self.n):
 687 |             for document in self.corpus:
 688 |                 yield document
 689 | 
 690 | 
 691 | class ClippedCorpus(SaveLoad):
 692 |     def __init__(self, corpus, max_docs=None):
 693 |         """
 694 |         Return a corpus that is the "head" of input iterable `corpus`.
 695 | 
 696 |         Any documents after `max_docs` are ignored. This effectively limits the
 697 |         length of the returned corpus to <= `max_docs`. Set `max_docs=None` for
 698 |         "no limit", effectively wrapping the entire input corpus.
 699 | 
 700 |         """
 701 |         self.corpus = corpus
 702 |         self.max_docs = max_docs
 703 | 
 704 |     def __iter__(self):
 705 |         return itertools.islice(self.corpus, self.max_docs)
 706 | 
 707 |     def __len__(self):
 708 |         return min(self.max_docs, len(self.corpus))
 709 | 
 710 | 
 711 | class SlicedCorpus(SaveLoad):
 712 |     def __init__(self, corpus, slice_):
 713 |         """
 714 |         Return a corpus that is the slice of input iterable `corpus`.
 715 | 
 716 |         Negative slicing can only be used if the corpus is indexable.
 717 |         Otherwise, the corpus will be iterated over.
 718 | 
 719 |         Slice can also be a np.ndarray to support fancy indexing.
 720 | 
 721 |         NOTE: calculating the size of a SlicedCorpus is expensive
 722 |         when using a slice as the corpus has to be iterated over once.
 723 |         Using a list or np.ndarray does not have this drawback, but
 724 |         consumes more memory.
 725 |         """
 726 |         self.corpus = corpus
 727 |         self.slice_ = slice_
 728 |         self.length = None
 729 | 
 730 |     def __iter__(self):
 731 |         if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:
 732 |             return (self.corpus.docbyoffset(i) for i in
 733 |                     self.corpus.index[self.slice_])
 734 |         else:
 735 |             return itertools.islice(self.corpus, self.slice_.start,
 736 |                                     self.slice_.stop, self.slice_.step)
 737 | 
 738 |     def __len__(self):
 739 |         # check cached length, calculate if needed
 740 |         if self.length is None:
 741 |             if isinstance(self.slice_, (list, np.ndarray)):
 742 |                 self.length = len(self.slice_)
 743 |             else:
 744 |                 self.length = sum(1 for x in self)
 745 | 
 746 |         return self.length
 747 | 
 748 | 
 749 | def safe_unichr(intval):
 750 |     try:
 751 |         return unichr(intval)
 752 |     except ValueError:
 753 |         # ValueError: unichr() arg not in range(0x10000) (narrow Python build)
 754 |         s = "\\U%08x" % intval
 755 |         # return UTF16 surrogate pair
 756 |         return s.decode('unicode-escape')
 757 | 
 758 | 
 759 | def decode_htmlentities(text):
 760 |     """
 761 |     Decode HTML entities in text, coded as hex, decimal or named.
 762 | 
 763 |     Adapted from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py
 764 | 
 765 |     >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
 766 |     >>> print(decode_htmlentities(u).encode('UTF-8'))
 767 |     E tu vivrai nel terrore - L'aldilà (1981)
 768 |     >>> print(decode_htmlentities("l&#39;eau"))
 769 |     l'eau
 770 |     >>> print(decode_htmlentities("foo &lt; bar"))
 771 |     foo < bar
 772 | 
 773 |     """
 774 |     def substitute_entity(match):
 775 |         try:
 776 |             ent = match.group(3)
 777 |             if match.group(1) == "#":
 778 |                 # decoding by number
 779 |                 if match.group(2) == '':
 780 |                     # number is in decimal
 781 |                     return safe_unichr(int(ent))
 782 |                 elif match.group(2) in ['x', 'X']:
 783 |                     # number is in hex
 784 |                     return safe_unichr(int(ent, 16))
 785 |             else:
 786 |                 # they were using a name
 787 |                 cp = n2cp.get(ent)
 788 |                 if cp:
 789 |                     return safe_unichr(cp)
 790 |                 else:
 791 |                     return match.group()
 792 |         except:
 793 |             # in case of errors, return original input
 794 |             return match.group()
 795 | 
 796 |     return RE_HTML_ENTITY.sub(substitute_entity, text)
 797 | 
 798 | 
 799 | def chunkize_serial(iterable, chunksize, as_numpy=False):
 800 |     """
 801 |     Return elements from the iterable in `chunksize`-ed lists. The last returned
 802 |     element may be smaller (if length of collection is not divisible by `chunksize`).
 803 | 
 804 |     >>> print(list(grouper(range(10), 3)))
 805 |     [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
 806 | 
 807 |     """
 808 |     it = iter(iterable)
 809 |     while True:
 810 |         if as_numpy:
 811 |             # convert each document to a 2d numpy array (~6x faster when transmitting
 812 |             # chunk data over the wire, in Pyro)
 813 |             wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]]
 814 |         else:
 815 |             wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
 816 |         if not wrapped_chunk[0]:
 817 |             break
 818 |         # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
 819 |         yield wrapped_chunk.pop()
 820 | 
 821 | grouper = chunkize_serial
 822 | 
 823 | 
 824 | class InputQueue(multiprocessing.Process):
 825 |     def __init__(self, q, corpus, chunksize, maxsize, as_numpy):
 826 |         super(InputQueue, self).__init__()
 827 |         self.q = q
 828 |         self.maxsize = maxsize
 829 |         self.corpus = corpus
 830 |         self.chunksize = chunksize
 831 |         self.as_numpy = as_numpy
 832 | 
 833 |     def run(self):
 834 |         it = iter(self.corpus)
 835 |         while True:
 836 |             chunk = itertools.islice(it, self.chunksize)
 837 |             if self.as_numpy:
 838 |                 # HACK XXX convert documents to numpy arrays, to save memory.
 839 |                 # This also gives a scipy warning at runtime:
 840 |                 # "UserWarning: indices array has non-integer dtype (float64)"
 841 |                 wrapped_chunk = [[np.asarray(doc) for doc in chunk]]
 842 |             else:
 843 |                 wrapped_chunk = [list(chunk)]
 844 | 
 845 |             if not wrapped_chunk[0]:
 846 |                 self.q.put(None, block=True)
 847 |                 break
 848 | 
 849 |             try:
 850 |                 qsize = self.q.qsize()
 851 |             except NotImplementedError:
 852 |                 qsize = '?'
 853 |             logger.debug("prepared another chunk of %i documents (qsize=%s)" %
 854 |                         (len(wrapped_chunk[0]), qsize))
 855 |             self.q.put(wrapped_chunk.pop(), block=True)
 856 | #endclass InputQueue
 857 | 
 858 | 
 859 | if os.name == 'nt':
 860 |     warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
 861 | 
 862 |     def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
 863 |         for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
 864 |             yield chunk
 865 | else:
 866 |     def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
 867 |         """
 868 |         Split a stream of values into smaller chunks.
 869 |         Each chunk is of length `chunksize`, except the last one which may be smaller.
 870 |         A once-only input stream (`corpus` from a generator) is ok, chunking is done
 871 |         efficiently via itertools.
 872 | 
 873 |         If `maxsize > 1`, don't wait idly in between successive chunk `yields`, but
 874 |         rather keep filling a short queue (of size at most `maxsize`) with forthcoming
 875 |         chunks in advance. This is realized by starting a separate process, and is
 876 |         meant to reduce I/O delays, which can be significant when `corpus` comes
 877 |         from a slow medium (like harddisk).
 878 | 
 879 |         If `maxsize==0`, don't fool around with parallelism and simply yield the chunksize
 880 |         via `chunkize_serial()` (no I/O optimizations).
 881 | 
 882 |         >>> for chunk in chunkize(range(10), 4): print(chunk)
 883 |         [0, 1, 2, 3]
 884 |         [4, 5, 6, 7]
 885 |         [8, 9]
 886 | 
 887 |         """
 888 |         assert chunksize > 0
 889 | 
 890 |         if maxsize > 0:
 891 |             q = multiprocessing.Queue(maxsize=maxsize)
 892 |             worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)
 893 |             worker.daemon = True
 894 |             worker.start()
 895 |             while True:
 896 |                 chunk = [q.get(block=True)]
 897 |                 if chunk[0] is None:
 898 |                     break
 899 |                 yield chunk.pop()
 900 |         else:
 901 |             for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
 902 |                 yield chunk
 903 | 
 904 | 
 905 | def smart_extension(fname, ext):
 906 |     fname, oext = os.path.splitext(fname)
 907 |     if oext.endswith('.bz2'):
 908 |         fname = fname + oext[:-4] + ext + '.bz2'
 909 |     elif oext.endswith('.gz'):
 910 |         fname = fname + oext[:-3] + ext + '.gz'
 911 |     else:
 912 |         fname = fname + oext + ext
 913 | 
 914 |     return fname
 915 | 
 916 | 
 917 | def pickle(obj, fname, protocol=2):
 918 |     """Pickle object `obj` to file `fname`.
 919 | 
 920 |     `protocol` defaults to 2 so pickled objects are compatible across
 921 |     Python 2.x and 3.x.
 922 | 
 923 |     """
 924 |     with smart_open(fname, 'wb') as fout:  # 'b' for binary, needed on Windows
 925 |         _pickle.dump(obj, fout, protocol=protocol)
 926 | 
 927 | 
 928 | def unpickle(fname):
 929 |     """Load pickled object from `fname`"""
 930 |     with smart_open(fname, 'rb') as f:
 931 |         # Because of loading from S3 load can't be used (missing readline in smart_open)
 932 |         if sys.version_info > (3, 0):
 933 |             return _pickle.load(f, encoding='latin1')
 934 |         else:
 935 |             return _pickle.loads(f.read())
 936 | 
 937 | 
 938 | def revdict(d):
 939 |     """
 940 |     Reverse a dictionary mapping.
 941 | 
 942 |     When two keys map to the same value, only one of them will be kept in the
 943 |     result (which one is kept is arbitrary).
 944 | 
 945 |     """
 946 |     return dict((v, k) for (k, v) in iteritems(dict(d)))
 947 | 
 948 | 
 949 | def toptexts(query, texts, index, n=10):
 950 |     """
 951 |     Debug fnc to help inspect the top `n` most similar documents (according to a
 952 |     similarity index `index`), to see if they are actually related to the query.
 953 | 
 954 |     `texts` is any object that can return something insightful for each document
 955 |     via `texts[docid]`, such as its fulltext or snippet.
 956 | 
 957 |     Return a list of 3-tuples (docid, doc's similarity to the query, texts[docid]).
 958 | 
 959 |     """
 960 |     sims = index[query]  # perform a similarity query against the corpus
 961 |     sims = sorted(enumerate(sims), key=lambda item: -item[1])
 962 | 
 963 |     result = []
 964 |     for topid, topcosine in sims[:n]:  # only consider top-n most similar docs
 965 |         result.append((topid, topcosine, texts[topid]))
 966 |     return result
 967 | 
 968 | 
 969 | def randfname(prefix='gensim'):
 970 |     randpart = hex(random.randint(0, 0xffffff))[2:]
 971 |     return os.path.join(tempfile.gettempdir(), prefix + randpart)
 972 | 
 973 | 
 974 | def upload_chunked(server, docs, chunksize=1000, preprocess=None):
 975 |     """
 976 |     Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).
 977 | 
 978 |     Use this function to train or index large collections -- avoid sending the
 979 |     entire corpus over the wire as a single Pyro in-memory object. The documents
 980 |     will be sent in smaller chunks, of `chunksize` documents each.
 981 | 
 982 |     """
 983 |     start = 0
 984 |     for chunk in grouper(docs, chunksize):
 985 |         end = start + len(chunk)
 986 |         logger.info("uploading documents %i-%i" % (start, end - 1))
 987 |         if preprocess is not None:
 988 |             pchunk = []
 989 |             for doc in chunk:
 990 |                 doc['tokens'] = preprocess(doc['text'])
 991 |                 del doc['text']
 992 |                 pchunk.append(doc)
 993 |             chunk = pchunk
 994 |         server.buffer(chunk)
 995 |         start = end
 996 | 
 997 | 
 998 | def getNS(host=None, port=None, broadcast=True, hmac_key=None):
 999 |     """
1000 |     Return a Pyro name server proxy.
1001 |     """
1002 |     import Pyro4
1003 |     try:
1004 |         return Pyro4.locateNS(host, port, broadcast, hmac_key)
1005 |     except Pyro4.errors.NamingError:
1006 |         raise RuntimeError("Pyro name server not found")
1007 | 
1008 | 
1009 | def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf={}):
1010 |     """
1011 |     Register object with name server (starting the name server if not running
1012 |     yet) and block until the daemon is terminated. The object is registered under
1013 |     `name`, or `name`+ some random suffix if `random_suffix` is set.
1014 | 
1015 |     """
1016 |     if random_suffix:
1017 |         name += '.' + hex(random.randint(0, 0xffffff))[2:]
1018 |     import Pyro4
1019 |     with getNS(**ns_conf) as ns:
1020 |         with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon:
1021 |             # register server for remote access
1022 |             uri = daemon.register(obj, name)
1023 |             ns.remove(name)
1024 |             ns.register(name, uri)
1025 |             logger.info("%s registered with nameserver (URI '%s')" % (name, uri))
1026 |             daemon.requestLoop()
1027 | 
1028 | 
1029 | def has_pattern():
1030 |     """
1031 |     Function which returns a flag indicating whether pattern is installed or not
1032 |     """
1033 |     try:
1034 |         from pattern.en import parse
1035 |         return True
1036 |     except ImportError:
1037 |         return False
1038 | 
1039 | 
1040 | def lemmatize(
1041 |         content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
1042 |         stopwords=frozenset(), min_length=2, max_length=15):
1043 |     """
1044 |     This function is only available when the optional 'pattern' package is installed.
1045 | 
1046 |     Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
1047 |     their base form=lemma, e.g. "are, is, being" -> "be" etc.
1048 |     This is a smarter version of stemming, taking word context into account.
1049 | 
1050 |     Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
1051 | 
1052 |     >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
1053 |     ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
1054 | 
1055 |     >>> lemmatize('The study ranks high.')
1056 |     ['study/NN', 'rank/VB', 'high/JJ']
1057 | 
1058 |     >>> lemmatize('The ranks study hard.')
1059 |     ['rank/NN', 'study/VB', 'hard/RB']
1060 | 
1061 |     """
1062 |     if not has_pattern():
1063 |         raise ImportError("Pattern library is not installed. Pattern library is needed in order to use lemmatize function")
1064 |     from pattern.en import parse
1065 | 
1066 |     if light:
1067 |         import warnings
1068 |         warnings.warn("The light flag is no longer supported by pattern.")
1069 | 
1070 |     # tokenization in `pattern` is weird; it gets thrown off by non-letters,
1071 |     # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
1072 |     # FIXME this throws away all fancy parsing cues, including sentence structure,
1073 |     # abbreviations etc.
1074 |     content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
1075 | 
1076 |     parsed = parse(content, lemmata=True, collapse=False)
1077 |     result = []
1078 |     for sentence in parsed:
1079 |         for token, tag, _, _, lemma in sentence:
1080 |             if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
1081 |                 if allowed_tags.match(tag):
1082 |                     lemma += "/" + tag[:2]
1083 |                     result.append(lemma.encode('utf8'))
1084 |     return result
1085 | 
1086 | 
1087 | def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
1088 |     """
1089 |     Create a random gensim sparse vector. Each coordinate is nonzero with
1090 |     probability `prob_nnz`, each non-zero coordinate value is drawn from
1091 |     a Poisson distribution with parameter lambda equal to `lam`.
1092 | 
1093 |     """
1094 |     nnz = np.random.uniform(size=(dim,))
1095 |     data = [(i, float(np.random.poisson(lam=lam) + 1.0))
1096 |             for i in xrange(dim) if nnz[i] < prob_nnz]
1097 |     return data
1098 | 
1099 | 
1100 | def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
1101 |     """
1102 |     Create a random gensim-style corpus, as a list of lists of (int, float) tuples,
1103 |     to be used as a mock corpus.
1104 | 
1105 |     """
1106 |     data = [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam)
1107 |             for _ in xrange(n_items)]
1108 |     return data
1109 | 
1110 | 
1111 | def prune_vocab(vocab, min_reduce, trim_rule=None):
1112 |     """
1113 |     Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`.
1114 | 
1115 |     Modifies `vocab` in place, returns the sum of all counts that were pruned.
1116 | 
1117 |     """
1118 |     result = 0
1119 |     old_len = len(vocab)
1120 |     for w in list(vocab):  # make a copy of dict's keys
1121 |         if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule):  # vocab[w] <= min_reduce:
1122 |             result += vocab[w]
1123 |             del vocab[w]
1124 |     logger.info("pruned out %i tokens with count <=%i (before %i, after %i)",
1125 |                 old_len - len(vocab), min_reduce, old_len, len(vocab))
1126 |     return result
1127 | 
1128 | 
1129 | def qsize(queue):
1130 |     """Return the (approximate) queue size where available; -1 where not (OS X)."""
1131 |     try:
1132 |         return queue.qsize()
1133 |     except NotImplementedError:
1134 |         # OS X doesn't support qsize
1135 |         return -1
1136 | 
1137 | RULE_DEFAULT = 0
1138 | RULE_DISCARD = 1
1139 | RULE_KEEP = 2
1140 | 
1141 | 
1142 | def keep_vocab_item(word, count, min_count, trim_rule=None):
1143 |     default_res = count >= min_count
1144 | 
1145 |     if trim_rule is None:
1146 |         return default_res
1147 |     else:
1148 |         rule_res = trim_rule(word, count, min_count)
1149 |         if rule_res == RULE_KEEP:
1150 |             return True
1151 |         elif rule_res == RULE_DISCARD:
1152 |             return False
1153 |         else:
1154 |             return default_res
1155 | 
1156 | 
1157 | def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
1158 |     """
1159 |     Run command with arguments and return its output as a byte string.
1160 |     Backported from Python 2.7 as it's implemented as pure python on stdlib.
1161 | 
1162 |     >>> check_output(args=['/usr/bin/python', '--version'])
1163 |     Python 2.6.2
1164 |     Added extra KeyboardInterrupt handling
1165 |     """
1166 |     try:
1167 |         logger.debug("COMMAND: %s %s", popenargs, kwargs)
1168 |         process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
1169 |         output, unused_err = process.communicate()
1170 |         retcode = process.poll()
1171 |         if retcode:
1172 |             cmd = kwargs.get("args")
1173 |             if cmd is None:
1174 |                 cmd = popenargs[0]
1175 |             error = subprocess.CalledProcessError(retcode, cmd)
1176 |             error.output = output
1177 |             raise error
1178 |         return output
1179 |     except KeyboardInterrupt:
1180 |         process.terminate()
1181 |         raise
1182 | 
1183 | 
1184 | def sample_dict(d, n=10, use_random=True):
1185 |     """
1186 |     Pick `n` items from dictionary `d` and return them as a list.
1187 |     The items are picked randomly if `use_random` is True, otherwise picked
1188 |     according to natural dict iteration.
1189 |     """
1190 |     selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n)
1191 |     return [(key, d[key]) for key in selected_keys]
1192 | 


--------------------------------------------------------------------------------
/word2vec/voidptr.h:
--------------------------------------------------------------------------------
 1 | #include <Python.h>
 2 | 
 3 | #if PY_VERSION_HEX >= 0x03020000
 4 | 
 5 | /*
 6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore
 7 | */
 8 | static void * PyCObject_AsVoidPtr(PyObject *obj)
 9 | {
10 |     void *ret = PyCapsule_GetPointer(obj, NULL);
11 |     if (ret == NULL) {
12 |         PyErr_Clear();
13 |     }
14 |     return ret;
15 | }
16 | 
17 | #endif


--------------------------------------------------------------------------------
/word2vec/word2vec_inner.pxd:
--------------------------------------------------------------------------------
 1 | #
 2 | # shared type definitions for word2vec_inner
 3 | # used by both word2vec_inner.pyx (automatically) and doc2vec_inner.pyx (by explicit cimport)
 4 | #
 5 | # Copyright (C) 2013 Radim Rehurek <me@radimrehurek.com>
 6 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.htmlcimport numpy as np
 7 | 
 8 | cdef extern from "voidptr.h":
 9 |     void* PyCObject_AsVoidPtr(object obj)
10 | 
11 | cimport numpy as np
12 | ctypedef np.float32_t REAL_t
13 | 
14 | # BLAS routine signatures
15 | ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
16 | ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
17 | ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
18 | ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
19 | ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil
20 | ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
21 | 
22 | cdef scopy_ptr scopy
23 | cdef saxpy_ptr saxpy
24 | cdef sdot_ptr sdot
25 | cdef dsdot_ptr dsdot
26 | cdef snrm2_ptr snrm2
27 | cdef sscal_ptr sscal
28 | 
29 | # precalculated sigmoid table
30 | DEF EXP_TABLE_SIZE = 1000
31 | DEF MAX_EXP = 6
32 | cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE
33 | 
34 | # function implementations swapped based on BLAS detected in word2vec_inner.pyx init()
35 | ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
36 | ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
37 | 
38 | cdef our_dot_ptr our_dot
39 | cdef our_saxpy_ptr our_saxpy
40 | 
41 | # for when fblas.sdot returns a double
42 | cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
43 | 
44 | # for when fblas.sdot returns a float
45 | cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
46 | 
47 | # for when no blas available
48 | cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
49 | cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
50 | 
51 | # to support random draws from negative-sampling cum_table
52 | cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil
53 | 
54 | cdef unsigned long long random_int32(unsigned long long *next_random) nogil
55 | 


--------------------------------------------------------------------------------
/word2vec/word2vec_inner.pyx:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env cython
  2 | # cython: boundscheck=False
  3 | # cython: wraparound=False
  4 | # cython: cdivision=True
  5 | # coding: utf-8
  6 | #
  7 | # Copyright (C) 2013 Radim Rehurek <me@radimrehurek.com>
  8 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
  9 | 
 10 | import cython
 11 | import numpy as np
 12 | cimport numpy as np
 13 | 
 14 | from libc.math cimport exp
 15 | from libc.math cimport log
 16 | from libc.string cimport memset
 17 | 
 18 | # scipy <= 0.15
 19 | try:
 20 |     from scipy.linalg.blas import fblas
 21 | except ImportError:
 22 |     # in scipy > 0.15, fblas function has been removed
 23 |     import scipy.linalg.blas as fblas
 24 | 
 25 | REAL = np.float32
 26 | 
 27 | DEF MAX_SENTENCE_LEN = 10000
 28 | 
 29 | cdef scopy_ptr scopy=<scopy_ptr>PyCObject_AsVoidPtr(fblas.scopy._cpointer)  # y = x
 30 | cdef saxpy_ptr saxpy=<saxpy_ptr>PyCObject_AsVoidPtr(fblas.saxpy._cpointer)  # y += alpha * x
 31 | cdef sdot_ptr sdot=<sdot_ptr>PyCObject_AsVoidPtr(fblas.sdot._cpointer)  # float = dot(x, y)
 32 | cdef dsdot_ptr dsdot=<dsdot_ptr>PyCObject_AsVoidPtr(fblas.sdot._cpointer)  # double = dot(x, y)
 33 | cdef snrm2_ptr snrm2=<snrm2_ptr>PyCObject_AsVoidPtr(fblas.snrm2._cpointer)  # sqrt(x^2)
 34 | cdef sscal_ptr sscal=<sscal_ptr>PyCObject_AsVoidPtr(fblas.sscal._cpointer) # x = alpha * x
 35 | 
 36 | DEF EXP_TABLE_SIZE = 1000
 37 | DEF MAX_EXP = 6
 38 | 
 39 | cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE
 40 | cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE
 41 | 
 42 | cdef int ONE = 1
 43 | cdef REAL_t ONEF = <REAL_t>1.0
 44 | 
 45 | # for when fblas.sdot returns a double
 46 | cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil:
 47 |     return <REAL_t>dsdot(N, X, incX, Y, incY)
 48 | 
 49 | # for when fblas.sdot returns a float
 50 | cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil:
 51 |     return <REAL_t>sdot(N, X, incX, Y, incY)
 52 | 
 53 | # for when no blas available
 54 | cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil:
 55 |     # not a true full dot()-implementation: just enough for our cases
 56 |     cdef int i
 57 |     cdef REAL_t a
 58 |     a = <REAL_t>0.0
 59 |     for i from 0 <= i < N[0] by 1:
 60 |         a += X[i] * Y[i]
 61 |     return a
 62 | 
 63 | # for when no blas available
 64 | cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil:
 65 |     cdef int i
 66 |     for i from 0 <= i < N[0] by 1:
 67 |         Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])]
 68 | 
 69 | 
 70 | cdef void fast_sentence_sg_hs(
 71 |     const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
 72 |     REAL_t *syn0, REAL_t *syn1, const int size,
 73 |     const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, REAL_t *word_locks) nogil:
 74 | 
 75 |     cdef long long a, b
 76 |     cdef long long row1 = word2_index * size, row2
 77 |     cdef REAL_t f, g
 78 | 
 79 |     memset(work, 0, size * cython.sizeof(REAL_t))
 80 |     for b in range(codelen):
 81 |         row2 = word_point[b] * size
 82 |         f = our_dot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
 83 |         if f <= -MAX_EXP or f >= MAX_EXP:
 84 |             continue
 85 |         f = EXP_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
 86 |         g = (1 - word_code[b] - f) * alpha
 87 |         our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
 88 |         our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
 89 |     our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE)
 90 | 
 91 | 
 92 | # to support random draws from negative-sampling cum_table
 93 | cdef inline unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil:
 94 |     cdef unsigned long long mid
 95 |     while hi > lo:
 96 |         mid = (lo + hi) >> 1
 97 |         if a[mid] >= x:
 98 |             hi = mid
 99 |         else:
100 |             lo = mid + 1
101 |     return lo
102 | 
103 | # this quick & dirty RNG apparently matches Java's (non-Secure)Random
104 | # note this function side-effects next_random to set up the next number
105 | cdef inline unsigned long long random_int32(unsigned long long *next_random) nogil:
106 |     cdef unsigned long long this_random = next_random[0] >> 16
107 |     next_random[0] = (next_random[0] * <unsigned long long>25214903917ULL + 11) & 281474976710655ULL
108 |     return this_random
109 | 
110 | cdef unsigned long long fast_sentence_sg_neg(
111 |     const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len,
112 |     REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
113 |     const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work,
114 |     unsigned long long next_random, REAL_t *word_locks) nogil:
115 | 
116 |     cdef long long a
117 |     cdef long long row1 = word2_index * size, row2
118 |     cdef unsigned long long modulo = 281474976710655ULL
119 |     cdef REAL_t f, g, label
120 |     cdef np.uint32_t target_index
121 |     cdef int d
122 | 
123 |     memset(work, 0, size * cython.sizeof(REAL_t))
124 | 
125 |     for d in range(negative+1):
126 |         if d == 0:
127 |             target_index = word_index
128 |             label = ONEF
129 |         else:
130 |             target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len)
131 |             next_random = (next_random * <unsigned long long>25214903917ULL + 11) & modulo
132 |             if target_index == word_index:
133 |                 continue
134 |             label = <REAL_t>0.0
135 | 
136 |         row2 = target_index * size
137 |         f = our_dot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
138 |         if f <= -MAX_EXP or f >= MAX_EXP:
139 |             continue
140 |         f = EXP_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
141 |         g = (label - f) * alpha
142 |         our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
143 |         our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
144 | 
145 |     our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE)
146 | 
147 |     return next_random
148 | 
149 | cdef unsigned long long fast_sentence_sg_neg_dynamic(
150 |     const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len,
151 |     REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
152 |     const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work,
153 |     unsigned long long next_random, REAL_t *word_locks, int eliminate) nogil:
154 | 
155 |     cdef long long a
156 |     cdef long long row1 = word2_index * size, row2
157 |     cdef unsigned long long modulo = 281474976710655ULL
158 |     cdef REAL_t f, g, label
159 |     cdef np.uint32_t target_index
160 |     cdef int d
161 |     cdef REAL_t neg=-1
162 | 
163 |     memset(work, 0, size * cython.sizeof(REAL_t))
164 | 
165 |     for d in range(negative+1):
166 |         if d == 0:
167 |             target_index = word_index
168 |             label = ONEF
169 |         else:
170 |             target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len)
171 |             next_random = (next_random * <unsigned long long>25214903917ULL + 11) & modulo
172 |             if target_index == word_index:
173 |                 continue
174 |             label = <REAL_t>0.0
175 | 
176 |         row2 = target_index * size
177 |         f = our_dot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
178 |         if f <= -MAX_EXP or f >= MAX_EXP:
179 |             continue
180 |         f = EXP_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
181 |         g = (label - f) * alpha
182 |         our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
183 |         our_saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
184 | 
185 |     if eliminate == 0:
186 |         our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE)
187 |     else:
188 |         sscal(&size, &neg, work, &ONE)
189 |         our_saxpy(&size, &word_locks[word2_index], work, &ONE, &syn0[row1], &ONE)
190 | 
191 |     return next_random
192 | 
193 | 
194 | cdef void fast_sentence_cbow_hs(
195 |     const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
196 |     REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
197 |     const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work,
198 |     int i, int j, int k, int cbow_mean, REAL_t *word_locks) nogil:
199 | 
200 |     cdef long long a, b
201 |     cdef long long row2
202 |     cdef REAL_t f, g, count, inv_count = 1.0
203 |     cdef int m
204 | 
205 |     memset(neu1, 0, size * cython.sizeof(REAL_t))
206 |     count = <REAL_t>0.0
207 |     for m in range(j, k):
208 |         if m == i:
209 |             continue
210 |         else:
211 |             count += ONEF
212 |             our_saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
213 |     if count > (<REAL_t>0.5):
214 |         inv_count = ONEF/count
215 |     if cbow_mean:
216 |         sscal(&size, &inv_count, neu1, &ONE)  # (does this need BLAS-variants like saxpy?)
217 | 
218 |     memset(work, 0, size * cython.sizeof(REAL_t))
219 |     for b in range(codelens[i]):
220 |         row2 = word_point[b] * size
221 |         f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
222 |         if f <= -MAX_EXP or f >= MAX_EXP:
223 |             continue
224 |         f = EXP_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
225 |         g = (1 - word_code[b] - f) * alpha
226 |         our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
227 |         our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
228 | 
229 |     if not cbow_mean:  # divide error over summed window vectors
230 |         sscal(&size, &inv_count, work, &ONE)  # (does this need BLAS-variants like saxpy?)
231 | 
232 |     for m in range(j, k):
233 |         if m == i:
234 |             continue
235 |         else:
236 |             our_saxpy(&size, &word_locks[indexes[m]], work, &ONE, &syn0[indexes[m] * size], &ONE)
237 | 
238 | 
239 | cdef unsigned long long fast_sentence_cbow_neg(
240 |     const int negative, np.uint32_t *cum_table, unsigned long long cum_table_len, int codelens[MAX_SENTENCE_LEN],
241 |     REAL_t *neu1,  REAL_t *syn0, REAL_t *syn1neg, const int size,
242 |     const np.uint32_t indexes[MAX_SENTENCE_LEN], const REAL_t alpha, REAL_t *work,
243 |     int i, int j, int k, int cbow_mean, unsigned long long next_random, REAL_t *word_locks) nogil:
244 | 
245 |     cdef long long a
246 |     cdef long long row2
247 |     cdef unsigned long long modulo = 281474976710655ULL
248 |     cdef REAL_t f, g, count, inv_count = 1.0, label
249 |     cdef np.uint32_t target_index, word_index
250 |     cdef int d, m
251 | 
252 |     word_index = indexes[i]
253 | 
254 |     memset(neu1, 0, size * cython.sizeof(REAL_t))
255 |     count = <REAL_t>0.0
256 |     for m in range(j, k):
257 |         if m == i:
258 |             continue
259 |         else:
260 |             count += ONEF
261 |             our_saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
262 |     if count > (<REAL_t>0.5):
263 |         inv_count = ONEF/count
264 |     if cbow_mean:
265 |         sscal(&size, &inv_count, neu1, &ONE)  # (does this need BLAS-variants like saxpy?)
266 | 
267 |     memset(work, 0, size * cython.sizeof(REAL_t))
268 | 
269 |     for d in range(negative+1):
270 |         if d == 0:
271 |             target_index = word_index
272 |             label = ONEF
273 |         else:
274 |             target_index = bisect_left(cum_table, (next_random >> 16) % cum_table[cum_table_len-1], 0, cum_table_len)
275 |             next_random = (next_random * <unsigned long long>25214903917ULL + 11) & modulo
276 |             if target_index == word_index:
277 |                 continue
278 |             label = <REAL_t>0.0
279 | 
280 |         row2 = target_index * size
281 |         f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
282 |         if f <= -MAX_EXP or f >= MAX_EXP:
283 |             continue
284 |         f = EXP_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
285 |         g = (label - f) * alpha
286 |         our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
287 |         our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
288 | 
289 |     if not cbow_mean:  # divide error over summed window vectors
290 |         sscal(&size, &inv_count, work, &ONE)  # (does this need BLAS-variants like saxpy?)
291 | 
292 |     for m in range(j,k):
293 |         if m == i:
294 |             continue
295 |         else:
296 |             our_saxpy(&size, &word_locks[indexes[m]], work, &ONE, &syn0[indexes[m]*size], &ONE)
297 | 
298 |     return next_random
299 | 
300 | 
301 | def train_batch_sg(model, sentences, alpha, _work):
302 |     cdef int hs = model.hs
303 |     cdef int negative = model.negative
304 |     cdef int sample = (model.sample != 0)
305 | 
306 |     cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
307 |     cdef REAL_t *word_locks = <REAL_t *>(np.PyArray_DATA(model.syn0_lockf))
308 |     cdef REAL_t *work
309 |     cdef REAL_t _alpha = alpha
310 |     cdef int size = model.layer1_size
311 | 
312 |     cdef int codelens[MAX_SENTENCE_LEN]
313 |     cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
314 |     cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN]
315 |     cdef int sentence_idx[MAX_SENTENCE_LEN + 1]
316 |     cdef int sentence_pair_idx[MAX_SENTENCE_LEN + 1]
317 |     cdef int window = model.window
318 | 
319 |     cdef int i, j, k
320 |     cdef int effective_words = 0, effective_sentences = 0
321 |     cdef int sent_idx, idx_start, idx_end
322 | 
323 |     # For hierarchical softmax
324 |     cdef REAL_t *syn1
325 |     cdef np.uint32_t *points[MAX_SENTENCE_LEN]
326 |     cdef np.uint8_t *codes[MAX_SENTENCE_LEN]
327 | 
328 |     # For negative sampling
329 |     cdef REAL_t *syn1neg
330 |     cdef np.uint32_t *cum_table
331 |     cdef unsigned long long cum_table_len
332 |     # for sampling (negative and frequent-word downsampling)
333 |     cdef unsigned long long next_random
334 | 
335 |     cdef int w1
336 |     cdef int w2
337 |     cdef int w_next
338 |     cdef int pair_1
339 |     cdef int pair_2
340 |     cdef int wi
341 |     cdef int w_idx
342 |     cdef int eliminate
343 |     cdef int dynamic
344 |     if model.dynamic:
345 |         dynamic = 1
346 |     else:
347 |         dynamic = 0
348 | 
349 |     if hs:
350 |         syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))
351 | 
352 |     if negative:
353 |         syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
354 |         cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
355 |         cum_table_len = len(model.cum_table)
356 |     if negative or sample:
357 |         next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
358 | 
359 |     # convert Python structures to primitive types, so we can release the GIL
360 |     work = <REAL_t *>np.PyArray_DATA(_work)
361 | 
362 |     # prepare C structures so we can go "full C" and release the Python GIL
363 |     vlookup = model.wv.vocab
364 |     sentence_idx[0] = 0  # indices of the first sentence always start at 0
365 |     for s in sentences:
366 |         sent = model.sentences[s]
367 |         if not sent:
368 |             continue  # ignore empty sentences; leave effective_sentences unchanged
369 | 
370 |         for token in sent:
371 |             word = vlookup[token] if token in vlookup else None
372 |             if word is None:
373 |                 continue  # leaving `effective_words` unchanged = shortening the sentence = expanding the window
374 |             if sample and word.sample_int < random_int32(&next_random):
375 |                 continue
376 |             indexes[effective_words] = word.index
377 |             if hs:
378 |                 codelens[effective_words] = <int>len(word.code)
379 |                 codes[effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
380 |                 points[effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
381 |             effective_words += 1
382 |             if effective_words == MAX_SENTENCE_LEN:
383 |                 break  # TODO: log warning, tally overflow?
384 | 
385 |         # keep track of which words go into which sentence, so we don't train
386 |         # across sentence boundaries.
387 |         # indices of sentence number X are between <sentence_idx[X], sentence_idx[X])
388 |         effective_sentences += 1
389 |         sentence_idx[effective_sentences] = effective_words
390 |         if dynamic:
391 |             sentence_pair_idx[effective_sentences] = model.sent_edge_dict[s]
392 | 
393 | 
394 |         if effective_words == MAX_SENTENCE_LEN:
395 |             break  # TODO: log warning, tally overflow?
396 | 
397 |     # precompute "reduced window" offsets in a single randint() call
398 |     for i, item in enumerate(model.random.randint(0, window, effective_words)):
399 |         reduced_windows[i] = item
400 | 
401 |     # release GIL & train on all sentences
402 |     with nogil:
403 |         for sent_idx in range(effective_sentences):
404 |             if dynamic:
405 |                 pair_1 = indexes[sentence_pair_idx[sent_idx]]
406 |                 pair_2 = indexes[sentence_pair_idx[sent_idx + 1]]
407 |             idx_start = sentence_idx[sent_idx]
408 |             idx_end = sentence_idx[sent_idx + 1]
409 |             for i in range(idx_start, idx_end):
410 |                 j = i - window + reduced_windows[i]
411 |                 if j < idx_start:
412 |                     j = idx_start
413 |                 k = i + window + 1 - reduced_windows[i]
414 |                 if k > idx_end:
415 |                     k = idx_end
416 |                 if dynamic:
417 |                     for j in range(j, k):
418 |                         eliminate = 0
419 |                         if j < i:
420 |                             for wi in range(j, i):
421 |                                 w_next = wi + 1
422 |                                 w1 = indexes[wi]
423 |                                 w2 = indexes[w_next]
424 |                                 if w1 == pair_1 and w2 == pair_2:
425 |                                     eliminate = 1
426 |                                     break
427 |                         elif j > i:
428 |                             for wi in range(i, j):
429 |                                 w_next = wi + 1
430 |                                 w1 = indexes[wi]
431 |                                 w2 = indexes[w_next]
432 |                                 if w1 == pair_1 and w2 == pair_2:
433 |                                     eliminate = 1
434 |                                     break
435 |                         elif j == i:
436 |                             continue
437 |                         if hs:
438 |                             fast_sentence_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work, word_locks)
439 |                         if negative:
440 |                             next_random = fast_sentence_sg_neg_dynamic(negative, cum_table, cum_table_len, syn0, syn1neg, size, indexes[i], indexes[j], _alpha, work, next_random, word_locks, eliminate)
441 |                 else:
442 |                     for j in range(j, k):
443 |                         if j == i:
444 |                             continue
445 |                         if hs:
446 |                             fast_sentence_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work, word_locks)
447 |                         if negative:
448 |                             next_random = fast_sentence_sg_neg(negative, cum_table, cum_table_len, syn0, syn1neg, size, indexes[i], indexes[j], _alpha, work, next_random, word_locks)
449 | 
450 |     return effective_words
451 | 
452 | 
453 | def train_batch_cbow(model, sentences, alpha, _work, _neu1):
454 |     cdef int hs = model.hs
455 |     cdef int negative = model.negative
456 |     cdef int sample = (model.sample != 0)
457 |     cdef int cbow_mean = model.cbow_mean
458 | 
459 |     cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
460 |     cdef REAL_t *word_locks = <REAL_t *>(np.PyArray_DATA(model.syn0_lockf))
461 |     cdef REAL_t *work
462 |     cdef REAL_t _alpha = alpha
463 |     cdef int size = model.layer1_size
464 | 
465 |     cdef int codelens[MAX_SENTENCE_LEN]
466 |     cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
467 |     cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN]
468 |     cdef int sentence_idx[MAX_SENTENCE_LEN + 1]
469 |     cdef int window = model.window
470 | 
471 |     cdef int i, j, k
472 |     cdef int effective_words = 0, effective_sentences = 0
473 |     cdef int sent_idx, idx_start, idx_end
474 | 
475 |     # For hierarchical softmax
476 |     cdef REAL_t *syn1
477 |     cdef np.uint32_t *points[MAX_SENTENCE_LEN]
478 |     cdef np.uint8_t *codes[MAX_SENTENCE_LEN]
479 | 
480 |     # For negative sampling
481 |     cdef REAL_t *syn1neg
482 |     cdef np.uint32_t *cum_table
483 |     cdef unsigned long long cum_table_len
484 |     # for sampling (negative and frequent-word downsampling)
485 |     cdef unsigned long long next_random
486 | 
487 |     if hs:
488 |         syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))
489 | 
490 |     if negative:
491 |         syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
492 |         cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
493 |         cum_table_len = len(model.cum_table)
494 |     if negative or sample:
495 |         next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
496 | 
497 |     # convert Python structures to primitive types, so we can release the GIL
498 |     work = <REAL_t *>np.PyArray_DATA(_work)
499 |     neu1 = <REAL_t *>np.PyArray_DATA(_neu1)
500 | 
501 |     # prepare C structures so we can go "full C" and release the Python GIL
502 |     vlookup = model.wv.vocab
503 |     sentence_idx[0] = 0  # indices of the first sentence always start at 0
504 |     for sent in sentences:
505 |         if not sent:
506 |             continue  # ignore empty sentences; leave effective_sentences unchanged
507 |         for token in sent:
508 |             word = vlookup[token] if token in vlookup else None
509 |             if word is None:
510 |                 continue  # leaving `effective_words` unchanged = shortening the sentence = expanding the window
511 |             if sample and word.sample_int < random_int32(&next_random):
512 |                 continue
513 |             indexes[effective_words] = word.index
514 |             if hs:
515 |                 codelens[effective_words] = <int>len(word.code)
516 |                 codes[effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
517 |                 points[effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
518 |             effective_words += 1
519 |             if effective_words == MAX_SENTENCE_LEN:
520 |                 break  # TODO: log warning, tally overflow?
521 | 
522 |         # keep track of which words go into which sentence, so we don't train
523 |         # across sentence boundaries.
524 |         # indices of sentence number X are between <sentence_idx[X], sentence_idx[X])
525 |         effective_sentences += 1
526 |         sentence_idx[effective_sentences] = effective_words
527 | 
528 |         if effective_words == MAX_SENTENCE_LEN:
529 |             break  # TODO: log warning, tally overflow?
530 | 
531 |     # precompute "reduced window" offsets in a single randint() call
532 |     for i, item in enumerate(model.random.randint(0, window, effective_words)):
533 |         reduced_windows[i] = item
534 | 
535 |     # release GIL & train on all sentences
536 |     with nogil:
537 |         for sent_idx in range(effective_sentences):
538 |             idx_start = sentence_idx[sent_idx]
539 |             idx_end = sentence_idx[sent_idx + 1]
540 |             for i in range(idx_start, idx_end):
541 |                 j = i - window + reduced_windows[i]
542 |                 if j < idx_start:
543 |                     j = idx_start
544 |                 k = i + window + 1 - reduced_windows[i]
545 |                 if k > idx_end:
546 |                     k = idx_end
547 |                 if hs:
548 |                     fast_sentence_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, _alpha, work, i, j, k, cbow_mean, word_locks)
549 |                 if negative:
550 |                     next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks)
551 | 
552 |     return effective_words
553 | 
554 | 
555 | # Score is only implemented for hierarchical softmax
556 | def score_sentence_sg(model, sentence, _work):
557 | 
558 |     cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
559 |     cdef REAL_t *work
560 |     cdef int size = model.layer1_size
561 | 
562 |     cdef int codelens[MAX_SENTENCE_LEN]
563 |     cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
564 |     cdef int sentence_len
565 |     cdef int window = model.window
566 | 
567 |     cdef int i, j, k
568 |     cdef long result = 0
569 | 
570 |     cdef REAL_t *syn1
571 |     cdef np.uint32_t *points[MAX_SENTENCE_LEN]
572 |     cdef np.uint8_t *codes[MAX_SENTENCE_LEN]
573 | 
574 |     syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))
575 | 
576 |     # convert Python structures to primitive types, so we can release the GIL
577 |     work = <REAL_t *>np.PyArray_DATA(_work)
578 | 
579 |     vlookup = model.wv.vocab
580 |     i = 0
581 |     for token in sentence:
582 |         word = vlookup[token] if token in vlookup else None
583 |         if word is None:
584 |             continue  # should drop the
585 |         indexes[i] = word.index
586 |         codelens[i] = <int>len(word.code)
587 |         codes[i] = <np.uint8_t *>np.PyArray_DATA(word.code)
588 |         points[i] = <np.uint32_t *>np.PyArray_DATA(word.point)
589 |         result += 1
590 |         i += 1
591 |         if i == MAX_SENTENCE_LEN:
592 |             break  # TODO: log warning, tally overflow?
593 |     sentence_len = i
594 | 
595 |     # release GIL & train on the sentence
596 |     work[0] = 0.0
597 | 
598 |     with nogil:
599 |         for i in range(sentence_len):
600 |             if codelens[i] == 0:
601 |                 continue
602 |             j = i - window
603 |             if j < 0:
604 |                 j = 0
605 |             k = i + window + 1
606 |             if k > sentence_len:
607 |                 k = sentence_len
608 |             for j in range(j, k):
609 |                 if j == i or codelens[j] == 0:
610 |                     continue
611 |                 score_pair_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], work)
612 | 
613 |     return work[0]
614 | 
615 | cdef void score_pair_sg_hs(
616 |     const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
617 |     REAL_t *syn0, REAL_t *syn1, const int size,
618 |     const np.uint32_t word2_index, REAL_t *work) nogil:
619 | 
620 |     cdef long long b
621 |     cdef long long row1 = word2_index * size, row2, sgn
622 |     cdef REAL_t f
623 | 
624 |     for b in range(codelen):
625 |         row2 = word_point[b] * size
626 |         f = our_dot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
627 |         sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1
628 |         f = sgn*f
629 |         if f <= -MAX_EXP or f >= MAX_EXP:
630 |             continue
631 |         f = LOG_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
632 |         work[0] += f
633 | 
634 | def score_sentence_cbow(model, sentence, _work, _neu1):
635 | 
636 |     cdef int cbow_mean = model.cbow_mean
637 | 
638 |     cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
639 |     cdef REAL_t *work
640 |     cdef REAL_t *neu1
641 |     cdef int size = model.layer1_size
642 | 
643 |     cdef int codelens[MAX_SENTENCE_LEN]
644 |     cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
645 |     cdef int sentence_len
646 |     cdef int window = model.window
647 | 
648 |     cdef int i, j, k
649 |     cdef long result = 0
650 | 
651 |     # For hierarchical softmax
652 |     cdef REAL_t *syn1
653 |     cdef np.uint32_t *points[MAX_SENTENCE_LEN]
654 |     cdef np.uint8_t *codes[MAX_SENTENCE_LEN]
655 | 
656 |     syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))
657 | 
658 |     # convert Python structures to primitive types, so we can release the GIL
659 |     work = <REAL_t *>np.PyArray_DATA(_work)
660 |     neu1 = <REAL_t *>np.PyArray_DATA(_neu1)
661 | 
662 |     vlookup = model.wv.vocab
663 |     i = 0
664 |     for token in sentence:
665 |         word = vlookup[token] if token in vlookup else None
666 |         if word is None:
667 |             continue  # for score, should this be a default negative value?
668 |         indexes[i] = word.index
669 |         codelens[i] = <int>len(word.code)
670 |         codes[i] = <np.uint8_t *>np.PyArray_DATA(word.code)
671 |         points[i] = <np.uint32_t *>np.PyArray_DATA(word.point)
672 |         result += 1
673 |         i += 1
674 |         if i == MAX_SENTENCE_LEN:
675 |             break  # TODO: log warning, tally overflow?
676 |     sentence_len = i
677 | 
678 |     # release GIL & train on the sentence
679 |     work[0] = 0.0
680 |     with nogil:
681 |         for i in range(sentence_len):
682 |             if codelens[i] == 0:
683 |                 continue
684 |             j = i - window
685 |             if j < 0:
686 |                 j = 0
687 |             k = i + window + 1
688 |             if k > sentence_len:
689 |                 k = sentence_len
690 |             score_pair_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, work, i, j, k, cbow_mean)
691 | 
692 |     return work[0]
693 | 
694 | cdef void score_pair_cbow_hs(
695 |     const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
696 |     REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
697 |     const np.uint32_t indexes[MAX_SENTENCE_LEN], REAL_t *work,
698 |     int i, int j, int k, int cbow_mean) nogil:
699 | 
700 |     cdef long long a, b
701 |     cdef long long row2
702 |     cdef REAL_t f, g, count, inv_count, sgn
703 |     cdef int m
704 | 
705 |     memset(neu1, 0, size * cython.sizeof(REAL_t))
706 |     count = <REAL_t>0.0
707 |     for m in range(j, k):
708 |         if m == i or codelens[m] == 0:
709 |             continue
710 |         else:
711 |             count += ONEF
712 |             our_saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
713 |     if count > (<REAL_t>0.5):
714 |         inv_count = ONEF/count
715 |     if cbow_mean:
716 |         sscal(&size, &inv_count, neu1, &ONE)
717 | 
718 |     for b in range(codelens[i]):
719 |         row2 = word_point[b] * size
720 |         f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
721 |         sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1
722 |         f = sgn*f
723 |         if f <= -MAX_EXP or f >= MAX_EXP:
724 |             continue
725 |         f = LOG_TABLE[<int>((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
726 |         work[0] += f
727 | 
728 | 
729 | def init():
730 |     """
731 |     Precompute function `sigmoid(x) = 1 / (1 + exp(-x))`, for x values discretized
732 |     into table EXP_TABLE.  Also calculate log(sigmoid(x)) into LOG_TABLE.
733 | 
734 |     """
735 |     global our_dot
736 |     global our_saxpy
737 | 
738 |     cdef int i
739 |     cdef float *x = [<float>10.0]
740 |     cdef float *y = [<float>0.01]
741 |     cdef float expected = <float>0.1
742 |     cdef int size = 1
743 |     cdef double d_res
744 |     cdef float *p_res
745 | 
746 |     # build the sigmoid table
747 |     for i in range(EXP_TABLE_SIZE):
748 |         EXP_TABLE[i] = <REAL_t>exp((i / <REAL_t>EXP_TABLE_SIZE * 2 - 1) * MAX_EXP)
749 |         EXP_TABLE[i] = <REAL_t>(EXP_TABLE[i] / (EXP_TABLE[i] + 1))
750 |         LOG_TABLE[i] = <REAL_t>log( EXP_TABLE[i] )
751 | 
752 |     # check whether sdot returns double or float
753 |     d_res = dsdot(&size, x, &ONE, y, &ONE)
754 |     p_res = <float *>&d_res
755 |     if (abs(d_res - expected) < 0.0001):
756 |         our_dot = our_dot_double
757 |         our_saxpy = saxpy
758 |         return 0  # double
759 |     elif (abs(p_res[0] - expected) < 0.0001):
760 |         our_dot = our_dot_float
761 |         our_saxpy = saxpy
762 |         return 1  # float
763 |     else:
764 |         # neither => use cython loops, no BLAS
765 |         # actually, the BLAS is so messed up we'll probably have segfaulted above and never even reach here
766 |         our_dot = our_dot_noblas
767 |         our_saxpy = our_saxpy_noblas
768 |         return 2
769 | 
770 | FAST_VERSION = init()  # initialize the module
771 | MAX_WORDS_IN_BATCH = MAX_SENTENCE_LEN
772 | 


--------------------------------------------------------------------------------