├── README.md
├── kge
│   ├── data
│   │   └── freebase15k
│   │       ├── freebase_15k-id2entity.pkl
│   │       ├── freebase_15k-id2relation.pkl
│   │       ├── freebase_15k-test.pkl
│   │       ├── freebase_15k-train.pkl
│   │       └── freebase_15k-valid.pkl
│   └── hole.py
├── pom.xml
├── python
│   └── sansa
│       ├── __init__.py
│       └── ml
│           ├── __init__.py
│           └── kbc
│               ├── __init__.py
│               ├── keras
│               │   ├── __init__.py
│               │   ├── actfun.py
│               │   ├── base.py
│               │   ├── hole.py
│               │   ├── holek.py
│               │   ├── param.py
│               │   ├── sample.py
│               │   └── util.py
│               └── rdfio.py
└── src
    └── main
        └── scala
            └── net
                └── sansa_stack
                    └── ml
                        └── kge
                            ├── Functions.scala
                            ├── Main.scala
                            ├── RDFDatasetReader.scala
                            └── model
                                └── TransE.scala

/README.md:
--------------------------------------------------------------------------------
1 | # Spark-Tensors
2 | Temporary repository for implementing tensor factorization algorithms on Apache Spark
3 | 
4 | Currently I am working on the following 3 algorithms:
5 | 
6 | 1. PARAFAC (parallel algorithms given in [GigaTensor](https://www.cs.cmu.edu/~epapalex/papers/gigatensor_KDD2012.pdf) and [U. Kang's PhD thesis](http://datalab.snu.ac.kr/~ukang/papers/KangThesis.pdf)). Also check the [HaTen2 paper](https://www.cs.cmu.edu/~epapalex/papers/haten2_icde2015.pdf), which apparently improves upon GigaTensor.
7 | 2. RESCAL ([RESCAL paper](http://www.icml-2011.org/papers/438_icmlpaper.pdf), [M. Nickel's PhD thesis](http://edoc.ub.uni-muenchen.de/16056/1/Nickel_Maximilian.pdf)); a Spark-based distributed algorithm will be designed for this.
8 | 3. HolE ([Holographic Embeddings of Knowledge Graphs](http://arxiv.org/pdf/1510.04935v2)); a Spark-based distributed algorithm will be designed for this.
9 | 
10 | This will eventually be divided across the Spark-RDF (interface, I/O, storage) and Spark-Sem-ML (algorithm) repositories.
11 | 
--------------------------------------------------------------------------------
/kge/data/freebase15k/freebase_15k-id2entity.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SANSA-Stack/Spark-Tensors/9d834e75b917c6c476f426ebab47eec0830f190b/kge/data/freebase15k/freebase_15k-id2entity.pkl
--------------------------------------------------------------------------------
/kge/data/freebase15k/freebase_15k-id2relation.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SANSA-Stack/Spark-Tensors/9d834e75b917c6c476f426ebab47eec0830f190b/kge/data/freebase15k/freebase_15k-id2relation.pkl
--------------------------------------------------------------------------------
/kge/data/freebase15k/freebase_15k-test.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SANSA-Stack/Spark-Tensors/9d834e75b917c6c476f426ebab47eec0830f190b/kge/data/freebase15k/freebase_15k-test.pkl
--------------------------------------------------------------------------------
/kge/data/freebase15k/freebase_15k-train.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SANSA-Stack/Spark-Tensors/9d834e75b917c6c476f426ebab47eec0830f190b/kge/data/freebase15k/freebase_15k-train.pkl
--------------------------------------------------------------------------------
/kge/data/freebase15k/freebase_15k-valid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SANSA-Stack/Spark-Tensors/9d834e75b917c6c476f426ebab47eec0830f190b/kge/data/freebase15k/freebase_15k-valid.pkl
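Note on the HolE scoring function (added for orientation): HolE, item 3 in the README above, scores a triple (s, p, o) by applying a sigmoid to the dot product of the relation embedding with the circular correlation of the subject and object embeddings; `kge/hole.py` and the `python/sansa/ml/kbc/keras/` modules below build on this idea. The following is a minimal NumPy sketch following the paper linked in the README, not the repository's own implementation — all names are illustrative.

```python
import numpy as np

def ccorr(a, b):
    # circular correlation via FFT: ccorr(a, b) = ifft(conj(fft(a)) * fft(b))
    return np.real(np.fft.ifft(np.conj(np.fft.fft(a)) * np.fft.fft(b)))

def hole_score(E, R, s, p, o):
    # eta = r_p . ccorr(e_s, e_o); sigmoid(eta) is the plausibility of (s, p, o)
    eta = R[p].dot(ccorr(E[s], E[o]))
    return 1.0 / (1.0 + np.exp(-eta))

# toy usage with random 4-dimensional embeddings
rng = np.random.RandomState(0)
E = rng.randn(10, 4)   # 10 entity embeddings
R = rng.randn(3, 4)    # 3 relation embeddings
print(hole_score(E, R, s=0, p=1, o=2))
```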
-------------------------------------------------------------------------------- /kge/hole.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from __future__ import print_function 4 | 5 | from collections import OrderedDict 6 | import keras 7 | import theano as th 8 | import theano.tensor as T 9 | 10 | from keras import backend as K 11 | from keras.optimizers import Adagrad, SGD 12 | import keras 13 | from keras.engine.topology import Layer 14 | from keras.models import Sequential, Model 15 | from keras.layers import merge, Input, Embedding, Dropout, Convolution1D, Lambda, Activation, LSTM, Dense, TimeDistributed, \ 16 | ActivityRegularization, Reshape, Flatten 17 | from keras.constraints import unitnorm 18 | 19 | import os 20 | import sys 21 | import random 22 | import numpy as np 23 | from time import strftime, gmtime 24 | import six.moves.cPickle as pickle 25 | from keras.optimizers import RMSprop, Adam, SGD, Adadelta, Adagrad 26 | from scipy.stats import rankdata 27 | 28 | __author__ = 'nilesh' 29 | 30 | class KgeModel: 31 | def __init__(self, config): 32 | self.subject = Input(shape=(config['subject_len'],), dtype='int32', name='subject_base') 33 | self.subject_bad = Input(shape=(config['subject_len'],), dtype='int32', name='subject_bad_base') 34 | self.relation = Input(shape=(config['relation_len'],), dtype='int32', name='relation_base') 35 | self.object_good = Input(shape=(config['object_len'],), dtype='int32', name='object_good_base') 36 | self.object_bad = Input(shape=(config['object_len'],), dtype='int32', name='object_bad_base') 37 | 38 | self.config = config 39 | self.model_params = config.get('model_params', dict()) 40 | self.similarity_params = config.get('similarity_params', dict()) 41 | 42 | # initialize a bunch of variables that will be set later 43 | self._models = None 44 | self._similarities = None 45 | self._object = None 46 | self._subject = None 47 | self._kge_model = None 48 | 49 | self.training_model = None 50 | self.prediction_model = None 51 | 52 | def get_object(self): 53 | if self._object is None: 54 | self._object = Input(shape=(self.config['object_len'],), dtype='int32', name='object') 55 | return self._object 56 | 57 | def get_subject(self): 58 | if self._subject is None: 59 | self._subject = Input(shape=(self.config['subject_len'],), dtype='int32', name='subject') 60 | return self._subject 61 | 62 | # @abstractmethod 63 | def build(self): 64 | return 65 | 66 | def get_similarity(self): 67 | ''' Specify similarity in configuration under 'similarity_params' -> 'mode' 68 | If a parameter is needed for the model, specify it in 'similarity_params' 69 | Example configuration: 70 | config = { 71 | ... other parameters ... 
72 | 'similarity_params': { 73 | 'mode': 'gesd', 74 | 'gamma': 1, 75 | 'c': 1, 76 | } 77 | } 78 | cosine: dot(a, b) / sqrt(dot(a, a) * dot(b, b)) 79 | polynomial: (gamma * dot(a, b) + c) ^ d 80 | sigmoid: tanh(gamma * dot(a, b) + c) 81 | rbf: exp(-gamma * l2_norm(a-b) ^ 2) 82 | euclidean: 1 / (1 + l2_norm(a - b)) 83 | exponential: exp(-gamma * l2_norm(a - b)) 84 | gesd: euclidean * sigmoid 85 | aesd: (euclidean + sigmoid) / 2 86 | ''' 87 | 88 | params = self.similarity_params 89 | similarity = params['mode'] 90 | 91 | axis = lambda a: len(a._keras_shape) - 1 92 | dot = lambda a, b: K.batch_dot(a, b, axes=axis(a)) 93 | l2_norm = lambda a, b: K.sqrt(K.sum((a - b) ** 2, axis=axis(a), keepdims=True)) 94 | l1_norm = lambda a, b: K.sum(K.abs(a - b), axis=axis(a), keepdims=True) 95 | 96 | if similarity == 'cosine': 97 | return lambda x: dot(x[0], x[1]) / K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1])) 98 | elif similarity == 'polynomial': 99 | return lambda x: (params['gamma'] * dot(x[0], x[1]) + params['c']) ** params['d'] 100 | elif similarity == 'sigmoid': 101 | return lambda x: K.tanh(params['gamma'] * dot(x[0], x[1]) + params['c']) 102 | elif similarity == 'rbf': 103 | return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]) ** 2) 104 | elif similarity == 'euclidean': 105 | return lambda x: 1 / (1 + l2_norm(x[0], x[1])) 106 | elif similarity == 'l1': 107 | return lambda x: -l1_norm(x[0], x[1]) 108 | elif similarity == 'exponential': 109 | return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1])) 110 | elif similarity == 'gesd': 111 | euclidean = lambda x: 1 / (1 + l2_norm(x[0], x[1])) 112 | sigmoid = lambda x: 1 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c']))) 113 | return lambda x: euclidean(x) * sigmoid(x) 114 | elif similarity == 'aesd': 115 | euclidean = lambda x: 0.5 / (1 + l2_norm(x[0], x[1])) 116 | sigmoid = lambda x: 0.5 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c']))) 117 | return lambda x: euclidean(x) + sigmoid(x) 118 | else: 119 | raise Exception('Invalid similarity: {}'.format(similarity)) 120 | 121 | def get_kge_model(self): 122 | if self._models is None: 123 | self._models = self.build() 124 | 125 | if self._kge_model is None: 126 | subject_output, relation_output, object_output = self._models 127 | 128 | # relation_output2 = Reshape((100,100))(relation_output) 129 | sp_output = merge([subject_output, relation_output], mode='sum') 130 | # so_output = merge([subject_output, object_output], mode=lambda x: np.outer(x[0], x[1]).reshape(100000,)) 131 | # spo_output = merge([sp_output, Reshape((0,100))(object_output)], mode=lambda a, b: K.batch_dot(a, b, axes=len(a._keras_shape) - 1), 132 | # output_shape=lambda x: x[0]) 133 | spo_output = merge([sp_output, object_output], mode='sum', output_shape=lambda x: x[:-1]) 134 | 135 | self._kge_model = Model(input=[self.subject, self.relation, self.get_object()], output=[spo_output]) 136 | return self._kge_model 137 | 138 | 139 | def compile(self, optimizer, **kwargs): 140 | kge_model = self.get_kge_model() 141 | 142 | good_output = kge_model([self.subject, self.relation, self.object_good]) 143 | bad_output = kge_model([self.subject, self.relation, self.object_bad]) 144 | 145 | loss = merge([good_output, bad_output], 146 | mode=lambda x: K.maximum(1e-6, self.config['margin'] - x[0] + x[1]), 147 | output_shape=lambda x: x[0]) 148 | 149 | self.training_model = Model(input=[self.subject, self.relation, self.object_good, self.object_bad], output=loss) 150 | 
self.training_model.compile(loss=lambda y_true, y_pred: y_pred + y_true - y_true, optimizer=optimizer, **kwargs) 151 | 152 | self.prediction_model = Model(input=[self.subject, self.relation, self.object_good], output=good_output) 153 | self.prediction_model.compile(loss='binary_crossentropy', optimizer=optimizer, **kwargs) 154 | self.training_model.summary() 155 | 156 | def fit(self, x, **kwargs): 157 | assert self.training_model is not None, 'Must compile the model before fitting data' 158 | y = np.zeros(shape=x[0].shape[:1]) 159 | return self.training_model.fit(x, y, **kwargs) 160 | 161 | 162 | def train_on_batch(self, x, **kwargs): 163 | assert self.training_model is not None, 'Must compile the model before fitting data' 164 | y = np.zeros(shape=x[0].shape[:1]) 165 | return self.training_model.train_on_batch(x, y, **kwargs) 166 | 167 | def predict(self, x, **kwargs): 168 | return self.prediction_model.predict(x, **kwargs) 169 | 170 | def save_weights(self, file_name, **kwargs): 171 | assert self.prediction_model is not None, 'Must compile the model before saving weights' 172 | self.prediction_model.save_weights(file_name, **kwargs) 173 | 174 | def load_weights(self, file_name, **kwargs): 175 | assert self.prediction_model is not None, 'Must compile the model loading weights' 176 | self.prediction_model.load_weights(file_name, **kwargs) 177 | 178 | 179 | 180 | 181 | class RescalModel(KgeModel): 182 | def build(self): 183 | subject = self.subject 184 | relation = self.relation 185 | object_ = self.get_object() 186 | embedding_size = self.model_params.get('n_embed_dims', 100) 187 | 188 | # add embedding layers 189 | embedding_rel = Embedding(input_dim=self.config['n_rel'], 190 | output_dim=self.model_params.get('n_embed_dims', 100), 191 | init='he_uniform', 192 | mask_zero=False) 193 | embedding_ent = Embedding(input_dim=self.config['n_ent'], 194 | output_dim=self.model_params.get('n_embed_dims', 100), 195 | init='he_uniform', 196 | W_constraint=unitnorm(axis=1), 197 | mask_zero=False) 198 | subject_embedding = embedding_ent(subject) 199 | relation_embedding = embedding_rel(relation) 200 | object_embedding = embedding_ent(object_) 201 | 202 | subject_output = Reshape((embedding_size,))(subject_embedding) 203 | relation_output = Reshape((embedding_size,))(relation_embedding) 204 | object_output = Reshape((embedding_size,))(object_embedding) 205 | 206 | return subject_output, relation_output, object_output 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | random.seed(42) 217 | os.environ['FREEBASE_15K'] = 'data/freebase15k' 218 | 219 | 220 | class Evaluator: 221 | def __init__(self, conf=None): 222 | try: 223 | data_path = os.environ['FREEBASE_15K'] 224 | except KeyError: 225 | print("FREEBASE_15K is not set.") 226 | sys.exit(1) 227 | self.path = data_path 228 | self.conf = dict() if conf is None else conf 229 | self.params = conf.get('training_params', dict()) 230 | self.entity = self.load('freebase_15k-id2entity.pkl') 231 | self._vocab = None 232 | self._reverse_vocab = None 233 | self._eval_sets = None 234 | 235 | ##### Resources ##### 236 | 237 | def load(self, name): 238 | return pickle.load(open(os.path.join(self.path, name), 'rb')) 239 | 240 | def vocab(self): 241 | if self._vocab is None: 242 | self._vocab = self.load('vocabulary') 243 | return self._vocab 244 | 245 | def reverse_vocab(self): 246 | if self._reverse_vocab is None: 247 | vocab = self.vocab() 248 | self._reverse_vocab = dict((v.lower(), k) for k, v in vocab.items()) 249 | return self._reverse_vocab 250 | 251 | 
##### Loading / saving ##### 252 | 253 | def save_epoch(self, model, epoch): 254 | if not os.path.exists('models/freebase_models/embedding/'): 255 | os.makedirs('models/freebase_models/embedding/') 256 | model.save_weights('models/freebase_models/embedding/weights_epoch_%d.h5' % epoch, overwrite=True) 257 | 258 | def load_epoch(self, model, epoch): 259 | assert os.path.exists('models/freebase_models/embedding/weights_epoch_%d.h5' % epoch),\ 260 | 'Weights at epoch %d not found' % epoch 261 | model.load_weights('models/freebase_models/embedding/weights_epoch_%d.h5' % epoch) 262 | 263 | ##### Converting / reverting ##### 264 | 265 | def convert(self, words): 266 | rvocab = self.reverse_vocab() 267 | if type(words) == str: 268 | words = words.strip().lower().split(' ') 269 | return [rvocab.get(w, 0) for w in words] 270 | 271 | def revert(self, indices): 272 | vocab = self.vocab() 273 | return [vocab.get(i, 'X') for i in indices] 274 | 275 | ##### Padding ##### 276 | 277 | def padq(self, data): 278 | return self.pad(data, self.conf.get('question_len', None)) 279 | 280 | def pada(self, data): 281 | return self.pad(data, self.conf.get('answer_len', None)) 282 | 283 | def pad(self, data, len=None): 284 | from keras.preprocessing.sequence import pad_sequences 285 | return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0) 286 | 287 | ##### Training ##### 288 | 289 | def print_time(self): 290 | print(strftime('%Y-%m-%d %H:%M:%S :: ', gmtime()), end='') 291 | 292 | def train(self, model): 293 | eval_every = self.params.get('eval_every', None) 294 | save_every = self.params.get('save_every', None) 295 | batch_size = self.params.get('batch_size', 128) 296 | nb_epoch = self.params.get('nb_epoch', 10) 297 | split = self.params.get('validation_split', 0) 298 | 299 | training_set = self.load('freebase_15k-train.pkl') 300 | valid_set = self.load('freebase_15k-valid.pkl') 301 | 302 | subjects = list() 303 | relations = list() 304 | good_objects = list() 305 | 306 | for line in training_set: 307 | triplet = line.split('\t') 308 | subjects += [[int(triplet[0])]] 309 | relations += [[int(triplet[1])]] 310 | good_objects += [[int(triplet[2])]] 311 | 312 | subjects = np.asarray(subjects) 313 | relations = np.asarray(relations) 314 | good_objects = np.asarray(good_objects) 315 | 316 | # subjects_valid = list() 317 | # relations_valid = list() 318 | # good_objects_valid = list() 319 | # 320 | # for line in valid_set: 321 | # triplet = line.split('\t') 322 | # subjects_valid += [[int(triplet[0])]] 323 | # relations_valid += [[int(triplet[1])]] 324 | # good_objects_valid += [[int(triplet[2])]] 325 | 326 | # subjects_valid = np.asarray(subjects_valid) 327 | # relations_valid = np.asarray(relations_valid) 328 | # good_objects_valid = np.asarray(good_objects_valid) 329 | 330 | val_loss = {'loss': 1., 'epoch': 0} 331 | 332 | for i in range(1, nb_epoch+1): 333 | # bad_answers = np.roll(good_answers, random.randint(10, len(questions) - 10)) 334 | # bad_answers = good_answers.copy() 335 | # random.shuffle(bad_answers) 336 | bad_objects = np.asarray([[int(random.choice(list(self.entity.keys())))] for _ in range(len(good_objects))]) 337 | 338 | # shuffle questionsj 339 | # zipped = zip(questions, good_answers) 340 | # random.shuffle(zipped) 341 | # questions[:], good_answers[:] = zip(*zipped) 342 | 343 | print('Epoch %d :: ' % i, end='') 344 | self.print_time() 345 | model.fit([subjects, relations, good_objects, bad_objects], nb_epoch=1, batch_size=batch_size) 346 | 347 | # if 
hist.history['val_loss'][0] < val_loss['loss']: 348 | # val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i} 349 | # print('Best: Loss = {}, Epoch = {}'.format(val_loss['loss'], val_loss['epoch'])) 350 | 351 | if eval_every is not None and i % eval_every == 0: 352 | self.get_mrr(model) 353 | 354 | if save_every is not None and i % save_every == 0: 355 | self.save_epoch(model, i) 356 | 357 | ##### Evaluation ##### 358 | 359 | def prog_bar(self, so_far, total, n_bars=20): 360 | n_complete = int(so_far * n_bars / total) 361 | if n_complete >= n_bars - 1: 362 | print('\r[' + '=' * n_bars + ']', end='') 363 | else: 364 | s = '\r[' + '=' * (n_complete - 1) + '>' + '.' * (n_bars - n_complete) + ']' 365 | print(s, end='') 366 | 367 | def eval_sets(self): 368 | if self._eval_sets is None: 369 | self._eval_sets = dict([(s, self.load(s)) for s in ['freebase_15k-test.pkl']]) 370 | return self._eval_sets 371 | 372 | def get_mrr(self, model, evaluate_all=False): 373 | top1s = list() 374 | mrrs = list() 375 | for name, data in self.eval_sets().items(): 376 | if evaluate_all: 377 | self.print_time() 378 | print('----- %s -----' % name) 379 | 380 | random.shuffle(data) 381 | 382 | if not evaluate_all and 'n_eval' in self.params: 383 | data = data[:self.params['n_eval']] 384 | 385 | # c_1 for hit@1, c_3 for hit@3, c_10 for hit@10 386 | c_1, c_3, c_10 = 0, 0, 0 387 | mean_ranks = list() 388 | 389 | for i, d in enumerate(data): 390 | triplet = d.split('\t') 391 | if evaluate_all: 392 | self.prog_bar(i, len(data)) 393 | 394 | candidate_objects = self.entity.keys() 395 | candidate_objects.remove(int(triplet[2])) 396 | 397 | subject = np.asarray([[int(triplet[0])]] * (len(candidate_objects)+1)) 398 | relation = np.asarray([[int(triplet[1])]] * (len(candidate_objects)+1)) 399 | objects = np.asarray([[int(triplet[2])]] + [[entity_id] for entity_id in candidate_objects]) 400 | sims = model.predict([subject, relation, objects], batch_size=len(self.entity)).flatten() 401 | r = rankdata(sims, method='max') 402 | 403 | target_rank = r[0] 404 | num_candidate = len(sims) 405 | real_rank = num_candidate - target_rank + 1 406 | 407 | # print(' '.join(self.revert(d['question']))) 408 | # print(' '.join(self.revert(self.answers[indices[max_r]]))) 409 | # print(' '.join(self.revert(self.answers[indices[max_n]]))) 410 | 411 | c_1 += 1 if target_rank == num_candidate else 0 412 | c_3 += 1 if target_rank + 3 > num_candidate else 0 413 | c_10 += 1 if target_rank + 10 > num_candidate else 0 414 | mean_ranks.append(real_rank) 415 | # c_2 += 1 / float(r[max_r] - r[max_n] + 1) 416 | 417 | hit_at_1 = c_1 / float(len(data)) 418 | hit_at_3 = c_3 / float(len(data)) 419 | hit_at_10 = c_10 / float(len(data)) 420 | avg_rank = np.mean(mean_ranks) 421 | 422 | del data 423 | 424 | if evaluate_all: 425 | print('Hit@1 Precision: %f' % hit_at_1) 426 | print('Hit@3 Precision: %f' % hit_at_3) 427 | print('Hit@10 Precision: %f' % hit_at_10) 428 | print('Mean Rank: %f' % avg_rank) 429 | 430 | # top1s.append(top1) 431 | # mrrs.append(mrr) 432 | 433 | # rerun the evaluation if above some threshold 434 | if not evaluate_all: 435 | print('Top-1 Precision: {}'.format(top1s)) 436 | print('MRR: {}'.format(mrrs)) 437 | evaluate_all_threshold = self.params.get('evaluate_all_threshold', dict()) 438 | evaluate_mode = evaluate_all_threshold.get('mode', 'all') 439 | mrr_theshold = evaluate_all_threshold.get('mrr', 1) 440 | top1_threshold = evaluate_all_threshold.get('top1', 1) 441 | 442 | if evaluate_mode == 'any': 443 | evaluate_all = evaluate_all or 
any([x >= top1_threshold for x in top1s]) 444 | evaluate_all = evaluate_all or any([x >= mrr_theshold for x in mrrs]) 445 | else: 446 | evaluate_all = evaluate_all or all([x >= top1_threshold for x in top1s]) 447 | evaluate_all = evaluate_all or all([x >= mrr_theshold for x in mrrs]) 448 | 449 | if evaluate_all: 450 | return self.get_mrr(model, evaluate_all=True) 451 | 452 | if __name__ == '__main__': 453 | conf = { 454 | 'subject_len': 1, 455 | 'relation_len': 1, 456 | 'object_len': 1, 457 | 'n_rel': 1345, # len(vocabulary) 458 | 'n_ent': 14951, 459 | 'margin': 0.2, 460 | 461 | 'training_params': { 462 | 'save_every': 100, 463 | 'eval_every': 1, 464 | 'batch_size': 128, 465 | 'nb_epoch': 1000, 466 | 'validation_split': 0, 467 | 'optimizer': Adam(), 468 | # 'optimizer': Adam(clip_norm=0.1), 469 | # 'n_eval': 100, 470 | 471 | 'evaluate_all_threshold': { 472 | 'mode': 'all', 473 | 'top1': 0.4, 474 | }, 475 | }, 476 | 477 | 'model_params': { 478 | 'n_embed_dims': 100, 479 | 'n_hidden': 200, 480 | 481 | # convolution 482 | 'nb_filters': 1000, # * 4 483 | 'conv_activation': 'relu', 484 | 485 | # recurrent 486 | 'n_lstm_dims': 141, # * 2 487 | 488 | # 'initial_embed_weights': np.load('models/wordnet_word2vec_1000_dim.h5'), 489 | }, 490 | 491 | 'similarity_params': { 492 | 'mode': 'cosine', 493 | 'gamma': 1, 494 | 'c': 1, 495 | 'd': 2, 496 | } 497 | } 498 | 499 | evaluator = Evaluator(conf) 500 | 501 | ##### Embedding model ###### 502 | model = RescalModel(conf) 503 | optimizer = conf.get('training_params', dict()).get('optimizer', 'adam') 504 | 505 | # TransE model 506 | # model = TranEModel(conf) 507 | # optimizer = conf.get('training_params', dict()).get('optimizer', 'adam') 508 | 509 | model.compile(optimizer=optimizer) 510 | 511 | # save embedding layer 512 | # evaluator.load_epoch(model, 33) 513 | # embedding_layer = model.prediction_model.layers[2].layers[2] 514 | # evaluator.load_epoch(model, 100) 515 | # evaluator.train(model) 516 | # weights = embedding_layer.get_weights()[0] 517 | # np.save(open('models/embedding_1000_dim.h5', 'wb'), weights) 518 | 519 | # train the model 520 | # evaluator.load_epoch(model, 54) 521 | evaluator.train(model) 522 | # embedding_matrix = model.prediction_model.layers[3].layers[3].get_weights()[0] 523 | # print(np.linalg.norm(embedding_matrix[1, :])) 524 | # print(np.linalg.norm(embedding_matrix[:, 1])) 525 | 526 | # evaluate mrr for a particular epoch 527 | # evaluator.load_epoch(model, 5) 528 | # evaluator.get_mrr(model, evaluate_all=True) 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | # class HolE(Layer): 540 | # def __init__(self, ndim=50, marge=1., lremb=0.1, lrparam=1., **kwargs): 541 | # super().__init__(**kwargs) 542 | # self.ndim = ndim 543 | # self.marge = marge 544 | # self.lremb = lremb 545 | # self.lrparam = lrparam 546 | 547 | 548 | 549 | 550 | # import itertools 551 | # import logging 552 | # import numpy as np 553 | # import os 554 | # import time 555 | # import theano as th 556 | # import theano.tensor as T 557 | # from .gradient_descent import gd 558 | # from ..data_structures import triple_tensor as tt 559 | # from ..experiments.metrics import auprc 560 | # from .optimization import sgd_on_triples 561 | # from ..experiments.helper import tolist 562 | # _log = logging.getLogger(__name__) 563 | # DTYPE = th.config.floatX # @UndefinedVariable 564 | # def init_uniform(rng, n, d, dtype=np.float32): 565 | # wbound = np.sqrt(6. 
/ d) 566 | # W_values = rng.uniform(low=-wbound, high=wbound, size=(d, n)) 567 | # W_values = W_values / np.sqrt(np.sum(W_values ** 2, axis=0)) 568 | # W_values = np.asarray(W_values, dtype=dtype) 569 | # return W_values.T 570 | # class TranslationalEmbeddingsModel(object): 571 | # """Translational Embeddings Model. 572 | # Implementation of TransE: 573 | # Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, Oksana 574 | # Yakhnenko. Translating Embeddings for Modeling Multi-relational Data. 575 | # NIPS 2013 576 | # Parameters 577 | # ---------- 578 | # consider_tc : bool 579 | # Whether or not to consider information about type constraints in the 580 | # data. 581 | # Defaults to True. 582 | # simfn : string. 583 | # 'L1' or 'L2' similarity function. 584 | # Defaults to 'L1'. 585 | # ndim : int 586 | # Dimension of the latent embeddings (rank). 587 | # Defaults to 50. 588 | # marge : float 589 | # Margin in the margin based ranking function (gamma in the paper). 590 | # Defaults to 1. 591 | # lremb : float 592 | # Learning rate for latent embeddings. 593 | # Defaults to 0.1. 594 | # lrparam : float 595 | # Learning rate for other parameters. 596 | # Defaults to 1.0. 597 | # mbatchsize : int 598 | # Size of the minibatch. 599 | # Defaults to 128. 600 | # totepoches : int 601 | # Maximum epoches (how often the model is trained on the complete 602 | # dataset). 603 | # Defaults to 500. 604 | # neval : int 605 | # Validate performance every nth minibatch. 606 | # Defaults to 1. 607 | # lcwa : bool 608 | # If true and consider_tc is True, approximate the type constraints from 609 | # the data with the local closed-world assumption. 610 | # Defaults to `False`. 611 | # seed : int 612 | # Seed used for random number generation. 613 | # Defaults to 123. 614 | # savepath : string 615 | # Location where to save the best model parameters. 616 | # Defaults to ./transE. 
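# (Illustrative note, not part of the original docstring: with entity
# embeddings E and relation embeddings R as NumPy arrays, the L1 score of a
# triple (h, l, t) is -np.sum(np.abs(E[h] + R[l] - E[t])), i.e. the negated
# distance d(h+l, t) that the margin-based ranking loss compares between
# observed and corrupted triples.)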
617 | # """ 618 | # def __init__(self, consider_tc=True, simfn='L1', ndim=50, marge=1., 619 | # lremb=0.1, lrparam=1., mbatchsize=128, maxepoch=500, 620 | # neval=100, lcwa=False, seed=123, conv=1e-4, 621 | # savepath='./transE', dtype=DTYPE, 622 | # mid=np.random.randint(1000000)): 623 | # model_id = (time.strftime('%d_%m_%y___%H_%M_%S') + 624 | # '%d-%d_' % (mid, np.random.randint(100000))) 625 | # self.simfn = simfn 626 | # self.ndim = ndim 627 | # self.marge = marge 628 | # self.lremb = lremb 629 | # self.lrparam = lrparam 630 | # self.mbatchsize = mbatchsize 631 | # self.maxepoch = maxepoch 632 | # self.neval = neval 633 | # self.seed = seed 634 | # self.corrupted = 1 635 | # self.corrupted_axes = [0, 1] 636 | # self.rng = np.random.RandomState(seed) 637 | # self.dtype = dtype 638 | # self.consider_tc = consider_tc 639 | # self.lcwa = lcwa 640 | # self.conv = conv 641 | # self.params = [ndim, marge, lremb, lrparam, simfn, seed, consider_tc, 642 | # lcwa] 643 | # self.parallization_precautions = False 644 | # self.savefile = os.path.join(savepath, 645 | # model_id+type(self).__name__+".pkl") 646 | # # create path where the model is saved 647 | # if not os.path.isdir(savepath): 648 | # os.mkdir(savepath) 649 | # def __graph_pred(self, X): 650 | # # Translational Embeddings Function d(h+l,t) 651 | # e = self.E[X[:, :2].T.reshape((-1,))] 652 | # h = e[:e.shape[0]//2] 653 | # l = self.R[X[:, 2]] 654 | # t = e[e.shape[0]//2:] 655 | # return (-T.sum(T.abs_((h+l)-t), axis=1) 656 | # if self.simfn == 'L1' 657 | # else - T.sqrt(T.sum(T.sqr((h+l)-t), axis=1))) 658 | # def __graph_train(self, X, Xc): 659 | # # Translational Embeddings max-margin loss function 660 | # E = self.E[T.concatenate([X[:, :2], Xc[:, :2]], 661 | # axis=1).T.reshape((-1,))] 662 | # R = self.R[T.concatenate([X[:, 2], Xc[:, 2]])] 663 | # e = E[:E.shape[0]//2] 664 | # h = e[:e.shape[0]//2] 665 | # l = R[:R.shape[0]//2] 666 | # t = e[e.shape[0]//2:] 667 | # outputX = (-T.sum(T.abs_((h+l)-t), axis=1) 668 | # if self.simfn == 'L1' 669 | # else - T.sqrt(T.sum(T.sqr((h+l)-t), axis=1))) 670 | # ec = E[E.shape[0]//2:] 671 | # hc = ec[:ec.shape[0]//2] 672 | # lc = R[R.shape[0]//2:] 673 | # tc = ec[ec.shape[0]//2:] 674 | # outputXc = (-T.sum(T.abs_((hc+lc)-tc), axis=1) 675 | # if self.simfn == 'L1' 676 | # else - T.sqrt(T.sum(T.sqr((hc+lc)-tc), axis=1))) 677 | # loss = outputXc - outputX + self.marge 678 | # return T.sum(loss * (loss > 0)) 679 | # def loss_func(self, indices, Y): 680 | # # Metric used for early stopping 681 | # return 1-auprc(Y, self.func(indices)) 682 | # def fit(self, tensor): 683 | # if not self.consider_tc: 684 | # # remove type-constraint information 685 | # tensor.type_constraints = [[None, None] 686 | # for i in xrange(tensor.shape[2])] 687 | # elif self.lcwa: 688 | # tensor.approximate_type_constraints() 689 | # self.type_constraints = tensor.type_constraints 690 | # self.Nent = tensor.shape[0] 691 | # self.Nrel = tensor.shape[2] 692 | # self.samplefunc = tt.compute_corrupted_bordes 693 | # X = T.imatrix("X") # matrices with triple indices 694 | # Xc = T.imatrix("Xc") # corrupted entities 695 | # self.E = th.shared( 696 | # value=init_uniform(self.rng, tensor.shape[0], self.ndim, 697 | # dtype=self.dtype), name="Ents_emb") 698 | # self.R = th.shared( 699 | # value=init_uniform(self.rng, tensor.shape[0], self.ndim, 700 | # dtype=self.dtype), name="Rels_emb") 701 | # self.parameters = [self.E, self.R] 702 | # # Output function TransE: d(h+l,t) 703 | # self.func = th.function([X], self.__graph_pred(X)) 704 | # # 
Define the cost function 705 | # loss_pos = self.__graph_train(X, Xc) 706 | # # Normalization function for embeddings of entities: 707 | # batch_idcs = T.ivector('batch_idcs') 708 | # update = OrderedDict({self.E: T.set_subtensor( 709 | # self.E[batch_idcs], self.E[batch_idcs] / 710 | # T.sqrt(T.sum(self.E[batch_idcs] ** 2, axis=1, keepdims=True)))}) 711 | # self.normalize = th.function([batch_idcs], [], updates=update) 712 | # # Update function 713 | # self.update_func = gd([X, Xc], loss_pos, self.parameters, 714 | # lr=[self.lremb, 715 | # self.lrparam/float(self.mbatchsize)]) 716 | # # Train the model with stg 717 | # fitted_parameters, self.used_epochs, self.epoch_times = ( 718 | # sgd_on_triples(self.rng, tensor, self, neval=self.neval, 719 | # mbsize=self.mbatchsize, unlabeled=True, 720 | # copy_X_train=not self.parallization_precautions)) 721 | # for i, parameter in enumerate(fitted_parameters): 722 | # self.parameters[i].set_value(parameter.get_value()) 723 | # @property 724 | # def sparsity(self): 725 | # raise NotImplementedError 726 | # def clear(self): 727 | # """Deletes the memory expensive parameters.""" 728 | # del self.E 729 | # del self.R 730 | # del self.parameters 731 | # os.remove(self.savefile) 732 | # def predict(self, indices): 733 | # # This should be just d(h+l,t) 734 | # return self.func(indices) 735 | # @staticmethod 736 | # def model_creator(settings): 737 | # # For loading multiple model parameters from a configuration file 738 | # confs = None 739 | # if settings['try_all_reg_combinations']: 740 | # confs = list(itertools.product(tolist(settings['rank']), 741 | # tolist(settings['gamma']), 742 | # tolist(settings['lrate_emb']), 743 | # tolist(settings['lrate_par']))) 744 | # else: 745 | # confs = [[r, m, lr1, lr2] 746 | # for r, m, lr1, lr2 in 747 | # zip(tolist(settings['rank']), 748 | # tolist(settings['gamma']), 749 | # tolist(settings['lrate_emb']), 750 | # tolist(settings['lrate_par']))] 751 | # confs = list(itertools.product(tolist(settings['seed']), confs)) 752 | # models = [] 753 | # for i, conf in enumerate(confs): 754 | # s, conf = conf 755 | # r, m, lr1, lr2 = conf 756 | # models.append(TranslationalEmbeddingsModel( 757 | # consider_tc=settings['consider_tc'], 758 | # simfn=str.upper(settings['simfn']), 759 | # ndim=r, 760 | # marge=m, 761 | # lremb=lr1, 762 | # lrparam=lr2, 763 | # mbatchsize=settings['mbatchsize'], 764 | # maxepoch=settings['maxepoch'], 765 | # neval=settings['neval'], 766 | # lcwa=settings['lcwa'], 767 | # seed=s, 768 | # savepath=settings['savepath'], 769 | # mid=i)) 770 | # return models -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | net.sansa_stack.ml.kge 8 | sansa-kge 9 | 0.0.1-SNAPSHOT 10 | ${project.artifactId} 11 | Knowledge graph factorization 12 | 2016 13 | 14 | 15 | 1.8 16 | 1.8 17 | UTF-8 18 | 2.0.1 19 | 2.11.7 20 | 2.11 21 | ${scala.compat.version} 22 | 23 | 24 | 25 | https://github.com/SANSA-Stack/Spark-Tensors 26 | scm:git:git://github.com/SANSA-Stack/Spark-Tensors.git 27 | scm:git:git@github.com:SANSA-Stack/Spark-Tensors.git 28 | HEAD 29 | 30 | 31 | 32 | 33 | oss-sonatype 34 | oss-sonatype 35 | https://oss.sonatype.org/content/repositories/snapshots/ 36 | 37 | true 38 | 39 | 40 | 41 | apache-snapshot 42 | Apache repository (snapshots) 43 | https://repository.apache.org/content/repositories/snapshots/ 44 | 45 | true 46 | 47 | 48 | 49 | maven.aksw.internal 50 | AKSW 
Release Repository 51 | http://maven.aksw.org/archiva/repository/internal 52 | 53 | true 54 | 55 | 56 | false 57 | 58 | 59 | 60 | maven.aksw.snapshots 61 | AKSW Snapshot Repository 62 | http://maven.aksw.org/archiva/repository/snapshots 63 | 64 | false 65 | 66 | 67 | true 68 | 69 | 70 | 71 | 72 | 73 | 74 | ml.dmlc.mxnet 75 | mxnet-spark_${scala.compat.version} 76 | 0.10.1-SNAPSHOT 77 | 78 | 79 | * 80 | * 81 | 82 | 83 | 84 | 85 | ml.dmlc.mxnet 86 | mxnet-full_${scala.compat.version}-${platform} 87 | 0.10.1-SNAPSHOT 88 | 89 | 90 | * 91 | * 92 | 93 | 94 | 95 | 96 | net.sansa-stack 97 | sansa-rdf-spark-core 98 | 0.1.1-SNAPSHOT 99 | 100 | 101 | org.apache.spark 102 | spark-graphx_${scala.compat.version} 103 | ${spark.version} 104 | 105 | 106 | org.apache.spark 107 | spark-core_${scala.compat.version} 108 | ${spark.version} 109 | 110 | 111 | org.apache.spark 112 | spark-mllib_${scala.compat.version} 113 | ${spark.version} 114 | 115 | 116 | 117 | 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-compiler-plugin 122 | 123 | 124 | 125 | net.alchim31.maven 126 | scala-maven-plugin 127 | 128 | 129 | 130 | org.apache.maven.plugins 131 | maven-surefire-plugin 132 | 133 | 134 | 135 | com.amashchenko.maven.plugin 136 | gitflow-maven-plugin 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | org.apache.maven.plugins 147 | maven-compiler-plugin 148 | 3.6.0 149 | 150 | ${maven.compiler.source} 151 | ${maven.compiler.target} 152 | 153 | 154 | 155 | 156 | net.alchim31.maven 157 | scala-maven-plugin 158 | 3.2.2 159 | 160 | 161 | 162 | 163 | 164 | 165 | add-source 166 | compile 167 | testCompile 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | org.apache.maven.plugins 184 | maven-surefire-plugin 185 | 2.18.1 186 | 187 | false 188 | true 189 | 190 | 191 | 192 | **/*Test.* 193 | **/*Suite.* 194 | 195 | 196 | 197 | 198 | 199 | com.amashchenko.maven.plugin 200 | gitflow-maven-plugin 201 | 1.3.1 202 | 203 | 204 | v 205 | 206 | false 207 | 208 | 209 | 210 | 211 | org.apache.maven.plugins 212 | maven-shade-plugin 213 | 2.4.3 214 | 215 | 216 | 217 | package 218 | 219 | shade 220 | 221 | 222 | true 223 | jar-with-dependencies 224 | 225 | 226 | *:* 227 | 228 | META-INF/*.SF 229 | META-INF/*.DSA 230 | META-INF/*.RSA 231 | 232 | 233 | 234 | 235 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 249 | 250 | org.eclipse.m2e 251 | lifecycle-mapping 252 | 1.0.0 253 | 254 | 255 | 256 | 257 | 258 | 259 | net.alchim31.maven 260 | 261 | 262 | scala-maven-plugin 263 | 264 | 265 | [3.2.0,) 266 | 267 | 268 | testCompile 269 | compile 270 | add-source 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | doclint-java8-disable 289 | 290 | [1.8,) 291 | 292 | 293 | 294 | 295 | 296 | org.apache.maven.plugins 297 | maven-javadoc-plugin 298 | 2.9.1 299 | 300 | 301 | attach-javadocs 302 | 303 | jar 304 | 305 | 306 | false 307 | 308 | 309 | 310 | 311 | -Xdoclint:none 312 | 313 | 314 | 315 | 316 | 317 | 318 | osx-x86_64-cpu 319 | 320 | 321 | mac 322 | x86_64 323 | 324 | 325 | 326 | osx-x86_64-cpu 327 | 328 | 329 | 330 | linux-x86_64-cpu 331 | 332 | 333 | linux 334 | 335 | 336 | 337 | linux-x86_64-cpu 338 | 339 | 340 | 341 | linux-x86_64-gpu 342 | 343 | linux-x86_64-gpu 344 | 345 | 346 | 347 | release 348 | 349 | 350 | 351 | org.apache.maven.plugins 352 | maven-gpg-plugin 353 | 1.6 354 | 355 | 356 | sign-artifacts 357 | verify 358 | 359 | sign 360 | 361 | 362 | AKSW 363 | ${gpg.keyname} 364 | 365 | 366 | 367 | 368 | 
369 | 370 | 371 | 372 | -------------------------------------------------------------------------------- /python/sansa/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'nilesh' 2 | -------------------------------------------------------------------------------- /python/sansa/ml/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'nilesh' 2 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'nilesh' 2 | 3 | import keras -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'nilesh' 2 | 3 | from .hole import HolE 4 | from .base import StochasticTrainer, PairwiseStochasticTrainer, KerasTrainer 5 | from .actfun import afuns as activation_functions 6 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/actfun.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import inspect 4 | 5 | 6 | class ActivationFunction(object): 7 | 8 | @classmethod 9 | def key(cls): 10 | return cls.__name__.lower() 11 | 12 | 13 | class Linear(ActivationFunction): 14 | 15 | @staticmethod 16 | def f(x): 17 | return x 18 | 19 | @staticmethod 20 | def g_given_f(fx): 21 | #return 1 22 | return np.ones(fx.shape[0]) 23 | 24 | # return np.ones((fx.shape[0], 1)) 25 | 26 | 27 | class Sigmoid(ActivationFunction): 28 | 29 | @staticmethod 30 | def f(x): 31 | return 1.0 / (1 + np.exp(-x)) 32 | 33 | @staticmethod 34 | def g_given_f(fx): 35 | return fx * (1.0 - fx) 36 | 37 | 38 | class Tanh(ActivationFunction): 39 | 40 | @staticmethod 41 | def f(x): 42 | return np.tanh(x) 43 | 44 | @staticmethod 45 | def g_given_f(fx): 46 | return 1 - fx ** 2 47 | 48 | 49 | class ReLU(ActivationFunction): 50 | 51 | @staticmethod 52 | def f(x): 53 | return np.maximum(0, x) 54 | 55 | @staticmethod 56 | def g_given_f(fx): 57 | return np.int_(fx > 0) 58 | 59 | 60 | class Softplus(ActivationFunction): 61 | 62 | @staticmethod 63 | def f(x): 64 | return np.log(1 + np.exp(x)) 65 | 66 | @staticmethod 67 | def g(x): 68 | raise NotImplementedError() 69 | 70 | 71 | afuns = {} 72 | for cls in ActivationFunction.__subclasses__(): 73 | afuns[cls.key()] = cls 74 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/base.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.engine.topology import Layer 3 | from keras.models import Sequential 4 | from keras.layers import Dense, Activation 5 | from keras import backend as K 6 | from keras.optimizers import Adagrad, SGD 7 | import math 8 | import theano.tensor as T 9 | import numpy as np 10 | from numpy.random import shuffle 11 | from collections import defaultdict, Counter 12 | from sansa.ml.kbc.keras.param import Parameter, AdaGrad 13 | import timeit 14 | import pickle 15 | 16 | _cutoff = 30 17 | 18 | _DEF_NBATCHES = 100 19 | _DEF_POST_EPOCH = [] 20 | _DEF_LEARNING_RATE = 0.1 21 | _DEF_SAMPLE_FUN = None 22 | _DEF_MAX_EPOCHS = 1000 23 | _DEF_MARGIN = 1.0 24 | 25 | 26 | class Config(object): 27 | 28 | def __init__(self, model, 
trainer): 29 | self.model = model 30 | self.trainer = trainer 31 | 32 | def __getstate__(self): 33 | return { 34 | 'model': self.model, 35 | 'trainer': self.trainer 36 | } 37 | 38 | 39 | class Model(object): 40 | """ 41 | Base class for all Knowledge Graph models 42 | 43 | Implements basic setup routines for parameters and serialization methods 44 | 45 | Subclasses need to implement: 46 | - scores(self, ss, ps, os) 47 | - _gradients(self, xys) for StochasticTrainer 48 | - _pairwise_gradients(self, pxs, nxs) for PairwiseStochasticTrainer 49 | """ 50 | 51 | def __init__(self, *args, **kwargs): 52 | #super(Model, self).__init__(*args, **) 53 | self.params = {} 54 | self.hyperparams = {} 55 | self.add_hyperparam('init', kwargs.pop('init', 'nunif')) 56 | 57 | def add_param(self, param_id, shape, post=None, value=None): 58 | if value is None: 59 | value = Parameter(shape, self.init, name=param_id, post=post) 60 | setattr(self, param_id, value) 61 | self.params[param_id] = value 62 | 63 | def add_hyperparam(self, param_id, value): 64 | setattr(self, param_id, value) 65 | self.hyperparams[param_id] = value 66 | 67 | def __getstate__(self): 68 | return { 69 | 'hyperparams': self.hyperparams, 70 | 'params': self.params 71 | } 72 | 73 | def __setstate__(self, st): 74 | self.params = {} 75 | self.hyperparams = {} 76 | for pid, p in st['params'].items(): 77 | self.add_param(pid, None, None, value=p) 78 | for pid, p in st['hyperparams'].items(): 79 | self.add_hyperparam(pid, p) 80 | 81 | def save(self, fname, protocol=pickle.HIGHEST_PROTOCOL): 82 | with open(fname, 'wb') as fout: 83 | pickle.dump(self, fout, protocol=protocol) 84 | 85 | @staticmethod 86 | def load(fname): 87 | with open(fname, 'rb') as fin: 88 | mdl = pickle.load(fin) 89 | return mdl 90 | 91 | class KerasTrainer(object): 92 | """ 93 | Keras model trainer 94 | """ 95 | 96 | def __init__(self, *args, **kwargs): 97 | self.model = args[0] 98 | self.hyperparams = {} 99 | self.add_hyperparam('max_epochs', kwargs.pop('max_epochs', _DEF_MAX_EPOCHS)) 100 | self.add_hyperparam('nbatches', kwargs.pop('nbatches', _DEF_NBATCHES)) 101 | self.add_hyperparam('learning_rate', kwargs.pop('learning_rate', _DEF_LEARNING_RATE)) 102 | 103 | self.post_epoch = kwargs.pop('post_epoch', _DEF_POST_EPOCH) 104 | self.samplef = kwargs.pop('samplef', _DEF_SAMPLE_FUN) 105 | 106 | def __getstate__(self): 107 | return self.hyperparams 108 | 109 | def __setstate__(self, st): 110 | for pid, p in st['hyperparams']: 111 | self.add_hyperparam(pid, p) 112 | 113 | def add_hyperparam(self, param_id, value): 114 | setattr(self, param_id, value) 115 | self.hyperparams[param_id] = value 116 | 117 | def fit(self, xs, ys): 118 | self._optim(list(zip(xs, ys))) 119 | 120 | 121 | def getModel(self): 122 | # # Training stuff 123 | # batch_placeholder = K.placeholder(shape=(3,), name="batch") 124 | # label_placeholder = K.placeholder(shape=(1,), name="label") 125 | 126 | # Model stuff 127 | # E = K.variable(self.model.E, name="entity_embeddings") 128 | # R = K.variable(self.model.R, name="relation_embeddings") 129 | model = Sequential() 130 | model.add(HolographicLayerTest(self.model.E.shape[0], self.model.R.shape[0], self.model.E.shape[1], self.model.rparam)) 131 | # model.add(Dense(5, input_dim=(10,))) 132 | # model.add(Activation('sigmoid')) 133 | # adagrad = Adagrad(lr=0.001, epsilon=1e-07) 134 | adagrad = SGD(lr=0.001, decay=1e-06, nesterov=True, momentum=0.5) 135 | 136 | def loss(y_true, y_pred): 137 | print(y_pred) 138 | return -K.sum(K.log(K.sigmoid(-y_true * y_pred))) 139 | 140 
| print("Compiling new model") 141 | model.compile(optimizer=adagrad, loss=loss) 142 | return model 143 | 144 | def _pre_epoch(self): 145 | self.loss = 0 146 | 147 | def _optim(self, xys): 148 | idx = np.arange(len(xys)) 149 | # self.batch_size = np.ceil(len(xys) / self.nbatches) 150 | 151 | # batch_idx = np.arange(self.batch_size, len(xys), self.batch_size) 152 | 153 | model = self.getModel() 154 | 155 | for self.epoch in range(1, self.max_epochs + 1): 156 | # shuffle training examples 157 | self._pre_epoch() 158 | shuffle(idx) 159 | 160 | # store epoch for callback 161 | self.epoch_start = timeit.default_timer() 162 | 163 | # process mini-batches 164 | # for batch in np.split(idx, batch_idx): 165 | # # select indices for current batch 166 | # bxys = [xys[z] for z in batch] 167 | # self._process_batch(bxys, model) 168 | 169 | self._process_batch(xys, model) 170 | 171 | # check callback function, if false return 172 | for f in self.post_epoch: 173 | if not f(self): 174 | break 175 | 176 | # 177 | # print (self.model.E.shape) 178 | # print (self.model.R.shape) 179 | 180 | def _process_batch(self, xys, model): 181 | # if enabled, sample additional examples 182 | if self.samplef is not None: 183 | xys += self.samplef(xys) 184 | 185 | if hasattr(self.model, '_prepare_batch_step'): 186 | self.model._prepare_batch_step(xys) 187 | 188 | shuffle(xys) 189 | # take step for batch 190 | assert isinstance(model, keras.models.Model) 191 | xs, ys = [np.array(i) for i in list(zip(*xys))] 192 | # print(xs, ys) 193 | # print(xs.shape, ys.shape) 194 | 195 | class LossHistory(keras.callbacks.Callback): 196 | def on_train_begin(self, logs={}): 197 | self.loss = -1 198 | 199 | def on_batch_end(self, batch, logs={}): 200 | self.loss = logs.get('loss') 201 | 202 | history = LossHistory() 203 | # print(Counter(ys)) 204 | # x = K.placeholder(shape=(1,3)) 205 | # func = K.function([x], model(x)) 206 | # for i, j in zip(xs, ys): 207 | # print(func([[i]]), j) 208 | model.fit(xs, ys, batch_size=len(xs)/100, nb_epoch=100, callbacks=[history]) 209 | loss = history.loss 210 | # loss = model.train_on_batch(xs, ys) 211 | E, R = model.layers[0].get_weights() 212 | # print (np.linalg.norm(self.model.E-E, 'fro')) 213 | # print (np.linalg.norm(self.model.R-R, 'fro')) 214 | self.model.E, self.model.R = E, R 215 | 216 | # print(loss) 217 | 218 | self.loss += loss 219 | # print (acc) 220 | 221 | class HolographicLayer(Layer): 222 | def __init__(self, E, R, rparam, input_shape=(3,), **kwargs): 223 | self.E = E 224 | self.R = R 225 | self.rparam = rparam 226 | kwargs["input_shape"] = input_shape 227 | super(HolographicLayer, self).__init__(**kwargs) 228 | 229 | def ccorr1d_sc(self, input, filters, image_shape=None, filter_shape=None, 230 | border_mode='valid', subsample=(1,), filter_flip=True): 231 | """ 232 | using conv2d with a single input channel 233 | """ 234 | # if border_mode not in ('valid', 0, (0,)): 235 | # raise RuntimeError("Unsupported border_mode for conv1d_sc: " 236 | # "%s" % border_mode) 237 | 238 | if image_shape is None: 239 | image_shape_sc = None 240 | else: 241 | # (b, c, i0) to (b, 1, c, i0) 242 | image_shape_sc = (image_shape[0], 1, image_shape[1], image_shape[2]) 243 | 244 | if filter_shape is None: 245 | filter_shape_sc = None 246 | else: 247 | filter_shape_sc = (filter_shape[0], 1, filter_shape[1], 248 | filter_shape[2]) 249 | 250 | input_sc = input.dimshuffle('x', 'x', 0, 'x') 251 | # We need to flip the channels dimension because it will be convolved over. 
252 | filters_sc = filters.dimshuffle('x', 'x', 0, 'x')[:, :, ::-1, :] 253 | 254 | conved = T.nnet.conv2d(input_sc, filters_sc, image_shape_sc, 255 | filter_shape_sc, subsample=(1, subsample[0]), 256 | filter_flip=filter_flip, border_mode=border_mode).flatten() 257 | return conved # drop the unused dimension 258 | 259 | def build(self, input_shape): 260 | self.trainable_weights = [self.E, self.R] 261 | # from keras.regularizers import l2 262 | # regularizer = l2(self.rparam) 263 | # regularizer.set_param(K.concatenate([self.E, self.R], axis=0)) 264 | # self.regularizers.append(regularizer) 265 | 266 | def call(self, x, mask=None): 267 | batch_placeholder = K.cast(x, 'int32')[0] 268 | s, o, p = [batch_placeholder[i] for i in range(3)] 269 | 270 | s2v = K.gather(self.E, s) 271 | o2v = K.gather(self.E, o) 272 | r2v = K.gather(self.R, p) 273 | 274 | # print(K.shape(s2v).eval()) 275 | # print(self.E[[0]].shape.eval()) 276 | 277 | def ccorr(a, b): 278 | return self.ccorr1d_sc(a, b, border_mode='half') 279 | 280 | eta = K.dot(K.transpose(r2v), ccorr(s2v, o2v)) 281 | return eta 282 | 283 | def get_output_shape_for(self, input_shape): 284 | return (input_shape[0], 1) 285 | 286 | class HolographicLayerTest(Layer): 287 | def __init__(self, E, R, d, rparam, input_shape=(3,), **kwargs): 288 | bnd = math.sqrt(6) / math.sqrt(2*E) 289 | from numpy.random import uniform 290 | self.init = [K.variable(uniform(size=(E,d), low=-bnd, high=bnd), name="E"), 291 | K.variable(uniform(size=(R,d*d), low=-bnd, high=bnd), name="R")] 292 | self.rparam = rparam 293 | kwargs["input_shape"] = input_shape 294 | super(HolographicLayerTest, self).__init__(**kwargs) 295 | 296 | def ccorr1d_sc(self, input, filters, image_shape=None, filter_shape=None, 297 | border_mode='valid', subsample=(1,), filter_flip=True): 298 | """ 299 | using conv2d with a single input channel 300 | """ 301 | # if border_mode not in ('valid', 0, (0,)): 302 | # raise RuntimeError("Unsupported border_mode for conv1d_sc: " 303 | # "%s" % border_mode) 304 | 305 | if image_shape is None: 306 | image_shape_sc = None 307 | else: 308 | # (b, c, i0) to (b, 1, c, i0) 309 | image_shape_sc = (image_shape[0], 1, image_shape[1], image_shape[2]) 310 | 311 | if filter_shape is None: 312 | filter_shape_sc = None 313 | else: 314 | filter_shape_sc = (filter_shape[0], 1, filter_shape[1], 315 | filter_shape[2]) 316 | 317 | input_sc = input.dimshuffle('x', 'x', 0, 'x') 318 | # We need to flip the channels dimension because it will be convolved over. 
319 | filters_sc = filters.dimshuffle('x', 'x', 0, 'x')[:, :, ::-1, :] 320 | 321 | conved = T.nnet.conv2d(input_sc, filters_sc, image_shape_sc, 322 | filter_shape_sc, subsample=(1, subsample[0]), 323 | filter_flip=filter_flip, border_mode=border_mode).flatten() 324 | return conved # drop the unused dimension 325 | 326 | def build(self, input_shape): 327 | self.E, self.R = self.init 328 | self.trainable_weights = [self.E, self.R] 329 | # from keras.regularizers import l2 330 | # regularizer = l2(self.rparam) 331 | # regularizer.set_param(self.E) 332 | # self.regularizers.append(regularizer) 333 | # 334 | # regularizer = l2(self.rparam) 335 | # regularizer.set_param(self.R) 336 | # self.regularizers.append(regularizer) 337 | 338 | def call(self, x, mask=None): 339 | batch_placeholder = K.cast(x, 'int32')[0] 340 | s, o, p = [batch_placeholder[i] for i in range(3)] 341 | 342 | s2v = K.gather(self.E, s) 343 | o2v = K.gather(self.E, o) 344 | r2v = K.gather(self.R, p) 345 | 346 | def ccorr(a, b): 347 | return T.outer(a,b).flatten() 348 | # return self.ccorr1d_sc(a, b, border_mode='half') 349 | eta = K.dot(r2v, ccorr(s2v, o2v)) 350 | 351 | # func = K.function([s2v,o2v,r2v], K.gradients(K.sigmoid(eta), [s2v,o2v,r2v])) 352 | # print(func([np.random.random(150),np.random.random(150),np.random.random(150)])) 353 | 354 | return eta 355 | 356 | def get_output_shape_for(self, input_shape): 357 | return (input_shape[0], 1) 358 | 359 | class TheanoGradTest(object): 360 | def ccorr1d_sc(self, input, filters, image_shape=None, filter_shape=None, 361 | border_mode='valid', subsample=(1,), filter_flip=True): 362 | """ 363 | using conv2d with a single input channel 364 | """ 365 | # if border_mode not in ('valid', 0, (0,)): 366 | # raise RuntimeError("Unsupported border_mode for conv1d_sc: " 367 | # "%s" % border_mode) 368 | 369 | if image_shape is None: 370 | image_shape_sc = None 371 | else: 372 | # (b, c, i0) to (b, 1, c, i0) 373 | image_shape_sc = (image_shape[0], 1, image_shape[1], image_shape[2]) 374 | 375 | if filter_shape is None: 376 | filter_shape_sc = None 377 | else: 378 | filter_shape_sc = (filter_shape[0], 1, filter_shape[1], 379 | filter_shape[2]) 380 | 381 | input_sc = input.dimshuffle('x', 'x', 0, 'x') 382 | # We need to flip the channels dimension because it will be convolved over. 
383 | filters_sc = filters.dimshuffle('x', 'x', 0, 'x')[:, :, ::-1, :] 384 | 385 | conved = T.nnet.conv2d(input_sc, filters_sc, image_shape_sc, 386 | filter_shape_sc, subsample=(1, subsample[0]), 387 | filter_flip=filter_flip, border_mode=border_mode).flatten() 388 | return conved # drop the unused dimension 389 | 390 | 391 | def call(self): 392 | E = K.variable(np.random.random((1000,100)), name="entity_embeddings") 393 | R = K.variable(np.random.random((10,10000)), name="relation_embeddings") 394 | x = K.placeholder(shape=(1,3), name="spo") 395 | y = K.placeholder(ndim=0, name="y") 396 | batch_placeholder = K.cast(x, 'int32')[0] 397 | # print(batch_placeholder.eval()) 398 | s, o, p = [batch_placeholder[i] for i in range(3)] 399 | 400 | s2v = K.gather(E, s) 401 | o2v = K.gather(E, o) 402 | r2v = K.gather(R, p) 403 | 404 | def ccorr(a, b): 405 | return T.outer(a,b).flatten() 406 | # return T.arctan(s2v) + T.arctan(o2v) 407 | # return (s2v.dimshuffle('x', 'x', 0, 'x') + o2v.dimshuffle('x', 'x', 0, 'x')).flatten() 408 | # return T.nnet.conv2d(a.dimshuffle('x', 'x', 0, 'x'), b.dimshuffle('x', 'x', 0, 'x'), None, 409 | # None, 410 | # filter_flip=True, border_mode='half') 411 | # return self.ccorr1d_sc(a, b, border_mode='half') 412 | eta = K.dot(r2v, ccorr(s2v, o2v)) 413 | # py = 1/(1+K.exp(-eta)) 414 | # l = -K.log(py) 415 | # from theano import pp, function, printing 416 | # grad = T.grad(eta, E) 417 | # print(pp(grad)) 418 | # func = function([x], grad) 419 | func = K.function([x, y], K.gradients(eta, [s2v, o2v, r2v, E, R])) 420 | 421 | # for i in func.maker.fgraph.outputs: 422 | # print(pp(i)) 423 | # print (T.grad(py, s2v)) 424 | print (func([[[1,2,3]], -1])) 425 | 426 | class StochasticTrainer(object): 427 | """ 428 | Stochastic gradient descent trainer with scalar loss function. 429 | 430 | Models need to implement 431 | 432 | _gradients(self, xys) 433 | 434 | to be trained with this class. 
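    (Clarifying note: as in the HolE model in hole.py, _gradients(xys) is
    expected to return a dict mapping each parameter id to a (gradient, index)
    pair, e.g. {'E': (ge, eidx), 'R': (gr, ridx)}; _batch_step passes each
    pair to the corresponding per-parameter updater.)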
435 | 436 | """ 437 | 438 | def __init__(self, *args, **kwargs): 439 | self.model = args[0] 440 | self.hyperparams = {} 441 | self.add_hyperparam('max_epochs', kwargs.pop('max_epochs', _DEF_MAX_EPOCHS)) 442 | self.add_hyperparam('nbatches', kwargs.pop('nbatches', _DEF_NBATCHES)) 443 | self.add_hyperparam('learning_rate', kwargs.pop('learning_rate', _DEF_LEARNING_RATE)) 444 | 445 | self.post_epoch = kwargs.pop('post_epoch', _DEF_POST_EPOCH) 446 | self.samplef = kwargs.pop('samplef', _DEF_SAMPLE_FUN) 447 | pu = kwargs.pop('param_update', AdaGrad) 448 | self._updaters = { 449 | key: pu(param, self.learning_rate) 450 | for key, param in self.model.params.items() 451 | } 452 | 453 | def __getstate__(self): 454 | return self.hyperparams 455 | 456 | def __setstate__(self, st): 457 | for pid, p in st['hyperparams']: 458 | self.add_hyperparam(pid, p) 459 | 460 | def add_hyperparam(self, param_id, value): 461 | setattr(self, param_id, value) 462 | self.hyperparams[param_id] = value 463 | 464 | def fit(self, xs, ys): 465 | self._optim(list(zip(xs, ys))) 466 | 467 | def _pre_epoch(self): 468 | self.loss = 0 469 | 470 | def _optim(self, xys): 471 | idx = np.arange(len(xys)) 472 | self.batch_size = np.ceil(len(xys) / self.nbatches) 473 | batch_idx = np.arange(self.batch_size, len(xys), self.batch_size) 474 | 475 | for self.epoch in range(1, self.max_epochs + 1): 476 | # shuffle training examples 477 | self._pre_epoch() 478 | shuffle(idx) 479 | 480 | # store epoch for callback 481 | self.epoch_start = timeit.default_timer() 482 | 483 | # process mini-batches 484 | for batch in np.split(idx, batch_idx): 485 | # select indices for current batch 486 | bxys = [xys[z] for z in batch] 487 | self._process_batch(bxys) 488 | 489 | # check callback function, if false return 490 | for f in self.post_epoch: 491 | if not f(self): 492 | break 493 | 494 | def _process_batch(self, xys): 495 | # if enabled, sample additional examples 496 | if self.samplef is not None: 497 | xys += self.samplef(xys) 498 | 499 | if hasattr(self.model, '_prepare_batch_step'): 500 | self.model._prepare_batch_step(xys) 501 | 502 | # take step for batch 503 | grads = self.model._gradients(xys) 504 | self.loss += self.model.loss 505 | self._batch_step(grads) 506 | 507 | def _batch_step(self, grads): 508 | for paramID in self._updaters.keys(): 509 | self._updaters[paramID](*grads[paramID]) 510 | 511 | 512 | class PairwiseStochasticTrainer(StochasticTrainer): 513 | """ 514 | Stochastic gradient descent trainer with pairwise ranking loss functions. 515 | 516 | Models need to implement 517 | 518 | _pairwise_gradients(self, pxs, nxs) 519 | 520 | to be trained with this class. 
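    (Clarifying note: pxs and nxs are the batches of positive and corrupted
    triples; like _gradients above, _pairwise_gradients returns a dict of
    (gradient, index) pairs per parameter, or None when no example violates
    the margin, in which case no update step is taken for the batch.)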
521 | 522 | """ 523 | 524 | 525 | def __init__(self, *args, **kwargs): 526 | super(PairwiseStochasticTrainer, self).__init__(*args, **kwargs) 527 | self.model.add_hyperparam('margin', kwargs.pop('margin', _DEF_MARGIN)) 528 | 529 | def fit(self, xs, ys): 530 | if self.samplef is None: 531 | pidx = np.where(np.array(ys) == 1)[0] 532 | nidx = np.where(np.array(ys) != 1)[0] 533 | pxs = [xs[i] for i in pidx] 534 | self.nxs = [xs[i] for i in nidx] 535 | self.pxs = int(len(self.nxs) / len(pxs)) * pxs 536 | xys = list(range(min(len(pxs), len(self.nxs)))) 537 | self._optim(xys) 538 | else: 539 | self._optim(list(zip(xs, ys))) 540 | 541 | def _pre_epoch(self): 542 | self.nviolations = 0 543 | if self.samplef is None: 544 | shuffle(self.pxs) 545 | shuffle(self.nxs) 546 | 547 | def _process_batch(self, xys): 548 | pxs = [] 549 | nxs = [] 550 | 551 | for xy in xys: 552 | if self.samplef is not None: 553 | for nx in self.samplef([xy]): 554 | pxs.append(xy) 555 | nxs.append(nx) 556 | else: 557 | pxs.append((self.pxs[xy], 1)) 558 | nxs.append((self.nxs[xy], 1)) 559 | 560 | # take step for batch 561 | if hasattr(self.model, '_prepare_batch_step'): 562 | self.model._prepare_batch_step(pxs, nxs) 563 | grads = self.model._pairwise_gradients(pxs, nxs) 564 | 565 | # update if examples violate margin 566 | if grads is not None: 567 | self.nviolations += self.model.nviolations 568 | self._batch_step(grads) 569 | 570 | 571 | def sigmoid(fs): 572 | # compute elementwise gradient for sigmoid 573 | for i in range(len(fs)): 574 | if fs[i] > _cutoff: 575 | fs[i] = 1.0 576 | elif fs[i] < -_cutoff: 577 | fs[i] = 0.0 578 | else: 579 | fs[i] = 1.0 / (1 + np.exp(-fs[i])) 580 | return fs[:, np.newaxis] 581 | 582 | if __name__ =="__main__": 583 | TheanoGradTest().call() 584 | 585 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/hole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sansa.ml.kbc.keras.base import Model 3 | from sansa.ml.kbc.keras.util import grad_sum_matrix, unzip_triples, ccorr, cconv 4 | from sansa.ml.kbc.keras.param import normless1 5 | from sansa.ml.kbc.keras import actfun as af 6 | 7 | 8 | class HolE(Model): 9 | 10 | def __init__(self, *args, **kwargs): 11 | super(HolE, self).__init__(*args, **kwargs) 12 | self.add_hyperparam('sz', args[0]) 13 | self.add_hyperparam('ncomp', args[1]) 14 | self.add_hyperparam('rparam', kwargs.pop('rparam', 0.0)) 15 | self.add_hyperparam('af', kwargs.pop('af', af.Sigmoid)) 16 | self.add_param('E', (self.sz[0], self.ncomp), post=normless1) 17 | self.add_param('R', (self.sz[2], self.ncomp*self.ncomp)) 18 | 19 | def _scores(self, ss, ps, os): 20 | return np.sum(self.R[ps] * ccorr(self.E[ss], self.E[os]), axis=1) 21 | 22 | def _gradients(self, xys): 23 | ss, ps, os, ys = unzip_triples(xys, with_ys=True) 24 | 25 | yscores = ys * self._scores(ss, ps, os) 26 | self.loss = np.sum(np.logaddexp(0, -yscores)) 27 | #preds = af.Sigmoid.f(yscores) 28 | fs = -(ys * af.Sigmoid.f(-yscores))[:, np.newaxis] 29 | #self.loss -= np.sum(np.log(preds)) 30 | 31 | ridx, Sm, n = grad_sum_matrix(ps) 32 | gr = Sm.dot(fs * ccorr(self.E[ss], self.E[os])) / n 33 | gr += self.rparam * self.R[ridx] 34 | 35 | eidx, Sm, n = grad_sum_matrix(list(ss) + list(os)) 36 | ge = Sm.dot(np.vstack(( 37 | fs * ccorr(self.R[ps], self.E[os]), 38 | fs * cconv(self.E[ss], self.R[ps]) 39 | ))) / n 40 | ge += self.rparam * self.E[eidx] 41 | 42 | return {'E': (ge, eidx), 'R':(gr, ridx)} 43 | 44 | def 
_pairwise_gradients(self, pxs, nxs): 45 | # indices of positive examples 46 | sp, pp, op = unzip_triples(pxs) 47 | # indices of negative examples 48 | sn, pn, on = unzip_triples(nxs) 49 | 50 | pscores = self.af.f(self._scores(sp, pp, op)) 51 | nscores = self.af.f(self._scores(sn, pn, on)) 52 | 53 | #print("avg = %f/%f, min = %f/%f, max = %f/%f" % (pscores.mean(), nscores.mean(), pscores.min(), nscores.min(), pscores.max(), nscores.max())) 54 | 55 | # find examples that violate margin 56 | ind = np.where(nscores + self.margin > pscores)[0] 57 | self.nviolations = len(ind) 58 | if len(ind) == 0: 59 | return 60 | 61 | # aux vars 62 | sp, sn = list(sp[ind]), list(sn[ind]) 63 | op, on = list(op[ind]), list(on[ind]) 64 | pp, pn = list(pp[ind]), list(pn[ind]) 65 | gpscores = -self.af.g_given_f(pscores[ind])[:, np.newaxis] 66 | gnscores = self.af.g_given_f(nscores[ind])[:, np.newaxis] 67 | 68 | # object role gradients 69 | ridx, Sm, n = grad_sum_matrix(pp + pn) 70 | grp = gpscores * ccorr(self.E[sp], self.E[op]) 71 | grn = gnscores * ccorr(self.E[sn], self.E[on]) 72 | #gr = (Sm.dot(np.vstack((grp, grn))) + self.rparam * self.R[ridx]) / n 73 | gr = Sm.dot(np.vstack((grp, grn))) / n 74 | gr += self.rparam * self.R[ridx] 75 | 76 | # filler gradients 77 | eidx, Sm, n = grad_sum_matrix(sp + sn + op + on) 78 | geip = gpscores * ccorr(self.R[pp], self.E[op]) 79 | gein = gnscores * ccorr(self.R[pn], self.E[on]) 80 | gejp = gpscores * cconv(self.E[sp], self.R[pp]) 81 | gejn = gnscores * cconv(self.E[sn], self.R[pn]) 82 | ge = Sm.dot(np.vstack((geip, gein, gejp, gejn))) / n 83 | #ge += self.rparam * self.E[eidx] 84 | 85 | return {'E': (ge, eidx), 'R':(gr, ridx)} 86 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/holek.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from random import shuffle 3 | from keras.models import Model 4 | from keras.layers import Input, Activation, Dense, Reshape 5 | from keras.models import Sequential 6 | import keras 7 | from keras.layers import Layer 8 | import math 9 | import numpy as np 10 | import keras.backend as K 11 | from keras.optimizers import Adagrad 12 | from theano import tensor as T 13 | from sansa.ml.kbc.keras import sample 14 | 15 | __author__ = 'nilesh' 16 | 17 | class KerasHole(object): 18 | def __init__(self, numEntities, numRelations, ndim, rparam): 19 | self.numEntities = numEntities 20 | self.numRelations = numRelations 21 | self.ndim = ndim 22 | self.rparam = rparam 23 | 24 | def buildModel(self): 25 | inputs = Input(shape=(2,3)) 26 | score = HolographicLayer2(self.numEntities, self.numRelations, self.ndim, self.rparam)(inputs) 27 | # score = Reshape((1,))(score) 28 | # score = Activation("sigmoid")(score) 29 | model = Model(input=inputs, output=score) 30 | adagrad = Adagrad(lr=0.001, epsilon=1e-06) 31 | 32 | def max_margin(y_true, y_pred): 33 | return T.sum(T.maximum(0., 1. + y_pred[1] + y_pred[0])) 34 | 35 | def loss(y_true, y_pred): 36 | # print(y_pred) 37 | return K.sum(K.log(1. 
+ K.exp(-y_true * y_pred))) 38 | 39 | model.compile(optimizer='rmsprop', loss='binary_crossentropy') 40 | # Or try setting model's output=prediction and loss='binary_crossentropy' - essentially same thing as above 41 | return model 42 | 43 | def fit2(self, xs, ys): 44 | sampler = sample.RandomModeSampler(1, [0, 1], xs, (self.numEntities, self.numEntities, self.numRelations)) 45 | xys = list(zip(xs, ys)) 46 | xyns = sampler.sample(xys) 47 | shuffle(xys) 48 | shuffle(xyns) 49 | xs, ys = [np.array(i) for i in list(zip(*xys))] 50 | xns, yns = [np.array(i) for i in list(zip(*xyns))] 51 | # print(xs[:100], ys[:100]) 52 | xpairs = [np.array(i) for i in list(zip(xs, xns))] 53 | ypairs = [np.array(i) for i in list(zip(ys, yns))] 54 | 55 | print (xpairs[0].shape) 56 | 57 | model = self.buildModel() 58 | # x = K.placeholder((3,)) 59 | # func = K.function([x], model(x)) 60 | # for x in xs: 61 | # print(func([x])) 62 | best_weights_filepath = './best_weights.hdf5' 63 | # earlyStopping= keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto') 64 | saveBestModel = keras.callbacks.ModelCheckpoint(best_weights_filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto') 65 | 66 | # train model 67 | history = model.fit(xpairs, ypairs, batch_size=len(xs)/1000, validation_split=0.1, nb_epoch=100, 68 | callbacks=[saveBestModel]) 69 | 70 | #reload best weights 71 | model.load_weights(best_weights_filepath) 72 | 73 | self.model = self 74 | self.E, self.R = model.layers[1].get_weights() 75 | 76 | def fit(self, xs, ys): 77 | sampler = sample.RandomModeSampler(1, [0, 1], xs, (self.numEntities, self.numEntities, self.numRelations)) 78 | xys = list(zip(xs, ys)) 79 | print(len(xys)) 80 | xys += sampler.sample(xys) 81 | print(len(xys)) 82 | shuffle(xys) 83 | xs, ys = [np.array(i) for i in list(zip(*xys))] 84 | # print(xs[:100], ys[:100]) 85 | 86 | model = self.buildModel() 87 | # x = K.placeholder((3,)) 88 | # func = K.function([x], model(x)) 89 | # for x in xs: 90 | # print(func([x])) 91 | best_weights_filepath = './best_weights.hdf5' 92 | # earlyStopping= keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto') 93 | saveBestModel = keras.callbacks.ModelCheckpoint(best_weights_filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto') 94 | 95 | # train model 96 | history = model.fit(xs, ys, batch_size=len(xs)/1000, validation_split=0.05, nb_epoch=100, 97 | callbacks=[saveBestModel]) 98 | 99 | #reload best weights 100 | model.load_weights(best_weights_filepath) 101 | 102 | self.model = self 103 | self.E, self.R = model.layers[1].get_weights() 104 | 105 | 106 | class HolographicLayer(Layer): 107 | def __init__(self, E, R, d, rparam, input_shape=(3,), **kwargs): 108 | from keras.initializations import glorot_normal 109 | self.init = [glorot_normal(shape=(E,d), name="E"), glorot_normal(shape=(R,d,d), name="R")] 110 | self.rparam = rparam 111 | kwargs["input_shape"] = input_shape 112 | super(HolographicLayer, self).__init__(**kwargs) 113 | 114 | 115 | def build(self, input_shape): 116 | self.E, self.R = self.init 117 | self.trainable_weights = [self.E, self.R] 118 | from keras.regularizers import l2 119 | # regularizer = l2(self.rparam) 120 | # regularizer.set_param(self.E) 121 | # self.regularizers.append(regularizer) 122 | # 123 | # regularizer = l2(self.rparam) 124 | # regularizer.set_param(self.R) 125 | # self.regularizers.append(regularizer) 126 | 127 | def call(self, x, mask=None): 128 | batch_placeholder = K.cast(x, 'int32')[0] 
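        # Each batch row is an integer id triple; the convention throughout this
        # code base is (subject, object, predicate), cf. util.unzip_triples and
        # sample.py. The ids index rows of the embedding matrices E and R below.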
129 | s, o, p = [batch_placeholder[i] for i in range(3)] 130 | 131 | s2v = K.gather(self.E, s) 132 | o2v = K.gather(self.E, o) 133 | r2v = K.gather(self.R, p) 134 | 135 | def ccorr(a, b): 136 | # Return tensor product - basically bilinear/RESCAL models 137 | return T.outer(a,b).flatten() 138 | 139 | # Or cross-correlation op? 140 | # return T.nnet.conv2d(a.dimshuffle('x', 'x', 0, 'x'), b.dimshuffle('x', 'x', 0, 'x'), None, 141 | # None, 142 | # filter_flip=True, border_mode='half').flatten()[:-1] 143 | # return self.ccorr1d_sc(a, b, border_mode='half') 144 | # eta = K.dot(r2v, ccorr(s2v, o2v)) 145 | eta = K.dot(K.dot(s2v, r2v), o2v) 146 | 147 | # func = K.function([s2v,o2v,r2v], K.gradients(K.sigmoid(eta), [s2v,o2v,r2v])) 148 | # print(func([np.random.random(150),np.random.random(150),np.random.random(150)])) 149 | 150 | return eta 151 | 152 | def get_output_shape_for(self, input_shape): 153 | return (input_shape[0], 1) 154 | 155 | class HolographicLayer2(Layer): 156 | def __init__(self, E, R, d, rparam, input_shape=(2,3), **kwargs): 157 | from keras.initializations import glorot_normal 158 | self.init = [glorot_normal(shape=(E,d), name="E"), glorot_normal(shape=(R,d*d), name="R")] 159 | self.rparam = rparam 160 | kwargs["input_shape"] = input_shape 161 | super(HolographicLayer2, self).__init__(**kwargs) 162 | 163 | 164 | def build(self, input_shape): 165 | self.E, self.R = self.init 166 | self.trainable_weights = [self.E, self.R] 167 | from keras.regularizers import l2 168 | regularizer = l2(self.rparam) 169 | regularizer.set_param(self.E) 170 | self.regularizers.append(regularizer) 171 | 172 | regularizer = l2(self.rparam) 173 | regularizer.set_param(self.R) 174 | self.regularizers.append(regularizer) 175 | 176 | def call(self, x, mask=None): 177 | pos = K.cast(x, 'int32')[0][0] 178 | neg = K.cast(x, 'int32')[0][1] 179 | 180 | def eta(s, o, p): 181 | s2v = K.gather(self.E, s) 182 | o2v = K.gather(self.E, o) 183 | r2v = K.gather(self.R, p) 184 | 185 | def ccorr(a, b): 186 | # Return tensor product - basically bilinear/RESCAL models 187 | return T.outer(a,b).flatten() 188 | 189 | # Or cross-correlation op? 
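            # (HolE proper composes subject and object by circular correlation,
            #  ccorr(a, b) = ifft(conj(fft(a)) * fft(b)).real, as in util.ccorr;
            #  the flattened outer product above, dotted with a d*d relation row,
            #  instead yields a RESCAL-style bilinear score.)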
190 | # return T.nnet.conv2d(a.dimshuffle('x', 'x', 0, 'x'), b.dimshuffle('x', 'x', 0, 'x'), None, 191 | # None, 192 | # filter_flip=True, border_mode='half').flatten()[:-1] 193 | # return self.ccorr1d_sc(a, b, border_mode='half') 194 | eta = K.dot(r2v, ccorr(s2v, o2v)) 195 | 196 | return eta 197 | 198 | 199 | pos_eta = eta(*[pos[i] for i in range(3)]) 200 | neg_eta = eta(*[neg[i] for i in range(3)]) 201 | return K.variable(np.array([pos_eta, neg_eta])) 202 | 203 | def get_output_shape_for(self, input_shape): 204 | return (input_shape[0], 2) 205 | 206 | if __name__ == "__main__": 207 | wnbin = "/Users/nilesh/python/holographic-embeddings/data/wn18.bin" 208 | with open(wnbin, 'rb') as fin: 209 | data = pickle.load(fin) 210 | 211 | N = len(data['entities']) 212 | M = len(data['relations']) 213 | 214 | xs = data['train_subs'] 215 | ys = np.ones(len(xs)) 216 | 217 | trainer = KerasHole(N, M, 10, 0.01) 218 | trainer.fit2(xs, ys) -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/param.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from numpy import sqrt, squeeze, zeros_like 4 | from numpy.random import randn, uniform 5 | 6 | 7 | def init_unif(sz): 8 | """ 9 | Uniform intialization 10 | 11 | Heuristic commonly used to initialize deep neural networks 12 | """ 13 | bnd = 1 / sqrt(sz[0]) 14 | p = uniform(low=-bnd, high=bnd, size=sz) 15 | return squeeze(p) 16 | 17 | 18 | def init_nunif(sz): 19 | """ 20 | Normalized uniform initialization 21 | 22 | See Glorot X., Bengio Y.: "Understanding the difficulty of training 23 | deep feedforward neural networks". AISTATS, 2010 24 | """ 25 | bnd = sqrt(6) / sqrt(sz[0] + sz[1]) 26 | p = uniform(low=-bnd, high=bnd, size=sz) 27 | return squeeze(p) 28 | 29 | 30 | def init_randn(sz): 31 | return squeeze(randn(*sz)) 32 | 33 | 34 | class Parameter(np.ndarray): 35 | 36 | def __new__(cls, *args, **kwargs): 37 | # TODO: hackish, find better way to handle higher-order parameters 38 | if len(args[0]) == 3: 39 | sz = (args[0][1], args[0][2]) 40 | arr = np.array([Parameter._init_array(sz, args[1]) for _ in range(args[0][0])]) 41 | else: 42 | arr = Parameter._init_array(args[0], args[1]) 43 | arr = arr.view(cls) 44 | arr.name = kwargs.pop('name', None) 45 | arr.post = kwargs.pop('post', None) 46 | 47 | if arr.post is not None: 48 | arr = arr.post(arr) 49 | 50 | return arr 51 | 52 | def __array_finalize__(self, obj): 53 | if obj is None: 54 | return 55 | self.name = getattr(obj, 'name', None) 56 | self.post = getattr(obj, 'post', None) 57 | 58 | @staticmethod 59 | def _init_array(shape, method): 60 | mod = sys.modules[__name__] 61 | method = 'init_%s' % method 62 | if not hasattr(mod, method): 63 | raise ValueError('Unknown initialization (%s)' % method) 64 | elif len(shape) != 2: 65 | raise ValueError('Shape must be of size 2') 66 | return getattr(mod, method)(shape) 67 | 68 | 69 | class ParameterUpdate(object): 70 | 71 | def __init__(self, param, learning_rate): 72 | self.param = param 73 | self.learning_rate = learning_rate 74 | 75 | def __call__(self, gradient, idx=None): 76 | self._update(gradient, idx) 77 | if self.param.post is not None: 78 | self.param = self.param.post(self.param, idx) 79 | 80 | def reset(self): 81 | pass 82 | 83 | 84 | class SGD(ParameterUpdate): 85 | """ 86 | Class to perform SGD updates on a parameter 87 | """ 88 | 89 | def _update(self, g, idx): 90 | self.param[idx] -= self.learning_rate * g 91 | 92 | 93 | class 
AdaGrad(ParameterUpdate): 94 | 95 | def __init__(self, param, learning_rate): 96 | super(AdaGrad, self).__init__(param, learning_rate) 97 | self.p2 = zeros_like(param) 98 | 99 | def _update(self, g, idx=None): 100 | self.p2[idx] += g * g 101 | H = np.maximum(np.sqrt(self.p2[idx]), 1e-7) 102 | self.param[idx] -= self.learning_rate * g / H 103 | 104 | def reset(self): 105 | self.p2 = zeros_like(self.p2) 106 | 107 | 108 | def normalize(M, idx=None): 109 | if idx is None: 110 | M = M / np.sqrt(np.sum(M ** 2, axis=1))[:, np.newaxis] 111 | else: 112 | nrm = np.sqrt(np.sum(M[idx, :] ** 2, axis=1))[:, np.newaxis] 113 | M[idx, :] = M[idx, :] / nrm 114 | return M 115 | 116 | 117 | def normless1(M, idx=None): 118 | nrm = np.sum(M[idx] ** 2, axis=1)[:, np.newaxis] 119 | nrm[nrm < 1] = 1 120 | M[idx] = M[idx] / nrm 121 | return M 122 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/sample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sampling strategies to generate negative examples from knowledge graphs 3 | with an open-world assumption 4 | """ 5 | 6 | from copy import deepcopy 7 | from collections import defaultdict as ddict 8 | from numpy.random import randint 9 | 10 | 11 | class Sampler(object): 12 | 13 | def __init__(self, n, modes, ntries=100): 14 | self.n = n 15 | self.modes = modes 16 | self.ntries = ntries 17 | 18 | def sample(self, xys): 19 | res = [] 20 | for x, _ in xys: 21 | for _ in range(self.n): 22 | for mode in self.modes: 23 | t = self._sample(x, mode) 24 | if t is not None: 25 | res.append(t) 26 | return res 27 | 28 | 29 | class RandomModeSampler(Sampler): 30 | """ 31 | Sample negative triples randomly 32 | """ 33 | 34 | def __init__(self, n, modes, xs, sz): 35 | super(RandomModeSampler, self).__init__(n, modes) 36 | self.xs = set(xs) 37 | self.sz = sz 38 | 39 | def _sample(self, x, mode): 40 | nex = list(x) 41 | res = None 42 | for _ in range(self.ntries): 43 | nex[mode] = randint(self.sz[mode]) 44 | if tuple(nex) not in self.xs: 45 | res = (tuple(nex), -1.0) 46 | break 47 | return res 48 | 49 | 50 | class RandomSampler(Sampler): 51 | 52 | def __init__(self, n, xs, sz): 53 | super(RandomSampler, self).__init__(n) 54 | self.xs = set(xs) 55 | self.sz = sz 56 | 57 | def _sample(self, x, mode): 58 | res = None 59 | for _ in range(self.ntries): 60 | nex = (randint(self.sz[0]), 61 | randint(self.sz[0]), 62 | randint(self.sz[1])) 63 | if nex not in self.xs: 64 | res = (nex, -1.0) 65 | break 66 | return res 67 | 68 | 69 | class CorruptedSampler(Sampler): 70 | 71 | def __init__(self, n, xs, type_index): 72 | super(CorruptedSampler, self).__init__(n) 73 | self.xs = set(xs) 74 | self.type_index = type_index 75 | 76 | def _sample(self, x, mode): 77 | nex = list(deepcopy(x)) 78 | res = None 79 | for _ in range(self.ntries): 80 | if mode == 2: 81 | nex[2] = randint(len(self.type_index)) 82 | else: 83 | k = x[2] 84 | n = len(self.type_index[k][mode]) 85 | nex[mode] = self.type_index[k][mode][randint(n)] 86 | if tuple(nex) not in self.xs: 87 | res = (tuple(nex), -1.0) 88 | break 89 | return res 90 | 91 | 92 | class LCWASampler(RandomModeSampler): 93 | """ 94 | Sample negative examples according to the local closed world assumption 95 | """ 96 | 97 | def __init__(self, n, modes, xs, sz): 98 | super(LCWASampler, self).__init__(n, modes, xs, sz) 99 | self.counts = ddict(int) 100 | for s, o, p in xs: 101 | self.counts[(s, p)] += 1 102 | 103 | def _sample(self, x, mode): 104 | nex = 
list(deepcopy(x)) 105 | res = None 106 | for _ in range(self.ntries): 107 | nex[mode] = randint(self.sz[mode]) 108 | if self.counts[(nex[0], nex[2])] > 0 and tuple(nex) not in self.xs: 109 | res = (tuple(nex), -1.0) 110 | break 111 | return res 112 | 113 | 114 | def type_index(xs): 115 | index = ddict(lambda: {0: set(), 1: set()}) 116 | for i, j, k in xs: 117 | index[k][0].add(i) 118 | index[k][1].add(j) 119 | #for p, idx in index.items(): 120 | # print(p, len(idx[0]), len(idx[1])) 121 | return {k: {0: list(v[0]), 1: list(v[1])} for k, v in index.items()} 122 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/keras/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.fft import fft, ifft 3 | import scipy.sparse as sp 4 | import functools 5 | import collections 6 | 7 | 8 | def cconv(a, b): 9 | """ 10 | Circular convolution of vectors 11 | 12 | Computes the circular convolution of two vectors a and b via their 13 | fast fourier transforms 14 | 15 | a \ast b = \mathcal{F}^{-1}(\mathcal{F}(a) \odot \mathcal{F}(b)) 16 | 17 | Parameter 18 | --------- 19 | a: real valued array (shape N) 20 | b: real valued array (shape N) 21 | 22 | Returns 23 | ------- 24 | c: real valued array (shape N), representing the circular 25 | convolution of a and b 26 | """ 27 | return ifft(fft(a) * fft(b)).real 28 | 29 | 30 | def ccorr(a, b): 31 | """ 32 | Circular correlation of vectors 33 | 34 | Computes the circular correlation of two vectors a and b via their 35 | fast fourier transforms 36 | 37 | a \ast b = \mathcal{F}^{-1}(\overline{\mathcal{F}(a)} \odot \mathcal{F}(b)) 38 | 39 | Parameter 40 | --------- 41 | a: real valued array (shape N) 42 | b: real valued array (shape N) 43 | 44 | Returns 45 | ------- 46 | c: real valued array (shape N), representing the circular 47 | correlation of a and b 48 | """ 49 | 50 | return ifft(np.conj(fft(a)) * fft(b)).real 51 | 52 | 53 | def grad_sum_matrix(idx): 54 | uidx, iinv = np.unique(idx, return_inverse=True) 55 | sz = len(iinv) 56 | M = sp.coo_matrix((np.ones(sz), (iinv, np.arange(sz)))).tocsr() 57 | # normalize summation matrix so that each row sums to one 58 | n = np.array(M.sum(axis=1)) 59 | #M = M.T.dot(np.diag(n)) 60 | return uidx, M, n 61 | 62 | 63 | def unzip_triples(xys, with_ys=False): 64 | xs, ys = list(zip(*xys)) 65 | ss, os, ps = list(zip(*xs)) 66 | if with_ys: 67 | return np.array(ss), np.array(ps), np.array(os), np.array(ys) 68 | else: 69 | return np.array(ss), np.array(ps), np.array(os) 70 | 71 | 72 | def to_tensor(xs, ys, sz): 73 | T = [sp.lil_matrix((sz[0], sz[1])) for _ in range(sz[2])] 74 | for i in range(len(xs)): 75 | i, j, k = xs[i] 76 | T[k][i, j] = ys[i] 77 | return T 78 | 79 | 80 | def init_nvecs(xs, ys, sz, rank, with_T=False): 81 | from scipy.sparse.linalg import eigsh 82 | 83 | T = to_tensor(xs, ys, sz) 84 | T = [Tk.tocsr() for Tk in T] 85 | S = sum([T[k] + T[k].T for k in range(len(T))]) 86 | _, E = eigsh(sp.csr_matrix(S), rank) 87 | if not with_T: 88 | return E 89 | else: 90 | return E, T 91 | 92 | 93 | class memoized(object): 94 | ''' 95 | Decorator. Caches a function's return value each time it is called. 96 | If called later with the same arguments, the cached value is returned 97 | (not reevaluated). 
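    Illustrative usage (not part of this module):

        @memoized
        def fibonacci(n):
            return n if n < 2 else fibonacci(n - 1) + fibonacci(n - 2)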
98 | 99 | see https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize 100 | ''' 101 | 102 | def __init__(self, func): 103 | self.func = func 104 | self.cache = {} 105 | 106 | def __call__(self, *args): 107 | if not isinstance(args, collections.Hashable): 108 | # uncachable, return direct function application 109 | return self.func(*args) 110 | if args in self.cache: 111 | return self.cache[args] 112 | else: 113 | val = self.func(*args) 114 | self.cache[args] = val 115 | return val 116 | 117 | def __repr__(self): 118 | '''return function's docstring''' 119 | return self.func.__doc__ 120 | 121 | def __get__(self, obj, objtype): 122 | '''support instance methods''' 123 | return functools.partial(self.__call__, obj) 124 | -------------------------------------------------------------------------------- /python/sansa/ml/kbc/rdfio.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import rdflib 4 | from rdflib.term import URIRef 5 | import math 6 | from pyspark import SQLContext, SparkContext, RDD 7 | from pyspark.rdd import Partitioner 8 | 9 | __author__ = 'nilesh' 10 | 11 | 12 | class ThreeWayTensorPartitioner(Partitioner): 13 | def __init__(self, dimensions: tuple, blockSizes: tuple): 14 | self.dims = dimensions 15 | self.partitionSizes = blockSizes 16 | self.numPartitions = [int(math.ceil(self.dims[i] * 1.0 / self.partitionSizes[i])) for i in range(3)] 17 | self.totalPartitions = reduce(lambda x, y: x*y, self.numPartitions) 18 | 19 | def __eq__(self, other): 20 | return (isinstance(other, ThreeWayTensorPartitioner) 21 | and self.dims == other.dims 22 | and self.partitionSizes == other.partitionSizes) 23 | 24 | def __call__(self, k): 25 | return self.partitionFunc(k) 26 | 27 | def partitionFunc(self, key): 28 | for i in range(len(self.dims)): 29 | assert(0 <= key[i] <= self.dims[i]) 30 | 31 | i, j, k = key 32 | ps1, ps2, ps3 = self.partitionSizes 33 | pn1, pn2, pn3 = self.numPartitions 34 | 35 | return i / ps1 + j / ps2 * pn1 + k / ps3 * pn2 * pn1 36 | 37 | 38 | class RDFReader(object): 39 | def __init__(self, sc: SparkContext): 40 | self.sc = SQLContext(sc) 41 | 42 | def tripleRDD(self, file) -> RDD: 43 | def parseNTriples(lines): 44 | g = rdflib.Graph() 45 | g.parse(data="\n".join(lines), format="nt") 46 | allURIs = lambda statement: False not in [isinstance(term, URIRef) for term in statement] 47 | return [statement for statement in g if not allURIs(statement)] 48 | 49 | triples = self.sc.read.text(file).map(lambda x: x.value).mapPartitions(parseNTriples) 50 | return triples 51 | 52 | def tripleTensor(self, file, blockSizes: tuple): 53 | spo = self.tripleRDD(file) 54 | # Already filtered by URIs, no need to check types a la pattern matching 55 | entityIDs = spo.flatMap(lambda x: [x[0], x[2]]).distinct().zipWithUniqueId() # (eURI, eID) 56 | numEntities = entityIDs.countByKey() 57 | relationIDs = spo.map(lambda x: x[1]).distinct().zipWithUniqueId() # (rURI, rID) 58 | numRelations = relationIDs.countByKey() 59 | 60 | s_po = spo.map(lambda x: (x[0], (x[1], x[2]))) 61 | 62 | def mapSubjectIDs(s__po_sid): 63 | (s, ((p, o), sid)) = s__po_sid 64 | return o, (sid, p) 65 | 66 | o__sid_p = s_po.join(entityIDs).map(mapSubjectIDs) 67 | p__oid_sid = o__sid_p.join(entityIDs).map(mapSubjectIDs) 68 | sid__pid_oid = p__oid_sid.join(relationIDs).map(mapSubjectIDs) 69 | 70 | spoMapped = sid__pid_oid.map(lambda x: (x[0], x[1][0], x[1][1])) 71 | assert isinstance(spoMapped, RDD) 72 | 73 | d1, d2, d3 = blockSizes 74 | 75 | def blockify(s, o, p): 76 | 
blockD1Index = int(s / d1) 77 | 78 | spoMapped.groupByKey().mapPartitions() 79 | 80 | 81 | return sid__pid_oid 82 | 83 | 84 | 85 | 86 | os.environ['SPARK_HOME'] = "/Users/nilesh/IdeaProjects/spark-1.6.2-bin-hadoop2.6" 87 | os.environ['PYSPARK_PYTHON'] = "python3" 88 | os.environ['PYSPARK_DRIVER_PYTHON'] = "python3" 89 | reader = RDFReader(SparkContext(master="local[4]", appName="test", sparkHome="/Users/nilesh/IdeaProjects/spark-1.6.2-bin-hadoop2.6")) 90 | print(reader.tripleTensor("/Users/nilesh/IdeaProjects/elinker3/small-dataset.nt", 1).collect()) -------------------------------------------------------------------------------- /src/main/scala/net/sansa_stack/ml/kge/Functions.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.ml.kge 2 | 3 | import ml.dmlc.mxnet._ 4 | import ml.dmlc.mxnet.{Symbol => s} 5 | 6 | /** 7 | * Created by nilesh on 31/05/2017. 8 | */ 9 | object MaxMarginLoss { 10 | def apply(margin: Float): (Symbol, Symbol) => Symbol = { 11 | loss(margin) _ 12 | } 13 | 14 | def loss(margin: Float)(positiveScore: Symbol, negativeScore: Symbol): Symbol = { 15 | var loss = s.max(negativeScore - positiveScore + margin, 0) 16 | loss = s.sum(name = "sum")()(Map("data" -> loss)) 17 | s.make_loss(name = "loss")()(Map("data" -> loss)) 18 | } 19 | } 20 | 21 | object Sigmoid { 22 | def apply(x: Symbol): Symbol = { 23 | s.Activation(name = "sigmoid")()(Map("data" -> x, "act_type" -> "sigmoid")) 24 | } 25 | } 26 | 27 | object Tanh { 28 | def apply(x: Symbol): Symbol = { 29 | s.Activation(name = "tanh")()(Map("data" -> x, "act_type" -> "tanh")) 30 | } 31 | } 32 | 33 | object L2Similarity { 34 | def apply(x: Symbol, y: Symbol): Symbol = { 35 | val difference = x - y 36 | var score = s.square()()(Map("data" -> difference)) 37 | score = s.sum()()(Map("data" -> score, "axis" -> 0)) 38 | score*(-1.0) 39 | } 40 | } 41 | 42 | object DotSimilarity { 43 | def apply(x: Symbol, y: Symbol): Symbol = { 44 | s.dot("dot")()(Map("lhs" -> x, "rhs" -> y)) 45 | } 46 | } 47 | 48 | object Hits { 49 | def hitsAt1(label: NDArray, predicted: NDArray): Float = { 50 | val labelA = label.toArray 51 | val predA = predicted.toArray 52 | labelA.zip(predA).map(x => if(x._1.toInt == x._2.toInt && x._1.toInt == 1) 1 else 0).sum 53 | } 54 | } -------------------------------------------------------------------------------- /src/main/scala/net/sansa_stack/ml/kge/Main.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.ml.kge 2 | 3 | import ml.dmlc.mxnet.spark.MXNet 4 | 5 | 6 | /** 7 | * Created by nilesh on 19/05/2017. 8 | */ 9 | object Main extends App { 10 | // import ml.dmlc.mxnet.Symbol 11 | import ml.dmlc.mxnet._ 12 | val x = Symbol.Variable("x") 13 | val y = Symbol.Variable("y") 14 | val diff = Symbol.pow(x, 2) + Symbol.pow(y, 3) 15 | val a = NDArray.ones(1) * 10 16 | val b = NDArray.ones(1) * 2 17 | val ga = NDArray.empty(1) 18 | val ga2 = NDArray.empty(1) 19 | val executor = diff.bind(Context.cpu(), args=Map("x" -> a, "y" -> b), argsGrad=Map("x" -> ga, "y" -> ga2)) 20 | executor.forward() 21 | println(executor.outputs(0).toArray.mkString(",")) 22 | // executor. 
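  // With x = 10 and y = 2 the forward pass above evaluates x^2 + y^3 = 108;
  // the backward pass below should then give the gradients
  // d/dx = 2x = 20 and d/dy = 3y^2 = 12.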
23 | 24 | // test gradient 25 | val outGrad = NDArray.ones(1) 26 | executor.backward(Array(outGrad)) 27 | println(executor.gradDict.toArray.apply(1).x._2.toArray.mkString(",")) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/net/sansa_stack/ml/kge/RDFDatasetReader.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.ml.kge 2 | 3 | import net.sansa_stack.rdf.spark.model.{JenaSparkRDD, JenaSparkRDDOps} 4 | import net.sansa_stack.rdf.spark.model.TripleRDD._ 5 | import net.sansa_stack.rdf.spark.model.JenaSparkRDD 6 | import org.apache.jena.graph.Node_URI 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd.RDD 9 | import ml.dmlc.mxnet._ 10 | 11 | /** 12 | * Created by nilesh on 31/05/2017. 13 | */ 14 | class RDFDatasetReader(sc: SparkContext, path: String) { 15 | type Node = JenaSparkRDD#Node 16 | 17 | private val ops = JenaSparkRDDOps(sc) 18 | import ops._ 19 | 20 | private val triplesWithURIs = { 21 | val graph = ops.loadGraphFromNTriples(path, "") 22 | graph.filter{ 23 | case Triple(s, p, o) => 24 | s.isURI && p.isURI && o.isURI 25 | } 26 | } 27 | 28 | val relationIDs = triplesWithURIs.getPredicates.zipWithUniqueId() 29 | 30 | val entityIDs = (triplesWithURIs.getSubjects 31 | ++ triplesWithURIs.getObjects) 32 | .distinct 33 | .zipWithUniqueId() 34 | 35 | def getNumEntities = entityIDs.count() 36 | 37 | def getNumRelations = relationIDs.count() 38 | 39 | def getMappedTriples(): Unit = { 40 | val joinedBySubject = entityIDs.join(triplesWithURIs.map{ 41 | case Triple(s, p, o) => 42 | (s, (p, o)) 43 | }) 44 | 45 | val subjectMapped: RDD[(Long, Node_URI, Node)] = joinedBySubject.map{ 46 | case (_, _ @ (subjectID: Long, _ @ (predicate: Node_URI, obj: Node))) => 47 | (subjectID, predicate, obj) 48 | } 49 | 50 | val joinedByObject = entityIDs.join(subjectMapped.map{ 51 | case (s, p, o) => 52 | (o, (s, p)) 53 | }) 54 | 55 | val subjectObjectMapped = joinedByObject.map{ 56 | case (_, _ @ (objectID: Long, _ @ (subjectID: Long, predicate: Node_URI))) => 57 | (subjectID, predicate, objectID) 58 | } 59 | 60 | val joinedByPredicate = relationIDs.join(subjectObjectMapped.map{ 61 | case (s, p, o) => 62 | (p, (s, o)) 63 | }) 64 | 65 | val allMapped = joinedByPredicate.map{ 66 | case (_: Node, _ @ (predicateID: Long, _ @ (subjectID: Long, objectID: Long))) => 67 | (subjectID, predicateID, objectID) 68 | } 69 | 70 | allMapped 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/net/sansa_stack/ml/kge/model/TransE.scala: -------------------------------------------------------------------------------- 1 | package net.sansa_stack.ml.kge.model 2 | 3 | import ml.dmlc.mxnet._ 4 | import ml.dmlc.mxnet.{Symbol => s} 5 | import scala.io.Source 6 | import scala.util.Random 7 | import ml.dmlc.mxnet.optimizer.Adam 8 | import net.sansa_stack.ml.kge.{MaxMarginLoss, L2Similarity, Main, Hits} 9 | 10 | /** 11 | * Created by nilesh on 01/06/2017. 
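 *
 * TransE embeds entities and relations in one d-dimensional space and scores a
 * triple (h, r, t) by how well h + r approximates t; here the score is the
 * negated squared L2 distance (see L2Similarity) and training minimises a
 * max-margin loss between observed triples and randomly corrupted ones.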
12 | */ 13 | class TransE(numEntities: Int, numRelations: Int, latentFactors: Int, batchSize: Int) { 14 | def getNet(): (Symbol, Seq[String]) = { 15 | // embedding weight vectors 16 | val entityWeight = s.Variable("entityWeight") 17 | val relationWeight = s.Variable("relationWeight") 18 | 19 | def entityEmbedding(data: Symbol) = 20 | s.Embedding()()(Map("data" -> data, "weight" -> entityWeight, "input_dim" -> numEntities, "output_dim" -> latentFactors)) 21 | 22 | def relationEmbedding(data: Symbol) = 23 | s.Embedding()()(Map("data" -> data, "weight" -> relationWeight, "input_dim" -> numRelations, "output_dim" -> latentFactors)) 24 | 25 | // inputs 26 | var head = s.Variable("subjectEntity") 27 | var relation = s.Variable("predicateRelation") 28 | var tail = s.Variable("objectEntity") 29 | var corruptHead = s.Variable("corruptSubjectEntity") 30 | var corruptTail = s.Variable("corruptObjectEntity") 31 | 32 | head = entityEmbedding(head) 33 | relation = relationEmbedding(relation) 34 | tail = entityEmbedding(tail) 35 | corruptHead = entityEmbedding(corruptHead) 36 | corruptTail = entityEmbedding(corruptTail) 37 | 38 | def getScore(head: Symbol, relation: Symbol, tail: Symbol) = L2Similarity(head + relation, tail) 39 | 40 | val posScore = getScore(head, relation, tail) 41 | val negScore = getScore(corruptHead, relation, corruptTail) 42 | val loss = MaxMarginLoss(1.0f)(posScore, negScore) 43 | 44 | (loss, Seq("subjectEntity", "predicateRelation", "objectEntity", "corruptSubjectEntity", "corruptObjectEntity")) 45 | } 46 | 47 | def train() = { 48 | val ctx = Context.cpu() 49 | // val numEntities = 40943 50 | val (transeModel, paramNames) = getNet() 51 | 52 | import ml.dmlc.mxnet.Xavier 53 | 54 | val initializer = new Xavier(factorType = "in", magnitude = 2.34f) 55 | 56 | val (argShapes, outputShapes, auxShapes) = transeModel.inferShape( 57 | (for (paramName <- paramNames) yield paramName -> Shape(batchSize, 1)) 58 | toMap) 59 | 60 | val argNames = transeModel.listArguments() 61 | val argDict = argNames.zip(argShapes.map(NDArray.empty(_, ctx))).toMap 62 | val gradDict = argNames.zip(argShapes).filter { 63 | case (name, shape) => 64 | !paramNames.contains(name) 65 | }.map(x => x._1 -> NDArray.empty(x._2, ctx)).toMap 66 | argDict.foreach { 67 | case (name, ndArray) => 68 | if (!paramNames.contains(name)) { 69 | initializer.initWeight(name, ndArray) 70 | } 71 | } 72 | 73 | def readDataBatched(stage: String) = { 74 | val triplesFile = s"/home/nilesh/utils/Spark-Tensors/data/$stage.txt" 75 | val entityIDFile = "/home/nilesh/utils/Spark-Tensors/data/entity2id.txt" 76 | val relationIDFile = "/home/nilesh/utils/Spark-Tensors/data/relation2id.txt" 77 | 78 | 79 | def getIDMap(path: String) = Source.fromFile(path) 80 | .getLines() 81 | .map(_.split("\t")) 82 | .map(x => x(0) -> x(1).toFloat).toMap 83 | 84 | val entityID = getIDMap(entityIDFile) 85 | val relationID = getIDMap(relationIDFile) 86 | 87 | val triples = Random.shuffle(Source.fromFile(triplesFile).getLines().map(_.split("\t")).toSeq) 88 | 89 | (triples.map(x => entityID(x(0))).toArray.grouped(batchSize).toSeq, 90 | triples.map(x => relationID(x(2))).toArray.grouped(batchSize).toSeq, 91 | triples.map(x => entityID(x(1))).toArray.grouped(batchSize).toSeq, 92 | triples.map(x => Random.nextInt(numEntities).toFloat).toArray.grouped(batchSize).toSeq, 93 | triples.map(x => Random.nextInt(numEntities).toFloat).toArray.grouped(batchSize).toSeq) 94 | } 95 | 96 | val executor = transeModel.bind(ctx, argDict, gradDict) 97 | 98 | val opt = new Adam(learningRate 
= 0.001f, wd = 0.0001f) 99 | val paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) => 100 | (idx, name, grad, opt.createState(idx, argDict(name))) 101 | } 102 | 103 | val head = argDict("subjectEntity") 104 | val relation = argDict("predicateRelation") 105 | val tail = argDict("objectEntity") 106 | val corruptHead = argDict("corruptSubjectEntity") 107 | val corruptTail = argDict("corruptObjectEntity") 108 | 109 | val (trainSubjects, trainRelations, trainObjects, trainCorruptSubjects, trainCorruptObjects) = readDataBatched("train") 110 | val (testSubjects, testRelations, testObjects, _, _) = readDataBatched("test") 111 | 112 | var iter = 0 113 | var minTestHits = 100f 114 | for (epoch <- 0 until 100000) { 115 | head.set(trainSubjects(iter)) 116 | relation.set(trainRelations(iter)) 117 | tail.set(trainObjects(iter)) 118 | corruptHead.set(trainCorruptSubjects(iter)) 119 | corruptTail.set(trainCorruptObjects(iter)) 120 | iter += 1 121 | 122 | if (iter >= trainSubjects.length) iter = 0 123 | 124 | executor.forward(isTrain = true) 125 | executor.backward() 126 | 127 | paramsGrads.foreach { 128 | case (idx, name, grad, optimState) => 129 | opt.update(idx, argDict(name), grad, optimState) 130 | } 131 | 132 | // println(s"iter $epoch, training Hits@1: ${Math.sqrt(Hits.hitsAt1(NDArray.ones(batchSize), executor.outputs(0)) / batchSize)}, min test Hits@1: $minTestHits") 133 | 134 | println(s"iter $epoch, training loss: ${executor.outputs(0).toArray.sum}") 135 | if (epoch != 0 && epoch % 50 == 0) { 136 | val tmp = for (i <- 0 until testSubjects.length) yield { 137 | head.set(testSubjects(iter)) 138 | relation.set(testRelations(iter)) 139 | tail.set(testObjects(iter)) 140 | 141 | executor.forward(isTrain = false) 142 | Hits.hitsAt1(NDArray.ones(batchSize), executor.outputs(0)) 143 | } 144 | val testHits = Math.sqrt(tmp.toArray.sum / (testSubjects.length * batchSize)) 145 | if (testHits < minTestHits) minTestHits = testHits.toFloat 146 | } 147 | } 148 | 149 | } 150 | } 151 | --------------------------------------------------------------------------------
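
As a closing illustration of how the Python pieces above fit together, here is a minimal, self-contained sketch of HolE scoring for a single triple. It is illustrative only: the helper name hole_score and the toy sizes are not part of the repository, but ccorr mirrors python/sansa/ml/kbc/keras/util.py and the score follows the expression in HolE._scores (np.sum(R[p] * ccorr(E[s], E[o]))), with relation vectors of the same dimension as the entity vectors as in the original HolE formulation.

    import numpy as np
    from numpy.fft import fft, ifft

    def ccorr(a, b):
        # circular correlation via FFT, as in util.ccorr
        return ifft(np.conj(fft(a)) * fft(b)).real

    rng = np.random.RandomState(0)
    n_entities, n_relations, dim = 5, 2, 8
    E = rng.randn(n_entities, dim)   # entity embeddings
    R = rng.randn(n_relations, dim)  # relation embeddings

    def hole_score(s, p, o):
        # eta = r_p . ccorr(e_s, e_o); sigmoid(eta) is the predicted truth value
        eta = np.dot(R[p], ccorr(E[s], E[o]))
        return 1.0 / (1.0 + np.exp(-eta))

    print(hole_score(0, 1, 3))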