├── .gitignore
├── .hgignore
├── LOGS.NOBACKUP
│   └── .keep
├── README
├── batch
├── diagnostics.py
├── hyperparameters.py
├── hyperparameters.random-indexing.yaml
├── induce.py
└── vocabulary.py

/.gitignore:
--------------------------------------------------------------------------------
.hgignore
--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
syntax: glob
*.pyc
*.sw?
LOGS.NOBACKUP/run*
--------------------------------------------------------------------------------
/LOGS.NOBACKUP/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turian/random-indexing-wordrepresentations/4cfb683aa68df4ba3275a1586564b62ee72200bc/LOGS.NOBACKUP/.keep
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
random-indexing-wordrepresentations
-----------------------------------

by Joseph Turian

Induce word representations using random indexing (RI).

For information about random indexing, see:
http://www.sics.se/~mange/random_indexing.html

You can control the hyperparameters by editing
hyperparameters.random-indexing.yaml, or by overriding them with
command-line options (see the batch file for an example invocation).

Another implementation (in Java) is semanticvectors:
http://code.google.com/p/semanticvectors/

See also:
http://github.com/turian/pyrandomprojection
a generic library for transforming a Python dictionary into a
low-dimensional numpy array.

This code is based upon my neural language model code
(http://github.com/turian/neural-language-model), so it shares similar
idioms for the data it expects. For example, instead of building a
vocabulary on-the-fly, we assume that the vocabulary has been pre-extracted
and will be read in. We also assume that the training input contains one
sentence per line.


REQUIREMENTS:
* My python common code:
  http://github.com/turian/common
* The textSNE package, for the t-SNE visualization step in diagnostics.py.
--------------------------------------------------------------------------------
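The README above points to an external page for background on random
indexing. As a rough, self-contained sketch of the idea (this is not the
code path used by induce.py below, which keeps one random matrix per
context type; the toy corpus, dimensions, and sparsity here are made up
for illustration): every word is assigned a fixed sparse random "index
vector", and a word's representation is the sum of the index vectors of
the words that co-occur with it.

    import numpy, random

    def ternary_index_vector(dim=25, nonzeros=2):
        # Sparse random vector: `nonzeros` entries set to -1 or +1, the rest 0.
        v = numpy.zeros(dim)
        for k in random.sample(range(dim), nonzeros):
            v[k] = random.choice([-1, +1])
        return v

    sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]   # toy corpus
    vocab = sorted(set(w for s in sentences for w in s))
    index_vectors = dict((w, ternary_index_vector()) for w in vocab)
    representations = dict((w, numpy.zeros(25)) for w in vocab)

    for sentence in sentences:
        for i, w in enumerate(sentence):
            for offset in (-1, +1):          # one word of context on each side
                if 0 <= i + offset < len(sentence):
                    representations[w] += index_vectors[sentence[i + offset]]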
3 | """ 4 | 5 | from common.stats import stats 6 | from common.str import percent 7 | 8 | import logging 9 | import sys 10 | import numpy 11 | import random 12 | 13 | def diagnostics(cnt, embeddings): 14 | logging.info(stats()) 15 | vocab_size = embeddings.shape[0] 16 | idxs = range(vocab_size) 17 | random.shuffle(idxs) 18 | idxs = idxs[:100] 19 | 20 | embeddings_debug(embeddings[idxs], cnt, "rand 100 words") 21 | embeddings_debug(embeddings[:100], cnt, "top 100 words") 22 | embeddings_debug(embeddings[vocab_size/2-50:vocab_size/2+50], cnt, "mid 100 words") 23 | embeddings_debug(embeddings[-100:], cnt, "last 100 words") 24 | logging.info(stats()) 25 | 26 | def visualizedebug(cnt, embeddings, rundir, newkeystr, WORDCNT=500): 27 | vocab_size = embeddings.shape[0] 28 | idxs = range(vocab_size) 29 | random.shuffle(idxs) 30 | idxs = idxs[:WORDCNT] 31 | 32 | visualize(cnt, embeddings, rundir, idxs, "randomized%s" % newkeystr) 33 | visualize(cnt, embeddings, rundir, range(WORDCNT), "mostcommon%s" % newkeystr) 34 | visualize(cnt, embeddings, rundir, range(-1, -WORDCNT, -1), "leastcommon%s" % newkeystr) 35 | visualize(cnt, embeddings, rundir, range(vocab_size/2-WORDCNT/2,vocab_size/2+WORDCNT/2), "midcommon%s" % newkeystr) 36 | 37 | def visualize(cnt, embeddings, rundir, idxs, str): 38 | """ 39 | Visualize a set of examples using t-SNE. 40 | """ 41 | from vocabulary import wordmap 42 | PERPLEXITY=30 43 | 44 | x = embeddings[idxs] 45 | print x.shape 46 | titles = [wordmap.str(id) for id in idxs] 47 | 48 | import os.path 49 | filename = os.path.join(rundir, "embeddings-%s-%d.png" % (str, cnt)) 50 | try: 51 | from textSNE.calc_tsne import tsne 52 | # from textSNE.tsne import tsne 53 | out = tsne(x, perplexity=PERPLEXITY) 54 | from textSNE.render import render 55 | render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename) 56 | except IOError: 57 | logging.info("ERROR visualizing", filename, ". Continuing...") 58 | 59 | def embeddings_debug(w, cnt, str): 60 | """ 61 | Output the l2norm mean and max of the embeddings, including in debug out the str and training cnt 62 | """ 63 | totalcnt = numpy.sum(numpy.abs(w) >= 0) 64 | notsmallcnt = numpy.sum(numpy.abs(w) >= 0.1) 65 | logging.info("%d %s dimensions of %s have absolute value >= 0.1" % (cnt, percent(notsmallcnt, totalcnt), str)) 66 | notsmallcnt = numpy.sum(numpy.abs(w) >= 0.01) 67 | logging.info("%d %s dimensions of %s have absolute value >= 0.01" % (cnt, percent(notsmallcnt, totalcnt), str)) 68 | 69 | l2norm = numpy.sqrt(numpy.square(w).sum(axis=1)) 70 | median = numpy.median(l2norm) 71 | mean = numpy.mean(l2norm) 72 | std = numpy.std(l2norm) 73 | # print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),)) 74 | l2norm = l2norm.tolist() 75 | l2norm.sort() 76 | l2norm.reverse() 77 | logging.info("%d l2norm of %s: median = %f mean = %f stddev=%f top3=%s" % (cnt, str, median, mean, std, `l2norm[:3]`)) 78 | # print("top 5 = %s" % `l2norm[:5]`) 79 | -------------------------------------------------------------------------------- /hyperparameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module to update hyperparameters automatically. 
3 | """ 4 | 5 | from os.path import join 6 | import common.hyperparameters 7 | HYPERPARAMETERS = common.hyperparameters.read("random-indexing") 8 | DATA_DIR = HYPERPARAMETERS["locations"]["DATA_DIR"] 9 | RUN_NAME = HYPERPARAMETERS["RUN_NAME"] 10 | VOCABULARY_SIZE = HYPERPARAMETERS["VOCABULARY_SIZE"] 11 | INCLUDE_UNKNOWN_WORD = HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] 12 | HYPERPARAMETERS["TRAIN_SENTENCES"] = join(DATA_DIR, "%s.train.txt.gz" % RUN_NAME) 13 | #HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"] = join(DATA_DIR, "%s.validation.txt.gz" % RUN_NAME) 14 | #HYPERPARAMETERS["VALIDATION_SENTENCES"] = join(DATA_DIR, "%s.validation-%d.txt.gz" % (RUN_NAME, HYPERPARAMETERS["VALIDATION EXAMPLES"])) 15 | HYPERPARAMETERS["VOCABULARY"] = join(DATA_DIR, "vocabulary-%s-%d.txt.gz" % (RUN_NAME, VOCABULARY_SIZE)) 16 | HYPERPARAMETERS["VOCABULARY_IDMAP_FILE"] = join(DATA_DIR, "idmap.%s-%d.include_unknown=%s.pkl.gz" % (RUN_NAME, VOCABULARY_SIZE, INCLUDE_UNKNOWN_WORD)) 17 | 18 | -------------------------------------------------------------------------------- /hyperparameters.random-indexing.yaml: -------------------------------------------------------------------------------- 1 | 2 | # CONTEXT_TYPES is a list of lists. For example [[-2, -1], [+1, +2]]. This 3 | # means that words two to the left (-2) and one to the left (-1) of the 4 | # focus word will be treated as one kind of context vector. Similarly, 5 | # words one to the right (+1) and two to the right (+2) of the focus word 6 | # are another kind of context vector. 7 | CONTEXT_TYPES: [[-1], [+1]] 8 | 9 | # Where to read the input data from 10 | locations: {"DATA_DIR": "/home/fringant2/lisa/turian/dev/python/language-model/data/"} 11 | #locations: {"DATA_DIR": "/home/turianjo/dev/python/language-model/data/"} 12 | 13 | # Should we induce an embedding for OOV words? 14 | INCLUDE_UNKNOWN_WORD: True 15 | 16 | RUN_NAME: "rcv1.case-intact" 17 | VOCABULARY_SIZE: 268810 18 | 19 | REPRESENTATION_SIZE: 10 20 | 21 | # Either "ternary" (-1, 0, +1) or "gaussian" random matrix. 22 | #RANDOMIZATION_TYPE: ternary 23 | #RANDOMIZATION_TYPE: gaussian 24 | RANDOMIZATION_TYPE: "" 25 | 26 | # Percentage of each context (random) representation that is non-zero. 27 | TERNARY_NON_ZERO_PERCENT: 0.0 28 | 29 | RANDOM_SEED: 0 30 | -------------------------------------------------------------------------------- /induce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import sys 5 | import string 6 | from common.file import myopen 7 | from common.stats import stats 8 | from common.str import percent 9 | 10 | import numpy 11 | import random 12 | 13 | import diagnostics 14 | 15 | def trainingsentences(): 16 | """ 17 | For each line (sentence) in the training data, transform it into a list of token IDs. 18 | """ 19 | 20 | HYPERPARAMETERS = common.hyperparameters.read("random-indexing") 21 | from vocabulary import wordmap 22 | filename = HYPERPARAMETERS["TRAIN_SENTENCES"] 23 | count = 0 24 | for l in myopen(filename): 25 | tokens = [] 26 | for w in string.split(l): 27 | w = string.strip(w) 28 | assert wordmap.exists(w) # Not exactly clear what to do 29 | # if the word isn't in the vocab. 30 | tokens.append(wordmap.id(w)) 31 | yield tokens 32 | count += 1 33 | if count % 1000 == 0: 34 | logging.info("Read %d lines from training file %s..." % (count, filename)) 35 | logging.info(stats()) 36 | 37 | def generate_context_vectors(): 38 | """ 39 | Generate the (random) context vectors. 
40 | """ 41 | 42 | HYPERPARAMETERS = common.hyperparameters.read("random-indexing") 43 | from vocabulary import wordmap 44 | 45 | if HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "gaussian": 46 | context_vectors = [numpy.random.normal(size=(wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"]))] 47 | elif HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "ternary": 48 | NONZEROS = int(HYPERPARAMETERS["TERNARY_NON_ZERO_PERCENT"] * HYPERPARAMETERS["REPRESENTATION_SIZE"] + 0.5) 49 | 50 | logging.info("Generating %d nonzeros per %d-length random context vector" % (NONZEROS, HYPERPARAMETERS["REPRESENTATION_SIZE"])) 51 | 52 | # Generate one set of context vectors per list in HYPERPARAMETERS["CONTEXT_TYPES"] 53 | context_vectors = [] 54 | for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"])): 55 | logging.info("Generated %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"])))) 56 | logging.info(stats()) 57 | thiscontext = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) 58 | for j in range(wordmap.len): 59 | idxs = range(HYPERPARAMETERS["REPRESENTATION_SIZE"]) 60 | random.shuffle(idxs) 61 | for k in idxs[:NONZEROS]: 62 | thiscontext[j][k] = random.choice([-1, +1]) 63 | # print thiscontext[j] 64 | context_vectors.append(thiscontext) 65 | else: 66 | assert 0 67 | 68 | logging.info("Done generating %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"])))) 69 | logging.info(stats()) 70 | return context_vectors 71 | 72 | if __name__ == "__main__": 73 | import common.hyperparameters, common.options 74 | HYPERPARAMETERS = common.hyperparameters.read("random-indexing") 75 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 76 | import hyperparameters 77 | 78 | from common import myyaml 79 | import common.dump 80 | print >> sys.stderr, myyaml.dump(common.dump.vars_seq([hyperparameters])) 81 | 82 | rundir = common.dump.create_canonical_directory(HYPERPARAMETERS) 83 | 84 | import os.path, os 85 | logfile = os.path.join(rundir, "log") 86 | if newkeystr != "": 87 | verboselogfile = os.path.join(rundir, "log%s" % newkeystr) 88 | print >> sys.stderr, "Logging to %s, and creating link %s" % (logfile, verboselogfile) 89 | os.system("ln -s log %s " % (verboselogfile)) 90 | else: 91 | print >> sys.stderr, "Logging to %s, not creating any link because of default settings" % logfile 92 | 93 | logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG) 94 | logging.info("INITIALIZING TRAINING STATE") 95 | logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters]))) 96 | 97 | 98 | import random, numpy 99 | random.seed(HYPERPARAMETERS["RANDOM_SEED"]) 100 | numpy.random.seed(HYPERPARAMETERS["RANDOM_SEED"]) 101 | from vocabulary import wordmap 102 | 103 | cnt = 0 104 | random_representations = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) 105 | 106 | context_vectors = generate_context_vectors() 107 | 108 | for tokens in trainingsentences(): 109 | for i in range(len(tokens)): 110 | for j, context in enumerate(HYPERPARAMETERS["CONTEXT_TYPES"]): 111 | for k in context: 112 | tokidx = i + k 113 | if tokidx < 0 or tokidx >= len(tokens): continue 114 | random_representations[tokens[i]] += context_vectors[j][tokens[tokidx]] 115 | cnt += 1 116 | if cnt % 10000 == 0: 117 | diagnostics.diagnostics(cnt, random_representations) 118 | 119 | logging.info("DONE. 
/vocabulary.py:
--------------------------------------------------------------------------------
"""
Automatically load the wordmap, if available.
"""

import cPickle
from common.file import myopen
import sys

def _wordmap_filename():
    import common.hyperparameters, common.options
    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    return HYPERPARAMETERS["VOCABULARY_IDMAP_FILE"]

wordmap = None
try:
    wordmap = cPickle.load(myopen(_wordmap_filename()))
    wordmap.str = wordmap.key
except:
    # If the word map hasn't been built yet, or the hyperparameters don't
    # point at one, leave wordmap as None rather than failing at import time.
    pass

def write(wordmap):
    """
    Write the word ID map, passed as a parameter.
    """
    print >> sys.stderr, "Writing word map to %s..." % _wordmap_filename()
    cPickle.dump(wordmap, myopen(_wordmap_filename(), "w"))
--------------------------------------------------------------------------------
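vocabulary.py loads a pickled word map built with the common package
(http://github.com/turian/common). Judging from how it is used in this
repository, that object exposes roughly the following interface, with word
IDs ordered so that the most frequent words get the lowest IDs
(diagnostics.py treats the lowest IDs as the "most common" words). A
minimal stand-in for illustration or testing; this is an assumption, not
the real class:

    class ToyWordmap:
        """Minimal stand-in mimicking the wordmap interface this code relies on."""
        def __init__(self, words):
            self._words = list(words)                 # IDs assigned by position
            self._ids = dict((w, i) for i, w in enumerate(self._words))
            self.len = len(self._words)               # used as an attribute: wordmap.len
        def exists(self, word):
            return word in self._ids
        def id(self, word):
            return self._ids[word]
        def key(self, id):                            # vocabulary.py aliases this as wordmap.str
            return self._words[id]

    wm = ToyWordmap(["the", "cat", "sat"])
    assert wm.exists("cat") and wm.id("cat") == 1 and wm.key(1) == "cat"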