├── .gitignore
├── .hgignore
├── LOGS.NOBACKUP
    └── .keep
├── README
├── batch
├── diagnostics.py
├── hyperparameters.py
├── hyperparameters.random-indexing.yaml
├── induce.py
└── vocabulary.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .hgignore


--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | *.pyc
3 | *.sw?
4 | LOGS.NOBACKUP/run*
5 | 


--------------------------------------------------------------------------------
/LOGS.NOBACKUP/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turian/random-indexing-wordrepresentations/4cfb683aa68df4ba3275a1586564b62ee72200bc/LOGS.NOBACKUP/.keep


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | random-indexing-wordrepresentations
 2 | -----------------------------------
 3 | 
 4 |     by Joseph Turian
 5 | 
 6 | Induce word representations using random indexing (RI).
 7 | 
 8 | For information about random indexing, see:
 9 |     http://www.sics.se/~mange/random_indexing.html
10 | 
11 | You can control the hyperparameters by editing
12 |     hyperparameters.random-indexing.yaml
13 | Or you can control the hyperparameters using command-line options.
14 | 
15 | Another implementation (in Java) is semanticvectors:
16 |     http://code.google.com/p/semanticvectors/
17 | 
18 | See also:
19 |     http://github.com/turian/pyrandomprojection
20 | a generic library for transforming a Python dictionary into a
21 | low-dimensional numpy array.
22 | 
23 | This code is based upon my neural language model code
24 | (http://github.com/turian/neural-language-model), so it shares similar
25 | idioms for the data it expects. For example, instead of building a
26 | vocabulary on-the-fly, we assume that the vocabulary has been preextracted
27 | and will be read in. Also, we assume that there is one-sentence-per-line
28 | in the training input.
29 | 
30 | 
31 | REQUIREMENTS:
32 |     * My python common code:
33 |         http://github.com/turian/common
34 | 


--------------------------------------------------------------------------------
/batch:
--------------------------------------------------------------------------------
1 | dbidispatch --exp_dir="T" ./induce.py \
2 |     '--REPRESENTATION_SIZE={{25,50,100,200}}' \
3 |     '--TERNARY_NON_ZERO_PERCENT={{0.01,0.02,0.04,0.08,0.16}}' \
4 |     '--RANDOMIZATION_TYPE={{ternary,gaussian}}'
5 | 


--------------------------------------------------------------------------------
/diagnostics.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Verbose debug output for the model.
 3 | """
 4 | 
 5 | from common.stats import stats
 6 | from common.str import percent
 7 | 
 8 | import logging
 9 | import sys
10 | import numpy
11 | import random
12 | 
13 | def diagnostics(cnt, embeddings):
14 |     logging.info(stats())
15 |     vocab_size = embeddings.shape[0]
16 |     idxs = range(vocab_size)
17 |     random.shuffle(idxs)
18 |     idxs = idxs[:100]
19 | 
20 |     embeddings_debug(embeddings[idxs], cnt, "rand 100 words")
21 |     embeddings_debug(embeddings[:100], cnt, "top  100 words")
22 |     embeddings_debug(embeddings[vocab_size/2-50:vocab_size/2+50], cnt, "mid  100 words")
23 |     embeddings_debug(embeddings[-100:], cnt, "last 100 words")
24 |     logging.info(stats())
25 | 
26 | def visualizedebug(cnt, embeddings, rundir, newkeystr, WORDCNT=500):
27 |     vocab_size = embeddings.shape[0]
28 |     idxs = range(vocab_size)
29 |     random.shuffle(idxs)
30 |     idxs = idxs[:WORDCNT]
31 | 
32 |     visualize(cnt, embeddings, rundir, idxs, "randomized%s" % newkeystr)
33 |     visualize(cnt, embeddings, rundir, range(WORDCNT), "mostcommon%s" % newkeystr)
34 |     visualize(cnt, embeddings, rundir, range(-1, -WORDCNT, -1), "leastcommon%s" % newkeystr)
35 |     visualize(cnt, embeddings, rundir, range(vocab_size/2-WORDCNT/2,vocab_size/2+WORDCNT/2), "midcommon%s" % newkeystr)
36 | 
37 | def visualize(cnt, embeddings, rundir, idxs, str):
38 |     """
39 |     Visualize a set of examples using t-SNE.
40 |     """
41 |     from vocabulary import wordmap
42 |     PERPLEXITY=30
43 | 
44 |     x = embeddings[idxs]
45 |     print x.shape
46 |     titles = [wordmap.str(id) for id in idxs]
47 | 
48 |     import os.path
49 |     filename = os.path.join(rundir, "embeddings-%s-%d.png" % (str, cnt))
50 |     try:
51 |         from textSNE.calc_tsne import tsne
52 | #       from textSNE.tsne import tsne
53 |         out = tsne(x, perplexity=PERPLEXITY)
54 |         from textSNE.render import render
55 |         render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename)
56 |     except IOError:
57 |         logging.info("ERROR visualizing", filename, ". Continuing...")
58 | 
59 | def embeddings_debug(w, cnt, str):
60 |     """
61 |     Output the l2norm mean and max of the embeddings, including in debug out the str and training cnt
62 |     """
63 |     totalcnt = numpy.sum(numpy.abs(w) >= 0)
64 |     notsmallcnt = numpy.sum(numpy.abs(w) >= 0.1)
65 |     logging.info("%d %s dimensions of %s have absolute value >= 0.1" % (cnt, percent(notsmallcnt, totalcnt), str))
66 |     notsmallcnt = numpy.sum(numpy.abs(w) >= 0.01)
67 |     logging.info("%d %s dimensions of %s have absolute value >= 0.01" % (cnt, percent(notsmallcnt, totalcnt), str))
68 | 
69 |     l2norm = numpy.sqrt(numpy.square(w).sum(axis=1))
70 |     median = numpy.median(l2norm)
71 |     mean = numpy.mean(l2norm)
72 |     std = numpy.std(l2norm)
73 | #    print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),))
74 |     l2norm = l2norm.tolist()
75 |     l2norm.sort()
76 |     l2norm.reverse()
77 |     logging.info("%d l2norm of %s: median = %f mean = %f stddev=%f top3=%s" % (cnt, str, median, mean, std, `l2norm[:3]`))
78 | #    print("top 5 = %s" % `l2norm[:5]`)
79 | 


--------------------------------------------------------------------------------
/hyperparameters.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Module to update hyperparameters automatically.
 3 | """
 4 | 
 5 | from os.path import join
 6 | import common.hyperparameters
 7 | HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
 8 | DATA_DIR = HYPERPARAMETERS["locations"]["DATA_DIR"]
 9 | RUN_NAME = HYPERPARAMETERS["RUN_NAME"]
10 | VOCABULARY_SIZE = HYPERPARAMETERS["VOCABULARY_SIZE"]
11 | INCLUDE_UNKNOWN_WORD = HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]
12 | HYPERPARAMETERS["TRAIN_SENTENCES"] = join(DATA_DIR, "%s.train.txt.gz" % RUN_NAME)
13 | #HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"] = join(DATA_DIR, "%s.validation.txt.gz" % RUN_NAME)
14 | #HYPERPARAMETERS["VALIDATION_SENTENCES"] = join(DATA_DIR, "%s.validation-%d.txt.gz" % (RUN_NAME, HYPERPARAMETERS["VALIDATION EXAMPLES"]))
15 | HYPERPARAMETERS["VOCABULARY"] = join(DATA_DIR, "vocabulary-%s-%d.txt.gz" % (RUN_NAME, VOCABULARY_SIZE))
16 | HYPERPARAMETERS["VOCABULARY_IDMAP_FILE"] = join(DATA_DIR, "idmap.%s-%d.include_unknown=%s.pkl.gz" % (RUN_NAME, VOCABULARY_SIZE, INCLUDE_UNKNOWN_WORD))
17 | 
18 | 


--------------------------------------------------------------------------------
/hyperparameters.random-indexing.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # CONTEXT_TYPES is a list of lists. For example [[-2, -1], [+1, +2]]. This
 3 | # means that words two to the left (-2) and one to the left (-1) of the
 4 | # focus word will be treated as one kind of context vector. Similarly,
 5 | # words one to the right (+1) and two to the right (+2) of the focus word
 6 | # are another kind of context vector.
 7 | CONTEXT_TYPES: [[-1], [+1]]
 8 | 
 9 | # Where to read the input data from
10 | locations: {"DATA_DIR": "/home/fringant2/lisa/turian/dev/python/language-model/data/"}
11 | #locations: {"DATA_DIR": "/home/turianjo/dev/python/language-model/data/"}
12 | 
13 | # Should we induce an embedding for OOV words?
14 | INCLUDE_UNKNOWN_WORD: True
15 | 
16 | RUN_NAME: "rcv1.case-intact"
17 | VOCABULARY_SIZE: 268810
18 | 
19 | REPRESENTATION_SIZE: 10
20 | 
21 | # Either "ternary" (-1, 0, +1) or "gaussian" random matrix.
22 | #RANDOMIZATION_TYPE: ternary
23 | #RANDOMIZATION_TYPE: gaussian
24 | RANDOMIZATION_TYPE: ""
25 | 
26 | # Percentage of each context (random) representation that is non-zero.
27 | TERNARY_NON_ZERO_PERCENT: 0.0
28 | 
29 | RANDOM_SEED: 0
30 | 


--------------------------------------------------------------------------------
/induce.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import logging
  4 | import sys
  5 | import string
  6 | from common.file import myopen
  7 | from common.stats import stats
  8 | from common.str import percent
  9 | 
 10 | import numpy
 11 | import random
 12 | 
 13 | import diagnostics
 14 | 
 15 | def trainingsentences():
 16 |     """
 17 |     For each line (sentence) in the training data, transform it into a list of token IDs.
 18 |     """
 19 | 
 20 |     HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
 21 |     from vocabulary import wordmap
 22 |     filename = HYPERPARAMETERS["TRAIN_SENTENCES"]
 23 |     count = 0
 24 |     for l in myopen(filename):
 25 |         tokens = []
 26 |         for w in string.split(l):
 27 |             w = string.strip(w)
 28 |             assert wordmap.exists(w)     # Not exactly clear what to do
 29 |                                          # if the word isn't in the vocab.
 30 |             tokens.append(wordmap.id(w))
 31 |         yield tokens
 32 |         count += 1
 33 |         if count % 1000 == 0:
 34 |             logging.info("Read %d lines from training file %s..." % (count, filename))
 35 |             logging.info(stats())
 36 | 
 37 | def generate_context_vectors():
 38 |     """
 39 |     Generate the (random) context vectors.
 40 |     """
 41 | 
 42 |     HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
 43 |     from vocabulary import wordmap
 44 | 
 45 |     if HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "gaussian":
 46 |         context_vectors = [numpy.random.normal(size=(wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"]))]
 47 |     elif HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "ternary":
 48 |         NONZEROS = int(HYPERPARAMETERS["TERNARY_NON_ZERO_PERCENT"] * HYPERPARAMETERS["REPRESENTATION_SIZE"] + 0.5)
 49 |     
 50 |         logging.info("Generating %d nonzeros per %d-length random context vector" % (NONZEROS, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
 51 |     
 52 |         # Generate one set of context vectors per list in HYPERPARAMETERS["CONTEXT_TYPES"]
 53 |         context_vectors = []
 54 |         for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"])):
 55 |             logging.info("Generated %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
 56 |             logging.info(stats())
 57 |             thiscontext = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
 58 |             for j in range(wordmap.len):
 59 |                 idxs = range(HYPERPARAMETERS["REPRESENTATION_SIZE"])
 60 |                 random.shuffle(idxs)
 61 |                 for k in idxs[:NONZEROS]:
 62 |                     thiscontext[j][k] = random.choice([-1, +1])
 63 |     #            print thiscontext[j]
 64 |             context_vectors.append(thiscontext)
 65 |     else:
 66 |         assert 0
 67 |     
 68 |     logging.info("Done generating %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
 69 |     logging.info(stats())
 70 |     return context_vectors
 71 | 
 72 | if __name__ == "__main__":
 73 |     import common.hyperparameters, common.options
 74 |     HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
 75 |     HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS)
 76 |     import hyperparameters
 77 | 
 78 |     from common import myyaml
 79 |     import common.dump
 80 |     print >> sys.stderr, myyaml.dump(common.dump.vars_seq([hyperparameters]))
 81 | 
 82 |     rundir = common.dump.create_canonical_directory(HYPERPARAMETERS)
 83 | 
 84 |     import os.path, os
 85 |     logfile = os.path.join(rundir, "log")
 86 |     if newkeystr != "":
 87 |         verboselogfile = os.path.join(rundir, "log%s" % newkeystr)
 88 |         print >> sys.stderr, "Logging to %s, and creating link %s" % (logfile, verboselogfile)
 89 |         os.system("ln -s log %s " % (verboselogfile))
 90 |     else:
 91 |         print >> sys.stderr, "Logging to %s, not creating any link because of default settings" % logfile
 92 | 
 93 |     logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG)
 94 |     logging.info("INITIALIZING TRAINING STATE")
 95 |     logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters])))
 96 | 
 97 | 
 98 |     import random, numpy
 99 |     random.seed(HYPERPARAMETERS["RANDOM_SEED"])
100 |     numpy.random.seed(HYPERPARAMETERS["RANDOM_SEED"])
101 |     from vocabulary import wordmap
102 | 
103 |     cnt = 0
104 |     random_representations = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
105 | 
106 |     context_vectors = generate_context_vectors()
107 | 
108 |     for tokens in trainingsentences():
109 |         for i in range(len(tokens)):
110 |             for j, context in enumerate(HYPERPARAMETERS["CONTEXT_TYPES"]):
111 |                 for k in context:
112 |                     tokidx = i + k
113 |                     if tokidx < 0 or tokidx >= len(tokens): continue
114 |                     random_representations[tokens[i]] += context_vectors[j][tokens[tokidx]]
115 |         cnt += 1
116 |         if cnt % 10000 == 0:
117 |             diagnostics.diagnostics(cnt, random_representations)
118 | 
119 |     logging.info("DONE. Dividing embeddings by their standard deviation...")
120 |     random_representations = random_representations * (1. / numpy.std(random_representations))
121 |     diagnostics.diagnostics(cnt, random_representations)
122 |     diagnostics.visualizedebug(cnt, random_representations, rundir, newkeystr)
123 | 
124 |     outfile = os.path.join(rundir, "random_representations")
125 |     if newkeystr != "":
126 |         verboseoutfile = os.path.join(rundir, "random_representations%s" % newkeystr)
127 |         logging.info("Writing representations to %s, and creating link %s" % (outfile, verboseoutfile))
128 |         os.system("ln -s random_representations %s " % (verboseoutfile))
129 |     else:
130 |         logging.info("Writing representations to %s, not creating any link because of default settings" % outfile)
131 | 
132 |     o = open(outfile, "wt")
133 |     from vocabulary import wordmap
134 |     for i in range(wordmap.len):
135 |         o.write(wordmap.str(i) + " ")
136 |         for v in random_representations[i]:
137 |             o.write(`v` + " ")
138 |         o.write("\n")
139 | 


--------------------------------------------------------------------------------
/vocabulary.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Automatically load the wordmap, if available.
 3 | """
 4 | 
 5 | import cPickle
 6 | from common.file import myopen
 7 | import sys
 8 | 
 9 | def _wordmap_filename():
10 |     import common.hyperparameters, common.options
11 |     HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
12 |     return HYPERPARAMETERS["VOCABULARY_IDMAP_FILE"]
13 | 
14 | wordmap = None
15 | try:
16 |     wordmap = cPickle.load(myopen(_wordmap_filename()))
17 |     wordmap.str = wordmap.key
18 | except: pass
19 | 
20 | def write(wordmap):
21 |     """
22 |     Write the word ID map, passed as a parameter.
23 |     """
24 |     print >> sys.stderr, "Writing word map to %s..." % _wordmap_filename()
25 |     cPickle.dump(wordmap, myopen(_wordmap_filename(), "w"))
26 | 


--------------------------------------------------------------------------------