├── models
│   └── .gitkeep
├── normalize.py
├── plot-correlation.py
├── plot-reduction.py
├── plot-tsne.py
├── project-tsne.sh
├── readme.md
├── setup.sh
├── tokens-to-correlation.py
├── tokens-to-vectors.py
└── words-to-vectors.py

/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kylemcdonald/EmbeddingScripts/953f7958e8eb5c9c9c88f6cbf240cbdd41833c5c/models/.gitkeep
--------------------------------------------------------------------------------
/normalize.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import numpy

# rescale every column of the input to the [0, 1] range
d = numpy.loadtxt("/dev/stdin")
d -= d.min(axis=0)
d /= d.max(axis=0)
numpy.savetxt("/dev/stdout", d, fmt="%.8f", delimiter="\t")
--------------------------------------------------------------------------------
/plot-correlation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse
from numpy import loadtxt
from scipy.spatial.distance import cdist
from pylab import figure, pcolor, savefig
from tsp_solver.greedy import solve_tsp

parser = argparse.ArgumentParser(
    description='Plot the vectors as a matrix, with rows sorted by similarity.')
parser.add_argument('-i', '--input', default='data')
args = parser.parse_args()

data = loadtxt('{0}/vectors'.format(args.input))

labels = []
with open('{}/words'.format(args.input)) as f:
    for line in f:
        labels.append(line.strip())

# order the rows by solving a travelling salesperson problem over pairwise distances
distanceMatrix = cdist(data, data, 'euclidean')
path = solve_tsp(distanceMatrix)
figure(figsize=(4, 4))
pcolor(data[path], cmap='binary')
savefig('{}/correlation.png'.format(args.input), dpi=600)

for i in path:
    print(labels[i])
--------------------------------------------------------------------------------
/plot-reduction.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse
from time import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import (manifold, decomposition, ensemble, random_projection)

parser = argparse.ArgumentParser(
    description='Plot many kinds of dimensionality reduction algorithms.')
parser.add_argument('-i', '--input', default='data')
args = parser.parse_args()

y = []
with open('{}/words'.format(args.input)) as f:
    for line in f:
        y.append(line.strip())

X = np.loadtxt('{}/vectors'.format(args.input))
n_samples, n_features = X.shape
n_neighbors = 30


#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 # color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            # shown_images = np.r_[shown_images, [X[i]]]
            # imagebox = offsetbox.AnnotationBbox(
            #     offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
            #     X[i])
            # ax.add_artist(imagebox)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)


#----------------------------------------------------------------------
# Plot images
# n_img_per_row = 20
# img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
# for i in range(n_img_per_row):
#     ix = 10 * i + 1
#     for j in range(n_img_per_row):
#         iy = 10 * j + 1
#         img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))

# plt.imshow(img, cmap=plt.cm.binary)
# plt.xticks([])
# plt.yticks([])
# plt.title('A selection from the 64-dimensional digits dataset')


#----------------------------------------------------------------------
# Random 2D projection using a random unitary matrix
# print("Computing random projection")
# rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
# X_projected = rp.fit_transform(X)
# plot_embedding(X_projected, "Random Projection")


#----------------------------------------------------------------------
# Projection onto the first 2 principal components

try:
    print("Computing PCA projection")
    t0 = time()
    X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
    plot_embedding(X_pca,
                   "Principal Components projection (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Projection onto the first 2 linear discriminant components

try:
    print("Computing LDA projection")
    from sklearn import lda
    X2 = X.copy()
    X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
    t0 = time()
    X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
    plot_embedding(X_lda,
                   "Linear Discriminant projection (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Isomap projection
try:
    print("Computing Isomap embedding")
    t0 = time()
    X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X)
    print("Done.")
    plot_embedding(X_iso,
                   "Isomap projection (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Locally linear embedding
try:
    print("Computing LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='standard')
    t0 = time()
    X_lle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_lle,
                   "Locally Linear Embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Modified Locally linear embedding
try:
    print("Computing modified LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='modified')
    t0 = time()
    X_mlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_mlle,
                   "Modified Locally Linear Embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Hessian LLE embedding
try:
    print("Computing Hessian LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='hessian')
    t0 = time()
    X_hlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_hlle,
                   "Hessian Locally Linear Embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# LTSA embedding
try:
    print("Computing LTSA embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                          method='ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa,
                   "Local Tangent Space Alignment (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# MDS embedding
try:
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
    t0 = time()
    X_mds = clf.fit_transform(X)
    print("Done. Stress: %f" % clf.stress_)
    plot_embedding(X_mds,
                   "MDS embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Random Trees embedding
try:
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                           max_depth=5)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    plot_embedding(X_reduced,
                   "Random forest embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# Spectral embedding
try:
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)

    plot_embedding(X_se,
                   "Spectral embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

#----------------------------------------------------------------------
# t-SNE embedding
try:
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    t0 = time()
    X_tsne = tsne.fit_transform(X)

    plot_embedding(X_tsne,
                   "t-SNE embedding (time %.2fs)" %
                   (time() - t0))
except Exception:
    pass

plt.show()
--------------------------------------------------------------------------------
/plot-tsne.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy.spatial import Voronoi

parser = argparse.ArgumentParser(
    description='Plot tsne output.')
parser.add_argument('-i', '--input', default='data')
parser.add_argument('-p', '--perplexity', default=1)
args = parser.parse_args()

labels = []
with open('{}/words'.format(args.input)) as f:
    for line in f:
        labels.append(line.strip())

data2d = np.loadtxt('{0}/{1}.2d.tsne'.format(args.input, args.perplexity))
data3d = np.loadtxt('{0}/{1}.3d.tsne'.format(args.input, args.perplexity))

plt.figure(figsize=(10, 10), dpi=100)

# color each Voronoi cell by the normalized 3d embedding of its point
vor = Voronoi(data2d)
for i in range(len(data2d)):
    if vor.point_region[i] != -1:
        region = vor.regions[vor.point_region[i]]
        if not -1 in region:
            polygon = [vor.vertices[j] for j in region]
            plt.fill(*zip(*polygon), color=data3d[i])

for label, x, y in zip(labels, data2d[:, 0], data2d[:, 1]):
    plt.annotate(label, xy=(x, y), size=2, va='center', ha='center')

plt.axis('off')
plt.xlim([0, 1])
plt.ylim([0, 1])

pp = PdfPages('{0}/{1}-plot.pdf'.format(args.input, args.perplexity))
plt.savefig(pp, format='pdf', bbox_inches='tight', pad_inches=0)
pp.close()
--------------------------------------------------------------------------------
/project-tsne.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -x

function tsne {
    # skip perplexities that have already been computed
    if [ ! -f $1/$2.2d.tsne ]; then
        python bh_tsne/bhtsne.py -v -d 2 -p $2 -i $1/vectors -o $1/cache
        cat $1/cache | python normalize.py > $1/$2.2d.tsne
        python bh_tsne/bhtsne.py -v -d 3 -p $2 -i $1/vectors -o $1/cache
        cat $1/cache | python normalize.py > $1/$2.3d.tsne
        rm $1/cache
    fi
}

if [ ! -f $1/vectors ]; then
    # if there are no vectors
    if [ ! -f $1/tokens ]; then
        # and no tokens, use word2vec to create vectors
        python3 words-to-vectors.py -i $1
    else
        # otherwise create vectors from tokens
        # cp $1/wordlist $1/words
        # python tokens-to-vectors.py -i $1
        # or create correlation vectors
        python tokens-to-correlation.py -i $1
    fi
fi

tsne $1 1
tsne $1 5
tsne $1 10
tsne $1 50
tsne $1 100
tsne $1 500
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Embedding Scripts

A small collection of scripts to project/embed high dimensional data in two dimensions.

First run `./setup.sh`, which will make sure Python has the necessary libraries. It will also compile Barnes-Hut t-SNE from source, and download a word2vec model trained on the Google News dataset (a very large file that will decompress to ~3.6GB).

Each dataset is stored in a folder. Inside the folder you might have the following files (a small example follows this list):

- `tokens` is a tab-separated file of samples, where each column holds one token. For example, one line of `cocktails/tokens` might look like `whiskey\tginger ale\tlemon`
- `wordlist` is a list of words to be projected using word2vec. For example `moods/wordlist` might read `happy\nsad\nhungry\ndelighted\n`
- `vectors` is a tab-separated list of high dimensional vectors used as input to the nonlinear projection algorithms.
- `words` is a list of labels for each of the lines in `vectors`. If the `vectors` are generated from `wordlist`, some words may not have word2vec definitions and `words` will be a subset of `wordlist`.
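
As a concrete illustration, a tiny, made-up dataset might look like this. The file names and layout follow the conventions above; the contents are invented:

```
cocktails/tokens        one drink per line, ingredients separated by tabs
whiskey	ginger ale	lemon
gin	tonic	lime
rum	cola	lime

moods/wordlist          one word per line
happy
sad
hungry
delighted
```

Running the scripts below on such a folder produces the corresponding `vectors` (and, for word2vec, `words`) files next to the inputs.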

## Scripts

All Python scripts take `-i` as an argument for your input folder.

### words-to-vectors.py

This will generate `vectors` from `wordlist` using word2vec. It will also generate `words`, which may be a subset of `wordlist`.

### tokens-to-vectors.py

This will generate binary `vectors` from `tokens`. So if you have 600 cocktails with 3-8 ingredients each, and 180 unique ingredients, the output will be 600 vectors of length 180 with 3-8 values set to 1.

### tokens-to-correlation.py

This will generate floating point `vectors` from `tokens` using the correlation/co-occurrence between different tokens. If you have 600 cocktails with 3-8 ingredients each, and 180 unique ingredients, the output will be 180 vectors of length 180, and ingredients that co-occur more often get higher values. Except for very complex datasets, most elements will be 0.

### plot-reduction.py

After generating `vectors` using one of the above techniques or by providing them directly, this script will attempt to run a collection of dimensionality reduction algorithms from scikit-learn on the input data. This is usually a good way to figure out what direction to head next.

### plot-correlation.py

This plots the `vectors` as a basic correlation-style matrix, with the rows sorted by solving a travelling salesperson problem over pairwise distances. It will also print a list of labels "sorted by similarity". Output is stored in the input folder as a png file.

### project-tsne.sh

This takes one argument for the input folder, and will generate `vectors` if they don't exist, either using `words-to-vectors.py` or `tokens-to-correlation.py` depending on which files are present (`tokens-to-vectors.py` is available as a commented-out alternative), and then run `bh_tsne` with perplexities of 1, 5, 10, 50, 100 and 500 for both 2d and 3d projections. The results are stored in the input folder.

### plot-tsne.py

Besides the argument for the input folder, this script also takes an argument for the perplexity to process using `-p`. It takes the results of `bh_tsne` and uses the 2d projection to place labels and the 3d projection to choose colors for Voronoi cells in the background, which can provide a high dimensional intuition for distances in some cases: if two adjacent vectors are "strongly" similar they have similar colors (i.e., they are still adjacent in a higher dimensional space). If they are "weakly" similar they have different colors (they become separated in a higher dimensional space). The output image is saved in the input folder as a pdf file.
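
To tie the pieces together, here is a rough end-to-end session, assuming a hypothetical `cocktails` folder that contains a `tokens` file like the example above:

```sh
./setup.sh                               # install dependencies, build bh_tsne, download the word2vec model
./project-tsne.sh cocktails              # build vectors, then run bh_tsne at several perplexities
python plot-tsne.py -i cocktails -p 10   # render the perplexity-10 result to cocktails/10-plot.pdf
python plot-correlation.py -i cocktails  # similarity-sorted matrix saved to cocktails/correlation.png
```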
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
echo "Setting up Python 3"
pip3 install numpy gensim

echo "Setting up Python 2"
pip install numpy scipy scikit-learn matplotlib
pip install git+https://github.com/dmishin/tsp-solver.git

echo "Downloading bh_tsne from http://lvdmaaten.github.io/tsne/"
curl -O http://lvdmaaten.github.io/tsne/code/bh_tsne.tar.gz
tar -zxvf bh_tsne.tar.gz
rm bh_tsne.tar.gz
cd bh_tsne

if [ -d /Applications/Xcode.app ]; then
    echo "Building bh_tsne for OSX..."
    g++ sptree.cpp tsne.cpp -o bh_tsne -O3 -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ -lcblas
else
    echo "Building bh_tsne for Linux..."
    g++ sptree.cpp tsne.cpp -o bh_tsne -O3 -I./CBLAS/include -L./ -lcblas
fi
echo "Done building bh_tsne"
# return to the repository root so the model lands in models/
cd ..

echo "Downloading GoogleNews-vectors-negative300.bin.gz from https://code.google.com/p/word2vec/"
echo "(exit now if you don't want to use word2vec)"
curl -o "models/GoogleNews-vectors-negative300.bin.gz" -Lk "https://googledrive.com/host/0B7XkCwpI5KDYNlNUTTlSS21pQmM"
echo "Extracting GoogleNews-vectors-negative300.bin.gz"
gunzip models/GoogleNews-vectors-negative300.bin.gz
--------------------------------------------------------------------------------
/tokens-to-correlation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse, numpy
from collections import defaultdict

parser = argparse.ArgumentParser(
    description='Generate a .tsv from tab-separated tokens using co-occurrence counts of the tokens.')
parser.add_argument('-i', '--input', default='data')
args = parser.parse_args()

# count how often every pair of tokens appears together on a line
unique = set()
counts = defaultdict(
    lambda: defaultdict(
        lambda: 0.0))
with open('{}/tokens'.format(args.input)) as f:
    for line in f:
        tokens = line.strip().split('\t')
        unique.update(tokens)
        for a in tokens:
            for b in tokens:
                counts[a][b] += 1
                counts[b][a] += 1

# one vector per unique token, normalized by its largest co-occurrence count
words = []
vectors = []
for a in unique:
    words.append(a)
    vector = []
    for b in unique:
        vector.append(counts[a][b])
    vectors.append(vector / numpy.max(vector))

numpy.savetxt('{}/words'.format(args.input), words, fmt='%s')
numpy.savetxt('{}/vectors'.format(args.input), vectors, fmt='%.8f', delimiter='\t')
--------------------------------------------------------------------------------
/tokens-to-vectors.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse, numpy

parser = argparse.ArgumentParser(
    description='Generate a .tsv from tab-separated tokens.')
parser.add_argument('-i', '--input', default='data')
args = parser.parse_args()

# read through file once to get all tokens
unique = set()
with open('{}/tokens'.format(args.input)) as f:
    for line in f:
        tokens = line.strip().split('\t')
        unique.update(tokens)

# read through file again to output one binary vector per line
vectors = []
with open('{}/tokens'.format(args.input)) as f:
    for line in f:
        vector = []
        tokens = line.strip().split('\t')
        for ref in unique:
            if ref in tokens:
                vector.append(1)
            else:
                vector.append(0)
        vectors.append(vector)

numpy.savetxt('{}/vectors'.format(args.input), vectors, fmt='%.1f', delimiter='\t')
--------------------------------------------------------------------------------
/words-to-vectors.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse, numpy
from gensim.models import Word2Vec

parser = argparse.ArgumentParser(
    description='Generate a .tsv of word2vec vectors for a word list.')
parser.add_argument('-i', '--input', default='data')
parser.add_argument('-m', '--model', default='models/GoogleNews-vectors-negative300.bin')
args = parser.parse_args()

print('Loading model from ' + args.model)
# note: on newer gensim releases this loader has moved to KeyedVectors.load_word2vec_format
model = Word2Vec.load_word2vec_format(args.model, binary=True)
wordlist = numpy.genfromtxt('{}/wordlist'.format(args.input), dtype='str')
words = []
vectors = []
print('Looking up {} words.'.format(len(wordlist)))
for word in wordlist:
    if word in model:
        words.append(word)
        vectors.append(model[word])
print('Saving {:.2%} of the words.'.format(len(words) / len(wordlist)))
numpy.savetxt('{}/words'.format(args.input), words, fmt='%s')
print('Saving word vectors.')
numpy.savetxt('{}/vectors'.format(args.input), vectors, fmt='%.8f', delimiter='\t')
--------------------------------------------------------------------------------