├── data
    ├── cache
    │   └── .gitkeep
    └── glove
    │   └── .gitkeep
├── .dockerignore
├── requirements.txt
├── .gitignore
├── download_data.sh
├── Dockerfile
├── LICENSE
├── utils.py
├── word_clustering.py
├── word_arithmetic.py
├── word_game.py
└── README.md


/data/cache/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/glove/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | data/
2 | venv/
3 | *.pyc
4 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.3
2 | pandas==0.24.2
3 | scikit-learn==0.20.3
4 | scipy==1.2.1
5 | termcolor==1.1.0


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
# Ignore everything under data/ except the .gitkeep placeholders.
# Git cannot re-include a file whose parent directory is itself excluded, so
# the contents of each directory are ignored (data/*) rather than the
# directory (data/), and each kept sub-directory is re-included explicitly.
data/*
!data/cache/
data/cache/*
!data/cache/.gitkeep
!data/glove/
data/glove/*
!data/glove/.gitkeep
venv/
.vscode/
*.pyc
.DS_Store
9 | 


--------------------------------------------------------------------------------
/download_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Download the pre-trained GloVe.6B embeddings and unpack them into data/glove.

# Stop at the first failing command so a failed download is never unzipped
# and a failed unzip never deletes the archive.
set -euo pipefail

# Resolve paths relative to this script so it works from any directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

mkdir -p data/glove

# -f makes curl exit non-zero on an HTTP error instead of saving the error page.
curl -fL http://www-nlp.stanford.edu/data/glove.6B.zip -o data/glove.6B.zip
unzip data/glove.6B.zip -d data/glove
rm data/glove.6B.zip
6 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.7-slim-stretch
 2 | 
 3 | RUN apt-get update && \
 4 |     apt-get install -y --no-install-recommends curl unzip git build-essential && \
 5 |     git clone https://github.com/brannondorsey/glove-experiments && \
 6 |     cd glove-experiments && \
 7 |     pip install -r requirements.txt && \
 8 |     ./download_data.sh
 9 | 
10 | WORKDIR /glove-experiments
11 | CMD python word_arithmetic.py
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Brannon Dorsey <brannon@brannondorsey.com>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import codecs, json
 2 | import numpy as np
 3 | 
 4 | '''Serializable/Pickleable class to replicate the functionality of collections.defaultdict'''
 5 | class autovivify_list(dict):
 6 |         def __missing__(self, key):
 7 |                 value = self[key] = []
 8 |                 return value
 9 | 
10 |         def __add__(self, x):
11 |                 '''Override addition for numeric types when self is empty'''
12 |                 if not self and isinstance(x, Number):
13 |                         return x
14 |                 raise ValueError
15 | 
16 |         def __sub__(self, x):
17 |                 '''Also provide subtraction method'''
18 |                 if not self and isinstance(x, Number):
19 |                         return -1 * x
20 |                 raise ValueError
21 | 
def build_word_vector_matrix(vector_file, n_words):
	'''Read word vectors from a GloVe text file.

	Each non-blank line is split on whitespace: the first field is the word and
	the remaining fields are parsed as floats.

	vector_file: path of the UTF-8 encoded vector file.
	n_words:     stop after this many words have been read. A value that is
	             never reached (0, a negative number, None, or more than the
	             file holds) reads the whole file.

	Returns (vectors, labels): a numpy array with one row per word and a list
	of the words in the same order.
	'''
	vectors = []
	labels_array = []

	with codecs.open(vector_file, 'r', 'utf-8') as f:
		for line in f:
			fields = line.split()
			# a blank line (e.g. at the end of the file) carries no word
			if not fields:
				continue
			labels_array.append(fields[0])
			vectors.append(np.array([float(value) for value in fields[1:]]))
			if len(labels_array) == n_words:
				break
	return np.array(vectors), labels_array
35 | 
def get_cache_filename_from_args(args):
        '''Return the cache file name for a set of clustering parameters.

        The name is built from args.vector_dim, args.num_words and
        args.num_clusters, in that order.
        '''
        return '{}D_{}-words_{}-clusters.json'.format(
                args.vector_dim,
                args.num_words,
                args.num_clusters)
39 | 
def get_label_dictionaries(labels_array):
        '''Build the two lookup tables between words and their row index.

        Returns (word_to_id, id_to_word). If a word occurs more than once,
        word_to_id keeps the index of its last occurrence.
        '''
        id_to_word = {}
        for index, word in enumerate(labels_array):
                id_to_word[index] = word
        word_to_id = {word: index for index, word in id_to_word.items()}
        return word_to_id, id_to_word
44 | 
def save_json(filename, results):
        '''Serialise results as JSON into filename, replacing any existing file.'''
        with open(filename, 'w') as out_file:
                json.dump(results, out_file)
48 | 
def load_json(filename):
        '''Parse the JSON document stored in filename and return the result.'''
        with open(filename, 'r') as in_file:
                parsed = json.load(in_file)
        return parsed
52 | 


--------------------------------------------------------------------------------
/word_clustering.py:
--------------------------------------------------------------------------------
 1 | # Notes for extension of script:
 2 | # 	- Use readline() to interactively search for word groups
 3 | # 	- On a word miss, use L2 or cosine distance to select the nearest word vector
 4 | # 		- This would require all 6B tokens to be loaded in RAM (but not clustered)
 5 | #		- Or use levenshtein distance assuming the word is spelled the same.
 6 | #   - Provide an interface to perform basic arithmetic on words (king - man + woman = queen)
 7 | # Look at this result from 2014 English Wikipedia:
 8 | # 'islamic', 'militant', 'islam', 'radical', 'extremists', 'islamist', 'extremist', 'outlawed'
 9 | # 'war' - 'violence' + 'peace' = 'treaty' | 300d
10 | 
11 | from sklearn.cluster import KMeans
12 | from numbers import Number
13 | from pandas import DataFrame
14 | import numpy as np
15 | import os, sys, codecs, argparse, pprint, time
16 | from utils import *
17 | from word_arithmetic import *
18 | 
def find_word_clusters(labels_array, cluster_labels):
	'''Group words by the cluster label assigned to them.

	labels_array[n] is the word whose cluster label is cluster_labels[n].
	Returns an autovivify_list mapping each cluster label to its list of words.
	'''
	grouped = autovivify_list()
	for position in range(len(cluster_labels)):
		label = cluster_labels[position]
		grouped[label].append(labels_array[position])
	return grouped
24 | 
def parse_args():
	'''Parse the command-line options of the clustering script.

	Returns the argparse namespace with vector_dim, num_words, num_clusters,
	n_jobs and glove_path.
	'''
	# (flags, keyword arguments) for every option, in help-output order
	option_specs = [
		(('--vector_dim', '-d'),
		 dict(type=int,
		      choices=[50, 100, 200, 300],
		      default=100,
		      help='What vector GloVe vector dimension to use (default: 100).')),
		(('--num_words', '-n'),
		 dict(type=int,
		      default=10000,
		      help='The number of lines to read from the GloVe vector file (default: 10000).')),
		(('--num_clusters', '-k'),
		 dict(default=1000,
		      type=int,
		      help='Number of resulting word clusters. The number of K in K-Means (default: 1000).')),
		(('--n_jobs', '-j'),
		 dict(type=int,
		      default=-1,
		      help='Number of cores to use when fitting K-Means. -1 = all cores. '
		           'More cores = less time, more memory (default: -1).')),
		(('--glove_path', '-i'),
		 dict(default='data/glove',
		      help='GloVe vector file path (default: data/glove)')),
	]

	parser = argparse.ArgumentParser()
	for flags, options in option_specs:
		parser.add_argument(*flags, **options)
	return parser.parse_args()
53 | 
if __name__ == '__main__':
	# Cluster the GloVe words with K-Means, or reuse a cached result for the
	# same (vector_dim, num_words, num_clusters), then print every cluster.

	args = parse_args()

	cache_dir = 'data/cache'
	path = '{}/{}'.format(cache_dir, get_cache_filename_from_args(args))
	start_time = time.time()

	# if this kmeans fitting has already been cached
	if os.path.isfile(path):
		print('Cached K-Means cluster found, loading from disk.')
		cluster_to_words = load_json(path)

	# if these are clustering parameters we've never seen before
	else:
		# The vectors are only needed for fitting, so they are not loaded
		# when a cached result is available.
		vector_file = args.glove_path + '/' + 'glove.6B.' + str(args.vector_dim) + 'd.txt'
		df, labels_array = build_word_vector_matrix(vector_file, args.num_words)

		print('No cached cluster found. Clustering using K-Means... ')
		kmeans_model = KMeans(init='k-means++', n_clusters=args.num_clusters, n_jobs=args.n_jobs, n_init=10)
		kmeans_model.fit(df)

		cluster_labels = kmeans_model.labels_
		cluster_to_words = list(find_word_clusters(labels_array, cluster_labels).values())

		# cache these clustering results; create the directory first so a
		# missing data/cache does not throw away the fitted clusters
		os.makedirs(cache_dir, exist_ok=True)
		save_json(path, cluster_to_words)
		print('Saved {} clusters to {}. Cached for later use.'.format(len(cluster_to_words), path))

	for i, words in enumerate(cluster_to_words):
		print('CLUSTER {}: {}'.format(i + 1, ', '.join(words)))

	print("--- {:.2f} seconds ---".format((time.time() - start_time)))
90 | 


--------------------------------------------------------------------------------
/word_arithmetic.py:
--------------------------------------------------------------------------------
  1 | import argparse, utils, sys, readline
  2 | from scipy.spatial.distance import cosine
  3 | 
  4 | def word_arithmetic(start_word, minus_words, plus_words, word_to_id, id_to_word, df, num_results=5):
  5 | 	'''Returns a word string that is the result of the vector arithmetic'''
  6 | 	try:
  7 | 		start_vec  = df[word_to_id[start_word]]
  8 | 		minus_vecs = [df[word_to_id[minus_word]] for minus_word in minus_words]
  9 | 		plus_vecs  = [df[word_to_id[plus_word]] for plus_word in plus_words]
 10 | 	except KeyError as err:
 11 | 		return err, None
 12 | 
 13 | 	result = start_vec
 14 | 
 15 | 	if minus_vecs:
 16 | 		for i, vec in enumerate(minus_vecs):
 17 | 			result = result - vec
 18 | 
 19 | 	if plus_vecs:
 20 | 		for i, vec in enumerate(plus_vecs):
 21 | 			result = result + vec
 22 | 
 23 | 	# result = start_vec - minus_vec + plus_vec
 24 | 	words = [start_word] + minus_words + plus_words
 25 | 	return None, find_nearest(words, result, id_to_word, df, num_results)
 26 | 
 27 | def find_nearest(words, vec, id_to_word, df, num_results, method='cosine'):
 28 | 
 29 | 	if method == 'cosine':
 30 | 		minim = [] # min, index
 31 | 		for i, v in enumerate(df):
 32 | 			# skip the base word, its usually the closest
 33 | 			if id_to_word[i] in words:
 34 | 				continue
 35 | 			dist = cosine(vec, v)
 36 | 			minim.append((dist, i))
 37 | 		minim = sorted(minim, key=lambda v: v[0])
 38 | 		# return list of (word, cosine distance) tuples
 39 | 		return [(id_to_word[minim[i][1]], minim[i][0]) for i in range(num_results)]
 40 | 	else:
 41 | 		raise Exception('{} is not an excepted method parameter'.format(method))
 42 | 
 43 | def parse_expression(expr):
 44 | 
 45 | 	split = expr.split()
 46 | 	start_word = split[0]
 47 | 	minus_words, plus_words = [], []
 48 | 	for i, token in enumerate(split[1:]):
 49 | 		if token == '+':
 50 | 			plus_words.append(split[i + 2])
 51 | 		elif token == '-':
 52 | 			minus_words.append(split[i + 2])
 53 | 	return start_word, minus_words, plus_words
 54 | 
 55 | def process(num_results):
 56 | 	inpt = input('> ')
 57 | 	if inpt == 'exit':
 58 | 		exit()
 59 | 	start_word, minus_words, plus_words = parse_expression(inpt)
 60 | 	err, results = word_arithmetic(start_word=start_word,
 61 | 		                          minus_words=minus_words,
 62 | 		                          plus_words=plus_words,
 63 | 		                          word_to_id=word_to_id,
 64 | 		                          id_to_word=id_to_word,
 65 | 		                          df=df,
 66 | 								  num_results=num_results)
 67 | 	if results:
 68 | 		print()
 69 | 		for res in results:
 70 | 			print(res[0].ljust(15), '     {0:.2f}'.format(res[1]))
 71 | 		print()
 72 | 	else:
 73 | 		print('{} not found in the dataset.'.format(err), file=sys.stderr)
 74 | 
 75 | 
 76 | def parse_args():
 77 | 	parser = argparse.ArgumentParser()
 78 | 	parser.add_argument('--vector_dim', '-d',
 79 | 						type=int,
 80 | 						choices=[50, 100, 200, 300],
 81 | 						default=100,
 82 | 						help='What vector GloVe vector depth to use '
 83 | 						     '(default: 100).')
 84 | 	parser.add_argument('--num_words', '-n',
 85 | 						type=int,
 86 | 						default=10000,
 87 | 						help='The number of lines to read from the GloVe '
 88 | 						     'vector file (default: 10000).')
 89 | 	parser.add_argument('--num_output', '-o',
 90 | 						type=int,
 91 | 						default=1,
 92 | 						help='The number of result words to display (default: 1)')
 93 | 	parser.add_argument('--glove_path', '-i',
 94 | 		                default='data/glove',
 95 | 		                help='GloVe vector file path (default: data/glove)')
 96 | 	return parser.parse_args()
 97 | 
 98 | if __name__ == '__main__':
 99 | 
100 | 	args = parse_args()
101 | 	vector_file = args.glove_path + '/' + 'glove.6B.' + str(args.vector_dim) + 'd.txt'
102 | 
103 | 	if args.num_words > 400000:
104 | 		print('--num_words must be equal to or less than 400,000. Exiting.')
105 | 		exit(1)
106 | 
107 | 	df, labels_array = utils.build_word_vector_matrix(vector_file, args.num_words)
108 | 	word_to_id, id_to_word = utils.get_label_dictionaries(labels_array)
109 | 
110 | 	while True:
111 | 		process(args.num_output)
112 | 


--------------------------------------------------------------------------------
/word_game.py:
--------------------------------------------------------------------------------
  1 | import argparse, utils, sys, readline
  2 | from termcolor import colored
  3 | from scipy.spatial.distance import cosine
  4 | 
  5 | def word_arithmetic(start_word, minus_words, plus_words, word_to_id, id_to_word, df):
  6 | 	'''Returns a word string that is the result of the vector arithmetic'''
  7 | 	try:
  8 | 		start_vec  = df[word_to_id[start_word]]
  9 | 		minus_vecs = [df[word_to_id[minus_word]] for minus_word in minus_words]
 10 | 		plus_vecs  = [df[word_to_id[plus_word]] for plus_word in plus_words]
 11 | 	except KeyError as err:
 12 | 		return err, None
 13 | 
 14 | 	result = start_vec
 15 | 
 16 | 	if minus_vecs:
 17 | 		for i, vec in enumerate(minus_vecs):
 18 | 			result = result - vec
 19 | 
 20 | 	if plus_vecs:
 21 | 		for i, vec in enumerate(plus_vecs):
 22 | 			result = result + vec
 23 | 
 24 | 	return None, result
 25 | 
 26 | def find_nearest(skip_words, vec, id_to_word, df, num_results=1, method='cosine'):
 27 | 
 28 | 	if method == 'cosine':
 29 | 		minim = [] # min, index
 30 | 		for i, v in enumerate(df):
 31 | 			# skip the base word, its usually the closest
 32 | 			if id_to_word[i] in skip_words:
 33 | 				continue
 34 | 			dist = cosine(vec, v)
 35 | 			minim.append((dist, i, v))
 36 | 		minim = sorted(minim, key=lambda v: v[0])
 37 | 		# return list of (word, cosine distance, vector) tuples
 38 | 		return [(id_to_word[minim[i][1]], minim[i][0], minim[i][2]) for i in range(num_results)]
 39 | 	else:
 40 | 		raise Exception('{} is not an excepted method parameter'.format(method))
 41 | 
 42 | def eval_expression(expr, word_to_id, id_to_word, df):
 43 | 	start_word, minus_words, plus_words = parse_expression(expr)
 44 | 	err, vec = word_arithmetic(start_word=start_word,
 45 | 		                          minus_words=minus_words,
 46 | 		                          plus_words=plus_words,
 47 | 		                          word_to_id=word_to_id,
 48 | 		                          id_to_word=id_to_word,
 49 | 		                          df=df)
 50 | 	if err == None:
 51 | 		return vec, [start_word] + minus_words + plus_words # vector, skip words
 52 | 	else:
 53 | 		raise Exception('Error: {} not found in the dataset.'.format(err))
 54 | 
 55 | def parse_expression(expr):
 56 | 
 57 | 	split = expr.split()
 58 | 	start_word = split[0]
 59 | 	minus_words, plus_words = [], []
 60 | 	for i, token in enumerate(split[1:]):
 61 | 		if token == '+':
 62 | 			plus_words.append(split[i + 2])
 63 | 		elif token == '-':
 64 | 			minus_words.append(split[i + 2])
 65 | 	return start_word, minus_words, plus_words
 66 | 
 67 | def parse_args():
 68 | 	parser = argparse.ArgumentParser()
 69 | 	parser.add_argument('--vector_dim', '-d',
 70 | 						type=int,
 71 | 						choices=[50, 100, 200, 300],
 72 | 						default=100,
 73 | 						help='What vector GloVe vector depth to use (default: 100).')
 74 | 	parser.add_argument('--num_words', '-n',
 75 | 						type=int,
 76 | 						default=10000,
 77 | 						help='The number of lines to read from the GloVe vector file (default: 10000).')
 78 | 	parser.add_argument('--soft_score', '-s',
 79 | 						action='store_true',
 80 | 						help='points are scored relative to the distance a '
 81 | 						'player\'s word is from the result of the '
 82 | 						'input expression. This is in contrast to the default '
 83 | 						'1 point per-round scoring system. Soft scoring is '
 84 | 						'recommended for a more fair-and-balanced game experience (default: false)')
 85 | 	parser.add_argument('--glove_path', '-i',
 86 | 		                default='data/glove',
 87 | 		                help='GloVe vector file path (default: data/glove)')
 88 | 	return parser.parse_args()
 89 | 
 90 | def game_setup(args):
 91 | 
 92 | 	gs = {} # game state
 93 | 	gs['players'] = read_players()
 94 | 	gs['winning_score'] = read_winning_score(len(gs['players'].keys()))
 95 | 	gs['turn_number'] = 0
 96 | 	return gs
 97 | 
 98 | def read_players():
 99 | 	players = {}
100 | 	while len(players.keys()) == 0:
101 | 		print('Enter the name of each player, seperated by commas.')
102 | 		names = input('> ').split(',')
103 | 		confirm = input('There are {} players correct? [yes]: '.format(len(names)))
104 | 		if confirm == '' or confirm.lower() == 'yes':
105 | 			for name in names:
106 | 				players[name.strip()] = 0 # start with a  score of zero
107 | 			return players
108 | 
def read_winning_score(num_players):
	'''Ask for the score to play to until a valid one is entered.

	An empty answer selects the default of 10. Anything that is not a positive
	whole number is rejected and the question is asked again.

	num_players: number of players in the game (currently unused).
	Returns the winning score as a positive int.
	'''
	# todo recommend a winning score based on number of players
	while True:
		score = input('What score would you like to play to? [10]: ')
		if score.strip() == '':
			return 10
		try:
			winning_score = int(score)
		except ValueError:
			winning_score = 0
		# a score of zero or less would end the game before the first turn
		if winning_score > 0:
			return winning_score
		print('Invalid score, please try again.')
124 | 
def print_standings(gs):
	'''Print every player's score on one line, with a blank line before and after.'''
	print()
	entries = ['     {}: {}'.format(name, score)
	           for name, score in gs['players'].items()]
	print(''.join(entries))
	print()
132 | 
def turn(gs, word_to_id, id_to_word, df, soft_score):
	'''Play one round: one player sets an expression, every player answers it.

	The answer word is the nearest neighbour of the expression's result
	vector. Each player's answer is rated by its cosine distance to that
	answer word, and the player with the smallest distance wins the round.

	gs:         game state dict; 'turn_number' and the scores in 'players'
	            are updated in place.
	word_to_id: mapping from word to row index in df.
	id_to_word: mapping from row index in df to word.
	df:         the word vectors, one per row.
	soft_score: when false the round winner gets 1 point; when true every
	            player is credited with the cosine similarity of their answer
	            (1 - distance, never below 0), so closer answers earn more.
	'''
	gs['turn_number'] += 1
	names = list(gs['players'].keys())
	# players take turns in order, starting with the first one
	current_player = names[(gs['turn_number'] - 1) % len(names)]
	while True:
		expr = input('{}, please enter a word expression:\n> '.format(current_player))
		try:
			vec, skip_words = eval_expression(expr, word_to_id, id_to_word, df)
		except Exception as err:
			# unknown word or malformed expression: report it and ask again
			print(err)
			continue
		break

	answers = {}
	for name in gs['players']:
		while True:
			word = input('{}, please enter your answer: '.format(name))
			if word in word_to_id:
				answers[name] = df[word_to_id[word]]
				break
			else:
				print('{} is not in the dataset, please enter another word.'.format(word))

	answer_word, answer_dist, answer_vec = find_nearest(skip_words, vec, id_to_word, df)[0]
	# transform answers from vectors to distances
	for k, v in answers.items():
		answers[k] = cosine(v, answer_vec)

	winner = min(answers, key=answers.get)

	if not soft_score:
		gs['players'][winner] += 1
	else:
		for name in answers:
			# the game is won by the highest score, so credit similarity
			# rather than distance
			gs['players'][name] += round(max(0.0, 1 - answers[name]), 2)

	print('Computer says {} = {}'.format(expr, colored(answer_word, 'cyan')))
	print('{} wins this round.'.format(colored(winner, 'green')))
	print_standings(gs)
173 | 
if __name__ == '__main__':
	# Load the vectors, set the game up and play rounds until somebody has
	# reached the winning score.

	args = parse_args()
	vector_file = '{}/glove.6B.{}d.txt'.format(args.glove_path, args.vector_dim)

	df, labels_array = utils.build_word_vector_matrix(vector_file, args.num_words)
	word_to_id, id_to_word = utils.get_label_dictionaries(labels_array)

	gs = game_setup(args)
	scores = gs['players']  # the dict that turn() updates in place

	while max(scores.values()) < gs['winning_score']:
		turn(gs, word_to_id, id_to_word, df, args.soft_score)

	champion = max(scores, key=scores.get)
	print('{} is the winner!'.format(colored(champion, 'green')))
188 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GloVe Experiments
  2 | 
  3 | This repository contains a few brief experiments with [Stanford NLP's GloVe](https://nlp.stanford.edu/projects/glove/), an unsupervised learning algorithm for obtaining vector representations for words. Similar to Word2Vec, GloVe creates a continuous N-dimensional representation of a word that is learned from its surrounding context words in a training corpus. Trained on a large corpus of text, these co-occurrence statistics (an N-dimensional vector embedding) cause semantically similar words to appear near each other in their resulting N-dimensional embedding space (e.g. "dog" and "cat" may appear nearby a region of other pet-related words in the embedding space because the context words that surround both "dog" and "cat" in the training corpus are similar).
  4 | 
  5 | I've created three small python programs for exploring GloVe embeddings:
  6 | 
  7 | - `word_arithmetic.py`: Create word analogy searches using basic arithmetic operations (e.g. `king - man + woman = queen`).
  8 | - `word_game.py`: A small terminal-based multiplayer text game for creating word analogies.
  9 | - `word_clustering.py`: Create [K-Means clusters](https://en.wikipedia.org/wiki/K-means_clustering) using GloVe embeddings. Saves results to JSON.
 10 | 
 11 | All three scripts use the GloVe.6B pre-trained word embeddings created from the combined Wikipedia 2014 and Gigaword 5 datasets. They were trained using 6 billion tokens and contain 400,000 unique lowercase words. Trained embeddings are provided in 50, 100, 200, and 300 dimensions (822 MB download).
 12 | 
 13 | ## Getting Started
 14 | 
 15 | These small experiments can be run in MacOS or Linux environments (sorry ~~not sorry~~ Windoze users). If you'd prefer to run these experiments via Docker jump down to the [Running with Docker](#running-with-docker) section.
 16 | 
 17 | ```bash
 18 | # clone this repo
 19 | git clone https://github.com/brannondorsey/GloVe-experiments.git
 20 | cd GloVe-experiments
 21 | 
 22 | # install python dependencies
 23 | pip3 install -r requirements.txt
 24 | 
 25 | # download the pre-trained embeddings. This might take a while...
 26 | ./download_data.sh
 27 | ```
 28 | 
 29 | ## Word Arithmetic
 30 | 
 31 | `word_arithmetic.py` allows you to write simple +/- arithmetic operations using words to find the closest approximated resulting word from the given word expression. Math operations are applied in the embedding space and a K-nearest-neighbor search is used to display the `K` words closest to the result of the algebraic transformation.
 32 | 
 33 | ```bash
 34 | python3 word_arithmetic.py
 35 | > king - man + woman
 36 | 
 37 | queen                0.22
 38 | ```
 39 | 
 40 | `word - word + word` is the traditional word analogy format, however `word_arithmetic.py` supports any number of `+` or `-` operations provided all words are in the database. The meaning of less traditional expressions, `word + word + word...` is more ambiguous but can lead to interesting results nonetheless. Specifying an order of operations is not supported at this time (e.g. `(word - word) + word`).
 41 | 
 42 | By default, `word_arithmetic.py` loads the 10,000 most frequently used words from the dataset and uses a 100-dimensional embedding vector. It also prints only the single nearest word to the resulting vector point from the expression (the "nearest neighbor"). You can specify your own values for each of these parameters if you would like:
 43 | 
 44 | ```bash
 45 | python3 word_arithmetic.py --num_words 100000 --vector_dim 300 --num_output 10
 46 | > king - man + woman
 47 | 
 48 | queen                0.31
 49 | monarch              0.44
 50 | throne               0.44
 51 | princess             0.45
 52 | mother               0.49
 53 | daughter             0.49
 54 | kingdom              0.50
 55 | prince               0.50
 56 | elizabeth            0.51
 57 | wife                 0.52
 58 | ```
 59 | 
 60 | Increasing `--num_words` and `--vector_dim` increases the number of usable words in the dictionary and accuracy of the resulting word expressions respectively. Increasing either will increase the processing time for each expression as well as the memory requirements needed to run the program.
 61 | 
 62 | ```
 63 | usage: word_arithmetic.py [-h] [--vector_dim {50,100,200,300}]
 64 |                           [--num_words NUM_WORDS] [--num_output NUM_OUTPUT]
 65 |                           [--glove_path GLOVE_PATH]
 66 | 
 67 | optional arguments:
 68 |   -h, --help            show this help message and exit
 69 |   --vector_dim {50,100,200,300}, -d {50,100,200,300}
 70 |                         What vector GloVe vector depth to use (default: 100).
 71 |   --num_words NUM_WORDS, -n NUM_WORDS
 72 |                         The number of lines to read from the GloVe vector file
 73 |                         (default: 10000).
 74 |   --num_output NUM_OUTPUT, -o NUM_OUTPUT
 75 |                         The number of result words to display (default: 1)
 76 |   --glove_path GLOVE_PATH, -i GLOVE_PATH
 77 |                         GloVe vector file path (default: data/glove)
 78 | ```
 79 | 
 80 | ## Word Game
 81 | 
 82 | `word_game.py` is a small text-based multiplayer game where players take turns creating and answering `word_arithmetic.py`-style word expressions. Players win points when they propose a solution word to a word expression that is nearest to the answer word out of all players' guesses.
 83 | 
 84 | ```
 85 | Enter the name of each player, seperated by commas.
 86 | > bob, alice
 87 | There are 2 players correct? [yes]: yes
 88 | What score would you like to play to? [10]: 10
 89 | alice, please enter a word expression:
 90 | > home - earth + space
 91 | alice, please enter your answer: rocket
 92 | bob, please enter your answer: moon
 93 | Computer says home - earth + space = office
 94 | bob wins this round.
 95 | 
 96 |      alice: 0     bob: 1
 97 | 
 98 | bob, please enter a word expression:
 99 | >
100 | ```
101 | 
102 | The game is far from perfect, and the automated judging can be aggravating at times (try with `--soft_score`), but it can lead to some fun times given the right crowd 💻🍻🎉. Increase the dictionary size and vector dimensions for best results:
103 | 
104 | ```bash
105 | python3 word_game.py --vector_dim 200 --num_words 100000 --soft_score
106 | ```
107 | 
108 | ```
109 | usage: word_game.py [-h] [--vector_dim {50,100,200,300}]
110 |                     [--num_words NUM_WORDS] [--soft_score]
111 |                     [--glove_path GLOVE_PATH]
112 | 
113 | optional arguments:
114 |   -h, --help            show this help message and exit
115 |   --vector_dim {50,100,200,300}, -d {50,100,200,300}
116 |                         What vector GloVe vector depth to use (default: 100).
117 |   --num_words NUM_WORDS, -n NUM_WORDS
118 |                         The number of lines to read from the GloVe vector file
119 |                         (default: 10000).
120 |   --soft_score, -s      points are scored relative to the distance a player's
121 |                         word is from the result of the input expression. This
122 |                         is in contrast to the default 1 point per-round
123 |                         scoring system. Soft scoring is recommended for a more
124 |                         fair-and-balanced game experience (default: false)
125 |   --glove_path GLOVE_PATH, -i GLOVE_PATH
126 |                         GloVe vector file path (default: data/glove)
127 | ```
128 | 
129 | ## Word Clustering
130 | 
131 | `word_clustering.py` uses unsupervised learning to cluster words into related groups using K-Means.
132 | 
133 | ```bash
134 | python3 word_clustering.py
135 | No cached cluster found. Clustering using K-Means...
136 | Saved 1000 clusters to data/cache/100D_10000-words_1000-clusters.json. Cached for later use.
137 | CLUSTER 1: athens, stockholm, oslo, helsinki
138 | CLUSTER 2: long, short, longer, normal, usual, periods, lengthy, shorter, duration
139 | CLUSTER 3: current, term, future, key, position, primary, internal, existing, core, external
140 | CLUSTER 4: newton, luther, canon
141 | CLUSTER 5: ball, pitch, catch, throw, balls, swing, bat, kicked, opener, slip, spell, foul, knock, pitches, toss, kicking, bounced, kicks, scoreboard, bounce
142 | CLUSTER 6: popular, famous, prominent, notable, influential, renowned, well-known, famed, acclaimed, finest
143 | CLUSTER 7: affected, affect, affecting, affects
144 | CLUSTER 8: assassination, murdered, slain, assassinated
145 | CLUSTER 9: jordan, carter, jimmy
146 | CLUSTER 10: 1999, 1994, 1995, 1993, 1992, 1991, 1990, 1989, 1988, 1986, 1987, 1984, 1980, 1985, 1979, 1983, 1982, 1981
147 | CLUSTER 11: alongside, joining, touring, completing, toured, thereafter, whilst, filming, assignment, boarding, stint
148 | CLUSTER 12: 10, 20, 15, 30, 11, 12, 18, 25, 14, 13, 16, 17, 24, 19, 22, 21, 23, 26, 28, 27, 31, 29
149 | CLUSTER 13: support, provide, aid, access, provided, additional, offers, relief, provides, assistance, providing, funding
150 | CLUSTER 14: communist, regime, dictator, suharto, dictatorship, communism, monarchy
151 | ...
152 | --- 28.54 seconds ---
153 | ```
154 | 
155 | Clusters are printed to the screen and also saved as JSON arrays in `data/cache`. By default, the script clusters the 10,000 most-common words from GloVe.6B into 1,000 clusters using 100-D vector embeddings. This can be changed like so:
156 | 
157 | ```bash
158 | # note: this will take a *long* time to run...
159 | python3 word_clustering.py --num_words 100000 --num_clusters 10000 --vector_dim 300
160 | ```
161 | 
162 | ```
163 | usage: word_clustering.py [-h] [--vector_dim {50,100,200,300}]
164 |                           [--num_words NUM_WORDS]
165 |                           [--num_clusters NUM_CLUSTERS] [--n_jobs N_JOBS]
166 |                           [--glove_path GLOVE_PATH]
167 | 
168 | optional arguments:
169 |   -h, --help            show this help message and exit
170 |   --vector_dim {50,100,200,300}, -d {50,100,200,300}
171 |                         What vector GloVe vector dimension to use (default:
172 |                         100).
173 |   --num_words NUM_WORDS, -n NUM_WORDS
174 |                         The number of lines to read from the GloVe vector file
175 |                         (default: 10000).
176 |   --num_clusters NUM_CLUSTERS, -k NUM_CLUSTERS
177 |                         Number of resulting word clusters. The number of K in
178 |                         K-Means (default: 1000).
179 |   --n_jobs N_JOBS, -j N_JOBS
180 |                         Number of cores to use when fitting K-Means. -1 = all
181 |                         cores. More cores = less time, more memory (default:
182 |                         -1).
183 |   --glove_path GLOVE_PATH, -i GLOVE_PATH
184 |                         GloVe vector file path (default: data/glove)
185 | ```
186 | 
187 | ## Running with Docker
188 | 
189 | These experiments, and the GloVe data they use, are available via a Docker image on Docker Hub. If you have Docker installed on your machine you can pull the images and run them inside of containers instead of installing them on your host machine.
190 | 
191 | ```bash
192 | docker run --rm -it brannondorsey/glove-experiments python word_arithmetic.py
193 | docker run --rm -it brannondorsey/glove-experiments python word_game.py
194 | docker run --rm -it brannondorsey/glove-experiments python word_clustering.py
195 | ```
196 | 
197 | These images have been built for 64-bit x86 CPU architectures.
198 | 
199 | ## License and Attribution
200 | 
201 | All code is released under an [MIT license](LICENSE). You are free to copy, edit, share, or sell it under those terms.
202 | 
203 | ### GloVe citation
204 | 
205 | Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/pubs/glove.pdf).
206 | 
207 | ```
208 | @inproceedings{pennington2014glove,
209 |   author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
210 |   booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
211 |   title = {GloVe: Global Vectors for Word Representation},
212 |   year = {2014},
213 |   pages = {1532--1543},
214 |   url = {http://www.aclweb.org/anthology/D14-1162},
215 | }
216 | ```
217 | 


--------------------------------------------------------------------------------