├── data ├── cache │ └── .gitkeep └── glove │ └── .gitkeep ├── .dockerignore ├── requirements.txt ├── .gitignore ├── download_data.sh ├── Dockerfile ├── LICENSE ├── utils.py ├── word_clustering.py ├── word_arithmetic.py ├── word_game.py └── README.md /data/cache/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/glove/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | venv/ 3 | *.pyc 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.16.3 2 | pandas==0.24.2 3 | scikit-learn==0.20.3 4 | scipy==1.2.1 5 | termcolor==1.1.0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | data/ 3 | !data/cache/.gitkeep 4 | !data/glove/.gitkeep 5 | venv/ 6 | .vscode/ 7 | *.pyc 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -L http://www-nlp.stanford.edu/data/glove.6B.zip -o data/glove.6B.zip 4 | unzip data/glove.6B.zip -d data/glove 5 | rm data/glove.6B.zip 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim-stretch 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends curl unzip git 
build-essential && \ 5 | git clone https://github.com/brannondorsey/glove-experiments && \ 6 | cd glove-experiments && \ 7 | pip install -r requirements.txt && \ 8 | ./download_data.sh 9 | 10 | WORKDIR /glove-experiments 11 | CMD python word_arithmetic.py 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Brannon Dorsey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
# /utils.py:
import codecs
import json
from numbers import Number

import numpy as np


class autovivify_list(dict):
    '''Serializable/Pickleable class to replicate the functionality of
    collections.defaultdict(list).

    Looking up a missing key stores a new empty list under that key and
    returns it.
    '''

    def __missing__(self, key):
        value = self[key] = []
        return value

    def __add__(self, x):
        '''Override addition for numeric types when self is empty.

        Returns x unchanged. Raises ValueError when self is not empty or
        x is not a Number.
        '''
        if not self and isinstance(x, Number):
            return x
        raise ValueError

    def __sub__(self, x):
        '''Also provide subtraction: returns -x when self is empty.

        Raises ValueError when self is not empty or x is not a Number.
        '''
        if not self and isinstance(x, Number):
            return -1 * x
        raise ValueError


def build_word_vector_matrix(vector_file, n_words):
    '''Read a GloVe text file and return its vectors and labels.

    Each non-blank line is split on whitespace: the first field is the
    word, the remaining fields are parsed as floats.

    vector_file: path of the UTF-8 encoded vector file.
    n_words: stop after this many words. A value below 1 never matches
        the stop condition, so the whole file is read.

    Returns (vectors, labels) where vectors is a numpy array with one row
    per word and labels is the list of words in file order.
    '''
    vectors = []
    labels_array = []

    with codecs.open(vector_file, 'r', 'utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # a blank line has no label; skip it instead of raising
                continue
            labels_array.append(fields[0])
            vectors.append(np.array([float(value) for value in fields[1:]]))
            if len(labels_array) == n_words:
                break
    return np.array(vectors), labels_array


def get_cache_filename_from_args(args):
    '''Build the cache file name from args.vector_dim, args.num_words and
    args.num_clusters.'''
    a = (args.vector_dim, args.num_words, args.num_clusters)
    return '{}D_{}-words_{}-clusters.json'.format(*a)


def get_label_dictionaries(labels_array):
    '''Return (word_to_id, id_to_word) for the given list of labels.

    Ids are the positions in labels_array. If a label occurs more than
    once, word_to_id keeps the last position.
    '''
    id_to_word = dict(enumerate(labels_array))
    word_to_id = {word: idx for idx, word in id_to_word.items()}
    return word_to_id, id_to_word


def save_json(filename, results):
    '''Write results to filename as JSON.'''
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f)


def load_json(filename):
    '''Return the object stored as JSON in filename.'''
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# (next file in this dump) /word_clustering.py:
# /word_clustering.py:
# Notes for extension of script:
# - Use readline() to interactively search for word groups
# - On a word miss, use L2 or cosine distance to select the nearest word vector
#   - This would require all 6B tokens to be loaded in RAM (but not clustered)
#   - Or use levenshtein distance assuming the word is spelled the same.
# - Provide an interface to perform basic arithmetic on words (king - man + woman = queen)
# Look at this result from 2014 English Wikipedia:
#   'islamic', 'militant', 'islam', 'radical', 'extremists', 'islamist', 'extremist', 'outlawed'
#   'war' - 'violence' + 'peace' = 'treaty' | 300d

from sklearn.cluster import KMeans
from numbers import Number
from pandas import DataFrame
import numpy as np
import os, sys, codecs, argparse, pprint, time
from utils import *
from word_arithmetic import *


def find_word_clusters(labels_array, cluster_labels):
    '''Group words by cluster id.

    labels_array[n] is the word whose cluster id is cluster_labels[n].
    Returns an autovivify_list mapping each cluster id to its words, in
    input order.
    '''
    cluster_to_words = autovivify_list()
    for word, cluster_id in zip(labels_array, cluster_labels):
        cluster_to_words[cluster_id].append(word)
    return cluster_to_words


def parse_args():
    '''Parse the command line options of the clustering script.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--vector_dim', '-d',
                        type=int,
                        choices=[50, 100, 200, 300],
                        default=100,
                        help='What vector GloVe vector dimension to use '
                             '(default: 100).')
    parser.add_argument('--num_words', '-n',
                        type=int,
                        default=10000,
                        help='The number of lines to read from the GloVe '
                             'vector file (default: 10000).')
    parser.add_argument('--num_clusters', '-k',
                        default=1000,
                        type=int,
                        help='Number of resulting word clusters. '
                             'The number of K in K-Means (default: 1000).')
    parser.add_argument('--n_jobs', '-j',
                        type=int,
                        default=-1,
                        help='Number of cores to use when fitting K-Means. '
                             '-1 = all cores. '
                             'More cores = less time, more memory (default: -1).')
    parser.add_argument('--glove_path', '-i',
                        default='data/glove',
                        help='GloVe vector file path (default: data/glove)')
    return parser.parse_args()


if __name__ == '__main__':

    args = parse_args()

    path = 'data/cache/{}'.format(get_cache_filename_from_args(args))
    start_time = time.time()

    if os.path.isfile(path):
        # this kmeans fitting has already been cached; the GloVe vectors
        # are not needed, so they are not loaded at all
        print('Cached K-Means cluster found, loading from disk.')
        cluster_to_words = load_json(path)
    else:
        # these are clustering parameters we've never seen before
        vector_file = os.path.join(
            args.glove_path, 'glove.6B.{}d.txt'.format(args.vector_dim))
        df, labels_array = build_word_vector_matrix(vector_file, args.num_words)

        print('No cached cluster found. Clustering using K-Means... ')
        # requirements.txt pins scikit-learn==0.20.3.
        # NOTE(review): newer scikit-learn releases dropped KMeans' n_jobs
        # argument -- confirm before upgrading the pin.
        kmeans_model = KMeans(init='k-means++', n_clusters=args.num_clusters,
                              n_jobs=args.n_jobs, n_init=10)
        kmeans_model.fit(df)

        cluster_labels = kmeans_model.labels_
        cluster_to_words = list(
            find_word_clusters(labels_array, cluster_labels).values())

        # cache these clustering results
        save_json(path, cluster_to_words)
        print('Saved {} clusters to {}. Cached for later use.'.format(
            len(cluster_to_words), path))

    for i, words in enumerate(cluster_to_words):
        print('CLUSTER {}: {}'.format(i + 1, ', '.join(words)))

    print("--- {:.2f} seconds ---".format((time.time() - start_time)))

# --------------------------------------------------------------------------------
# /word_arithmetic.py:
# --------------------------------------------------------------------------------
import argparse, utils, sys, readline
from scipy.spatial.distance import cosine


def word_arithmetic(start_word, minus_words, plus_words, word_to_id, id_to_word, df, num_results=5):
    '''Evaluate start_word - minus_words + plus_words in vector space.

    Returns (None, results) on success, where results is the list produced
    by find_nearest for the resulting vector. Returns (KeyError, None) when
    one of the words is missing from word_to_id.
    '''
    try:
        result = df[word_to_id[start_word]]
        minus_vecs = [df[word_to_id[minus_word]] for minus_word in minus_words]
        plus_vecs = [df[word_to_id[plus_word]] for plus_word in plus_words]
    except KeyError as err:
        return err, None

    # result = start_vec - minus_vec + plus_vec
    for vec in minus_vecs:
        result = result - vec
    for vec in plus_vecs:
        result = result + vec

    words = [start_word] + minus_words + plus_words
    return None, find_nearest(words, result, id_to_word, df, num_results)


def find_nearest(words, vec, id_to_word, df, num_results, method='cosine'):
    '''Return the num_results words whose vectors are closest to vec.

    words: words to leave out of the search (the words of the expression
        itself are usually the closest).
    Returns a list of (word, cosine distance) tuples, nearest first. The
    list is shorter than num_results when fewer candidates exist.
    Raises ValueError for any method other than 'cosine'.
    '''
    if method != 'cosine':
        raise ValueError('{} is not an excepted method parameter'.format(method))

    skip = set(words)
    distances = [(cosine(vec, v), i)
                 for i, v in enumerate(df)
                 if id_to_word[i] not in skip]
    distances.sort(key=lambda pair: pair[0])
    # slicing (instead of indexing range(num_results)) cannot run past the
    # end of the candidate list
    return [(id_to_word[i], dist) for dist, i in distances[:max(num_results, 0)]]


def parse_expression(expr):
    '''Split "word - word + word ..." into (start_word, minus_words, plus_words).

    A token following '+' is a plus word, a token following '-' is a minus
    word; an operator with nothing after it is ignored.
    Raises ValueError when expr contains no tokens.
    '''
    tokens = expr.split()
    if not tokens:
        raise ValueError('Empty expression, please enter at least one word.')

    minus_words, plus_words = [], []
    # pair every token after the start word with the token that follows it
    for token, operand in zip(tokens[1:], tokens[2:]):
        if token == '+':
            plus_words.append(operand)
        elif token == '-':
            minus_words.append(operand)
    return tokens[0], minus_words, plus_words


def process(num_results):
    '''Read one expression from stdin and print its nearest words.

    Uses the module-level word_to_id, id_to_word and df that the
    __main__ section sets up. Typing "exit" (or sending EOF) ends the
    program.
    '''
    try:
        inpt = input('> ')
    except EOFError:
        print()
        sys.exit()
    if inpt == 'exit':
        sys.exit()

    try:
        start_word, minus_words, plus_words = parse_expression(inpt)
    except ValueError as err:
        print(err, file=sys.stderr)
        return

    err, results = word_arithmetic(start_word=start_word,
                                   minus_words=minus_words,
                                   plus_words=plus_words,
                                   word_to_id=word_to_id,
                                   id_to_word=id_to_word,
                                   df=df,
                                   num_results=num_results)
    if err is not None:
        print('{} not found in the dataset.'.format(err), file=sys.stderr)
        return

    print()
    for word, dist in results:
        print(word.ljust(15), ' {0:.2f}'.format(dist))
    print()


def parse_args():
    '''Parse the command line options of the word arithmetic script.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--vector_dim', '-d',
                        type=int,
                        choices=[50, 100, 200, 300],
                        default=100,
                        help='What vector GloVe vector depth to use '
                             '(default: 100).')
    parser.add_argument('--num_words', '-n',
                        type=int,
                        default=10000,
                        help='The number of lines to read from the GloVe '
                             'vector file (default: 10000).')
    parser.add_argument('--num_output', '-o',
                        type=int,
                        default=1,
                        help='The number of result words to display (default: 1)')
    parser.add_argument('--glove_path', '-i',
                        default='data/glove',
                        help='GloVe vector file path (default: data/glove)')
    return parser.parse_args()


if __name__ == '__main__':

    args = parse_args()
    vector_file = args.glove_path + '/' + 'glove.6B.' + str(args.vector_dim) + 'd.txt'

    if args.num_words > 400000:
        print('--num_words must be equal to or less than 400,000. Exiting.')
        sys.exit(1)

    # these stay module-level names on purpose: process() reads them
    df, labels_array = utils.build_word_vector_matrix(vector_file, args.num_words)
    word_to_id, id_to_word = utils.get_label_dictionaries(labels_array)

    while True:
        process(args.num_output)

# --------------------------------------------------------------------------------
# /word_game.py:
# --------------------------------------------------------------------------------
import argparse, utils, sys, readline
from termcolor import colored
from scipy.spatial.distance import cosine


def word_arithmetic(start_word, minus_words, plus_words, word_to_id, id_to_word, df):
    '''Evaluate start_word - minus_words + plus_words in vector space.

    Returns (None, vector) on success and (KeyError, None) when one of the
    words is missing from word_to_id.
    '''
    try:
        result = df[word_to_id[start_word]]
        minus_vecs = [df[word_to_id[minus_word]] for minus_word in minus_words]
        plus_vecs = [df[word_to_id[plus_word]] for plus_word in plus_words]
    except KeyError as err:
        return err, None

    for vec in minus_vecs:
        result = result - vec
    for vec in plus_vecs:
        result = result + vec

    return None, result


def find_nearest(skip_words, vec, id_to_word, df, num_results=1, method='cosine'):
    '''Return the num_results words whose vectors are closest to vec.

    skip_words: words to leave out of the search.
    Returns a list of (word, cosine distance, vector) tuples, nearest
    first. The list is shorter than num_results when fewer candidates
    exist. Raises ValueError for any method other than 'cosine'.
    '''
    if method != 'cosine':
        raise ValueError('{} is not an excepted method parameter'.format(method))

    skip = set(skip_words)
    candidates = [(cosine(vec, v), i, v)
                  for i, v in enumerate(df)
                  if id_to_word[i] not in skip]
    # sort on the distance only, the vectors themselves are not comparable
    candidates.sort(key=lambda candidate: candidate[0])
    return [(id_to_word[i], dist, v)
            for dist, i, v in candidates[:max(num_results, 0)]]

def eval_expression(expr, word_to_id, id_to_word, df):
    '''Evaluate a word expression.

    Returns (vector, skip_words) where skip_words are all the words used
    in the expression. Raises Exception when a word is not in word_to_id,
    and ValueError (from parse_expression) when expr is empty.
    '''
    start_word, minus_words, plus_words = parse_expression(expr)
    err, vec = word_arithmetic(start_word=start_word,
                               minus_words=minus_words,
                               plus_words=plus_words,
                               word_to_id=word_to_id,
                               id_to_word=id_to_word,
                               df=df)
    if err is not None:
        raise Exception('Error: {} not found in the dataset.'.format(err))
    return vec, [start_word] + minus_words + plus_words  # vector, skip words


def parse_expression(expr):
    '''Split "word - word + word ..." into (start_word, minus_words, plus_words).

    A token following '+' is a plus word, a token following '-' is a minus
    word; an operator with nothing after it is ignored.
    Raises ValueError when expr contains no tokens.
    '''
    tokens = expr.split()
    if not tokens:
        raise ValueError('Empty expression, please enter at least one word.')

    minus_words, plus_words = [], []
    # pair every token after the start word with the token that follows it
    for token, operand in zip(tokens[1:], tokens[2:]):
        if token == '+':
            plus_words.append(operand)
        elif token == '-':
            minus_words.append(operand)
    return tokens[0], minus_words, plus_words


def parse_args():
    '''Parse the command line options of the word game.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--vector_dim', '-d',
                        type=int,
                        choices=[50, 100, 200, 300],
                        default=100,
                        help='What vector GloVe vector depth to use (default: 100).')
    parser.add_argument('--num_words', '-n',
                        type=int,
                        default=10000,
                        help='The number of lines to read from the GloVe vector file (default: 10000).')
    parser.add_argument('--soft_score', '-s',
                        action='store_true',
                        help='points are scored relative to the distance a '
                             'player\'s word is from the result of the '
                             'input expression. This is in contrast to the default '
                             '1 point per-round scoring system. Soft scoring is '
                             'recommended for a more fair-and-balanced game experience (default: false)')
    parser.add_argument('--glove_path', '-i',
                        default='data/glove',
                        help='GloVe vector file path (default: data/glove)')
    return parser.parse_args()


def game_setup(args):
    '''Ask for the players and the winning score; return the game state dict.'''
    gs = {}  # game state
    gs['players'] = read_players()
    gs['winning_score'] = read_winning_score(len(gs['players'].keys()))
    gs['turn_number'] = 0
    return gs


def read_players():
    '''Prompt until at least one player name is confirmed.

    Returns a dict mapping each player name to a starting score of zero.
    Empty names are dropped and repeated names are counted once.
    '''
    players = {}
    while not players:
        print('Enter the name of each player, seperated by commas.')
        entered = (name.strip() for name in input('> ').split(','))
        # dict.fromkeys removes duplicates while keeping the entry order
        names = list(dict.fromkeys(name for name in entered if name))
        if not names:
            continue
        confirm = input('There are {} players correct? [yes]: '.format(len(names)))
        if confirm == '' or confirm.lower() == 'yes':
            for name in names:
                players[name] = 0  # start with a score of zero
    return players


def read_winning_score(num_players):
    '''Prompt until a positive whole-number winning score is entered.

    An empty answer selects the default of 10.
    '''
    # todo recommend a winning score based on number of players
    while True:
        score = input('What score would you like to play to? [10]: ')
        if score == '':
            return 10
        try:
            winning_score = int(score)
        except ValueError:
            winning_score = 0
        if winning_score > 0:
            return winning_score
        print('Invalid score, please try again.')


def print_standings(gs):
    '''Print every player's current score on one line.'''
    print()
    print(''.join(' {}: {}'.format(name, score)
                  for name, score in gs['players'].items()))
    print()


def round_points(distances, soft_score):
    '''Return the points each player earns for one round.

    distances: dict of player name -> cosine distance between the player's
        word and the computer's answer word (smaller is better).
    soft_score: when false, the closest player earns 1 point and everyone
        else 0. When true, every player earns 1 - distance, floored at 0
        and rounded to two decimals, so closer answers earn more.
    '''
    if not soft_score:
        winner = min(distances, key=distances.get)
        return {name: int(name == winner) for name in distances}
    return {name: round(max(0.0, 1.0 - dist), 2)
            for name, dist in distances.items()}


def turn(gs, word_to_id, id_to_word, df, soft_score):
    '''Play one round and update the scores in gs.

    The current player enters an expression, every player enters an answer
    word, and the answers are ranked by cosine distance to the word
    find_nearest picks for the expression.
    '''
    gs['turn_number'] += 1
    names = list(gs['players'].keys())
    current_player = names[(gs['turn_number'] - 1) % len(names)]

    while True:
        expr = input('{}, please enter a word expression:\n> '.format(current_player))
        try:
            vec, skip_words = eval_expression(expr, word_to_id, id_to_word, df)
        except Exception as err:
            # unknown word or empty expression: report it and ask again
            print(err)
        else:
            break

    answers = {}
    for name in gs['players']:
        while True:
            word = input('{}, please enter your answer: '.format(name))
            if word in word_to_id:
                answers[name] = df[word_to_id[word]]
                break
            print('{} is not in the dataset, please another word.'.format(word))

    answer_word, answer_dist, answer_vec = find_nearest(skip_words, vec, id_to_word, df)[0]
    # transform answers from vectors to distances
    distances = {name: cosine(v, answer_vec) for name, v in answers.items()}

    winner = min(distances, key=distances.get)
    for name, points in round_points(distances, soft_score).items():
        gs['players'][name] += points

    print('Computer says {} = {}'.format(expr, colored(answer_word, 'cyan')))
    print('{} wins this round.'.format(colored(winner, 'green')))
    print_standings(gs)


if __name__ == '__main__':

    args = parse_args()
    vector_file = args.glove_path + '/' + 'glove.6B.' + str(args.vector_dim) + 'd.txt'

    df, labels_array = utils.build_word_vector_matrix(vector_file, args.num_words)
    word_to_id, id_to_word = utils.get_label_dictionaries(labels_array)

    gs = game_setup(args)

    while max(gs['players'].values()) < gs['winning_score']:
        turn(gs, word_to_id, id_to_word, df, args.soft_score)

    print('{} is the winner!'.format(colored(max(gs['players'], key=gs['players'].get), 'green')))

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # GloVe Experiments
#
# This repository contains a few brief experiments with [Stanford NLP's GloVe](https://nlp.stanford.edu/projects/glove/), an unsupervised learning algorithm for obtaining vector representations for words. Similar to Word2Vec, GloVe creates a continuous N-dimensional representation of a word that is learned from its surrounding context words in a training corpus. Trained on a large corpus of text, these co-occurrence statistics (an N-dimensional vector embedding) cause semantically similar words to appear near each-other in their resulting N-dimensional embedding space (e.g. "dog" and "cat" may appear nearby a region of other pet related words in the embedding space because the context words that surround both "dog" and "cat" in the training corpus are similar).
#
# I've created three small python programs for exploring GloVe embeddings:
#
# - `word_arithmetic.py`: Create word analogy searches using basic arithmetic operations (e.g. `king - man + woman = queen`).
# - `word_game.py`: A small terminal-based multiplayer text game for creating word analogies.
# - `word_clustering.py`: Create [K-Means clusters](https://en.wikipedia.org/wiki/K-means_clustering) using GloVe embeddings. Saves results to JSON.
10 | 11 | All three scripts use the GloVe.6B pre-trained word embeddings created from the combined Wikipedia 2014 and Gigaword 5 datasets. They were trained using 6 billion tokens and contain 400,000 unique lowercase words. Trained embeddings are provided in 50, 100, 200, and 300 dimensions (822 MB download). 12 | 13 | ## Getting Started 14 | 15 | These small experiments can be run in MacOS or Linux environments (sorry ~~not sorry~~ Windoze users). If you'd prefer to run these experiments via Docker jump down to the [Running with Docker](#running-with-docker) section. 16 | 17 | ```bash 18 | # clone this repo 19 | git clone https://github.com/brannondorsey/GloVe-experiments.git 20 | cd GloVe-experiments 21 | 22 | # install python dependencies 23 | pip3 install -r requirements.txt 24 | 25 | # download the pre-trained embeddings. This might take a while... 26 | ./download_data.sh 27 | ``` 28 | 29 | ## Word Arithmetic 30 | 31 | `word_arithmetic.py` allows you to write simple +/- arithmetic operations using words to find the closest approximated resulting word from the given word expression. Math operations are applied in the embedding space and a K-nearest-neighbor search is used to display the `K` words closest to the result of the algebraic transformation. 32 | 33 | ```bash 34 | python3 word_arithmetic.py 35 | > king - man + woman 36 | 37 | queen 0.22 38 | ``` 39 | 40 | `word - word + word` is the traditional word analogy format, however `word_arithmetic.py` supports any number of `+` or `-` operations provided all words are in the database. The meaning of less traditional expressions, `word + word + word...` is more ambiguous but can lead to interesting results nonetheless. Specifying an order of operations is not supported at this time (e.g. `(word - word) + word`). 41 | 42 | By default, `word_arithmetic.py` loads the 10,000 most frequently used words from the dataset and uses a 100-dimensional embedding vector. 
It also prints only the single nearest word to the resulting vector point from the expression (the "nearest neighbor"). You can specify your own values for each of these parameters if you would like: 43 | 44 | ```bash 45 | python3 word_arithmetic.py --num_words 100000 --vector_dim 300 --num_output 10 46 | > king - man + woman 47 | 48 | queen 0.31 49 | monarch 0.44 50 | throne 0.44 51 | princess 0.45 52 | mother 0.49 53 | daughter 0.49 54 | kingdom 0.50 55 | prince 0.50 56 | elizabeth 0.51 57 | wife 0.52 58 | ``` 59 | 60 | Increasing `--num_words` and `--vector_dim` increases the number of usable words in the dictionary and accuracy of the resulting word expressions respectively. Increasing either will increase the processing time for each expression as well as the memory requirements needed to run the program. 61 | 62 | ``` 63 | usage: word_arithmetic.py [-h] [--vector_dim {50,100,200,300}] 64 | [--num_words NUM_WORDS] [--num_output NUM_OUTPUT] 65 | [--glove_path GLOVE_PATH] 66 | 67 | optional arguments: 68 | -h, --help show this help message and exit 69 | --vector_dim {50,100,200,300}, -d {50,100,200,300} 70 | What vector GloVe vector depth to use (default: 100). 71 | --num_words NUM_WORDS, -n NUM_WORDS 72 | The number of lines to read from the GloVe vector file 73 | (default: 10000). 74 | --num_output NUM_OUTPUT, -o NUM_OUTPUT 75 | The number of result words to display (default: 1) 76 | --glove_path GLOVE_PATH, -i GLOVE_PATH 77 | GloVe vector file path 78 | ``` 79 | 80 | ## Word Game 81 | 82 | `word_game.py` is a small text-based multiplayer game where players take turns creating and answering `word_arithmetic.py`-style word expressions. Players win points when they propose a solution word to a word expression that is nearest to the answer word out of all players guesses. 83 | 84 | ``` 85 | Enter the name of each player, seperated by commas. 86 | > bob, alice 87 | There are 2 players correct? [yes]: yes 88 | What score would you like to play to? 
[10]: 10 89 | alice, please enter a word expression: 90 | > home - earth + space 91 | alice, please enter your answer: rocket 92 | bob, please enter your answer: moon 93 | Computer says home - earth + space = office 94 | bob wins this round. 95 | 96 | alice: 0 bob: 1 97 | 98 | bob, please enter a word expression: 99 | > 100 | ``` 101 | 102 | The game is far from perfect, and the automated judging can be aggravating at times (try with `--soft_score`), but it can lead to some fun times given the right crowd 💻🍻🎉. Increase the dictionary size and vector dimensions for best results: 103 | 104 | ```bash 105 | python3 word_game.py --vector_dim 200 --num_words 100000 --soft_score 106 | ``` 107 | 108 | ``` 109 | usage: word_game.py [-h] [--vector_dim {50,100,200,300}] 110 | [--num_words NUM_WORDS] [--soft_score] 111 | [--glove_path GLOVE_PATH] 112 | 113 | optional arguments: 114 | -h, --help show this help message and exit 115 | --vector_dim {50,100,200,300}, -d {50,100,200,300} 116 | What vector GloVe vector depth to use (default: 100). 117 | --num_words NUM_WORDS, -n NUM_WORDS 118 | The number of lines to read from the GloVe vector file 119 | (default: 10000). 120 | --soft_score, -s points are scored relative to the distance a player's 121 | word is from the result of the input expression. This 122 | is in contrast to the default 1 point per-round 123 | scoring system. Soft scoring is recommended for a more 124 | fair-and-balanced game experience (default: false) 125 | --glove_path GLOVE_PATH, -i GLOVE_PATH 126 | GloVe vector file path (default: data/glove) 127 | ``` 128 | 129 | ## Word Clustering 130 | 131 | `word_clustering.py` uses unsupervised learning to cluster words into related groups using K-Means. 132 | 133 | ```bash 134 | python3 word_clustering.py 135 | No cached cluster found. Clustering using K-Means... 136 | Saved 1000 clusters to data/cache/100D_10000-words_1000-clusters.json. Cached for later use. 
137 | CLUSTER 1: athens, stockholm, oslo, helsinki 138 | CLUSTER 2: long, short, longer, normal, usual, periods, lengthy, shorter, duration 139 | CLUSTER 3: current, term, future, key, position, primary, internal, existing, core, external 140 | CLUSTER 4: newton, luther, canon 141 | CLUSTER 5: ball, pitch, catch, throw, balls, swing, bat, kicked, opener, slip, spell, foul, knock, pitches, toss, kicking, bounced, kicks, scoreboard, bounce 142 | CLUSTER 6: popular, famous, prominent, notable, influential, renowned, well-known, famed, acclaimed, finest 143 | CLUSTER 7: affected, affect, affecting, affects 144 | CLUSTER 8: assassination, murdered, slain, assassinated 145 | CLUSTER 9: jordan, carter, jimmy 146 | CLUSTER 10: 1999, 1994, 1995, 1993, 1992, 1991, 1990, 1989, 1988, 1986, 1987, 1984, 1980, 1985, 1979, 1983, 1982, 1981 147 | CLUSTER 11: alongside, joining, touring, completing, toured, thereafter, whilst, filming, assignment, boarding, stint 148 | CLUSTER 12: 10, 20, 15, 30, 11, 12, 18, 25, 14, 13, 16, 17, 24, 19, 22, 21, 23, 26, 28, 27, 31, 29 149 | CLUSTER 13: support, provide, aid, access, provided, additional, offers, relief, provides, assistance, providing, funding 150 | CLUSTER 14: communist, regime, dictator, suharto, dictatorship, communism, monarchy 151 | ... 152 | --- 28.54 seconds --- 153 | ``` 154 | 155 | Clusters are printed to the screen and also saved as JSON arrays in `data/cache`. By default, the script clusters the 10,000 most-common words from GloVe.6B into 1,000 clusters using 100-D vector embeddings. This can be changed like so: 156 | 157 | ```bash 158 | # note: this will take a *long* time to run... 
159 | python3 word_clustering.py --num_words 100000 --num_clusters 10000 --vector_dim 300 160 | ``` 161 | 162 | ``` 163 | usage: word_clustering.py [-h] [--vector_dim {50,100,200,300}] 164 | [--num_words NUM_WORDS] 165 | [--num_clusters NUM_CLUSTERS] [--n_jobs N_JOBS] 166 | [--glove_path GLOVE_PATH] 167 | 168 | optional arguments: 169 | -h, --help show this help message and exit 170 | --vector_dim {50,100,200,300}, -d {50,100,200,300} 171 | What vector GloVe vector dimension to use (default: 172 | 100). 173 | --num_words NUM_WORDS, -n NUM_WORDS 174 | The number of lines to read from the GloVe vector file 175 | (default: 10000). 176 | --num_clusters NUM_CLUSTERS, -k NUM_CLUSTERS 177 | Number of resulting word clusters. The number of K in 178 | K-Means (default: 1000). 179 | --n_jobs N_JOBS, -j N_JOBS 180 | Number of cores to use when fitting K-Means. -1 = all 181 | cores. More cores = less time, more memory (default: 182 | -1). 183 | --glove_path GLOVE_PATH, -i GLOVE_PATH 184 | GloVe vector file path (default: data/glove) 185 | ``` 186 | 187 | ## Running with Docker 188 | 189 | These experiments, and the GloVe data they use, are available via a Docker image on Docker Hub. If you have Docker installed on your machine you can pull the images and run them inside of containers instead of installing them on your host machine. 190 | 191 | ```bash 192 | docker run --rm -it brannondorsey/glove-experiments python word_arithmetic.py 193 | docker run --rm -it brannondorsey/glove-experiments python word_game.py 194 | docker run --rm -it brannondorsey/glove-experiments python word_clustering.py 195 | ``` 196 | 197 | These images have been built for 64-bit x86 CPU architectures. 198 | 199 | ## License and Attribution 200 | 201 | All code is released under an [MIT license](LICENSE). You are free to copy, edit, share, or sell it under those terms. 202 | 203 | ### GloVe citation 204 | 205 | Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. 
[GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/pubs/glove.pdf). 206 | 207 | ``` 208 | @inproceedings{pennington2014glove, 209 | author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning}, 210 | booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, 211 | title = {GloVe: Global Vectors for Word Representation}, 212 | year = {2014}, 213 | pages = {1532--1543}, 214 | url = {http://www.aclweb.org/anthology/D14-1162}, 215 | } 216 | ``` 217 | --------------------------------------------------------------------------------