├── .gitignore
├── LICENSE
├── Makefile
├── bench
│   └── bench_corpus.py
├── changelog.md
├── circle.yml
├── examples
│   ├── analogy_tasks_evaluation.py
│   └── example.py
├── glove
│   ├── __init__.py
│   ├── corpus.py
│   ├── corpus_cython.cpp
│   ├── corpus_cython.pyx
│   ├── glove.py
│   ├── glove_cython.c
│   ├── glove_cython.pyx
│   └── metrics
│       ├── __init__.py
│       ├── accuracy.py
│       ├── accuracy_cython.c
│       └── accuracy_cython.pyx
├── readme.md
├── setup.cfg
├── setup.py
└── tests
    ├── stanford_test.txt
    ├── test_corpus.py
    ├── test_glove.py
    └── utils.py
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.*~ 3 | *.so 4 | dist/* 5 | *.egg* 6 | *.model 7 | *.corpus 8 | build/* 9 | Makefile~ 10 | *.npy 11 | *.bz2 12 | *#* 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof.
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2014 Maciej Kula 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext 3 | 4 | get-wiki: 5 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-meta-current1.xml-p000000010p000010000.bz2 6 | 7 | process-wiki: 8 | python -- examples/example.py -w -c enwiki-latest-pages-meta-current1.xml-p000000010p000010000.bz2 9 | 10 | train-wiki: 11 | python -i -- examples/example.py -t 30 -p 2 12 | 13 | all-wiki: get-wiki process-wiki train-wiki 14 | 15 | .PHONY: all get-wiki process-wiki train-wiki 16 | -------------------------------------------------------------------------------- /bench/bench_corpus.py: -------------------------------------------------------------------------------- 1 | import array 2 | import timeit 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | 7 | from glove import Corpus 8 | from glove.glove import check_random_state 9 | 10 | 11 | def generate_training_corpus(num_sentences, 12 | vocabulary_size=30000, 13 | sentence_min_size=2, 14 | sentence_max_size=30, 15 | seed=None): 16 | 17 | rs = check_random_state(seed) 18 | 19 | for _ in range(num_sentences): 20 | sentence_size = rs.randint(sentence_min_size, 21 | sentence_max_size) 22 | yield [str(x) for x in 23 | rs.randint(0, vocabulary_size, sentence_size)] 24 | 25 | 26 | def fit_corpus(corpus): 27 | 28 | model = Corpus() 29 | model.fit(corpus) 30 | 31 | return model 32 | 33 | 34 | if __name__ == '__main__': 35 | 36 | number = 10 37 | 38 | elapsed = timeit.timeit('fit_corpus(corpus)', 39 | setup=('from __main__ import generate_training_corpus;' 40 | 'from __main__ import fit_corpus;' 41 | 'corpus = list(generate_training_corpus(100000, seed=10))'), 42 | number=number) 43 | 44 | one_loop_time = elapsed / number 45 | 46 | print('Seconds per fit: %s' % one_loop_time) 47 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.1.0] - 2016-01-11 4 | ### Changed 5 | - add algorithm tests for corpus construction and model fitting 6 | - remove the dependency on Cython for installation; the required .c and .cpp files are now included 7 | - use py.test for testing 8 | - removed dependency on C++11 features by using a different sparse matrix structure for corpus construction 9 | - faster cooccurrence matrix construction 10 | 11 | ### Removed 12 | - max_map_size argument removed from Corpus.fit 13 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | pre: 3 | - pip install numpy 4 | - pip install scipy 5 | -------------------------------------------------------------------------------- /examples/analogy_tasks_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import defaultdict 3 | import numpy as np 4 | 5 | from glove import Glove, metrics 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | parser = argparse.ArgumentParser(description=('Evaluate a trained GloVe ' 11 | 'model on an analogy task.')) 12 | parser.add_argument('--test', '-t', action='store', 13 | required=True, 14 | help='The filename of the analogy test set.') 15 | parser.add_argument('--model', '-m', action='store', 16 | required=True, 17 | help='The filename of the stored GloVe model.')
18 | parser.add_argument('--encode', '-e', action='store_true', 19 | default=False, 20 | help=('If True, words from the ' 21 | 'evaluation set will be utf-8 encoded ' 22 | 'before looking them up in the ' 23 | 'model dictionary')) 24 | parser.add_argument('--parallelism', '-p', action='store', 25 | default=1, 26 | help=('Number of parallel threads to use')) 27 | 28 | args = parser.parse_args() 29 | 30 | # Load the GloVe model 31 | glove = Glove.load(args.model) 32 | 33 | 34 | if args.encode: 35 | encode = lambda words: [x.lower().encode('utf-8') for x in words] 36 | else: 37 | encode = lambda words: [x.lower() for x in words]  # tokens are already unicode on Python 3 38 | 39 | 40 | # Load the analogy task dataset. One example can be obtained at 41 | # https://word2vec.googlecode.com/svn/trunk/questions-words.txt 42 | sections = defaultdict(list) 43 | for section, words in metrics.read_analogy_file(args.test): 44 | sections[section].append(encode(words)) 45 | 46 | section_ranks = [] 47 | 48 | for section, words in sections.items(): 49 | evaluation_ids = metrics.construct_analogy_test_set(words, 50 | glove.dictionary, 51 | ignore_missing=True) 52 | 53 | # Get the rank array. 54 | ranks = metrics.analogy_rank_score(evaluation_ids, glove.word_vectors, 55 | no_threads=int(args.parallelism)) 56 | section_ranks.append(ranks) 57 | 58 | print('Section %s mean rank: %s, accuracy: %s' % (section, ranks.mean(), 59 | (ranks == 0).sum() / float(len(ranks)))) 60 | 61 | ranks = np.hstack(section_ranks) 62 | 63 | print('Overall rank: %s, accuracy: %s' % (ranks.mean(), 64 | (ranks == 0).sum() / float(len(ranks)))) 65 | 66 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import pprint 4 | import gensim 5 | 6 | from glove import Glove 7 | from glove import Corpus 8 | 9 | 10 | def read_corpus(filename): 11 | 12 | delchars = [chr(c) for c in range(256)] 13 | delchars = [x for x in delchars if not x.isalnum()] 14 | delchars.remove(' ') 15 | deltable = {ord(c): None for c in delchars}  # deletion table for str.translate (Python 3) 16 | 17 | with open(filename, 'r') as datafile: 18 | for line in datafile: 19 | yield line.lower().translate(deltable).split(' ') 20 | 21 | 22 | def read_wikipedia_corpus(filename): 23 | 24 | # We don't want to do a dictionary construction pass. 25 | corpus = gensim.corpora.WikiCorpus(filename, dictionary={}) 26 | 27 | for text in corpus.get_texts(): 28 | yield text 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | # Set up command line parameters. 34 | parser = argparse.ArgumentParser(description='Fit a GloVe model.') 35 | 36 | parser.add_argument('--create', '-c', action='store', 37 | default=None, 38 | help=('The filename of the corpus to pre-process. ' 39 | 'The pre-processed corpus will be saved ' 40 | 'and will be ready for training.')) 41 | parser.add_argument('--wiki', '-w', action='store_true', 42 | default=False, 43 | help=('Assume the corpus input file is in the ' 44 | 'Wikipedia dump format')) 45 | parser.add_argument('--train', '-t', action='store', 46 | default=0, 47 | help=('Train the GloVe model with this number of epochs.'
48 | ' If not supplied, ' 49 | 'we\'ll attempt to load a trained model.')) 50 | parser.add_argument('--parallelism', '-p', action='store', 51 | default=1, 52 | help=('Number of parallel threads to use for training')) 53 | parser.add_argument('--query', '-q', action='store', 54 | default='', 55 | help='Get the closest words to this word.') 56 | args = parser.parse_args() 57 | 58 | 59 | if args.create: 60 | # Build the corpus dictionary and the cooccurrence matrix. 61 | print('Pre-processing corpus') 62 | 63 | if args.wiki: 64 | print('Using wikipedia corpus') 65 | get_data = read_wikipedia_corpus 66 | else: 67 | get_data = read_corpus 68 | 69 | corpus_model = Corpus() 70 | corpus_model.fit(get_data(args.create), window=10) 71 | corpus_model.save('corpus.model') 72 | 73 | print('Dict size: %s' % len(corpus_model.dictionary)) 74 | print('Collocations: %s' % corpus_model.matrix.nnz) 75 | 76 | if args.train: 77 | # Train the GloVe model and save it to disk. 78 | 79 | if not args.create: 80 | # Try to load a corpus from disk. 81 | print('Reading corpus statistics') 82 | corpus_model = Corpus.load('corpus.model') 83 | 84 | print('Dict size: %s' % len(corpus_model.dictionary)) 85 | print('Collocations: %s' % corpus_model.matrix.nnz) 86 | 87 | print('Training the GloVe model') 88 | 89 | glove = Glove(no_components=100, learning_rate=0.05) 90 | glove.fit(corpus_model.matrix, epochs=int(args.train), 91 | no_threads=args.parallelism, verbose=True) 92 | glove.add_dictionary(corpus_model.dictionary) 93 | 94 | glove.save('glove.model') 95 | 96 | if args.query: 97 | # Finally, query the model for most similar words. 98 | if not args.train: 99 | print('Loading pre-trained GloVe model') 100 | glove = Glove.load('glove.model') 101 | 102 | print('Querying for %s' % args.query) 103 | pprint.pprint(glove.most_similar(args.query, number=10)) 104 | -------------------------------------------------------------------------------- /glove/__init__.py: -------------------------------------------------------------------------------- 1 | from .corpus import Corpus 2 | from .glove import Glove 3 | -------------------------------------------------------------------------------- /glove/corpus.py: -------------------------------------------------------------------------------- 1 | # Cooccurrence matrix construction tools 2 | # for fitting the GloVe model. 3 | import numpy as np 4 | try: 5 | # Python 2 compat 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | 10 | from .corpus_cython import construct_cooccurrence_matrix 11 | 12 | 13 | class Corpus(object): 14 | """ 15 | Class for constructing a cooccurrence matrix 16 | from a corpus. 17 | 18 | A dictionary mapping words to ids can optionally 19 | be supplied. If left None, it will be constructed 20 | from the corpus.
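
    A minimal usage sketch (the tokens here are made up):

        corpus = Corpus()
        corpus.fit([['dog', 'barks'], ['cat', 'meows']], window=2)
        corpus.matrix      # scipy.sparse COO cooccurrence matrix
        corpus.dictionary  # word -> id mapping built during fit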
21 | """ 22 | 23 | def __init__(self, dictionary=None): 24 | 25 | self.dictionary = {} 26 | self.dictionary_supplied = False 27 | self.matrix = None 28 | 29 | if dictionary is not None: 30 | self._check_dict(dictionary) 31 | self.dictionary = dictionary 32 | self.dictionary_supplied = True 33 | 34 | def _check_dict(self, dictionary): 35 | 36 | if (np.max(list(dictionary.values())) != (len(dictionary) - 1)): 37 | raise Exception('The largest id in the dictionary ' 38 | 'should be equal to its length minus one.') 39 | 40 | if np.min(list(dictionary.values())) != 0: 41 | raise Exception('Dictionary ids should start at zero') 42 | 43 | def fit(self, corpus, window=10, ignore_missing=False): 44 | """ 45 | Perform a pass through the corpus to construct 46 | the cooccurrence matrix. 47 | 48 | Parameters: 49 | - iterable of lists of strings corpus 50 | - int window: the length of the (symmetric) 51 | context window used for cooccurrence. 52 | - bool ignore_missing: whether to ignore words missing from 53 | the dictionary (if it was supplied). 54 | Context window distances will be preserved 55 | even if out-of-vocabulary words are 56 | ignored. 57 | If False, a KeyError is raised. 58 | """ 59 | 60 | self.matrix = construct_cooccurrence_matrix(corpus, 61 | self.dictionary, 62 | int(self.dictionary_supplied), 63 | int(window), 64 | int(ignore_missing)) 65 | 66 | def save(self, filename): 67 | 68 | with open(filename, 'wb') as savefile: 69 | pickle.dump((self.dictionary, self.matrix), 70 | savefile, 71 | protocol=pickle.HIGHEST_PROTOCOL) 72 | 73 | @classmethod 74 | def load(cls, filename): 75 | 76 | instance = cls() 77 | 78 | with open(filename, 'rb') as savefile: 79 | instance.dictionary, instance.matrix = pickle.load(savefile) 80 | 81 | return instance 82 | -------------------------------------------------------------------------------- /glove/corpus_cython.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | # distutils: language = c++ 3 | # cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True 4 | 5 | import numpy as np 6 | import scipy.sparse as sp 7 | 8 | from libc.stdlib cimport malloc, free 9 | 10 | from cython.operator cimport dereference as deref 11 | from libcpp.vector cimport vector 12 | 13 | 14 | cdef inline int int_min(int a, int b) nogil: return a if a <= b else b 15 | 16 | 17 | cdef int binary_search(int* vec, int size, int first, int last, int x) nogil: 18 | """ 19 | Binary seach in an array of ints 20 | """ 21 | 22 | cdef int mid 23 | 24 | while (first < last): 25 | mid = (first + last) / 2 26 | if (vec[mid] == x): 27 | return mid 28 | elif vec[mid] > x: 29 | last = mid - 1 30 | else: 31 | first = mid + 1 32 | 33 | if (first == size): 34 | return first 35 | elif vec[first] > x: 36 | return first 37 | else: 38 | return first + 1 39 | 40 | 41 | cdef struct SparseRowMatrix: 42 | vector[vector[int]] *indices 43 | vector[vector[float]] *data 44 | 45 | 46 | cdef SparseRowMatrix* new_matrix(): 47 | """ 48 | Allocate and initialize a new matrix 49 | """ 50 | 51 | cdef SparseRowMatrix* mat 52 | 53 | mat = malloc(sizeof(SparseRowMatrix)) 54 | 55 | if mat == NULL: 56 | raise MemoryError() 57 | 58 | mat.indices = new vector[vector[int]]() 59 | mat.data = new vector[vector[float]]() 60 | 61 | return mat 62 | 63 | 64 | cdef void free_matrix(SparseRowMatrix* mat) nogil: 65 | """ 66 | Deallocate the data of a matrix 67 | """ 68 | 69 | cdef int i 70 | cdef int rows = mat.indices.size() 71 | 72 | for i in range(rows): 73 | 
73 | deref(mat.indices)[i].clear() 74 | deref(mat.data)[i].clear() 75 | 76 | del mat.indices 77 | del mat.data 78 | 79 | free(mat) 80 | 81 | 82 | cdef void increment_matrix(SparseRowMatrix* mat, int row, int col, float increment) nogil: 83 | """ 84 | Increment the (row, col) entry of mat by increment. 85 | """ 86 | 87 | cdef vector[int]* row_indices 88 | cdef vector[float]* row_data 89 | cdef int idx 90 | cdef int col_at_idx 91 | 92 | # Add new row if necessary 93 | while row >= mat.indices.size(): 94 | mat.indices.push_back(vector[int]()) 95 | mat.data.push_back(vector[float]()) 96 | 97 | row_indices = &(deref(mat.indices)[row]) 98 | row_data = &(deref(mat.data)[row]) 99 | 100 | # Find the column element, or the position where 101 | # a new element should be inserted 102 | if row_indices.size() == 0: 103 | idx = 0 104 | else: 105 | idx = binary_search(&(deref(row_indices)[0]), row_indices.size(), 106 | 0, row_indices.size(), col) 107 | 108 | # Element to be added at the end 109 | if idx == row_indices.size(): 110 | row_indices.insert(row_indices.begin() + idx, col) 111 | row_data.insert(row_data.begin() + idx, increment) 112 | return 113 | 114 | col_at_idx = deref(row_indices)[idx] 115 | 116 | if col_at_idx == col: 117 | # Element to be incremented 118 | deref(row_data)[idx] = deref(row_data)[idx] + increment 119 | else: 120 | # Element to be inserted 121 | row_indices.insert(row_indices.begin() + idx, col) 122 | row_data.insert(row_data.begin() + idx, increment) 123 | 124 | 125 | cdef int matrix_nnz(SparseRowMatrix* mat) nogil: 126 | """ 127 | Get the number of nonzero entries in mat 128 | """ 129 | 130 | cdef int i 131 | cdef int size = 0 132 | 133 | for i in range(mat.indices.size()): 134 | size += deref(mat.indices)[i].size() 135 | 136 | return size 137 | 138 | 139 | cdef matrix_to_coo(SparseRowMatrix* mat, int shape): 140 | """ 141 | Convert to a shape by shape COO matrix. 142 | """ 143 | 144 | cdef int i, j 145 | cdef int row 146 | cdef int col 147 | cdef int rows = mat.indices.size() 148 | cdef int no_collocations = matrix_nnz(mat) 149 | 150 | # Create the constituent numpy arrays. 151 | row_np = np.empty(no_collocations, dtype=np.int32) 152 | col_np = np.empty(no_collocations, dtype=np.int32) 153 | data_np = np.empty(no_collocations, dtype=np.float64) 154 | cdef int[:] row_view = row_np 155 | cdef int[:] col_view = col_np 156 | cdef double[:] data_view = data_np 157 | 158 | j = 0 159 | 160 | for row in range(rows): 161 | for i in range(deref(mat.indices)[row].size()): 162 | 163 | row_view[j] = row 164 | col_view[j] = deref(mat.indices)[row][i] 165 | data_view[j] = deref(mat.data)[row][i] 166 | 167 | j += 1 168 | 169 | # Create and return the matrix. 170 | return sp.coo_matrix((data_np, (row_np, col_np)), 171 | shape=(shape, 172 | shape), 173 | dtype=np.float64) 174 | 175 | 176 | cdef int words_to_ids(list words, vector[int]& word_ids, 177 | dictionary, int supplied, int ignore_missing): 178 | """ 179 | Convert a list of words into a vector of word ids, using either 180 | the supplied dictionary or by constructing a new one. 181 | 182 | If the dictionary was supplied, a word is missing from it, 183 | and we are not ignoring out-of-vocabulary (OOV) words, an 184 | error value of -1 is returned. 185 | 186 | If we have an OOV word and we do want to ignore them, we use 187 | a -1 placeholder for it in the word_ids vector to preserve 188 | correct context windows (otherwise words that are far apart 189 | with the full vocabulary could become close together with a 190 | filtered vocabulary).
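
    For example, with the supplied dictionary {'a': 0, 'b': 1} and
    ignore_missing set, ['a', 'oov', 'b'] maps to [0, -1, 1], so 'a'
    and 'b' stay two positions apart in the context window.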
191 | """ 192 | 193 | cdef int word_id 194 | 195 | word_ids.resize(0) 196 | 197 | if supplied == 1: 198 | for word in words: 199 | # Raise an error if the word 200 | # is missing from the supplied 201 | # dictionary. 202 | word_id = dictionary.get(word, -1) 203 | if word_id == -1 and ignore_missing == 0: 204 | return -1 205 | 206 | word_ids.push_back(word_id) 207 | 208 | else: 209 | for word in words: 210 | word_id = dictionary.setdefault(word, 211 | len(dictionary)) 212 | word_ids.push_back(word_id) 213 | 214 | return 0 215 | 216 | 217 | def construct_cooccurrence_matrix(corpus, dictionary, int supplied, 218 | int window_size, int ignore_missing): 219 | """ 220 | Construct the word-id dictionary and cooccurrence matrix for 221 | a given corpus, using a given window size. 222 | 223 | Returns the dictionary and a scipy.sparse COO cooccurrence matrix. 224 | """ 225 | 226 | # Declare the cooccurrence map 227 | cdef SparseRowMatrix* matrix = new_matrix() 228 | 229 | # String processing variables. 230 | cdef list words 231 | cdef int i, j, outer_word, inner_word 232 | cdef int wordslen, window_stop, error 233 | cdef vector[int] word_ids 234 | 235 | # Pre-allocate some reasonable size 236 | # for the word ids vector. 237 | word_ids.reserve(1000) 238 | 239 | # Iterate over the corpus. 240 | for words in corpus: 241 | 242 | # Convert words to a numeric vector. 243 | error = words_to_ids(words, word_ids, dictionary, 244 | supplied, ignore_missing) 245 | if error == -1: 246 | raise KeyError('Word missing from dictionary') 247 | 248 | wordslen = word_ids.size() 249 | 250 | # Record co-occurrences in a moving window. 251 | for i in range(wordslen): 252 | outer_word = word_ids[i] 253 | 254 | # Continue if we have an OOD token. 255 | if outer_word == -1: 256 | continue 257 | 258 | window_stop = int_min(i + window_size + 1, wordslen) 259 | 260 | for j in range(i, window_stop): 261 | inner_word = word_ids[j] 262 | 263 | if inner_word == -1: 264 | continue 265 | 266 | # Do nothing if the words are the same. 267 | if inner_word == outer_word: 268 | continue 269 | 270 | if inner_word < outer_word: 271 | increment_matrix(matrix, 272 | inner_word, 273 | outer_word, 274 | 1.0 / (j - i)) 275 | else: 276 | increment_matrix(matrix, 277 | outer_word, 278 | inner_word, 279 | 1.0 / (j - i)) 280 | 281 | # Create the matrix. 282 | mat = matrix_to_coo(matrix, len(dictionary)) 283 | free_matrix(matrix) 284 | 285 | return mat 286 | -------------------------------------------------------------------------------- /glove/glove.py: -------------------------------------------------------------------------------- 1 | # GloVe model from the NLP lab at Stanford: 2 | # http://nlp.stanford.edu/projects/glove/. 3 | import array 4 | import collections 5 | import io 6 | try: 7 | # Python 2 compat 8 | import cPickle as pickle 9 | except ImportError: 10 | import pickle 11 | 12 | import numpy as np 13 | import scipy.sparse as sp 14 | import numbers 15 | 16 | from .glove_cython import fit_vectors, transform_paragraph 17 | 18 | 19 | def check_random_state(seed): 20 | """ Turn seed into a np.random.RandomState instance. 21 | 22 | This is a copy of the check_random_state function in sklearn 23 | in order to avoid outside dependencies. 
24 | """ 25 | if seed is None or seed is np.random: 26 | return np.random.mtrand._rand 27 | if isinstance(seed, (numbers.Integral, np.integer)): 28 | return np.random.RandomState(seed) 29 | if isinstance(seed, np.random.RandomState): 30 | return seed 31 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 32 | ' instance' % seed) 33 | 34 | 35 | class Glove(object): 36 | """ 37 | Class for estimating GloVe word embeddings using the 38 | corpus coocurrence matrix. 39 | """ 40 | 41 | def __init__(self, no_components=30, learning_rate=0.05, 42 | alpha=0.75, max_count=100, max_loss=10.0, 43 | random_state=None): 44 | """ 45 | Parameters: 46 | - int no_components: number of latent dimensions 47 | - float learning_rate: learning rate for SGD estimation. 48 | - float alpha, float max_count: parameters for the 49 | weighting function (see the paper). 50 | - float max_loss: the maximum absolute value of calculated 51 | gradient for any single co-occurrence pair. 52 | Only try setting to a lower value if you 53 | are experiencing problems with numerical 54 | stability. 55 | - random_state: random statue used to intialize optimization 56 | """ 57 | 58 | self.no_components = no_components 59 | self.learning_rate = float(learning_rate) 60 | self.alpha = float(alpha) 61 | self.max_count = float(max_count) 62 | self.max_loss = max_loss 63 | 64 | self.word_vectors = None 65 | self.word_biases = None 66 | 67 | self.vectors_sum_gradients = None 68 | self.biases_sum_gradients = None 69 | 70 | self.dictionary = None 71 | self.inverse_dictionary = None 72 | 73 | self.random_state = random_state 74 | 75 | def fit(self, matrix, epochs=5, no_threads=2, verbose=False): 76 | """ 77 | Estimate the word embeddings. 78 | 79 | Parameters: 80 | - scipy.sparse.coo_matrix matrix: coocurrence matrix 81 | - int epochs: number of training epochs 82 | - int no_threads: number of training threads 83 | - bool verbose: print progress messages if True 84 | """ 85 | 86 | shape = matrix.shape 87 | 88 | if (len(shape) != 2 or 89 | shape[0] != shape[1]): 90 | raise Exception('Coocurrence matrix must be square') 91 | 92 | if not sp.isspmatrix_coo(matrix): 93 | raise Exception('Coocurrence matrix must be in the COO format') 94 | 95 | random_state = check_random_state(self.random_state) 96 | self.word_vectors = ((random_state.rand(shape[0], 97 | self.no_components) - 0.5) 98 | / self.no_components) 99 | self.word_biases = np.zeros(shape[0], 100 | dtype=np.float64) 101 | 102 | self.vectors_sum_gradients = np.ones_like(self.word_vectors) 103 | self.biases_sum_gradients = np.ones_like(self.word_biases) 104 | 105 | shuffle_indices = np.arange(matrix.nnz, dtype=np.int32) 106 | 107 | if verbose: 108 | print('Performing %s training epochs ' 109 | 'with %s threads' % (epochs, no_threads)) 110 | 111 | for epoch in range(epochs): 112 | 113 | if verbose: 114 | print('Epoch %s' % epoch) 115 | 116 | # Shuffle the coocurrence matrix 117 | random_state.shuffle(shuffle_indices) 118 | 119 | fit_vectors(self.word_vectors, 120 | self.vectors_sum_gradients, 121 | self.word_biases, 122 | self.biases_sum_gradients, 123 | matrix.row, 124 | matrix.col, 125 | matrix.data, 126 | shuffle_indices, 127 | self.learning_rate, 128 | self.max_count, 129 | self.alpha, 130 | self.max_loss, 131 | int(no_threads)) 132 | 133 | if not np.isfinite(self.word_vectors).all(): 134 | raise Exception('Non-finite values in word vectors. 
135 | 'Try reducing the learning rate or the ' 136 | 'max_loss parameter.') 137 | 138 | def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False): 139 | """ 140 | Transform an iterable of tokens into its vector representation 141 | (a paragraph vector). 142 | 143 | Experimental. This will return something close to a tf-idf 144 | weighted average of constituent token vectors by fitting 145 | rare words (with low word bias values) more closely. 146 | """ 147 | 148 | if self.word_vectors is None: 149 | raise Exception('Model must be fit to transform paragraphs') 150 | 151 | if self.dictionary is None: 152 | raise Exception('Dictionary must be provided to ' 153 | 'transform paragraphs') 154 | 155 | cooccurrence = collections.defaultdict(lambda: 0.0) 156 | 157 | for token in paragraph: 158 | try: 159 | cooccurrence[self.dictionary[token]] += self.max_count / 10.0 160 | except KeyError: 161 | if not ignore_missing: 162 | raise 163 | 164 | random_state = check_random_state(self.random_state) 165 | 166 | word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)  # list() for Python 3 dict views 167 | values = np.array(list(cooccurrence.values()), dtype=np.float64) 168 | shuffle_indices = np.arange(len(word_ids), dtype=np.int32) 169 | 170 | # Initialize the vector to mean of constituent word vectors 171 | paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0) 172 | sum_gradients = np.ones_like(paragraph_vector) 173 | 174 | # Shuffle the cooccurrence matrix 175 | random_state.shuffle(shuffle_indices) 176 | transform_paragraph(self.word_vectors, 177 | self.word_biases, 178 | paragraph_vector, 179 | sum_gradients, 180 | word_ids, 181 | values, 182 | shuffle_indices, 183 | self.learning_rate, 184 | self.max_count, 185 | self.alpha, 186 | epochs) 187 | 188 | return paragraph_vector 189 | 190 | def add_dictionary(self, dictionary): 191 | """ 192 | Supply a word-id dictionary to allow similarity queries. 193 | """ 194 | 195 | if self.word_vectors is None: 196 | raise Exception('Model must be fit before adding a dictionary') 197 | 198 | if len(dictionary) > self.word_vectors.shape[0]: 199 | raise Exception('Dictionary length must be smaller than ' 200 | 'or equal to the number of word vectors') 201 | 202 | self.dictionary = dictionary 203 | if hasattr(self.dictionary, 'iteritems'): 204 | # Python 2 compat 205 | items_iterator = self.dictionary.iteritems() 206 | else: 207 | items_iterator = self.dictionary.items() 208 | 209 | self.inverse_dictionary = {v: k for k, v in items_iterator} 210 | 211 | def save(self, filename): 212 | """ 213 | Serialize model to filename. 214 | """ 215 | 216 | with open(filename, 'wb') as savefile: 217 | pickle.dump(self.__dict__, 218 | savefile, 219 | protocol=pickle.HIGHEST_PROTOCOL) 220 | 221 | @classmethod 222 | def load(cls, filename): 223 | """ 224 | Load model from filename. 225 | """ 226 | 227 | instance = Glove() 228 | 229 | with open(filename, 'rb') as savefile: 230 | instance.__dict__ = pickle.load(savefile) 231 | 232 | return instance 233 | 234 | @classmethod 235 | def load_stanford(cls, filename): 236 | """ 237 | Load model from the output files generated by 238 | the C code from http://nlp.stanford.edu/projects/glove/. 239 | 240 | The entries of the word dictionary will be of type 241 | unicode in Python 2 and str in Python 3. 242 | """ 243 | 244 | dct = {} 245 | vectors = array.array('d') 246 | 247 | # Read in the data.
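        # Each line of the Stanford output has the form
        # "word v_1 v_2 ... v_d", separated by single spaces.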
248 | with io.open(filename, 'r', encoding='utf-8') as savefile: 249 | for i, line in enumerate(savefile): 250 | tokens = line.split(' ') 251 | 252 | word = tokens[0] 253 | entries = tokens[1:] 254 | 255 | dct[word] = i 256 | vectors.extend(float(x) for x in entries) 257 | 258 | # Infer word vectors dimensions. 259 | no_components = len(entries) 260 | no_vectors = len(dct) 261 | 262 | # Set up the model instance. 263 | instance = Glove() 264 | instance.no_components = no_components 265 | instance.word_vectors = (np.array(vectors) 266 | .reshape(no_vectors, 267 | no_components)) 268 | instance.word_biases = np.zeros(no_vectors) 269 | instance.add_dictionary(dct) 270 | 271 | return instance 272 | 273 | def _similarity_query(self, word_vec, number): 274 | 275 | dst = (np.dot(self.word_vectors, word_vec) 276 | / np.linalg.norm(self.word_vectors, axis=1) 277 | / np.linalg.norm(word_vec)) 278 | word_ids = np.argsort(-dst) 279 | 280 | return [(self.inverse_dictionary[x], dst[x]) for x in word_ids[:number] 281 | if x in self.inverse_dictionary] 282 | 283 | def most_similar(self, word, number=5): 284 | """ 285 | Run a similarity query, retrieving the 286 | `number` most similar words. 287 | """ 288 | 289 | if self.word_vectors is None: 290 | raise Exception('Model must be fit before querying') 291 | 292 | if self.dictionary is None: 293 | raise Exception('No word dictionary supplied') 294 | 295 | try: 296 | word_idx = self.dictionary[word] 297 | except KeyError: 298 | raise Exception('Word not in dictionary') 299 | 300 | return self._similarity_query(self.word_vectors[word_idx], number)[1:]  # drop the query word itself 301 | 302 | def most_similar_paragraph(self, paragraph, number=5, **kwargs): 303 | """ 304 | Return words most similar to a given paragraph (iterable of tokens). 305 | """ 306 | 307 | paragraph_vector = self.transform_paragraph(paragraph, **kwargs) 308 | 309 | return self._similarity_query(paragraph_vector, number) 310 | -------------------------------------------------------------------------------- /glove/glove_cython.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False, wraparound=False, cdivision=True, initializedcheck=False 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | import collections 7 | from cython.parallel import parallel, prange 8 | 9 | 10 | cdef inline double double_min(double a, double b) nogil: return a if a <= b else b 11 | cdef inline int int_min(int a, int b) nogil: return a if a <= b else b 12 | cdef inline int int_max(int a, int b) nogil: return a if a > b else b 13 | 14 | 15 | cdef extern from "math.h" nogil: 16 | double sqrt(double) 17 | double c_log "log"(double) 18 | 19 | 20 | def fit_vectors(double[:, ::1] wordvec, 21 | double[:, ::1] wordvec_sum_gradients, 22 | double[::1] wordbias, 23 | double[::1] wordbias_sum_gradients, 24 | int[::1] row, 25 | int[::1] col, 26 | double[::1] counts, 27 | int[::1] shuffle_indices, 28 | double initial_learning_rate, 29 | double max_count, 30 | double alpha, 31 | double max_loss, 32 | int no_threads): 33 | """ 34 | Estimate GloVe word embeddings given the cooccurrence matrix. 35 | Modifies the word vector and word bias array in-place. 36 | 37 | Training is performed via asynchronous stochastic gradient descent, 38 | using the AdaGrad per-coordinate learning rate. 39 | """ 40 |
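    # Each cooccurrence (a, b, X_ab) contributes the weighted
    # least-squares term f(X_ab) * (w_a.w_b + b_a + b_b - log X_ab)^2,
    # with f(x) = min(1, x / max_count)**alpha; the `loss` variable
    # below is the weighted residual shared by all gradient updates.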
41 | # Get number of latent dimensions and 42 | # number of cooccurrences. 43 | cdef int dim = wordvec.shape[1] 44 | cdef int no_cooccurrences = row.shape[0] 45 | 46 | # Hold indices of current words and 47 | # the cooccurrence count. 48 | cdef int word_a, word_b 49 | cdef double count, learning_rate, gradient 50 | 51 | # Loss and gradient variables. 52 | cdef double prediction, entry_weight, loss 53 | 54 | # Iteration variables 55 | cdef int i, j, shuffle_index 56 | 57 | # We iterate over random indices to simulate 58 | # shuffling the cooccurrence matrix. 59 | with nogil: 60 | for j in prange(no_cooccurrences, num_threads=no_threads, 61 | schedule='dynamic'): 62 | shuffle_index = shuffle_indices[j] 63 | word_a = row[shuffle_index] 64 | word_b = col[shuffle_index] 65 | count = counts[shuffle_index] 66 | 67 | # Get prediction 68 | prediction = 0.0 69 | 70 | for i in range(dim): 71 | prediction = prediction + wordvec[word_a, i] * wordvec[word_b, i] 72 | 73 | prediction = prediction + wordbias[word_a] + wordbias[word_b] 74 | 75 | # Compute loss and the example weight. 76 | entry_weight = double_min(1.0, (count / max_count)) ** alpha 77 | loss = entry_weight * (prediction - c_log(count)) 78 | 79 | # Clip the loss for numerical stability. 80 | if loss < -max_loss: 81 | loss = -max_loss 82 | elif loss > max_loss: 83 | loss = max_loss 84 | 85 | # Update step: apply the AdaGrad 86 | # per-coordinate gradient updates. 87 | for i in range(dim): 88 | 89 | learning_rate = initial_learning_rate / sqrt(wordvec_sum_gradients[word_a, i]) 90 | gradient = loss * wordvec[word_b, i] 91 | wordvec[word_a, i] = (wordvec[word_a, i] - learning_rate 92 | * gradient) 93 | wordvec_sum_gradients[word_a, i] += gradient ** 2 94 | 95 | learning_rate = initial_learning_rate / sqrt(wordvec_sum_gradients[word_b, i]) 96 | gradient = loss * wordvec[word_a, i] 97 | wordvec[word_b, i] = (wordvec[word_b, i] - learning_rate 98 | * gradient) 99 | wordvec_sum_gradients[word_b, i] += gradient ** 2 100 | 101 | # Update word biases. 102 | learning_rate = initial_learning_rate / sqrt(wordbias_sum_gradients[word_a]) 103 | wordbias[word_a] -= learning_rate * loss 104 | wordbias_sum_gradients[word_a] += loss ** 2 105 | 106 | learning_rate = initial_learning_rate / sqrt(wordbias_sum_gradients[word_b]) 107 | wordbias[word_b] -= learning_rate * loss 108 | wordbias_sum_gradients[word_b] += loss ** 2 109 | 110 | 111 | def transform_paragraph(double[:, ::1] wordvec, 112 | double[::1] wordbias, 113 | double[::1] paragraphvec, 114 | double[::1] sum_gradients, 115 | int[::1] row, 116 | double[::1] counts, 117 | int[::1] shuffle_indices, 118 | double initial_learning_rate, 119 | double max_count, 120 | double alpha, 121 | int epochs): 122 | """ 123 | Compute a vector representation of a paragraph. This has 124 | the effect of making the paragraph vector close to words 125 | that occur in it. The representation should be more 126 | similar to words that occur in it multiple times, and 127 | less close to words that are common in the corpus (have 128 | large word bias values). 129 | 130 | This should be similar to a tf-idf weighting. 131 | """ 132 | 133 | # Get number of latent dimensions and 134 | # number of cooccurrences. 135 | cdef int dim = wordvec.shape[1] 136 | cdef int no_cooccurrences = row.shape[0] 137 | 138 | # Hold indices of current words and 139 | # the cooccurrence count. 140 | cdef int word_b, word_a 141 | cdef double count 142 | 143 | # Loss and gradient variables.
144 | cdef double prediction 145 | cdef double entry_weight 146 | cdef double loss 147 | cdef double gradient, learning_rate 148 | 149 | # Iteration variables 150 | cdef int epoch, i, j, shuffle_index 151 | 152 | # We iterate over random indices to simulate 153 | # shuffling the cooccurrence matrix. 154 | for epoch in range(epochs): 155 | for j in range(no_cooccurrences): 156 | shuffle_index = shuffle_indices[j] 157 | 158 | word_b = row[shuffle_index] 159 | count = counts[shuffle_index] 160 | 161 | # Get prediction 162 | prediction = 0.0 163 | for i in range(dim): 164 | prediction = prediction + paragraphvec[i] * wordvec[word_b, i] 165 | prediction += wordbias[word_b] 166 | 167 | # Compute loss and the example weight. 168 | entry_weight = double_min(1.0, (count / max_count)) ** alpha 169 | loss = entry_weight * (prediction - c_log(count)) 170 | 171 | # Update step: apply gradients. 172 | for i in range(dim): 173 | learning_rate = initial_learning_rate / sqrt(sum_gradients[i]) 174 | gradient = loss * wordvec[word_b, i] 175 | paragraphvec[i] = (paragraphvec[i] - learning_rate 176 | * gradient) 177 | sum_gradients[i] += gradient ** 2 178 | -------------------------------------------------------------------------------- /glove/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import (read_analogy_file, 2 | construct_analogy_test_set, 3 | analogy_rank_score) 4 | -------------------------------------------------------------------------------- /glove/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | try: 2 | from itertools import izip 3 | except ImportError: 4 | izip = zip 5 | import numpy as np 6 | 7 | from .accuracy_cython import compute_rank_violations 8 | 9 | 10 | def read_analogy_file(filename): 11 | """ 12 | Read the analogy task test set from a file. 13 | """ 14 | 15 | section = None 16 | 17 | with open(filename, 'r') as questions_file: 18 | for line in questions_file: 19 | if line.startswith(':'): 20 | section = line[2:].replace('\n', '') 21 | continue 22 | else: 23 | words = line.replace('\n', '').split(' ') 24 | 25 | yield section, words 26 | 27 | 28 | def construct_analogy_test_set(test_examples, dictionary, ignore_missing=False): 29 | """ 30 | Construct the analogy test set by mapping the words to their 31 | word vector ids. 32 | 33 | Arguments: 34 | - test_examples: iterable of 4-word iterables 35 | - dictionary: a mapping from words to ids 36 | - boolean ignore_missing: if True, words in the test set 37 | that are not in the dictionary 38 | will be dropped. 39 | 40 | Returns: 41 | - an N by 4 numpy matrix. 42 | """ 43 | 44 | test = [] 45 | 46 | for example in test_examples: 47 | try: 48 | test.append([dictionary[word] for word in example]) 49 | except KeyError: 50 | if ignore_missing: 51 | pass 52 | else: 53 | raise 54 | 55 | try: 56 | test = np.array(test, dtype=np.int32) 57 | except ValueError as e: 58 | # This should use raise ... from ... in Python 3. 59 | raise ValueError('Each row of the test set should contain ' 60 | '4 integer word ids', e) 61 | 62 | return test 63 | 64 | 65 | def analogy_rank_score(analogies, word_vectors, no_threads=1): 66 | """ 67 | Calculate the analogy rank score for the given set of analogies. 68 | 69 | A rank of zero denotes a perfect score; with random word vectors 70 | we would expect a rank of 0.5.
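
    For example, with a 10,000-word vocabulary, a normalized rank of
    0.01 means that roughly 100 words scored at least as highly as
    the expected answer.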
71 | 72 | Arguments: 73 | - analogies: a numpy array holding the ids of the words in the analogy tasks, 74 | as constructed by `construct_analogy_test_set`. 75 | - word_vectors: numpy array holding the word vectors to use. 76 | - no_threads: number of parallel threads to use in the calculation. 77 | 78 | Returns: 79 | - ranks: a numpy array holding the normalized rank of the target word 80 | in each analogy task. Rank 0 means that the target word was 81 | returned first; rank 1 means it was returned last. 82 | """ 83 | 84 | # Combine the vectors of the 85 | # second and third words with the 86 | # negative of the first word. 87 | input_vectors = (word_vectors[analogies[:, 1]] 88 | + word_vectors[analogies[:, 2]] 89 | - word_vectors[analogies[:, 0]]) 90 | 91 | word_vector_norms = np.linalg.norm(word_vectors, 92 | axis=1) 93 | 94 | # Pre-allocate the array storing the rank violations 95 | rank_violations = np.zeros(input_vectors.shape[0], dtype=np.int32) 96 | 97 | compute_rank_violations(word_vectors, 98 | word_vector_norms, 99 | input_vectors, 100 | analogies[:, 3], 101 | analogies, 102 | rank_violations, 103 | no_threads) 104 | 105 | return rank_violations / float(word_vectors.shape[0]) 106 | -------------------------------------------------------------------------------- /glove/metrics/accuracy_cython.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False, wraparound=False, cdivision=True, initializedcheck=False 3 | 4 | from cython.parallel import prange 5 | 6 | 7 | cdef double dot(double[::1] x, 8 | double[::1] y, 9 | int dim) nogil: 10 | 11 | cdef int i 12 | cdef double result = 0.0 13 | 14 | for i in range(dim): 15 | result += x[i] * y[i] 16 | 17 | return result 18 | 19 | 20 | def compute_rank_violations(double[:, ::1] wordvec, 21 | double[::1] wordvec_norm, 22 | double[:, ::1] input, 23 | int[:] expected, 24 | int[:, ::1] inputs, 25 | int[::1] rank_violations, 26 | int no_threads): 27 | """ 28 | Compute the rank violations 29 | of the expected words in the word analogy task. 30 | """ 31 | 32 | cdef int i, j, k, no_input_vectors, no_wordvec, skip_word 33 | cdef int no_components, violations 34 | 35 | cdef double score_of_expected, score 36 | 37 | no_input_vectors = input.shape[0] 38 | no_wordvec = wordvec.shape[0] 39 | no_components = wordvec.shape[1] 40 | 41 | with nogil: 42 | for i in prange(no_input_vectors, num_threads=no_threads, 43 | schedule='dynamic'): 44 | 45 | # Compute the score of the expected word. 46 | score_of_expected = (dot(input[i], 47 | wordvec[expected[i]], 48 | no_components) 49 | / wordvec_norm[expected[i]]) 50 | 51 | # Compute all other scores and count 52 | # rank violations. 53 | violations = 0 54 | 55 | for j in range(no_wordvec): 56 | 57 | # Words from the input do not 58 | # count as violations. 59 | skip_word = 0 60 | for k in range(4): 61 | if inputs[i, k] == j: 62 | skip_word = 1 63 | break 64 | 65 | if skip_word == 1: 66 | continue 67 | 68 | score = (dot(input[i], 69 | wordvec[j], 70 | no_components) 71 | / wordvec_norm[j]) 72 | 73 | if score >= score_of_expected: 74 | violations = violations + 1 75 | 76 | # Record the number of rank 77 | # violations for this example.
78 | rank_violations[i] = violations 79 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # glove-python 2 | 3 | [![Circle CI](https://circleci.com/gh/maciejkula/glove-python.svg?style=svg)](https://circleci.com/gh/maciejkula/glove-python) 4 | 5 | A toy python implementation of [GloVe](http://www-nlp.stanford.edu/projects/glove/). 6 | 7 | GloVe produces dense vector embeddings of words, where words that occur together are close in the resulting vector space. 8 | 9 | While this produces embeddings which are similar to [word2vec](https://code.google.com/p/word2vec/) (which has a great python implementation in [gensim](http://radimrehurek.com/gensim/models/word2vec.html)), the method is different: GloVe produces embeddings by factorizing the logarithm of the corpus word co-occurrence matrix. 10 | 11 | The code uses asynchronous stochastic gradient descent, and is implemented in Cython. Most likely, it contains a tremendous amount of bugs. 12 | 13 | ## Installation 14 | Install from pypi using pip: `pip install glove_python`. 15 | 16 | Note for OSX users: due to its use of OpenMP, glove-python does not compile under Clang. To install it, you will need a reasonably recent version of `gcc` (from Homebrew for instance). This should be picked up by `setup.py`; if it is not, please open an issue. 17 | 18 | Building with the default Python distribution included in OSX is also not supported; please try the version from Homebrew or Anaconda. 19 | 20 | ## Usage 21 | Producing the embeddings is a two-step process: creating a co-occurrence matrix from the corpus, and then using it to produce the embeddings. The `Corpus` class helps construct a corpus from an iterable of tokens; the `Glove` class trains the embeddings (with a sklearn-esque API). 22 | 23 | There is also support for rudimentary paragraph vectors. A paragraph vector (in this case) is an embedding of a paragraph (a multi-word piece of text) in the word vector space in such a way that the paragraph representation is close to the words it contains, adjusted for the frequency of words in the corpus (in a manner similar to tf-idf weighting). These can be obtained after having trained word embeddings by calling the `transform_paragraph` method on the trained model. 24 | 25 | ## Examples 26 | `example.py` has some example code for running simple training scripts: `ipython -i -- examples/example.py -c my_corpus.txt -t 10` should process your corpus, run 10 training epochs of GloVe, and drop you into an `ipython` shell where `glove.most_similar('physics')` should produce a list of similar words. 27 | 28 | If you want to process a wikipedia corpus, you can pass a file from [here](http://dumps.wikimedia.org/enwiki/latest/) into the `example.py` script using the `-w` flag. Running `make all-wiki` should download a small wikipedia dump file, process it, and train the embeddings. Building the cooccurrence matrix will take some time; training the vectors can be sped up by increasing the training parallelism to match the number of physical CPU cores available.
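
For reference, the core API that `example.py` wraps comes down to a few calls. A minimal sketch (the toy `texts` corpus here is made up):

```python
from glove import Corpus, Glove

# Any iterable of token lists works as a corpus.
texts = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]

# Step 1: build the dictionary and the cooccurrence matrix.
corpus = Corpus()
corpus.fit(texts, window=10)

# Step 2: factorize the matrix to obtain the embeddings.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)  # enables similarity queries

print(glove.most_similar('cat', number=5))
```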
29 | 30 | Running the wiki example on my machine yields roughly the following results: 31 | 32 | ``` 33 | In [1]: glove.most_similar('physics') 34 | Out[1]: 35 | [('biology', 0.89425889335342257), 36 | ('chemistry', 0.88913708236100086), 37 | ('quantum', 0.88859617025616333), 38 | ('mechanics', 0.88821824562025431)] 39 | 40 | In [4]: glove.most_similar('north') 41 | Out[4]: 42 | [('west', 0.99047203572917908), 43 | ('south', 0.98655786905501008), 44 | ('east', 0.97914140138065575), 45 | ('coast', 0.97680427897282185)] 46 | 47 | In [6]: glove.most_similar('queen') 48 | Out[6]: 49 | [('anne', 0.88284931171714842), 50 | ('mary', 0.87615260138308615), 51 | ('elizabeth', 0.87362497374226267), 52 | ('prince', 0.87011034923161801)] 53 | 54 | In [19]: glove.most_similar('car') 55 | Out[19]: 56 | [('race', 0.89549347066796814), 57 | ('driver', 0.89350343749207217), 58 | ('cars', 0.83601334715106568), 59 | ('racing', 0.83157724991920212)] 60 | ``` 61 | 62 | ## Development 63 | Pull requests are welcome. 64 | 65 | When making changes to the `.pyx` extension files, you'll need to run `python setup.py cythonize` in order to produce the extension `.c` and `.cpp` files before running `pip install -e .`. 66 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = readme.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import platform 4 | import subprocess 5 | import sys 6 | 7 | from setuptools import Command, Extension, setup, find_packages 8 | from setuptools.command.test import test as TestCommand 9 | 10 | 11 | def define_extensions(cythonize=False): 12 | 13 | compile_args = ['-fopenmp', 14 | '-ffast-math'] 15 | 16 | # There are problems with illegal ASM instructions 17 | # when using the Anaconda distribution (at least on OSX). 18 | # This could be because Anaconda uses its own assembler? 19 | # To work around this we do not add -march=native if we 20 | # know we're dealing with Anaconda. 21 | if 'anaconda' not in sys.version.lower(): 22 | compile_args.append('-march=native') 23 | 24 | if cythonize: 25 | glove_cython = "glove/glove_cython.pyx" 26 | glove_metrics = "glove/metrics/accuracy_cython.pyx" 27 | glove_corpus = "glove/corpus_cython.pyx" 28 | else: 29 | glove_cython = "glove/glove_cython.c" 30 | glove_metrics = "glove/metrics/accuracy_cython.c" 31 | glove_corpus = "glove/corpus_cython.cpp" 32 | 33 | return [Extension("glove.glove_cython", [glove_cython], 34 | extra_link_args=["-fopenmp"], 35 | extra_compile_args=compile_args), 36 | Extension("glove.metrics.accuracy_cython", 37 | [glove_metrics], 38 | extra_link_args=["-fopenmp"], 39 | extra_compile_args=compile_args), 40 | Extension("glove.corpus_cython", [glove_corpus], 41 | language='C++', 42 | libraries=["stdc++"], 43 | extra_link_args=compile_args, 44 | extra_compile_args=compile_args)] 45 | 46 | 47 | def set_gcc(): 48 | """ 49 | Try to find and use GCC on OSX for OpenMP support.
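
    Scans common MacPorts and Homebrew gcc install paths and points
    the CC environment variable at the newest binary found.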
50 | """ 51 | 52 | # For macports and homebrew 53 | patterns = ['/opt/local/bin/gcc-mp-[0-9].[0-9]', 54 | '/opt/local/bin/gcc-mp-[0-9]', 55 | '/usr/local/bin/gcc-[0-9].[0-9]', 56 | '/usr/local/bin/gcc-[0-9]'] 57 | 58 | if 'darwin' in platform.platform().lower(): 59 | 60 | gcc_binaries = [] 61 | for pattern in patterns: 62 | gcc_binaries += glob.glob(pattern) 63 | gcc_binaries.sort() 64 | 65 | if gcc_binaries: 66 | _, gcc = os.path.split(gcc_binaries[-1]) 67 | os.environ["CC"] = gcc 68 | 69 | else: 70 | raise Exception('No GCC available. Install gcc from Homebrew ' 71 | 'using brew install gcc.') 72 | 73 | 74 | class Cythonize(Command): 75 | """ 76 | Compile the extension .pyx files. 77 | """ 78 | 79 | user_options = [] 80 | 81 | def initialize_options(self): 82 | pass 83 | 84 | def finalize_options(self): 85 | pass 86 | 87 | def run(self): 88 | 89 | import Cython 90 | from Cython.Build import cythonize 91 | 92 | cythonize(define_extensions(cythonize=True)) 93 | 94 | 95 | class Clean(Command): 96 | """ 97 | Clean build files. 98 | """ 99 | 100 | user_options = [] 101 | 102 | def initialize_options(self): 103 | pass 104 | 105 | def finalize_options(self): 106 | pass 107 | 108 | def run(self): 109 | 110 | pth = os.path.dirname(os.path.abspath(__file__)) 111 | 112 | subprocess.call(['rm', '-rf', os.path.join(pth, 'build')]) 113 | subprocess.call(['rm', '-rf', os.path.join(pth, '*.egg-info')]) 114 | subprocess.call(['find', pth, '-name', '*.pyc', '-type', 'f', '-delete']) 115 | subprocess.call(['rm', os.path.join(pth, 'glove', 'corpus_cython.so')]) 116 | subprocess.call(['rm', os.path.join(pth, 'glove', 'glove_cython.so')]) 117 | 118 | 119 | class PyTest(TestCommand): 120 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 121 | 122 | def initialize_options(self): 123 | TestCommand.initialize_options(self) 124 | self.pytest_args = ['tests/'] 125 | 126 | def finalize_options(self): 127 | TestCommand.finalize_options(self) 128 | self.test_args = [] 129 | self.test_suite = True 130 | 131 | def run_tests(self): 132 | # import here, cause outside the eggs aren't loaded 133 | import pytest 134 | errno = pytest.main(self.pytest_args) 135 | sys.exit(errno) 136 | 137 | 138 | setup( 139 | name='glove_python', 140 | version='0.1.0', 141 | description=('Python implementation of Global Vectors ' 142 | 'for Word Representation (GloVe)'), 143 | long_description='', 144 | packages=find_packages(), 145 | install_requires=['numpy', 'scipy'], 146 | tests_require=['pytest'], 147 | cmdclass={'test': PyTest, 'cythonize': Cythonize, 'clean': Clean}, 148 | author='Maciej Kula', 149 | url='https://github.com/maciejkula/glove-python', 150 | download_url='https://github.com/maciejkula/glove-python/tarball/0.1.0', 151 | license='Apache 2.0', 152 | classifiers=['Development Status :: 3 - Alpha', 153 | 'License :: OSI Approved :: Apache Software License', 154 | 'Topic :: Scientific/Engineering :: Artificial Intelligence'], 155 | ext_modules=define_extensions() 156 | ) 157 | -------------------------------------------------------------------------------- /tests/stanford_test.txt: -------------------------------------------------------------------------------- 1 | 0.62415 0.62476 -0.082335 0.20101 -0.13741 -0.11431 0.77909 2.6356 -0.46351 0.57465 -0.024888 -0.015466 -2.9696 -0.49876 0.095034 -0.94879 -0.017336 -0.86349 -1.3348 0.046811 0.36999 -0.57663 -0.48469 0.40078 0.75345 2 | . 
0.69586 -1.1469 -0.41797 -0.022311 -0.023801 0.82358 1.2228 1.741 -0.90979 1.3725 0.1153 -0.63906 -3.2252 0.61269 0.33544 -0.57058 -0.50861 -0.16575 -0.98153 -0.8213 0.24333 -0.14482 -0.67877 0.7061 0.40833 3 | : 1.1242 0.054519 -0.037362 0.10046 0.11923 -0.30009 1.0938 2.537 -0.072802 1.0491 1.0931 0.066084 -2.7036 -0.14391 -0.22031 -0.99347 -0.65072 -0.030948 -1.0817 -0.64701 0.32341 -0.41612 -0.5268 -0.047166 0.71549 4 | rt 0.74056 0.9155 -0.16352 0.35843 0.05266 0.1456 1.0421 2.8073 0.12865 1.0492 0.13033 0.20508 -2.6686 -0.50551 -0.29574 -0.91433 -0.40456 -1.0988 -1.0333 -0.17875 0.37979 -0.25922 -0.74854 0.36001 0.61206 5 | , 0.84705 -1.0349 -0.050419 0.27164 -0.58659 0.99514 0.25267 1.6963 0.10313 0.80073 0.74655 -1.2667 -4.036 -0.22557 0.16322 -0.67015 -0.64812 0.010373 -0.71889 -0.74997 0.24862 0.10319 -1.1732 0.58196 0.33846 6 | 0.67867 -0.74651 -0.31831 -0.093681 0.062057 0.77956 1.5604 2.0332 -0.95379 1.2358 -0.081705 -0.42269 -2.5273 0.51772 0.29574 -0.76079 -0.57992 -0.51783 -1.1715 -0.53952 0.36752 -0.2758 -0.086496 1.0115 0.56436 7 | 0.18227 -0.29194 -1.3632 -1.201 0.084332 0.018943 1.3408 2.3866 -1.2761 0.39897 -0.16731 -0.52372 -3.5758 -0.25648 -0.5531 -0.62011 -0.71249 -0.12025 -0.91766 0.65635 -0.55258 -1.1655 0.10899 -1.6099 1.6189 8 | 1.3956 0.2892 0.48572 -1.1412 0.21461 1.0714 0.25408 2.1181 0.30252 0.75955 1.1299 -0.021373 -3.7757 0.89387 -0.71476 -1.6997 -0.42166 -0.12601 -1.2984 0.41689 -0.84993 -1.5199 0.40681 0.15024 0.26997 9 | 0.80384 -1.0366 -0.53877 -1.0806 0.84718 -0.36196 1.0065 1.3067 -0.61225 0.30781 0.46974 -0.23264 -3.3882 -0.46778 -0.55105 -1.6926 -0.78708 0.28378 -0.73638 0.10216 -0.18703 -2.133 -0.17787 -0.97788 1.394 10 | ! 0.4049 -0.87651 -0.23362 -0.34844 -0.097002 0.40895 1.6928 1.7058 -1.293 0.70091 -0.12498 -0.75998 -3.1586 0.14081 0.57255 -0.46097 -0.75721 -0.72414 -1.4071 -0.17224 0.0099324 -0.45711 0.074886 1.2035 1.1614 11 | i -0.26079 0.59108 0.61622 -0.70368 -0.85159 -0.23238 1.0481 0.066642 -0.54907 0.70047 -0.87221 -0.013954 -5.9671 -0.43106 -0.9154 0.53744 0.57099 -0.27181 -0.84178 -0.59682 0.4516 0.34097 0.076869 0.2284 0.2758 12 | a 0.21294 0.31035 0.17694 0.87498 0.067926 0.59171 -0.098218 1.5896 -0.428 -1.3655 -0.15278 -2.501 -5.5652 -0.10232 0.39577 0.1555 -0.55181 0.34671 -0.57379 -0.30717 0.043623 -0.39707 0.64551 -0.33537 0.020467 13 | " 1.0822 -0.59378 -0.19992 0.66626 0.18051 0.014404 1.4227 2.3584 -0.2701 1.4194 0.61099 -0.29541 -2.8885 -0.070205 -0.038122 -0.50855 -0.4445 0.076176 -0.96879 -0.57778 0.39206 0.20976 -0.73835 0.031611 0.72533 14 | the -0.010167 0.020194 0.21473 0.17289 -0.43659 -0.14687 1.8429 -0.15753 0.18187 -0.31782 0.06839 0.51776 -6.3371 0.48066 0.13777 -0.48568 0.39 -0.0019506 -0.10218 0.21262 -0.86146 0.17263 0.18783 -0.8425 -0.31208 15 | ? 
1.104 -0.34629 0.088792 -0.2554 -0.023462 0.51487 0.7491 1.7858 0.16928 0.93679 0.010994 -0.98983 -3.7061 -0.82598 0.90447 -0.41301 -0.617 -0.62424 -1.1698 -0.022587 0.26791 -0.076523 -1.1142 1.336 0.20145 16 | you -0.41586 0.32548 -0.087621 0.2018 -0.80017 -0.34418 2.1431 0.37188 -0.9409 0.24283 -0.86396 0.63858 -6.0171 -0.54081 -0.43305 0.095707 0.37971 -1.1432 0.11382 -0.38361 0.41758 0.081476 -0.02659 0.75438 -0.77178 17 | to 0.28228 0.019558 0.11509 -0.39242 -1.0503 -0.54278 1.1357 -0.34251 0.80636 -0.47359 -0.77194 -0.73689 -6.2619 -0.34902 -0.35532 -0.60148 -0.054534 -0.67057 -0.39972 -1.324 -0.43765 0.30045 0.2143 0.25422 -0.26674 18 | ( 0.026645 -0.15996 -0.13042 0.32999 0.24416 0.41042 1.3001 2.6126 0.70933 0.91401 0.21455 0.2219 -2.6304 -0.11566 -0.32597 -2.167 -1.0084 0.43317 -0.85766 -0.20587 -0.037961 -1.5767 0.15105 0.24585 1.1149 19 | 0.82488 -0.3125 -1.2156 -1.0703 -0.26568 -0.2475 2.1968 2.179 -0.37712 1.3096 0.51299 0.68645 -2.0813 -0.052276 -0.4715 -1.7417 -1.3162 -0.32637 -0.78276 0.50433 0.078971 -2.1496 0.63889 -0.57727 1.4871 20 | 0.23809 -0.09146 0.15923 -0.018792 0.12084 0.68245 1.3484 2.7759 -0.78706 0.85131 -0.95748 -0.34804 -2.0028 -0.51581 0.15512 -1.2631 -0.48455 -1.1553 -1.7698 0.39001 0.76965 0.24155 0.84985 2.0607 0.85529 21 | ) 0.34127 -0.43348 -0.35918 -0.15297 -0.078167 0.064745 1.3919 2.2717 0.53841 1.2455 0.65984 0.549 -2.6548 0.00082321 -0.38957 -1.952 -1.1767 0.2034 -0.91539 0.09037 0.16488 -1.8562 0.13423 0.57839 1.074 22 | me 0.58866 0.0060408 -0.22022 1.0119 -0.7583 0.12081 -0.025355 1.596 -1.521 -1.1867 -0.42468 -2.0128 -5.3977 -1.2343 0.17889 1.3491 -0.011538 -0.063358 -0.18676 -0.18863 0.81819 -0.33465 1.6392 1.4183 0.16919 23 | de 1.423 -0.46838 -0.16331 1.2443 1.0157 1.1604 -2.0031 2.5195 -0.47779 -1.8382 0.32809 -3.2301 -3.0671 -0.2536 0.87798 0.3083 -0.88685 1.2904 -1.3443 1.1462 -0.026837 -0.449 -0.006978 -0.73663 1.6404 24 | 0.15671 -0.024377 -0.04252 -0.22052 -0.21045 -0.15969 0.70284 2.2709 -0.91873 1.5789 -0.25527 -0.63909 -2.944 -0.042341 0.12256 -1.0834 0.44036 -0.60795 -1.611 0.40592 -0.37838 -0.1601 -1.0792 1.8263 0.55963 25 | ! 
0.98004 0.38132 0.29754 -0.30478 0.40033 0.31853 1.8654 0.48166 0.56297 -0.82152 -1.2386 1.4854 -1.7059 0.093414 -2.4631 -1.365 -0.85534 0.20354 0.14737 2.1994 0.1779 -2.8695 2.1895 1.6762 2.3846 26 | que 1.8163 -0.9435 -0.6624 1.0099 0.031072 0.33463 -0.95627 2.9703 -0.54155 -2.4489 0.29555 -3.9631 -2.5559 -0.5695 1.0982 0.73903 -1.1868 0.5865 -0.45852 0.49212 0.87361 0.14368 0.64574 0.86255 0.4955 27 | and -0.81216 -0.28605 0.062502 -0.036869 -0.61118 -0.15568 1.625 -0.42602 0.1973 -0.19418 0.53267 0.64592 -6.1336 -0.3309 -0.0017279 -0.15173 0.20383 -0.77496 0.17629 -0.10884 -0.31234 0.2401 -0.36097 -0.049996 -0.7247 28 | 。 0.97257 1.2053 0.65594 0.7481 0.21479 0.030439 0.92262 1.9799 0.13767 -0.13729 -1.0628 1.475 -2.2513 0.92853 -2.5643 -1.6855 0.41263 1.0644 1.5803 1.2377 -0.13871 -3.2106 0.84125 0.18398 1.9644 29 | - 0.7717 -1.0602 -0.34383 -0.09264 0.031247 0.10274 1.1822 2.0774 0.20992 0.88188 0.65696 0.041836 -3.3736 -0.71065 0.041693 -1.535 -0.55627 0.64587 -0.7243 -0.399 -0.31172 -0.58834 -0.11027 -0.067876 1.0723 30 | my -0.74175 0.54942 0.6749 0.67924 0.13115 -0.2858 1.9227 0.11975 -0.62351 0.39304 -0.87884 0.39575 -5.9879 -0.49659 -0.26535 -0.04049 1.1247 -0.75211 -0.38015 0.49567 -0.53343 0.056762 0.69697 0.53384 -0.70807 31 | no 1.2722 0.22154 0.1395 0.50897 0.11663 0.10291 0.21448 2.2064 -0.5623 -1.0633 0.039293 -2.9371 -4.5097 -0.51896 1.0116 0.14003 -0.32955 0.076449 1.1712 -0.66266 0.53255 0.072198 1.184 0.95736 0.20746 32 | 、 0.85424 0.25535 0.80356 -0.043311 -0.062442 0.71904 0.91777 1.8196 0.93903 -0.041324 -0.27476 2.223 -1.9891 0.60383 -3.2757 -1.4099 1.3996 1.067 0.26134 1.3922 1.2872 -2.578 0.86285 0.53713 2.3683 33 | is -0.12532 -0.20207 -0.12672 -0.57474 -0.30313 -0.029884 1.1792 -0.1491 -0.71315 -0.12112 0.40652 1.4784 -5.995 -0.21617 0.47806 0.43448 0.13489 0.88961 -0.56926 0.33094 0.13661 0.65844 -0.41766 0.25164 -0.055809 34 | it 0.16758 0.21434 -0.093086 0.16379 -0.60001 -0.037103 1.8577 -0.24306 -0.44864 0.28734 -0.43609 1.0839 -6.0385 -0.14872 0.31843 0.08263 0.47562 -0.5009 -0.099384 -0.18034 -0.10614 0.15238 0.32532 0.73795 -0.40859 35 | … 0.84691 -0.27254 -0.46382 0.4686 0.77397 0.95429 0.566 2.0054 -0.79725 0.46677 -0.5743 0.55761 -2.5497 0.78614 -0.58253 -1.1329 -0.60002 0.10997 -0.66984 -0.43048 1.2045 -1.8733 0.27826 0.24504 0.83927 36 | in -0.32929 -0.16037 0.10785 -0.3961 -0.48827 -0.17528 0.23056 -0.49115 -0.065798 0.84382 0.38091 0.46377 -5.9545 0.57595 -0.18242 0.36494 -0.0042541 0.96687 -1.5674 -0.40454 -0.79557 -0.0050535 0.021972 -0.73638 0.65277 37 | n 0.53229 -0.30423 -0.6065 -0.15941 0.52165 -0.065076 1.3758 2.4098 -1.033 0.73698 -0.40591 -0.18263 -2.7087 0.28421 -1.8023 -1.4446 -1.4078 -0.20802 -0.94007 -0.10846 0.047255 -0.85601 0.94209 0.34083 0.66958 38 | for -0.21749 0.45183 -0.23211 -0.27781 -0.067977 -0.63951 1.1218 -0.37536 0.18676 -0.50864 0.016423 -0.13329 -5.903 0.14596 -0.067031 -0.66199 -0.17362 -0.87281 -0.49771 -0.55289 -1.0515 -0.18484 -0.30848 -0.04478 -0.24358 39 | / 0.26362 -0.5406 -0.30588 -0.42799 0.4699 0.95839 1.316 2.1394 0.33286 0.44204 0.82635 0.40852 -3.0257 -0.57578 -0.6385 -1.4769 -0.94778 0.17089 -0.98404 -0.36094 -0.32045 -0.99178 0.47383 0.49367 0.86134 40 | of 0.32543 -0.089637 -0.14733 0.4285 -0.092613 -0.17938 1.2835 -0.59714 -0.28134 -0.048954 0.54827 0.6941 -6.12 0.6724 0.018078 -0.24165 0.50342 0.65325 -0.20674 0.27639 -0.79097 0.10432 -0.6175 -0.54592 -0.069893 41 | la 0.14261 -0.2807 -0.1258 0.45119 0.17715 0.919 -1.0333 3.8694 0.41688 -0.67503 -0.020023 -3.0843 -2.9433 0.81947 1.6001 1.7433 
0.21815 0.88131 -0.97446 1.3757 1.0597 -1.1426 0.29684 -0.84936 -0.012225 42 | 's -0.21143 -0.16532 0.42022 -0.28705 -0.20637 -0.3565 1.3455 0.21057 -0.14089 -0.66701 0.54621 0.62353 -5.8742 -0.2406 0.19936 -0.61046 0.034438 0.23217 -0.66933 0.28861 0.2833 0.37067 -0.092633 -0.092978 -0.30293 43 | * -0.0010152 -0.39872 0.18076 0.65818 -0.27114 0.15688 1.7757 2.4463 0.10599 1.254 0.11974 -0.057964 -2.2856 -1.187 -0.99771 -0.43614 -0.71053 -0.26881 -1.1703 -0.23262 -0.16339 -0.48445 0.84881 1.7162 0.76949 44 | do 1.6477 0.12903 0.76911 -0.030854 0.27506 -0.49298 0.8206 -0.12559 0.57068 -0.79898 -1.6912 -1.7656 -5.3186 -1.2511 -0.73013 -0.90697 -1.0932 -0.53634 0.17967 -1.6247 -0.0029176 1.3184 0.45594 0.3682 0.87591 45 | n't 0.31872 0.52105 -0.056364 -0.34805 -0.77221 -0.28169 2.06 -0.56607 -0.32574 0.073742 -0.46097 0.54654 -6.0364 -0.56174 -0.084994 0.34263 0.1017 -0.82377 0.14404 -0.73248 0.63707 0.82175 0.53894 0.3674 -0.25653 46 | that 0.20823 0.22476 -0.070949 0.23917 -0.36076 -0.23443 1.8633 -0.4573 -0.40894 -0.055079 -0.11599 1.0568 -6.2614 -0.24912 0.37123 0.21891 0.67926 -0.35585 0.18441 -0.11821 0.58806 0.59916 0.40883 0.15874 -0.55338 47 | on 0.21228 -0.2435 -0.57013 0.33778 -0.86072 -0.1771 0.86891 -0.11103 0.53467 -0.0036497 0.11068 0.44655 -5.6486 -0.033026 0.36245 0.74407 -0.16614 -0.61851 -1.8327 0.51321 -0.31933 -0.68438 0.59145 -0.55647 -0.31049 48 | y 0.21767 0.19018 -0.27414 0.69654 0.12748 0.83719 -1.2804 3.8718 -0.96223 -1.1195 0.985 -2.4167 -2.9994 0.017624 1.3301 0.48203 -0.19386 0.09823 1.2263 0.80212 0.48846 -0.98181 0.53096 0.4342 0.67874 49 | ' 0.44205 -0.67697 -0.079938 0.89579 -0.043245 0.35863 0.51735 1.433 -0.21658 0.93923 0.36207 -0.27295 -3.4128 0.46583 -0.87769 -0.42464 -1.3648 0.43996 -2.4477 -0.23733 0.42426 0.18637 -0.19753 0.26109 0.44809 50 | e 1.2775 -0.29531 0.57591 -0.42937 0.12591 -0.81734 -0.70924 1.6399 0.52233 -0.53468 -0.85909 -2.6308 -3.5076 -0.60357 -1.8392 -0.43235 -2.3822 0.14332 -1.2133 -0.33507 -0.75788 0.58005 0.38244 0.29998 0.66194 51 | o 1.3635 0.04007 0.82928 1.0005 0.51972 0.067199 -0.39481 1.9431 0.050642 -0.139 -0.67126 -2.5206 -3.1961 -0.67686 -0.75984 -1.5995 -1.178 -0.014596 -0.88455 -0.48474 0.21677 0.41828 0.24096 0.83342 1.1436 52 | u 0.4532 0.95597 -0.15188 -0.76201 -0.44016 0.031701 0.28219 1.9646 -0.64687 0.48962 -1.0939 0.0013035 -4.7503 0.14003 -1.4439 -0.0030859 -0.47395 -0.51914 -0.5441 0.48264 -0.076634 -0.064356 0.14666 0.48416 0.11541 53 | en 0.69564 -0.48183 -0.080013 0.33814 0.37114 1.3014 -2.2778 2.8792 -1.5228 -0.8133 1.0348 -2.0311 -2.8536 0.61582 1.254 1.0985 0.46716 1.5167 -1.086 0.94948 0.401 -1.4937 0.25163 -0.9494 1.5508 54 | this -0.17895 0.38406 0.073035 -0.32363 -0.092441 -0.40767 2.1 -0.11363 -0.58784 -0.17034 -0.6433 0.72388 -5.7839 -0.10406 0.52152 -0.11314 0.59554 -0.47587 -0.4551 0.084431 -0.4582 -0.16727 0.54594 0.035478 -0.16073 55 | el 0.11335 0.59796 0.38876 0.83878 0.83717 0.24276 -1.4375 4.2874 -0.51924 -1.0783 1.0522 -2.5613 -2.5292 0.47829 2.1483 0.013401 0.58133 0.6934 0.296 -0.012265 0.37054 -0.71858 1.6555 -0.60154 0.86965 56 | so 0.39543 -0.60706 0.34448 -0.93783 -0.30466 0.46151 1.5214 0.070674 -0.36075 0.029852 -1.1005 -0.053799 -5.079 -0.73424 -0.40314 -0.10083 -0.0022164 -0.47121 -0.88651 -1.1737 0.22514 0.87842 -0.10534 1.27 -0.22951 57 | be -0.3435 1.0138 -0.039231 -0.61739 -0.13 0.65973 1.1861 -0.117 -0.61421 0.39945 -0.33834 0.54643 -5.4199 0.31714 -0.62972 -0.49683 0.38104 -0.52959 -0.51274 -0.88274 0.524 1.032 -0.62416 0.12028 -0.10696 58 | 'm -0.60745 0.40046 0.72375 
-0.51941 -0.60935 0.49649 1.7686 -0.48596 -0.35951 0.68387 -0.94178 0.2865 -5.0159 -0.46631 -0.36731 -0.46284 0.48544 -0.32663 -0.82502 -0.88353 0.89693 0.12028 0.030383 -0.043274 0.64118 59 | with -0.9476 0.32533 0.23967 0.29609 -0.098118 -0.10892 1.3503 -0.014157 0.15739 0.13604 -0.06848 0.68701 -5.6666 -0.41398 0.22936 -0.3325 0.49592 -0.74203 -0.032459 0.40253 -1.0907 -0.11469 -0.25527 -0.40069 -0.47669 60 | just -0.35518 0.4803 0.49681 -0.76379 -0.64588 0.083208 1.7889 -0.3547 -0.4497 0.022174 0.18026 0.81539 -5.7024 -0.75963 -0.066477 0.52203 0.50433 -0.42471 -0.37414 -0.67191 0.48804 0.43652 0.29954 0.43855 -0.40954 61 | > 0.67403 -0.1364 -0.080726 -0.89577 0.33463 0.21482 1.0867 2.0046 -0.61399 0.25588 -0.36965 -0.28485 -3.3158 -0.092295 -1.1617 -0.87728 -1.5146 -0.65678 -1.0679 0.61796 -0.61208 -0.614 0.39864 0.41253 0.56256 62 | your -0.48666 0.32014 -0.27703 1.0928 0.45773 -0.37923 1.9602 -0.35099 -0.34286 0.26806 -0.49852 0.34517 -6.0877 -0.18907 -0.45247 0.026142 0.56855 -0.64653 0.21283 -0.15051 -0.59593 -0.31407 0.2869 0.27501 -1.4474 63 | ^ 0.039588 -0.11419 -0.076219 -0.12791 -0.33003 0.43771 1.8312 1.9416 -0.86341 0.62057 -0.56913 0.3571 -1.7911 -0.40879 -0.53269 -1.4991 -1.053 -0.7939 -1.8975 0.5277 -0.37142 -1.7164 -0.055501 1.9065 0.92442 64 | like 0.068004 0.10737 0.61292 0.35446 -0.28576 0.44095 1.7574 -0.0079057 -0.66561 0.20433 -0.51421 0.46797 -5.349 -0.99746 -0.12069 0.11433 0.37355 -0.97219 -0.089747 -0.14982 0.34141 0.58987 0.51226 -0.06509 -0.068817 65 | have -0.058224 0.79651 -0.060888 -0.58459 -0.56228 -0.072496 1.8592 -0.42394 -0.56051 -0.057096 0.44406 0.31399 -5.8856 0.061965 -0.20702 -0.046447 0.75287 -0.58468 0.25053 -0.72399 -0.13122 0.37822 -0.17631 0.43037 -0.45315 66 | te 0.81979 -0.44008 -1.1408 1.1369 -0.71241 0.030568 -1.3383 2.9954 -1.413 -1.1037 0.57031 -2.9081 -2.3991 -1.1408 -0.15021 1.0955 -0.38832 0.88401 -0.92539 0.41138 0.77758 -0.91392 1.165 1.9987 0.51215 67 | at -1.0206 -0.12834 1.0937 -0.74474 -0.51548 -0.50633 1.0489 -0.18161 0.86851 0.38252 -0.34213 -0.079467 -4.9305 0.91787 0.046519 -0.83393 0.77692 -0.70471 -0.99819 -0.73907 -0.57397 -0.2561 -0.24104 -0.58805 0.48638 68 | ? 
0.62413 0.77774 0.50666 -1.0425 0.69918 0.72191 0.91422 1.2456 0.21374 0.56368 -2.2575 2.2898 -1.8461 -0.57553 -2.3108 -1.4953 -0.80619 1.9734 -0.28921 2.0816 1.9227 -2.8119 0.46788 0.94142 0.99366 69 | love -0.62645 -0.082389 0.070538 0.5782 -0.87199 -0.14816 2.2315 0.98573 -1.3154 -0.34921 -0.8847 0.14585 -4.97 -0.73369 -0.94359 0.035859 -0.026733 -0.77538 -0.30014 0.48853 -0.16678 -0.016651 -0.53164 0.64236 -0.10922 70 | se 1.5771 -0.89176 -0.62529 0.42748 -0.83033 -0.064147 -1.6374 3.2134 0.12522 -1.2291 0.33955 -3.5723 -2.9215 -0.14489 0.10353 1.2451 -1.1518 0.27602 -0.26116 0.098035 0.73908 0.092399 0.80997 0.44311 0.62663 71 | are 0.1866 -0.098326 -0.12268 -0.93822 -0.40161 0.6383 1.6686 -0.68036 -0.98359 -0.079512 0.38078 0.039076 -5.4147 0.02829 -0.47007 0.11377 -0.52725 -0.79312 0.58203 -0.61829 0.37025 0.2261 -0.73014 -0.1019 -0.21382 72 | < 0.58806 -0.15251 -0.064951 -0.88166 0.15004 -0.19889 1.1618 2.0507 -0.91723 0.38324 -0.60172 0.085248 -3.0157 0.3511 -1.1353 -0.77262 -1.6993 -0.61297 -1.0837 0.67306 -0.41953 -0.76792 0.64669 0.56938 0.53808 73 | m 0.46768 0.24442 -0.79763 -0.74 -0.41127 0.26882 -0.21232 2.257 -0.3162 -0.17906 -0.15279 -0.50826 -4.0466 0.57889 -1.1724 0.031238 -1.1387 0.039347 -1.32 1.2395 -0.56249 -1.0717 0.5102 -0.021507 0.24223 74 | r 0.55525 0.71333 -0.49285 -0.90042 0.043085 0.11183 0.11446 2.2269 -0.69619 0.064417 -0.39974 -0.43526 -3.9523 0.35152 -1.7713 -0.77199 -1.4343 -0.39785 -0.72236 0.86474 -0.18005 -0.72005 0.44048 -0.36406 0.3805 75 | if 0.18243 0.70534 -0.34209 -0.10779 -0.72721 -0.58802 1.7457 -0.13666 -0.61576 0.15336 -0.19019 0.70282 -5.725 -0.20901 -0.33692 0.16916 0.35872 -0.9871 0.45495 -0.36607 0.62973 0.11066 0.31315 0.08787 -0.88679 76 | all -0.18232 0.96997 0.32174 -0.074793 -0.11618 0.095187 1.4986 -0.41357 -0.86298 0.42067 -0.53125 0.10797 -5.56 0.25913 -0.58389 0.12241 -0.15168 -0.71983 -0.16605 -0.092514 -0.34099 0.12464 0.086193 -0.1411 -0.18656 77 | b 0.50344 0.61831 -0.24684 -0.93508 -0.11275 0.22688 0.15222 2.3432 -0.24032 0.12045 -0.60827 -0.25307 -4.0088 0.42389 -1.2794 -0.72646 -1.152 -0.36694 -0.77069 0.80603 -0.52068 -0.40503 0.23572 0.12789 0.21919 78 | ・ 1.4696 -1.1562 -1.2549 -0.045791 0.26885 1.3117 0.093971 2.056 1.8101 -0.78516 -1.127 2.8487 -2.1782 0.14884 -2.6502 -1.9661 -0.32747 1.1759 0.82536 1.7901 0.36039 -2.2292 1.1782 -0.17536 1.0635 79 | not 0.35377 0.32604 -0.22682 -0.32412 -0.18555 0.1486 1.3914 -0.65154 -0.38197 0.17129 -0.43405 0.39154 -5.7918 -0.20201 -0.23216 -0.10638 0.070835 -0.2146 -0.094385 -1.0851 0.61683 0.82184 -0.35102 0.19177 -0.43818 80 | but 0.17129 -0.012287 -0.32958 -0.16519 -0.54661 0.17683 1.7577 -0.63738 -0.47622 0.13892 -0.43254 0.789 -5.7212 0.068779 0.37439 0.51778 0.331 -0.74517 -0.20935 -0.21567 0.66493 0.52874 0.1066 0.47777 -0.48408 81 | we 0.49653 1.1435 -0.28609 -0.63378 -1.2347 -0.096415 0.81466 0.31406 -0.81064 0.43028 -0.10844 0.23407 -5.5359 0.045617 -0.15732 0.74855 0.35315 -0.11169 -0.12048 -0.13965 -0.27711 0.34342 -0.40377 0.61924 0.5394 82 | es 0.074369 -0.84513 -1.1513 0.2921 0.88949 0.8157 -0.71079 3.9407 -0.90175 -0.99754 1.1565 -2.4328 -1.9748 -0.28035 1.8613 0.84797 -0.055742 0.95519 -0.066362 -0.10171 1.2204 -0.12053 0.26966 0.22542 0.65419 83 | ya 0.18773 1.2677 0.031241 0.62269 -0.081441 0.68537 -0.36275 2.9738 -0.72967 1.035 0.41136 -0.99949 -2.8181 0.24364 0.51258 -0.51171 0.6693 -0.59732 -0.91591 0.23611 0.22535 0.42765 -0.7947 1.5532 0.1225 84 | & -0.47122 0.056607 -0.29293 0.14845 -0.38806 0.98729 0.70408 1.1875 -0.11685 -0.022892 0.77426 0.54195 
-4.7114 -0.50027 -0.44136 -0.29372 -0.31554 -0.58285 -0.1643 0.10002 -0.14504 -0.79716 -0.7261 0.2252 -0.17583 85 | follow 0.0033407 0.91533 -1.2207 -0.45773 -0.59682 -0.64277 1.0497 1.7013 -1.3643 0.3601 -0.25219 -0.14811 -3.8711 -1.0949 -1.3944 0.22114 -0.76684 -2.0907 -1.0099 0.20968 -0.27403 -1.8368 -0.4521 0.8915 -1.0369 86 | up -0.19555 0.8236 0.73457 0.12395 -0.21509 0.51058 1.1859 0.0058648 -0.18281 0.35138 -0.42667 0.66238 -5.3506 -0.066143 0.15166 -0.11217 0.25905 -1.2281 -0.63229 -0.49146 -0.2852 0.10816 1.0703 0.11823 -0.010115 87 | what 0.5292 0.34413 -0.055991 0.15468 -0.48548 -0.42618 2.0155 -0.082657 -0.40249 0.066622 -0.56097 0.61953 -5.6142 -0.40026 0.54612 -0.021634 0.46933 -0.33373 0.096102 0.095193 0.53975 0.33677 -0.28179 0.50744 -0.51784 88 | get -0.33344 1.2678 0.04472 -0.6707 -0.2079 0.12289 1.3696 -0.22981 -0.53645 -0.1833 -0.13394 0.63001 -5.6577 -0.66158 -0.26856 -0.1472 0.71835 -1.1591 -0.055771 -0.8732 -0.025146 0.3069 1.0787 0.38923 -0.028963 89 | lol 0.073266 0.069397 0.15877 -0.20805 -0.43151 -0.26647 1.3905 0.48968 -0.73748 0.78262 -1.1841 0.1629 -4.6268 -0.76179 0.80136 0.78043 0.39787 -0.9309 -1.41 0.32624 1.2311 0.42486 0.70665 0.76985 0.20826 90 | un -0.10621 0.057591 -0.752 1.3185 0.79391 0.56179 -0.85496 3.275 -0.45897 -0.35925 1.7636 -2.7374 -2.2786 0.11344 1.8262 0.91508 0.30052 0.35935 -2.1484 0.12763 0.17556 -0.79025 1.0724 -0.76144 0.37949 91 | ♥ -0.84469 -1.0147 -0.2948 0.50243 -0.032096 -0.25413 1.7008 2.7719 -2.0656 -0.016464 -0.10534 0.0078914 -1.8015 0.04799 -0.96726 -1.0318 -1.3104 -0.51772 -1.1396 0.53056 0.02979 -0.82598 0.19418 2.0574 0.50733 92 | lo 0.67317 0.46962 -1.0861 0.70384 -0.033311 0.0016168 -0.20346 4.6008 0.083431 0.087919 1.3443 -2.0923 -1.781 0.069451 1.2344 1.1293 -0.13422 -0.1458 -0.15684 0.087938 0.31166 0.20576 -0.015365 1.2647 -0.26626 93 | when -0.26148 0.2644 0.44876 0.1599 -0.47692 -0.31942 2.1561 -0.52634 -0.56854 0.28894 -0.3091 0.69452 -5.6008 -0.18411 0.075871 0.69657 0.35998 -0.64724 0.34793 -0.5933 0.57665 0.30556 0.48815 -0.11032 -0.78019 94 | was -0.16063 0.021235 0.95695 -1.0642 -0.42496 0.070767 0.88473 -0.38835 -0.96585 -0.032367 0.2431 1.4682 -5.7357 -0.31611 0.59085 0.33569 0.92369 1.0457 -0.96856 -0.30444 0.65033 0.94053 -0.003684 0.14969 0.38408 95 | “ 1.1393 0.31722 0.33893 0.59917 0.41805 0.26417 1.5744 2.0874 -0.38514 0.47434 0.3168 0.093502 -2.9127 -0.14853 -0.43053 -0.97776 -0.56379 0.31383 -0.67353 -0.63432 0.88654 -0.22208 -0.12107 -0.19183 0.6994 96 | ” 0.74133 -0.56073 0.0037826 0.25398 0.64676 -0.03136 1.5837 1.5481 -1.3584 0.62118 -0.14075 0.17492 -3.2719 -0.011897 -0.16072 -0.57518 -0.55753 0.22912 -0.49103 -0.25186 0.97739 -0.30327 0.39648 0.28217 1.0422 97 | one 0.39657 0.15653 0.50676 -0.039995 -0.1177 -0.011625 1.7677 0.33504 -0.84748 -0.27969 0.036325 -0.146 -5.2788 0.053348 -0.60437 0.26285 0.15334 -0.31598 -0.18437 -0.21645 -0.095925 -0.07569 0.18185 -0.18519 -0.33499 98 | por 0.98549 0.19405 0.77539 0.44123 0.58736 0.55549 -1.4343 2.9726 -1.1735 -1.2678 0.15086 -3.3556 -2.1731 -0.41532 1.1496 -0.46623 -1.331 0.71864 0.64299 -0.42066 0.51122 -0.81006 0.44971 0.087221 0.35618 99 | si 0.31329 -0.29282 -0.88699 0.45418 -0.77082 -0.55735 -0.23928 4.1561 -0.27757 0.1545 0.9518 -2.3782 -1.9816 -0.11735 1.1881 1.358 0.56444 0.24605 -0.98873 0.64185 0.35727 -0.078937 -0.28172 0.88359 -0.51221 100 | out -0.28653 0.60501 0.62592 -0.034889 -0.10508 0.063965 1.1527 -0.18502 -0.22128 0.29563 -0.061197 0.71973 -5.4451 -0.055855 0.078477 -0.0090364 0.32605 -0.90771 -0.53689 -0.34474 -0.3713 
-0.17721 0.87016 -0.15274 0.026154 101 | -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import array 3 | 4 | import pytest 5 | 6 | import numpy as np 7 | import scipy.sparse as sp 8 | 9 | from glove import Corpus 10 | from glove.glove import check_random_state 11 | 12 | from utils import (build_coocurrence_matrix, 13 | generate_training_corpus) 14 | 15 | 16 | def test_corpus_construction(): 17 | 18 | corpus_words = ['a', 'naïve', 'fox'] 19 | corpus = [corpus_words] 20 | 21 | model = Corpus() 22 | model.fit(corpus, window=10) 23 | 24 | for word in corpus_words: 25 | assert word in model.dictionary 26 | 27 | assert model.matrix.shape == (len(corpus_words), 28 | len(corpus_words)) 29 | 30 | expected = [[0.0, 1.0, 0.5], 31 | [0.0, 0.0, 1.0], 32 | [0.0, 0.0, 0.0]] 33 | 34 | assert (model.matrix.todense().tolist() 35 | == expected) 36 | 37 | 38 | def test_supplied_dictionary(): 39 | 40 | dictionary = {'a': 2, 41 | 'naïve': 1, 42 | 'fox': 0} 43 | 44 | corpus = [['a', 'naïve', 'fox']] 45 | 46 | model = Corpus(dictionary=dictionary) 47 | model.fit(corpus, window=10) 48 | 49 | assert model.dictionary == dictionary 50 | 51 | assert model.matrix.shape == (len(dictionary), 52 | len(dictionary)) 53 | 54 | assert (model.matrix.tocsr()[2]).sum() == 0 55 | 56 | 57 | def test_supplied_dict_checks(): 58 | 59 | dictionary = {'a': 4, 60 | 'naïve': 1, 61 | 'fox': 0} 62 | 63 | with pytest.raises(Exception): 64 | Corpus(dictionary=dictionary) 65 | 66 | 67 | def test_supplied_dict_missing(): 68 | 69 | dictionary = {'a': 1, 70 | 'naïve': 0} 71 | 72 | corpus = [['a', 'naïve', 'fox']] 73 | 74 | model = Corpus(dictionary=dictionary) 75 | 76 | with pytest.raises(KeyError): 77 | model.fit(corpus, window=10) 78 | 79 | 80 | def test_supplied_dict_missing_ignored(): 81 | 82 | dictionary = {'a': 0, 83 | 'fox': 1} 84 | 85 | corpus = [['a', 'naïve', 'fox']] 86 | 87 | model = Corpus(dictionary=dictionary) 88 | model.fit(corpus, window=10, ignore_missing=True) 89 | 90 | assert model.dictionary == dictionary 91 | 92 | assert model.matrix.shape == (len(dictionary), 93 | len(dictionary)) 94 | 95 | # Ensure that context windows and context window 96 | # weights are preserved. 
97 | full_model = Corpus() 98 | full_model.fit(corpus, window=10) 99 | 100 | assert (full_model.matrix.todense()[0, 2] 101 | == model.matrix.todense()[0, 1] 102 | == 0.5) 103 | 104 | 105 | def test_large_corpus_construction(): 106 | 107 | num_sentences = 5000 108 | seed = 10 109 | 110 | corpus = Corpus() 111 | 112 | corpus.fit(generate_training_corpus(num_sentences, seed=seed)) 113 | 114 | matrix = corpus.matrix.tocsr().tocoo() 115 | check_matrix = build_coocurrence_matrix(generate_training_corpus(num_sentences, 116 | seed=seed)) 117 | 118 | assert (matrix.row == check_matrix.row).all() 119 | assert (matrix.col == check_matrix.col).all() 120 | assert np.allclose(matrix.data, check_matrix.data) 121 | assert (matrix.data > 0).all() 122 | -------------------------------------------------------------------------------- /tests/test_glove.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from glove import Corpus, Glove 5 | 6 | from utils import generate_training_corpus 7 | 8 | 9 | def _reproduce_input_matrix(glove_model): 10 | 11 | wvec = glove_model.word_vectors 12 | wbias = glove_model.word_biases 13 | 14 | out = np.dot(wvec, wvec.T) 15 | 16 | for i in range(wvec.shape[0]): 17 | for j in range(wvec.shape[0]): 18 | if i == j: 19 | out[i, j] = 0.0 20 | elif i < j: 21 | out[i, j] += wbias[i] + wbias[j] 22 | else: 23 | out[i, j] = 0.0 24 | 25 | return np.asarray(out) 26 | 27 | 28 | def test_stanford_loading(): 29 | 30 | model = Glove.load_stanford('tests/stanford_test.txt') 31 | 32 | assert model.word_vectors is not None 33 | assert model.word_vectors.shape == (100, 25) 34 | assert len(model.dictionary) == 100 35 | 36 | # Python 2/3 compatibility. Check the ellipsis 37 | # character is in the dictionary. 
38 | try: 39 | # Python 2 40 | assert unichr(8230) in model.dictionary 41 | except NameError: 42 | # Python 3 43 | assert '…' in model.dictionary 44 | 45 | 46 | def test_fitting(): 47 | """ 48 | Verify that the squared error diminishes with fitting. 49 | """ 50 | 51 | num_sentences = 5000 52 | seed = 10 53 | 54 | corpus = Corpus() 55 | 56 | corpus.fit(generate_training_corpus(num_sentences, 57 | vocabulary_size=50, 58 | seed=seed)) 59 | 60 | # Check that the performance is poor without fitting 61 | glove_model = Glove(no_components=100, learning_rate=0.05) 62 | glove_model.fit(corpus.matrix, 63 | epochs=0, 64 | no_threads=2) 65 | 66 | log_cooc_mat = corpus.matrix.copy() 67 | log_cooc_mat.data = np.log(log_cooc_mat.data) 68 | log_cooc_mat = np.asarray(log_cooc_mat.todense()) 69 | 70 | repr_matrix = _reproduce_input_matrix(glove_model) 71 | 72 | assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0 73 | 74 | # Check that it is good with fitting 75 | glove_model = Glove(no_components=100, learning_rate=0.05) 76 | glove_model.fit(corpus.matrix, 77 | epochs=500, 78 | no_threads=2) 79 | 80 | repr_matrix = _reproduce_input_matrix(glove_model) 81 | 82 | assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0 83 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import array 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | 7 | from glove.glove import check_random_state 8 | 9 | 10 | def generate_training_corpus(num_sentences, 11 | vocabulary_size=30000, 12 | sentence_min_size=2, 13 | sentence_max_size=30, 14 | seed=None): 15 | 16 | rs = check_random_state(seed) 17 | 18 | for _ in range(num_sentences): 19 | sentence_size = rs.randint(sentence_min_size, 20 | sentence_max_size) 21 | yield [str(x) for x in 22 | rs.randint(0, vocabulary_size, sentence_size)] 23 | 24 | 25 | def build_coocurrence_matrix(sentences): 26 | 27 | dictionary = {} 28 | rows = [] 29 | cols = [] 30 | data = array.array('f') 31 | 32 | window = 10  # matches the default window used by Corpus.fit 33 | 34 | for sentence in sentences: 35 | for i, first_word in enumerate(sentence): 36 | first_word_idx = dictionary.setdefault(first_word, 37 | len(dictionary)) 38 | for j, second_word in enumerate(sentence[i:i + window + 1]): 39 | second_word_idx = dictionary.setdefault(second_word, 40 | len(dictionary)) 41 | 42 | distance = j  # weight is 1 / distance; equal-index pairs (incl. j == 0) are skipped below, avoiding division by zero 43 | 44 | if first_word_idx == second_word_idx: 45 | pass 46 | elif first_word_idx < second_word_idx: 47 | rows.append(first_word_idx) 48 | 49 | cols.append(second_word_idx) 50 | data.append(np.float32(1.0) / distance) 51 | else: 52 | rows.append(second_word_idx) 53 | cols.append(first_word_idx) 54 | data.append(np.float32(1.0) / distance) 55 | 56 | return sp.coo_matrix((data, (rows, cols)), 57 | shape=(len(dictionary), 58 | len(dictionary)), 59 | dtype=np.float32).tocsr().tocoo() 60 | --------------------------------------------------------------------------------