├── .gitignore
├── LICENSE
├── Makefile
├── bench
│   └── bench_corpus.py
├── changelog.md
├── circle.yml
├── examples
│   ├── analogy_tasks_evaluation.py
│   └── example.py
├── glove
│   ├── __init__.py
│   ├── corpus.py
│   ├── corpus_cython.cpp
│   ├── corpus_cython.pyx
│   ├── glove.py
│   ├── glove_cython.c
│   ├── glove_cython.pyx
│   └── metrics
│       ├── __init__.py
│       ├── accuracy.py
│       ├── accuracy_cython.c
│       └── accuracy_cython.pyx
├── readme.md
├── setup.cfg
├── setup.py
└── tests
    ├── stanford_test.txt
    ├── test_corpus.py
    ├── test_glove.py
    └── utils.py
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.*~ 3 | *.so 4 | dist/* 5 | *.egg* 6 | *.model 7 | *.corpus 8 | build/* 9 | Makefile~ 10 | *.npy 11 | *.bz2 12 | *#* 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof.
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2014 Maciej Kula 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext 3 | 4 | get-wiki: 5 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-meta-current1.xml-p000000010p000010000.bz2 6 | 7 | process-wiki: 8 | python -- examples/example.py -w -c enwiki-latest-pages-meta-current1.xml-p000000010p000010000.bz2 9 | 10 | train-wiki: 11 | python -i -- examples/example.py -t 30 -p 2 12 | 13 | all-wiki: get-wiki process-wiki train-wiki 14 | 15 | .PHONY: all get-wiki process-wiki train-wiki 16 | -------------------------------------------------------------------------------- /bench/bench_corpus.py: -------------------------------------------------------------------------------- 1 | import array 2 | import timeit 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | 7 | from glove import Corpus 8 | from glove.glove import check_random_state 9 | 10 | 11 | def generate_training_corpus(num_sentences, 12 | vocabulary_size=30000, 13 | sentence_min_size=2, 14 | sentence_max_size=30, 15 | seed=None): 16 | 17 | rs = check_random_state(seed) 18 | 19 | for _ in range(num_sentences): 20 | sentence_size = rs.randint(sentence_min_size, 21 | sentence_max_size) 22 | yield [str(x) for x in 23 | rs.randint(0, vocabulary_size, sentence_size)] 24 | 25 | 26 | def fit_corpus(corpus): 27 | 28 | model = Corpus() 29 | model.fit(corpus) 30 | 31 | return model 32 | 33 | 34 | if __name__ == '__main__': 35 | 36 | number = 10 37 | 38 | elapsed = timeit.timeit('fit_corpus(corpus)', 39 | setup=('from __main__ import generate_training_corpus;' 40 | 'from __main__ import fit_corpus;' 41 | 'corpus = list(generate_training_corpus(100000, seed=10))'), 42 | number=number) 43 | 44 | one_loop_time = elapsed / number 45 | 46 | print('Seconds per fit: %s' % one_loop_time) 47 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.1.0] - 2016-01-11 4 | ### Changed 5 | - add algorithm tests for corpus construction and model fitting 6 | - remove the dependency on Cython for installation; the required .c and .cpp files are now included 7 | - use py.test for testing 8 | - removed dependency on C++11 features by using a different sparse matrix structure for corpus construction 9 | - faster cooccurrence matrix construction 10 | 11 | ### Removed 12 | - max_map_size argument removed from Corpus.fit 13 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | pre: 3 | - pip install numpy 4 | - pip install scipy 5 | -------------------------------------------------------------------------------- /examples/analogy_tasks_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import defaultdict 3 | import numpy as np 4 | 5 | from glove import Glove, metrics 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | parser = argparse.ArgumentParser(description=('Evaluate a trained GloVe ' 11 | 'model on an analogy task.')) 12 | parser.add_argument('--test', '-t', action='store', 13 | required=True, 14 | help='The filename of the analogy test set.') 15 | parser.add_argument('--model', '-m', action='store', 16 | required=True, 17 | help='The filename of the stored GloVe model.')
18 | parser.add_argument('--encode', '-e', action='store_true', 19 | default=False, 20 | help=('If True, words from the ' 21 | 'evaluation set will be utf-8 encoded ' 22 | 'before looking them up in the ' 23 | 'model dictionary')) 24 | parser.add_argument('--parallelism', '-p', action='store', 25 | default=1, 26 | help=('Number of parallel threads to use')) 27 | 28 | args = parser.parse_args() 29 | 30 | # Load the GloVe model 31 | glove = Glove.load(args.model) 32 | 33 | 34 | if args.encode: 35 | encode = lambda words: [x.lower().encode('utf-8') for x in words] 36 | else: 37 | encode = lambda words: [x.lower() for x in words]  # tokens are already unicode on Python 3 38 | 39 | 40 | # Load the analogy task dataset. One example can be obtained at 41 | # https://word2vec.googlecode.com/svn/trunk/questions-words.txt 42 | sections = defaultdict(list) 43 | for section, words in metrics.read_analogy_file(args.test): 44 | sections[section].append(encode(words)) 45 | 46 | section_ranks = [] 47 | 48 | for section, words in sections.items(): 49 | evaluation_ids = metrics.construct_analogy_test_set(words, 50 | glove.dictionary, 51 | ignore_missing=True) 52 | 53 | # Get the rank array. 54 | ranks = metrics.analogy_rank_score(evaluation_ids, glove.word_vectors, 55 | no_threads=int(args.parallelism)) 56 | section_ranks.append(ranks) 57 | 58 | print('Section %s mean rank: %s, accuracy: %s' % (section, ranks.mean(), 59 | (ranks == 0).sum() / float(len(ranks)))) 60 | 61 | ranks = np.hstack(section_ranks) 62 | 63 | print('Overall rank: %s, accuracy: %s' % (ranks.mean(), 64 | (ranks == 0).sum() / float(len(ranks)))) 65 | 66 | -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import pprint 4 | import gensim 5 | 6 | from glove import Glove 7 | from glove import Corpus 8 | 9 | 10 | def read_corpus(filename): 11 | 12 | delchars = [chr(c) for c in range(256)] 13 | delchars = [x for x in delchars if not x.isalnum()] 14 | delchars.remove(' ') 15 | deltable = {ord(c): None for c in delchars}  # deletion table for str.translate (Python 3) 16 | 17 | with open(filename, 'r') as datafile: 18 | for line in datafile: 19 | yield line.lower().translate(deltable).split(' ') 20 | 21 | 22 | def read_wikipedia_corpus(filename): 23 | 24 | # We don't want to do a dictionary construction pass. 25 | corpus = gensim.corpora.WikiCorpus(filename, dictionary={}) 26 | 27 | for text in corpus.get_texts(): 28 | yield text 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | # Set up command line parameters. 34 | parser = argparse.ArgumentParser(description='Fit a GloVe model.') 35 | 36 | parser.add_argument('--create', '-c', action='store', 37 | default=None, 38 | help=('The filename of the corpus to pre-process. ' 39 | 'The pre-processed corpus will be saved ' 40 | 'and will be ready for training.')) 41 | parser.add_argument('--wiki', '-w', action='store_true', 42 | default=False, 43 | help=('Assume the corpus input file is in the ' 44 | 'Wikipedia dump format')) 45 | parser.add_argument('--train', '-t', action='store', 46 | default=0, 47 | help=('Train the GloVe model with this number of epochs.'
48 | ' If not supplied, ' 49 | 'we\'ll attempt to load a trained model.')) 50 | parser.add_argument('--parallelism', '-p', action='store', 51 | default=1, 52 | help=('Number of parallel threads to use for training')) 53 | parser.add_argument('--query', '-q', action='store', 54 | default='', 55 | help='Get the closest words to this word.') 56 | args = parser.parse_args() 57 | 58 | 59 | if args.create: 60 | # Build the corpus dictionary and the cooccurrence matrix. 61 | print('Pre-processing corpus') 62 | 63 | if args.wiki: 64 | print('Using wikipedia corpus') 65 | get_data = read_wikipedia_corpus 66 | else: 67 | get_data = read_corpus 68 | 69 | corpus_model = Corpus() 70 | corpus_model.fit(get_data(args.create), window=10) 71 | corpus_model.save('corpus.model') 72 | 73 | print('Dict size: %s' % len(corpus_model.dictionary)) 74 | print('Collocations: %s' % corpus_model.matrix.nnz) 75 | 76 | if args.train: 77 | # Train the GloVe model and save it to disk. 78 | 79 | if not args.create: 80 | # Try to load a corpus from disk. 81 | print('Reading corpus statistics') 82 | corpus_model = Corpus.load('corpus.model') 83 | 84 | print('Dict size: %s' % len(corpus_model.dictionary)) 85 | print('Collocations: %s' % corpus_model.matrix.nnz) 86 | 87 | print('Training the GloVe model') 88 | 89 | glove = Glove(no_components=100, learning_rate=0.05) 90 | glove.fit(corpus_model.matrix, epochs=int(args.train), 91 | no_threads=args.parallelism, verbose=True) 92 | glove.add_dictionary(corpus_model.dictionary) 93 | 94 | glove.save('glove.model') 95 | 96 | if args.query: 97 | # Finally, query the model for most similar words. 98 | if not args.train: 99 | print('Loading pre-trained GloVe model') 100 | glove = Glove.load('glove.model') 101 | 102 | print('Querying for %s' % args.query) 103 | pprint.pprint(glove.most_similar(args.query, number=10)) 104 | -------------------------------------------------------------------------------- /glove/__init__.py: -------------------------------------------------------------------------------- 1 | from .corpus import Corpus 2 | from .glove import Glove 3 | -------------------------------------------------------------------------------- /glove/corpus.py: -------------------------------------------------------------------------------- 1 | # Cooccurrence matrix construction tools 2 | # for fitting the GloVe model. 3 | import numpy as np 4 | try: 5 | # Python 2 compat 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | 10 | from .corpus_cython import construct_cooccurrence_matrix 11 | 12 | 13 | class Corpus(object): 14 | """ 15 | Class for constructing a cooccurrence matrix 16 | from a corpus. 17 | 18 | A dictionary mapping words to ids can optionally 19 | be supplied. If left None, it will be constructed 20 | from the corpus.
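
    A minimal usage sketch (the tokens here are made up):

        corpus = Corpus()
        corpus.fit([['dog', 'barks'], ['cat', 'meows']], window=2)
        corpus.matrix      # scipy.sparse COO cooccurrence matrix
        corpus.dictionary  # word -> id mapping built during fit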
21 | """ 22 | 23 | def __init__(self, dictionary=None): 24 | 25 | self.dictionary = {} 26 | self.dictionary_supplied = False 27 | self.matrix = None 28 | 29 | if dictionary is not None: 30 | self._check_dict(dictionary) 31 | self.dictionary = dictionary 32 | self.dictionary_supplied = True 33 | 34 | def _check_dict(self, dictionary): 35 | 36 | if (np.max(list(dictionary.values())) != (len(dictionary) - 1)): 37 | raise Exception('The largest id in the dictionary ' 38 | 'should be equal to its length minus one.') 39 | 40 | if np.min(list(dictionary.values())) != 0: 41 | raise Exception('Dictionary ids should start at zero') 42 | 43 | def fit(self, corpus, window=10, ignore_missing=False): 44 | """ 45 | Perform a pass through the corpus to construct 46 | the cooccurrence matrix. 47 | 48 | Parameters: 49 | - iterable of lists of strings corpus 50 | - int window: the length of the (symmetric) 51 | context window used for cooccurrence. 52 | - bool ignore_missing: whether to ignore words missing from 53 | the dictionary (if it was supplied). 54 | Context window distances will be preserved 55 | even if out-of-vocabulary words are 56 | ignored. 57 | If False, a KeyError is raised. 58 | """ 59 | 60 | self.matrix = construct_cooccurrence_matrix(corpus, 61 | self.dictionary, 62 | int(self.dictionary_supplied), 63 | int(window), 64 | int(ignore_missing)) 65 | 66 | def save(self, filename): 67 | 68 | with open(filename, 'wb') as savefile: 69 | pickle.dump((self.dictionary, self.matrix), 70 | savefile, 71 | protocol=pickle.HIGHEST_PROTOCOL) 72 | 73 | @classmethod 74 | def load(cls, filename): 75 | 76 | instance = cls() 77 | 78 | with open(filename, 'rb') as savefile: 79 | instance.dictionary, instance.matrix = pickle.load(savefile) 80 | 81 | return instance 82 | -------------------------------------------------------------------------------- /glove/corpus_cython.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | # distutils: language = c++ 3 | # cython: boundscheck=False, wraparound=False, nonecheck=False, cdivision=True 4 | 5 | import numpy as np 6 | import scipy.sparse as sp 7 | 8 | from libc.stdlib cimport malloc, free 9 | 10 | from cython.operator cimport dereference as deref 11 | from libcpp.vector cimport vector 12 | 13 | 14 | cdef inline int int_min(int a, int b) nogil: return a if a <= b else b 15 | 16 | 17 | cdef int binary_search(int* vec, int size, int first, int last, int x) nogil: 18 | """ 19 | Binary seach in an array of ints 20 | """ 21 | 22 | cdef int mid 23 | 24 | while (first < last): 25 | mid = (first + last) / 2 26 | if (vec[mid] == x): 27 | return mid 28 | elif vec[mid] > x: 29 | last = mid - 1 30 | else: 31 | first = mid + 1 32 | 33 | if (first == size): 34 | return first 35 | elif vec[first] > x: 36 | return first 37 | else: 38 | return first + 1 39 | 40 | 41 | cdef struct SparseRowMatrix: 42 | vector[vector[int]] *indices 43 | vector[vector[float]] *data 44 | 45 | 46 | cdef SparseRowMatrix* new_matrix(): 47 | """ 48 | Allocate and initialize a new matrix 49 | """ 50 | 51 | cdef SparseRowMatrix* mat 52 | 53 | mat = malloc(sizeof(SparseRowMatrix)) 54 | 55 | if mat == NULL: 56 | raise MemoryError() 57 | 58 | mat.indices = new vector[vector[int]]() 59 | mat.data = new vector[vector[float]]() 60 | 61 | return mat 62 | 63 | 64 | cdef void free_matrix(SparseRowMatrix* mat) nogil: 65 | """ 66 | Deallocate the data of a matrix 67 | """ 68 | 69 | cdef int i 70 | cdef int rows = mat.indices.size() 71 | 72 | for i in range(rows): 73 | 
73 | deref(mat.indices)[i].clear() 74 | deref(mat.data)[i].clear() 75 | 76 | del mat.indices 77 | del mat.data 78 | 79 | free(mat) 80 | 81 | 82 | cdef void increment_matrix(SparseRowMatrix* mat, int row, int col, float increment) nogil: 83 | """ 84 | Increment the (row, col) entry of mat by increment. 85 | """ 86 | 87 | cdef vector[int]* row_indices 88 | cdef vector[float]* row_data 89 | cdef int idx 90 | cdef int col_at_idx 91 | 92 | # Add new row if necessary 93 | while row >= mat.indices.size(): 94 | mat.indices.push_back(vector[int]()) 95 | mat.data.push_back(vector[float]()) 96 | 97 | row_indices = &(deref(mat.indices)[row]) 98 | row_data = &(deref(mat.data)[row]) 99 | 100 | # Find the column element, or the position where 101 | # a new element should be inserted 102 | if row_indices.size() == 0: 103 | idx = 0 104 | else: 105 | idx = binary_search(&(deref(row_indices)[0]), row_indices.size(), 106 | 0, row_indices.size(), col) 107 | 108 | # Element to be added at the end 109 | if idx == row_indices.size(): 110 | row_indices.insert(row_indices.begin() + idx, col) 111 | row_data.insert(row_data.begin() + idx, increment) 112 | return 113 | 114 | col_at_idx = deref(row_indices)[idx] 115 | 116 | if col_at_idx == col: 117 | # Element to be incremented 118 | deref(row_data)[idx] = deref(row_data)[idx] + increment 119 | else: 120 | # Element to be inserted 121 | row_indices.insert(row_indices.begin() + idx, col) 122 | row_data.insert(row_data.begin() + idx, increment) 123 | 124 | 125 | cdef int matrix_nnz(SparseRowMatrix* mat) nogil: 126 | """ 127 | Get the number of nonzero entries in mat 128 | """ 129 | 130 | cdef int i 131 | cdef int size = 0 132 | 133 | for i in range(mat.indices.size()): 134 | size += deref(mat.indices)[i].size() 135 | 136 | return size 137 | 138 | 139 | cdef matrix_to_coo(SparseRowMatrix* mat, int shape): 140 | """ 141 | Convert to a shape by shape COO matrix. 142 | """ 143 | 144 | cdef int i, j 145 | cdef int row 146 | cdef int col 147 | cdef int rows = mat.indices.size() 148 | cdef int no_collocations = matrix_nnz(mat) 149 | 150 | # Create the constituent numpy arrays. 151 | row_np = np.empty(no_collocations, dtype=np.int32) 152 | col_np = np.empty(no_collocations, dtype=np.int32) 153 | data_np = np.empty(no_collocations, dtype=np.float64) 154 | cdef int[:] row_view = row_np 155 | cdef int[:] col_view = col_np 156 | cdef double[:] data_view = data_np 157 | 158 | j = 0 159 | 160 | for row in range(rows): 161 | for i in range(deref(mat.indices)[row].size()): 162 | 163 | row_view[j] = row 164 | col_view[j] = deref(mat.indices)[row][i] 165 | data_view[j] = deref(mat.data)[row][i] 166 | 167 | j += 1 168 | 169 | # Create and return the matrix. 170 | return sp.coo_matrix((data_np, (row_np, col_np)), 171 | shape=(shape, 172 | shape), 173 | dtype=np.float64) 174 | 175 | 176 | cdef int words_to_ids(list words, vector[int]& word_ids, 177 | dictionary, int supplied, int ignore_missing): 178 | """ 179 | Convert a list of words into a vector of word ids, using either 180 | the supplied dictionary or by constructing a new one. 181 | 182 | If the dictionary was supplied, a word is missing from it, 183 | and we are not ignoring out-of-vocabulary (OOV) words, an 184 | error value of -1 is returned. 185 | 186 | If we have an OOV word and we do want to ignore them, we use 187 | a -1 placeholder for it in the word_ids vector to preserve 188 | correct context windows (otherwise words that are far apart 189 | with the full vocabulary could become close together with a 190 | filtered vocabulary).
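
    For example, with the supplied dictionary {'a': 0, 'b': 1} and
    ignore_missing set, ['a', 'oov', 'b'] maps to [0, -1, 1], so 'a'
    and 'b' stay two positions apart in the context window.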
191 | """ 192 | 193 | cdef int word_id 194 | 195 | word_ids.resize(0) 196 | 197 | if supplied == 1: 198 | for word in words: 199 | # Raise an error if the word 200 | # is missing from the supplied 201 | # dictionary. 202 | word_id = dictionary.get(word, -1) 203 | if word_id == -1 and ignore_missing == 0: 204 | return -1 205 | 206 | word_ids.push_back(word_id) 207 | 208 | else: 209 | for word in words: 210 | word_id = dictionary.setdefault(word, 211 | len(dictionary)) 212 | word_ids.push_back(word_id) 213 | 214 | return 0 215 | 216 | 217 | def construct_cooccurrence_matrix(corpus, dictionary, int supplied, 218 | int window_size, int ignore_missing): 219 | """ 220 | Construct the word-id dictionary and cooccurrence matrix for 221 | a given corpus, using a given window size. 222 | 223 | Returns the dictionary and a scipy.sparse COO cooccurrence matrix. 224 | """ 225 | 226 | # Declare the cooccurrence map 227 | cdef SparseRowMatrix* matrix = new_matrix() 228 | 229 | # String processing variables. 230 | cdef list words 231 | cdef int i, j, outer_word, inner_word 232 | cdef int wordslen, window_stop, error 233 | cdef vector[int] word_ids 234 | 235 | # Pre-allocate some reasonable size 236 | # for the word ids vector. 237 | word_ids.reserve(1000) 238 | 239 | # Iterate over the corpus. 240 | for words in corpus: 241 | 242 | # Convert words to a numeric vector. 243 | error = words_to_ids(words, word_ids, dictionary, 244 | supplied, ignore_missing) 245 | if error == -1: 246 | raise KeyError('Word missing from dictionary') 247 | 248 | wordslen = word_ids.size() 249 | 250 | # Record co-occurrences in a moving window. 251 | for i in range(wordslen): 252 | outer_word = word_ids[i] 253 | 254 | # Continue if we have an OOD token. 255 | if outer_word == -1: 256 | continue 257 | 258 | window_stop = int_min(i + window_size + 1, wordslen) 259 | 260 | for j in range(i, window_stop): 261 | inner_word = word_ids[j] 262 | 263 | if inner_word == -1: 264 | continue 265 | 266 | # Do nothing if the words are the same. 267 | if inner_word == outer_word: 268 | continue 269 | 270 | if inner_word < outer_word: 271 | increment_matrix(matrix, 272 | inner_word, 273 | outer_word, 274 | 1.0 / (j - i)) 275 | else: 276 | increment_matrix(matrix, 277 | outer_word, 278 | inner_word, 279 | 1.0 / (j - i)) 280 | 281 | # Create the matrix. 282 | mat = matrix_to_coo(matrix, len(dictionary)) 283 | free_matrix(matrix) 284 | 285 | return mat 286 | -------------------------------------------------------------------------------- /glove/glove.py: -------------------------------------------------------------------------------- 1 | # GloVe model from the NLP lab at Stanford: 2 | # http://nlp.stanford.edu/projects/glove/. 3 | import array 4 | import collections 5 | import io 6 | try: 7 | # Python 2 compat 8 | import cPickle as pickle 9 | except ImportError: 10 | import pickle 11 | 12 | import numpy as np 13 | import scipy.sparse as sp 14 | import numbers 15 | 16 | from .glove_cython import fit_vectors, transform_paragraph 17 | 18 | 19 | def check_random_state(seed): 20 | """ Turn seed into a np.random.RandomState instance. 21 | 22 | This is a copy of the check_random_state function in sklearn 23 | in order to avoid outside dependencies. 
24 | """ 25 | if seed is None or seed is np.random: 26 | return np.random.mtrand._rand 27 | if isinstance(seed, (numbers.Integral, np.integer)): 28 | return np.random.RandomState(seed) 29 | if isinstance(seed, np.random.RandomState): 30 | return seed 31 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 32 | ' instance' % seed) 33 | 34 | 35 | class Glove(object): 36 | """ 37 | Class for estimating GloVe word embeddings using the 38 | corpus coocurrence matrix. 39 | """ 40 | 41 | def __init__(self, no_components=30, learning_rate=0.05, 42 | alpha=0.75, max_count=100, max_loss=10.0, 43 | random_state=None): 44 | """ 45 | Parameters: 46 | - int no_components: number of latent dimensions 47 | - float learning_rate: learning rate for SGD estimation. 48 | - float alpha, float max_count: parameters for the 49 | weighting function (see the paper). 50 | - float max_loss: the maximum absolute value of calculated 51 | gradient for any single co-occurrence pair. 52 | Only try setting to a lower value if you 53 | are experiencing problems with numerical 54 | stability. 55 | - random_state: random statue used to intialize optimization 56 | """ 57 | 58 | self.no_components = no_components 59 | self.learning_rate = float(learning_rate) 60 | self.alpha = float(alpha) 61 | self.max_count = float(max_count) 62 | self.max_loss = max_loss 63 | 64 | self.word_vectors = None 65 | self.word_biases = None 66 | 67 | self.vectors_sum_gradients = None 68 | self.biases_sum_gradients = None 69 | 70 | self.dictionary = None 71 | self.inverse_dictionary = None 72 | 73 | self.random_state = random_state 74 | 75 | def fit(self, matrix, epochs=5, no_threads=2, verbose=False): 76 | """ 77 | Estimate the word embeddings. 78 | 79 | Parameters: 80 | - scipy.sparse.coo_matrix matrix: coocurrence matrix 81 | - int epochs: number of training epochs 82 | - int no_threads: number of training threads 83 | - bool verbose: print progress messages if True 84 | """ 85 | 86 | shape = matrix.shape 87 | 88 | if (len(shape) != 2 or 89 | shape[0] != shape[1]): 90 | raise Exception('Coocurrence matrix must be square') 91 | 92 | if not sp.isspmatrix_coo(matrix): 93 | raise Exception('Coocurrence matrix must be in the COO format') 94 | 95 | random_state = check_random_state(self.random_state) 96 | self.word_vectors = ((random_state.rand(shape[0], 97 | self.no_components) - 0.5) 98 | / self.no_components) 99 | self.word_biases = np.zeros(shape[0], 100 | dtype=np.float64) 101 | 102 | self.vectors_sum_gradients = np.ones_like(self.word_vectors) 103 | self.biases_sum_gradients = np.ones_like(self.word_biases) 104 | 105 | shuffle_indices = np.arange(matrix.nnz, dtype=np.int32) 106 | 107 | if verbose: 108 | print('Performing %s training epochs ' 109 | 'with %s threads' % (epochs, no_threads)) 110 | 111 | for epoch in range(epochs): 112 | 113 | if verbose: 114 | print('Epoch %s' % epoch) 115 | 116 | # Shuffle the coocurrence matrix 117 | random_state.shuffle(shuffle_indices) 118 | 119 | fit_vectors(self.word_vectors, 120 | self.vectors_sum_gradients, 121 | self.word_biases, 122 | self.biases_sum_gradients, 123 | matrix.row, 124 | matrix.col, 125 | matrix.data, 126 | shuffle_indices, 127 | self.learning_rate, 128 | self.max_count, 129 | self.alpha, 130 | self.max_loss, 131 | int(no_threads)) 132 | 133 | if not np.isfinite(self.word_vectors).all(): 134 | raise Exception('Non-finite values in word vectors. 
135 | 'Try reducing the learning rate or the ' 136 | 'max_loss parameter.') 137 | 138 | def transform_paragraph(self, paragraph, epochs=50, ignore_missing=False): 139 | """ 140 | Transform an iterable of tokens into its vector representation 141 | (a paragraph vector). 142 | 143 | Experimental. This will return something close to a tf-idf 144 | weighted average of constituent token vectors by fitting 145 | rare words (with low word bias values) more closely. 146 | """ 147 | 148 | if self.word_vectors is None: 149 | raise Exception('Model must be fit to transform paragraphs') 150 | 151 | if self.dictionary is None: 152 | raise Exception('Dictionary must be provided to ' 153 | 'transform paragraphs') 154 | 155 | cooccurrence = collections.defaultdict(lambda: 0.0) 156 | 157 | for token in paragraph: 158 | try: 159 | cooccurrence[self.dictionary[token]] += self.max_count / 10.0 160 | except KeyError: 161 | if not ignore_missing: 162 | raise 163 | 164 | random_state = check_random_state(self.random_state) 165 | 166 | word_ids = np.array(list(cooccurrence.keys()), dtype=np.int32)  # list() for Python 3 dict views 167 | values = np.array(list(cooccurrence.values()), dtype=np.float64) 168 | shuffle_indices = np.arange(len(word_ids), dtype=np.int32) 169 | 170 | # Initialize the vector to mean of constituent word vectors 171 | paragraph_vector = np.mean(self.word_vectors[word_ids], axis=0) 172 | sum_gradients = np.ones_like(paragraph_vector) 173 | 174 | # Shuffle the cooccurrence matrix 175 | random_state.shuffle(shuffle_indices) 176 | transform_paragraph(self.word_vectors, 177 | self.word_biases, 178 | paragraph_vector, 179 | sum_gradients, 180 | word_ids, 181 | values, 182 | shuffle_indices, 183 | self.learning_rate, 184 | self.max_count, 185 | self.alpha, 186 | epochs) 187 | 188 | return paragraph_vector 189 | 190 | def add_dictionary(self, dictionary): 191 | """ 192 | Supply a word-id dictionary to allow similarity queries. 193 | """ 194 | 195 | if self.word_vectors is None: 196 | raise Exception('Model must be fit before adding a dictionary') 197 | 198 | if len(dictionary) > self.word_vectors.shape[0]: 199 | raise Exception('Dictionary length must be smaller than ' 200 | 'or equal to the number of word vectors') 201 | 202 | self.dictionary = dictionary 203 | if hasattr(self.dictionary, 'iteritems'): 204 | # Python 2 compat 205 | items_iterator = self.dictionary.iteritems() 206 | else: 207 | items_iterator = self.dictionary.items() 208 | 209 | self.inverse_dictionary = {v: k for k, v in items_iterator} 210 | 211 | def save(self, filename): 212 | """ 213 | Serialize model to filename. 214 | """ 215 | 216 | with open(filename, 'wb') as savefile: 217 | pickle.dump(self.__dict__, 218 | savefile, 219 | protocol=pickle.HIGHEST_PROTOCOL) 220 | 221 | @classmethod 222 | def load(cls, filename): 223 | """ 224 | Load model from filename. 225 | """ 226 | 227 | instance = Glove() 228 | 229 | with open(filename, 'rb') as savefile: 230 | instance.__dict__ = pickle.load(savefile) 231 | 232 | return instance 233 | 234 | @classmethod 235 | def load_stanford(cls, filename): 236 | """ 237 | Load model from the output files generated by 238 | the C code from http://nlp.stanford.edu/projects/glove/. 239 | 240 | The entries of the word dictionary will be of type 241 | unicode in Python 2 and str in Python 3. 242 | """ 243 | 244 | dct = {} 245 | vectors = array.array('d') 246 | 247 | # Read in the data.
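        # Each line of the Stanford output has the form
        # "word v_1 v_2 ... v_d", separated by single spaces.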
248 | with io.open(filename, 'r', encoding='utf-8') as savefile: 249 | for i, line in enumerate(savefile): 250 | tokens = line.split(' ') 251 | 252 | word = tokens[0] 253 | entries = tokens[1:] 254 | 255 | dct[word] = i 256 | vectors.extend(float(x) for x in entries) 257 | 258 | # Infer word vectors dimensions. 259 | no_components = len(entries) 260 | no_vectors = len(dct) 261 | 262 | # Set up the model instance. 263 | instance = Glove() 264 | instance.no_components = no_components 265 | instance.word_vectors = (np.array(vectors) 266 | .reshape(no_vectors, 267 | no_components)) 268 | instance.word_biases = np.zeros(no_vectors) 269 | instance.add_dictionary(dct) 270 | 271 | return instance 272 | 273 | def _similarity_query(self, word_vec, number): 274 | 275 | dst = (np.dot(self.word_vectors, word_vec) 276 | / np.linalg.norm(self.word_vectors, axis=1) 277 | / np.linalg.norm(word_vec)) 278 | word_ids = np.argsort(-dst) 279 | 280 | return [(self.inverse_dictionary[x], dst[x]) for x in word_ids[:number] 281 | if x in self.inverse_dictionary] 282 | 283 | def most_similar(self, word, number=5): 284 | """ 285 | Run a similarity query, retrieving the 286 | `number` most similar words. 287 | """ 288 | 289 | if self.word_vectors is None: 290 | raise Exception('Model must be fit before querying') 291 | 292 | if self.dictionary is None: 293 | raise Exception('No word dictionary supplied') 294 | 295 | try: 296 | word_idx = self.dictionary[word] 297 | except KeyError: 298 | raise Exception('Word not in dictionary') 299 | 300 | return self._similarity_query(self.word_vectors[word_idx], number)[1:]  # drop the query word itself 301 | 302 | def most_similar_paragraph(self, paragraph, number=5, **kwargs): 303 | """ 304 | Return words most similar to a given paragraph (iterable of tokens). 305 | """ 306 | 307 | paragraph_vector = self.transform_paragraph(paragraph, **kwargs) 308 | 309 | return self._similarity_query(paragraph_vector, number) 310 | -------------------------------------------------------------------------------- /glove/glove_cython.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False, wraparound=False, cdivision=True, initializedcheck=False 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | import collections 7 | from cython.parallel import parallel, prange 8 | 9 | 10 | cdef inline double double_min(double a, double b) nogil: return a if a <= b else b 11 | cdef inline int int_min(int a, int b) nogil: return a if a <= b else b 12 | cdef inline int int_max(int a, int b) nogil: return a if a > b else b 13 | 14 | 15 | cdef extern from "math.h" nogil: 16 | double sqrt(double) 17 | double c_log "log"(double) 18 | 19 | 20 | def fit_vectors(double[:, ::1] wordvec, 21 | double[:, ::1] wordvec_sum_gradients, 22 | double[::1] wordbias, 23 | double[::1] wordbias_sum_gradients, 24 | int[::1] row, 25 | int[::1] col, 26 | double[::1] counts, 27 | int[::1] shuffle_indices, 28 | double initial_learning_rate, 29 | double max_count, 30 | double alpha, 31 | double max_loss, 32 | int no_threads): 33 | """ 34 | Estimate GloVe word embeddings given the cooccurrence matrix. 35 | Modifies the word vector and word bias array in-place. 36 | 37 | Training is performed via asynchronous stochastic gradient descent, 38 | using the AdaGrad per-coordinate learning rate. 39 | """ 40 |
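    # Each cooccurrence (a, b, X_ab) contributes the weighted
    # least-squares term f(X_ab) * (w_a.w_b + b_a + b_b - log X_ab)^2,
    # with f(x) = min(1, x / max_count)**alpha; the `loss` variable
    # below is the weighted residual shared by all gradient updates.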
41 | # Get number of latent dimensions and 42 | # number of cooccurrences. 43 | cdef int dim = wordvec.shape[1] 44 | cdef int no_cooccurrences = row.shape[0] 45 | 46 | # Hold indices of current words and 47 | # the cooccurrence count. 48 | cdef int word_a, word_b 49 | cdef double count, learning_rate, gradient 50 | 51 | # Loss and gradient variables. 52 | cdef double prediction, entry_weight, loss 53 | 54 | # Iteration variables 55 | cdef int i, j, shuffle_index 56 | 57 | # We iterate over random indices to simulate 58 | # shuffling the cooccurrence matrix. 59 | with nogil: 60 | for j in prange(no_cooccurrences, num_threads=no_threads, 61 | schedule='dynamic'): 62 | shuffle_index = shuffle_indices[j] 63 | word_a = row[shuffle_index] 64 | word_b = col[shuffle_index] 65 | count = counts[shuffle_index] 66 | 67 | # Get prediction 68 | prediction = 0.0 69 | 70 | for i in range(dim): 71 | prediction = prediction + wordvec[word_a, i] * wordvec[word_b, i] 72 | 73 | prediction = prediction + wordbias[word_a] + wordbias[word_b] 74 | 75 | # Compute loss and the example weight. 76 | entry_weight = double_min(1.0, (count / max_count)) ** alpha 77 | loss = entry_weight * (prediction - c_log(count)) 78 | 79 | # Clip the loss for numerical stability. 80 | if loss < -max_loss: 81 | loss = -max_loss 82 | elif loss > max_loss: 83 | loss = max_loss 84 | 85 | # Update step: apply the AdaGrad 86 | # per-coordinate gradient updates. 87 | for i in range(dim): 88 | 89 | learning_rate = initial_learning_rate / sqrt(wordvec_sum_gradients[word_a, i]) 90 | gradient = loss * wordvec[word_b, i] 91 | wordvec[word_a, i] = (wordvec[word_a, i] - learning_rate 92 | * gradient) 93 | wordvec_sum_gradients[word_a, i] += gradient ** 2 94 | 95 | learning_rate = initial_learning_rate / sqrt(wordvec_sum_gradients[word_b, i]) 96 | gradient = loss * wordvec[word_a, i] 97 | wordvec[word_b, i] = (wordvec[word_b, i] - learning_rate 98 | * gradient) 99 | wordvec_sum_gradients[word_b, i] += gradient ** 2 100 | 101 | # Update word biases. 102 | learning_rate = initial_learning_rate / sqrt(wordbias_sum_gradients[word_a]) 103 | wordbias[word_a] -= learning_rate * loss 104 | wordbias_sum_gradients[word_a] += loss ** 2 105 | 106 | learning_rate = initial_learning_rate / sqrt(wordbias_sum_gradients[word_b]) 107 | wordbias[word_b] -= learning_rate * loss 108 | wordbias_sum_gradients[word_b] += loss ** 2 109 | 110 | 111 | def transform_paragraph(double[:, ::1] wordvec, 112 | double[::1] wordbias, 113 | double[::1] paragraphvec, 114 | double[::1] sum_gradients, 115 | int[::1] row, 116 | double[::1] counts, 117 | int[::1] shuffle_indices, 118 | double initial_learning_rate, 119 | double max_count, 120 | double alpha, 121 | int epochs): 122 | """ 123 | Compute a vector representation of a paragraph. This has 124 | the effect of making the paragraph vector close to words 125 | that occur in it. The representation should be more 126 | similar to words that occur in it multiple times, and 127 | less close to words that are common in the corpus (have 128 | large word bias values). 129 | 130 | This should be similar to a tf-idf weighting. 131 | """ 132 | 133 | # Get number of latent dimensions and 134 | # number of cooccurrences. 135 | cdef int dim = wordvec.shape[1] 136 | cdef int no_cooccurrences = row.shape[0] 137 | 138 | # Hold indices of current words and 139 | # the cooccurrence count. 140 | cdef int word_b, word_a 141 | cdef double count 142 | 143 | # Loss and gradient variables.
144 | cdef double prediction 145 | cdef double entry_weight 146 | cdef double loss 147 | cdef double gradient, learning_rate 148 | 149 | # Iteration variables 150 | cdef int epoch, i, j, shuffle_index 151 | 152 | # We iterate over random indices to simulate 153 | # shuffling the cooccurrence matrix. 154 | for epoch in range(epochs): 155 | for j in range(no_cooccurrences): 156 | shuffle_index = shuffle_indices[j] 157 | 158 | word_b = row[shuffle_index] 159 | count = counts[shuffle_index] 160 | 161 | # Get prediction 162 | prediction = 0.0 163 | for i in range(dim): 164 | prediction = prediction + paragraphvec[i] * wordvec[word_b, i] 165 | prediction += wordbias[word_b] 166 | 167 | # Compute loss and the example weight. 168 | entry_weight = double_min(1.0, (count / max_count)) ** alpha 169 | loss = entry_weight * (prediction - c_log(count)) 170 | 171 | # Update step: apply gradients. 172 | for i in range(dim): 173 | learning_rate = initial_learning_rate / sqrt(sum_gradients[i]) 174 | gradient = loss * wordvec[word_b, i] 175 | paragraphvec[i] = (paragraphvec[i] - learning_rate 176 | * gradient) 177 | sum_gradients[i] += gradient ** 2 178 | -------------------------------------------------------------------------------- /glove/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import (read_analogy_file, 2 | construct_analogy_test_set, 3 | analogy_rank_score) 4 | -------------------------------------------------------------------------------- /glove/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | try: 2 | from itertools import izip 3 | except ImportError: 4 | izip = zip 5 | import numpy as np 6 | 7 | from .accuracy_cython import compute_rank_violations 8 | 9 | 10 | def read_analogy_file(filename): 11 | """ 12 | Read the analogy task test set from a file. 13 | """ 14 | 15 | section = None 16 | 17 | with open(filename, 'r') as questions_file: 18 | for line in questions_file: 19 | if line.startswith(':'): 20 | section = line[2:].replace('\n', '') 21 | continue 22 | else: 23 | words = line.replace('\n', '').split(' ') 24 | 25 | yield section, words 26 | 27 | 28 | def construct_analogy_test_set(test_examples, dictionary, ignore_missing=False): 29 | """ 30 | Construct the analogy test set by mapping the words to their 31 | word vector ids. 32 | 33 | Arguments: 34 | - test_examples: iterable of 4-word iterables 35 | - dictionary: a mapping from words to ids 36 | - boolean ignore_missing: if True, words in the test set 37 | that are not in the dictionary 38 | will be dropped. 39 | 40 | Returns: 41 | - an N by 4 numpy matrix. 42 | """ 43 | 44 | test = [] 45 | 46 | for example in test_examples: 47 | try: 48 | test.append([dictionary[word] for word in example]) 49 | except KeyError: 50 | if ignore_missing: 51 | pass 52 | else: 53 | raise 54 | 55 | try: 56 | test = np.array(test, dtype=np.int32) 57 | except ValueError as e: 58 | # This should use raise ... from ... in Python 3. 59 | raise ValueError('Each row of the test set should contain ' 60 | '4 integer word ids', e) 61 | 62 | return test 63 | 64 | 65 | def analogy_rank_score(analogies, word_vectors, no_threads=1): 66 | """ 67 | Calculate the analogy rank score for the given set of analogies. 68 | 69 | A rank of zero denotes a perfect score; with random word vectors 70 | we would expect a rank of 0.5.
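
    For example, with a 10,000-word vocabulary, a normalized rank of
    0.01 means that roughly 100 words scored at least as highly as
    the expected answer.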
71 | 72 | Arguments: 73 | - analogies: a numpy array holding the ids of the words in the analogy tasks, 74 | as constructed by `construct_analogy_test_set`. 75 | - word_vectors: numpy array holding the word vectors to use. 76 | - no_threads: number of parallel threads to use in the calculation. 77 | 78 | Returns: 79 | - ranks: a numpy array holding the normalized rank of the target word 80 | in each analogy task. Rank 0 means that the target word was 81 | returned first; rank 1 means it was returned last. 82 | """ 83 | 84 | # Combine the vectors of the 85 | # second and third words with the 86 | # negative of the first word. 87 | input_vectors = (word_vectors[analogies[:, 1]] 88 | + word_vectors[analogies[:, 2]] 89 | - word_vectors[analogies[:, 0]]) 90 | 91 | word_vector_norms = np.linalg.norm(word_vectors, 92 | axis=1) 93 | 94 | # Pre-allocate the array storing the rank violations 95 | rank_violations = np.zeros(input_vectors.shape[0], dtype=np.int32) 96 | 97 | compute_rank_violations(word_vectors, 98 | word_vector_norms, 99 | input_vectors, 100 | analogies[:, 3], 101 | analogies, 102 | rank_violations, 103 | no_threads) 104 | 105 | return rank_violations / float(word_vectors.shape[0]) 106 | -------------------------------------------------------------------------------- /glove/metrics/accuracy_cython.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: boundscheck=False, wraparound=False, cdivision=True, initializedcheck=False 3 | 4 | from cython.parallel import prange 5 | 6 | 7 | cdef double dot(double[::1] x, 8 | double[::1] y, 9 | int dim) nogil: 10 | 11 | cdef int i 12 | cdef double result = 0.0 13 | 14 | for i in range(dim): 15 | result += x[i] * y[i] 16 | 17 | return result 18 | 19 | 20 | def compute_rank_violations(double[:, ::1] wordvec, 21 | double[::1] wordvec_norm, 22 | double[:, ::1] input, 23 | int[:] expected, 24 | int[:, ::1] inputs, 25 | int[::1] rank_violations, 26 | int no_threads): 27 | """ 28 | Compute the rank violations 29 | of the expected words in the word analogy task. 30 | """ 31 | 32 | cdef int i, j, k, no_input_vectors, no_wordvec, skip_word 33 | cdef int no_components, violations 34 | 35 | cdef double score_of_expected, score 36 | 37 | no_input_vectors = input.shape[0] 38 | no_wordvec = wordvec.shape[0] 39 | no_components = wordvec.shape[1] 40 | 41 | with nogil: 42 | for i in prange(no_input_vectors, num_threads=no_threads, 43 | schedule='dynamic'): 44 | 45 | # Compute the score of the expected word. 46 | score_of_expected = (dot(input[i], 47 | wordvec[expected[i]], 48 | no_components) 49 | / wordvec_norm[expected[i]]) 50 | 51 | # Compute all other scores and count 52 | # rank violations. 53 | violations = 0 54 | 55 | for j in range(no_wordvec): 56 | 57 | # Words from the input do not 58 | # count as violations. 59 | skip_word = 0 60 | for k in range(4): 61 | if inputs[i, k] == j: 62 | skip_word = 1 63 | break 64 | 65 | if skip_word == 1: 66 | continue 67 | 68 | score = (dot(input[i], 69 | wordvec[j], 70 | no_components) 71 | / wordvec_norm[j]) 72 | 73 | if score >= score_of_expected: 74 | violations = violations + 1 75 | 76 | # Record the number of rank 77 | # violations for this example.
78 | rank_violations[i] = violations 79 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # glove-python 2 | 3 | [![Circle CI](https://circleci.com/gh/maciejkula/glove-python.svg?style=svg)](https://circleci.com/gh/maciejkula/glove-python) 4 | 5 | A toy python implementation of [GloVe](http://www-nlp.stanford.edu/projects/glove/). 6 | 7 | GloVe produces dense vector embeddings of words, where words that occur together are close in the resulting vector space. 8 | 9 | While this produces embeddings which are similar to [word2vec](https://code.google.com/p/word2vec/) (which has a great python implementation in [gensim](http://radimrehurek.com/gensim/models/word2vec.html)), the method is different: GloVe produces embeddings by factorizing the logarithm of the corpus word co-occurrence matrix. 10 | 11 | The code uses asynchronous stochastic gradient descent, and is implemented in Cython. Most likely, it contains a tremendous amount of bugs. 12 | 13 | ## Installation 14 | Install from pypi using pip: `pip install glove_python`. 15 | 16 | Note for OSX users: due to its use of OpenMP, glove-python does not compile under Clang. To install it, you will need a reasonably recent version of `gcc` (from Homebrew for instance). This should be picked up by `setup.py`; if it is not, please open an issue. 17 | 18 | Building with the default Python distribution included in OSX is also not supported; please try the version from Homebrew or Anaconda. 19 | 20 | ## Usage 21 | Producing the embeddings is a two-step process: creating a co-occurrence matrix from the corpus, and then using it to produce the embeddings. The `Corpus` class helps construct a corpus from an iterable of tokens; the `Glove` class trains the embeddings (with a sklearn-esque API). 22 | 23 | There is also support for rudimentary paragraph vectors. A paragraph vector (in this case) is an embedding of a paragraph (a multi-word piece of text) in the word vector space in such a way that the paragraph representation is close to the words it contains, adjusted for the frequency of words in the corpus (in a manner similar to tf-idf weighting). These can be obtained after having trained word embeddings by calling the `transform_paragraph` method on the trained model. 24 | 25 | ## Examples 26 | `example.py` has some example code for running simple training scripts: `ipython -i -- examples/example.py -c my_corpus.txt -t 10` should process your corpus, run 10 training epochs of GloVe, and drop you into an `ipython` shell where `glove.most_similar('physics')` should produce a list of similar words. 27 | 28 | If you want to process a wikipedia corpus, you can pass a file from [here](http://dumps.wikimedia.org/enwiki/latest/) into the `example.py` script using the `-w` flag. Running `make all-wiki` should download a small wikipedia dump file, process it, and train the embeddings. Building the cooccurrence matrix will take some time; training the vectors can be sped up by increasing the training parallelism to match the number of physical CPU cores available.
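
For reference, the core API that `example.py` wraps comes down to a few calls. A minimal sketch (the toy `texts` corpus here is made up):

```python
from glove import Corpus, Glove

# Any iterable of token lists works as a corpus.
texts = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]

# Step 1: build the dictionary and the cooccurrence matrix.
corpus = Corpus()
corpus.fit(texts, window=10)

# Step 2: factorize the matrix to obtain the embeddings.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)  # enables similarity queries

print(glove.most_similar('cat', number=5))
```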
29 | 30 | Running the wiki example on my machine yields roughly the following results: 31 | 32 | ``` 33 | In [1]: glove.most_similar('physics') 34 | Out[1]: 35 | [('biology', 0.89425889335342257), 36 | ('chemistry', 0.88913708236100086), 37 | ('quantum', 0.88859617025616333), 38 | ('mechanics', 0.88821824562025431)] 39 | 40 | In [4]: glove.most_similar('north') 41 | Out[4]: 42 | [('west', 0.99047203572917908), 43 | ('south', 0.98655786905501008), 44 | ('east', 0.97914140138065575), 45 | ('coast', 0.97680427897282185)] 46 | 47 | In [6]: glove.most_similar('queen') 48 | Out[6]: 49 | [('anne', 0.88284931171714842), 50 | ('mary', 0.87615260138308615), 51 | ('elizabeth', 0.87362497374226267), 52 | ('prince', 0.87011034923161801)] 53 | 54 | In [19]: glove.most_similar('car') 55 | Out[19]: 56 | [('race', 0.89549347066796814), 57 | ('driver', 0.89350343749207217), 58 | ('cars', 0.83601334715106568), 59 | ('racing', 0.83157724991920212)] 60 | ``` 61 | 62 | ## Development 63 | Pull requests are welcome. 64 | 65 | When making changes to the `.pyx` extension files, you'll need to run `python setup.py cythonize` in order to produce the extension `.c` and `.cpp` files before running `pip install -e .`. 66 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = readme.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import platform 4 | import subprocess 5 | import sys 6 | 7 | from setuptools import Command, Extension, setup, find_packages 8 | from setuptools.command.test import test as TestCommand 9 | 10 | 11 | def define_extensions(cythonize=False): 12 | 13 | compile_args = ['-fopenmp', 14 | '-ffast-math'] 15 | 16 | # There are problems with illegal ASM instructions 17 | # when using the Anaconda distribution (at least on OSX). 18 | # This could be because Anaconda uses its own assembler? 19 | # To work around this we do not add -march=native if we 20 | # know we're dealing with Anaconda. 21 | if 'anaconda' not in sys.version.lower(): 22 | compile_args.append('-march=native') 23 | 24 | if cythonize: 25 | glove_cython = "glove/glove_cython.pyx" 26 | glove_metrics = "glove/metrics/accuracy_cython.pyx" 27 | glove_corpus = "glove/corpus_cython.pyx" 28 | else: 29 | glove_cython = "glove/glove_cython.c" 30 | glove_metrics = "glove/metrics/accuracy_cython.c" 31 | glove_corpus = "glove/corpus_cython.cpp" 32 | 33 | return [Extension("glove.glove_cython", [glove_cython], 34 | extra_link_args=["-fopenmp"], 35 | extra_compile_args=compile_args), 36 | Extension("glove.metrics.accuracy_cython", 37 | [glove_metrics], 38 | extra_link_args=["-fopenmp"], 39 | extra_compile_args=compile_args), 40 | Extension("glove.corpus_cython", [glove_corpus], 41 | language='C++', 42 | libraries=["stdc++"], 43 | extra_link_args=compile_args, 44 | extra_compile_args=compile_args)] 45 | 46 | 47 | def set_gcc(): 48 | """ 49 | Try to find and use GCC on OSX for OpenMP support.
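
    Scans common MacPorts and Homebrew gcc install paths and points
    the CC environment variable at the newest binary found.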
50 | """ 51 | 52 | # For macports and homebrew 53 | patterns = ['/opt/local/bin/gcc-mp-[0-9].[0-9]', 54 | '/opt/local/bin/gcc-mp-[0-9]', 55 | '/usr/local/bin/gcc-[0-9].[0-9]', 56 | '/usr/local/bin/gcc-[0-9]'] 57 | 58 | if 'darwin' in platform.platform().lower(): 59 | 60 | gcc_binaries = [] 61 | for pattern in patterns: 62 | gcc_binaries += glob.glob(pattern) 63 | gcc_binaries.sort() 64 | 65 | if gcc_binaries: 66 | _, gcc = os.path.split(gcc_binaries[-1]) 67 | os.environ["CC"] = gcc 68 | 69 | else: 70 | raise Exception('No GCC available. Install gcc from Homebrew ' 71 | 'using brew install gcc.') 72 | 73 | 74 | class Cythonize(Command): 75 | """ 76 | Compile the extension .pyx files. 77 | """ 78 | 79 | user_options = [] 80 | 81 | def initialize_options(self): 82 | pass 83 | 84 | def finalize_options(self): 85 | pass 86 | 87 | def run(self): 88 | 89 | import Cython 90 | from Cython.Build import cythonize 91 | 92 | cythonize(define_extensions(cythonize=True)) 93 | 94 | 95 | class Clean(Command): 96 | """ 97 | Clean build files. 98 | """ 99 | 100 | user_options = [] 101 | 102 | def initialize_options(self): 103 | pass 104 | 105 | def finalize_options(self): 106 | pass 107 | 108 | def run(self): 109 | 110 | pth = os.path.dirname(os.path.abspath(__file__)) 111 | 112 | subprocess.call(['rm', '-rf', os.path.join(pth, 'build')]) 113 | subprocess.call(['rm', '-rf', os.path.join(pth, '*.egg-info')]) 114 | subprocess.call(['find', pth, '-name', '*.pyc', '-type', 'f', '-delete']) 115 | subprocess.call(['rm', os.path.join(pth, 'glove', 'corpus_cython.so')]) 116 | subprocess.call(['rm', os.path.join(pth, 'glove', 'glove_cython.so')]) 117 | 118 | 119 | class PyTest(TestCommand): 120 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 121 | 122 | def initialize_options(self): 123 | TestCommand.initialize_options(self) 124 | self.pytest_args = ['tests/'] 125 | 126 | def finalize_options(self): 127 | TestCommand.finalize_options(self) 128 | self.test_args = [] 129 | self.test_suite = True 130 | 131 | def run_tests(self): 132 | # import here, cause outside the eggs aren't loaded 133 | import pytest 134 | errno = pytest.main(self.pytest_args) 135 | sys.exit(errno) 136 | 137 | 138 | setup( 139 | name='glove_python', 140 | version='0.1.0', 141 | description=('Python implementation of Global Vectors ' 142 | 'for Word Representation (GloVe)'), 143 | long_description='', 144 | packages=find_packages(), 145 | install_requires=['numpy', 'scipy'], 146 | tests_require=['pytest'], 147 | cmdclass={'test': PyTest, 'cythonize': Cythonize, 'clean': Clean}, 148 | author='Maciej Kula', 149 | url='https://github.com/maciejkula/glove-python', 150 | download_url='https://github.com/maciejkula/glove-python/tarball/0.1.0', 151 | license='Apache 2.0', 152 | classifiers=['Development Status :: 3 - Alpha', 153 | 'License :: OSI Approved :: Apache Software License', 154 | 'Topic :: Scientific/Engineering :: Artificial Intelligence'], 155 | ext_modules=define_extensions() 156 | ) 157 | -------------------------------------------------------------------------------- /tests/stanford_test.txt: -------------------------------------------------------------------------------- 1 | 0.62415 0.62476 -0.082335 0.20101 -0.13741 -0.11431 0.77909 2.6356 -0.46351 0.57465 -0.024888 -0.015466 -2.9696 -0.49876 0.095034 -0.94879 -0.017336 -0.86349 -1.3348 0.046811 0.36999 -0.57663 -0.48469 0.40078 0.75345 2 | . 
0.69586 -1.1469 -0.41797 -0.022311 -0.023801 0.82358 1.2228 1.741 -0.90979 1.3725 0.1153 -0.63906 -3.2252 0.61269 0.33544 -0.57058 -0.50861 -0.16575 -0.98153 -0.8213 0.24333 -0.14482 -0.67877 0.7061 0.40833 3 | : 1.1242 0.054519 -0.037362 0.10046 0.11923 -0.30009 1.0938 2.537 -0.072802 1.0491 1.0931 0.066084 -2.7036 -0.14391 -0.22031 -0.99347 -0.65072 -0.030948 -1.0817 -0.64701 0.32341 -0.41612 -0.5268 -0.047166 0.71549 4 | rt 0.74056 0.9155 -0.16352 0.35843 0.05266 0.1456 1.0421 2.8073 0.12865 1.0492 0.13033 0.20508 -2.6686 -0.50551 -0.29574 -0.91433 -0.40456 -1.0988 -1.0333 -0.17875 0.37979 -0.25922 -0.74854 0.36001 0.61206 5 | , 0.84705 -1.0349 -0.050419 0.27164 -0.58659 0.99514 0.25267 1.6963 0.10313 0.80073 0.74655 -1.2667 -4.036 -0.22557 0.16322 -0.67015 -0.64812 0.010373 -0.71889 -0.74997 0.24862 0.10319 -1.1732 0.58196 0.33846 6 | 0.67867 -0.74651 -0.31831 -0.093681 0.062057 0.77956 1.5604 2.0332 -0.95379 1.2358 -0.081705 -0.42269 -2.5273 0.51772 0.29574 -0.76079 -0.57992 -0.51783 -1.1715 -0.53952 0.36752 -0.2758 -0.086496 1.0115 0.56436 7 | 0.18227 -0.29194 -1.3632 -1.201 0.084332 0.018943 1.3408 2.3866 -1.2761 0.39897 -0.16731 -0.52372 -3.5758 -0.25648 -0.5531 -0.62011 -0.71249 -0.12025 -0.91766 0.65635 -0.55258 -1.1655 0.10899 -1.6099 1.6189 8 | 1.3956 0.2892 0.48572 -1.1412 0.21461 1.0714 0.25408 2.1181 0.30252 0.75955 1.1299 -0.021373 -3.7757 0.89387 -0.71476 -1.6997 -0.42166 -0.12601 -1.2984 0.41689 -0.84993 -1.5199 0.40681 0.15024 0.26997 9 | 0.80384 -1.0366 -0.53877 -1.0806 0.84718 -0.36196 1.0065 1.3067 -0.61225 0.30781 0.46974 -0.23264 -3.3882 -0.46778 -0.55105 -1.6926 -0.78708 0.28378 -0.73638 0.10216 -0.18703 -2.133 -0.17787 -0.97788 1.394 10 | ! 0.4049 -0.87651 -0.23362 -0.34844 -0.097002 0.40895 1.6928 1.7058 -1.293 0.70091 -0.12498 -0.75998 -3.1586 0.14081 0.57255 -0.46097 -0.75721 -0.72414 -1.4071 -0.17224 0.0099324 -0.45711 0.074886 1.2035 1.1614 11 | i -0.26079 0.59108 0.61622 -0.70368 -0.85159 -0.23238 1.0481 0.066642 -0.54907 0.70047 -0.87221 -0.013954 -5.9671 -0.43106 -0.9154 0.53744 0.57099 -0.27181 -0.84178 -0.59682 0.4516 0.34097 0.076869 0.2284 0.2758 12 | a 0.21294 0.31035 0.17694 0.87498 0.067926 0.59171 -0.098218 1.5896 -0.428 -1.3655 -0.15278 -2.501 -5.5652 -0.10232 0.39577 0.1555 -0.55181 0.34671 -0.57379 -0.30717 0.043623 -0.39707 0.64551 -0.33537 0.020467 13 | " 1.0822 -0.59378 -0.19992 0.66626 0.18051 0.014404 1.4227 2.3584 -0.2701 1.4194 0.61099 -0.29541 -2.8885 -0.070205 -0.038122 -0.50855 -0.4445 0.076176 -0.96879 -0.57778 0.39206 0.20976 -0.73835 0.031611 0.72533 14 | the -0.010167 0.020194 0.21473 0.17289 -0.43659 -0.14687 1.8429 -0.15753 0.18187 -0.31782 0.06839 0.51776 -6.3371 0.48066 0.13777 -0.48568 0.39 -0.0019506 -0.10218 0.21262 -0.86146 0.17263 0.18783 -0.8425 -0.31208 15 | ? 
1.104 -0.34629 0.088792 -0.2554 -0.023462 0.51487 0.7491 1.7858 0.16928 0.93679 0.010994 -0.98983 -3.7061 -0.82598 0.90447 -0.41301 -0.617 -0.62424 -1.1698 -0.022587 0.26791 -0.076523 -1.1142 1.336 0.20145 16 | you -0.41586 0.32548 -0.087621 0.2018 -0.80017 -0.34418 2.1431 0.37188 -0.9409 0.24283 -0.86396 0.63858 -6.0171 -0.54081 -0.43305 0.095707 0.37971 -1.1432 0.11382 -0.38361 0.41758 0.081476 -0.02659 0.75438 -0.77178 17 | to 0.28228 0.019558 0.11509 -0.39242 -1.0503 -0.54278 1.1357 -0.34251 0.80636 -0.47359 -0.77194 -0.73689 -6.2619 -0.34902 -0.35532 -0.60148 -0.054534 -0.67057 -0.39972 -1.324 -0.43765 0.30045 0.2143 0.25422 -0.26674 18 | ( 0.026645 -0.15996 -0.13042 0.32999 0.24416 0.41042 1.3001 2.6126 0.70933 0.91401 0.21455 0.2219 -2.6304 -0.11566 -0.32597 -2.167 -1.0084 0.43317 -0.85766 -0.20587 -0.037961 -1.5767 0.15105 0.24585 1.1149 19 | 0.82488 -0.3125 -1.2156 -1.0703 -0.26568 -0.2475 2.1968 2.179 -0.37712 1.3096 0.51299 0.68645 -2.0813 -0.052276 -0.4715 -1.7417 -1.3162 -0.32637 -0.78276 0.50433 0.078971 -2.1496 0.63889 -0.57727 1.4871 20 | 0.23809 -0.09146 0.15923 -0.018792 0.12084 0.68245 1.3484 2.7759 -0.78706 0.85131 -0.95748 -0.34804 -2.0028 -0.51581 0.15512 -1.2631 -0.48455 -1.1553 -1.7698 0.39001 0.76965 0.24155 0.84985 2.0607 0.85529 21 | ) 0.34127 -0.43348 -0.35918 -0.15297 -0.078167 0.064745 1.3919 2.2717 0.53841 1.2455 0.65984 0.549 -2.6548 0.00082321 -0.38957 -1.952 -1.1767 0.2034 -0.91539 0.09037 0.16488 -1.8562 0.13423 0.57839 1.074 22 | me 0.58866 0.0060408 -0.22022 1.0119 -0.7583 0.12081 -0.025355 1.596 -1.521 -1.1867 -0.42468 -2.0128 -5.3977 -1.2343 0.17889 1.3491 -0.011538 -0.063358 -0.18676 -0.18863 0.81819 -0.33465 1.6392 1.4183 0.16919 23 | de 1.423 -0.46838 -0.16331 1.2443 1.0157 1.1604 -2.0031 2.5195 -0.47779 -1.8382 0.32809 -3.2301 -3.0671 -0.2536 0.87798 0.3083 -0.88685 1.2904 -1.3443 1.1462 -0.026837 -0.449 -0.006978 -0.73663 1.6404 24 | 0.15671 -0.024377 -0.04252 -0.22052 -0.21045 -0.15969 0.70284 2.2709 -0.91873 1.5789 -0.25527 -0.63909 -2.944 -0.042341 0.12256 -1.0834 0.44036 -0.60795 -1.611 0.40592 -0.37838 -0.1601 -1.0792 1.8263 0.55963 25 | ! 
0.98004 0.38132 0.29754 -0.30478 0.40033 0.31853 1.8654 0.48166 0.56297 -0.82152 -1.2386 1.4854 -1.7059 0.093414 -2.4631 -1.365 -0.85534 0.20354 0.14737 2.1994 0.1779 -2.8695 2.1895 1.6762 2.3846 26 | que 1.8163 -0.9435 -0.6624 1.0099 0.031072 0.33463 -0.95627 2.9703 -0.54155 -2.4489 0.29555 -3.9631 -2.5559 -0.5695 1.0982 0.73903 -1.1868 0.5865 -0.45852 0.49212 0.87361 0.14368 0.64574 0.86255 0.4955 27 | and -0.81216 -0.28605 0.062502 -0.036869 -0.61118 -0.15568 1.625 -0.42602 0.1973 -0.19418 0.53267 0.64592 -6.1336 -0.3309 -0.0017279 -0.15173 0.20383 -0.77496 0.17629 -0.10884 -0.31234 0.2401 -0.36097 -0.049996 -0.7247 28 | 。 0.97257 1.2053 0.65594 0.7481 0.21479 0.030439 0.92262 1.9799 0.13767 -0.13729 -1.0628 1.475 -2.2513 0.92853 -2.5643 -1.6855 0.41263 1.0644 1.5803 1.2377 -0.13871 -3.2106 0.84125 0.18398 1.9644 29 | - 0.7717 -1.0602 -0.34383 -0.09264 0.031247 0.10274 1.1822 2.0774 0.20992 0.88188 0.65696 0.041836 -3.3736 -0.71065 0.041693 -1.535 -0.55627 0.64587 -0.7243 -0.399 -0.31172 -0.58834 -0.11027 -0.067876 1.0723 30 | my -0.74175 0.54942 0.6749 0.67924 0.13115 -0.2858 1.9227 0.11975 -0.62351 0.39304 -0.87884 0.39575 -5.9879 -0.49659 -0.26535 -0.04049 1.1247 -0.75211 -0.38015 0.49567 -0.53343 0.056762 0.69697 0.53384 -0.70807 31 | no 1.2722 0.22154 0.1395 0.50897 0.11663 0.10291 0.21448 2.2064 -0.5623 -1.0633 0.039293 -2.9371 -4.5097 -0.51896 1.0116 0.14003 -0.32955 0.076449 1.1712 -0.66266 0.53255 0.072198 1.184 0.95736 0.20746 32 | 、 0.85424 0.25535 0.80356 -0.043311 -0.062442 0.71904 0.91777 1.8196 0.93903 -0.041324 -0.27476 2.223 -1.9891 0.60383 -3.2757 -1.4099 1.3996 1.067 0.26134 1.3922 1.2872 -2.578 0.86285 0.53713 2.3683 33 | is -0.12532 -0.20207 -0.12672 -0.57474 -0.30313 -0.029884 1.1792 -0.1491 -0.71315 -0.12112 0.40652 1.4784 -5.995 -0.21617 0.47806 0.43448 0.13489 0.88961 -0.56926 0.33094 0.13661 0.65844 -0.41766 0.25164 -0.055809 34 | it 0.16758 0.21434 -0.093086 0.16379 -0.60001 -0.037103 1.8577 -0.24306 -0.44864 0.28734 -0.43609 1.0839 -6.0385 -0.14872 0.31843 0.08263 0.47562 -0.5009 -0.099384 -0.18034 -0.10614 0.15238 0.32532 0.73795 -0.40859 35 | … 0.84691 -0.27254 -0.46382 0.4686 0.77397 0.95429 0.566 2.0054 -0.79725 0.46677 -0.5743 0.55761 -2.5497 0.78614 -0.58253 -1.1329 -0.60002 0.10997 -0.66984 -0.43048 1.2045 -1.8733 0.27826 0.24504 0.83927 36 | in -0.32929 -0.16037 0.10785 -0.3961 -0.48827 -0.17528 0.23056 -0.49115 -0.065798 0.84382 0.38091 0.46377 -5.9545 0.57595 -0.18242 0.36494 -0.0042541 0.96687 -1.5674 -0.40454 -0.79557 -0.0050535 0.021972 -0.73638 0.65277 37 | n 0.53229 -0.30423 -0.6065 -0.15941 0.52165 -0.065076 1.3758 2.4098 -1.033 0.73698 -0.40591 -0.18263 -2.7087 0.28421 -1.8023 -1.4446 -1.4078 -0.20802 -0.94007 -0.10846 0.047255 -0.85601 0.94209 0.34083 0.66958 38 | for -0.21749 0.45183 -0.23211 -0.27781 -0.067977 -0.63951 1.1218 -0.37536 0.18676 -0.50864 0.016423 -0.13329 -5.903 0.14596 -0.067031 -0.66199 -0.17362 -0.87281 -0.49771 -0.55289 -1.0515 -0.18484 -0.30848 -0.04478 -0.24358 39 | / 0.26362 -0.5406 -0.30588 -0.42799 0.4699 0.95839 1.316 2.1394 0.33286 0.44204 0.82635 0.40852 -3.0257 -0.57578 -0.6385 -1.4769 -0.94778 0.17089 -0.98404 -0.36094 -0.32045 -0.99178 0.47383 0.49367 0.86134 40 | of 0.32543 -0.089637 -0.14733 0.4285 -0.092613 -0.17938 1.2835 -0.59714 -0.28134 -0.048954 0.54827 0.6941 -6.12 0.6724 0.018078 -0.24165 0.50342 0.65325 -0.20674 0.27639 -0.79097 0.10432 -0.6175 -0.54592 -0.069893 41 | la 0.14261 -0.2807 -0.1258 0.45119 0.17715 0.919 -1.0333 3.8694 0.41688 -0.67503 -0.020023 -3.0843 -2.9433 0.81947 1.6001 1.7433 
0.21815 0.88131 -0.97446 1.3757 1.0597 -1.1426 0.29684 -0.84936 -0.012225 42 | 's -0.21143 -0.16532 0.42022 -0.28705 -0.20637 -0.3565 1.3455 0.21057 -0.14089 -0.66701 0.54621 0.62353 -5.8742 -0.2406 0.19936 -0.61046 0.034438 0.23217 -0.66933 0.28861 0.2833 0.37067 -0.092633 -0.092978 -0.30293 43 | * -0.0010152 -0.39872 0.18076 0.65818 -0.27114 0.15688 1.7757 2.4463 0.10599 1.254 0.11974 -0.057964 -2.2856 -1.187 -0.99771 -0.43614 -0.71053 -0.26881 -1.1703 -0.23262 -0.16339 -0.48445 0.84881 1.7162 0.76949 44 | do 1.6477 0.12903 0.76911 -0.030854 0.27506 -0.49298 0.8206 -0.12559 0.57068 -0.79898 -1.6912 -1.7656 -5.3186 -1.2511 -0.73013 -0.90697 -1.0932 -0.53634 0.17967 -1.6247 -0.0029176 1.3184 0.45594 0.3682 0.87591 45 | n't 0.31872 0.52105 -0.056364 -0.34805 -0.77221 -0.28169 2.06 -0.56607 -0.32574 0.073742 -0.46097 0.54654 -6.0364 -0.56174 -0.084994 0.34263 0.1017 -0.82377 0.14404 -0.73248 0.63707 0.82175 0.53894 0.3674 -0.25653 46 | that 0.20823 0.22476 -0.070949 0.23917 -0.36076 -0.23443 1.8633 -0.4573 -0.40894 -0.055079 -0.11599 1.0568 -6.2614 -0.24912 0.37123 0.21891 0.67926 -0.35585 0.18441 -0.11821 0.58806 0.59916 0.40883 0.15874 -0.55338 47 | on 0.21228 -0.2435 -0.57013 0.33778 -0.86072 -0.1771 0.86891 -0.11103 0.53467 -0.0036497 0.11068 0.44655 -5.6486 -0.033026 0.36245 0.74407 -0.16614 -0.61851 -1.8327 0.51321 -0.31933 -0.68438 0.59145 -0.55647 -0.31049 48 | y 0.21767 0.19018 -0.27414 0.69654 0.12748 0.83719 -1.2804 3.8718 -0.96223 -1.1195 0.985 -2.4167 -2.9994 0.017624 1.3301 0.48203 -0.19386 0.09823 1.2263 0.80212 0.48846 -0.98181 0.53096 0.4342 0.67874 49 | ' 0.44205 -0.67697 -0.079938 0.89579 -0.043245 0.35863 0.51735 1.433 -0.21658 0.93923 0.36207 -0.27295 -3.4128 0.46583 -0.87769 -0.42464 -1.3648 0.43996 -2.4477 -0.23733 0.42426 0.18637 -0.19753 0.26109 0.44809 50 | e 1.2775 -0.29531 0.57591 -0.42937 0.12591 -0.81734 -0.70924 1.6399 0.52233 -0.53468 -0.85909 -2.6308 -3.5076 -0.60357 -1.8392 -0.43235 -2.3822 0.14332 -1.2133 -0.33507 -0.75788 0.58005 0.38244 0.29998 0.66194 51 | o 1.3635 0.04007 0.82928 1.0005 0.51972 0.067199 -0.39481 1.9431 0.050642 -0.139 -0.67126 -2.5206 -3.1961 -0.67686 -0.75984 -1.5995 -1.178 -0.014596 -0.88455 -0.48474 0.21677 0.41828 0.24096 0.83342 1.1436 52 | u 0.4532 0.95597 -0.15188 -0.76201 -0.44016 0.031701 0.28219 1.9646 -0.64687 0.48962 -1.0939 0.0013035 -4.7503 0.14003 -1.4439 -0.0030859 -0.47395 -0.51914 -0.5441 0.48264 -0.076634 -0.064356 0.14666 0.48416 0.11541 53 | en 0.69564 -0.48183 -0.080013 0.33814 0.37114 1.3014 -2.2778 2.8792 -1.5228 -0.8133 1.0348 -2.0311 -2.8536 0.61582 1.254 1.0985 0.46716 1.5167 -1.086 0.94948 0.401 -1.4937 0.25163 -0.9494 1.5508 54 | this -0.17895 0.38406 0.073035 -0.32363 -0.092441 -0.40767 2.1 -0.11363 -0.58784 -0.17034 -0.6433 0.72388 -5.7839 -0.10406 0.52152 -0.11314 0.59554 -0.47587 -0.4551 0.084431 -0.4582 -0.16727 0.54594 0.035478 -0.16073 55 | el 0.11335 0.59796 0.38876 0.83878 0.83717 0.24276 -1.4375 4.2874 -0.51924 -1.0783 1.0522 -2.5613 -2.5292 0.47829 2.1483 0.013401 0.58133 0.6934 0.296 -0.012265 0.37054 -0.71858 1.6555 -0.60154 0.86965 56 | so 0.39543 -0.60706 0.34448 -0.93783 -0.30466 0.46151 1.5214 0.070674 -0.36075 0.029852 -1.1005 -0.053799 -5.079 -0.73424 -0.40314 -0.10083 -0.0022164 -0.47121 -0.88651 -1.1737 0.22514 0.87842 -0.10534 1.27 -0.22951 57 | be -0.3435 1.0138 -0.039231 -0.61739 -0.13 0.65973 1.1861 -0.117 -0.61421 0.39945 -0.33834 0.54643 -5.4199 0.31714 -0.62972 -0.49683 0.38104 -0.52959 -0.51274 -0.88274 0.524 1.032 -0.62416 0.12028 -0.10696 58 | 'm -0.60745 0.40046 0.72375 
-0.51941 -0.60935 0.49649 1.7686 -0.48596 -0.35951 0.68387 -0.94178 0.2865 -5.0159 -0.46631 -0.36731 -0.46284 0.48544 -0.32663 -0.82502 -0.88353 0.89693 0.12028 0.030383 -0.043274 0.64118 59 | with -0.9476 0.32533 0.23967 0.29609 -0.098118 -0.10892 1.3503 -0.014157 0.15739 0.13604 -0.06848 0.68701 -5.6666 -0.41398 0.22936 -0.3325 0.49592 -0.74203 -0.032459 0.40253 -1.0907 -0.11469 -0.25527 -0.40069 -0.47669 60 | just -0.35518 0.4803 0.49681 -0.76379 -0.64588 0.083208 1.7889 -0.3547 -0.4497 0.022174 0.18026 0.81539 -5.7024 -0.75963 -0.066477 0.52203 0.50433 -0.42471 -0.37414 -0.67191 0.48804 0.43652 0.29954 0.43855 -0.40954 61 | > 0.67403 -0.1364 -0.080726 -0.89577 0.33463 0.21482 1.0867 2.0046 -0.61399 0.25588 -0.36965 -0.28485 -3.3158 -0.092295 -1.1617 -0.87728 -1.5146 -0.65678 -1.0679 0.61796 -0.61208 -0.614 0.39864 0.41253 0.56256 62 | your -0.48666 0.32014 -0.27703 1.0928 0.45773 -0.37923 1.9602 -0.35099 -0.34286 0.26806 -0.49852 0.34517 -6.0877 -0.18907 -0.45247 0.026142 0.56855 -0.64653 0.21283 -0.15051 -0.59593 -0.31407 0.2869 0.27501 -1.4474 63 | ^ 0.039588 -0.11419 -0.076219 -0.12791 -0.33003 0.43771 1.8312 1.9416 -0.86341 0.62057 -0.56913 0.3571 -1.7911 -0.40879 -0.53269 -1.4991 -1.053 -0.7939 -1.8975 0.5277 -0.37142 -1.7164 -0.055501 1.9065 0.92442 64 | like 0.068004 0.10737 0.61292 0.35446 -0.28576 0.44095 1.7574 -0.0079057 -0.66561 0.20433 -0.51421 0.46797 -5.349 -0.99746 -0.12069 0.11433 0.37355 -0.97219 -0.089747 -0.14982 0.34141 0.58987 0.51226 -0.06509 -0.068817 65 | have -0.058224 0.79651 -0.060888 -0.58459 -0.56228 -0.072496 1.8592 -0.42394 -0.56051 -0.057096 0.44406 0.31399 -5.8856 0.061965 -0.20702 -0.046447 0.75287 -0.58468 0.25053 -0.72399 -0.13122 0.37822 -0.17631 0.43037 -0.45315 66 | te 0.81979 -0.44008 -1.1408 1.1369 -0.71241 0.030568 -1.3383 2.9954 -1.413 -1.1037 0.57031 -2.9081 -2.3991 -1.1408 -0.15021 1.0955 -0.38832 0.88401 -0.92539 0.41138 0.77758 -0.91392 1.165 1.9987 0.51215 67 | at -1.0206 -0.12834 1.0937 -0.74474 -0.51548 -0.50633 1.0489 -0.18161 0.86851 0.38252 -0.34213 -0.079467 -4.9305 0.91787 0.046519 -0.83393 0.77692 -0.70471 -0.99819 -0.73907 -0.57397 -0.2561 -0.24104 -0.58805 0.48638 68 | ? 
0.62413 0.77774 0.50666 -1.0425 0.69918 0.72191 0.91422 1.2456 0.21374 0.56368 -2.2575 2.2898 -1.8461 -0.57553 -2.3108 -1.4953 -0.80619 1.9734 -0.28921 2.0816 1.9227 -2.8119 0.46788 0.94142 0.99366 69 | love -0.62645 -0.082389 0.070538 0.5782 -0.87199 -0.14816 2.2315 0.98573 -1.3154 -0.34921 -0.8847 0.14585 -4.97 -0.73369 -0.94359 0.035859 -0.026733 -0.77538 -0.30014 0.48853 -0.16678 -0.016651 -0.53164 0.64236 -0.10922 70 | se 1.5771 -0.89176 -0.62529 0.42748 -0.83033 -0.064147 -1.6374 3.2134 0.12522 -1.2291 0.33955 -3.5723 -2.9215 -0.14489 0.10353 1.2451 -1.1518 0.27602 -0.26116 0.098035 0.73908 0.092399 0.80997 0.44311 0.62663 71 | are 0.1866 -0.098326 -0.12268 -0.93822 -0.40161 0.6383 1.6686 -0.68036 -0.98359 -0.079512 0.38078 0.039076 -5.4147 0.02829 -0.47007 0.11377 -0.52725 -0.79312 0.58203 -0.61829 0.37025 0.2261 -0.73014 -0.1019 -0.21382 72 | < 0.58806 -0.15251 -0.064951 -0.88166 0.15004 -0.19889 1.1618 2.0507 -0.91723 0.38324 -0.60172 0.085248 -3.0157 0.3511 -1.1353 -0.77262 -1.6993 -0.61297 -1.0837 0.67306 -0.41953 -0.76792 0.64669 0.56938 0.53808 73 | m 0.46768 0.24442 -0.79763 -0.74 -0.41127 0.26882 -0.21232 2.257 -0.3162 -0.17906 -0.15279 -0.50826 -4.0466 0.57889 -1.1724 0.031238 -1.1387 0.039347 -1.32 1.2395 -0.56249 -1.0717 0.5102 -0.021507 0.24223 74 | r 0.55525 0.71333 -0.49285 -0.90042 0.043085 0.11183 0.11446 2.2269 -0.69619 0.064417 -0.39974 -0.43526 -3.9523 0.35152 -1.7713 -0.77199 -1.4343 -0.39785 -0.72236 0.86474 -0.18005 -0.72005 0.44048 -0.36406 0.3805 75 | if 0.18243 0.70534 -0.34209 -0.10779 -0.72721 -0.58802 1.7457 -0.13666 -0.61576 0.15336 -0.19019 0.70282 -5.725 -0.20901 -0.33692 0.16916 0.35872 -0.9871 0.45495 -0.36607 0.62973 0.11066 0.31315 0.08787 -0.88679 76 | all -0.18232 0.96997 0.32174 -0.074793 -0.11618 0.095187 1.4986 -0.41357 -0.86298 0.42067 -0.53125 0.10797 -5.56 0.25913 -0.58389 0.12241 -0.15168 -0.71983 -0.16605 -0.092514 -0.34099 0.12464 0.086193 -0.1411 -0.18656 77 | b 0.50344 0.61831 -0.24684 -0.93508 -0.11275 0.22688 0.15222 2.3432 -0.24032 0.12045 -0.60827 -0.25307 -4.0088 0.42389 -1.2794 -0.72646 -1.152 -0.36694 -0.77069 0.80603 -0.52068 -0.40503 0.23572 0.12789 0.21919 78 | ・ 1.4696 -1.1562 -1.2549 -0.045791 0.26885 1.3117 0.093971 2.056 1.8101 -0.78516 -1.127 2.8487 -2.1782 0.14884 -2.6502 -1.9661 -0.32747 1.1759 0.82536 1.7901 0.36039 -2.2292 1.1782 -0.17536 1.0635 79 | not 0.35377 0.32604 -0.22682 -0.32412 -0.18555 0.1486 1.3914 -0.65154 -0.38197 0.17129 -0.43405 0.39154 -5.7918 -0.20201 -0.23216 -0.10638 0.070835 -0.2146 -0.094385 -1.0851 0.61683 0.82184 -0.35102 0.19177 -0.43818 80 | but 0.17129 -0.012287 -0.32958 -0.16519 -0.54661 0.17683 1.7577 -0.63738 -0.47622 0.13892 -0.43254 0.789 -5.7212 0.068779 0.37439 0.51778 0.331 -0.74517 -0.20935 -0.21567 0.66493 0.52874 0.1066 0.47777 -0.48408 81 | we 0.49653 1.1435 -0.28609 -0.63378 -1.2347 -0.096415 0.81466 0.31406 -0.81064 0.43028 -0.10844 0.23407 -5.5359 0.045617 -0.15732 0.74855 0.35315 -0.11169 -0.12048 -0.13965 -0.27711 0.34342 -0.40377 0.61924 0.5394 82 | es 0.074369 -0.84513 -1.1513 0.2921 0.88949 0.8157 -0.71079 3.9407 -0.90175 -0.99754 1.1565 -2.4328 -1.9748 -0.28035 1.8613 0.84797 -0.055742 0.95519 -0.066362 -0.10171 1.2204 -0.12053 0.26966 0.22542 0.65419 83 | ya 0.18773 1.2677 0.031241 0.62269 -0.081441 0.68537 -0.36275 2.9738 -0.72967 1.035 0.41136 -0.99949 -2.8181 0.24364 0.51258 -0.51171 0.6693 -0.59732 -0.91591 0.23611 0.22535 0.42765 -0.7947 1.5532 0.1225 84 | & -0.47122 0.056607 -0.29293 0.14845 -0.38806 0.98729 0.70408 1.1875 -0.11685 -0.022892 0.77426 0.54195 
-4.7114 -0.50027 -0.44136 -0.29372 -0.31554 -0.58285 -0.1643 0.10002 -0.14504 -0.79716 -0.7261 0.2252 -0.17583 85 | follow 0.0033407 0.91533 -1.2207 -0.45773 -0.59682 -0.64277 1.0497 1.7013 -1.3643 0.3601 -0.25219 -0.14811 -3.8711 -1.0949 -1.3944 0.22114 -0.76684 -2.0907 -1.0099 0.20968 -0.27403 -1.8368 -0.4521 0.8915 -1.0369 86 | up -0.19555 0.8236 0.73457 0.12395 -0.21509 0.51058 1.1859 0.0058648 -0.18281 0.35138 -0.42667 0.66238 -5.3506 -0.066143 0.15166 -0.11217 0.25905 -1.2281 -0.63229 -0.49146 -0.2852 0.10816 1.0703 0.11823 -0.010115 87 | what 0.5292 0.34413 -0.055991 0.15468 -0.48548 -0.42618 2.0155 -0.082657 -0.40249 0.066622 -0.56097 0.61953 -5.6142 -0.40026 0.54612 -0.021634 0.46933 -0.33373 0.096102 0.095193 0.53975 0.33677 -0.28179 0.50744 -0.51784 88 | get -0.33344 1.2678 0.04472 -0.6707 -0.2079 0.12289 1.3696 -0.22981 -0.53645 -0.1833 -0.13394 0.63001 -5.6577 -0.66158 -0.26856 -0.1472 0.71835 -1.1591 -0.055771 -0.8732 -0.025146 0.3069 1.0787 0.38923 -0.028963 89 | lol 0.073266 0.069397 0.15877 -0.20805 -0.43151 -0.26647 1.3905 0.48968 -0.73748 0.78262 -1.1841 0.1629 -4.6268 -0.76179 0.80136 0.78043 0.39787 -0.9309 -1.41 0.32624 1.2311 0.42486 0.70665 0.76985 0.20826 90 | un -0.10621 0.057591 -0.752 1.3185 0.79391 0.56179 -0.85496 3.275 -0.45897 -0.35925 1.7636 -2.7374 -2.2786 0.11344 1.8262 0.91508 0.30052 0.35935 -2.1484 0.12763 0.17556 -0.79025 1.0724 -0.76144 0.37949 91 | ♥ -0.84469 -1.0147 -0.2948 0.50243 -0.032096 -0.25413 1.7008 2.7719 -2.0656 -0.016464 -0.10534 0.0078914 -1.8015 0.04799 -0.96726 -1.0318 -1.3104 -0.51772 -1.1396 0.53056 0.02979 -0.82598 0.19418 2.0574 0.50733 92 | lo 0.67317 0.46962 -1.0861 0.70384 -0.033311 0.0016168 -0.20346 4.6008 0.083431 0.087919 1.3443 -2.0923 -1.781 0.069451 1.2344 1.1293 -0.13422 -0.1458 -0.15684 0.087938 0.31166 0.20576 -0.015365 1.2647 -0.26626 93 | when -0.26148 0.2644 0.44876 0.1599 -0.47692 -0.31942 2.1561 -0.52634 -0.56854 0.28894 -0.3091 0.69452 -5.6008 -0.18411 0.075871 0.69657 0.35998 -0.64724 0.34793 -0.5933 0.57665 0.30556 0.48815 -0.11032 -0.78019 94 | was -0.16063 0.021235 0.95695 -1.0642 -0.42496 0.070767 0.88473 -0.38835 -0.96585 -0.032367 0.2431 1.4682 -5.7357 -0.31611 0.59085 0.33569 0.92369 1.0457 -0.96856 -0.30444 0.65033 0.94053 -0.003684 0.14969 0.38408 95 | “ 1.1393 0.31722 0.33893 0.59917 0.41805 0.26417 1.5744 2.0874 -0.38514 0.47434 0.3168 0.093502 -2.9127 -0.14853 -0.43053 -0.97776 -0.56379 0.31383 -0.67353 -0.63432 0.88654 -0.22208 -0.12107 -0.19183 0.6994 96 | ” 0.74133 -0.56073 0.0037826 0.25398 0.64676 -0.03136 1.5837 1.5481 -1.3584 0.62118 -0.14075 0.17492 -3.2719 -0.011897 -0.16072 -0.57518 -0.55753 0.22912 -0.49103 -0.25186 0.97739 -0.30327 0.39648 0.28217 1.0422 97 | one 0.39657 0.15653 0.50676 -0.039995 -0.1177 -0.011625 1.7677 0.33504 -0.84748 -0.27969 0.036325 -0.146 -5.2788 0.053348 -0.60437 0.26285 0.15334 -0.31598 -0.18437 -0.21645 -0.095925 -0.07569 0.18185 -0.18519 -0.33499 98 | por 0.98549 0.19405 0.77539 0.44123 0.58736 0.55549 -1.4343 2.9726 -1.1735 -1.2678 0.15086 -3.3556 -2.1731 -0.41532 1.1496 -0.46623 -1.331 0.71864 0.64299 -0.42066 0.51122 -0.81006 0.44971 0.087221 0.35618 99 | si 0.31329 -0.29282 -0.88699 0.45418 -0.77082 -0.55735 -0.23928 4.1561 -0.27757 0.1545 0.9518 -2.3782 -1.9816 -0.11735 1.1881 1.358 0.56444 0.24605 -0.98873 0.64185 0.35727 -0.078937 -0.28172 0.88359 -0.51221 100 | out -0.28653 0.60501 0.62592 -0.034889 -0.10508 0.063965 1.1527 -0.18502 -0.22128 0.29563 -0.061197 0.71973 -5.4451 -0.055855 0.078477 -0.0090364 0.32605 -0.90771 -0.53689 -0.34474 -0.3713 
-0.17721 0.87016 -0.15274 0.026154 101 | -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import array 3 | 4 | import pytest 5 | 6 | import numpy as np 7 | import scipy.sparse as sp 8 | 9 | from glove import Corpus 10 | from glove.glove import check_random_state 11 | 12 | from utils import (build_coocurrence_matrix, 13 | generate_training_corpus) 14 | 15 | 16 | def test_corpus_construction(): 17 | 18 | corpus_words = ['a', 'naïve', 'fox'] 19 | corpus = [corpus_words] 20 | 21 | model = Corpus() 22 | model.fit(corpus, window=10) 23 | 24 | for word in corpus_words: 25 | assert word in model.dictionary 26 | 27 | assert model.matrix.shape == (len(corpus_words), 28 | len(corpus_words)) 29 | 30 | expected = [[0.0, 1.0, 0.5], 31 | [0.0, 0.0, 1.0], 32 | [0.0, 0.0, 0.0]] 33 | 34 | assert (model.matrix.todense().tolist() 35 | == expected) 36 | 37 | 38 | def test_supplied_dictionary(): 39 | 40 | dictionary = {'a': 2, 41 | 'naïve': 1, 42 | 'fox': 0} 43 | 44 | corpus = [['a', 'naïve', 'fox']] 45 | 46 | model = Corpus(dictionary=dictionary) 47 | model.fit(corpus, window=10) 48 | 49 | assert model.dictionary == dictionary 50 | 51 | assert model.matrix.shape == (len(dictionary), 52 | len(dictionary)) 53 | 54 | assert (model.matrix.tocsr()[2]).sum() == 0 55 | 56 | 57 | def test_supplied_dict_checks(): 58 | 59 | dictionary = {'a': 4, 60 | 'naïve': 1, 61 | 'fox': 0} 62 | 63 | with pytest.raises(Exception): 64 | Corpus(dictionary=dictionary) 65 | 66 | 67 | def test_supplied_dict_missing(): 68 | 69 | dictionary = {'a': 1, 70 | 'naïve': 0} 71 | 72 | corpus = [['a', 'naïve', 'fox']] 73 | 74 | model = Corpus(dictionary=dictionary) 75 | 76 | with pytest.raises(KeyError): 77 | model.fit(corpus, window=10) 78 | 79 | 80 | def test_supplied_dict_missing_ignored(): 81 | 82 | dictionary = {'a': 0, 83 | 'fox': 1} 84 | 85 | corpus = [['a', 'naïve', 'fox']] 86 | 87 | model = Corpus(dictionary=dictionary) 88 | model.fit(corpus, window=10, ignore_missing=True) 89 | 90 | assert model.dictionary == dictionary 91 | 92 | assert model.matrix.shape == (len(dictionary), 93 | len(dictionary)) 94 | 95 | # Ensure that context windows and context window 96 | # weights are preserved. 
97 | full_model = Corpus() 98 | full_model.fit(corpus, window=10) 99 | 100 | assert (full_model.matrix.todense()[0, 2] 101 | == model.matrix.todense()[0, 1] 102 | == 0.5) 103 | 104 | 105 | def test_large_corpus_construction(): 106 | 107 | num_sentences = 5000 108 | seed = 10 109 | 110 | corpus = Corpus() 111 | 112 | corpus.fit(generate_training_corpus(num_sentences, seed=seed)) 113 | 114 | matrix = corpus.matrix.tocsr().tocoo() 115 | check_matrix = build_coocurrence_matrix(generate_training_corpus(num_sentences, 116 | seed=seed)) 117 | 118 | assert (matrix.row == check_matrix.row).all() 119 | assert (matrix.col == check_matrix.col).all() 120 | assert np.allclose(matrix.data, check_matrix.data) 121 | assert (matrix.data > 0).all() 122 | -------------------------------------------------------------------------------- /tests/test_glove.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from glove import Corpus, Glove 5 | 6 | from utils import generate_training_corpus 7 | 8 | 9 | def _reproduce_input_matrix(glove_model): 10 | 11 | wvec = glove_model.word_vectors 12 | wbias = glove_model.word_biases 13 | 14 | out = np.dot(wvec, wvec.T) 15 | 16 | for i in range(wvec.shape[0]): 17 | for j in range(wvec.shape[0]): 18 | if i == j: 19 | out[i, j] = 0.0 20 | elif i < j: 21 | out[i, j] += wbias[i] + wbias[j] 22 | else: 23 | out[i, j] = 0.0 24 | 25 | return np.asarray(out) 26 | 27 | 28 | def test_stanford_loading(): 29 | 30 | model = Glove.load_stanford('tests/stanford_test.txt') 31 | 32 | assert model.word_vectors is not None 33 | assert model.word_vectors.shape == (100, 25) 34 | assert len(model.dictionary) == 100 35 | 36 | # Python 2/3 compatibility. Check the ellipsis 37 | # character is in the dictionary. 
38 | try: 39 | # Python 2 40 | assert unichr(8230) in model.dictionary 41 | except NameError: 42 | # Python 3 43 | assert '…' in model.dictionary 44 | 45 | 46 | def test_fitting(): 47 | """ 48 | Verify that the squared error diminishes with fitting. 49 | """ 50 | 51 | num_sentences = 5000 52 | seed = 10 53 | 54 | corpus = Corpus() 55 | 56 | corpus.fit(generate_training_corpus(num_sentences, 57 | vocabulary_size=50, 58 | seed=seed)) 59 | 60 | # Check that the performance is poor without fitting 61 | glove_model = Glove(no_components=100, learning_rate=0.05) 62 | glove_model.fit(corpus.matrix, 63 | epochs=0, 64 | no_threads=2) 65 | 66 | log_cooc_mat = corpus.matrix.copy() 67 | log_cooc_mat.data = np.log(log_cooc_mat.data) 68 | log_cooc_mat = np.asarray(log_cooc_mat.todense()) 69 | 70 | repr_matrix = _reproduce_input_matrix(glove_model) 71 | 72 | assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0 73 | 74 | # Check that it is good with fitting 75 | glove_model = Glove(no_components=100, learning_rate=0.05) 76 | glove_model.fit(corpus.matrix, 77 | epochs=500, 78 | no_threads=2) 79 | 80 | repr_matrix = _reproduce_input_matrix(glove_model) 81 | 82 | assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0 83 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import array 3 | 4 | import numpy as np 5 | import scipy.sparse as sp 6 | 7 | from glove.glove import check_random_state 8 | 9 | 10 | def generate_training_corpus(num_sentences, 11 | vocabulary_size=30000, 12 | sentence_min_size=2, 13 | sentence_max_size=30, 14 | seed=None): 15 | 16 | rs = check_random_state(seed) 17 | 18 | for _ in range(num_sentences): 19 | sentence_size = rs.randint(sentence_min_size, 20 | sentence_max_size) 21 | yield [str(x) for x in 22 | rs.randint(0, vocabulary_size, sentence_size)] 23 | 24 | 25 | def build_coocurrence_matrix(sentences): 26 | 27 | dictionary = {} 28 | rows = [] 29 | cols = [] 30 | data = array.array('f') 31 | 32 | window = 10  # matches the default window used by Corpus.fit 33 | 34 | for sentence in sentences: 35 | for i, first_word in enumerate(sentence): 36 | first_word_idx = dictionary.setdefault(first_word, 37 | len(dictionary)) 38 | for j, second_word in enumerate(sentence[i:i + window + 1]): 39 | second_word_idx = dictionary.setdefault(second_word, 40 | len(dictionary)) 41 | 42 | distance = j  # weight is 1 / distance; equal-index pairs (incl. j == 0) are skipped below, avoiding division by zero 43 | 44 | if first_word_idx == second_word_idx: 45 | pass 46 | elif first_word_idx < second_word_idx: 47 | rows.append(first_word_idx) 48 | 49 | cols.append(second_word_idx) 50 | data.append(np.float32(1.0) / distance) 51 | else: 52 | rows.append(second_word_idx) 53 | cols.append(first_word_idx) 54 | data.append(np.float32(1.0) / distance) 55 | 56 | return sp.coo_matrix((data, (rows, cols)), 57 | shape=(len(dictionary), 58 | len(dictionary)), 59 | dtype=np.float32).tocsr().tocoo() 60 | --------------------------------------------------------------------------------