├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── demo.sh ├── eval ├── matlab │ ├── WordLookup.m │ ├── evaluate_vectors.m │ └── read_and_evaluate.m ├── octave │ ├── WordLookup_octave.m │ ├── evaluate_vectors_octave.m │ └── read_and_evaluate_octave.m ├── python │ ├── distance.py │ ├── evaluate.py │ └── word_analogy.py └── question-data │ ├── capital-common-countries.txt │ ├── capital-world.txt │ ├── city-in-state.txt │ ├── currency.txt │ ├── family.txt │ ├── gram1-adjective-to-adverb.txt │ ├── gram2-opposite.txt │ ├── gram3-comparative.txt │ ├── gram4-superlative.txt │ ├── gram5-present-participle.txt │ ├── gram6-nationality-adjective.txt │ ├── gram7-past-tense.txt │ ├── gram8-plural.txt │ └── gram9-plural-verbs.txt ├── randomization.test.sh └── src ├── README.md ├── common.c ├── common.h ├── cooccur.c ├── glove.c ├── shuffle.c └── vocab_count.c /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. 
Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | 34 | 35 | build/* 36 | *.swp 37 | 38 | # OS X stuff 39 | ._* 40 | 41 | # demo.sh-produced artifacts 42 | /cooccurrence.bin 43 | /cooccurrence.shuf.bin 44 | /text8 45 | /vectors.bin 46 | /vectors.txt 47 | /vocab.txt 48 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | dist: trusty 3 | sudo: required 4 | before_install: 5 | - sudo apt-get install python2.7 python-numpy python-pip 6 | script: pip install numpy && ./demo.sh | tee results.txt && [[ `cat results.txt | egrep "Total accuracy. 2[23]" | wc -l` = "1" ]] && echo test-passed 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2014 The Board of Trustees of The Leland Stanford Junior University 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #For older gcc, use -O3 or -O2 instead of -Ofast 3 | # CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wno-unused-result 4 | 5 | # Use -Ofast with caution. It speeds up training, but the checks for NaN will not work 6 | # (-Ofast turns on --fast-math, which turns on -ffinite-math-only, 7 | # which assumes everything is NOT NaN or +-Inf, so checks for NaN always return false 8 | # see https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html) 9 | # CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic 10 | 11 | CPU_ARCHITECTURE = $(shell uname -m) 12 | OS = $(shell uname -o) 13 | # Non-empty string if Apple Silicon, empty string otherwise. 
14 | APPLE_SILICON = $(and $(filter Darwin,$(OS)),$(filter arm64,$(CPU_ARCHITECTURE))) 15 | 16 | # clang (which masquerades as gcc on macOS) doesn't support this option, at least 17 | # not the Apple-provided clang on Apple Silicon as of macOS 13.2.1. 18 | ifeq ($(APPLE_SILICON),) 19 | CPU_ARCHITECTURE_FLAGS = -march=native 20 | endif 21 | 22 | CFLAGS = -lm -pthread -O3 $(CPU_ARCHITECTURE_FLAGS) -funroll-loops -Wall -Wextra -Wpedantic 23 | BUILDDIR := build 24 | SRCDIR := src 25 | OBJDIR := $(BUILDDIR) 26 | 27 | OBJ := $(OBJDIR)/vocab_count.o $(OBJDIR)/cooccur.o $(OBJDIR)/shuffle.o $(OBJDIR)/glove.o 28 | HEADERS := $(SRCDIR)/common.h 29 | MODULES := $(BUILDDIR)/vocab_count $(BUILDDIR)/cooccur $(BUILDDIR)/shuffle $(BUILDDIR)/glove 30 | 31 | 32 | all: dir $(OBJ) $(MODULES) 33 | dir : 34 | mkdir -p $(BUILDDIR) 35 | $(BUILDDIR)/glove : $(OBJDIR)/glove.o $(OBJDIR)/common.o 36 | $(CC) $^ -o $@ $(CFLAGS) 37 | $(BUILDDIR)/shuffle : $(OBJDIR)/shuffle.o $(OBJDIR)/common.o 38 | $(CC) $^ -o $@ $(CFLAGS) 39 | $(BUILDDIR)/cooccur : $(OBJDIR)/cooccur.o $(OBJDIR)/common.o 40 | $(CC) $^ -o $@ $(CFLAGS) 41 | $(BUILDDIR)/vocab_count : $(OBJDIR)/vocab_count.o $(OBJDIR)/common.o 42 | $(CC) $^ -o $@ $(CFLAGS) 43 | $(OBJDIR)/%.o : $(SRCDIR)/%.c $(HEADERS) 44 | $(CC) -c $< -o $@ $(CFLAGS) 45 | .PHONY: clean 46 | clean: 47 | rm -rf $(BUILDDIR) 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## GloVe: Global Vectors for Word Representation 2 | 3 | 4 | | nearest neighbors of
frog | Litoria | Leptodactylidae | Rana | Eleutherodactylus | 5 | | --- | ------------------------------- | ------------------- | ---------------- | ------------------- | 6 | | Pictures | | | | | 7 | 8 | | Comparisons | man -> woman | city -> zip | comparative -> superlative | 9 | | --- | ------------------------|-------------------------|-------------------------| 10 | | GloVe Geometry | | | | 11 | 12 | We provide an implementation of the GloVe model for learning word representations, and describe how to download web-dataset vectors or train your own. See the [project page](https://nlp.stanford.edu/projects/glove/) or the [paper](https://nlp.stanford.edu/pubs/glove.pdf) for more information on glove vectors. 13 | 14 | ## Download pre-trained word vectors 15 | The links below contain word vectors obtained from the respective corpora. If you want word vectors trained on massive web datasets, you need only download one of these text files! Pre-trained word vectors are made available under the Public Domain Dedication and License. 16 |
17 | 23 |
#!/bin/bash
set -e

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

# Build the GloVe tools (vocab_count, cooccur, shuffle, glove) via the top-level Makefile.
make

# Fetch the text8 corpus once; skip the download if the file is already present.
# wget is preferred; curl -O is the fallback when wget is not installed.
if [ ! -e text8 ]; then
  if hash wget 2>/dev/null; then
    wget http://mattmahoney.net/dc/text8.zip
  else
    curl -O http://mattmahoney.net/dc/text8.zip
  fi
  unzip text8.zip
  rm text8.zip
fi

# File names and hyperparameters for the four-stage pipeline below.
CORPUS=text8
VOCAB_FILE=vocab.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors
VERBOSE=2
MEMORY=4.0              # passed as -memory to cooccur and shuffle
VOCAB_MIN_COUNT=5       # passed as -min-count to vocab_count
VECTOR_SIZE=50          # passed as -vector-size to glove
MAX_ITER=15             # passed as -iter to glove
WINDOW_SIZE=15          # passed as -window-size to cooccur
BINARY=2                # passed as -binary to glove
NUM_THREADS=8           # passed as -threads to glove
X_MAX=10                # passed as -x-max to glove
# Pick a Python interpreter: `python` if it exists on PATH, otherwise `python3`.
if hash python 2>/dev/null; then
    PYTHON=python
else
    PYTHON=python3
fi

echo
# Stage 1: count unigram frequencies and emit the vocabulary.
echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
$BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
# Stage 2: build the word-word cooccurrence counts from the corpus.
echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
$BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
# Stage 3: shuffle the cooccurrence records before training.
echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
# Stage 4: train the GloVe model on the shuffled cooccurrences.
echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
# Evaluate the trained vectors; the word-analogy question set is tuned for text8.
# $1 selects the eval harness (matlab/octave); the default is the Python script.
# MATLAB/Octave output is redirected to stderr so stdout carries only the accuracy summary.
if [ "$CORPUS" = 'text8' ]; then
  if [ "$1" = 'matlab' ]; then
    matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
  elif [ "$1" = 'octave' ]; then
    octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
  else
    echo "$ $PYTHON eval/python/evaluate.py"
    $PYTHON eval/python/evaluate.py
  fi
fi
7 | 'gram7-past-tense' 'gram8-plural' 'gram9-plural-verbs'}; 8 | path = './eval/question-data/'; 9 | 10 | split_size = 100; %to avoid memory overflow, could be increased/decreased depending on system and vocab size 11 | 12 | correct_sem = 0; %count correct semantic questions 13 | correct_syn = 0; %count correct syntactic questions 14 | correct_tot = 0; %count correct questions 15 | count_sem = 0; %count all semantic questions 16 | count_syn = 0; %count all syntactic questions 17 | count_tot = 0; %count all questions 18 | full_count = 0; %count all questions, including those with unknown words 19 | 20 | if wordMap.isKey('') 21 | unkkey = wordMap(''); 22 | else 23 | unkkey = 0; 24 | end 25 | 26 | for j=1:length(filenames); 27 | 28 | clear dist; 29 | 30 | fid=fopen([path filenames{j} '.txt']); 31 | temp=textscan(fid,'%s%s%s%s'); 32 | fclose(fid); 33 | ind1 = cellfun(@WordLookup,temp{1}); %indices of first word in analogy 34 | ind2 = cellfun(@WordLookup,temp{2}); %indices of second word in analogy 35 | ind3 = cellfun(@WordLookup,temp{3}); %indices of third word in analogy 36 | ind4 = cellfun(@WordLookup,temp{4}); %indices of answer word in analogy 37 | full_count = full_count + length(ind1); 38 | ind = (ind1 ~= unkkey) & (ind2 ~= unkkey) & (ind3 ~= unkkey) & (ind4 ~= unkkey); %only look at those questions which have no unknown words 39 | ind1 = ind1(ind); 40 | ind2 = ind2(ind); 41 | ind3 = ind3(ind); 42 | ind4 = ind4(ind); 43 | disp([filenames{j} ':']); 44 | mx = zeros(1,length(ind1)); 45 | num_iter = ceil(length(ind1)/split_size); 46 | for jj=1:num_iter 47 | range = (jj-1)*split_size+1:min(jj*split_size,length(ind1)); 48 | dist = full(W * (W(ind2(range),:)' - W(ind1(range),:)' + W(ind3(range),:)')); %cosine similarity if input W has been normalized 49 | for i=1:length(range) 50 | dist(ind1(range(i)),i) = -Inf; 51 | dist(ind2(range(i)),i) = -Inf; 52 | dist(ind3(range(i)),i) = -Inf; 53 | end 54 | [~, mx(range)] = max(dist); %predicted word index 55 | end 56 | 57 | val = 
(ind4 == mx'); %correct predictions 58 | count_tot = count_tot + length(ind1); 59 | correct_tot = correct_tot + sum(val); 60 | disp(['ACCURACY TOP1: ' num2str(mean(val)*100,'%-2.2f') '% (' num2str(sum(val)) '/' num2str(length(val)) ')']); 61 | if j < 6 62 | count_sem = count_sem + length(ind1); 63 | correct_sem = correct_sem + sum(val); 64 | else 65 | count_syn = count_syn + length(ind1); 66 | correct_syn = correct_syn + sum(val); 67 | end 68 | 69 | disp(['Total accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% Semantic accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% Syntactic accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '%']); 70 | 71 | end 72 | disp('________________________________________________________________________________'); 73 | disp(['Questions seen/total: ' num2str(100*count_tot/full_count,'%-2.2f') '% (' num2str(count_tot) '/' num2str(full_count) ')']); 74 | disp(['Semantic Accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% (' num2str(correct_sem) '/' num2str(count_sem) ')']); 75 | disp(['Syntactic Accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '% (' num2str(correct_syn) '/' num2str(count_syn) ')']); 76 | disp(['Total Accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% (' num2str(correct_tot) '/' num2str(count_tot) ')']); 77 | BB = [100*correct_sem/count_sem 100*correct_syn/count_syn 100*correct_tot/count_tot]; 78 | 79 | end 80 | -------------------------------------------------------------------------------- /eval/matlab/read_and_evaluate.m: -------------------------------------------------------------------------------- 1 | addpath('./eval/matlab'); 2 | if(~exist('vocab_file')) 3 | vocab_file = 'vocab.txt'; 4 | end 5 | if(~exist('vectors_file')) 6 | vectors_file = 'vectors.bin'; 7 | end 8 | 9 | fid = fopen(vocab_file, 'r'); 10 | words = textscan(fid, '%s %f'); 11 | fclose(fid); 12 | words = words{1}; 13 | vocab_size = length(words); 14 | global wordMap 15 | wordMap = 
containers.Map(words(1:vocab_size),1:vocab_size); 16 | 17 | fid = fopen(vectors_file,'r'); 18 | fseek(fid,0,'eof'); 19 | vector_size = ftell(fid)/16/vocab_size - 1; 20 | frewind(fid); 21 | WW = fread(fid, [vector_size+1 2*vocab_size], 'double')'; 22 | fclose(fid); 23 | 24 | W1 = WW(1:vocab_size, 1:vector_size); % word vectors 25 | W2 = WW(vocab_size+1:end, 1:vector_size); % context (tilde) word vectors 26 | 27 | W = W1 + W2; %Evaluate on sum of word vectors 28 | W = bsxfun(@rdivide,W,sqrt(sum(W.*W,2))); %normalize vectors before evaluation 29 | evaluate_vectors(W); 30 | exit 31 | 32 | -------------------------------------------------------------------------------- /eval/octave/WordLookup_octave.m: -------------------------------------------------------------------------------- 1 | function index = WordLookup_octave(InputString) 2 | global wordMap 3 | 4 | if isfield(wordMap, InputString) 5 | index = wordMap.(InputString); 6 | elseif isfield(wordMap, '') 7 | index = wordMap.(''); 8 | else 9 | index = 0; 10 | end 11 | -------------------------------------------------------------------------------- /eval/octave/evaluate_vectors_octave.m: -------------------------------------------------------------------------------- 1 | function [BB] = evaluate_vectors_octave(W) 2 | 3 | global wordMap 4 | 5 | filenames = {'capital-common-countries' 'capital-world' 'currency' 'city-in-state' 'family' 'gram1-adjective-to-adverb' ... 6 | 'gram2-opposite' 'gram3-comparative' 'gram4-superlative' 'gram5-present-participle' 'gram6-nationality-adjective' ... 
7 | 'gram7-past-tense' 'gram8-plural' 'gram9-plural-verbs'}; 8 | path = './eval/question-data/'; 9 | 10 | split_size = 100; %to avoid memory overflow, could be increased/decreased depending on system and vocab size 11 | 12 | correct_sem = 0; %count correct semantic questions 13 | correct_syn = 0; %count correct syntactic questions 14 | correct_tot = 0; %count correct questions 15 | count_sem = 0; %count all semantic questions 16 | count_syn = 0; %count all syntactic questions 17 | count_tot = 0; %count all questions 18 | full_count = 0; %count all questions, including those with unknown words 19 | 20 | 21 | if isfield(wordMap, '') 22 | unkkey = wordMap.(''); 23 | else 24 | unkkey = 0; 25 | end 26 | 27 | for j=1:length(filenames); 28 | 29 | clear dist; 30 | 31 | fid=fopen([path filenames{j} '.txt']); 32 | temp=textscan(fid,'%s%s%s%s'); 33 | fclose(fid); 34 | ind1 = cellfun(@WordLookup_octave,temp{1}); %indices of first word in analogy 35 | ind2 = cellfun(@WordLookup_octave,temp{2}); %indices of second word in analogy 36 | ind3 = cellfun(@WordLookup_octave,temp{3}); %indices of third word in analogy 37 | ind4 = cellfun(@WordLookup_octave,temp{4}); %indices of answer word in analogy 38 | full_count = full_count + length(ind1); 39 | ind = (ind1 ~= unkkey) & (ind2 ~= unkkey) & (ind3 ~= unkkey) & (ind4 ~= unkkey); %only look at those questions which have no unknown words 40 | ind1 = ind1(ind); 41 | ind2 = ind2(ind); 42 | ind3 = ind3(ind); 43 | ind4 = ind4(ind); 44 | disp([filenames{j} ':']); 45 | mx = zeros(1,length(ind1)); 46 | num_iter = ceil(length(ind1)/split_size); 47 | for jj=1:num_iter 48 | range = (jj-1)*split_size+1:min(jj*split_size,length(ind1)); 49 | dist = full(W * (W(ind2(range),:)' - W(ind1(range),:)' + W(ind3(range),:)')); %cosine similarity if input W has been normalized 50 | for i=1:length(range) 51 | dist(ind1(range(i)),i) = -Inf; 52 | dist(ind2(range(i)),i) = -Inf; 53 | dist(ind3(range(i)),i) = -Inf; 54 | end 55 | [~, mx(range)] = max(dist); 
%predicted word index 56 | end 57 | 58 | val = (ind4 == mx'); %correct predictions 59 | count_tot = count_tot + length(ind1); 60 | correct_tot = correct_tot + sum(val); 61 | disp(['ACCURACY TOP1: ' num2str(mean(val)*100,'%-2.2f') '% (' num2str(sum(val)) '/' num2str(length(val)) ')']); 62 | if j < 6 63 | count_sem = count_sem + length(ind1); 64 | correct_sem = correct_sem + sum(val); 65 | else 66 | count_syn = count_syn + length(ind1); 67 | correct_syn = correct_syn + sum(val); 68 | end 69 | 70 | disp(['Total accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% Semantic accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% Syntactic accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '%']); 71 | 72 | end 73 | disp('________________________________________________________________________________'); 74 | disp(['Questions seen/total: ' num2str(100*count_tot/full_count,'%-2.2f') '% (' num2str(count_tot) '/' num2str(full_count) ')']); 75 | disp(['Semantic Accuracy: ' num2str(100*correct_sem/count_sem,'%-2.2f') '% (' num2str(correct_sem) '/' num2str(count_sem) ')']); 76 | disp(['Syntactic Accuracy: ' num2str(100*correct_syn/count_syn,'%-2.2f') '% (' num2str(correct_syn) '/' num2str(count_syn) ')']); 77 | disp(['Total Accuracy: ' num2str(100*correct_tot/count_tot,'%-2.2f') '% (' num2str(correct_tot) '/' num2str(count_tot) ')']); 78 | BB = [100*correct_sem/count_sem 100*correct_syn/count_syn 100*correct_tot/count_tot]; 79 | 80 | end 81 | -------------------------------------------------------------------------------- /eval/octave/read_and_evaluate_octave.m: -------------------------------------------------------------------------------- 1 | addpath('./eval/octave'); 2 | if(~exist('vocab_file')) 3 | vocab_file = 'vocab.txt'; 4 | end 5 | if(~exist('vectors_file')) 6 | vectors_file = 'vectors.bin'; 7 | end 8 | 9 | fid = fopen(vocab_file, 'r'); 10 | words = textscan(fid, '%s %f'); 11 | fclose(fid); 12 | words = words{1}; 13 | vocab_size = length(words); 14 | 
import argparse
import numpy as np
import sys


def generate():
    """Load vocabulary and word vectors from disk.

    Reads --vocab_file (one "word count" pair per line) and --vectors_file
    (one "word v1 v2 ..." line per word), then L2-normalizes every row so
    that dot products against the matrix are cosine similarities.

    Returns:
        (W_norm, vocab, ivocab): normalized (vocab_size, dim) matrix,
        word -> row index dict, and row index -> word dict.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_file', default='vocab.txt', type=str)
    parser.add_argument('--vectors_file', default='vectors.txt', type=str)
    args = parser.parse_args()

    with open(args.vocab_file, 'r') as f:
        words = [x.rstrip().split(' ')[0] for x in f.readlines()]
    with open(args.vectors_file, 'r') as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab = {w: idx for idx, w in enumerate(words)}
    ivocab = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(vectors[ivocab[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        if word == '':
            # Skip the empty-string token; its row stays zero.
            continue
        W[vocab[word], :] = v

    # Normalize each word vector to unit length.
    d = (np.sum(W ** 2, 1) ** (0.5))
    W_norm = (W.T / d).T
    return (W_norm, vocab, ivocab)


def distance(W, vocab, ivocab, input_term, top_n=100):
    """Print the top_n words closest (by cosine) to the sum of the input words.

    Fixes vs. the original:
      * `np.Inf` -> `np.inf` (the `np.Inf` alias was removed in NumPy 2.0);
      * the result count was a module-global `N` bound only under __main__,
        so importing this module and calling distance() raised NameError --
        it is now the backward-compatible parameter `top_n` (default 100).

    Args:
        W: (vocab_size, dim) row-normalized vector matrix (see generate()).
        vocab: word -> row index mapping.
        ivocab: row index -> word mapping.
        input_term: space-separated query words; their vectors are summed.
        top_n: number of nearest neighbors to print (default 100).

    Returns:
        None. Prints the ranking, or an out-of-dictionary notice and returns
        early if any query word is unknown.
    """
    vec_result = None
    for term in input_term.split(' '):
        if term in vocab:
            print('Word: %s Position in vocabulary: %i' % (term, vocab[term]))
            if vec_result is None:
                # First known word: start from a copy so W is never mutated.
                vec_result = np.copy(W[vocab[term], :])
            else:
                vec_result += W[vocab[term], :]
        else:
            print('Word: %s Out of dictionary!\n' % term)
            return

    # Normalize the summed query vector; W rows are already unit-length,
    # so the dot products below are cosine similarities.
    d = (np.sum(vec_result ** 2,) ** (0.5))
    vec_norm = (vec_result.T / d).T

    dist = np.dot(W, vec_norm.T)

    # Exclude the query words themselves from the ranking.
    for term in input_term.split(' '):
        index = vocab[term]
        dist[index] = -np.inf

    a = np.argsort(-dist)[:top_n]

    print("\n Word Cosine distance\n")
    print("---------------------------------------------------------\n")
    for x in a:
        print("%35s\t\t%f\n" % (ivocab[x], dist[x]))


if __name__ == "__main__":
    N = 100  # number of closest words that will be shown
    W, vocab, ivocab = generate()
    while True:
        input_term = input("\nEnter word or sentence (EXIT to break): ")
        if input_term == 'EXIT':
            break
        else:
            distance(W, vocab, ivocab, input_term, top_n=N)
| ivocab = {idx: w for idx, w in enumerate(words)} 21 | 22 | vector_dim = len(vectors[ivocab[0]]) 23 | W = np.zeros((vocab_size, vector_dim)) 24 | for word, v in vectors.items(): 25 | if word == '': 26 | continue 27 | W[vocab[word], :] = v 28 | 29 | # normalize each word vector to unit length 30 | W_norm = np.zeros(W.shape) 31 | d = (np.sum(W ** 2, 1) ** (0.5)) 32 | W_norm = (W.T / d).T 33 | evaluate_vectors(W_norm, vocab) 34 | 35 | def evaluate_vectors(W, vocab): 36 | """Evaluate the trained word vectors on a variety of tasks""" 37 | 38 | filenames = [ 39 | 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt', 40 | 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt', 41 | 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt', 42 | 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt', 43 | 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt', 44 | ] 45 | prefix = './eval/question-data/' 46 | 47 | # to avoid memory overflow, could be increased/decreased 48 | # depending on system and vocab size 49 | split_size = 100 50 | 51 | correct_sem = 0; # count correct semantic questions 52 | correct_syn = 0; # count correct syntactic questions 53 | correct_tot = 0 # count correct questions 54 | count_sem = 0; # count all semantic questions 55 | count_syn = 0; # count all syntactic questions 56 | count_tot = 0 # count all questions 57 | full_count = 0 # count all questions, including those with unknown words 58 | 59 | for i in range(len(filenames)): 60 | with open('%s/%s' % (prefix, filenames[i]), 'r') as f: 61 | full_data = [line.rstrip().split(' ') for line in f] 62 | full_count += len(full_data) 63 | data = [x for x in full_data if all(word in vocab for word in x)] 64 | 65 | if len(data) == 0: 66 | print("ERROR: no lines of vocab kept for %s !" 
% filenames[i]) 67 | print("Example missing line:", full_data[0]) 68 | continue 69 | 70 | indices = np.array([[vocab[word] for word in row] for row in data]) 71 | ind1, ind2, ind3, ind4 = indices.T 72 | 73 | predictions = np.zeros((len(indices),)) 74 | num_iter = int(np.ceil(len(indices) / float(split_size))) 75 | for j in range(num_iter): 76 | subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1))) 77 | 78 | pred_vec = (W[ind2[subset], :] - W[ind1[subset], :] 79 | + W[ind3[subset], :]) 80 | #cosine similarity if input W has been normalized 81 | dist = np.dot(W, pred_vec.T) 82 | 83 | for k in range(len(subset)): 84 | dist[ind1[subset[k]], k] = -np.inf 85 | dist[ind2[subset[k]], k] = -np.inf 86 | dist[ind3[subset[k]], k] = -np.inf 87 | 88 | # predicted word index 89 | predictions[subset] = np.argmax(dist, 0).flatten() 90 | 91 | val = (ind4 == predictions) # correct predictions 92 | count_tot = count_tot + len(ind1) 93 | correct_tot = correct_tot + sum(val) 94 | if i < 5: 95 | count_sem = count_sem + len(ind1) 96 | correct_sem = correct_sem + sum(val) 97 | else: 98 | count_syn = count_syn + len(ind1) 99 | correct_syn = correct_syn + sum(val) 100 | 101 | print("%s:" % filenames[i]) 102 | print('ACCURACY TOP1: %.2f%% (%d/%d)' % 103 | (np.mean(val) * 100, np.sum(val), len(val))) 104 | 105 | print('Questions seen/total: %.2f%% (%d/%d)' % 106 | (100 * count_tot / float(full_count), count_tot, full_count)) 107 | print('Semantic accuracy: %.2f%% (%i/%i)' % 108 | (100 * correct_sem / float(count_sem), correct_sem, count_sem)) 109 | print('Syntactic accuracy: %.2f%% (%i/%i)' % 110 | (100 * correct_syn / float(count_syn), correct_syn, count_syn)) 111 | print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot)) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /eval/python/word_analogy.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | def generate(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 7 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 8 | args = parser.parse_args() 9 | 10 | with open(args.vocab_file, 'r') as f: 11 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 12 | with open(args.vectors_file, 'r') as f: 13 | vectors = {} 14 | for line in f: 15 | vals = line.rstrip().split(' ') 16 | vectors[vals[0]] = [float(x) for x in vals[1:]] 17 | 18 | vocab_size = len(words) 19 | vocab = {w: idx for idx, w in enumerate(words)} 20 | ivocab = {idx: w for idx, w in enumerate(words)} 21 | 22 | vector_dim = len(vectors[ivocab[0]]) 23 | W = np.zeros((vocab_size, vector_dim)) 24 | for word, v in vectors.items(): 25 | if word == '': 26 | continue 27 | W[vocab[word], :] = v 28 | 29 | # normalize each word vector to unit length 30 | W_norm = np.zeros(W.shape) 31 | d = (np.sum(W ** 2, 1) ** (0.5)) 32 | W_norm = (W.T / d).T 33 | return (W_norm, vocab, ivocab) 34 | 35 | 36 | def distance(W, vocab, ivocab, input_term): 37 | vecs = {} 38 | if len(input_term.split(' ')) < 3: 39 | print("Only %i words were entered..
three words are needed at the input to perform the calculation\n" % len(input_term.split(' '))) 40 | return 41 | else: 42 | for idx, term in enumerate(input_term.split(' ')): 43 | if term in vocab: 44 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 45 | vecs[idx] = W[vocab[term], :] 46 | else: 47 | print('Word: %s Out of dictionary!\n' % term) 48 | return 49 | 50 | vec_result = vecs[1] - vecs[0] + vecs[2] 51 | 52 | vec_norm = np.zeros(vec_result.shape) 53 | d = (np.sum(vec_result ** 2,) ** (0.5)) 54 | vec_norm = (vec_result.T / d).T 55 | 56 | dist = np.dot(W, vec_norm.T) 57 | 58 | for term in input_term.split(' '): 59 | index = vocab[term] 60 | dist[index] = -np.Inf 61 | 62 | a = np.argsort(-dist)[:N] 63 | 64 | print("\n Word Cosine distance\n") 65 | print("---------------------------------------------------------\n") 66 | for x in a: 67 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 68 | 69 | 70 | if __name__ == "__main__": 71 | N = 100; # number of closest words that will be shown 72 | W, vocab, ivocab = generate() 73 | while True: 74 | input_term = input("\nEnter three words (EXIT to break): ") 75 | if input_term == 'EXIT': 76 | break 77 | else: 78 | distance(W, vocab, ivocab, input_term) 79 | 80 | -------------------------------------------------------------------------------- /eval/question-data/capital-common-countries.txt: -------------------------------------------------------------------------------- 1 | athens greece baghdad iraq 2 | athens greece bangkok thailand 3 | athens greece beijing china 4 | athens greece berlin germany 5 | athens greece bern switzerland 6 | athens greece cairo egypt 7 | athens greece canberra australia 8 | athens greece hanoi vietnam 9 | athens greece havana cuba 10 | athens greece helsinki finland 11 | athens greece islamabad pakistan 12 | athens greece kabul afghanistan 13 | athens greece london england 14 | athens greece madrid spain 15 | athens greece moscow russia 16 | athens greece oslo norway 17 | 
athens greece ottawa canada 18 | athens greece paris france 19 | athens greece rome italy 20 | athens greece stockholm sweden 21 | athens greece tehran iran 22 | athens greece tokyo japan 23 | baghdad iraq bangkok thailand 24 | baghdad iraq beijing china 25 | baghdad iraq berlin germany 26 | baghdad iraq bern switzerland 27 | baghdad iraq cairo egypt 28 | baghdad iraq canberra australia 29 | baghdad iraq hanoi vietnam 30 | baghdad iraq havana cuba 31 | baghdad iraq helsinki finland 32 | baghdad iraq islamabad pakistan 33 | baghdad iraq kabul afghanistan 34 | baghdad iraq london england 35 | baghdad iraq madrid spain 36 | baghdad iraq moscow russia 37 | baghdad iraq oslo norway 38 | baghdad iraq ottawa canada 39 | baghdad iraq paris france 40 | baghdad iraq rome italy 41 | baghdad iraq stockholm sweden 42 | baghdad iraq tehran iran 43 | baghdad iraq tokyo japan 44 | baghdad iraq athens greece 45 | bangkok thailand beijing china 46 | bangkok thailand berlin germany 47 | bangkok thailand bern switzerland 48 | bangkok thailand cairo egypt 49 | bangkok thailand canberra australia 50 | bangkok thailand hanoi vietnam 51 | bangkok thailand havana cuba 52 | bangkok thailand helsinki finland 53 | bangkok thailand islamabad pakistan 54 | bangkok thailand kabul afghanistan 55 | bangkok thailand london england 56 | bangkok thailand madrid spain 57 | bangkok thailand moscow russia 58 | bangkok thailand oslo norway 59 | bangkok thailand ottawa canada 60 | bangkok thailand paris france 61 | bangkok thailand rome italy 62 | bangkok thailand stockholm sweden 63 | bangkok thailand tehran iran 64 | bangkok thailand tokyo japan 65 | bangkok thailand athens greece 66 | bangkok thailand baghdad iraq 67 | beijing china berlin germany 68 | beijing china bern switzerland 69 | beijing china cairo egypt 70 | beijing china canberra australia 71 | beijing china hanoi vietnam 72 | beijing china havana cuba 73 | beijing china helsinki finland 74 | beijing china islamabad pakistan 75 | beijing 
china kabul afghanistan 76 | beijing china london england 77 | beijing china madrid spain 78 | beijing china moscow russia 79 | beijing china oslo norway 80 | beijing china ottawa canada 81 | beijing china paris france 82 | beijing china rome italy 83 | beijing china stockholm sweden 84 | beijing china tehran iran 85 | beijing china tokyo japan 86 | beijing china athens greece 87 | beijing china baghdad iraq 88 | beijing china bangkok thailand 89 | berlin germany bern switzerland 90 | berlin germany cairo egypt 91 | berlin germany canberra australia 92 | berlin germany hanoi vietnam 93 | berlin germany havana cuba 94 | berlin germany helsinki finland 95 | berlin germany islamabad pakistan 96 | berlin germany kabul afghanistan 97 | berlin germany london england 98 | berlin germany madrid spain 99 | berlin germany moscow russia 100 | berlin germany oslo norway 101 | berlin germany ottawa canada 102 | berlin germany paris france 103 | berlin germany rome italy 104 | berlin germany stockholm sweden 105 | berlin germany tehran iran 106 | berlin germany tokyo japan 107 | berlin germany athens greece 108 | berlin germany baghdad iraq 109 | berlin germany bangkok thailand 110 | berlin germany beijing china 111 | bern switzerland cairo egypt 112 | bern switzerland canberra australia 113 | bern switzerland hanoi vietnam 114 | bern switzerland havana cuba 115 | bern switzerland helsinki finland 116 | bern switzerland islamabad pakistan 117 | bern switzerland kabul afghanistan 118 | bern switzerland london england 119 | bern switzerland madrid spain 120 | bern switzerland moscow russia 121 | bern switzerland oslo norway 122 | bern switzerland ottawa canada 123 | bern switzerland paris france 124 | bern switzerland rome italy 125 | bern switzerland stockholm sweden 126 | bern switzerland tehran iran 127 | bern switzerland tokyo japan 128 | bern switzerland athens greece 129 | bern switzerland baghdad iraq 130 | bern switzerland bangkok thailand 131 | bern switzerland beijing 
china 132 | bern switzerland berlin germany 133 | cairo egypt canberra australia 134 | cairo egypt hanoi vietnam 135 | cairo egypt havana cuba 136 | cairo egypt helsinki finland 137 | cairo egypt islamabad pakistan 138 | cairo egypt kabul afghanistan 139 | cairo egypt london england 140 | cairo egypt madrid spain 141 | cairo egypt moscow russia 142 | cairo egypt oslo norway 143 | cairo egypt ottawa canada 144 | cairo egypt paris france 145 | cairo egypt rome italy 146 | cairo egypt stockholm sweden 147 | cairo egypt tehran iran 148 | cairo egypt tokyo japan 149 | cairo egypt athens greece 150 | cairo egypt baghdad iraq 151 | cairo egypt bangkok thailand 152 | cairo egypt beijing china 153 | cairo egypt berlin germany 154 | cairo egypt bern switzerland 155 | canberra australia hanoi vietnam 156 | canberra australia havana cuba 157 | canberra australia helsinki finland 158 | canberra australia islamabad pakistan 159 | canberra australia kabul afghanistan 160 | canberra australia london england 161 | canberra australia madrid spain 162 | canberra australia moscow russia 163 | canberra australia oslo norway 164 | canberra australia ottawa canada 165 | canberra australia paris france 166 | canberra australia rome italy 167 | canberra australia stockholm sweden 168 | canberra australia tehran iran 169 | canberra australia tokyo japan 170 | canberra australia athens greece 171 | canberra australia baghdad iraq 172 | canberra australia bangkok thailand 173 | canberra australia beijing china 174 | canberra australia berlin germany 175 | canberra australia bern switzerland 176 | canberra australia cairo egypt 177 | hanoi vietnam havana cuba 178 | hanoi vietnam helsinki finland 179 | hanoi vietnam islamabad pakistan 180 | hanoi vietnam kabul afghanistan 181 | hanoi vietnam london england 182 | hanoi vietnam madrid spain 183 | hanoi vietnam moscow russia 184 | hanoi vietnam oslo norway 185 | hanoi vietnam ottawa canada 186 | hanoi vietnam paris france 187 | hanoi vietnam rome 
italy 188 | hanoi vietnam stockholm sweden 189 | hanoi vietnam tehran iran 190 | hanoi vietnam tokyo japan 191 | hanoi vietnam athens greece 192 | hanoi vietnam baghdad iraq 193 | hanoi vietnam bangkok thailand 194 | hanoi vietnam beijing china 195 | hanoi vietnam berlin germany 196 | hanoi vietnam bern switzerland 197 | hanoi vietnam cairo egypt 198 | hanoi vietnam canberra australia 199 | havana cuba helsinki finland 200 | havana cuba islamabad pakistan 201 | havana cuba kabul afghanistan 202 | havana cuba london england 203 | havana cuba madrid spain 204 | havana cuba moscow russia 205 | havana cuba oslo norway 206 | havana cuba ottawa canada 207 | havana cuba paris france 208 | havana cuba rome italy 209 | havana cuba stockholm sweden 210 | havana cuba tehran iran 211 | havana cuba tokyo japan 212 | havana cuba athens greece 213 | havana cuba baghdad iraq 214 | havana cuba bangkok thailand 215 | havana cuba beijing china 216 | havana cuba berlin germany 217 | havana cuba bern switzerland 218 | havana cuba cairo egypt 219 | havana cuba canberra australia 220 | havana cuba hanoi vietnam 221 | helsinki finland islamabad pakistan 222 | helsinki finland kabul afghanistan 223 | helsinki finland london england 224 | helsinki finland madrid spain 225 | helsinki finland moscow russia 226 | helsinki finland oslo norway 227 | helsinki finland ottawa canada 228 | helsinki finland paris france 229 | helsinki finland rome italy 230 | helsinki finland stockholm sweden 231 | helsinki finland tehran iran 232 | helsinki finland tokyo japan 233 | helsinki finland athens greece 234 | helsinki finland baghdad iraq 235 | helsinki finland bangkok thailand 236 | helsinki finland beijing china 237 | helsinki finland berlin germany 238 | helsinki finland bern switzerland 239 | helsinki finland cairo egypt 240 | helsinki finland canberra australia 241 | helsinki finland hanoi vietnam 242 | helsinki finland havana cuba 243 | islamabad pakistan kabul afghanistan 244 | islamabad pakistan 
london england 245 | islamabad pakistan madrid spain 246 | islamabad pakistan moscow russia 247 | islamabad pakistan oslo norway 248 | islamabad pakistan ottawa canada 249 | islamabad pakistan paris france 250 | islamabad pakistan rome italy 251 | islamabad pakistan stockholm sweden 252 | islamabad pakistan tehran iran 253 | islamabad pakistan tokyo japan 254 | islamabad pakistan athens greece 255 | islamabad pakistan baghdad iraq 256 | islamabad pakistan bangkok thailand 257 | islamabad pakistan beijing china 258 | islamabad pakistan berlin germany 259 | islamabad pakistan bern switzerland 260 | islamabad pakistan cairo egypt 261 | islamabad pakistan canberra australia 262 | islamabad pakistan hanoi vietnam 263 | islamabad pakistan havana cuba 264 | islamabad pakistan helsinki finland 265 | kabul afghanistan london england 266 | kabul afghanistan madrid spain 267 | kabul afghanistan moscow russia 268 | kabul afghanistan oslo norway 269 | kabul afghanistan ottawa canada 270 | kabul afghanistan paris france 271 | kabul afghanistan rome italy 272 | kabul afghanistan stockholm sweden 273 | kabul afghanistan tehran iran 274 | kabul afghanistan tokyo japan 275 | kabul afghanistan athens greece 276 | kabul afghanistan baghdad iraq 277 | kabul afghanistan bangkok thailand 278 | kabul afghanistan beijing china 279 | kabul afghanistan berlin germany 280 | kabul afghanistan bern switzerland 281 | kabul afghanistan cairo egypt 282 | kabul afghanistan canberra australia 283 | kabul afghanistan hanoi vietnam 284 | kabul afghanistan havana cuba 285 | kabul afghanistan helsinki finland 286 | kabul afghanistan islamabad pakistan 287 | london england madrid spain 288 | london england moscow russia 289 | london england oslo norway 290 | london england ottawa canada 291 | london england paris france 292 | london england rome italy 293 | london england stockholm sweden 294 | london england tehran iran 295 | london england tokyo japan 296 | london england athens greece 297 | london 
england baghdad iraq 298 | london england bangkok thailand 299 | london england beijing china 300 | london england berlin germany 301 | london england bern switzerland 302 | london england cairo egypt 303 | london england canberra australia 304 | london england hanoi vietnam 305 | london england havana cuba 306 | london england helsinki finland 307 | london england islamabad pakistan 308 | london england kabul afghanistan 309 | madrid spain moscow russia 310 | madrid spain oslo norway 311 | madrid spain ottawa canada 312 | madrid spain paris france 313 | madrid spain rome italy 314 | madrid spain stockholm sweden 315 | madrid spain tehran iran 316 | madrid spain tokyo japan 317 | madrid spain athens greece 318 | madrid spain baghdad iraq 319 | madrid spain bangkok thailand 320 | madrid spain beijing china 321 | madrid spain berlin germany 322 | madrid spain bern switzerland 323 | madrid spain cairo egypt 324 | madrid spain canberra australia 325 | madrid spain hanoi vietnam 326 | madrid spain havana cuba 327 | madrid spain helsinki finland 328 | madrid spain islamabad pakistan 329 | madrid spain kabul afghanistan 330 | madrid spain london england 331 | moscow russia oslo norway 332 | moscow russia ottawa canada 333 | moscow russia paris france 334 | moscow russia rome italy 335 | moscow russia stockholm sweden 336 | moscow russia tehran iran 337 | moscow russia tokyo japan 338 | moscow russia athens greece 339 | moscow russia baghdad iraq 340 | moscow russia bangkok thailand 341 | moscow russia beijing china 342 | moscow russia berlin germany 343 | moscow russia bern switzerland 344 | moscow russia cairo egypt 345 | moscow russia canberra australia 346 | moscow russia hanoi vietnam 347 | moscow russia havana cuba 348 | moscow russia helsinki finland 349 | moscow russia islamabad pakistan 350 | moscow russia kabul afghanistan 351 | moscow russia london england 352 | moscow russia madrid spain 353 | oslo norway ottawa canada 354 | oslo norway paris france 355 | oslo 
norway rome italy 356 | oslo norway stockholm sweden 357 | oslo norway tehran iran 358 | oslo norway tokyo japan 359 | oslo norway athens greece 360 | oslo norway baghdad iraq 361 | oslo norway bangkok thailand 362 | oslo norway beijing china 363 | oslo norway berlin germany 364 | oslo norway bern switzerland 365 | oslo norway cairo egypt 366 | oslo norway canberra australia 367 | oslo norway hanoi vietnam 368 | oslo norway havana cuba 369 | oslo norway helsinki finland 370 | oslo norway islamabad pakistan 371 | oslo norway kabul afghanistan 372 | oslo norway london england 373 | oslo norway madrid spain 374 | oslo norway moscow russia 375 | ottawa canada paris france 376 | ottawa canada rome italy 377 | ottawa canada stockholm sweden 378 | ottawa canada tehran iran 379 | ottawa canada tokyo japan 380 | ottawa canada athens greece 381 | ottawa canada baghdad iraq 382 | ottawa canada bangkok thailand 383 | ottawa canada beijing china 384 | ottawa canada berlin germany 385 | ottawa canada bern switzerland 386 | ottawa canada cairo egypt 387 | ottawa canada canberra australia 388 | ottawa canada hanoi vietnam 389 | ottawa canada havana cuba 390 | ottawa canada helsinki finland 391 | ottawa canada islamabad pakistan 392 | ottawa canada kabul afghanistan 393 | ottawa canada london england 394 | ottawa canada madrid spain 395 | ottawa canada moscow russia 396 | ottawa canada oslo norway 397 | paris france rome italy 398 | paris france stockholm sweden 399 | paris france tehran iran 400 | paris france tokyo japan 401 | paris france athens greece 402 | paris france baghdad iraq 403 | paris france bangkok thailand 404 | paris france beijing china 405 | paris france berlin germany 406 | paris france bern switzerland 407 | paris france cairo egypt 408 | paris france canberra australia 409 | paris france hanoi vietnam 410 | paris france havana cuba 411 | paris france helsinki finland 412 | paris france islamabad pakistan 413 | paris france kabul afghanistan 414 | paris france 
london england 415 | paris france madrid spain 416 | paris france moscow russia 417 | paris france oslo norway 418 | paris france ottawa canada 419 | rome italy stockholm sweden 420 | rome italy tehran iran 421 | rome italy tokyo japan 422 | rome italy athens greece 423 | rome italy baghdad iraq 424 | rome italy bangkok thailand 425 | rome italy beijing china 426 | rome italy berlin germany 427 | rome italy bern switzerland 428 | rome italy cairo egypt 429 | rome italy canberra australia 430 | rome italy hanoi vietnam 431 | rome italy havana cuba 432 | rome italy helsinki finland 433 | rome italy islamabad pakistan 434 | rome italy kabul afghanistan 435 | rome italy london england 436 | rome italy madrid spain 437 | rome italy moscow russia 438 | rome italy oslo norway 439 | rome italy ottawa canada 440 | rome italy paris france 441 | stockholm sweden tehran iran 442 | stockholm sweden tokyo japan 443 | stockholm sweden athens greece 444 | stockholm sweden baghdad iraq 445 | stockholm sweden bangkok thailand 446 | stockholm sweden beijing china 447 | stockholm sweden berlin germany 448 | stockholm sweden bern switzerland 449 | stockholm sweden cairo egypt 450 | stockholm sweden canberra australia 451 | stockholm sweden hanoi vietnam 452 | stockholm sweden havana cuba 453 | stockholm sweden helsinki finland 454 | stockholm sweden islamabad pakistan 455 | stockholm sweden kabul afghanistan 456 | stockholm sweden london england 457 | stockholm sweden madrid spain 458 | stockholm sweden moscow russia 459 | stockholm sweden oslo norway 460 | stockholm sweden ottawa canada 461 | stockholm sweden paris france 462 | stockholm sweden rome italy 463 | tehran iran tokyo japan 464 | tehran iran athens greece 465 | tehran iran baghdad iraq 466 | tehran iran bangkok thailand 467 | tehran iran beijing china 468 | tehran iran berlin germany 469 | tehran iran bern switzerland 470 | tehran iran cairo egypt 471 | tehran iran canberra australia 472 | tehran iran hanoi vietnam 473 | 
tehran iran havana cuba 474 | tehran iran helsinki finland 475 | tehran iran islamabad pakistan 476 | tehran iran kabul afghanistan 477 | tehran iran london england 478 | tehran iran madrid spain 479 | tehran iran moscow russia 480 | tehran iran oslo norway 481 | tehran iran ottawa canada 482 | tehran iran paris france 483 | tehran iran rome italy 484 | tehran iran stockholm sweden 485 | tokyo japan athens greece 486 | tokyo japan baghdad iraq 487 | tokyo japan bangkok thailand 488 | tokyo japan beijing china 489 | tokyo japan berlin germany 490 | tokyo japan bern switzerland 491 | tokyo japan cairo egypt 492 | tokyo japan canberra australia 493 | tokyo japan hanoi vietnam 494 | tokyo japan havana cuba 495 | tokyo japan helsinki finland 496 | tokyo japan islamabad pakistan 497 | tokyo japan kabul afghanistan 498 | tokyo japan london england 499 | tokyo japan madrid spain 500 | tokyo japan moscow russia 501 | tokyo japan oslo norway 502 | tokyo japan ottawa canada 503 | tokyo japan paris france 504 | tokyo japan rome italy 505 | tokyo japan stockholm sweden 506 | tokyo japan tehran iran 507 | -------------------------------------------------------------------------------- /eval/question-data/currency.txt: -------------------------------------------------------------------------------- 1 | algeria dinar angola kwanza 2 | algeria dinar argentina peso 3 | algeria dinar armenia dram 4 | algeria dinar brazil real 5 | algeria dinar bulgaria lev 6 | algeria dinar cambodia riel 7 | algeria dinar canada dollar 8 | algeria dinar croatia kuna 9 | algeria dinar denmark krone 10 | algeria dinar europe euro 11 | algeria dinar hungary forint 12 | algeria dinar india rupee 13 | algeria dinar iran rial 14 | algeria dinar japan yen 15 | algeria dinar korea won 16 | algeria dinar latvia lats 17 | algeria dinar lithuania litas 18 | algeria dinar macedonia denar 19 | algeria dinar malaysia ringgit 20 | algeria dinar mexico peso 21 | algeria dinar nigeria naira 22 | algeria dinar poland 
zloty 23 | algeria dinar romania leu 24 | algeria dinar russia ruble 25 | algeria dinar sweden krona 26 | algeria dinar thailand baht 27 | algeria dinar ukraine hryvnia 28 | algeria dinar usa dollar 29 | algeria dinar vietnam dong 30 | angola kwanza argentina peso 31 | angola kwanza armenia dram 32 | angola kwanza brazil real 33 | angola kwanza bulgaria lev 34 | angola kwanza cambodia riel 35 | angola kwanza canada dollar 36 | angola kwanza croatia kuna 37 | angola kwanza denmark krone 38 | angola kwanza europe euro 39 | angola kwanza hungary forint 40 | angola kwanza india rupee 41 | angola kwanza iran rial 42 | angola kwanza japan yen 43 | angola kwanza korea won 44 | angola kwanza latvia lats 45 | angola kwanza lithuania litas 46 | angola kwanza macedonia denar 47 | angola kwanza malaysia ringgit 48 | angola kwanza mexico peso 49 | angola kwanza nigeria naira 50 | angola kwanza poland zloty 51 | angola kwanza romania leu 52 | angola kwanza russia ruble 53 | angola kwanza sweden krona 54 | angola kwanza thailand baht 55 | angola kwanza ukraine hryvnia 56 | angola kwanza usa dollar 57 | angola kwanza vietnam dong 58 | angola kwanza algeria dinar 59 | argentina peso armenia dram 60 | argentina peso brazil real 61 | argentina peso bulgaria lev 62 | argentina peso cambodia riel 63 | argentina peso canada dollar 64 | argentina peso croatia kuna 65 | argentina peso denmark krone 66 | argentina peso europe euro 67 | argentina peso hungary forint 68 | argentina peso india rupee 69 | argentina peso iran rial 70 | argentina peso japan yen 71 | argentina peso korea won 72 | argentina peso latvia lats 73 | argentina peso lithuania litas 74 | argentina peso macedonia denar 75 | argentina peso malaysia ringgit 76 | argentina peso nigeria naira 77 | argentina peso poland zloty 78 | argentina peso romania leu 79 | argentina peso russia ruble 80 | argentina peso sweden krona 81 | argentina peso thailand baht 82 | argentina peso ukraine hryvnia 83 | argentina peso usa dollar 84 | 
argentina peso vietnam dong 85 | argentina peso algeria dinar 86 | argentina peso angola kwanza 87 | armenia dram brazil real 88 | armenia dram bulgaria lev 89 | armenia dram cambodia riel 90 | armenia dram canada dollar 91 | armenia dram croatia kuna 92 | armenia dram denmark krone 93 | armenia dram europe euro 94 | armenia dram hungary forint 95 | armenia dram india rupee 96 | armenia dram iran rial 97 | armenia dram japan yen 98 | armenia dram korea won 99 | armenia dram latvia lats 100 | armenia dram lithuania litas 101 | armenia dram macedonia denar 102 | armenia dram malaysia ringgit 103 | armenia dram mexico peso 104 | armenia dram nigeria naira 105 | armenia dram poland zloty 106 | armenia dram romania leu 107 | armenia dram russia ruble 108 | armenia dram sweden krona 109 | armenia dram thailand baht 110 | armenia dram ukraine hryvnia 111 | armenia dram usa dollar 112 | armenia dram vietnam dong 113 | armenia dram algeria dinar 114 | armenia dram angola kwanza 115 | armenia dram argentina peso 116 | brazil real bulgaria lev 117 | brazil real cambodia riel 118 | brazil real canada dollar 119 | brazil real croatia kuna 120 | brazil real denmark krone 121 | brazil real europe euro 122 | brazil real hungary forint 123 | brazil real india rupee 124 | brazil real iran rial 125 | brazil real japan yen 126 | brazil real korea won 127 | brazil real latvia lats 128 | brazil real lithuania litas 129 | brazil real macedonia denar 130 | brazil real malaysia ringgit 131 | brazil real mexico peso 132 | brazil real nigeria naira 133 | brazil real poland zloty 134 | brazil real romania leu 135 | brazil real russia ruble 136 | brazil real sweden krona 137 | brazil real thailand baht 138 | brazil real ukraine hryvnia 139 | brazil real usa dollar 140 | brazil real vietnam dong 141 | brazil real algeria dinar 142 | brazil real angola kwanza 143 | brazil real argentina peso 144 | brazil real armenia dram 145 | bulgaria lev cambodia riel 146 | bulgaria lev canada dollar 147 | 
bulgaria lev croatia kuna 148 | bulgaria lev denmark krone 149 | bulgaria lev europe euro 150 | bulgaria lev hungary forint 151 | bulgaria lev india rupee 152 | bulgaria lev iran rial 153 | bulgaria lev japan yen 154 | bulgaria lev korea won 155 | bulgaria lev latvia lats 156 | bulgaria lev lithuania litas 157 | bulgaria lev macedonia denar 158 | bulgaria lev malaysia ringgit 159 | bulgaria lev mexico peso 160 | bulgaria lev nigeria naira 161 | bulgaria lev poland zloty 162 | bulgaria lev romania leu 163 | bulgaria lev russia ruble 164 | bulgaria lev sweden krona 165 | bulgaria lev thailand baht 166 | bulgaria lev ukraine hryvnia 167 | bulgaria lev usa dollar 168 | bulgaria lev vietnam dong 169 | bulgaria lev algeria dinar 170 | bulgaria lev angola kwanza 171 | bulgaria lev argentina peso 172 | bulgaria lev armenia dram 173 | bulgaria lev brazil real 174 | cambodia riel canada dollar 175 | cambodia riel croatia kuna 176 | cambodia riel denmark krone 177 | cambodia riel europe euro 178 | cambodia riel hungary forint 179 | cambodia riel india rupee 180 | cambodia riel iran rial 181 | cambodia riel japan yen 182 | cambodia riel korea won 183 | cambodia riel latvia lats 184 | cambodia riel lithuania litas 185 | cambodia riel macedonia denar 186 | cambodia riel malaysia ringgit 187 | cambodia riel mexico peso 188 | cambodia riel nigeria naira 189 | cambodia riel poland zloty 190 | cambodia riel romania leu 191 | cambodia riel russia ruble 192 | cambodia riel sweden krona 193 | cambodia riel thailand baht 194 | cambodia riel ukraine hryvnia 195 | cambodia riel usa dollar 196 | cambodia riel vietnam dong 197 | cambodia riel algeria dinar 198 | cambodia riel angola kwanza 199 | cambodia riel argentina peso 200 | cambodia riel armenia dram 201 | cambodia riel brazil real 202 | cambodia riel bulgaria lev 203 | canada dollar croatia kuna 204 | canada dollar denmark krone 205 | canada dollar europe euro 206 | canada dollar hungary forint 207 | canada dollar india rupee 208 | 
canada dollar iran rial 209 | canada dollar japan yen 210 | canada dollar korea won 211 | canada dollar latvia lats 212 | canada dollar lithuania litas 213 | canada dollar macedonia denar 214 | canada dollar malaysia ringgit 215 | canada dollar mexico peso 216 | canada dollar nigeria naira 217 | canada dollar poland zloty 218 | canada dollar romania leu 219 | canada dollar russia ruble 220 | canada dollar sweden krona 221 | canada dollar thailand baht 222 | canada dollar ukraine hryvnia 223 | canada dollar vietnam dong 224 | canada dollar algeria dinar 225 | canada dollar angola kwanza 226 | canada dollar argentina peso 227 | canada dollar armenia dram 228 | canada dollar brazil real 229 | canada dollar bulgaria lev 230 | canada dollar cambodia riel 231 | croatia kuna denmark krone 232 | croatia kuna europe euro 233 | croatia kuna hungary forint 234 | croatia kuna india rupee 235 | croatia kuna iran rial 236 | croatia kuna japan yen 237 | croatia kuna korea won 238 | croatia kuna latvia lats 239 | croatia kuna lithuania litas 240 | croatia kuna macedonia denar 241 | croatia kuna malaysia ringgit 242 | croatia kuna mexico peso 243 | croatia kuna nigeria naira 244 | croatia kuna poland zloty 245 | croatia kuna romania leu 246 | croatia kuna russia ruble 247 | croatia kuna sweden krona 248 | croatia kuna thailand baht 249 | croatia kuna ukraine hryvnia 250 | croatia kuna usa dollar 251 | croatia kuna vietnam dong 252 | croatia kuna algeria dinar 253 | croatia kuna angola kwanza 254 | croatia kuna argentina peso 255 | croatia kuna armenia dram 256 | croatia kuna brazil real 257 | croatia kuna bulgaria lev 258 | croatia kuna cambodia riel 259 | croatia kuna canada dollar 260 | denmark krone europe euro 261 | denmark krone hungary forint 262 | denmark krone india rupee 263 | denmark krone iran rial 264 | denmark krone japan yen 265 | denmark krone korea won 266 | denmark krone latvia lats 267 | denmark krone lithuania litas 268 | denmark krone macedonia denar 269 | 
denmark krone malaysia ringgit 270 | denmark krone mexico peso 271 | denmark krone nigeria naira 272 | denmark krone poland zloty 273 | denmark krone romania leu 274 | denmark krone russia ruble 275 | denmark krone sweden krona 276 | denmark krone thailand baht 277 | denmark krone ukraine hryvnia 278 | denmark krone usa dollar 279 | denmark krone vietnam dong 280 | denmark krone algeria dinar 281 | denmark krone angola kwanza 282 | denmark krone argentina peso 283 | denmark krone armenia dram 284 | denmark krone brazil real 285 | denmark krone bulgaria lev 286 | denmark krone cambodia riel 287 | denmark krone canada dollar 288 | denmark krone croatia kuna 289 | europe euro hungary forint 290 | europe euro india rupee 291 | europe euro iran rial 292 | europe euro japan yen 293 | europe euro korea won 294 | europe euro latvia lats 295 | europe euro lithuania litas 296 | europe euro macedonia denar 297 | europe euro malaysia ringgit 298 | europe euro mexico peso 299 | europe euro nigeria naira 300 | europe euro poland zloty 301 | europe euro romania leu 302 | europe euro russia ruble 303 | europe euro sweden krona 304 | europe euro thailand baht 305 | europe euro ukraine hryvnia 306 | europe euro usa dollar 307 | europe euro vietnam dong 308 | europe euro algeria dinar 309 | europe euro angola kwanza 310 | europe euro argentina peso 311 | europe euro armenia dram 312 | europe euro brazil real 313 | europe euro bulgaria lev 314 | europe euro cambodia riel 315 | europe euro canada dollar 316 | europe euro croatia kuna 317 | europe euro denmark krone 318 | hungary forint india rupee 319 | hungary forint iran rial 320 | hungary forint japan yen 321 | hungary forint korea won 322 | hungary forint latvia lats 323 | hungary forint lithuania litas 324 | hungary forint macedonia denar 325 | hungary forint malaysia ringgit 326 | hungary forint mexico peso 327 | hungary forint nigeria naira 328 | hungary forint poland zloty 329 | hungary forint romania leu 330 | hungary forint 
russia ruble 331 | hungary forint sweden krona 332 | hungary forint thailand baht 333 | hungary forint ukraine hryvnia 334 | hungary forint usa dollar 335 | hungary forint vietnam dong 336 | hungary forint algeria dinar 337 | hungary forint angola kwanza 338 | hungary forint argentina peso 339 | hungary forint armenia dram 340 | hungary forint brazil real 341 | hungary forint bulgaria lev 342 | hungary forint cambodia riel 343 | hungary forint canada dollar 344 | hungary forint croatia kuna 345 | hungary forint denmark krone 346 | hungary forint europe euro 347 | india rupee iran rial 348 | india rupee japan yen 349 | india rupee korea won 350 | india rupee latvia lats 351 | india rupee lithuania litas 352 | india rupee macedonia denar 353 | india rupee malaysia ringgit 354 | india rupee mexico peso 355 | india rupee nigeria naira 356 | india rupee poland zloty 357 | india rupee romania leu 358 | india rupee russia ruble 359 | india rupee sweden krona 360 | india rupee thailand baht 361 | india rupee ukraine hryvnia 362 | india rupee usa dollar 363 | india rupee vietnam dong 364 | india rupee algeria dinar 365 | india rupee angola kwanza 366 | india rupee argentina peso 367 | india rupee armenia dram 368 | india rupee brazil real 369 | india rupee bulgaria lev 370 | india rupee cambodia riel 371 | india rupee canada dollar 372 | india rupee croatia kuna 373 | india rupee denmark krone 374 | india rupee europe euro 375 | india rupee hungary forint 376 | iran rial japan yen 377 | iran rial korea won 378 | iran rial latvia lats 379 | iran rial lithuania litas 380 | iran rial macedonia denar 381 | iran rial malaysia ringgit 382 | iran rial mexico peso 383 | iran rial nigeria naira 384 | iran rial poland zloty 385 | iran rial romania leu 386 | iran rial russia ruble 387 | iran rial sweden krona 388 | iran rial thailand baht 389 | iran rial ukraine hryvnia 390 | iran rial usa dollar 391 | iran rial vietnam dong 392 | iran rial algeria dinar 393 | iran rial angola kwanza 
394 | iran rial argentina peso 395 | iran rial armenia dram 396 | iran rial brazil real 397 | iran rial bulgaria lev 398 | iran rial cambodia riel 399 | iran rial canada dollar 400 | iran rial croatia kuna 401 | iran rial denmark krone 402 | iran rial europe euro 403 | iran rial hungary forint 404 | iran rial india rupee 405 | japan yen korea won 406 | japan yen latvia lats 407 | japan yen lithuania litas 408 | japan yen macedonia denar 409 | japan yen malaysia ringgit 410 | japan yen mexico peso 411 | japan yen nigeria naira 412 | japan yen poland zloty 413 | japan yen romania leu 414 | japan yen russia ruble 415 | japan yen sweden krona 416 | japan yen thailand baht 417 | japan yen ukraine hryvnia 418 | japan yen usa dollar 419 | japan yen vietnam dong 420 | japan yen algeria dinar 421 | japan yen angola kwanza 422 | japan yen argentina peso 423 | japan yen armenia dram 424 | japan yen brazil real 425 | japan yen bulgaria lev 426 | japan yen cambodia riel 427 | japan yen canada dollar 428 | japan yen croatia kuna 429 | japan yen denmark krone 430 | japan yen europe euro 431 | japan yen hungary forint 432 | japan yen india rupee 433 | japan yen iran rial 434 | korea won latvia lats 435 | korea won lithuania litas 436 | korea won macedonia denar 437 | korea won malaysia ringgit 438 | korea won mexico peso 439 | korea won nigeria naira 440 | korea won poland zloty 441 | korea won romania leu 442 | korea won russia ruble 443 | korea won sweden krona 444 | korea won thailand baht 445 | korea won ukraine hryvnia 446 | korea won usa dollar 447 | korea won vietnam dong 448 | korea won algeria dinar 449 | korea won angola kwanza 450 | korea won argentina peso 451 | korea won armenia dram 452 | korea won brazil real 453 | korea won bulgaria lev 454 | korea won cambodia riel 455 | korea won canada dollar 456 | korea won croatia kuna 457 | korea won denmark krone 458 | korea won europe euro 459 | korea won hungary forint 460 | korea won india rupee 461 | korea won iran rial 
462 | korea won japan yen 463 | latvia lats lithuania litas 464 | latvia lats macedonia denar 465 | latvia lats malaysia ringgit 466 | latvia lats mexico peso 467 | latvia lats nigeria naira 468 | latvia lats poland zloty 469 | latvia lats romania leu 470 | latvia lats russia ruble 471 | latvia lats sweden krona 472 | latvia lats thailand baht 473 | latvia lats ukraine hryvnia 474 | latvia lats usa dollar 475 | latvia lats vietnam dong 476 | latvia lats algeria dinar 477 | latvia lats angola kwanza 478 | latvia lats argentina peso 479 | latvia lats armenia dram 480 | latvia lats brazil real 481 | latvia lats bulgaria lev 482 | latvia lats cambodia riel 483 | latvia lats canada dollar 484 | latvia lats croatia kuna 485 | latvia lats denmark krone 486 | latvia lats europe euro 487 | latvia lats hungary forint 488 | latvia lats india rupee 489 | latvia lats iran rial 490 | latvia lats japan yen 491 | latvia lats korea won 492 | lithuania litas macedonia denar 493 | lithuania litas malaysia ringgit 494 | lithuania litas mexico peso 495 | lithuania litas nigeria naira 496 | lithuania litas poland zloty 497 | lithuania litas romania leu 498 | lithuania litas russia ruble 499 | lithuania litas sweden krona 500 | lithuania litas thailand baht 501 | lithuania litas ukraine hryvnia 502 | lithuania litas usa dollar 503 | lithuania litas vietnam dong 504 | lithuania litas algeria dinar 505 | lithuania litas angola kwanza 506 | lithuania litas argentina peso 507 | lithuania litas armenia dram 508 | lithuania litas brazil real 509 | lithuania litas bulgaria lev 510 | lithuania litas cambodia riel 511 | lithuania litas canada dollar 512 | lithuania litas croatia kuna 513 | lithuania litas denmark krone 514 | lithuania litas europe euro 515 | lithuania litas hungary forint 516 | lithuania litas india rupee 517 | lithuania litas iran rial 518 | lithuania litas japan yen 519 | lithuania litas korea won 520 | lithuania litas latvia lats 521 | macedonia denar malaysia ringgit 522 | 
macedonia denar mexico peso 523 | macedonia denar nigeria naira 524 | macedonia denar poland zloty 525 | macedonia denar romania leu 526 | macedonia denar russia ruble 527 | macedonia denar sweden krona 528 | macedonia denar thailand baht 529 | macedonia denar ukraine hryvnia 530 | macedonia denar usa dollar 531 | macedonia denar vietnam dong 532 | macedonia denar algeria dinar 533 | macedonia denar angola kwanza 534 | macedonia denar argentina peso 535 | macedonia denar armenia dram 536 | macedonia denar brazil real 537 | macedonia denar bulgaria lev 538 | macedonia denar cambodia riel 539 | macedonia denar canada dollar 540 | macedonia denar croatia kuna 541 | macedonia denar denmark krone 542 | macedonia denar europe euro 543 | macedonia denar hungary forint 544 | macedonia denar india rupee 545 | macedonia denar iran rial 546 | macedonia denar japan yen 547 | macedonia denar korea won 548 | macedonia denar latvia lats 549 | macedonia denar lithuania litas 550 | malaysia ringgit mexico peso 551 | malaysia ringgit nigeria naira 552 | malaysia ringgit poland zloty 553 | malaysia ringgit romania leu 554 | malaysia ringgit russia ruble 555 | malaysia ringgit sweden krona 556 | malaysia ringgit thailand baht 557 | malaysia ringgit ukraine hryvnia 558 | malaysia ringgit usa dollar 559 | malaysia ringgit vietnam dong 560 | malaysia ringgit algeria dinar 561 | malaysia ringgit angola kwanza 562 | malaysia ringgit argentina peso 563 | malaysia ringgit armenia dram 564 | malaysia ringgit brazil real 565 | malaysia ringgit bulgaria lev 566 | malaysia ringgit cambodia riel 567 | malaysia ringgit canada dollar 568 | malaysia ringgit croatia kuna 569 | malaysia ringgit denmark krone 570 | malaysia ringgit europe euro 571 | malaysia ringgit hungary forint 572 | malaysia ringgit india rupee 573 | malaysia ringgit iran rial 574 | malaysia ringgit japan yen 575 | malaysia ringgit korea won 576 | malaysia ringgit latvia lats 577 | malaysia ringgit lithuania litas 578 | malaysia 
ringgit macedonia denar 579 | mexico peso nigeria naira 580 | mexico peso poland zloty 581 | mexico peso romania leu 582 | mexico peso russia ruble 583 | mexico peso sweden krona 584 | mexico peso thailand baht 585 | mexico peso ukraine hryvnia 586 | mexico peso usa dollar 587 | mexico peso vietnam dong 588 | mexico peso algeria dinar 589 | mexico peso angola kwanza 590 | mexico peso armenia dram 591 | mexico peso brazil real 592 | mexico peso bulgaria lev 593 | mexico peso cambodia riel 594 | mexico peso canada dollar 595 | mexico peso croatia kuna 596 | mexico peso denmark krone 597 | mexico peso europe euro 598 | mexico peso hungary forint 599 | mexico peso india rupee 600 | mexico peso iran rial 601 | mexico peso japan yen 602 | mexico peso korea won 603 | mexico peso latvia lats 604 | mexico peso lithuania litas 605 | mexico peso macedonia denar 606 | mexico peso malaysia ringgit 607 | nigeria naira poland zloty 608 | nigeria naira romania leu 609 | nigeria naira russia ruble 610 | nigeria naira sweden krona 611 | nigeria naira thailand baht 612 | nigeria naira ukraine hryvnia 613 | nigeria naira usa dollar 614 | nigeria naira vietnam dong 615 | nigeria naira algeria dinar 616 | nigeria naira angola kwanza 617 | nigeria naira argentina peso 618 | nigeria naira armenia dram 619 | nigeria naira brazil real 620 | nigeria naira bulgaria lev 621 | nigeria naira cambodia riel 622 | nigeria naira canada dollar 623 | nigeria naira croatia kuna 624 | nigeria naira denmark krone 625 | nigeria naira europe euro 626 | nigeria naira hungary forint 627 | nigeria naira india rupee 628 | nigeria naira iran rial 629 | nigeria naira japan yen 630 | nigeria naira korea won 631 | nigeria naira latvia lats 632 | nigeria naira lithuania litas 633 | nigeria naira macedonia denar 634 | nigeria naira malaysia ringgit 635 | nigeria naira mexico peso 636 | poland zloty romania leu 637 | poland zloty russia ruble 638 | poland zloty sweden krona 639 | poland zloty thailand baht 640 | 
poland zloty ukraine hryvnia 641 | poland zloty usa dollar 642 | poland zloty vietnam dong 643 | poland zloty algeria dinar 644 | poland zloty angola kwanza 645 | poland zloty argentina peso 646 | poland zloty armenia dram 647 | poland zloty brazil real 648 | poland zloty bulgaria lev 649 | poland zloty cambodia riel 650 | poland zloty canada dollar 651 | poland zloty croatia kuna 652 | poland zloty denmark krone 653 | poland zloty europe euro 654 | poland zloty hungary forint 655 | poland zloty india rupee 656 | poland zloty iran rial 657 | poland zloty japan yen 658 | poland zloty korea won 659 | poland zloty latvia lats 660 | poland zloty lithuania litas 661 | poland zloty macedonia denar 662 | poland zloty malaysia ringgit 663 | poland zloty mexico peso 664 | poland zloty nigeria naira 665 | romania leu russia ruble 666 | romania leu sweden krona 667 | romania leu thailand baht 668 | romania leu ukraine hryvnia 669 | romania leu usa dollar 670 | romania leu vietnam dong 671 | romania leu algeria dinar 672 | romania leu angola kwanza 673 | romania leu argentina peso 674 | romania leu armenia dram 675 | romania leu brazil real 676 | romania leu bulgaria lev 677 | romania leu cambodia riel 678 | romania leu canada dollar 679 | romania leu croatia kuna 680 | romania leu denmark krone 681 | romania leu europe euro 682 | romania leu hungary forint 683 | romania leu india rupee 684 | romania leu iran rial 685 | romania leu japan yen 686 | romania leu korea won 687 | romania leu latvia lats 688 | romania leu lithuania litas 689 | romania leu macedonia denar 690 | romania leu malaysia ringgit 691 | romania leu mexico peso 692 | romania leu nigeria naira 693 | romania leu poland zloty 694 | russia ruble sweden krona 695 | russia ruble thailand baht 696 | russia ruble ukraine hryvnia 697 | russia ruble usa dollar 698 | russia ruble vietnam dong 699 | russia ruble algeria dinar 700 | russia ruble angola kwanza 701 | russia ruble argentina peso 702 | russia ruble armenia 
dram 703 | russia ruble brazil real 704 | russia ruble bulgaria lev 705 | russia ruble cambodia riel 706 | russia ruble canada dollar 707 | russia ruble croatia kuna 708 | russia ruble denmark krone 709 | russia ruble europe euro 710 | russia ruble hungary forint 711 | russia ruble india rupee 712 | russia ruble iran rial 713 | russia ruble japan yen 714 | russia ruble korea won 715 | russia ruble latvia lats 716 | russia ruble lithuania litas 717 | russia ruble macedonia denar 718 | russia ruble malaysia ringgit 719 | russia ruble mexico peso 720 | russia ruble nigeria naira 721 | russia ruble poland zloty 722 | russia ruble romania leu 723 | sweden krona thailand baht 724 | sweden krona ukraine hryvnia 725 | sweden krona usa dollar 726 | sweden krona vietnam dong 727 | sweden krona algeria dinar 728 | sweden krona angola kwanza 729 | sweden krona argentina peso 730 | sweden krona armenia dram 731 | sweden krona brazil real 732 | sweden krona bulgaria lev 733 | sweden krona cambodia riel 734 | sweden krona canada dollar 735 | sweden krona croatia kuna 736 | sweden krona denmark krone 737 | sweden krona europe euro 738 | sweden krona hungary forint 739 | sweden krona india rupee 740 | sweden krona iran rial 741 | sweden krona japan yen 742 | sweden krona korea won 743 | sweden krona latvia lats 744 | sweden krona lithuania litas 745 | sweden krona macedonia denar 746 | sweden krona malaysia ringgit 747 | sweden krona mexico peso 748 | sweden krona nigeria naira 749 | sweden krona poland zloty 750 | sweden krona romania leu 751 | sweden krona russia ruble 752 | thailand baht ukraine hryvnia 753 | thailand baht usa dollar 754 | thailand baht vietnam dong 755 | thailand baht algeria dinar 756 | thailand baht angola kwanza 757 | thailand baht argentina peso 758 | thailand baht armenia dram 759 | thailand baht brazil real 760 | thailand baht bulgaria lev 761 | thailand baht cambodia riel 762 | thailand baht canada dollar 763 | thailand baht croatia kuna 764 | thailand 
baht denmark krone 765 | thailand baht europe euro 766 | thailand baht hungary forint 767 | thailand baht india rupee 768 | thailand baht iran rial 769 | thailand baht japan yen 770 | thailand baht korea won 771 | thailand baht latvia lats 772 | thailand baht lithuania litas 773 | thailand baht macedonia denar 774 | thailand baht malaysia ringgit 775 | thailand baht mexico peso 776 | thailand baht nigeria naira 777 | thailand baht poland zloty 778 | thailand baht romania leu 779 | thailand baht russia ruble 780 | thailand baht sweden krona 781 | ukraine hryvnia usa dollar 782 | ukraine hryvnia vietnam dong 783 | ukraine hryvnia algeria dinar 784 | ukraine hryvnia angola kwanza 785 | ukraine hryvnia argentina peso 786 | ukraine hryvnia armenia dram 787 | ukraine hryvnia brazil real 788 | ukraine hryvnia bulgaria lev 789 | ukraine hryvnia cambodia riel 790 | ukraine hryvnia canada dollar 791 | ukraine hryvnia croatia kuna 792 | ukraine hryvnia denmark krone 793 | ukraine hryvnia europe euro 794 | ukraine hryvnia hungary forint 795 | ukraine hryvnia india rupee 796 | ukraine hryvnia iran rial 797 | ukraine hryvnia japan yen 798 | ukraine hryvnia korea won 799 | ukraine hryvnia latvia lats 800 | ukraine hryvnia lithuania litas 801 | ukraine hryvnia macedonia denar 802 | ukraine hryvnia malaysia ringgit 803 | ukraine hryvnia mexico peso 804 | ukraine hryvnia nigeria naira 805 | ukraine hryvnia poland zloty 806 | ukraine hryvnia romania leu 807 | ukraine hryvnia russia ruble 808 | ukraine hryvnia sweden krona 809 | ukraine hryvnia thailand baht 810 | usa dollar vietnam dong 811 | usa dollar algeria dinar 812 | usa dollar angola kwanza 813 | usa dollar argentina peso 814 | usa dollar armenia dram 815 | usa dollar brazil real 816 | usa dollar bulgaria lev 817 | usa dollar cambodia riel 818 | usa dollar croatia kuna 819 | usa dollar denmark krone 820 | usa dollar europe euro 821 | usa dollar hungary forint 822 | usa dollar india rupee 823 | usa dollar iran rial 824 | usa 
dollar japan yen 825 | usa dollar korea won 826 | usa dollar latvia lats 827 | usa dollar lithuania litas 828 | usa dollar macedonia denar 829 | usa dollar malaysia ringgit 830 | usa dollar mexico peso 831 | usa dollar nigeria naira 832 | usa dollar poland zloty 833 | usa dollar romania leu 834 | usa dollar russia ruble 835 | usa dollar sweden krona 836 | usa dollar thailand baht 837 | usa dollar ukraine hryvnia 838 | vietnam dong algeria dinar 839 | vietnam dong angola kwanza 840 | vietnam dong argentina peso 841 | vietnam dong armenia dram 842 | vietnam dong brazil real 843 | vietnam dong bulgaria lev 844 | vietnam dong cambodia riel 845 | vietnam dong canada dollar 846 | vietnam dong croatia kuna 847 | vietnam dong denmark krone 848 | vietnam dong europe euro 849 | vietnam dong hungary forint 850 | vietnam dong india rupee 851 | vietnam dong iran rial 852 | vietnam dong japan yen 853 | vietnam dong korea won 854 | vietnam dong latvia lats 855 | vietnam dong lithuania litas 856 | vietnam dong macedonia denar 857 | vietnam dong malaysia ringgit 858 | vietnam dong mexico peso 859 | vietnam dong nigeria naira 860 | vietnam dong poland zloty 861 | vietnam dong romania leu 862 | vietnam dong russia ruble 863 | vietnam dong sweden krona 864 | vietnam dong thailand baht 865 | vietnam dong ukraine hryvnia 866 | vietnam dong usa dollar 867 | -------------------------------------------------------------------------------- /eval/question-data/family.txt: -------------------------------------------------------------------------------- 1 | boy girl brother sister 2 | boy girl brothers sisters 3 | boy girl dad mom 4 | boy girl father mother 5 | boy girl grandfather grandmother 6 | boy girl grandpa grandma 7 | boy girl grandson granddaughter 8 | boy girl groom bride 9 | boy girl he she 10 | boy girl his her 11 | boy girl husband wife 12 | boy girl king queen 13 | boy girl man woman 14 | boy girl nephew niece 15 | boy girl policeman policewoman 16 | boy girl prince princess 17 | 
boy girl son daughter 18 | boy girl sons daughters 19 | boy girl stepbrother stepsister 20 | boy girl stepfather stepmother 21 | boy girl stepson stepdaughter 22 | boy girl uncle aunt 23 | brother sister brothers sisters 24 | brother sister dad mom 25 | brother sister father mother 26 | brother sister grandfather grandmother 27 | brother sister grandpa grandma 28 | brother sister grandson granddaughter 29 | brother sister groom bride 30 | brother sister he she 31 | brother sister his her 32 | brother sister husband wife 33 | brother sister king queen 34 | brother sister man woman 35 | brother sister nephew niece 36 | brother sister policeman policewoman 37 | brother sister prince princess 38 | brother sister son daughter 39 | brother sister sons daughters 40 | brother sister stepbrother stepsister 41 | brother sister stepfather stepmother 42 | brother sister stepson stepdaughter 43 | brother sister uncle aunt 44 | brother sister boy girl 45 | brothers sisters dad mom 46 | brothers sisters father mother 47 | brothers sisters grandfather grandmother 48 | brothers sisters grandpa grandma 49 | brothers sisters grandson granddaughter 50 | brothers sisters groom bride 51 | brothers sisters he she 52 | brothers sisters his her 53 | brothers sisters husband wife 54 | brothers sisters king queen 55 | brothers sisters man woman 56 | brothers sisters nephew niece 57 | brothers sisters policeman policewoman 58 | brothers sisters prince princess 59 | brothers sisters son daughter 60 | brothers sisters sons daughters 61 | brothers sisters stepbrother stepsister 62 | brothers sisters stepfather stepmother 63 | brothers sisters stepson stepdaughter 64 | brothers sisters uncle aunt 65 | brothers sisters boy girl 66 | brothers sisters brother sister 67 | dad mom father mother 68 | dad mom grandfather grandmother 69 | dad mom grandpa grandma 70 | dad mom grandson granddaughter 71 | dad mom groom bride 72 | dad mom he she 73 | dad mom his her 74 | dad mom husband wife 75 | dad mom 
king queen 76 | dad mom man woman 77 | dad mom nephew niece 78 | dad mom policeman policewoman 79 | dad mom prince princess 80 | dad mom son daughter 81 | dad mom sons daughters 82 | dad mom stepbrother stepsister 83 | dad mom stepfather stepmother 84 | dad mom stepson stepdaughter 85 | dad mom uncle aunt 86 | dad mom boy girl 87 | dad mom brother sister 88 | dad mom brothers sisters 89 | father mother grandfather grandmother 90 | father mother grandpa grandma 91 | father mother grandson granddaughter 92 | father mother groom bride 93 | father mother he she 94 | father mother his her 95 | father mother husband wife 96 | father mother king queen 97 | father mother man woman 98 | father mother nephew niece 99 | father mother policeman policewoman 100 | father mother prince princess 101 | father mother son daughter 102 | father mother sons daughters 103 | father mother stepbrother stepsister 104 | father mother stepfather stepmother 105 | father mother stepson stepdaughter 106 | father mother uncle aunt 107 | father mother boy girl 108 | father mother brother sister 109 | father mother brothers sisters 110 | father mother dad mom 111 | grandfather grandmother grandpa grandma 112 | grandfather grandmother grandson granddaughter 113 | grandfather grandmother groom bride 114 | grandfather grandmother he she 115 | grandfather grandmother his her 116 | grandfather grandmother husband wife 117 | grandfather grandmother king queen 118 | grandfather grandmother man woman 119 | grandfather grandmother nephew niece 120 | grandfather grandmother policeman policewoman 121 | grandfather grandmother prince princess 122 | grandfather grandmother son daughter 123 | grandfather grandmother sons daughters 124 | grandfather grandmother stepbrother stepsister 125 | grandfather grandmother stepfather stepmother 126 | grandfather grandmother stepson stepdaughter 127 | grandfather grandmother uncle aunt 128 | grandfather grandmother boy girl 129 | grandfather grandmother brother sister 130 
| grandfather grandmother brothers sisters 131 | grandfather grandmother dad mom 132 | grandfather grandmother father mother 133 | grandpa grandma grandson granddaughter 134 | grandpa grandma groom bride 135 | grandpa grandma he she 136 | grandpa grandma his her 137 | grandpa grandma husband wife 138 | grandpa grandma king queen 139 | grandpa grandma man woman 140 | grandpa grandma nephew niece 141 | grandpa grandma policeman policewoman 142 | grandpa grandma prince princess 143 | grandpa grandma son daughter 144 | grandpa grandma sons daughters 145 | grandpa grandma stepbrother stepsister 146 | grandpa grandma stepfather stepmother 147 | grandpa grandma stepson stepdaughter 148 | grandpa grandma uncle aunt 149 | grandpa grandma boy girl 150 | grandpa grandma brother sister 151 | grandpa grandma brothers sisters 152 | grandpa grandma dad mom 153 | grandpa grandma father mother 154 | grandpa grandma grandfather grandmother 155 | grandson granddaughter groom bride 156 | grandson granddaughter he she 157 | grandson granddaughter his her 158 | grandson granddaughter husband wife 159 | grandson granddaughter king queen 160 | grandson granddaughter man woman 161 | grandson granddaughter nephew niece 162 | grandson granddaughter policeman policewoman 163 | grandson granddaughter prince princess 164 | grandson granddaughter son daughter 165 | grandson granddaughter sons daughters 166 | grandson granddaughter stepbrother stepsister 167 | grandson granddaughter stepfather stepmother 168 | grandson granddaughter stepson stepdaughter 169 | grandson granddaughter uncle aunt 170 | grandson granddaughter boy girl 171 | grandson granddaughter brother sister 172 | grandson granddaughter brothers sisters 173 | grandson granddaughter dad mom 174 | grandson granddaughter father mother 175 | grandson granddaughter grandfather grandmother 176 | grandson granddaughter grandpa grandma 177 | groom bride he she 178 | groom bride his her 179 | groom bride husband wife 180 | groom bride king 
queen 181 | groom bride man woman 182 | groom bride nephew niece 183 | groom bride policeman policewoman 184 | groom bride prince princess 185 | groom bride son daughter 186 | groom bride sons daughters 187 | groom bride stepbrother stepsister 188 | groom bride stepfather stepmother 189 | groom bride stepson stepdaughter 190 | groom bride uncle aunt 191 | groom bride boy girl 192 | groom bride brother sister 193 | groom bride brothers sisters 194 | groom bride dad mom 195 | groom bride father mother 196 | groom bride grandfather grandmother 197 | groom bride grandpa grandma 198 | groom bride grandson granddaughter 199 | he she his her 200 | he she husband wife 201 | he she king queen 202 | he she man woman 203 | he she nephew niece 204 | he she policeman policewoman 205 | he she prince princess 206 | he she son daughter 207 | he she sons daughters 208 | he she stepbrother stepsister 209 | he she stepfather stepmother 210 | he she stepson stepdaughter 211 | he she uncle aunt 212 | he she boy girl 213 | he she brother sister 214 | he she brothers sisters 215 | he she dad mom 216 | he she father mother 217 | he she grandfather grandmother 218 | he she grandpa grandma 219 | he she grandson granddaughter 220 | he she groom bride 221 | his her husband wife 222 | his her king queen 223 | his her man woman 224 | his her nephew niece 225 | his her policeman policewoman 226 | his her prince princess 227 | his her son daughter 228 | his her sons daughters 229 | his her stepbrother stepsister 230 | his her stepfather stepmother 231 | his her stepson stepdaughter 232 | his her uncle aunt 233 | his her boy girl 234 | his her brother sister 235 | his her brothers sisters 236 | his her dad mom 237 | his her father mother 238 | his her grandfather grandmother 239 | his her grandpa grandma 240 | his her grandson granddaughter 241 | his her groom bride 242 | his her he she 243 | husband wife king queen 244 | husband wife man woman 245 | husband wife nephew niece 246 | husband wife 
policeman policewoman 247 | husband wife prince princess 248 | husband wife son daughter 249 | husband wife sons daughters 250 | husband wife stepbrother stepsister 251 | husband wife stepfather stepmother 252 | husband wife stepson stepdaughter 253 | husband wife uncle aunt 254 | husband wife boy girl 255 | husband wife brother sister 256 | husband wife brothers sisters 257 | husband wife dad mom 258 | husband wife father mother 259 | husband wife grandfather grandmother 260 | husband wife grandpa grandma 261 | husband wife grandson granddaughter 262 | husband wife groom bride 263 | husband wife he she 264 | husband wife his her 265 | king queen man woman 266 | king queen nephew niece 267 | king queen policeman policewoman 268 | king queen prince princess 269 | king queen son daughter 270 | king queen sons daughters 271 | king queen stepbrother stepsister 272 | king queen stepfather stepmother 273 | king queen stepson stepdaughter 274 | king queen uncle aunt 275 | king queen boy girl 276 | king queen brother sister 277 | king queen brothers sisters 278 | king queen dad mom 279 | king queen father mother 280 | king queen grandfather grandmother 281 | king queen grandpa grandma 282 | king queen grandson granddaughter 283 | king queen groom bride 284 | king queen he she 285 | king queen his her 286 | king queen husband wife 287 | man woman nephew niece 288 | man woman policeman policewoman 289 | man woman prince princess 290 | man woman son daughter 291 | man woman sons daughters 292 | man woman stepbrother stepsister 293 | man woman stepfather stepmother 294 | man woman stepson stepdaughter 295 | man woman uncle aunt 296 | man woman boy girl 297 | man woman brother sister 298 | man woman brothers sisters 299 | man woman dad mom 300 | man woman father mother 301 | man woman grandfather grandmother 302 | man woman grandpa grandma 303 | man woman grandson granddaughter 304 | man woman groom bride 305 | man woman he she 306 | man woman his her 307 | man woman husband 
wife 308 | man woman king queen 309 | nephew niece policeman policewoman 310 | nephew niece prince princess 311 | nephew niece son daughter 312 | nephew niece sons daughters 313 | nephew niece stepbrother stepsister 314 | nephew niece stepfather stepmother 315 | nephew niece stepson stepdaughter 316 | nephew niece uncle aunt 317 | nephew niece boy girl 318 | nephew niece brother sister 319 | nephew niece brothers sisters 320 | nephew niece dad mom 321 | nephew niece father mother 322 | nephew niece grandfather grandmother 323 | nephew niece grandpa grandma 324 | nephew niece grandson granddaughter 325 | nephew niece groom bride 326 | nephew niece he she 327 | nephew niece his her 328 | nephew niece husband wife 329 | nephew niece king queen 330 | nephew niece man woman 331 | policeman policewoman prince princess 332 | policeman policewoman son daughter 333 | policeman policewoman sons daughters 334 | policeman policewoman stepbrother stepsister 335 | policeman policewoman stepfather stepmother 336 | policeman policewoman stepson stepdaughter 337 | policeman policewoman uncle aunt 338 | policeman policewoman boy girl 339 | policeman policewoman brother sister 340 | policeman policewoman brothers sisters 341 | policeman policewoman dad mom 342 | policeman policewoman father mother 343 | policeman policewoman grandfather grandmother 344 | policeman policewoman grandpa grandma 345 | policeman policewoman grandson granddaughter 346 | policeman policewoman groom bride 347 | policeman policewoman he she 348 | policeman policewoman his her 349 | policeman policewoman husband wife 350 | policeman policewoman king queen 351 | policeman policewoman man woman 352 | policeman policewoman nephew niece 353 | prince princess son daughter 354 | prince princess sons daughters 355 | prince princess stepbrother stepsister 356 | prince princess stepfather stepmother 357 | prince princess stepson stepdaughter 358 | prince princess uncle aunt 359 | prince princess boy girl 360 | prince 
princess brother sister 361 | prince princess brothers sisters 362 | prince princess dad mom 363 | prince princess father mother 364 | prince princess grandfather grandmother 365 | prince princess grandpa grandma 366 | prince princess grandson granddaughter 367 | prince princess groom bride 368 | prince princess he she 369 | prince princess his her 370 | prince princess husband wife 371 | prince princess king queen 372 | prince princess man woman 373 | prince princess nephew niece 374 | prince princess policeman policewoman 375 | son daughter sons daughters 376 | son daughter stepbrother stepsister 377 | son daughter stepfather stepmother 378 | son daughter stepson stepdaughter 379 | son daughter uncle aunt 380 | son daughter boy girl 381 | son daughter brother sister 382 | son daughter brothers sisters 383 | son daughter dad mom 384 | son daughter father mother 385 | son daughter grandfather grandmother 386 | son daughter grandpa grandma 387 | son daughter grandson granddaughter 388 | son daughter groom bride 389 | son daughter he she 390 | son daughter his her 391 | son daughter husband wife 392 | son daughter king queen 393 | son daughter man woman 394 | son daughter nephew niece 395 | son daughter policeman policewoman 396 | son daughter prince princess 397 | sons daughters stepbrother stepsister 398 | sons daughters stepfather stepmother 399 | sons daughters stepson stepdaughter 400 | sons daughters uncle aunt 401 | sons daughters boy girl 402 | sons daughters brother sister 403 | sons daughters brothers sisters 404 | sons daughters dad mom 405 | sons daughters father mother 406 | sons daughters grandfather grandmother 407 | sons daughters grandpa grandma 408 | sons daughters grandson granddaughter 409 | sons daughters groom bride 410 | sons daughters he she 411 | sons daughters his her 412 | sons daughters husband wife 413 | sons daughters king queen 414 | sons daughters man woman 415 | sons daughters nephew niece 416 | sons daughters policeman policewoman 
417 | sons daughters prince princess 418 | sons daughters son daughter 419 | stepbrother stepsister stepfather stepmother 420 | stepbrother stepsister stepson stepdaughter 421 | stepbrother stepsister uncle aunt 422 | stepbrother stepsister boy girl 423 | stepbrother stepsister brother sister 424 | stepbrother stepsister brothers sisters 425 | stepbrother stepsister dad mom 426 | stepbrother stepsister father mother 427 | stepbrother stepsister grandfather grandmother 428 | stepbrother stepsister grandpa grandma 429 | stepbrother stepsister grandson granddaughter 430 | stepbrother stepsister groom bride 431 | stepbrother stepsister he she 432 | stepbrother stepsister his her 433 | stepbrother stepsister husband wife 434 | stepbrother stepsister king queen 435 | stepbrother stepsister man woman 436 | stepbrother stepsister nephew niece 437 | stepbrother stepsister policeman policewoman 438 | stepbrother stepsister prince princess 439 | stepbrother stepsister son daughter 440 | stepbrother stepsister sons daughters 441 | stepfather stepmother stepson stepdaughter 442 | stepfather stepmother uncle aunt 443 | stepfather stepmother boy girl 444 | stepfather stepmother brother sister 445 | stepfather stepmother brothers sisters 446 | stepfather stepmother dad mom 447 | stepfather stepmother father mother 448 | stepfather stepmother grandfather grandmother 449 | stepfather stepmother grandpa grandma 450 | stepfather stepmother grandson granddaughter 451 | stepfather stepmother groom bride 452 | stepfather stepmother he she 453 | stepfather stepmother his her 454 | stepfather stepmother husband wife 455 | stepfather stepmother king queen 456 | stepfather stepmother man woman 457 | stepfather stepmother nephew niece 458 | stepfather stepmother policeman policewoman 459 | stepfather stepmother prince princess 460 | stepfather stepmother son daughter 461 | stepfather stepmother sons daughters 462 | stepfather stepmother stepbrother stepsister 463 | stepson stepdaughter uncle 
aunt 464 | stepson stepdaughter boy girl 465 | stepson stepdaughter brother sister 466 | stepson stepdaughter brothers sisters 467 | stepson stepdaughter dad mom 468 | stepson stepdaughter father mother 469 | stepson stepdaughter grandfather grandmother 470 | stepson stepdaughter grandpa grandma 471 | stepson stepdaughter grandson granddaughter 472 | stepson stepdaughter groom bride 473 | stepson stepdaughter he she 474 | stepson stepdaughter his her 475 | stepson stepdaughter husband wife 476 | stepson stepdaughter king queen 477 | stepson stepdaughter man woman 478 | stepson stepdaughter nephew niece 479 | stepson stepdaughter policeman policewoman 480 | stepson stepdaughter prince princess 481 | stepson stepdaughter son daughter 482 | stepson stepdaughter sons daughters 483 | stepson stepdaughter stepbrother stepsister 484 | stepson stepdaughter stepfather stepmother 485 | uncle aunt boy girl 486 | uncle aunt brother sister 487 | uncle aunt brothers sisters 488 | uncle aunt dad mom 489 | uncle aunt father mother 490 | uncle aunt grandfather grandmother 491 | uncle aunt grandpa grandma 492 | uncle aunt grandson granddaughter 493 | uncle aunt groom bride 494 | uncle aunt he she 495 | uncle aunt his her 496 | uncle aunt husband wife 497 | uncle aunt king queen 498 | uncle aunt man woman 499 | uncle aunt nephew niece 500 | uncle aunt policeman policewoman 501 | uncle aunt prince princess 502 | uncle aunt son daughter 503 | uncle aunt sons daughters 504 | uncle aunt stepbrother stepsister 505 | uncle aunt stepfather stepmother 506 | uncle aunt stepson stepdaughter 507 | -------------------------------------------------------------------------------- /eval/question-data/gram9-plural-verbs.txt: -------------------------------------------------------------------------------- 1 | decrease decreases describe describes 2 | decrease decreases eat eats 3 | decrease decreases enhance enhances 4 | decrease decreases estimate estimates 5 | decrease decreases find finds 6 | 
decrease decreases generate generates 7 | decrease decreases go goes 8 | decrease decreases implement implements 9 | decrease decreases increase increases 10 | decrease decreases listen listens 11 | decrease decreases play plays 12 | decrease decreases predict predicts 13 | decrease decreases provide provides 14 | decrease decreases say says 15 | decrease decreases scream screams 16 | decrease decreases search searches 17 | decrease decreases see sees 18 | decrease decreases shuffle shuffles 19 | decrease decreases sing sings 20 | decrease decreases sit sits 21 | decrease decreases slow slows 22 | decrease decreases speak speaks 23 | decrease decreases swim swims 24 | decrease decreases talk talks 25 | decrease decreases think thinks 26 | decrease decreases vanish vanishes 27 | decrease decreases walk walks 28 | decrease decreases work works 29 | decrease decreases write writes 30 | describe describes eat eats 31 | describe describes enhance enhances 32 | describe describes estimate estimates 33 | describe describes find finds 34 | describe describes generate generates 35 | describe describes go goes 36 | describe describes implement implements 37 | describe describes increase increases 38 | describe describes listen listens 39 | describe describes play plays 40 | describe describes predict predicts 41 | describe describes provide provides 42 | describe describes say says 43 | describe describes scream screams 44 | describe describes search searches 45 | describe describes see sees 46 | describe describes shuffle shuffles 47 | describe describes sing sings 48 | describe describes sit sits 49 | describe describes slow slows 50 | describe describes speak speaks 51 | describe describes swim swims 52 | describe describes talk talks 53 | describe describes think thinks 54 | describe describes vanish vanishes 55 | describe describes walk walks 56 | describe describes work works 57 | describe describes write writes 58 | describe describes decrease decreases 59 | eat eats 
enhance enhances 60 | eat eats estimate estimates 61 | eat eats find finds 62 | eat eats generate generates 63 | eat eats go goes 64 | eat eats implement implements 65 | eat eats increase increases 66 | eat eats listen listens 67 | eat eats play plays 68 | eat eats predict predicts 69 | eat eats provide provides 70 | eat eats say says 71 | eat eats scream screams 72 | eat eats search searches 73 | eat eats see sees 74 | eat eats shuffle shuffles 75 | eat eats sing sings 76 | eat eats sit sits 77 | eat eats slow slows 78 | eat eats speak speaks 79 | eat eats swim swims 80 | eat eats talk talks 81 | eat eats think thinks 82 | eat eats vanish vanishes 83 | eat eats walk walks 84 | eat eats work works 85 | eat eats write writes 86 | eat eats decrease decreases 87 | eat eats describe describes 88 | enhance enhances estimate estimates 89 | enhance enhances find finds 90 | enhance enhances generate generates 91 | enhance enhances go goes 92 | enhance enhances implement implements 93 | enhance enhances increase increases 94 | enhance enhances listen listens 95 | enhance enhances play plays 96 | enhance enhances predict predicts 97 | enhance enhances provide provides 98 | enhance enhances say says 99 | enhance enhances scream screams 100 | enhance enhances search searches 101 | enhance enhances see sees 102 | enhance enhances shuffle shuffles 103 | enhance enhances sing sings 104 | enhance enhances sit sits 105 | enhance enhances slow slows 106 | enhance enhances speak speaks 107 | enhance enhances swim swims 108 | enhance enhances talk talks 109 | enhance enhances think thinks 110 | enhance enhances vanish vanishes 111 | enhance enhances walk walks 112 | enhance enhances work works 113 | enhance enhances write writes 114 | enhance enhances decrease decreases 115 | enhance enhances describe describes 116 | enhance enhances eat eats 117 | estimate estimates find finds 118 | estimate estimates generate generates 119 | estimate estimates go goes 120 | estimate estimates 
implement implements 121 | estimate estimates increase increases 122 | estimate estimates listen listens 123 | estimate estimates play plays 124 | estimate estimates predict predicts 125 | estimate estimates provide provides 126 | estimate estimates say says 127 | estimate estimates scream screams 128 | estimate estimates search searches 129 | estimate estimates see sees 130 | estimate estimates shuffle shuffles 131 | estimate estimates sing sings 132 | estimate estimates sit sits 133 | estimate estimates slow slows 134 | estimate estimates speak speaks 135 | estimate estimates swim swims 136 | estimate estimates talk talks 137 | estimate estimates think thinks 138 | estimate estimates vanish vanishes 139 | estimate estimates walk walks 140 | estimate estimates work works 141 | estimate estimates write writes 142 | estimate estimates decrease decreases 143 | estimate estimates describe describes 144 | estimate estimates eat eats 145 | estimate estimates enhance enhances 146 | find finds generate generates 147 | find finds go goes 148 | find finds implement implements 149 | find finds increase increases 150 | find finds listen listens 151 | find finds play plays 152 | find finds predict predicts 153 | find finds provide provides 154 | find finds say says 155 | find finds scream screams 156 | find finds search searches 157 | find finds see sees 158 | find finds shuffle shuffles 159 | find finds sing sings 160 | find finds sit sits 161 | find finds slow slows 162 | find finds speak speaks 163 | find finds swim swims 164 | find finds talk talks 165 | find finds think thinks 166 | find finds vanish vanishes 167 | find finds walk walks 168 | find finds work works 169 | find finds write writes 170 | find finds decrease decreases 171 | find finds describe describes 172 | find finds eat eats 173 | find finds enhance enhances 174 | find finds estimate estimates 175 | generate generates go goes 176 | generate generates implement implements 177 | generate generates increase 
increases 178 | generate generates listen listens 179 | generate generates play plays 180 | generate generates predict predicts 181 | generate generates provide provides 182 | generate generates say says 183 | generate generates scream screams 184 | generate generates search searches 185 | generate generates see sees 186 | generate generates shuffle shuffles 187 | generate generates sing sings 188 | generate generates sit sits 189 | generate generates slow slows 190 | generate generates speak speaks 191 | generate generates swim swims 192 | generate generates talk talks 193 | generate generates think thinks 194 | generate generates vanish vanishes 195 | generate generates walk walks 196 | generate generates work works 197 | generate generates write writes 198 | generate generates decrease decreases 199 | generate generates describe describes 200 | generate generates eat eats 201 | generate generates enhance enhances 202 | generate generates estimate estimates 203 | generate generates find finds 204 | go goes implement implements 205 | go goes increase increases 206 | go goes listen listens 207 | go goes play plays 208 | go goes predict predicts 209 | go goes provide provides 210 | go goes say says 211 | go goes scream screams 212 | go goes search searches 213 | go goes see sees 214 | go goes shuffle shuffles 215 | go goes sing sings 216 | go goes sit sits 217 | go goes slow slows 218 | go goes speak speaks 219 | go goes swim swims 220 | go goes talk talks 221 | go goes think thinks 222 | go goes vanish vanishes 223 | go goes walk walks 224 | go goes work works 225 | go goes write writes 226 | go goes decrease decreases 227 | go goes describe describes 228 | go goes eat eats 229 | go goes enhance enhances 230 | go goes estimate estimates 231 | go goes find finds 232 | go goes generate generates 233 | implement implements increase increases 234 | implement implements listen listens 235 | implement implements play plays 236 | implement implements predict predicts 237 
| implement implements provide provides 238 | implement implements say says 239 | implement implements scream screams 240 | implement implements search searches 241 | implement implements see sees 242 | implement implements shuffle shuffles 243 | implement implements sing sings 244 | implement implements sit sits 245 | implement implements slow slows 246 | implement implements speak speaks 247 | implement implements swim swims 248 | implement implements talk talks 249 | implement implements think thinks 250 | implement implements vanish vanishes 251 | implement implements walk walks 252 | implement implements work works 253 | implement implements write writes 254 | implement implements decrease decreases 255 | implement implements describe describes 256 | implement implements eat eats 257 | implement implements enhance enhances 258 | implement implements estimate estimates 259 | implement implements find finds 260 | implement implements generate generates 261 | implement implements go goes 262 | increase increases listen listens 263 | increase increases play plays 264 | increase increases predict predicts 265 | increase increases provide provides 266 | increase increases say says 267 | increase increases scream screams 268 | increase increases search searches 269 | increase increases see sees 270 | increase increases shuffle shuffles 271 | increase increases sing sings 272 | increase increases sit sits 273 | increase increases slow slows 274 | increase increases speak speaks 275 | increase increases swim swims 276 | increase increases talk talks 277 | increase increases think thinks 278 | increase increases vanish vanishes 279 | increase increases walk walks 280 | increase increases work works 281 | increase increases write writes 282 | increase increases decrease decreases 283 | increase increases describe describes 284 | increase increases eat eats 285 | increase increases enhance enhances 286 | increase increases estimate estimates 287 | increase increases find 
finds 288 | increase increases generate generates 289 | increase increases go goes 290 | increase increases implement implements 291 | listen listens play plays 292 | listen listens predict predicts 293 | listen listens provide provides 294 | listen listens say says 295 | listen listens scream screams 296 | listen listens search searches 297 | listen listens see sees 298 | listen listens shuffle shuffles 299 | listen listens sing sings 300 | listen listens sit sits 301 | listen listens slow slows 302 | listen listens speak speaks 303 | listen listens swim swims 304 | listen listens talk talks 305 | listen listens think thinks 306 | listen listens vanish vanishes 307 | listen listens walk walks 308 | listen listens work works 309 | listen listens write writes 310 | listen listens decrease decreases 311 | listen listens describe describes 312 | listen listens eat eats 313 | listen listens enhance enhances 314 | listen listens estimate estimates 315 | listen listens find finds 316 | listen listens generate generates 317 | listen listens go goes 318 | listen listens implement implements 319 | listen listens increase increases 320 | play plays predict predicts 321 | play plays provide provides 322 | play plays say says 323 | play plays scream screams 324 | play plays search searches 325 | play plays see sees 326 | play plays shuffle shuffles 327 | play plays sing sings 328 | play plays sit sits 329 | play plays slow slows 330 | play plays speak speaks 331 | play plays swim swims 332 | play plays talk talks 333 | play plays think thinks 334 | play plays vanish vanishes 335 | play plays walk walks 336 | play plays work works 337 | play plays write writes 338 | play plays decrease decreases 339 | play plays describe describes 340 | play plays eat eats 341 | play plays enhance enhances 342 | play plays estimate estimates 343 | play plays find finds 344 | play plays generate generates 345 | play plays go goes 346 | play plays implement implements 347 | play plays increase 
increases 348 | play plays listen listens 349 | predict predicts provide provides 350 | predict predicts say says 351 | predict predicts scream screams 352 | predict predicts search searches 353 | predict predicts see sees 354 | predict predicts shuffle shuffles 355 | predict predicts sing sings 356 | predict predicts sit sits 357 | predict predicts slow slows 358 | predict predicts speak speaks 359 | predict predicts swim swims 360 | predict predicts talk talks 361 | predict predicts think thinks 362 | predict predicts vanish vanishes 363 | predict predicts walk walks 364 | predict predicts work works 365 | predict predicts write writes 366 | predict predicts decrease decreases 367 | predict predicts describe describes 368 | predict predicts eat eats 369 | predict predicts enhance enhances 370 | predict predicts estimate estimates 371 | predict predicts find finds 372 | predict predicts generate generates 373 | predict predicts go goes 374 | predict predicts implement implements 375 | predict predicts increase increases 376 | predict predicts listen listens 377 | predict predicts play plays 378 | provide provides say says 379 | provide provides scream screams 380 | provide provides search searches 381 | provide provides see sees 382 | provide provides shuffle shuffles 383 | provide provides sing sings 384 | provide provides sit sits 385 | provide provides slow slows 386 | provide provides speak speaks 387 | provide provides swim swims 388 | provide provides talk talks 389 | provide provides think thinks 390 | provide provides vanish vanishes 391 | provide provides walk walks 392 | provide provides work works 393 | provide provides write writes 394 | provide provides decrease decreases 395 | provide provides describe describes 396 | provide provides eat eats 397 | provide provides enhance enhances 398 | provide provides estimate estimates 399 | provide provides find finds 400 | provide provides generate generates 401 | provide provides go goes 402 | provide 
provides implement implements 403 | provide provides increase increases 404 | provide provides listen listens 405 | provide provides play plays 406 | provide provides predict predicts 407 | say says scream screams 408 | say says search searches 409 | say says see sees 410 | say says shuffle shuffles 411 | say says sing sings 412 | say says sit sits 413 | say says slow slows 414 | say says speak speaks 415 | say says swim swims 416 | say says talk talks 417 | say says think thinks 418 | say says vanish vanishes 419 | say says walk walks 420 | say says work works 421 | say says write writes 422 | say says decrease decreases 423 | say says describe describes 424 | say says eat eats 425 | say says enhance enhances 426 | say says estimate estimates 427 | say says find finds 428 | say says generate generates 429 | say says go goes 430 | say says implement implements 431 | say says increase increases 432 | say says listen listens 433 | say says play plays 434 | say says predict predicts 435 | say says provide provides 436 | scream screams search searches 437 | scream screams see sees 438 | scream screams shuffle shuffles 439 | scream screams sing sings 440 | scream screams sit sits 441 | scream screams slow slows 442 | scream screams speak speaks 443 | scream screams swim swims 444 | scream screams talk talks 445 | scream screams think thinks 446 | scream screams vanish vanishes 447 | scream screams walk walks 448 | scream screams work works 449 | scream screams write writes 450 | scream screams decrease decreases 451 | scream screams describe describes 452 | scream screams eat eats 453 | scream screams enhance enhances 454 | scream screams estimate estimates 455 | scream screams find finds 456 | scream screams generate generates 457 | scream screams go goes 458 | scream screams implement implements 459 | scream screams increase increases 460 | scream screams listen listens 461 | scream screams play plays 462 | scream screams predict predicts 463 | scream screams provide 
provides 464 | scream screams say says 465 | search searches see sees 466 | search searches shuffle shuffles 467 | search searches sing sings 468 | search searches sit sits 469 | search searches slow slows 470 | search searches speak speaks 471 | search searches swim swims 472 | search searches talk talks 473 | search searches think thinks 474 | search searches vanish vanishes 475 | search searches walk walks 476 | search searches work works 477 | search searches write writes 478 | search searches decrease decreases 479 | search searches describe describes 480 | search searches eat eats 481 | search searches enhance enhances 482 | search searches estimate estimates 483 | search searches find finds 484 | search searches generate generates 485 | search searches go goes 486 | search searches implement implements 487 | search searches increase increases 488 | search searches listen listens 489 | search searches play plays 490 | search searches predict predicts 491 | search searches provide provides 492 | search searches say says 493 | search searches scream screams 494 | see sees shuffle shuffles 495 | see sees sing sings 496 | see sees sit sits 497 | see sees slow slows 498 | see sees speak speaks 499 | see sees swim swims 500 | see sees talk talks 501 | see sees think thinks 502 | see sees vanish vanishes 503 | see sees walk walks 504 | see sees work works 505 | see sees write writes 506 | see sees decrease decreases 507 | see sees describe describes 508 | see sees eat eats 509 | see sees enhance enhances 510 | see sees estimate estimates 511 | see sees find finds 512 | see sees generate generates 513 | see sees go goes 514 | see sees implement implements 515 | see sees increase increases 516 | see sees listen listens 517 | see sees play plays 518 | see sees predict predicts 519 | see sees provide provides 520 | see sees say says 521 | see sees scream screams 522 | see sees search searches 523 | shuffle shuffles sing sings 524 | shuffle shuffles sit sits 525 | 
shuffle shuffles slow slows 526 | shuffle shuffles speak speaks 527 | shuffle shuffles swim swims 528 | shuffle shuffles talk talks 529 | shuffle shuffles think thinks 530 | shuffle shuffles vanish vanishes 531 | shuffle shuffles walk walks 532 | shuffle shuffles work works 533 | shuffle shuffles write writes 534 | shuffle shuffles decrease decreases 535 | shuffle shuffles describe describes 536 | shuffle shuffles eat eats 537 | shuffle shuffles enhance enhances 538 | shuffle shuffles estimate estimates 539 | shuffle shuffles find finds 540 | shuffle shuffles generate generates 541 | shuffle shuffles go goes 542 | shuffle shuffles implement implements 543 | shuffle shuffles increase increases 544 | shuffle shuffles listen listens 545 | shuffle shuffles play plays 546 | shuffle shuffles predict predicts 547 | shuffle shuffles provide provides 548 | shuffle shuffles say says 549 | shuffle shuffles scream screams 550 | shuffle shuffles search searches 551 | shuffle shuffles see sees 552 | sing sings sit sits 553 | sing sings slow slows 554 | sing sings speak speaks 555 | sing sings swim swims 556 | sing sings talk talks 557 | sing sings think thinks 558 | sing sings vanish vanishes 559 | sing sings walk walks 560 | sing sings work works 561 | sing sings write writes 562 | sing sings decrease decreases 563 | sing sings describe describes 564 | sing sings eat eats 565 | sing sings enhance enhances 566 | sing sings estimate estimates 567 | sing sings find finds 568 | sing sings generate generates 569 | sing sings go goes 570 | sing sings implement implements 571 | sing sings increase increases 572 | sing sings listen listens 573 | sing sings play plays 574 | sing sings predict predicts 575 | sing sings provide provides 576 | sing sings say says 577 | sing sings scream screams 578 | sing sings search searches 579 | sing sings see sees 580 | sing sings shuffle shuffles 581 | sit sits slow slows 582 | sit sits speak speaks 583 | sit sits swim swims 584 | sit sits talk talks 
585 | sit sits think thinks 586 | sit sits vanish vanishes 587 | sit sits walk walks 588 | sit sits work works 589 | sit sits write writes 590 | sit sits decrease decreases 591 | sit sits describe describes 592 | sit sits eat eats 593 | sit sits enhance enhances 594 | sit sits estimate estimates 595 | sit sits find finds 596 | sit sits generate generates 597 | sit sits go goes 598 | sit sits implement implements 599 | sit sits increase increases 600 | sit sits listen listens 601 | sit sits play plays 602 | sit sits predict predicts 603 | sit sits provide provides 604 | sit sits say says 605 | sit sits scream screams 606 | sit sits search searches 607 | sit sits see sees 608 | sit sits shuffle shuffles 609 | sit sits sing sings 610 | slow slows speak speaks 611 | slow slows swim swims 612 | slow slows talk talks 613 | slow slows think thinks 614 | slow slows vanish vanishes 615 | slow slows walk walks 616 | slow slows work works 617 | slow slows write writes 618 | slow slows decrease decreases 619 | slow slows describe describes 620 | slow slows eat eats 621 | slow slows enhance enhances 622 | slow slows estimate estimates 623 | slow slows find finds 624 | slow slows generate generates 625 | slow slows go goes 626 | slow slows implement implements 627 | slow slows increase increases 628 | slow slows listen listens 629 | slow slows play plays 630 | slow slows predict predicts 631 | slow slows provide provides 632 | slow slows say says 633 | slow slows scream screams 634 | slow slows search searches 635 | slow slows see sees 636 | slow slows shuffle shuffles 637 | slow slows sing sings 638 | slow slows sit sits 639 | speak speaks swim swims 640 | speak speaks talk talks 641 | speak speaks think thinks 642 | speak speaks vanish vanishes 643 | speak speaks walk walks 644 | speak speaks work works 645 | speak speaks write writes 646 | speak speaks decrease decreases 647 | speak speaks describe describes 648 | speak speaks eat eats 649 | speak speaks enhance enhances 650 
| speak speaks estimate estimates 651 | speak speaks find finds 652 | speak speaks generate generates 653 | speak speaks go goes 654 | speak speaks implement implements 655 | speak speaks increase increases 656 | speak speaks listen listens 657 | speak speaks play plays 658 | speak speaks predict predicts 659 | speak speaks provide provides 660 | speak speaks say says 661 | speak speaks scream screams 662 | speak speaks search searches 663 | speak speaks see sees 664 | speak speaks shuffle shuffles 665 | speak speaks sing sings 666 | speak speaks sit sits 667 | speak speaks slow slows 668 | swim swims talk talks 669 | swim swims think thinks 670 | swim swims vanish vanishes 671 | swim swims walk walks 672 | swim swims work works 673 | swim swims write writes 674 | swim swims decrease decreases 675 | swim swims describe describes 676 | swim swims eat eats 677 | swim swims enhance enhances 678 | swim swims estimate estimates 679 | swim swims find finds 680 | swim swims generate generates 681 | swim swims go goes 682 | swim swims implement implements 683 | swim swims increase increases 684 | swim swims listen listens 685 | swim swims play plays 686 | swim swims predict predicts 687 | swim swims provide provides 688 | swim swims say says 689 | swim swims scream screams 690 | swim swims search searches 691 | swim swims see sees 692 | swim swims shuffle shuffles 693 | swim swims sing sings 694 | swim swims sit sits 695 | swim swims slow slows 696 | swim swims speak speaks 697 | talk talks think thinks 698 | talk talks vanish vanishes 699 | talk talks walk walks 700 | talk talks work works 701 | talk talks write writes 702 | talk talks decrease decreases 703 | talk talks describe describes 704 | talk talks eat eats 705 | talk talks enhance enhances 706 | talk talks estimate estimates 707 | talk talks find finds 708 | talk talks generate generates 709 | talk talks go goes 710 | talk talks implement implements 711 | talk talks increase increases 712 | talk talks listen 
listens 713 | talk talks play plays 714 | talk talks predict predicts 715 | talk talks provide provides 716 | talk talks say says 717 | talk talks scream screams 718 | talk talks search searches 719 | talk talks see sees 720 | talk talks shuffle shuffles 721 | talk talks sing sings 722 | talk talks sit sits 723 | talk talks slow slows 724 | talk talks speak speaks 725 | talk talks swim swims 726 | think thinks vanish vanishes 727 | think thinks walk walks 728 | think thinks work works 729 | think thinks write writes 730 | think thinks decrease decreases 731 | think thinks describe describes 732 | think thinks eat eats 733 | think thinks enhance enhances 734 | think thinks estimate estimates 735 | think thinks find finds 736 | think thinks generate generates 737 | think thinks go goes 738 | think thinks implement implements 739 | think thinks increase increases 740 | think thinks listen listens 741 | think thinks play plays 742 | think thinks predict predicts 743 | think thinks provide provides 744 | think thinks say says 745 | think thinks scream screams 746 | think thinks search searches 747 | think thinks see sees 748 | think thinks shuffle shuffles 749 | think thinks sing sings 750 | think thinks sit sits 751 | think thinks slow slows 752 | think thinks speak speaks 753 | think thinks swim swims 754 | think thinks talk talks 755 | vanish vanishes walk walks 756 | vanish vanishes work works 757 | vanish vanishes write writes 758 | vanish vanishes decrease decreases 759 | vanish vanishes describe describes 760 | vanish vanishes eat eats 761 | vanish vanishes enhance enhances 762 | vanish vanishes estimate estimates 763 | vanish vanishes find finds 764 | vanish vanishes generate generates 765 | vanish vanishes go goes 766 | vanish vanishes implement implements 767 | vanish vanishes increase increases 768 | vanish vanishes listen listens 769 | vanish vanishes play plays 770 | vanish vanishes predict predicts 771 | vanish vanishes provide provides 772 | vanish 
vanishes say says 773 | vanish vanishes scream screams 774 | vanish vanishes search searches 775 | vanish vanishes see sees 776 | vanish vanishes shuffle shuffles 777 | vanish vanishes sing sings 778 | vanish vanishes sit sits 779 | vanish vanishes slow slows 780 | vanish vanishes speak speaks 781 | vanish vanishes swim swims 782 | vanish vanishes talk talks 783 | vanish vanishes think thinks 784 | walk walks work works 785 | walk walks write writes 786 | walk walks decrease decreases 787 | walk walks describe describes 788 | walk walks eat eats 789 | walk walks enhance enhances 790 | walk walks estimate estimates 791 | walk walks find finds 792 | walk walks generate generates 793 | walk walks go goes 794 | walk walks implement implements 795 | walk walks increase increases 796 | walk walks listen listens 797 | walk walks play plays 798 | walk walks predict predicts 799 | walk walks provide provides 800 | walk walks say says 801 | walk walks scream screams 802 | walk walks search searches 803 | walk walks see sees 804 | walk walks shuffle shuffles 805 | walk walks sing sings 806 | walk walks sit sits 807 | walk walks slow slows 808 | walk walks speak speaks 809 | walk walks swim swims 810 | walk walks talk talks 811 | walk walks think thinks 812 | walk walks vanish vanishes 813 | work works write writes 814 | work works decrease decreases 815 | work works describe describes 816 | work works eat eats 817 | work works enhance enhances 818 | work works estimate estimates 819 | work works find finds 820 | work works generate generates 821 | work works go goes 822 | work works implement implements 823 | work works increase increases 824 | work works listen listens 825 | work works play plays 826 | work works predict predicts 827 | work works provide provides 828 | work works say says 829 | work works scream screams 830 | work works search searches 831 | work works see sees 832 | work works shuffle shuffles 833 | work works sing sings 834 | work works sit sits 835 | work 
works slow slows 836 | work works speak speaks 837 | work works swim swims 838 | work works talk talks 839 | work works think thinks 840 | work works vanish vanishes 841 | work works walk walks 842 | write writes decrease decreases 843 | write writes describe describes 844 | write writes eat eats 845 | write writes enhance enhances 846 | write writes estimate estimates 847 | write writes find finds 848 | write writes generate generates 849 | write writes go goes 850 | write writes implement implements 851 | write writes increase increases 852 | write writes listen listens 853 | write writes play plays 854 | write writes predict predicts 855 | write writes provide provides 856 | write writes say says 857 | write writes scream screams 858 | write writes search searches 859 | write writes see sees 860 | write writes shuffle shuffles 861 | write writes sing sings 862 | write writes sit sits 863 | write writes slow slows 864 | write writes speak speaks 865 | write writes swim swims 866 | write writes talk talks 867 | write writes think thinks 868 | write writes vanish vanishes 869 | write writes walk walks 870 | write writes work works 871 | -------------------------------------------------------------------------------- /randomization.test.sh: -------------------------------------------------------------------------------- 1 | # Tests for ensuring randomization is being controlled 2 | 3 | make 4 | 5 | if [ ! 
-e text8 ]; then 6 | if hash wget 2>/dev/null; then 7 | wget http://mattmahoney.net/dc/text8.zip 8 | else 9 | curl -O http://mattmahoney.net/dc/text8.zip 10 | fi 11 | unzip text8.zip 12 | rm text8.zip 13 | fi 14 | 15 | # Global constants 16 | CORPUS=text8 17 | VERBOSE=2 18 | BUILDDIR=build 19 | MEMORY=4.0 20 | VOCAB_MIN_COUNT=20 21 | 22 | # Re-used files 23 | VOCAB_FILE=$(mktemp vocab.test.txt.XXXXXX) 24 | COOCCURRENCE_FILE=$(mktemp cooccurrence.test.bin.XXXXXX) 25 | COOCCURRENCE_SHUF_FILE=$(mktemp cooccurrence_shuf.test.bin.XXXXXX) 26 | 27 | # Make vocab 28 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 29 | 30 | # Make Coocurrences 31 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size 5 < $CORPUS > $COOCCURRENCE_FILE 32 | 33 | # Shuffle Coocurrences 34 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE -seed 1 < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 35 | 36 | # Keep track of failure 37 | num_failed=0 38 | 39 | check_exit() { 40 | eval $2 41 | failed=$(( $1 != $? 
)) 42 | num_failed=$(( $num_failed + $failed )) 43 | if [[ $failed -eq 0 ]]; then 44 | echo PASSED 45 | else 46 | echo FAILED 47 | fi 48 | } 49 | 50 | # Test control of random seed in shuffle 51 | printf "\n\n--- TEST SET: Control of random seed in shuffle\n" 52 | TEST_FILE=$(mktemp cooc_shuf.test.bin.XXXXXX) 53 | 54 | printf "\n- TEST: Using the same seed should get the same shuffle\n" 55 | $BUILDDIR/shuffle -memory $MEMORY -verbose 0 -seed 1 < $COOCCURRENCE_FILE > $TEST_FILE 56 | check_exit 0 "cmp --quiet $COOCCURRENCE_SHUF_FILE $TEST_FILE" 57 | 58 | printf "\n- TEST: Changing the seed should change the shuffle\n" 59 | $BUILDDIR/shuffle -memory $MEMORY -verbose 0 -seed 2 < $COOCCURRENCE_FILE > $TEST_FILE 60 | check_exit 1 "cmp --quiet $COOCCURRENCE_SHUF_FILE $TEST_FILE" 61 | 62 | rm $TEST_FILE # Clean up 63 | # --- 64 | 65 | # Control randomization in GloVe 66 | printf "\n\n--- TEST SET: Control of random seed in glove\n" 67 | # Note "-threads" must equal 1 for these to pass, since order in which results come back from individual threads is uncontrolled 68 | BASE_PREFIX=$(mktemp base_vectors.XXXXXX) 69 | TEST_PREFIX=$(mktemp test_vectors.XXXXXX) 70 | 71 | printf "\n- TEST: Reusing seed should give the same vectors\n" 72 | $BUILDDIR/glove -save-file $BASE_PREFIX -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 3 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -seed 1 73 | $BUILDDIR/glove -save-file $TEST_PREFIX -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 3 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -seed 1 74 | check_exit 0 "cmp --quiet $BASE_PREFIX.bin $TEST_PREFIX.bin" 75 | 76 | printf "\n- TEST: Changing seed should change the learned vectors\n" 77 | $BUILDDIR/glove -save-file $TEST_PREFIX -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 3 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -seed 2 78 | check_exit 1 "cmp --quiet $BASE_PREFIX.bin $TEST_PREFIX.bin" 79 | 80 | printf "\n- TEST: Should be able 
to save/load initial parameters\n" 81 | $BUILDDIR/glove -save-file $BASE_PREFIX -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 3 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -save-init-param 1 82 | $BUILDDIR/glove -save-file $TEST_PREFIX -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 3 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -save-init-param 1 -load-init-param 1 -init-param-file "$BASE_PREFIX.000.bin" 83 | check_exit 0 "cmp --quiet $BASE_PREFIX.000.bin $TEST_PREFIX.000.bin && cmp --quiet $BASE_PREFIX.bin $TEST_PREFIX.bin" 84 | 85 | rm "$BASE_PREFIX.000.bin" "$TEST_PREFIX.000.bin" "$BASE_PREFIX.bin" "$TEST_PREFIX.bin" # Clean up 86 | rm $BASE_PREFIX $TEST_PREFIX 87 | 88 | # ---- 89 | 90 | printf "\n- TEST: Should be able to save/load initial parameters and gradsq\n" 91 | # note: the seed will be randomly assigned and should not matter 92 | $BUILDDIR/glove -save-file $BASE_PREFIX -gradsq-file $BASE_PREFIX.gradsq -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 6 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -checkpoint-every 2 93 | 94 | $BUILDDIR/glove -save-file $TEST_PREFIX -gradsq-file $TEST_PREFIX.gradsq -threads 1 -input-file $COOCCURRENCE_SHUF_FILE -iter 4 -vector-size 10 -binary 1 -vocab-file $VOCAB_FILE -verbose 0 -checkpoint-every 2 -load-init-param 1 -init-param-file "$BASE_PREFIX.002.bin" -load-init-gradsq 1 -init-gradsq-file "$BASE_PREFIX.gradsq.002.bin" 95 | 96 | echo "Compare vectors before & after load gradsq - 2 iterations" 97 | check_exit 0 "cmp --quiet $BASE_PREFIX.004.bin $TEST_PREFIX.002.bin" 98 | echo "Compare vectors before & after load gradsq - 4 iterations" 99 | check_exit 0 "cmp --quiet $BASE_PREFIX.006.bin $TEST_PREFIX.004.bin" 100 | echo "Compare vectors before & after load gradsq - final" 101 | check_exit 0 "cmp --quiet $BASE_PREFIX.bin $TEST_PREFIX.bin" 102 | 103 | echo "Compare gradsq before & after load gradsq - 2 iterations" 104 | check_exit 0 "cmp --quiet 
$BASE_PREFIX.gradsq.004.bin $TEST_PREFIX.gradsq.002.bin" 105 | echo "Compare gradsq before & after load gradsq - 4 iterations" 106 | check_exit 0 "cmp --quiet $BASE_PREFIX.gradsq.006.bin $TEST_PREFIX.gradsq.004.bin" 107 | echo "Compare gradsq before & after load gradsq - final" 108 | check_exit 0 "cmp --quiet $BASE_PREFIX.gradsq.bin $TEST_PREFIX.gradsq.bin" 109 | 110 | echo "Cleaning up files" 111 | check_exit 0 "rm $BASE_PREFIX.002.bin $BASE_PREFIX.004.bin $BASE_PREFIX.006.bin $BASE_PREFIX.bin" 112 | check_exit 0 "rm $BASE_PREFIX.gradsq.002.bin $BASE_PREFIX.gradsq.004.bin $BASE_PREFIX.gradsq.006.bin $BASE_PREFIX.gradsq.bin" 113 | check_exit 0 "rm $TEST_PREFIX.002.bin $TEST_PREFIX.004.bin $TEST_PREFIX.bin" 114 | check_exit 0 "rm $TEST_PREFIX.gradsq.002.bin $TEST_PREFIX.gradsq.004.bin $TEST_PREFIX.gradsq.bin" 115 | check_exit 0 "rm $VOCAB_FILE $COOCCURRENCE_FILE $COOCCURRENCE_SHUF_FILE" 116 | 117 | echo 118 | echo SUMMARY: 119 | if [[ $num_failed -gt 0 ]]; then 120 | echo $num_failed tests failed. 121 | exit 1 122 | else 123 | echo All tests passed. 124 | exit 0 125 | fi 126 | 127 | 128 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | ### Package Contents 2 | 3 | To train your own GloVe vectors, first you'll need to prepare your corpus as a single text file with all words separated by one or more spaces or tabs. If your corpus has multiple documents, the documents (only) should be separated by new line characters. Cooccurrence contexts for words do not extend past newline characters. Once you create your corpus, you can train GloVe vectors using the following 4 tools. An example is included in `demo.sh`, which you can modify as necessary. 4 | 5 | The four main tools in this package are: 6 | 7 | #### 1) vocab_count 8 | This tool requires an input corpus that should already consist of whitespace-separated tokens. 
Use something like the [Stanford Tokenizer](https://nlp.stanford.edu/software/tokenizer.html) first on raw text. From the corpus, it constructs unigram counts from a corpus, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count. 9 | 10 | #### 2) cooccur 11 | Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by `vocab_count`, and may specify a variety of parameters, as described by running `./build/cooccur`. 12 | 13 | #### 3) shuffle 14 | Shuffles the binary file of cooccurrence statistics produced by `cooccur`. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running `./build/shuffle`. 15 | 16 | #### 4) glove 17 | Train the GloVe model on the specified cooccurrence data, which typically will be the output of the `shuffle` tool. The user should supply a vocabulary file, as given by `vocab_count`, and may specify a number of other parameters, which are described by running `./build/glove`. 18 | -------------------------------------------------------------------------------- /src/common.c: -------------------------------------------------------------------------------- 1 | // Common code for cooccur.c, vocab_count.c, 2 | // glove.c and shuffle.c 3 | // 4 | // GloVe: Global Vectors for Word Representation 5 | // Copyright (c) 2014 The Board of Trustees of 6 | // The Leland Stanford Junior University. All Rights Reserved. 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 
10 | // You may obtain a copy of the License at 11 | // 12 | // http://www.apache.org/licenses/LICENSE-2.0 13 | // 14 | // Unless required by applicable law or agreed to in writing, software 15 | // distributed under the License is distributed on an "AS IS" BASIS, 16 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | // See the License for the specific language governing permissions and 18 | // limitations under the License. 19 | // 20 | // 21 | // For more information, bug reports, fixes, contact: 22 | // Jeffrey Pennington (jpennin@stanford.edu) 23 | // Christopher Manning (manning@cs.stanford.edu) 24 | // https://github.com/stanfordnlp/GloVe/ 25 | // GlobalVectors@googlegroups.com 26 | // http://nlp.stanford.edu/projects/glove/ 27 | 28 | #include <stdlib.h> 29 | #include <string.h> 30 | #include <errno.h> 31 | #include "common.h" 32 | 33 | #ifdef _MSC_VER 34 | #define STRERROR(ERRNO, BUF, BUFSIZE) strerror_s((BUF), (BUFSIZE), (ERRNO)) 35 | #else 36 | #define STRERROR(ERRNO, BUF, BUFSIZE) strerror_r((ERRNO), (BUF), (BUFSIZE)) 37 | #endif 38 | 39 | /* Efficient string comparison */ 40 | int scmp( char *s1, char *s2 ) { 41 | while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;} 42 | return (*s1 - *s2); 43 | } 44 | 45 | /* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */ 46 | 47 | /* Simple bitwise hash function */ 48 | unsigned int bitwisehash(char *word, int tsize, unsigned int seed) { 49 | char c; 50 | unsigned int h; 51 | h = seed; 52 | for ( ; (c = *word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2)); 53 | return (unsigned int)((h & 0x7fffffff) % tsize); 54 | } 55 | 56 | /* Create hash table, initialise pointers to NULL */ 57 | HASHREC ** inithashtable() { 58 | int i; 59 | HASHREC **ht; 60 | ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE ); 61 | for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL; 62 | return ht; 63 | } 64 | 65 | /* Read word from input stream.
Return 1 when encounter '\n' or EOF (but separate from word), 0 otherwise. 66 | Words can be separated by space(s), tab(s), or newline(s). Carriage return characters are just ignored. 67 | (Okay for Windows, but not for Mac OS 9-. Ignored even if by themselves or in words.) 68 | A newline is taken as indicating a new document (contexts won't cross newline). 69 | Argument word array is assumed to be of size MAX_STRING_LENGTH. 70 | words will be truncated if too long. They are truncated with some care so that they 71 | cannot truncate in the middle of a utf-8 character, but 72 | still little to no harm will be done for other encodings like iso-8859-1. 73 | (This function appears identically copied in vocab_count.c and cooccur.c.) 74 | */ 75 | int get_word(char *word, FILE *fin) { 76 | int i = 0, ch; 77 | for ( ; ; ) { 78 | ch = fgetc(fin); 79 | if (ch == '\r') continue; 80 | if (i == 0 && ((ch == '\n') || (ch == EOF))) { 81 | word[i] = 0; 82 | return 1; 83 | } 84 | if (i == 0 && ((ch == ' ') || (ch == '\t'))) continue; // skip leading space 85 | if ((ch == EOF) || (ch == ' ') || (ch == '\t') || (ch == '\n')) { 86 | if (ch == '\n') ungetc(ch, fin); // return the newline next time as document ender 87 | break; 88 | } 89 | if (i < MAX_STRING_LENGTH - 1) 90 | word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH 91 | } 92 | word[i] = 0; //null terminate 93 | // avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0]) 94 | // see https://en.wikipedia.org/wiki/UTF-8#Description 95 | if (i == MAX_STRING_LENGTH - 1 && (word[i-1] & 0x80) == 0x80) { 96 | if ((word[i-1] & 0xC0) == 0xC0) { 97 | word[i-1] = '\0'; 98 | } else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) { 99 | word[i-2] = '\0'; 100 | } else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) { 101 | word[i-3] = '\0'; 102 | } 103 | } 104 | return 0; 105 | } 106 | 107 | int find_arg(char *str, int argc, char **argv) { 108 | int i; 109 | for (i = 1; i < 
argc; i++) { 110 | if (!scmp(str, argv[i])) { 111 | if (i == argc - 1) { 112 | printf("No argument given for %s\n", str); 113 | exit(1); 114 | } 115 | return i; 116 | } 117 | } 118 | return -1; 119 | } 120 | 121 | void free_table(HASHREC **ht) { 122 | int i; 123 | HASHREC* current; 124 | HASHREC* tmp; 125 | for (i = 0; i < TSIZE; i++) { 126 | current = ht[i]; 127 | while (current != NULL) { 128 | tmp = current; 129 | current = current->next; 130 | free(tmp->word); 131 | free(tmp); 132 | } 133 | } 134 | free(ht); 135 | } 136 | 137 | void free_fid(FILE **fid, const int num) { 138 | int i; 139 | for(i = 0; i < num; i++) { 140 | if(fid[i] != NULL) 141 | fclose(fid[i]); 142 | } 143 | free(fid); 144 | } 145 | 146 | 147 | int log_file_loading_error(char *file_description, char *file_name) { 148 | fprintf(stderr, "Unable to open %s %s.\n", file_description, file_name); 149 | fprintf(stderr, "Errno: %d\n", errno); 150 | char error[MAX_STRING_LENGTH]; 151 | STRERROR(errno, error, MAX_STRING_LENGTH); 152 | fprintf(stderr, "Error description: %s\n", error); 153 | return errno; 154 | } 155 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | // Common code for cooccur.c, vocab_count.c, 5 | // glove.c and shuffle.c 6 | // 7 | // GloVe: Global Vectors for Word Representation 8 | // Copyright (c) 2014 The Board of Trustees of 9 | // The Leland Stanford Junior University. All Rights Reserved. 10 | // 11 | // Licensed under the Apache License, Version 2.0 (the "License"); 12 | // you may not use this file except in compliance with the License. 
13 | // You may obtain a copy of the License at 14 | // 15 | // http://www.apache.org/licenses/LICENSE-2.0 16 | // 17 | // Unless required by applicable law or agreed to in writing, software 18 | // distributed under the License is distributed on an "AS IS" BASIS, 19 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | // See the License for the specific language governing permissions and 21 | // limitations under the License. 22 | // 23 | // 24 | // For more information, bug reports, fixes, contact: 25 | // Jeffrey Pennington (jpennin@stanford.edu) 26 | // Christopher Manning (manning@cs.stanford.edu) 27 | // https://github.com/stanfordnlp/GloVe/ 28 | // GlobalVectors@googlegroups.com 29 | // http://nlp.stanford.edu/projects/glove/ 30 | 31 | #include <stdio.h> 32 | 33 | #define MAX_STRING_LENGTH 1000 34 | #define TSIZE 1048576 35 | #define SEED 1159241 36 | #define HASHFN bitwisehash 37 | 38 | typedef double real; 39 | typedef struct cooccur_rec { 40 | int word1; 41 | int word2; 42 | real val; 43 | } CREC; 44 | typedef struct hashrec { 45 | char *word; 46 | long long num; //count or id 47 | struct hashrec *next; 48 | } HASHREC; 49 | 50 | 51 | int scmp( char *s1, char *s2 ); 52 | unsigned int bitwisehash(char *word, int tsize, unsigned int seed); 53 | HASHREC **inithashtable(); 54 | int get_word(char *word, FILE *fin); 55 | void free_table(HASHREC **ht); 56 | int find_arg(char *str, int argc, char **argv); 57 | void free_fid(FILE **fid, const int num); 58 | 59 | // logs errors when loading files.
call after a failed load 60 | int log_file_loading_error(char *file_description, char *file_name); 61 | 62 | #endif /* COMMON_H */ 63 | 64 | -------------------------------------------------------------------------------- /src/cooccur.c: -------------------------------------------------------------------------------- 1 | // Tool to calculate word-word cooccurrence statistics 2 | // 3 | // Copyright (c) 2014, 2018 The Board of Trustees of 4 | // The Leland Stanford Junior University. All Rights Reserved. 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | // 18 | // 19 | // For more information, bug reports, fixes, contact: 20 | // Jeffrey Pennington (jpennin@stanford.edu) 21 | // Christopher Manning (manning@cs.stanford.edu) 22 | // https://github.com/stanfordnlp/GloVe/ 23 | // GlobalVectors@googlegroups.com 24 | // http://nlp.stanford.edu/projects/glove/ 25 | 26 | #include <stdio.h> 27 | #include <stdlib.h> 28 | #include <string.h> 29 | #include <math.h> 30 | #include "common.h" 31 | 32 | typedef struct cooccur_rec_id { 33 | int word1; 34 | int word2; 35 | real val; 36 | int id; 37 | } CRECID; 38 | 39 | int verbose = 2; // 0, 1, or 2 40 | long long max_product; // Cutoff for product of word frequency ranks below which cooccurrence counts will be stored in a compressed full array 41 | long long overflow_length; // Number of cooccurrence records whose product exceeds max_product to store in memory before writing to disk 42 | int window_size = 15; // default context window size 43 | int symmetric = 1; // 0: asymmetric, 1: symmetric 44 | real memory_limit = 3; // soft limit, in gigabytes, used to estimate optimal array sizes 45 | int distance_weighting = 1; // Flag to control the distance weighting of cooccurrence counts 46 | char *vocab_file, *file_head; 47 | 48 | /* Search hash table for given string, return record if found, else NULL */ 49 | HASHREC *hashsearch(HASHREC **ht, char *w) { 50 | HASHREC *htmp, *hprv; 51 | unsigned int hval = HASHFN(w, TSIZE, SEED); 52 | for (hprv = NULL, htmp=ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next); 53 | if ( htmp != NULL && hprv!=NULL ) { // move to front on access 54 | hprv->next = htmp->next; 55 | htmp->next = ht[hval]; 56 | ht[hval] = htmp; 57 | } 58 | return(htmp); 59 | } 60 | 61 | /* Insert string in hash table, check for duplicates which should be absent */ 62 | void hashinsert(HASHREC **ht, char *w, long long id) { 63 | HASHREC *htmp, *hprv; 64 | unsigned int hval = HASHFN(w, TSIZE, SEED); 65 | for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0;
hprv = htmp, htmp = htmp->next); 66 | if (htmp == NULL) { 67 | htmp = (HASHREC *) malloc(sizeof(HASHREC)); 68 | htmp->word = (char *) malloc(strlen(w) + 1); 69 | strcpy(htmp->word, w); 70 | htmp->num = id; 71 | htmp->next = NULL; 72 | if (hprv == NULL) ht[hval] = htmp; 73 | else hprv->next = htmp; 74 | } 75 | else fprintf(stderr, "Error, duplicate entry located: %s.\n",htmp->word); 76 | return; 77 | } 78 | 79 | /* Write sorted chunk of cooccurrence records to file, accumulating duplicate entries */ 80 | int write_chunk(CREC *cr, long long length, FILE *fout) { 81 | if (length == 0) return 0; 82 | 83 | long long a = 0; 84 | CREC old = cr[a]; 85 | 86 | for (a = 1; a < length; a++) { 87 | if (cr[a].word1 == old.word1 && cr[a].word2 == old.word2) { 88 | old.val += cr[a].val; 89 | continue; 90 | } 91 | fwrite(&old, sizeof(CREC), 1, fout); 92 | old = cr[a]; 93 | } 94 | fwrite(&old, sizeof(CREC), 1, fout); 95 | return 0; 96 | } 97 | 98 | /* Check if two cooccurrence records are for the same two words, used for qsort */ 99 | int compare_crec(const void *a, const void *b) { 100 | int c; 101 | if ( (c = ((CREC *) a)->word1 - ((CREC *) b)->word1) != 0) return c; 102 | else return (((CREC *) a)->word2 - ((CREC *) b)->word2); 103 | 104 | } 105 | 106 | /* Check if two cooccurrence records are for the same two words */ 107 | int compare_crecid(CRECID a, CRECID b) { 108 | int c; 109 | if ( (c = a.word1 - b.word1) != 0) return c; 110 | else return a.word2 - b.word2; 111 | } 112 | 113 | /* Swap two entries of priority queue */ 114 | void swap_entry(CRECID *pq, int i, int j) { 115 | CRECID temp = pq[i]; 116 | pq[i] = pq[j]; 117 | pq[j] = temp; 118 | } 119 | 120 | /* Insert entry into priority queue */ 121 | void insert(CRECID *pq, CRECID new, int size) { 122 | int j = size - 1, p; 123 | pq[j] = new; 124 | while ( (p=(j-1)/2) >= 0 ) { 125 | if (compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); j = p;} 126 | else break; 127 | } 128 | } 129 | 130 | /* Delete entry from priority 
queue */ 131 | void delete(CRECID *pq, int size) { 132 | int j, p = 0; 133 | pq[p] = pq[size - 1]; 134 | while ( (j = 2*p+1) < size - 1 ) { 135 | if (j == size - 2) { 136 | if (compare_crecid(pq[p],pq[j]) > 0) swap_entry(pq,p,j); 137 | return; 138 | } 139 | else { 140 | if (compare_crecid(pq[j], pq[j+1]) < 0) { 141 | if (compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); p = j;} 142 | else return; 143 | } 144 | else { 145 | if (compare_crecid(pq[p],pq[j+1]) > 0) {swap_entry(pq,p,j+1); p = j + 1;} 146 | else return; 147 | } 148 | } 149 | } 150 | } 151 | 152 | /* Write top node of priority queue to file, accumulating duplicate entries */ 153 | int merge_write(CRECID new, CRECID *old, FILE *fout) { 154 | if (new.word1 == old->word1 && new.word2 == old->word2) { 155 | old->val += new.val; 156 | return 0; // Indicates duplicate entry 157 | } 158 | fwrite(old, sizeof(CREC), 1, fout); 159 | *old = new; 160 | return 1; // Actually wrote to file 161 | } 162 | 163 | /* Merge [num] sorted files of cooccurrence records */ 164 | int merge_files(int num) { 165 | int i, size; 166 | long long counter = 0; 167 | CRECID *pq, new, old; 168 | char filename[200]; 169 | FILE **fid, *fout; 170 | fid = calloc(num, sizeof(FILE)); 171 | pq = malloc(sizeof(CRECID) * num); 172 | fout = stdout; 173 | if (verbose > 1) fprintf(stderr, "Merging cooccurrence files: processed 0 lines."); 174 | 175 | /* Open all files and add first entry of each to priority queue */ 176 | for (i = 0; i < num; i++) { 177 | sprintf(filename,"%s_%04d.bin",file_head,i); 178 | fid[i] = fopen(filename,"rb"); 179 | if (fid[i] == NULL) {log_file_loading_error("file", filename); free_fid(fid, num); free(pq); return 1;} 180 | fread(&new, sizeof(CREC), 1, fid[i]); 181 | new.id = i; 182 | insert(pq,new,i+1); 183 | } 184 | 185 | /* Pop top node, save it in old to see if the next entry is a duplicate */ 186 | size = num; 187 | old = pq[0]; 188 | i = pq[0].id; 189 | delete(pq, size); 190 | fread(&new, sizeof(CREC), 1, fid[i]); 
191 | if (feof(fid[i])) size--; 192 | else { 193 | new.id = i; 194 | insert(pq, new, size); 195 | } 196 | 197 | /* Repeatedly pop top node and fill priority queue until files have reached EOF */ 198 | while (size > 0) { 199 | counter += merge_write(pq[0], &old, fout); // Only count the lines written to file, not duplicates 200 | if ((counter%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[39G%lld lines.",counter); 201 | i = pq[0].id; 202 | delete(pq, size); 203 | fread(&new, sizeof(CREC), 1, fid[i]); 204 | if (feof(fid[i])) size--; 205 | else { 206 | new.id = i; 207 | insert(pq, new, size); 208 | } 209 | } 210 | fwrite(&old, sizeof(CREC), 1, fout); 211 | fprintf(stderr,"\033[0GMerging cooccurrence files: processed %lld lines.\n",++counter); 212 | for (i=0;i 0) { 244 | fprintf(stderr, "window size: %d\n", window_size); 245 | if (symmetric == 0) fprintf(stderr, "context: asymmetric\n"); 246 | else fprintf(stderr, "context: symmetric\n"); 247 | } 248 | if (verbose > 1) fprintf(stderr, "max product: %lld\n", max_product); 249 | if (verbose > 1) fprintf(stderr, "overflow length: %lld\n", overflow_length); 250 | sprintf(format,"%%%ds %%lld", MAX_STRING_LENGTH); // Format to read from vocab file, which has (irrelevant) frequency data 251 | if (verbose > 1) fprintf(stderr, "Reading vocab from file \"%s\"...", vocab_file); 252 | fid = fopen(vocab_file,"r"); 253 | if (fid == NULL) { 254 | log_file_loading_error("vocab file", vocab_file); 255 | free_resources(vocab_hash, cr, lookup, history, bigram_table); 256 | return 1; 257 | } 258 | while (fscanf(fid, format, str, &id) != EOF) hashinsert(vocab_hash, str, ++j); // Here id is not used: inserting vocab words into hash table with their frequency rank, j 259 | fclose(fid); 260 | vocab_size = j; 261 | j = 0; 262 | if (verbose > 1) fprintf(stderr, "loaded %lld words.\nBuilding lookup table...", vocab_size); 263 | 264 | /* Build auxiliary lookup table used to index into bigram_table */ 265 | lookup = (long long *)calloc( 
vocab_size + 1, sizeof(long long) ); 266 | if (lookup == NULL) { 267 | fprintf(stderr, "Couldn't allocate memory!"); 268 | free_resources(vocab_hash, cr, lookup, history, bigram_table); 269 | return 1; 270 | } 271 | lookup[0] = 1; 272 | for (a = 1; a <= vocab_size; a++) { 273 | if ((lookup[a] = max_product / a) < vocab_size) lookup[a] += lookup[a-1]; 274 | else lookup[a] = lookup[a-1] + vocab_size; 275 | } 276 | if (verbose > 1) fprintf(stderr, "table contains %lld elements.\n",lookup[a-1]); 277 | 278 | /* Allocate memory for full array which will store all cooccurrence counts for words whose product of frequency ranks is less than max_product */ 279 | bigram_table = (real *)calloc( lookup[a-1] , sizeof(real) ); 280 | if (bigram_table == NULL) { 281 | fprintf(stderr, "Couldn't allocate memory!"); 282 | free_resources(vocab_hash, cr, lookup, history, bigram_table); 283 | return 1; 284 | } 285 | 286 | fid = stdin; 287 | // sprintf(format,"%%%ds",MAX_STRING_LENGTH); 288 | sprintf(filename,"%s_%04d.bin", file_head, fidcounter); 289 | foverflow = fopen(filename,"wb"); 290 | if (verbose > 1) fprintf(stderr,"Processing token: 0"); 291 | 292 | // if symmetric > 0, we can increment ind twice per iteration, 293 | // meaning up to 2x window_size in one loop 294 | long long const overflow_threshold = symmetric == 0 ? 
overflow_length - window_size : overflow_length - 2 * window_size; 295 | 296 | /* For each token in input stream, calculate a weighted cooccurrence sum within window_size */ 297 | while (1) { 298 | if (ind >= overflow_threshold) { 299 | // If overflow buffer is (almost) full, sort it and write it to temporary file 300 | qsort(cr, ind, sizeof(CREC), compare_crec); 301 | write_chunk(cr,ind,foverflow); 302 | fclose(foverflow); 303 | fidcounter++; 304 | sprintf(filename,"%s_%04d.bin",file_head,fidcounter); 305 | foverflow = fopen(filename,"wb"); 306 | ind = 0; 307 | } 308 | flag = get_word(str, fid); 309 | if (verbose > 2) fprintf(stderr, "Maybe processing token: %s\n", str); 310 | if (flag == 1) { 311 | // Newline, reset line index (j); maybe eof. 312 | if (feof(fid)) { 313 | if (verbose > 2) fprintf(stderr, "Not getting coocurs as at eof\n"); 314 | break; 315 | } 316 | j = 0; 317 | if (verbose > 2) fprintf(stderr, "Not getting coocurs as at newline\n"); 318 | continue; 319 | } 320 | counter++; 321 | if ((counter%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[19G%lld",counter); 322 | htmp = hashsearch(vocab_hash, str); 323 | if (htmp == NULL) { 324 | if (verbose > 2) fprintf(stderr, "Not getting coocurs as word not in vocab\n"); 325 | continue; // Skip out-of-vocabulary words 326 | } 327 | w2 = htmp->num; // Target word (frequency rank) 328 | for (k = j - 1; k >= ( (j > window_size) ? j - window_size : 0 ); k--) { // Iterate over all words to the left of target word, but not past beginning of line 329 | w1 = history[k % window_size]; // Context word (frequency rank) 330 | if (verbose > 2) fprintf(stderr, "Adding cooccur between words %lld and %lld.\n", w1, w2); 331 | if ( w1 < max_product/w2 ) { // Product is small enough to store in a full array 332 | bigram_table[lookup[w1-1] + w2 - 2] += distance_weighting ? 
1.0/((real)(j-k)) : 1.0; // Weight by inverse of distance between words if needed 333 | if (symmetric > 0) bigram_table[lookup[w2-1] + w1 - 2] += distance_weighting ? 1.0/((real)(j-k)) : 1.0; // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too) 334 | } 335 | else { // Product is too big, data is likely to be sparse. Store these entries in a temporary buffer to be sorted, merged (accumulated), and written to file when it gets full. 336 | cr[ind].word1 = w1; 337 | cr[ind].word2 = w2; 338 | cr[ind].val = distance_weighting ? 1.0/((real)(j-k)) : 1.0; 339 | ind++; // Keep track of how full temporary buffer is 340 | if (symmetric > 0) { // Symmetric context 341 | cr[ind].word1 = w2; 342 | cr[ind].word2 = w1; 343 | cr[ind].val = distance_weighting ? 1.0/((real)(j-k)) : 1.0; 344 | ind++; 345 | } 346 | } 347 | } 348 | history[j % window_size] = w2; // Target word is stored in circular buffer to become context word in the future 349 | j++; 350 | } 351 | 352 | /* Write out temp buffer for the final time (it may not be full) */ 353 | if (verbose > 1) fprintf(stderr,"\033[0GProcessed %lld tokens.\n",counter); 354 | qsort(cr, ind, sizeof(CREC), compare_crec); 355 | write_chunk(cr,ind,foverflow); 356 | sprintf(filename,"%s_0000.bin",file_head); 357 | 358 | /* Write out full bigram_table, skipping zeros */ 359 | if (verbose > 1) fprintf(stderr, "Writing cooccurrences to disk"); 360 | fid = fopen(filename,"wb"); 361 | j = 1e6; 362 | for (x = 1; x <= vocab_size; x++) { 363 | if ( (long long) (0.75*log(vocab_size / x)) < j) { 364 | j = (long long) (0.75*log(vocab_size / x)); 365 | if (verbose > 1) fprintf(stderr,"."); 366 | } // log's to make it look (sort of) pretty 367 | for (y = 1; y <= (lookup[x] - lookup[x-1]); y++) { 368 | if ((r = bigram_table[lookup[x-1] - 2 + y]) != 0) { 369 | fwrite(&x, sizeof(int), 1, fid); 370 | fwrite(&y, sizeof(int), 1, fid); 371 | fwrite(&r, sizeof(real), 1, fid); 372 | } 373 | } 374 | } 375 | 376 | if (verbose > 
1) fprintf(stderr,"%d files in total.\n",fidcounter + 1); 377 | fclose(fid); 378 | fclose(foverflow); 379 | free_resources(vocab_hash, cr, lookup, history, bigram_table); 380 | return merge_files(fidcounter + 1); // Merge the sorted temporary files 381 | } 382 | 383 | int main(int argc, char **argv) { 384 | int i; 385 | real rlimit, n = 1e5; 386 | vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH); 387 | file_head = malloc(sizeof(char) * MAX_STRING_LENGTH); 388 | 389 | if (argc == 1) { 390 | printf("Tool to calculate word-word cooccurrence statistics\n"); 391 | printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n"); 392 | printf("Usage options:\n"); 393 | printf("\t-verbose <int>\n"); 394 | printf("\t\tSet verbosity: 0, 1, 2 (default), or 3\n"); 395 | printf("\t-symmetric <int>\n"); 396 | printf("\t\tIf <int> = 0, only use left context; if <int> = 1 (default), use left and right\n"); 397 | printf("\t-window-size <int>\n"); 398 | printf("\t\tNumber of context words to the left (and to the right, if symmetric = 1); default 15\n"); 399 | printf("\t-vocab-file <file>\n"); 400 | printf("\t\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\n"); 401 | printf("\t-memory <float>\n"); 402 | printf("\t\tSoft limit for memory consumption, in GB -- based on simple heuristic, so not extremely accurate; default 4.0\n"); 403 | printf("\t-max-product <int>\n"); 404 | printf("\t\tLimit the size of dense cooccurrence array by specifying the max product <int> of the frequency counts of the two cooccurring words.\n\t\tThis value overrides that which is automatically produced by '-memory'. Typically only needs adjustment for use with very large corpora.\n"); 405 | printf("\t-overflow-length <int>\n"); 406 | printf("\t\tLimit to length <int> the sparse overflow array, which buffers cooccurrence data that does not fit in the dense array, before writing to disk. \n\t\tThis value overrides that which is automatically produced by '-memory'.
Typically only needs adjustment for use with very large corpora.\n"); 407 | printf("\t-overflow-file \n"); 408 | printf("\t\tFilename, excluding extension, for temporary files; default overflow\n"); 409 | printf("\t-distance-weighting \n"); 410 | printf("\t\tIf = 0, do not weight cooccurrence count by distance between words; if = 1 (default), weight the cooccurrence count by inverse of distance between words\n"); 411 | 412 | printf("\nExample usage:\n"); 413 | printf("./cooccur -verbose 2 -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < corpus.txt > cooccurrences.bin\n\n"); 414 | free(vocab_file); 415 | free(file_head); 416 | return 0; 417 | } 418 | 419 | if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]); 420 | if ((i = find_arg((char *)"-symmetric", argc, argv)) > 0) symmetric = atoi(argv[i + 1]); 421 | if ((i = find_arg((char *)"-window-size", argc, argv)) > 0) window_size = atoi(argv[i + 1]); 422 | if ((i = find_arg((char *)"-vocab-file", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]); 423 | else strcpy(vocab_file, (char *)"vocab.txt"); 424 | if ((i = find_arg((char *)"-overflow-file", argc, argv)) > 0) strcpy(file_head, argv[i + 1]); 425 | else strcpy(file_head, (char *)"overflow"); 426 | if ((i = find_arg((char *)"-memory", argc, argv)) > 0) memory_limit = atof(argv[i + 1]); 427 | if ((i = find_arg((char *)"-distance-weighting", argc, argv)) > 0) distance_weighting = atoi(argv[i + 1]); 428 | 429 | /* The memory_limit determines a limit on the number of elements in bigram_table and the overflow buffer */ 430 | /* Estimate the maximum value that max_product can take so that this limit is still satisfied */ 431 | rlimit = 0.85 * (real)memory_limit * 1073741824/(sizeof(CREC)); 432 | while (fabs(rlimit - n * (log(n) + 0.1544313298)) > 1e-3) n = rlimit / (log(n) + 0.1544313298); 433 | max_product = (long long) n; 434 | overflow_length = (long long) rlimit/6; // 0.85 + 1/6 ~= 1 435 | 436 
| /* Override estimates by specifying limits explicitly on the command line */ 437 | if ((i = find_arg((char *)"-max-product", argc, argv)) > 0) max_product = atoll(argv[i + 1]); 438 | if ((i = find_arg((char *)"-overflow-length", argc, argv)) > 0) overflow_length = atoll(argv[i + 1]); 439 | 440 | const int returned_value = get_cooccurrence(); 441 | free(vocab_file); 442 | free(file_head); 443 | return returned_value; 444 | } 445 | 446 | -------------------------------------------------------------------------------- /src/glove.c: -------------------------------------------------------------------------------- 1 | // GloVe: Global Vectors for Word Representation 2 | // 3 | // Copyright (c) 2014 The Board of Trustees of 4 | // The Leland Stanford Junior University. All Rights Reserved. 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | // 18 | // 19 | // For more information, bug reports, fixes, contact: 20 | // Jeffrey Pennington (jpennin@stanford.edu) 21 | // GlobalVectors@googlegroups.com 22 | // http://nlp.stanford.edu/projects/glove/ 23 | 24 | // silence the many complaints from visual studio 25 | #define _CRT_SECURE_NO_WARNINGS 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | // windows pthread.h is buggy, but this #define fixes it 35 | #define HAVE_STRUCT_TIMESPEC 36 | #include 37 | 38 | #include "common.h" 39 | 40 | #define _FILE_OFFSET_BITS 64 41 | 42 | int write_header=0; //0=no, 1=yes; writes vocab_size/vector_size as first line for use with some libraries, such as gensim. 43 | int verbose = 2; // 0, 1, or 2 44 | int seed = 0; 45 | int use_unk_vec = 1; // 0 or 1 46 | int num_threads = 8; // pthreads 47 | int num_iter = 25; // Number of full passes through cooccurrence matrix 48 | int vector_size = 50; // Word vector size 49 | int save_gradsq = 0; // By default don't save squared gradient values 50 | int use_binary = 0; // 0: save as text files; 1: save as binary; 2: both. For binary, save both word and context word vectors. 51 | int model = 2; // For text file output only. 0: concatenate word and context vectors (and biases) i.e. save everything; 1: Just save word vectors (no bias); 2: Save (word + context word) vectors (no biases) 52 | int checkpoint_every = 0; // checkpoint the model for every checkpoint_every iterations. 
Do nothing if checkpoint_every <= 0 53 | int load_init_param = 0; // if 1 initial paramters are loaded from -init-param-file 54 | int save_init_param = 0; // if 1 initial paramters are saved (i.e., in the 0 checkpoint) 55 | int load_init_gradsq = 0; // if 1 initial squared gradients are loaded from -init-gradsq-file 56 | real eta = 0.05; // Initial learning rate 57 | real alpha = 0.75, x_max = 100.0; // Weighting function parameters, not extremely sensitive to corpus, though may need adjustment for very small or very large corpora 58 | real grad_clip_value = 100.0; // Clipping parameter for gradient components. Values will be clipped to [-grad_clip_value, grad_clip_value] interval. 59 | real *W, *gradsq, *cost; 60 | long long num_lines, *lines_per_thread, vocab_size; 61 | char vocab_file[MAX_STRING_LENGTH]; 62 | char input_file[MAX_STRING_LENGTH]; 63 | char save_W_file[MAX_STRING_LENGTH]; 64 | char save_gradsq_file[MAX_STRING_LENGTH]; 65 | char init_param_file[MAX_STRING_LENGTH]; 66 | char init_gradsq_file[MAX_STRING_LENGTH]; 67 | 68 | /** 69 | * Loads a save file for use as the initial values for the parameters or gradsq 70 | * Return value: 0 if success, -1 if fail 71 | */ 72 | int load_init_file(char *file_name, real *array, long long array_size) { 73 | FILE *fin; 74 | long long a; 75 | fin = fopen(file_name, "rb"); 76 | if (fin == NULL) { 77 | log_file_loading_error("init file", file_name); 78 | return -1; 79 | } 80 | for (a = 0; a < array_size; a++) { 81 | if (feof(fin)) { 82 | fprintf(stderr, "EOF reached before data fully loaded in %s.\n", file_name); 83 | fclose(fin); 84 | return -1; 85 | } 86 | fread(&array[a], sizeof(real), 1, fin); 87 | } 88 | fclose(fin); 89 | return 0; 90 | } 91 | 92 | void initialize_parameters() { 93 | // TODO: return an error code when an error occurs, clean up in the calling routine 94 | if (seed == 0) { 95 | seed = time(0); 96 | } 97 | fprintf(stderr, "Using random seed %d\n", seed); 98 | srand(seed); 99 | long long a; 100 | long 
long W_size = 2 * vocab_size * (vector_size + 1); // +1 to allocate space for bias 101 | 102 | /* Allocate space for word vectors and context word vectors, and correspodning gradsq */ 103 | a = posix_memalign((void **)&W, 128, W_size * sizeof(real)); // Might perform better than malloc 104 | if (W == NULL) { 105 | fprintf(stderr, "Error allocating memory for W\n"); 106 | exit(1); 107 | } 108 | a = posix_memalign((void **)&gradsq, 128, W_size * sizeof(real)); // Might perform better than malloc 109 | if (gradsq == NULL) { 110 | fprintf(stderr, "Error allocating memory for gradsq\n"); 111 | free(W); 112 | exit(1); 113 | } 114 | if (load_init_param) { 115 | // Load existing parameters 116 | fprintf(stderr, "\nLoading initial parameters from %s \n", init_param_file); 117 | if (load_init_file(init_param_file, W, W_size)) { 118 | free(W); 119 | free(gradsq); 120 | exit(1); 121 | } 122 | } else { 123 | // Initialize new parameters 124 | for (a = 0; a < W_size; ++a) { 125 | W[a] = (rand() / (real)RAND_MAX - 0.5) / vector_size; 126 | } 127 | } 128 | 129 | if (load_init_gradsq) { 130 | // Load existing squared gradients 131 | fprintf(stderr, "\nLoading initial squared gradients from %s \n", init_gradsq_file); 132 | if (load_init_file(init_gradsq_file, gradsq, W_size)) { 133 | free(W); 134 | free(gradsq); 135 | exit(1); 136 | } 137 | } else { 138 | // Initialize new squared gradients 139 | for (a = 0; a < W_size; ++a) { 140 | gradsq[a] = 1.0; // So initial value of eta is equal to initial learning rate 141 | } 142 | } 143 | } 144 | 145 | static inline real check_nan(real update) { 146 | if (isnan(update) || isinf(update)) { 147 | fprintf(stderr,"\ncaught NaN in update"); 148 | return 0.; 149 | } else { 150 | return update; 151 | } 152 | } 153 | 154 | /* Train the GloVe model */ 155 | void *glove_thread(void *vid) { 156 | long long a, b ,l1, l2; 157 | long long id = *(long long*)vid; 158 | CREC cr; 159 | real diff, fdiff, temp1, temp2; 160 | FILE *fin; 161 | fin = 
fopen(input_file, "rb"); 162 | if (fin == NULL) { 163 | // TODO: exit all the threads or somehow mark that glove failed 164 | log_file_loading_error("input file", input_file); 165 | pthread_exit(NULL); 166 | } 167 | fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); //Threads spaced roughly equally throughout file 168 | cost[id] = 0; 169 | 170 | real* W_updates1 = (real*)malloc(vector_size * sizeof(real)); 171 | if (NULL == W_updates1){ 172 | fclose(fin); 173 | pthread_exit(NULL); 174 | } 175 | real* W_updates2 = (real*)malloc(vector_size * sizeof(real)); 176 | if (NULL == W_updates2){ 177 | fclose(fin); 178 | free(W_updates1); 179 | pthread_exit(NULL); 180 | } 181 | for (a = 0; a < lines_per_thread[id]; a++) { 182 | fread(&cr, sizeof(CREC), 1, fin); 183 | if (feof(fin)) break; 184 | if (cr.word1 < 1 || cr.word2 < 1) { continue; } 185 | 186 | /* Get location of words in W & gradsq */ 187 | l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1 188 | l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words 189 | 190 | /* Calculate cost, save diff for gradients */ 191 | diff = 0; 192 | for (b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector 193 | diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word 194 | fdiff = (cr.val > x_max) ? diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff 195 | 196 | // Check for NaN and inf() in the diffs. 197 | if (isnan(diff) || isnan(fdiff) || isinf(diff) || isinf(fdiff)) { 198 | fprintf(stderr,"Caught NaN in diff for kdiff for thread. 
Skipping update"); 199 | continue; 200 | } 201 | 202 | cost[id] += 0.5 * fdiff * diff; // weighted squared error 203 | 204 | /* Adaptive gradient updates */ 205 | real W_updates1_sum = 0; 206 | real W_updates2_sum = 0; 207 | for (b = 0; b < vector_size; b++) { 208 | // learning rate times gradient for word vectors 209 | temp1 = fmin(fmax(fdiff * W[b + l2], -grad_clip_value), grad_clip_value) * eta; 210 | temp2 = fmin(fmax(fdiff * W[b + l1], -grad_clip_value), grad_clip_value) * eta; 211 | // adaptive updates 212 | W_updates1[b] = temp1 / sqrt(gradsq[b + l1]); 213 | W_updates2[b] = temp2 / sqrt(gradsq[b + l2]); 214 | W_updates1_sum += W_updates1[b]; 215 | W_updates2_sum += W_updates2[b]; 216 | gradsq[b + l1] += temp1 * temp1; 217 | gradsq[b + l2] += temp2 * temp2; 218 | } 219 | if (!isnan(W_updates1_sum) && !isinf(W_updates1_sum) && !isnan(W_updates2_sum) && !isinf(W_updates2_sum)) { 220 | for (b = 0; b < vector_size; b++) { 221 | W[b + l1] -= W_updates1[b]; 222 | W[b + l2] -= W_updates2[b]; 223 | } 224 | } 225 | 226 | // updates for bias terms 227 | W[vector_size + l1] -= check_nan(fdiff / sqrt(gradsq[vector_size + l1])); 228 | W[vector_size + l2] -= check_nan(fdiff / sqrt(gradsq[vector_size + l2])); 229 | fdiff *= fdiff; 230 | gradsq[vector_size + l1] += fdiff; 231 | gradsq[vector_size + l2] += fdiff; 232 | 233 | } 234 | free(W_updates1); 235 | free(W_updates2); 236 | 237 | fclose(fin); 238 | pthread_exit(NULL); 239 | } 240 | 241 | /* Save params to file */ 242 | int save_params(int nb_iter) { 243 | /* 244 | * nb_iter is the number of iteration (= a full pass through the cooccurrence matrix). 245 | * nb_iter > 0 => checkpointing the intermediate parameters, so nb_iter is in the filename of output file. 246 | * nb_iter == 0 => checkpointing the initial parameters 247 | * else => saving the final paramters, so nb_iter is ignored. 
248 | */ 249 | 250 | long long a, b; 251 | char format[20]; 252 | char output_file[MAX_STRING_LENGTH+20], output_file_gsq[MAX_STRING_LENGTH+20]; 253 | char *word = malloc(sizeof(char) * MAX_STRING_LENGTH + 1); 254 | if (NULL == word) { 255 | return 1; 256 | } 257 | FILE *fid, *fout; 258 | FILE *fgs = NULL; 259 | 260 | if (use_binary > 0 || nb_iter == 0) { 261 | // Save parameters in binary file 262 | // note: always save initial parameters in binary, as the reading code expects binary 263 | if (nb_iter < 0) 264 | sprintf(output_file,"%s.bin",save_W_file); 265 | else 266 | sprintf(output_file,"%s.%03d.bin",save_W_file,nb_iter); 267 | 268 | fout = fopen(output_file,"wb"); 269 | if (fout == NULL) {log_file_loading_error("weights file", save_W_file); free(word); return 1;} 270 | for (a = 0; a < 2 * vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout); 271 | fclose(fout); 272 | if (save_gradsq > 0) { 273 | if (nb_iter < 0) 274 | sprintf(output_file_gsq,"%s.bin",save_gradsq_file); 275 | else 276 | sprintf(output_file_gsq,"%s.%03d.bin",save_gradsq_file,nb_iter); 277 | 278 | fgs = fopen(output_file_gsq,"wb"); 279 | if (fgs == NULL) {log_file_loading_error("gradsq file", save_gradsq_file); free(word); return 1;} 280 | for (a = 0; a < 2 * vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs); 281 | fclose(fgs); 282 | } 283 | } 284 | if (use_binary != 1) { // Save parameters in text file 285 | if (nb_iter < 0) 286 | sprintf(output_file,"%s.txt",save_W_file); 287 | else 288 | sprintf(output_file,"%s.%03d.txt",save_W_file,nb_iter); 289 | if (save_gradsq > 0) { 290 | if (nb_iter < 0) 291 | sprintf(output_file_gsq,"%s.txt",save_gradsq_file); 292 | else 293 | sprintf(output_file_gsq,"%s.%03d.txt",save_gradsq_file,nb_iter); 294 | 295 | fgs = fopen(output_file_gsq,"wb"); 296 | if (fgs == NULL) {log_file_loading_error("gradsq file", save_gradsq_file); free(word); return 1;} 297 | } 298 | fout = fopen(output_file,"wb"); 299 | if (fout == 
NULL) {log_file_loading_error("weights file", save_W_file); free(word); return 1;} 300 | fid = fopen(vocab_file, "r"); 301 | sprintf(format,"%%%ds",MAX_STRING_LENGTH); 302 | if (fid == NULL) {log_file_loading_error("vocab file", vocab_file); free(word); fclose(fout); return 1;} 303 | if (write_header) fprintf(fout, "%lld %d\n", vocab_size, vector_size); 304 | for (a = 0; a < vocab_size; a++) { 305 | if (fscanf(fid,format,word) == 0) {free(word); fclose(fid); fclose(fout); return 1;} 306 | // input vocab cannot contain special keyword 307 | if (strcmp(word, "") == 0) {free(word); fclose(fid); fclose(fout); return 1;} 308 | fprintf(fout, "%s",word); 309 | if (model == 0) { // Save all parameters (including bias) 310 | for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 311 | for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]); 312 | } 313 | if (model == 1) // Save only "word" vectors (without bias) 314 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 315 | if (model == 2) // Save "word + context word" vectors (without bias) 316 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]); 317 | if (model == 3) { // Save "word" and "context" vectors (without bias; row-concatenated) 318 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]); 319 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]); 320 | } 321 | fprintf(fout,"\n"); 322 | if (save_gradsq > 0) { // Save gradsq 323 | fprintf(fgs, "%s",word); 324 | for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[a * (vector_size + 1) + b]); 325 | for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[(vocab_size + a) * (vector_size + 1) + b]); 326 | fprintf(fgs,"\n"); 327 | } 328 | if (fscanf(fid,format,word) == 
0) { 329 | // Eat irrelevant frequency entry 330 | fclose(fout); 331 | fclose(fid); 332 | free(word); 333 | return 1; 334 | } 335 | } 336 | 337 | if (use_unk_vec) { 338 | real* unk_vec = (real*)calloc((vector_size + 1), sizeof(real)); 339 | real* unk_context = (real*)calloc((vector_size + 1), sizeof(real)); 340 | strcpy(word, ""); 341 | 342 | long long num_rare_words = vocab_size < 100 ? vocab_size : 100; 343 | 344 | for (a = vocab_size - num_rare_words; a < vocab_size; a++) { 345 | for (b = 0; b < (vector_size + 1); b++) { 346 | unk_vec[b] += W[a * (vector_size + 1) + b] / num_rare_words; 347 | unk_context[b] += W[(vocab_size + a) * (vector_size + 1) + b] / num_rare_words; 348 | } 349 | } 350 | 351 | fprintf(fout, "%s",word); 352 | if (model == 0) { // Save all parameters (including bias) 353 | for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_vec[b]); 354 | for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_context[b]); 355 | } 356 | if (model == 1) // Save only "word" vectors (without bias) 357 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b]); 358 | if (model == 2) // Save "word + context word" vectors (without bias) 359 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b] + unk_context[b]); 360 | if (model == 3) { // Save "word" and "context" vectors (without bias; row-concatenated) 361 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b]); 362 | for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_context[b]); 363 | } 364 | fprintf(fout,"\n"); 365 | 366 | free(unk_vec); 367 | free(unk_context); 368 | } 369 | 370 | fclose(fid); 371 | fclose(fout); 372 | if (save_gradsq > 0) fclose(fgs); 373 | } 374 | free(word); 375 | return 0; 376 | } 377 | 378 | /* Train model */ 379 | int train_glove() { 380 | long long a, file_size; 381 | int save_params_return_code; 382 | int b; 383 | FILE *fin; 384 | real total_cost = 0; 385 | 386 | fprintf(stderr, "TRAINING MODEL\n"); 387 | 388 | fin = 
fopen(input_file, "rb"); 389 | if (fin == NULL) {log_file_loading_error("cooccurrence file", input_file); return 1;} 390 | fseeko(fin, 0, SEEK_END); 391 | file_size = ftello(fin); 392 | num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's 393 | fclose(fin); 394 | fprintf(stderr,"Read %lld lines.\n", num_lines); 395 | if (verbose > 1) fprintf(stderr,"Initializing parameters..."); 396 | initialize_parameters(); 397 | if (verbose > 1) fprintf(stderr,"done.\n"); 398 | if (save_init_param) { 399 | if (verbose > 1) fprintf(stderr,"Saving initial parameters... "); 400 | save_params_return_code = save_params(0); 401 | if (save_params_return_code != 0) 402 | return save_params_return_code; 403 | if (verbose > 1) fprintf(stderr,"done.\n"); 404 | } 405 | if (verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size); 406 | if (verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size); 407 | if (verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max); 408 | if (verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha); 409 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 410 | lines_per_thread = (long long *) malloc(num_threads * sizeof(long long)); 411 | 412 | time_t rawtime; 413 | struct tm *info; 414 | char time_buffer[80]; 415 | // Lock-free asynchronous SGD 416 | for (b = 0; b < num_iter; b++) { 417 | total_cost = 0; 418 | for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads; 419 | lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads; 420 | long long *thread_ids = (long long*)malloc(sizeof(long long) * num_threads); 421 | for (a = 0; a < num_threads; a++) thread_ids[a] = a; 422 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)&thread_ids[a]); 423 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 424 | for (a = 0; a < num_threads; a++) total_cost += cost[a]; 425 | free(thread_ids); 426 | 427 | time(&rawtime); 
428 | info = localtime(&rawtime); 429 | strftime(time_buffer,80,"%x - %I:%M.%S%p", info); 430 | fprintf(stderr, "%s, iter: %03d, cost: %lf\n", time_buffer, b+1, total_cost/num_lines); 431 | 432 | if (checkpoint_every > 0 && (b + 1) % checkpoint_every == 0) { 433 | fprintf(stderr," saving intermediate parameters for iter %03d...", b+1); 434 | save_params_return_code = save_params(b+1); 435 | if (save_params_return_code != 0) { 436 | free(pt); 437 | free(lines_per_thread); 438 | return save_params_return_code; 439 | } 440 | fprintf(stderr,"done.\n"); 441 | } 442 | } 443 | free(pt); 444 | free(lines_per_thread); 445 | return save_params(-1); 446 | } 447 | 448 | int main(int argc, char **argv) { 449 | int i; 450 | FILE *fid; 451 | int result = 0; 452 | 453 | if (argc == 1) { 454 | printf("GloVe: Global Vectors for Word Representation, v0.2\n"); 455 | printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n"); 456 | printf("Usage options:\n"); 457 | printf("\t-verbose \n"); 458 | printf("\t\tSet verbosity: 0, 1, or 2 (default)\n"); 459 | printf("\t-write-header \n"); 460 | printf("\t\tIf 1, write vocab_size/vector_size as first line. Do nothing if 0 (default).\n"); 461 | printf("\t-vector-size \n"); 462 | printf("\t\tDimension of word vector representations (excluding bias term); default 50\n"); 463 | printf("\t-threads \n"); 464 | printf("\t\tNumber of threads; default 8\n"); 465 | printf("\t-iter \n"); 466 | printf("\t\tNumber of training iterations; default 25\n"); 467 | printf("\t-eta \n"); 468 | printf("\t\tInitial learning rate; default 0.05\n"); 469 | printf("\t-alpha \n"); 470 | printf("\t\tParameter in exponent of weighting function; default 0.75\n"); 471 | printf("\t-x-max \n"); 472 | printf("\t\tParameter specifying cutoff in weighting function; default 100.0\n"); 473 | printf("\t-grad-clip\n"); 474 | printf("\t\tGradient components clipping parameter. 
Values will be clipped to [-grad-clip, grad-clip] interval\n"); 475 | printf("\t-binary \n"); 476 | printf("\t\tSave output in binary format (0: text, 1: binary, 2: both); default 0\n"); 477 | printf("\t-model \n"); 478 | printf("\t\tModel for word vector output (for text output only); default 2\n"); 479 | printf("\t\t 0: output all data, for both word and context word vectors, including bias terms\n"); 480 | printf("\t\t 1: output word vectors, excluding bias terms\n"); 481 | printf("\t\t 2: output word vectors + context word vectors, excluding bias terms\n"); 482 | printf("\t\t 3: output word vectors and context word vectors, excluding bias terms; context word vectors are row-concatenated to the word vectors\n"); 483 | printf("\t-input-file \n"); 484 | printf("\t\tBinary input file of shuffled cooccurrence data (produced by 'cooccur' and 'shuffle'); default cooccurrence.shuf.bin\n"); 485 | printf("\t-vocab-file \n"); 486 | printf("\t\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\n"); 487 | printf("\t-save-file \n"); 488 | printf("\t\tFilename, excluding extension, for word vector output; default vectors\n"); 489 | printf("\t-gradsq-file \n"); 490 | printf("\t\tFilename, excluding extension, for squared gradient output; default gradsq\n"); 491 | printf("\t-save-gradsq \n"); 492 | printf("\t\tSave accumulated squared gradients; default 0 (off); ignored if gradsq-file is specified\n"); 493 | printf("\t-checkpoint-every \n"); 494 | printf("\t\tCheckpoint a model every iterations; default 0 (off)\n"); 495 | printf("\t-load-init-param \n"); 496 | printf("\t\tLoad initial parameters from -init-param-file; default 0 (false)\n"); 497 | printf("\t-save-init-param \n"); 498 | printf("\t\tSave initial parameters (i.e., checkpoint the model before any training); default 0 (false)\n"); 499 | printf("\t-init-param-file \n"); 500 | printf("\t\tBinary initial parameters file to be loaded if -load-init-params is 1; (default is 
to look for vectors.000.bin)\n"); 501 | printf("\t-load-init-gradsq \n"); 502 | printf("\t\tLoad initial squared gradients from -init-gradsq-file; default 0 (false)\n"); 503 | printf("\t-init-gradsq-file \n"); 504 | printf("\t\tBinary initial squared gradients file to be loaded if -load-init-gradsq is 1; (default is to look for gradsq.000.bin)\n"); 505 | printf("\t-seed \n"); 506 | printf("\t\tRandom seed to use. If not set, will be randomized using current time."); 507 | printf("\nExample usage:\n"); 508 | printf("./glove -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file vectors -gradsq-file gradsq -verbose 2 -vector-size 100 -threads 16 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 2 -model 2\n\n"); 509 | result = 0; 510 | } else { 511 | if ((i = find_arg((char *)"-write-header", argc, argv)) > 0) write_header = atoi(argv[i + 1]); 512 | if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]); 513 | if ((i = find_arg((char *)"-vector-size", argc, argv)) > 0) vector_size = atoi(argv[i + 1]); 514 | if ((i = find_arg((char *)"-iter", argc, argv)) > 0) num_iter = atoi(argv[i + 1]); 515 | if ((i = find_arg((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 516 | cost = malloc(sizeof(real) * num_threads); 517 | if ((i = find_arg((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 518 | if ((i = find_arg((char *)"-x-max", argc, argv)) > 0) x_max = atof(argv[i + 1]); 519 | if ((i = find_arg((char *)"-eta", argc, argv)) > 0) eta = atof(argv[i + 1]); 520 | if ((i = find_arg((char *)"-grad-clip", argc, argv)) > 0) grad_clip_value = atof(argv[i + 1]); 521 | if ((i = find_arg((char *)"-binary", argc, argv)) > 0) use_binary = atoi(argv[i + 1]); 522 | if ((i = find_arg((char *)"-model", argc, argv)) > 0) model = atoi(argv[i + 1]); 523 | if (model != 0 && model != 1) model = 2; 524 | if ((i = find_arg((char *)"-save-gradsq", argc, argv)) > 0) save_gradsq = atoi(argv[i + 1]); 525 | if ((i = find_arg((char 
*)"-vocab-file", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]); 526 | else strcpy(vocab_file, (char *)"vocab.txt"); 527 | if ((i = find_arg((char *)"-save-file", argc, argv)) > 0) strcpy(save_W_file, argv[i + 1]); 528 | else strcpy(save_W_file, (char *)"vectors"); 529 | if ((i = find_arg((char *)"-gradsq-file", argc, argv)) > 0) { 530 | strcpy(save_gradsq_file, argv[i + 1]); 531 | save_gradsq = 1; 532 | } 533 | else if (save_gradsq > 0) strcpy(save_gradsq_file, (char *)"gradsq"); 534 | if ((i = find_arg((char *)"-input-file", argc, argv)) > 0) strcpy(input_file, argv[i + 1]); 535 | else strcpy(input_file, (char *)"cooccurrence.shuf.bin"); 536 | if ((i = find_arg((char *)"-checkpoint-every", argc, argv)) > 0) checkpoint_every = atoi(argv[i + 1]); 537 | if ((i = find_arg((char *)"-init-param-file", argc, argv)) > 0) strcpy(init_param_file, argv[i + 1]); 538 | else strcpy(init_param_file, (char *)"vectors.000.bin"); 539 | if ((i = find_arg((char *)"-load-init-param", argc, argv)) > 0) load_init_param = atoi(argv[i + 1]); 540 | if ((i = find_arg((char *)"-save-init-param", argc, argv)) > 0) save_init_param = atoi(argv[i + 1]); 541 | if ((i = find_arg((char *)"-init-gradsq-file", argc, argv)) > 0) strcpy(init_gradsq_file, argv[i + 1]); 542 | else strcpy(init_gradsq_file, (char *)"gradsq.000.bin"); 543 | if ((i = find_arg((char *)"-load-init-gradsq", argc, argv)) > 0) load_init_gradsq = atoi(argv[i + 1]); 544 | if ((i = find_arg((char *)"-seed", argc, argv)) > 0) seed = atoi(argv[i + 1]); 545 | 546 | vocab_size = 0; 547 | fid = fopen(vocab_file, "r"); 548 | if (fid == NULL) {log_file_loading_error("vocab file", vocab_file); free(cost); return 1;} 549 | while ((i = getc(fid)) != EOF) if (i == '\n') vocab_size++; // Count number of entries in vocab_file 550 | fclose(fid); 551 | if (vocab_size == 0) {fprintf(stderr, "Unable to find any vocab entries in vocab file %s.\n", vocab_file); free(cost); return 1;} 552 | result = train_glove(); 553 | free(cost); 554 | } 555 | 
free(W); 556 | free(gradsq); 557 | 558 | return result; 559 | } 560 | -------------------------------------------------------------------------------- /src/shuffle.c: -------------------------------------------------------------------------------- 1 | // Tool to shuffle entries of word-word cooccurrence files 2 | // 3 | // Copyright (c) 2014 The Board of Trustees of 4 | // The Leland Stanford Junior University. All Rights Reserved. 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | // 18 | // 19 | // For more information, bug reports, fixes, contact: 20 | // Jeffrey Pennington (jpennin@stanford.edu) 21 | // GlobalVectors@googlegroups.com 22 | // http://nlp.stanford.edu/projects/glove/ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "common.h" 29 | 30 | 31 | static const long LRAND_MAX = ((long) RAND_MAX + 2) * (long)RAND_MAX; 32 | 33 | int verbose = 2; // 0, 1, or 2 34 | int seed = 0; 35 | long long array_size = 2000000; // size of chunks to shuffle individually 36 | char *file_head; // temporary file string 37 | real memory_limit = 2.0; // soft limit, in gigabytes 38 | 39 | /* Generate uniformly distributed random long ints */ 40 | static long rand_long(long n) { 41 | long limit = LRAND_MAX - LRAND_MAX % n; 42 | long rnd; 43 | do { 44 | rnd = ((long)RAND_MAX + 1) * (long)rand() + (long)rand(); 45 | } while (rnd >= limit); 46 | return rnd % n; 47 | } 48 | 49 | /* Write contents of array to binary file */ 50 | int write_chunk(CREC *array, long size, FILE *fout) { 51 | long i = 0; 52 | for (i = 0; i < size; i++) fwrite(&array[i], sizeof(CREC), 1, fout); 53 | return 0; 54 | } 55 | 56 | /* Fisher-Yates shuffle */ 57 | void shuffle(CREC *array, long n) { 58 | long i, j; 59 | CREC tmp; 60 | for (i = n - 1; i > 0; i--) { 61 | j = rand_long(i + 1); 62 | tmp = array[j]; 63 | array[j] = array[i]; 64 | array[i] = tmp; 65 | } 66 | } 67 | 68 | /* Merge shuffled temporary files; doesn't necessarily produce a perfect shuffle, but good enough */ 69 | int shuffle_merge(int num) { 70 | long i, j, k, l = 0; 71 | int fidcounter = 0; 72 | CREC *array; 73 | char filename[MAX_STRING_LENGTH]; 74 | FILE **fid, *fout = stdout; 75 | 76 | array = malloc(sizeof(CREC) * array_size); 77 | fid = calloc(num, sizeof(FILE)); 78 | for (fidcounter = 0; fidcounter < num; fidcounter++) { //num = number of temporary files to merge 79 | sprintf(filename,"%s_%04d.bin",file_head, fidcounter); 80 | fid[fidcounter] = fopen(filename, "rb"); 81 | if 
(fid[fidcounter] == NULL) { 82 | log_file_loading_error("temp file", filename); 83 | free(array); 84 | free_fid(fid, num); 85 | return 1; 86 | } 87 | } 88 | if (verbose > 0) fprintf(stderr, "Merging temp files: processed %ld lines.", l); 89 | 90 | while (1) { //Loop until EOF in all files 91 | i = 0; 92 | //Read at most array_size values into array, roughly array_size/num from each temp file 93 | for (j = 0; j < num; j++) { 94 | if (feof(fid[j])) continue; 95 | for (k = 0; k < array_size / num; k++){ 96 | fread(&array[i], sizeof(CREC), 1, fid[j]); 97 | if (feof(fid[j])) break; 98 | i++; 99 | } 100 | } 101 | if (i == 0) break; 102 | l += i; 103 | shuffle(array, i-1); // Shuffles lines between temp files 104 | write_chunk(array,i,fout); 105 | if (verbose > 0) fprintf(stderr, "\033[31G%ld lines.", l); 106 | } 107 | fprintf(stderr, "\033[0GMerging temp files: processed %ld lines.", l); 108 | for (fidcounter = 0; fidcounter < num; fidcounter++) { 109 | fclose(fid[fidcounter]); 110 | sprintf(filename,"%s_%04d.bin",file_head, fidcounter); 111 | remove(filename); 112 | } 113 | fprintf(stderr, "\n\n"); 114 | free(array); 115 | free(fid); 116 | return 0; 117 | } 118 | 119 | /* Shuffle large input stream by splitting into chunks */ 120 | int shuffle_by_chunks() { 121 | if (seed == 0) { 122 | seed = time(0); 123 | } 124 | fprintf(stderr, "Using random seed %d\n", seed); 125 | srand(seed); 126 | long i = 0, l = 0; 127 | int fidcounter = 0; 128 | char filename[MAX_STRING_LENGTH]; 129 | CREC *array; 130 | FILE *fin = stdin, *fid; 131 | array = malloc(sizeof(CREC) * array_size); 132 | 133 | fprintf(stderr,"SHUFFLING COOCCURRENCES\n"); 134 | if (verbose > 0) fprintf(stderr,"array size: %lld\n", array_size); 135 | sprintf(filename,"%s_%04d.bin",file_head, fidcounter); 136 | fid = fopen(filename,"w"); 137 | if (fid == NULL) { 138 | log_file_loading_error("file", filename); 139 | free(array); 140 | return 1; 141 | } 142 | if (verbose > 1) fprintf(stderr, "Shuffling by chunks: 
processed 0 lines."); 143 | 144 | while (1) { //Continue until EOF 145 | if (i >= array_size) {// If array is full, shuffle it and save to temporary file 146 | shuffle(array, i-2); 147 | l += i; 148 | if (verbose > 1) fprintf(stderr, "\033[22Gprocessed %ld lines.", l); 149 | write_chunk(array,i,fid); 150 | fclose(fid); 151 | fidcounter++; 152 | sprintf(filename,"%s_%04d.bin",file_head, fidcounter); 153 | fid = fopen(filename,"w"); 154 | if (fid == NULL) { 155 | log_file_loading_error("file", filename); 156 | free(array); 157 | return 1; 158 | } 159 | i = 0; 160 | } 161 | fread(&array[i], sizeof(CREC), 1, fin); 162 | if (feof(fin)) break; 163 | i++; 164 | } 165 | shuffle(array, i-2); //Last chunk may be smaller than array_size 166 | write_chunk(array,i,fid); 167 | l += i; 168 | if (verbose > 1) fprintf(stderr, "\033[22Gprocessed %ld lines.\n", l); 169 | if (verbose > 1) fprintf(stderr, "Wrote %d temporary file(s).\n", fidcounter + 1); 170 | fclose(fid); 171 | free(array); 172 | return shuffle_merge(fidcounter + 1); // Merge and shuffle together temporary files 173 | } 174 | 175 | int main(int argc, char **argv) { 176 | int i; 177 | 178 | if (argc == 2 && 179 | (!scmp(argv[1], "-h") || !scmp(argv[1], "-help") || !scmp(argv[1], "--help"))) { 180 | printf("Tool to shuffle entries of word-word cooccurrence files\n"); 181 | printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n"); 182 | printf("Usage options:\n"); 183 | printf("\t-verbose \n"); 184 | printf("\t\tSet verbosity: 0, 1, or 2 (default)\n"); 185 | printf("\t-memory \n"); 186 | printf("\t\tSoft limit for memory consumption, in GB; default 4.0\n"); 187 | printf("\t-array-size \n"); 188 | printf("\t\tLimit to length the buffer which stores chunks of data to shuffle before writing to disk. 
\n\t\tThis value overrides that which is automatically produced by '-memory'.\n"); 189 | printf("\t-temp-file \n"); 190 | printf("\t\tFilename, excluding extension, for temporary files; default temp_shuffle\n"); 191 | printf("\t-seed \n"); 192 | printf("\t\tRandom seed to use. If not set, will be randomized using current time."); 193 | printf("\nExample usage: (assuming 'cooccurrence.bin' has been produced by 'coccur')\n"); 194 | printf("./shuffle -verbose 2 -memory 8.0 < cooccurrence.bin > cooccurrence.shuf.bin\n"); 195 | return 0; 196 | } 197 | 198 | file_head = malloc(sizeof(char) * MAX_STRING_LENGTH); 199 | if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]); 200 | if ((i = find_arg((char *)"-temp-file", argc, argv)) > 0) strcpy(file_head, argv[i + 1]); 201 | else strcpy(file_head, (char *)"temp_shuffle"); 202 | if ((i = find_arg((char *)"-memory", argc, argv)) > 0) memory_limit = atof(argv[i + 1]); 203 | array_size = (long long) (0.95 * (real)memory_limit * 1073741824/(sizeof(CREC))); 204 | if ((i = find_arg((char *)"-array-size", argc, argv)) > 0) array_size = atoll(argv[i + 1]); 205 | if ((i = find_arg((char *)"-seed", argc, argv)) > 0) seed = atoi(argv[i + 1]); 206 | const int returned_value = shuffle_by_chunks(); 207 | free(file_head); 208 | return returned_value; 209 | } 210 | 211 | -------------------------------------------------------------------------------- /src/vocab_count.c: -------------------------------------------------------------------------------- 1 | // Tool to extract unigram counts 2 | // 3 | // GloVe: Global Vectors for Word Representation 4 | // Copyright (c) 2014 The Board of Trustees of 5 | // The Leland Stanford Junior University. All Rights Reserved. 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 
9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // Unless required by applicable law or agreed to in writing, software 14 | // distributed under the License is distributed on an "AS IS" BASIS, 15 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | // See the License for the specific language governing permissions and 17 | // limitations under the License. 18 | // 19 | // 20 | // For more information, bug reports, fixes, contact: 21 | // Jeffrey Pennington (jpennin@stanford.edu) 22 | // Christopher Manning (manning@cs.stanford.edu) 23 | // https://github.com/stanfordnlp/GloVe/ 24 | // GlobalVectors@googlegroups.com 25 | // http://nlp.stanford.edu/projects/glove/ 26 | 27 | #include 28 | #include 29 | #include 30 | #include "common.h" 31 | 32 | typedef struct vocabulary { 33 | char *word; 34 | long long count; 35 | } VOCAB; 36 | 37 | int verbose = 2; // 0, 1, or 2 38 | long long min_count = 1; // min occurrences for inclusion in vocab 39 | long long max_vocab = 0; // max_vocab = 0 for no limit 40 | 41 | 42 | /* Vocab frequency comparison; break ties alphabetically */ 43 | int CompareVocabTie(const void *a, const void *b) { 44 | long long c; 45 | if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 ); 46 | else return (scmp(((VOCAB *) a)->word,((VOCAB *) b)->word)); 47 | 48 | } 49 | 50 | /* Vocab frequency comparison; no tie-breaker */ 51 | int CompareVocab(const void *a, const void *b) { 52 | long long c; 53 | if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 
1 : -1 ); 54 | else return 0; 55 | } 56 | 57 | /* Search hash table for given string, insert if not found */ 58 | void hashinsert(HASHREC **ht, char *w) { 59 | HASHREC *htmp, *hprv; 60 | unsigned int hval = HASHFN(w, TSIZE, SEED); 61 | 62 | for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next); 63 | if (htmp == NULL) { 64 | htmp = (HASHREC *) malloc( sizeof(HASHREC) ); 65 | htmp->word = (char *) malloc( strlen(w) + 1 ); 66 | strcpy(htmp->word, w); 67 | htmp->num = 1; 68 | htmp->next = NULL; 69 | if ( hprv==NULL ) 70 | ht[hval] = htmp; 71 | else 72 | hprv->next = htmp; 73 | } 74 | else { 75 | /* new records are not moved to front */ 76 | htmp->num++; 77 | if (hprv != NULL) { 78 | /* move to front on access */ 79 | hprv->next = htmp->next; 80 | htmp->next = ht[hval]; 81 | ht[hval] = htmp; 82 | } 83 | } 84 | return; 85 | } 86 | 87 | int get_counts() { 88 | long long i = 0, j = 0, vocab_size = 12500; 89 | // char format[20]; 90 | char str[MAX_STRING_LENGTH + 1]; 91 | HASHREC **vocab_hash = inithashtable(); 92 | HASHREC *htmp; 93 | VOCAB *vocab; 94 | FILE *fid = stdin; 95 | 96 | fprintf(stderr, "BUILDING VOCABULARY\n"); 97 | if (verbose > 1) fprintf(stderr, "Processed %lld tokens.", i); 98 | // sprintf(format,"%%%ds",MAX_STRING_LENGTH); 99 | while ( ! feof(fid)) { 100 | // Insert all tokens into hashtable 101 | int nl = get_word(str, fid); 102 | if (nl) continue; // just a newline marker or feof 103 | if (strcmp(str, "") == 0) { 104 | fprintf(stderr, "\nError, vector found in corpus.\nPlease remove s from your corpus (e.g. 
cat text8 | sed -e 's///g' > text8.new)"); 105 | free_table(vocab_hash); 106 | return 1; 107 | } 108 | hashinsert(vocab_hash, str); 109 | if (((++i)%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[11G%lld tokens.", i); 110 | } 111 | if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i); 112 | vocab = malloc(sizeof(VOCAB) * vocab_size); 113 | for (i = 0; i < TSIZE; i++) { // Migrate vocab to array 114 | htmp = vocab_hash[i]; 115 | while (htmp != NULL) { 116 | vocab[j].word = htmp->word; 117 | vocab[j].count = htmp->num; 118 | j++; 119 | if (j>=vocab_size) { 120 | vocab_size += 2500; 121 | vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size); 122 | } 123 | htmp = htmp->next; 124 | } 125 | } 126 | if (verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j); 127 | if (max_vocab > 0 && max_vocab < j) 128 | // If the vocabulary exceeds limit, first sort full vocab by frequency without alphabetical tie-breaks. 129 | // This results in pseudo-random ordering for words with same frequency, so that when truncated, the words span whole alphabet 130 | qsort(vocab, j, sizeof(VOCAB), CompareVocab); 131 | else max_vocab = j; 132 | qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); //After (possibly) truncating, sort (possibly again), breaking ties alphabetically 133 | 134 | for (i = 0; i < max_vocab; i++) { 135 | if (vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary 136 | if (verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n",min_count); 137 | break; 138 | } 139 | printf("%s %lld\n",vocab[i].word,vocab[i].count); 140 | } 141 | 142 | if (i == max_vocab && max_vocab < j) if (verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab); 143 | fprintf(stderr, "Using vocabulary of size %lld.\n\n", i); 144 | free_table(vocab_hash); 145 | free(vocab); 146 | return 0; 147 | } 148 | 149 | int main(int argc, char **argv) { 150 | if (argc == 2 && 151 | 
(!scmp(argv[1], "-h") || !scmp(argv[1], "-help") || !scmp(argv[1], "--help"))) { 152 | printf("Simple tool to extract unigram counts\n"); 153 | printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n"); 154 | printf("Usage options:\n"); 155 | printf("\t-verbose \n"); 156 | printf("\t\tSet verbosity: 0, 1, or 2 (default)\n"); 157 | printf("\t-max-vocab \n"); 158 | printf("\t\tUpper bound on vocabulary size, i.e. keep the most frequent words. The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\n"); 159 | printf("\t-min-count \n"); 160 | printf("\t\tLower limit such that words which occur fewer than times are discarded.\n"); 161 | printf("\nExample usage:\n"); 162 | printf("./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\n"); 163 | return 0; 164 | } 165 | 166 | int i; 167 | if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]); 168 | if ((i = find_arg((char *)"-max-vocab", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]); 169 | if ((i = find_arg((char *)"-min-count", argc, argv)) > 0) min_count = atoll(argv[i + 1]); 170 | return get_counts(); 171 | } 172 | 173 | --------------------------------------------------------------------------------