├── demo-word.sh ├── demo-classes.sh ├── demo-word-accuracy.sh ├── .gitignore ├── demo-analogy.sh ├── demo-phrases.sh ├── demo-phrase-accuracy.sh ├── Makefile ├── win32-port.h ├── README.txt ├── vs2017 │   ├── word2vec.sln │   ├── distance.vcxproj │   ├── word2vec.vcxproj │   ├── word-analogy.vcxproj │   ├── word2phrase.vcxproj │   ├── compute-accuracy.vcxproj │   └── word2vec-doc2vec.vcxproj ├── vs2015 │   ├── word2vec.sln │   ├── distance.vcxproj │   ├── word2vec.vcxproj │   ├── word2phrase.vcxproj │   ├── word-analogy.vcxproj │   ├── compute-accuracy.vcxproj │   └── word2vec-doc2vec.vcxproj ├── distance.c ├── word-analogy.c ├── demo-train-big-model-v1.sh ├── compute-accuracy.c ├── word2phrase.c ├── LICENSE.txt ├── word2vec-doc2vec.c └── word2vec.c /demo-word.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./distance vectors.bin 8 | -------------------------------------------------------------------------------- /demo-classes.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500 7 | sort classes.txt -k 2 -n > classes.sorted.txt 8 | echo The word classes were saved to the file classes.sorted.txt 9 | -------------------------------------------------------------------------------- /demo-word-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 7 | ./compute-accuracy vectors.bin 30000 < questions-words.txt 8 | # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Compiled Object files 4 | *.slo 5 | *.lo 6 | *.o 7 | *.co 8 | *.obj 9 | *.iobj 10 | *.ipdb 11 | *.pdb 12 | 13 | # Visual Studio 14 | *.sdf 15 | *.suo 16 | *.ncb 17 | *.user 18 | Debug 19 | Release 20 | 21 | *.exe 22 | text8 23 | text8-phrase 24 | vectors.bin 25 | vectors-phrase.bin 26 | classes.txt 27 | classes.sorted.txt 28 | 29 | compute-accuracy 30 | distance 31 | word-analogy 32 | word2phrase 33 | word2vec 34 | word2vec-blas 35 | word2vec-doc2vec 36 | *.opendb 37 | -------------------------------------------------------------------------------- /demo-analogy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e text8 ]; then 3 | wget http://mattmahoney.net/dc/text8.zip -O text8.gz 4 | gzip -d text8.gz -f 5 | fi 6 | echo --------------------------------------------------------------------------------------------------- 7 | echo Note that for the word analogy to perform well, the model should be trained on a much larger data set 8 | echo Example input: paris france berlin 9 | echo --------------------------------------------------------------------------------------------------- 10 | time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15 11 | ./word-analogy vectors.bin 12 | -------------------------------------------------------------------------------- /demo-phrases.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! -e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./distance vectors-phrase.bin 12 | -------------------------------------------------------------------------------- /demo-phrase-accuracy.sh: -------------------------------------------------------------------------------- 1 | make 2 | if [ ! 
-e news.2012.en.shuffled ]; then 3 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 4 | gzip -d news.2012.en.shuffled.gz -f 5 | fi 6 | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0 7 | time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2 8 | time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2 9 | tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1 10 | time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15 11 | ./compute-accuracy vectors-phrase.bin < questions-phrases.txt 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | #Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions 3 | CFLAGS = -lm -O3 -march=native -mtune=native -Wall -funroll-loops -Wno-unused-result -DNDEBUG 4 | 5 | all: word2vec word2vec-doc2vec word2phrase distance word-analogy compute-accuracy 6 | 7 | word2vec : word2vec.c 8 | $(CC) word2vec.c -o word2vec $(CFLAGS) -pthread 9 | word2vec-doc2vec : word2vec-doc2vec.c 10 | $(CC) word2vec-doc2vec.c -o word2vec-doc2vec $(CFLAGS) -pthread 11 | word2phrase : word2phrase.c 12 | $(CC) word2phrase.c -o word2phrase $(CFLAGS) 13 | distance : distance.c 14 | $(CC) distance.c -o distance $(CFLAGS) 15 | word-analogy : word-analogy.c 16 | $(CC) word-analogy.c -o word-analogy $(CFLAGS) 17 | compute-accuracy : compute-accuracy.c 18 | $(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS) 19 | word2vec-blas : word2vec.c 20 | $(CC) word2vec.c -o word2vec-blas $(CFLAGS) -pthread -DHAVE_CBLAS=1 -lopenblas 21 | 22 | clean: 23 | rm -f word2vec word2vec-doc2vec word2vec-blas word2phrase distance word-analogy compute-accuracy 24 | rm -f word2vec.exe word2vec-doc2vec.exe word2vec-blas.exe word2phrase.exe distance.exe word-analogy.exe compute-accuracy.exe 25 | -------------------------------------------------------------------------------- /win32-port.h: -------------------------------------------------------------------------------- 1 | #if !defined WIN32_LEAN_AND_MEAN 2 | #define WIN32_LEAN_AND_MEAN 3 | #endif 4 | #include <windows.h> 5 | #include <process.h> 6 | #include <assert.h> 7 | 8 | typedef struct { 9 | void *(*pthread_routine)(void *); 10 | void *pthread_arg; 11 | HANDLE handle; 12 | } pthread_t; 13 | 14 | static unsigned __stdcall win32_start_routine(void *arg) { 15 | pthread_t *p = (pthread_t *)arg; 16 | p->pthread_routine(p->pthread_arg); 17 | return 0; 18 | } 19 | 20 | static int pthread_create(pthread_t *id, void *attr, 21 | void *(*start_routine)(void *), void *arg) { 22 | assert(attr == 0); 23 | id->pthread_routine = start_routine; 24 | id->pthread_arg = arg; 25 | id->handle = 26 | (HANDLE)_beginthreadex(0, 0, win32_start_routine, (void *)id, 0, 0); 27 | if (id->handle != 0) return 0; 28 | return -1; 29 | } 30 | 31 | static int pthread_join(pthread_t thread, void **retval) { 32 | WaitForSingleObject(thread.handle, INFINITE); 33 | if (retval) { 34 | *retval = 0; 35 | } 36 | return 0; 37 | } 38 | 39 | static void pthread_exit(void *p) { _endthreadex(0); } 40 | 41 | static int posix_memalign(void **memptr, size_t 
alignment, size_t size) { 42 | assert(memptr); 43 | *memptr = _aligned_malloc(size, alignment); 44 | if (*memptr) { 45 | return 0; 46 | } else { 47 | return -1; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Tools for computing distributed representation of words 2 | ------------------------------------------------------ 3 | 4 | We provide an implementation of the Continuous Bag-of-Words (CBOW) and the Skip-gram (SG) models, as well as several demo scripts. 5 | 6 | Given a text corpus, the word2vec tool learns a vector for every word in the vocabulary using the Continuous 7 | Bag-of-Words or the Skip-gram neural network architectures. The user should specify the following: 8 | - desired vector dimensionality 9 | - the size of the context window for either the Skip-gram or the Continuous Bag-of-Words model 10 | - training algorithm: hierarchical softmax and/or negative sampling 11 | - threshold for downsampling the frequent words 12 | - number of threads to use 13 | - the format of the output word vector file (text or binary) 14 | 15 | Usually, the other hyper-parameters such as the learning rate do not need to be tuned for different training sets. 16 | 17 | The script demo-word.sh downloads a small (100MB) text corpus from the web, and trains a small word vector model. After the training 18 | is finished, the user can interactively explore the similarity of the words. 19 | 20 | More information about the scripts is provided at https://code.google.com/p/word2vec/ 21 | 22 | ------------------------------------------------------ 23 | In order to get the Python wrapper (https://github.com/danielfrg/word2vec) working, 24 | support for the word2vec-doc2vec tool (https://github.com/nliu86/word2vec-doc2vec) was added
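
As an illustrative invocation (not one of the shipped demo scripts; corpus.txt is a placeholder for any whitespace-separated training text), a typical CBOW training run with negative sampling, mirroring the flag values used in demo-word.sh, looks like:

./word2vec -train corpus.txt -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./distance vectors.bin

Setting -cbow 0 selects the Skip-gram architecture instead, and -binary 0 writes the vectors as plain text.
-------------------------------------------------------------------------------- /vs2017/word2vec.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.26403.3 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word2vec", "word2vec.vcxproj", "{3665E45D-8606-4F60-B864-2AD85FB18CA1}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "distance", "distance.vcxproj", "{FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word2phrase", "word2phrase.vcxproj", "{66CAAE3C-A752-4FD1-BE30-8F65DAD73137}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compute-accuracy", "compute-accuracy.vcxproj", "{78698725-BA8A-410B-9971-2BF28562B2D1}" 13 | EndProject 14 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word-analogy", "word-analogy.vcxproj", "{8C667D06-771F-441F-B94B-4DBE6D5BE3B6}" 15 | EndProject 16 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word2vec-doc2vec", "word2vec-doc2vec.vcxproj", "{4192BAE0-FC98-4AE4-819A-65C0B896C38E}" 17 | EndProject 18 | Global 19 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 20 | Debug|x64 = Debug|x64 21 | Release|x64 = Release|x64 22 | EndGlobalSection 23 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 24 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Debug|x64.ActiveCfg = Debug|x64 25 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Debug|x64.Build.0 = Debug|x64 26 | 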
{3665E45D-8606-4F60-B864-2AD85FB18CA1}.Release|x64.ActiveCfg = Release|x64 27 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Release|x64.Build.0 = Release|x64 28 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Debug|x64.ActiveCfg = Debug|x64 29 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Debug|x64.Build.0 = Debug|x64 30 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Release|x64.ActiveCfg = Release|x64 31 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Release|x64.Build.0 = Release|x64 32 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Debug|x64.ActiveCfg = Debug|x64 33 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Debug|x64.Build.0 = Debug|x64 34 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Release|x64.ActiveCfg = Release|x64 35 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Release|x64.Build.0 = Release|x64 36 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Debug|x64.ActiveCfg = Debug|x64 37 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Debug|x64.Build.0 = Debug|x64 38 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Release|x64.ActiveCfg = Release|x64 39 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Release|x64.Build.0 = Release|x64 40 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Debug|x64.ActiveCfg = Debug|x64 41 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Debug|x64.Build.0 = Debug|x64 42 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Release|x64.ActiveCfg = Release|x64 43 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Release|x64.Build.0 = Release|x64 44 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Debug|x64.ActiveCfg = Debug|x64 45 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Debug|x64.Build.0 = Debug|x64 46 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Release|x64.ActiveCfg = Release|x64 47 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Release|x64.Build.0 = Release|x64 48 | EndGlobalSection 49 | GlobalSection(SolutionProperties) = preSolution 50 | HideSolutionNode = FALSE 51 | EndGlobalSection 52 | EndGlobal 53 | -------------------------------------------------------------------------------- /vs2015/word2vec.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word2vec", "word2vec.vcxproj", "{3665E45D-8606-4F60-B864-2AD85FB18CA1}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "distance", "distance.vcxproj", "{FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word2phrase", "word2phrase.vcxproj", "{66CAAE3C-A752-4FD1-BE30-8F65DAD73137}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compute-accuracy", "compute-accuracy.vcxproj", "{78698725-BA8A-410B-9971-2BF28562B2D1}" 13 | EndProject 14 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word-analogy", "word-analogy.vcxproj", "{8C667D06-771F-441F-B94B-4DBE6D5BE3B6}" 15 | EndProject 16 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "word2vec-doc2vec", "word2vec-doc2vec.vcxproj", "{4192BAE0-FC98-4AE4-819A-65C0B896C38E}" 17 | EndProject 18 | Global 19 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 20 | Debug|Win32 = Debug|Win32 21 | Debug|x64 = Debug|x64 22 | Release|Win32 = Release|Win32 23 | Release|x64 = Release|x64 24 | EndGlobalSection 25 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 26 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Debug|Win32.ActiveCfg = Debug|Win32 27 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Debug|Win32.Build.0 = 
Debug|Win32 28 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Debug|x64.ActiveCfg = Debug|x64 29 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Debug|x64.Build.0 = Debug|x64 30 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Release|Win32.ActiveCfg = Release|Win32 31 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Release|Win32.Build.0 = Release|Win32 32 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Release|x64.ActiveCfg = Release|x64 33 | {3665E45D-8606-4F60-B864-2AD85FB18CA1}.Release|x64.Build.0 = Release|x64 34 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Debug|Win32.ActiveCfg = Debug|Win32 35 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Debug|Win32.Build.0 = Debug|Win32 36 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Debug|x64.ActiveCfg = Debug|x64 37 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Debug|x64.Build.0 = Debug|x64 38 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Release|Win32.ActiveCfg = Release|Win32 39 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Release|Win32.Build.0 = Release|Win32 40 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Release|x64.ActiveCfg = Release|x64 41 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7}.Release|x64.Build.0 = Release|x64 42 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Debug|Win32.ActiveCfg = Debug|Win32 43 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Debug|Win32.Build.0 = Debug|Win32 44 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Debug|x64.ActiveCfg = Debug|x64 45 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Debug|x64.Build.0 = Debug|x64 46 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Release|Win32.ActiveCfg = Release|Win32 47 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Release|Win32.Build.0 = Release|Win32 48 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Release|x64.ActiveCfg = Release|x64 49 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137}.Release|x64.Build.0 = Release|x64 50 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Debug|Win32.ActiveCfg = Debug|Win32 51 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Debug|Win32.Build.0 = Debug|Win32 52 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Debug|x64.ActiveCfg = Debug|x64 53 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Debug|x64.Build.0 = Debug|x64 54 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Release|Win32.ActiveCfg = Release|Win32 55 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Release|Win32.Build.0 = Release|Win32 56 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Release|x64.ActiveCfg = Release|x64 57 | {78698725-BA8A-410B-9971-2BF28562B2D1}.Release|x64.Build.0 = Release|x64 58 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Debug|Win32.ActiveCfg = Debug|Win32 59 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Debug|Win32.Build.0 = Debug|Win32 60 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Debug|x64.ActiveCfg = Debug|x64 61 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Debug|x64.Build.0 = Debug|x64 62 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Release|Win32.ActiveCfg = Release|Win32 63 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Release|Win32.Build.0 = Release|Win32 64 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Release|x64.ActiveCfg = Release|x64 65 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6}.Release|x64.Build.0 = Release|x64 66 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Debug|Win32.ActiveCfg = Debug|Win32 67 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Debug|Win32.Build.0 = Debug|Win32 68 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Debug|x64.ActiveCfg = Debug|x64 69 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Debug|x64.Build.0 = Debug|x64 70 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Release|Win32.ActiveCfg = Release|Win32 71 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Release|Win32.Build.0 = Release|Win32 72 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Release|x64.ActiveCfg 
= Release|x64 73 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E}.Release|x64.Build.0 = Release|x64 74 | EndGlobalSection 75 | GlobalSection(SolutionProperties) = preSolution 76 | HideSolutionNode = FALSE 77 | EndGlobalSection 78 | EndGlobal 79 | -------------------------------------------------------------------------------- /distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | #define max_size 2000 // max length of strings 21 | #define N 40 // number of closest words that will be shown 22 | #define max_w 50 // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char *bestw[N]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf( 35 | "Usage: ./distance <FILE>\nwhere FILE contains word projections in the " 36 | "BINARY FORMAT\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | f = fopen(file_name, "rb"); 41 | if (f == NULL) { 42 | printf("Input file not found\n"); 43 | return -1; 44 | } 45 | fscanf(f, "%lld", &words); 46 | fscanf(f, "%lld", &size); 47 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 48 | for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); 49 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 50 | if (M == NULL) { 51 | printf("Cannot allocate memory: %lld MB %lld %lld\n", 52 | (long long)words * size * sizeof(float) / 1048576, words, size); 53 | return -1; 54 | } 55 | for (b = 0; b < words; b++) { 56 | a = 0; 57 | while (1) { 58 | vocab[b * max_w + a] = fgetc(f); 59 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 60 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 61 | } 62 | vocab[b * max_w + a] = 0; 63 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 64 | len = 0; 65 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 66 | len = sqrt(len); 67 | for (a = 0; a < size; a++) M[a + b * size] /= len; // normalize to unit length; dot products below are then cosine similarities 68 | } 69 | fclose(f); 70 | while (1) { 71 | for (a = 0; a < N; a++) bestd[a] = 0; 72 | for (a = 0; a < N; a++) bestw[a][0] = 0; 73 | printf("Enter word or sentence (EXIT to break): "); 74 | a = 0; 75 | while (1) { 76 | st1[a] = fgetc(stdin); 77 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 78 | st1[a] = 0; 79 | break; 80 | } 81 | a++; 82 | } 83 | if (!strcmp(st1, "EXIT")) break; 84 | cn = 0; 85 | b = 0; 86 | c = 0; 87 | while (1) { 88 | st[cn][b] = st1[c]; 89 | b++; 90 | c++; 91 | st[cn][b] = 0; 92 | if (st1[c] == 0) break; 93 | if (st1[c] == ' ') { 94 | cn++; 95 | b = 0; 96 | c++; 97 | } 98 | } 99 | cn++; 100 | for (a = 0; a < cn; a++) { 101 | for (b = 0; b < words; 
b++) 102 | if (!strcmp(&vocab[b * max_w], st[a])) break; 103 | if (b == words) b = -1; 104 | bi[a] = b; 105 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 106 | if (b == -1) { 107 | printf("Out of dictionary word!\n"); 108 | break; 109 | } 110 | } 111 | if (b == -1) continue; 112 | printf( 113 | "\n Word Cosine " 114 | "distance\n------------------------------------------------------------" 115 | "------------\n"); 116 | for (a = 0; a < size; a++) vec[a] = 0; 117 | for (b = 0; b < cn; b++) { 118 | if (bi[b] == -1) continue; 119 | for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; 120 | } 121 | len = 0; 122 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 123 | len = sqrt(len); 124 | for (a = 0; a < size; a++) vec[a] /= len; 125 | for (a = 0; a < N; a++) bestd[a] = -1; 126 | for (a = 0; a < N; a++) bestw[a][0] = 0; 127 | for (c = 0; c < words; c++) { 128 | a = 0; 129 | for (b = 0; b < cn; b++) 130 | if (bi[b] == c) a = 1; 131 | if (a == 1) continue; 132 | dist = 0; 133 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 134 | for (a = 0; a < N; a++) { 135 | if (dist > bestd[a]) { 136 | for (d = N - 1; d > a; d--) { 137 | bestd[d] = bestd[d - 1]; 138 | strcpy(bestw[d], bestw[d - 1]); 139 | } 140 | bestd[a] = dist; 141 | strcpy(bestw[a], &vocab[c * max_w]); 142 | break; 143 | } 144 | } 145 | } 146 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 147 | } 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /word-analogy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
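//
// word-analogy answers questions of the form "A is to B as C is to ?".
// For input words A B C it forms the target vector vec = B - A + C from the
// unit-length word vectors loaded below and prints the vocabulary words whose
// vectors have the largest dot product (cosine similarity) with vec. For
// example, the input "paris france berlin" searches near
// france - paris + berlin, which a well-trained model should rank close to
// "germany".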
14 | 15 | #include <stdio.h> 16 | #include <string.h> 17 | #include <math.h> 18 | #include <malloc.h> 19 | 20 | #define max_size 2000 // max length of strings 21 | #define N 40 // number of closest words that will be shown 22 | #define max_w 50 // max length of vocabulary entries 23 | 24 | int main(int argc, char **argv) { 25 | FILE *f; 26 | char st1[max_size]; 27 | char bestw[N][max_size]; 28 | char file_name[max_size], st[100][max_size]; 29 | float dist, len, bestd[N], vec[max_size]; 30 | long long words, size, a, b, c, d, cn, bi[100]; 31 | float *M; 32 | char *vocab; 33 | if (argc < 2) { 34 | printf( 35 | "Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in " 36 | "the BINARY FORMAT\n"); 37 | return 0; 38 | } 39 | strcpy(file_name, argv[1]); 40 | f = fopen(file_name, "rb"); 41 | if (f == NULL) { 42 | printf("Input file not found\n"); 43 | return -1; 44 | } 45 | fscanf(f, "%lld", &words); 46 | fscanf(f, "%lld", &size); 47 | vocab = (char *)malloc((long long)words * max_w * sizeof(char)); 48 | M = (float *)malloc((long long)words * (long long)size * sizeof(float)); 49 | if (M == NULL) { 50 | printf("Cannot allocate memory: %lld MB %lld %lld\n", 51 | (long long)words * size * sizeof(float) / 1048576, words, size); 52 | return -1; 53 | } 54 | for (b = 0; b < words; b++) { 55 | a = 0; 56 | while (1) { 57 | vocab[b * max_w + a] = fgetc(f); 58 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 59 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 60 | } 61 | vocab[b * max_w + a] = 0; 62 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 63 | len = 0; 64 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 65 | len = sqrt(len); 66 | for (a = 0; a < size; a++) M[a + b * size] /= len; 67 | } 68 | fclose(f); 69 | while (1) { 70 | for (a = 0; a < N; a++) bestd[a] = 0; 71 | for (a = 0; a < N; a++) bestw[a][0] = 0; 72 | printf("Enter three words (EXIT to break): "); 73 | a = 0; 74 | while (1) { 75 | st1[a] = fgetc(stdin); 76 | if ((st1[a] == '\n') || (a >= max_size - 1)) { 77 | st1[a] = 0; 78 | break; 79 | } 80 | a++; 81 | } 82 | if (!strcmp(st1, "EXIT")) break; 83 | cn = 0; 84 | b = 0; 85 | c = 0; 86 | while (1) { 87 | st[cn][b] = st1[c]; 88 | b++; 89 | c++; 90 | st[cn][b] = 0; 91 | if (st1[c] == 0) break; 92 | if (st1[c] == ' ') { 93 | cn++; 94 | b = 0; 95 | c++; 96 | } 97 | } 98 | cn++; 99 | if (cn < 3) { 100 | printf( 101 | "Only %lld words were entered.. 
three words are needed at the input " 102 | "to perform the calculation\n", 103 | cn); 104 | continue; 105 | } 106 | for (a = 0; a < cn; a++) { 107 | for (b = 0; b < words; b++) 108 | if (!strcmp(&vocab[b * max_w], st[a])) break; 109 | if (b == words) b = 0; 110 | bi[a] = b; 111 | printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); 112 | if (b == 0) { 113 | printf("Out of dictionary word!\n"); 114 | break; 115 | } 116 | } 117 | if (b == 0) continue; 118 | printf( 119 | "\n Word " 120 | "Distance\n------------------------------------------------------------" 121 | "------------\n"); 122 | for (a = 0; a < size; a++) 123 | vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size]; 124 | len = 0; 125 | for (a = 0; a < size; a++) len += vec[a] * vec[a]; 126 | len = sqrt(len); 127 | for (a = 0; a < size; a++) vec[a] /= len; 128 | for (a = 0; a < N; a++) bestd[a] = 0; 129 | for (a = 0; a < N; a++) bestw[a][0] = 0; 130 | for (c = 0; c < words; c++) { 131 | if (c == bi[0]) continue; 132 | if (c == bi[1]) continue; 133 | if (c == bi[2]) continue; 134 | a = 0; 135 | for (b = 0; b < cn; b++) 136 | if (bi[b] == c) a = 1; 137 | if (a == 1) continue; 138 | dist = 0; 139 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 140 | for (a = 0; a < N; a++) { 141 | if (dist > bestd[a]) { 142 | for (d = N - 1; d > a; d--) { 143 | bestd[d] = bestd[d - 1]; 144 | strcpy(bestw[d], bestw[d - 1]); 145 | } 146 | bestd[a] = dist; 147 | strcpy(bestw[a], &vocab[c * max_w]); 148 | break; 149 | } 150 | } 151 | } 152 | for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]); 153 | } 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /demo-train-big-model-v1.sh: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Script for training a good word and phrase vector model using public corpora, version 1.0. 4 | # The training time will be from several hours to about a day. 5 | # 6 | # Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains 7 | # a 500-dimensional vector model and evaluates it on word and phrase analogy tasks. 8 | # 9 | ############################################################################################### 10 | 11 | # This function will convert text to lowercase and remove special characters 12 | normalize_text() { 13 | awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ 14 | -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ 15 | -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ 16 | -e 's/«/ /g' | tr 0-9 " " 17 | } 18 | 19 | mkdir word2vec 20 | cd word2vec 21 | 22 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz 23 | wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz 24 | gzip -d news.2012.en.shuffled.gz 25 | gzip -d news.2013.en.shuffled.gz 26 | normalize_text < news.2012.en.shuffled > data.txt 27 | normalize_text < news.2013.en.shuffled >> data.txt 28 | 29 | wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz 30 | tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz 31 | for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do 32 | normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt 33 | done 34 | 35 | wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus 36 | tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt 37 | for i in `ls webbase_all`; do 38 | normalize_text < webbase_all/$i >> data.txt 39 | done 40 | 41 | wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 42 | bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e ' 43 | # Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase 44 | # letters (a-z, converted from A-Z), and spaces (never consecutive). 45 | # All other characters are converted to spaces. Only text which normally appears 46 | # in the web browser is displayed. Tables are removed. Image captions are 47 | # preserved. Links are converted to normal text. Digits are spelled out. 48 | # *** Modified to not spell digits or throw away non-ASCII characters *** 49 | 50 | # Written by Matt Mahoney, June 10, 2006. This program is released to the public domain. 51 | 52 | $/=">"; # input record separator 53 | while (<>) { 54 | if (/<text /) {$text=1;} # remove all but between <text> ... </text> 55 | if (/#redirect/i) {$text=0;} # remove #REDIRECT 56 | if ($text) { 57 | 58 | # Remove any text not normally visible 59 | if (/<\/text>/) {$text=0;} 60 | s/<.*>//; # remove xml tags 61 | s/&amp;/&/g; # decode URL encoded chars 62 | s/&lt;/</g; 63 | s/&gt;/>/g; 64 | s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref> 
65 | s/<[^>]*>//g; # remove xhtml tags 66 | s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text 67 | s/\|thumb//ig; # remove images links, preserve caption 68 | s/\|left//ig; 69 | s/\|right//ig; 70 | s/\|\d+px//ig; 71 | s/\[\[image:[^\[\]]*\|//ig; 72 | s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup 73 | s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages 74 | s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text 75 | s/{{[^}]*}}//g; # remove {{icons}} and {tables} 76 | s/{[^}]*}//g; 77 | s/\[//g; # remove [ and ] 78 | s/\]//g; 79 | s/&[^;]*;/ /g; # remove URL encoded chars 80 | 81 | $_=" $_ "; 82 | chop; 83 | print $_; 84 | } 85 | } 86 | ' | normalize_text | awk '{if (NF>1) print;}' >> data.txt 87 | 88 | wget http://word2vec.googlecode.com/svn/trunk/word2vec.c 89 | wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c 90 | wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c 91 | wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt 92 | wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt 93 | gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops 94 | gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops 95 | gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops 96 | ./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2 97 | ./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2 98 | ./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10 99 | ./compute-accuracy vectors.bin 400000 < questions-words.txt # should get to almost 78% accuracy on 99.7% of questions 100 | ./compute-accuracy vectors.bin 1000000 < questions-phrases.txt # about 78% accuracy with 77% coverage 101 | -------------------------------------------------------------------------------- /vs2017/distance.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15 | 16 | 17 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7} 18 | Win32Proj 19 | test 20 | distance 21 | 10.0.14393.0 22 | 23 | 24 | 25 | Application 26 | true 27 | v141 28 | NotSet 29 | 30 | 31 | Application 32 | false 33 | v141 34 | true 35 | NotSet 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | false 49 | $(SolutionDir)\$(Configuration)\ 50 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 51 | 52 | 53 | false 54 | $(SolutionDir)\$(Configuration)\ 55 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 56 | 57 | 58 | 59 | 60 | 61 | TurnOffAllWarnings 62 | Disabled 63 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 64 | ProgramDatabase 65 | MultiThreadedDebug 66 | 67 | 68 | 69 | 70 | %(AdditionalIncludeDirectories) 71 | Default 72 | 73 | 74 | 75 | 76 | Console 77 | true 78 | %(AdditionalDependencies) 79 | 80 | 81 | 82 | 83 | TurnOffAllWarnings 84 | 85 | 86 | MaxSpeed 87 | true 88 | true 89 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Default 93 | %(AdditionalIncludeDirectories) 94 | MultiThreaded 95 | 96 | 97 | Console 98 | true 99 | true 100 | true 101 | %(AdditionalDependencies) 102 | $(OutDir)$(TargetName)$(TargetExt) 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /vs2017/word2vec.vcxproj: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15 | 16 | 17 | {3665E45D-8606-4F60-B864-2AD85FB18CA1} 18 | Win32Proj 19 | test 20 | word2vec 21 | 10.0.14393.0 22 | 23 | 24 | 25 | Application 26 | true 27 | v141 28 | NotSet 29 | 30 | 31 | Application 32 | false 33 | v141 34 | true 35 | NotSet 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | false 49 | $(SolutionDir)\$(Configuration)\ 50 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 51 | 52 | 53 | false 54 | $(SolutionDir)\$(Configuration)\ 55 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 56 | 57 | 58 | 59 | 60 | 61 | TurnOffAllWarnings 62 | Disabled 63 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 64 | ProgramDatabase 65 | MultiThreadedDebug 66 | 67 | 68 | 69 | 70 | %(AdditionalIncludeDirectories) 71 | Default 72 | 73 | 74 | 75 | 76 | Console 77 | true 78 | %(AdditionalDependencies) 79 | 80 | 81 | 82 | 83 | TurnOffAllWarnings 84 | 85 | 86 | MaxSpeed 87 | true 88 | true 89 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Default 93 | %(AdditionalIncludeDirectories) 94 | MultiThreaded 95 | 96 | 97 | Console 98 | true 99 | true 100 | true 101 | %(AdditionalDependencies) 102 | $(OutDir)$(TargetName)$(TargetExt) 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /vs2017/word-analogy.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15 | 16 | 17 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6} 18 | Win32Proj 19 | test 20 | word-analogy 21 | 10.0.14393.0 22 | 23 | 24 | 25 | Application 26 | true 27 | v141 28 | NotSet 29 | 30 | 31 | Application 32 | false 33 | v141 34 | true 35 | NotSet 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | false 49 | $(SolutionDir)\$(Configuration)\ 50 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 51 | 52 | 53 | false 54 | $(SolutionDir)\$(Configuration)\ 55 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 56 | 57 | 58 | 59 | 60 | 61 | TurnOffAllWarnings 62 | Disabled 63 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 64 | ProgramDatabase 65 | MultiThreadedDebug 66 | 67 | 68 | 69 | 70 | %(AdditionalIncludeDirectories) 71 | Default 72 | 73 | 74 | 75 | 76 | Console 77 | true 78 | %(AdditionalDependencies) 79 | 80 | 81 | 82 | 83 | TurnOffAllWarnings 84 | 85 | 86 | MaxSpeed 87 | true 88 | true 89 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Default 93 | %(AdditionalIncludeDirectories) 94 | MultiThreaded 95 | 96 | 97 | Console 98 | true 99 | true 100 | true 101 | %(AdditionalDependencies) 102 | $(OutDir)$(TargetName)$(TargetExt) 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /vs2017/word2phrase.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15 | 16 | 17 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137} 18 | Win32Proj 19 | test 20 | word2phrase 21 | 10.0.14393.0 22 | 23 | 24 | 25 | Application 26 | true 27 | v141 28 | NotSet 29 | 30 | 31 | Application 32 | false 33 | v141 34 | true 35 | NotSet 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | false 49 | $(SolutionDir)\$(Configuration)\ 50 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 51 | 52 | 53 | false 54 | 
$(SolutionDir)\$(Configuration)\ 55 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 56 | 57 | 58 | 59 | 60 | 61 | TurnOffAllWarnings 62 | Disabled 63 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 64 | ProgramDatabase 65 | MultiThreadedDebug 66 | 67 | 68 | 69 | 70 | %(AdditionalIncludeDirectories) 71 | Default 72 | 73 | 74 | 75 | 76 | Console 77 | true 78 | %(AdditionalDependencies) 79 | 80 | 81 | 82 | 83 | TurnOffAllWarnings 84 | 85 | 86 | MaxSpeed 87 | true 88 | true 89 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Default 93 | %(AdditionalIncludeDirectories) 94 | MultiThreaded 95 | 96 | 97 | Console 98 | true 99 | true 100 | true 101 | %(AdditionalDependencies) 102 | $(OutDir)$(TargetName)$(TargetExt) 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /vs2017/compute-accuracy.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15 | 16 | 17 | {78698725-BA8A-410B-9971-2BF28562B2D1} 18 | Win32Proj 19 | test 20 | compute-accuracy 21 | 10.0.14393.0 22 | 23 | 24 | 25 | Application 26 | true 27 | v141 28 | NotSet 29 | 30 | 31 | Application 32 | false 33 | v141 34 | true 35 | NotSet 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | false 49 | $(SolutionDir)\$(Configuration)\ 50 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 51 | 52 | 53 | false 54 | $(SolutionDir)\$(Configuration)\ 55 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 56 | 57 | 58 | 59 | 60 | 61 | TurnOffAllWarnings 62 | Disabled 63 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 64 | ProgramDatabase 65 | MultiThreadedDebug 66 | 67 | 68 | 69 | 70 | %(AdditionalIncludeDirectories) 71 | Default 72 | 73 | 74 | 75 | 76 | Console 77 | true 78 | %(AdditionalDependencies) 79 | 80 | 81 | 82 | 83 | TurnOffAllWarnings 84 | 85 | 86 | MaxSpeed 87 | true 88 | true 89 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Default 93 | %(AdditionalIncludeDirectories) 94 | MultiThreaded 95 | 96 | 97 | Console 98 | true 99 | true 100 | true 101 | %(AdditionalDependencies) 102 | $(OutDir)$(TargetName)$(TargetExt) 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /vs2017/word2vec-doc2vec.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | 15 | 16 | 17 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E} 18 | Win32Proj 19 | test 20 | word2vec-doc2vec 21 | 10.0.14393.0 22 | 23 | 24 | 25 | Application 26 | true 27 | v141 28 | NotSet 29 | 30 | 31 | Application 32 | false 33 | v141 34 | true 35 | NotSet 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | false 49 | $(SolutionDir)\$(Configuration)\ 50 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 51 | 52 | 53 | false 54 | $(SolutionDir)\$(Configuration)\ 55 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 56 | 57 | 58 | 59 | 60 | 61 | TurnOffAllWarnings 62 | Disabled 63 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 64 | ProgramDatabase 65 | MultiThreadedDebug 66 | 67 | 68 | 69 | 70 | %(AdditionalIncludeDirectories) 71 | Default 72 | 73 | 74 | 75 | 76 | Console 77 | true 78 | %(AdditionalDependencies) 79 | 80 | 81 | 82 | 83 | TurnOffAllWarnings 84 | 85 | 86 | MaxSpeed 87 | true 88 | true 89 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 90 | 91 | 92 | Default 93 | %(AdditionalIncludeDirectories) 94 | 
MultiThreaded 95 | 96 | 97 | Console 98 | true 99 | true 100 | true 101 | %(AdditionalDependencies) 102 | $(OutDir)$(TargetName)$(TargetExt) 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /compute-accuracy.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <malloc.h> 20 | #include <ctype.h> 21 | 22 | const long long max_size = 2000; // max length of strings 23 | const long long N = 1; // number of closest words 24 | const long long max_w = 50; // max length of vocabulary entries 25 | 26 | int main(int argc, char **argv) { 27 | FILE *f; 28 | char st1[max_size], st2[max_size], st3[max_size], st4[max_size], 29 | bestw[N][max_size], file_name[max_size]; 30 | float dist, len, bestd[N], vec[max_size]; 31 | long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; 32 | float *M; 33 | char *vocab; 34 | int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, 35 | QID = 0, TQ = 0, TQS = 0; 36 | if (argc < 2) { 37 | printf( 38 | "Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains " 39 | "word projections, and threshold is used to reduce vocabulary of the " 40 | "model for fast approximate evaluation (0 = off, otherwise typical " 41 | "value is 30000)\n"); 42 | return 0; 43 | } 44 | strcpy(file_name, argv[1]); 45 | if (argc > 2) threshold = atoi(argv[2]); 46 | f = fopen(file_name, "rb"); 47 | if (f == NULL) { 48 | printf("Input file not found\n"); 49 | return -1; 50 | } 51 | fscanf(f, "%lld", &words); 52 | if (threshold) 53 | if (words > threshold) words = threshold; 54 | fscanf(f, "%lld", &size); 55 | vocab = (char *)malloc(words * max_w * sizeof(char)); 56 | M = (float *)malloc(words * size * sizeof(float)); 57 | if (M == NULL) { 58 | printf("Cannot allocate memory: %lld MB\n", 59 | words * size * sizeof(float) / 1048576); 60 | return -1; 61 | } 62 | for (b = 0; b < words; b++) { 63 | a = 0; 64 | while (1) { 65 | vocab[b * max_w + a] = fgetc(f); 66 | if (feof(f) || (vocab[b * max_w + a] == ' ')) break; 67 | if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; 68 | } 69 | vocab[b * max_w + a] = 0; 70 | for (a = 0; a < max_w; a++) 71 | vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); 72 | for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); 73 | len = 0; 74 | for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; 75 | len = sqrt(len); 76 | for (a = 0; a < size; a++) M[a + b * size] /= len; 77 | } 78 | fclose(f); 79 | TCN = 0; 80 | while (1) { 81 | for (a = 0; a < N; a++) bestd[a] = 0; 82 | for (a = 0; a < N; a++) bestw[a][0] = 0; 83 | scanf("%s", st1); 84 | for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]); 85 | if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) { 86 | if (TCN == 0) TCN = 1; 87 | if (QID != 0) { 
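// a ":" line in the questions file starts a new question group (and EOF ends
// the last one); before moving on, report top-1 accuracy for the group just
// finished along with the running semantic/syntactic totals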
88 | printf("ACCURACY TOP1: %.2f %% (%d / %d)\n", CCN / (float)TCN * 100, 89 | CCN, TCN); 90 | printf( 91 | "Total accuracy: %.2f %% Semantic accuracy: %.2f %% Syntactic " 92 | "accuracy: %.2f %% \n", 93 | CACN / (float)TACN * 100, SEAC / (float)SECN * 100, 94 | SYAC / (float)SYCN * 100); 95 | } 96 | QID++; 97 | scanf("%s", st1); 98 | if (feof(stdin)) break; 99 | printf("%s:\n", st1); 100 | TCN = 0; 101 | CCN = 0; 102 | continue; 103 | } 104 | if (!strcmp(st1, "EXIT")) break; 105 | scanf("%s", st2); 106 | for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]); 107 | scanf("%s", st3); 108 | for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]); 109 | scanf("%s", st4); 110 | for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]); 111 | for (b = 0; b < words; b++) 112 | if (!strcmp(&vocab[b * max_w], st1)) break; 113 | b1 = b; 114 | for (b = 0; b < words; b++) 115 | if (!strcmp(&vocab[b * max_w], st2)) break; 116 | b2 = b; 117 | for (b = 0; b < words; b++) 118 | if (!strcmp(&vocab[b * max_w], st3)) break; 119 | b3 = b; 120 | for (a = 0; a < N; a++) bestd[a] = 0; 121 | for (a = 0; a < N; a++) bestw[a][0] = 0; 122 | TQ++; 123 | if (b1 == words) continue; 124 | if (b2 == words) continue; 125 | if (b3 == words) continue; 126 | for (b = 0; b < words; b++) 127 | if (!strcmp(&vocab[b * max_w], st4)) break; 128 | if (b == words) continue; 129 | for (a = 0; a < size; a++) 130 | vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size]; 131 | TQS++; 132 | for (c = 0; c < words; c++) { 133 | if (c == b1) continue; 134 | if (c == b2) continue; 135 | if (c == b3) continue; 136 | dist = 0; 137 | for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; 138 | for (a = 0; a < N; a++) { 139 | if (dist > bestd[a]) { 140 | for (d = N - 1; d > a; d--) { 141 | bestd[d] = bestd[d - 1]; 142 | strcpy(bestw[d], bestw[d - 1]); 143 | } 144 | bestd[a] = dist; 145 | strcpy(bestw[a], &vocab[c * max_w]); 146 | break; 147 | } 148 | } 149 | } 150 | if (!strcmp(st4, bestw[0])) { 151 | CCN++; 152 | CACN++; 153 | if (QID <= 5) 154 | SEAC++; 155 | else 156 | SYAC++; 157 | } 158 | if (QID <= 5) 159 | SECN++; 160 | else 161 | SYCN++; 162 | TCN++; 163 | TACN++; 164 | } 165 | printf("Questions seen / total: %d %d %.2f %% \n", TQS, TQ, 166 | TQS / (float)TQ * 100); 167 | return 0; 168 | } 169 | -------------------------------------------------------------------------------- /vs2015/distance.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | {FE73759F-BDF7-4118-9CC9-BA2AE2D7D9A7} 26 | Win32Proj 27 | test 28 | distance 29 | 10.0.14393.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v141 36 | NotSet 37 | 38 | 39 | Application 40 | true 41 | v141 42 | NotSet 43 | 44 | 45 | Application 46 | false 47 | v140 48 | true 49 | NotSet 50 | 51 | 52 | Application 53 | false 54 | v140 55 | true 56 | NotSet 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | false 76 | $(SolutionDir)\$(Configuration)\ 77 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 78 | 79 | 80 | false 81 | $(SolutionDir)\$(Configuration)\ 82 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 83 | 84 | 85 | false 86 | $(SolutionDir)\$(Configuration)\ 87 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 88 | 89 | 90 | false 91 | $(SolutionDir)\$(Configuration)\ 92 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 
93 | 94 | 95 | 96 | 97 | 98 | TurnOffAllWarnings 99 | Disabled 100 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 101 | ProgramDatabase 102 | MultiThreadedDebug 103 | 104 | 105 | %(AdditionalIncludeDirectories) 106 | Default 107 | 108 | 109 | 110 | 111 | Console 112 | true 113 | %(AdditionalDependencies) 114 | 115 | 116 | 117 | 118 | 119 | 120 | TurnOffAllWarnings 121 | Disabled 122 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 123 | ProgramDatabase 124 | MultiThreadedDebug 125 | 126 | 127 | 128 | 129 | %(AdditionalIncludeDirectories) 130 | Default 131 | 132 | 133 | 134 | 135 | Console 136 | true 137 | %(AdditionalDependencies) 138 | 139 | 140 | 141 | 142 | TurnOffAllWarnings 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | 150 | 151 | Default 152 | %(AdditionalIncludeDirectories) 153 | MultiThreaded 154 | 155 | 156 | Console 157 | true 158 | true 159 | true 160 | %(AdditionalDependencies) 161 | ..\$(TargetName)$(TargetExt) 162 | 163 | 164 | 165 | 166 | TurnOffAllWarnings 167 | 168 | 169 | MaxSpeed 170 | true 171 | true 172 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 173 | 174 | 175 | Default 176 | %(AdditionalIncludeDirectories) 177 | MultiThreaded 178 | 179 | 180 | Console 181 | true 182 | true 183 | true 184 | %(AdditionalDependencies) 185 | ..\$(TargetName)$(TargetExt) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /vs2015/word2vec.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | {3665E45D-8606-4F60-B864-2AD85FB18CA1} 26 | Win32Proj 27 | test 28 | word2vec 29 | 10.0.14393.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v141 36 | NotSet 37 | 38 | 39 | Application 40 | true 41 | v141 42 | NotSet 43 | 44 | 45 | Application 46 | false 47 | v140 48 | true 49 | NotSet 50 | 51 | 52 | Application 53 | false 54 | v140 55 | true 56 | NotSet 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | false 76 | $(SolutionDir)\$(Configuration)\ 77 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 78 | 79 | 80 | false 81 | $(SolutionDir)\$(Configuration)\ 82 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 83 | 84 | 85 | false 86 | $(SolutionDir)\$(Configuration)\ 87 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 88 | 89 | 90 | false 91 | $(SolutionDir)\$(Configuration)\ 92 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 93 | 94 | 95 | 96 | 97 | 98 | TurnOffAllWarnings 99 | Disabled 100 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 101 | ProgramDatabase 102 | MultiThreadedDebug 103 | 104 | 105 | %(AdditionalIncludeDirectories) 106 | Default 107 | 108 | 109 | 110 | 111 | Console 112 | true 113 | %(AdditionalDependencies) 114 | 115 | 116 | 117 | 118 | 119 | 120 | TurnOffAllWarnings 121 | Disabled 122 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 123 | ProgramDatabase 124 | MultiThreadedDebug 125 | 126 | 127 | 128 | 129 | %(AdditionalIncludeDirectories) 130 | Default 131 | 132 | 133 | 134 | 135 | Console 136 | true 137 | %(AdditionalDependencies) 138 | 139 | 140 | 141 | 142 | TurnOffAllWarnings 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | 150 | 151 | Default 152 | %(AdditionalIncludeDirectories) 153 | MultiThreaded 154 | 155 | 156 | Console 157 | true 158 | true 159 | true 160 | 
%(AdditionalDependencies) 161 | ..\$(TargetName)$(TargetExt) 162 | 163 | 164 | 165 | 166 | TurnOffAllWarnings 167 | 168 | 169 | MaxSpeed 170 | true 171 | true 172 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 173 | 174 | 175 | Default 176 | %(AdditionalIncludeDirectories) 177 | MultiThreaded 178 | 179 | 180 | Console 181 | true 182 | true 183 | true 184 | %(AdditionalDependencies) 185 | ..\$(TargetName)$(TargetExt) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /vs2015/word2phrase.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | {66CAAE3C-A752-4FD1-BE30-8F65DAD73137} 26 | Win32Proj 27 | test 28 | word2phrase 29 | 10.0.14393.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v141 36 | NotSet 37 | 38 | 39 | Application 40 | true 41 | v141 42 | NotSet 43 | 44 | 45 | Application 46 | false 47 | v140 48 | true 49 | NotSet 50 | 51 | 52 | Application 53 | false 54 | v140 55 | true 56 | NotSet 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | false 76 | $(SolutionDir)\$(Configuration)\ 77 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 78 | 79 | 80 | false 81 | $(SolutionDir)\$(Configuration)\ 82 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 83 | 84 | 85 | false 86 | $(SolutionDir)\$(Configuration)\ 87 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 88 | 89 | 90 | false 91 | $(SolutionDir)\$(Configuration)\ 92 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 93 | 94 | 95 | 96 | 97 | 98 | TurnOffAllWarnings 99 | Disabled 100 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 101 | ProgramDatabase 102 | MultiThreadedDebug 103 | 104 | 105 | %(AdditionalIncludeDirectories) 106 | Default 107 | 108 | 109 | 110 | 111 | Console 112 | true 113 | %(AdditionalDependencies) 114 | 115 | 116 | 117 | 118 | 119 | 120 | TurnOffAllWarnings 121 | Disabled 122 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 123 | ProgramDatabase 124 | MultiThreadedDebug 125 | 126 | 127 | 128 | 129 | %(AdditionalIncludeDirectories) 130 | Default 131 | 132 | 133 | 134 | 135 | Console 136 | true 137 | %(AdditionalDependencies) 138 | 139 | 140 | 141 | 142 | TurnOffAllWarnings 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | 150 | 151 | Default 152 | %(AdditionalIncludeDirectories) 153 | MultiThreaded 154 | 155 | 156 | Console 157 | true 158 | true 159 | true 160 | %(AdditionalDependencies) 161 | ..\$(TargetName)$(TargetExt) 162 | 163 | 164 | 165 | 166 | TurnOffAllWarnings 167 | 168 | 169 | MaxSpeed 170 | true 171 | true 172 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 173 | 174 | 175 | Default 176 | %(AdditionalIncludeDirectories) 177 | MultiThreaded 178 | 179 | 180 | Console 181 | true 182 | true 183 | true 184 | %(AdditionalDependencies) 185 | ..\$(TargetName)$(TargetExt) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /vs2015/word-analogy.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | {8C667D06-771F-441F-B94B-4DBE6D5BE3B6} 26 | Win32Proj 27 | test 28 | word-analogy 29 | 10.0.14393.0 30 | 31 | 32 | 
33 | Application 34 | true 35 | v141 36 | NotSet 37 | 38 | 39 | Application 40 | true 41 | v141 42 | NotSet 43 | 44 | 45 | Application 46 | false 47 | v140 48 | true 49 | NotSet 50 | 51 | 52 | Application 53 | false 54 | v140 55 | true 56 | NotSet 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | false 76 | $(SolutionDir)\$(Configuration)\ 77 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 78 | 79 | 80 | false 81 | $(SolutionDir)\$(Configuration)\ 82 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 83 | 84 | 85 | false 86 | $(SolutionDir)\$(Configuration)\ 87 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 88 | 89 | 90 | false 91 | $(SolutionDir)\$(Configuration)\ 92 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 93 | 94 | 95 | 96 | 97 | 98 | TurnOffAllWarnings 99 | Disabled 100 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 101 | ProgramDatabase 102 | MultiThreadedDebug 103 | 104 | 105 | %(AdditionalIncludeDirectories) 106 | Default 107 | 108 | 109 | 110 | 111 | Console 112 | true 113 | %(AdditionalDependencies) 114 | 115 | 116 | 117 | 118 | 119 | 120 | TurnOffAllWarnings 121 | Disabled 122 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 123 | ProgramDatabase 124 | MultiThreadedDebug 125 | 126 | 127 | 128 | 129 | %(AdditionalIncludeDirectories) 130 | Default 131 | 132 | 133 | 134 | 135 | Console 136 | true 137 | %(AdditionalDependencies) 138 | 139 | 140 | 141 | 142 | TurnOffAllWarnings 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | 150 | 151 | Default 152 | %(AdditionalIncludeDirectories) 153 | MultiThreaded 154 | 155 | 156 | Console 157 | true 158 | true 159 | true 160 | %(AdditionalDependencies) 161 | ..\$(TargetName)$(TargetExt) 162 | 163 | 164 | 165 | 166 | TurnOffAllWarnings 167 | 168 | 169 | MaxSpeed 170 | true 171 | true 172 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 173 | 174 | 175 | Default 176 | %(AdditionalIncludeDirectories) 177 | MultiThreaded 178 | 179 | 180 | Console 181 | true 182 | true 183 | true 184 | %(AdditionalDependencies) 185 | ..\$(TargetName)$(TargetExt) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /vs2015/compute-accuracy.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | {78698725-BA8A-410B-9971-2BF28562B2D1} 26 | Win32Proj 27 | test 28 | compute-accuracy 29 | 10.0.14393.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v141 36 | NotSet 37 | 38 | 39 | Application 40 | true 41 | v141 42 | NotSet 43 | 44 | 45 | Application 46 | false 47 | v140 48 | true 49 | NotSet 50 | 51 | 52 | Application 53 | false 54 | v140 55 | true 56 | NotSet 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | false 76 | $(SolutionDir)\$(Configuration)\ 77 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 78 | 79 | 80 | false 81 | $(SolutionDir)\$(Configuration)\ 82 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 83 | 84 | 85 | false 86 | $(SolutionDir)\$(Configuration)\ 87 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 88 | 89 | 90 | false 91 | $(SolutionDir)\$(Configuration)\ 92 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 93 | 94 | 95 | 96 | 97 | 98 | TurnOffAllWarnings 99 | Disabled 100 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 101 | ProgramDatabase 102 | 
MultiThreadedDebug 103 | 104 | 105 | %(AdditionalIncludeDirectories) 106 | Default 107 | 108 | 109 | 110 | 111 | Console 112 | true 113 | %(AdditionalDependencies) 114 | 115 | 116 | 117 | 118 | 119 | 120 | TurnOffAllWarnings 121 | Disabled 122 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 123 | ProgramDatabase 124 | MultiThreadedDebug 125 | 126 | 127 | 128 | 129 | %(AdditionalIncludeDirectories) 130 | Default 131 | 132 | 133 | 134 | 135 | Console 136 | true 137 | %(AdditionalDependencies) 138 | 139 | 140 | 141 | 142 | TurnOffAllWarnings 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | 150 | 151 | Default 152 | %(AdditionalIncludeDirectories) 153 | MultiThreaded 154 | 155 | 156 | Console 157 | true 158 | true 159 | true 160 | %(AdditionalDependencies) 161 | ..\$(TargetName)$(TargetExt) 162 | 163 | 164 | 165 | 166 | TurnOffAllWarnings 167 | 168 | 169 | MaxSpeed 170 | true 171 | true 172 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 173 | 174 | 175 | Default 176 | %(AdditionalIncludeDirectories) 177 | MultiThreaded 178 | 179 | 180 | Console 181 | true 182 | true 183 | true 184 | %(AdditionalDependencies) 185 | ..\$(TargetName)$(TargetExt) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /vs2015/word2vec-doc2vec.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Debug 10 | x64 11 | 12 | 13 | Release 14 | Win32 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | {4192BAE0-FC98-4AE4-819A-65C0B896C38E} 26 | Win32Proj 27 | test 28 | word2vec-doc2vec 29 | 10.0.14393.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v141 36 | NotSet 37 | 38 | 39 | Application 40 | true 41 | v141 42 | NotSet 43 | 44 | 45 | Application 46 | false 47 | v140 48 | true 49 | NotSet 50 | 51 | 52 | Application 53 | false 54 | v140 55 | true 56 | NotSet 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | false 76 | $(SolutionDir)\$(Configuration)\ 77 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 78 | 79 | 80 | false 81 | $(SolutionDir)\$(Configuration)\ 82 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 83 | 84 | 85 | false 86 | $(SolutionDir)\$(Configuration)\ 87 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 88 | 89 | 90 | false 91 | $(SolutionDir)\$(Configuration)\ 92 | $(SolutionDir)\$(Configuration)\$(ProjectName)\ 93 | 94 | 95 | 96 | 97 | 98 | TurnOffAllWarnings 99 | Disabled 100 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 101 | ProgramDatabase 102 | MultiThreadedDebug 103 | 104 | 105 | %(AdditionalIncludeDirectories) 106 | Default 107 | 108 | 109 | 110 | 111 | Console 112 | true 113 | %(AdditionalDependencies) 114 | 115 | 116 | 117 | 118 | 119 | 120 | TurnOffAllWarnings 121 | Disabled 122 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 123 | ProgramDatabase 124 | MultiThreadedDebug 125 | 126 | 127 | 128 | 129 | %(AdditionalIncludeDirectories) 130 | Default 131 | 132 | 133 | 134 | 135 | Console 136 | true 137 | %(AdditionalDependencies) 138 | 139 | 140 | 141 | 142 | TurnOffAllWarnings 143 | 144 | 145 | MaxSpeed 146 | true 147 | true 148 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 149 | 150 | 151 | Default 152 | %(AdditionalIncludeDirectories) 153 | MultiThreaded 154 | 155 | 156 | Console 157 | true 158 | true 159 | true 160 | %(AdditionalDependencies) 161 | ..\$(TargetName)$(TargetExt) 162 | 163 | 164 | 165 | 166 | TurnOffAllWarnings 167 | 168 | 169 | 
MaxSpeed 170 | true 171 | true 172 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 173 | 174 | 175 | Default 176 | %(AdditionalIncludeDirectories) 177 | MultiThreaded 178 | 179 | 180 | Console 181 | true 182 | true 183 | true 184 | %(AdditionalDependencies) 185 | ..\$(TargetName)$(TargetExt) 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /word2phrase.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define MAX_STRING 60 21 | 22 | const int vocab_hash_size = 23 | 500000000; // Maximum 500M entries in the vocabulary 24 | 25 | typedef float real; // Precision of float numbers 26 | 27 | struct vocab_word { 28 | long long cn; 29 | char *word; 30 | }; 31 | 32 | char train_file[MAX_STRING], output_file[MAX_STRING]; 33 | struct vocab_word *vocab; 34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1; 35 | long long vocab_max_size = 10000, vocab_size = 0; 36 | long long train_words = 0; 37 | real threshold = 100; 38 | 39 | unsigned long long next_random = 1; 40 | 41 | // Reads a single word from a file, assuming space + tab + EOL to be word 42 | // boundaries 43 | void ReadWord(char *word, FILE *fin) { 44 | int a = 0, ch; 45 | while (!feof(fin)) { 46 | ch = fgetc(fin); 47 | if (ch == 13) continue; 48 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 49 | if (a > 0) { 50 | if (ch == '\n') ungetc(ch, fin); 51 | break; 52 | } 53 | if (ch == '\n') { 54 | strcpy(word, (char *)""); 55 | return; 56 | } else 57 | continue; 58 | } 59 | word[a] = ch; 60 | a++; 61 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 62 | } 63 | word[a] = 0; 64 | } 65 | 66 | // Returns hash value of a word 67 | int GetWordHash(char *word) { 68 | unsigned long long a, hash = 1; 69 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 70 | hash = hash % vocab_hash_size; 71 | return hash; 72 | } 73 | 74 | // Returns position of a word in the vocabulary; if the word is not found, 75 | // returns -1 76 | int SearchVocab(char *word) { 77 | unsigned int hash = GetWordHash(word); 78 | while (1) { 79 | if (vocab_hash[hash] == -1) return -1; 80 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 81 | hash = (hash + 1) % vocab_hash_size; 82 | } 83 | return -1; 84 | } 85 | 86 | // Reads a word and returns its index in the vocabulary 87 | int ReadWordIndex(FILE *fin) { 88 | char word[MAX_STRING]; 89 | ReadWord(word, fin); 90 | if (feof(fin)) return -1; 91 | return SearchVocab(word); 92 | } 93 | 94 | // Adds a word to the vocabulary 95 | int AddWordToVocab(char *word) { 96 | unsigned int hash, length = strlen(word) + 1; 97 | if (length > MAX_STRING) length = MAX_STRING; 98 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 99 | 
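/* Note: the strcpy just below is safe only because every caller (ReadWord, and the bigram builder in LearnVocabFromTrainFile) has already truncated the word to at most MAX_STRING - 1 characters; a longer string would overflow the allocation, whose size is capped at MAX_STRING above. */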
strcpy(vocab[vocab_size].word, word); 100 | vocab[vocab_size].cn = 0; 101 | vocab_size++; 102 | // Reallocate memory if needed 103 | if (vocab_size + 2 >= vocab_max_size) { 104 | vocab_max_size += 10000; 105 | vocab = (struct vocab_word *)realloc( 106 | vocab, vocab_max_size * sizeof(struct vocab_word)); 107 | } 108 | hash = GetWordHash(word); 109 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 110 | vocab_hash[hash] = vocab_size - 1; 111 | return vocab_size - 1; 112 | } 113 | 114 | // Used later for sorting by word counts 115 | int VocabCompare(const void *a, const void *b) { 116 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 117 | } 118 | 119 | // Sorts the vocabulary by frequency using word counts 120 | void SortVocab() { 121 | int a; 122 | unsigned int hash; 123 | // Sort the vocabulary and keep at the first position 124 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 125 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 126 | for (a = 0; a < vocab_size; a++) { 127 | // Words occuring less than min_count times will be discarded from the vocab 128 | if (vocab[a].cn < min_count) { 129 | vocab_size--; 130 | free(vocab[vocab_size].word); 131 | } else { 132 | // Hash will be re-computed, as after the sorting it is not actual 133 | hash = GetWordHash(vocab[a].word); 134 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 135 | vocab_hash[hash] = a; 136 | } 137 | } 138 | vocab = (struct vocab_word *)realloc(vocab, 139 | vocab_size * sizeof(struct vocab_word)); 140 | } 141 | 142 | // Reduces the vocabulary by removing infrequent tokens 143 | void ReduceVocab() { 144 | int a, b = 0; 145 | unsigned int hash; 146 | for (a = 0; a < vocab_size; a++) 147 | if (vocab[a].cn > min_reduce) { 148 | vocab[b].cn = vocab[a].cn; 149 | vocab[b].word = vocab[a].word; 150 | b++; 151 | } else 152 | free(vocab[a].word); 153 | vocab_size = b; 154 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 155 | for (a = 0; a < vocab_size; a++) { 156 | // Hash will be re-computed, as it is not actual 157 | hash = GetWordHash(vocab[a].word); 158 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 159 | vocab_hash[hash] = a; 160 | } 161 | fflush(stdout); 162 | min_reduce++; 163 | } 164 | 165 | void LearnVocabFromTrainFile() { 166 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 167 | FILE *fin; 168 | long long a, i, start = 1; 169 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 170 | fin = fopen(train_file, "rb"); 171 | if (fin == NULL) { 172 | printf("ERROR: training data file not found!\n"); 173 | exit(1); 174 | } 175 | vocab_size = 0; 176 | AddWordToVocab((char *)""); 177 | while (1) { 178 | ReadWord(word, fin); 179 | if (feof(fin)) break; 180 | if (!strcmp(word, "")) { 181 | start = 1; 182 | continue; 183 | } else 184 | start = 0; 185 | train_words++; 186 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 187 | printf("Words processed: %lldK Vocab size: %lldK %c", 188 | train_words / 1000, vocab_size / 1000, 13); 189 | fflush(stdout); 190 | } 191 | i = SearchVocab(word); 192 | if (i == -1) { 193 | a = AddWordToVocab(word); 194 | vocab[a].cn = 1; 195 | } else 196 | vocab[i].cn++; 197 | if (start) continue; 198 | sprintf(bigram_word, "%s_%s", last_word, word); 199 | bigram_word[MAX_STRING - 1] = 0; 200 | strcpy(last_word, word); 201 | i = SearchVocab(bigram_word); 202 | if (i == -1) { 203 | a = AddWordToVocab(bigram_word); 204 | vocab[a].cn = 1; 205 | 
} else 206 | vocab[i].cn++; 207 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 208 | } 209 | SortVocab(); 210 | if (debug_mode > 0) { 211 | printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); 212 | printf("Words in train file: %lld\n", train_words); 213 | } 214 | fclose(fin); 215 | } 216 | 217 | void TrainModel() { 218 | long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0; 219 | char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; 220 | real score; 221 | FILE *fo, *fin; 222 | printf("Starting training using file %s\n", train_file); 223 | LearnVocabFromTrainFile(); 224 | fin = fopen(train_file, "rb"); 225 | fo = fopen(output_file, "wb"); 226 | word[0] = 0; 227 | while (1) { 228 | strcpy(last_word, word); 229 | ReadWord(word, fin); 230 | if (feof(fin)) break; 231 | if (!strcmp(word, "</s>")) { 232 | fprintf(fo, "\n"); 233 | continue; 234 | } 235 | cn++; 236 | if ((debug_mode > 1) && (cn % 100000 == 0)) { 237 | printf("Words written: %lldK%c", cn / 1000, 13); 238 | fflush(stdout); 239 | } 240 | oov = 0; 241 | i = SearchVocab(word); 242 | if (i == -1) 243 | oov = 1; 244 | else 245 | pb = vocab[i].cn; 246 | if (li == -1) oov = 1; 247 | li = i; 248 | sprintf(bigram_word, "%s_%s", last_word, word); 249 | bigram_word[MAX_STRING - 1] = 0; 250 | i = SearchVocab(bigram_word); 251 | if (i == -1) 252 | oov = 1; 253 | else 254 | pab = vocab[i].cn; 255 | if (pa < min_count) oov = 1; 256 | if (pb < min_count) oov = 1; 257 | if (oov) 258 | score = 0; 259 | else 260 | score = (pab - min_count) / (real)pa / (real)pb * (real)train_words; 261 | if (score > threshold) { 262 | fprintf(fo, "_%s", word); 263 | pb = 0; 264 | } else 265 | fprintf(fo, " %s", word); 266 | pa = pb; 267 | } 268 | fclose(fo); 269 | fclose(fin); 270 | } 271 | 272 | int ArgPos(char *str, int argc, char **argv) { 273 | int a; 274 | for (a = 1; a < argc; a++) 275 | if (!strcmp(str, argv[a])) { 276 | if (a == argc - 1) { 277 | printf("Argument missing for %s\n", str); 278 | exit(1); 279 | } 280 | return a; 281 | } 282 | return -1; 283 | } 284 | 285 | int main(int argc, char **argv) { 286 | int i; 287 | if (argc == 1) { 288 | printf("WORD2PHRASE tool v0.1a\n\n"); 289 | printf("Options:\n"); 290 | printf("Parameters for training:\n"); 291 | printf("\t-train <file>\n"); 292 | printf("\t\tUse text data from <file> to train the model\n"); 293 | printf("\t-output <file>\n"); 294 | printf( 295 | "\t\tUse <file> to save the resulting word vectors / word clusters / " 296 | "phrases\n"); 297 | printf("\t-min-count <int>\n"); 298 | printf( 299 | "\t\tThis will discard words that appear less than <int> times; " 300 | "default is 5\n"); 301 | printf("\t-threshold <float>\n"); 302 | printf( 303 | "\t\t The <float> value represents threshold for forming the phrases " 304 | "(higher means fewer phrases); default 100\n"); 305 | printf("\t-debug <int>\n"); 306 | printf( 307 | "\t\tSet the debug mode (default = 2 = more info during training)\n"); 308 | printf("\nExamples:\n"); 309 | printf( 310 | "./word2phrase -train text.txt -output phrases.txt -threshold 100 " 311 | "-debug 2\n\n"); 312 | return 0; 313 | } 314 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) 315 | strcpy(train_file, argv[i + 1]); 316 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) 317 | debug_mode = atoi(argv[i + 1]); 318 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) 319 | strcpy(output_file, argv[i + 1]); 320 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) 321 | min_count = atoi(argv[i + 1]); 322 | if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) 323 | threshold = 
atof(argv[i + 1]); 324 | vocab = 325 | (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 326 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 327 | TrainModel(); 328 | return 0; 329 | } 330 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /word2vec-doc2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #if defined _WIN32 21 | #include "win32-port.h" 22 | #else 23 | #include 24 | #endif 25 | 26 | #define MAX_STRING 100 27 | #define EXP_TABLE_SIZE 1000 28 | #define MAX_EXP 6 29 | #define MAX_SENTENCE_LENGTH 1000 30 | #define MAX_CODE_LENGTH 40 31 | 32 | const int vocab_hash_size = 33 | 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary 34 | 35 | typedef float real; // Precision of float numbers 36 | 37 | struct vocab_word { 38 | long long cn; 39 | int *point; 40 | char *word, *code, codelen; 41 | }; 42 | 43 | char train_file[MAX_STRING], output_file[MAX_STRING]; 44 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 45 | struct vocab_word *vocab; 46 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, 47 | num_threads = 12, min_reduce = 1; 48 | int *vocab_hash; 49 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 50 | long long sentence_vectors = 0; 51 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, 52 | classes = 0; 53 | real alpha = 0.025, starting_alpha, sample = 1e-3; 54 | real *syn0, *syn1, *syn1neg, *expTable; 55 | clock_t start; 56 | 57 | int hs = 0, negative = 5; 58 | const int table_size = 1e8; 59 | int *table; 60 | 61 | void InitUnigramTable() { 62 | int a, i; 63 | double train_words_pow = 0; 64 | double d1, power = 0.75; 65 | table = (int *)malloc(table_size * sizeof(int)); 66 | for (a = 1; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 67 | i = 1; 68 | d1 = pow(vocab[i].cn, power) / train_words_pow; 69 | for (a = 0; a < table_size; a++) { 70 | table[a] = i; 71 | if (a / (double)table_size > d1) { 72 | i++; 73 | d1 += pow(vocab[i].cn, power) / train_words_pow; 74 | } 75 | if (i >= vocab_size) i = vocab_size - 1; 76 | } 77 | } 78 | 79 | // Reads a single word from a file, assuming space + tab + EOL to be word 80 | // boundaries 81 | void ReadWord(char *word, FILE *fin) { 82 | int a = 0, ch; 83 | while (!feof(fin)) { 84 | ch = fgetc(fin); 85 | if (ch == 13) continue; 86 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 87 | if (a > 0) { 88 | if (ch == '\n') ungetc(ch, fin); 89 | break; 90 | } 91 | if (ch == '\n') { 92 | strcpy(word, (char *)""); 93 | return; 94 | } else 95 | continue; 96 | } 97 | word[a] = ch; 98 | a++; 99 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 100 | } 101 | word[a] = 0; 102 | } 103 | 104 | // Returns hash value of a word 105 | int GetWordHash(char *word) { 106 | unsigned long long a, hash = 0; 107 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 108 | hash = hash % vocab_hash_size; 109 | return hash; 110 | } 111 | 112 | // Returns position of a word in the vocabulary; if the word is not found, 113 | // returns -1 114 | int SearchVocab(char *word) { 115 | unsigned int hash = GetWordHash(word); 116 | while (1) { 117 | if (vocab_hash[hash] == -1) return -1; 118 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 119 | hash = (hash + 1) % vocab_hash_size; 120 | } 121 | 
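/* This return is never reached: the open-addressing probe above always exits via return, either at an empty slot (word absent) or at a matching entry. ReduceVocab keeps the table's load factor below about 0.7, so empty slots always exist and probe chains stay short. */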
return -1; 122 | } 123 | 124 | // Reads a word and returns its index in the vocabulary 125 | int ReadWordIndex(FILE *fin) { 126 | char word[MAX_STRING]; 127 | ReadWord(word, fin); 128 | if (feof(fin)) return -1; 129 | return SearchVocab(word); 130 | } 131 | 132 | // Adds a word to the vocabulary 133 | int AddWordToVocab(char *word) { 134 | unsigned int hash, length = strlen(word) + 1; 135 | if (length > MAX_STRING) length = MAX_STRING; 136 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 137 | strcpy(vocab[vocab_size].word, word); 138 | vocab[vocab_size].cn = 0; 139 | vocab_size++; 140 | // Reallocate memory if needed 141 | if (vocab_size + 2 >= vocab_max_size) { 142 | vocab_max_size += 1000; 143 | vocab = (struct vocab_word *)realloc( 144 | vocab, vocab_max_size * sizeof(struct vocab_word)); 145 | } 146 | hash = GetWordHash(word); 147 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 148 | vocab_hash[hash] = vocab_size - 1; 149 | return vocab_size - 1; 150 | } 151 | 152 | // Used later for sorting by word counts 153 | int VocabCompare(const void *a, const void *b) { 154 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 155 | } 156 | 157 | // Sorts the vocabulary by frequency using word counts 158 | void SortVocab() { 159 | int a, size; 160 | unsigned int hash; 161 | // Sort the vocabulary and keep at the first position 162 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 163 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 164 | size = vocab_size; 165 | train_words = 0; 166 | for (a = 0; a < size; a++) { 167 | // Words occuring less than min_count times will be discarded from the vocab 168 | if ((vocab[a].cn < min_count) && (a != 0)) { 169 | vocab_size--; 170 | free(vocab[a].word); 171 | } else { 172 | // Hash will be re-computed, as after the sorting it is not actual 173 | hash = GetWordHash(vocab[a].word); 174 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 175 | vocab_hash[hash] = a; 176 | train_words += vocab[a].cn; 177 | } 178 | } 179 | vocab = (struct vocab_word *)realloc( 180 | vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 181 | // Allocate memory for the binary tree construction 182 | for (a = 0; a < vocab_size; a++) { 183 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 184 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 185 | } 186 | } 187 | 188 | // Reduces the vocabulary by removing infrequent tokens 189 | void ReduceVocab() { 190 | int a, b = 0; 191 | unsigned int hash; 192 | for (a = 0; a < vocab_size; a++) 193 | if (vocab[a].cn > min_reduce) { 194 | vocab[b].cn = vocab[a].cn; 195 | vocab[b].word = vocab[a].word; 196 | b++; 197 | } else 198 | free(vocab[a].word); 199 | vocab_size = b; 200 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 201 | for (a = 0; a < vocab_size; a++) { 202 | // Hash will be re-computed, as it is not actual 203 | hash = GetWordHash(vocab[a].word); 204 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 205 | vocab_hash[hash] = a; 206 | } 207 | fflush(stdout); 208 | min_reduce++; 209 | } 210 | 211 | // Create binary Huffman tree using the word counts 212 | // Frequent words will have short uniqe binary codes 213 | void CreateBinaryTree() { 214 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 215 | char code[MAX_CODE_LENGTH]; 216 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 217 | long long *binary = 218 | (long 
long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 219 | long long *parent_node = 220 | (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 221 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 222 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 223 | pos1 = vocab_size - 1; 224 | pos2 = vocab_size; 225 | // Following algorithm constructs the Huffman tree by adding one node at a 226 | // time 227 | for (a = 0; a < vocab_size - 1; a++) { 228 | // First, find two smallest nodes 'min1, min2' 229 | if (pos1 >= 0) { 230 | if (count[pos1] < count[pos2]) { 231 | min1i = pos1; 232 | pos1--; 233 | } else { 234 | min1i = pos2; 235 | pos2++; 236 | } 237 | } else { 238 | min1i = pos2; 239 | pos2++; 240 | } 241 | if (pos1 >= 0) { 242 | if (count[pos1] < count[pos2]) { 243 | min2i = pos1; 244 | pos1--; 245 | } else { 246 | min2i = pos2; 247 | pos2++; 248 | } 249 | } else { 250 | min2i = pos2; 251 | pos2++; 252 | } 253 | count[vocab_size + a] = count[min1i] + count[min2i]; 254 | parent_node[min1i] = vocab_size + a; 255 | parent_node[min2i] = vocab_size + a; 256 | binary[min2i] = 1; 257 | } 258 | // Now assign binary code to each vocabulary word 259 | for (a = 0; a < vocab_size; a++) { 260 | b = a; 261 | i = 0; 262 | while (1) { 263 | code[i] = binary[b]; 264 | point[i] = b; 265 | i++; 266 | b = parent_node[b]; 267 | if (b == vocab_size * 2 - 2) break; 268 | } 269 | vocab[a].codelen = i; 270 | vocab[a].point[0] = vocab_size - 2; 271 | for (b = 0; b < i; b++) { 272 | vocab[a].code[i - b - 1] = code[b]; 273 | vocab[a].point[i - b] = point[b] - vocab_size; 274 | } 275 | } 276 | free(count); 277 | free(binary); 278 | free(parent_node); 279 | } 280 | 281 | void LearnVocabFromTrainFile() { 282 | char word[MAX_STRING]; 283 | FILE *fin; 284 | long long a, i; 285 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 286 | fin = fopen(train_file, "rb"); 287 | if (fin == NULL) { 288 | printf("ERROR: training data file not found!\n"); 289 | exit(1); 290 | } 291 | vocab_size = 0; 292 | AddWordToVocab((char *)""); 293 | while (1) { 294 | ReadWord(word, fin); 295 | if (feof(fin)) break; 296 | train_words++; 297 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 298 | printf("%lldK%c", train_words / 1000, 13); 299 | fflush(stdout); 300 | } 301 | i = SearchVocab(word); 302 | if (i == -1) { 303 | a = AddWordToVocab(word); 304 | vocab[a].cn = 1; 305 | } else 306 | vocab[i].cn++; 307 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 308 | } 309 | SortVocab(); 310 | if (debug_mode > 0) { 311 | printf("Vocab size: %lld\n", vocab_size); 312 | printf("Words in train file: %lld\n", train_words); 313 | } 314 | file_size = ftell(fin); 315 | fclose(fin); 316 | } 317 | 318 | void SaveVocab() { 319 | long long i; 320 | FILE *fo = fopen(save_vocab_file, "wb"); 321 | for (i = 0; i < vocab_size; i++) 322 | fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 323 | fclose(fo); 324 | } 325 | 326 | void ReadVocab() { 327 | long long a, i = 0; 328 | char c; 329 | char word[MAX_STRING]; 330 | FILE *fin = fopen(read_vocab_file, "rb"); 331 | if (fin == NULL) { 332 | printf("Vocabulary file not found\n"); 333 | exit(1); 334 | } 335 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 336 | vocab_size = 0; 337 | while (1) { 338 | ReadWord(word, fin); 339 | if (feof(fin)) break; 340 | a = AddWordToVocab(word); 341 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 342 | i++; 343 | } 344 | SortVocab(); 345 | if (debug_mode > 0) { 346 | printf("Vocab size: %lld\n", vocab_size); 347 | 
printf("Words in train file: %lld\n", train_words); 348 | } 349 | fin = fopen(train_file, "rb"); 350 | if (fin == NULL) { 351 | printf("ERROR: training data file not found!\n"); 352 | exit(1); 353 | } 354 | fseek(fin, 0, SEEK_END); 355 | file_size = ftell(fin); 356 | fclose(fin); 357 | } 358 | 359 | void InitNet() { 360 | long long a, b; 361 | unsigned long long next_random = 1; 362 | a = posix_memalign((void **)&syn0, 128, 363 | (long long)vocab_size * layer1_size * sizeof(real)); 364 | if (syn0 == NULL) { 365 | printf("Memory allocation failed\n"); 366 | exit(1); 367 | } 368 | if (hs) { 369 | a = posix_memalign((void **)&syn1, 128, 370 | (long long)vocab_size * layer1_size * sizeof(real)); 371 | if (syn1 == NULL) { 372 | printf("Memory allocation failed\n"); 373 | exit(1); 374 | } 375 | for (a = 0; a < vocab_size; a++) 376 | for (b = 0; b < layer1_size; b++) syn1[a * layer1_size + b] = 0; 377 | } 378 | if (negative > 0) { 379 | a = posix_memalign((void **)&syn1neg, 128, 380 | (long long)vocab_size * layer1_size * sizeof(real)); 381 | if (syn1neg == NULL) { 382 | printf("Memory allocation failed\n"); 383 | exit(1); 384 | } 385 | for (a = 0; a < vocab_size; a++) 386 | for (b = 0; b < layer1_size; b++) syn1neg[a * layer1_size + b] = 0; 387 | } 388 | for (a = 0; a < vocab_size; a++) 389 | for (b = 0; b < layer1_size; b++) { 390 | next_random = next_random * (unsigned long long)25214903917 + 11; 391 | syn0[a * layer1_size + b] = 392 | (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 393 | } 394 | CreateBinaryTree(); 395 | } 396 | 397 | void *TrainModelThread(void *id) { 398 | long long a, b, d, cw, word, last_word, sentence_length = 0, 399 | sentence_position = 0; 400 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 401 | long long l1, l2, c, target, label, local_iter = iter; 402 | unsigned long long next_random = (long long)id; 403 | real f, g; 404 | clock_t now; 405 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 406 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 407 | FILE *fi = fopen(train_file, "rb"); 408 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 409 | while (1) { 410 | if (word_count - last_word_count > 10000) { 411 | word_count_actual += word_count - last_word_count; 412 | last_word_count = word_count; 413 | if ((debug_mode > 1)) { 414 | now = clock(); 415 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, 416 | alpha, word_count_actual / (real)(iter * train_words + 1) * 100, 417 | word_count_actual / 418 | ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 419 | fflush(stdout); 420 | } 421 | alpha = starting_alpha * 422 | (1 - word_count_actual / (real)(iter * train_words + 1)); 423 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 424 | } 425 | if (sentence_length == 0) { 426 | while (1) { 427 | word = ReadWordIndex(fi); 428 | if (feof(fi)) break; 429 | if (word == -1) continue; 430 | word_count++; 431 | if (word == 0) break; 432 | // The subsampling randomly discards frequent words while keeping the 433 | // ranking same 434 | if (sample > 0) { 435 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * 436 | (sample * train_words) / vocab[word].cn; 437 | next_random = next_random * (unsigned long long)25214903917 + 11; 438 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 439 | } 440 | sen[sentence_length] = word; 441 | sentence_length++; 442 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 443 | } 444 | 
sentence_position = 0; 445 | } 446 | if (feof(fi) || (word_count > train_words / num_threads)) { 447 | word_count_actual += word_count - last_word_count; 448 | local_iter--; 449 | if (local_iter == 0) break; 450 | word_count = 0; 451 | last_word_count = 0; 452 | sentence_length = 0; 453 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 454 | continue; 455 | } 456 | word = sen[sentence_position]; 457 | if (word == -1) continue; 458 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 459 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 460 | next_random = next_random * (unsigned long long)25214903917 + 11; 461 | b = next_random % window; 462 | if (cbow) { // train the cbow architecture 463 | // in -> hidden 464 | cw = 0; 465 | for (a = b; a < window * 1 + 1 - b; a++) 466 | if (a != window) { 467 | c = sentence_position - window + a; 468 | if (c < 0) continue; 469 | if (c >= sentence_length) continue; 470 | if (sentence_vectors && (c == 0)) continue; 471 | last_word = sen[c]; 472 | if (last_word == -1) continue; 473 | for (c = 0; c < layer1_size; c++) 474 | neu1[c] += syn0[c + last_word * layer1_size]; 475 | cw++; 476 | } 477 | if (sentence_vectors) { 478 | last_word = sen[0]; 479 | if (last_word == -1) continue; 480 | for (c = 0; c < layer1_size; c++) 481 | neu1[c] += syn0[c + last_word * layer1_size]; 482 | cw++; 483 | } 484 | if (cw) { 485 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 486 | if (hs) 487 | for (d = 0; d < vocab[word].codelen; d++) { 488 | f = 0; 489 | l2 = vocab[word].point[d] * layer1_size; 490 | // Propagate hidden -> output 491 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 492 | if (f <= -MAX_EXP) 493 | continue; 494 | else if (f >= MAX_EXP) 495 | continue; 496 | else 497 | f = expTable[(int)((f + MAX_EXP) * 498 | (EXP_TABLE_SIZE / MAX_EXP / 2))]; 499 | // 'g' is the gradient multiplied by the learning rate 500 | g = (1 - vocab[word].code[d] - f) * alpha; 501 | // Propagate errors output -> hidden 502 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 503 | // Learn weights hidden -> output 504 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 505 | } 506 | // NEGATIVE SAMPLING 507 | if (negative > 0) 508 | for (d = 0; d < negative + 1; d++) { 509 | if (d == 0) { 510 | target = word; 511 | label = 1; 512 | } else { 513 | next_random = next_random * (unsigned long long)25214903917 + 11; 514 | target = table[(next_random >> 16) % table_size]; 515 | if (target == word) continue; 516 | label = 0; 517 | } 518 | l2 = target * layer1_size; 519 | f = 0; 520 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 521 | if (f > MAX_EXP) 522 | g = (label - 1) * alpha; 523 | else if (f < -MAX_EXP) 524 | g = (label - 0) * alpha; 525 | else 526 | g = (label - expTable[(int)((f + MAX_EXP) * 527 | (EXP_TABLE_SIZE / MAX_EXP / 2))]) * 528 | alpha; 529 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 530 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 531 | } 532 | // hidden -> in 533 | for (a = b; a < window * 1 + 1 - b; a++) 534 | if (a != window) { 535 | c = sentence_position - window + a; 536 | if (c < 0) continue; 537 | if (c >= sentence_length) continue; 538 | if (sentence_vectors && (c == 0)) continue; 539 | last_word = sen[c]; 540 | if (last_word == -1) continue; 541 | for (c = 0; c < layer1_size; c++) 542 | syn0[c + last_word * layer1_size] += neu1e[c]; 543 | } 544 | if (sentence_vectors) { 545 | last_word = sen[0]; 546 | if (last_word == -1) continue; 
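/* The sentence-ID token sen[0] was added to every CBOW context of this sentence above, so here it also absorbs the accumulated error vector neu1e at every position; over training, its row of syn0 becomes a sentence-level vector (the -sentence-vectors, doc2vec-style feature). */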
547 | for (c = 0; c < layer1_size; c++) 548 | syn0[c + last_word * layer1_size] += neu1e[c]; 549 | } 550 | } 551 | } else { // train skip-gram 552 | for (a = b; a < window * 2 + 1 + sentence_vectors - b; a++) 553 | if (a != window) { 554 | c = sentence_position - window + a; 555 | if (sentence_vectors) 556 | if (a >= window * 2 + sentence_vectors - b) c = 0; 557 | if (c < 0) continue; 558 | if (c >= sentence_length) continue; 559 | last_word = sen[c]; 560 | if (last_word == -1) continue; 561 | l1 = last_word * layer1_size; 562 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 563 | // HIERARCHICAL SOFTMAX 564 | if (hs) 565 | for (d = 0; d < vocab[word].codelen; d++) { 566 | f = 0; 567 | l2 = vocab[word].point[d] * layer1_size; 568 | // Propagate hidden -> output 569 | for (c = 0; c < layer1_size; c++) 570 | f += syn0[c + l1] * syn1[c + l2]; 571 | if (f <= -MAX_EXP) 572 | continue; 573 | else if (f >= MAX_EXP) 574 | continue; 575 | else 576 | f = expTable[(int)((f + MAX_EXP) * 577 | (EXP_TABLE_SIZE / MAX_EXP / 2))]; 578 | // 'g' is the gradient multiplied by the learning rate 579 | g = (1 - vocab[word].code[d] - f) * alpha; 580 | // Propagate errors output -> hidden 581 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 582 | // Learn weights hidden -> output 583 | for (c = 0; c < layer1_size; c++) 584 | syn1[c + l2] += g * syn0[c + l1]; 585 | } 586 | // NEGATIVE SAMPLING 587 | if (negative > 0) 588 | for (d = 0; d < negative + 1; d++) { 589 | if (d == 0) { 590 | target = word; 591 | label = 1; 592 | } else { 593 | next_random = 594 | next_random * (unsigned long long)25214903917 + 11; 595 | target = table[(next_random >> 16) % table_size]; 596 | if (target == word) continue; 597 | label = 0; 598 | } 599 | l2 = target * layer1_size; 600 | f = 0; 601 | for (c = 0; c < layer1_size; c++) 602 | f += syn0[c + l1] * syn1neg[c + l2]; 603 | if (f > MAX_EXP) 604 | g = (label - 1) * alpha; 605 | else if (f < -MAX_EXP) 606 | g = (label - 0) * alpha; 607 | else 608 | g = (label - expTable[(int)((f + MAX_EXP) * 609 | (EXP_TABLE_SIZE / MAX_EXP / 2))]) * 610 | alpha; 611 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 612 | for (c = 0; c < layer1_size; c++) 613 | syn1neg[c + l2] += g * syn0[c + l1]; 614 | } 615 | // Learn weights input -> hidden 616 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 617 | } 618 | } 619 | sentence_position++; 620 | if (sentence_position >= sentence_length) { 621 | sentence_length = 0; 622 | continue; 623 | } 624 | } 625 | fclose(fi); 626 | free(neu1); 627 | free(neu1e); 628 | pthread_exit(NULL); 629 | } 630 | 631 | void TrainModel() { 632 | long a, b, c, d; 633 | FILE *fo; 634 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 635 | printf("Starting training using file %s\n", train_file); 636 | starting_alpha = alpha; 637 | if (read_vocab_file[0] != 0) 638 | ReadVocab(); 639 | else 640 | LearnVocabFromTrainFile(); 641 | if (save_vocab_file[0] != 0) SaveVocab(); 642 | if (output_file[0] == 0) return; 643 | InitNet(); 644 | if (negative > 0) InitUnigramTable(); 645 | start = clock(); 646 | for (a = 0; a < num_threads; a++) 647 | pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 648 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 649 | fo = fopen(output_file, "wb"); 650 | if (classes == 0) { 651 | // Save the word vectors 652 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 653 | for (a = 0; a < vocab_size; a++) { 654 | fprintf(fo, "%s ", vocab[a].word); 655 | if (binary) 656 | 
for (b = 0; b < layer1_size; b++) 657 | fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 658 | else 659 | for (b = 0; b < layer1_size; b++) 660 | fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 661 | fprintf(fo, "\n"); 662 | } 663 | } else { 664 | // Run K-means on the word vectors 665 | int clcn = classes, iter = 10, closeid; 666 | int *centcn = (int *)malloc(classes * sizeof(int)); 667 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 668 | real closev, x; 669 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 670 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 671 | for (a = 0; a < iter; a++) { 672 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 673 | for (b = 0; b < clcn; b++) centcn[b] = 1; 674 | for (c = 0; c < vocab_size; c++) { 675 | for (d = 0; d < layer1_size; d++) 676 | cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 677 | centcn[cl[c]]++; 678 | } 679 | for (b = 0; b < clcn; b++) { 680 | closev = 0; 681 | for (c = 0; c < layer1_size; c++) { 682 | cent[layer1_size * b + c] /= centcn[b]; 683 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 684 | } 685 | closev = sqrt(closev); 686 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 687 | } 688 | for (c = 0; c < vocab_size; c++) { 689 | closev = -10; 690 | closeid = 0; 691 | for (d = 0; d < clcn; d++) { 692 | x = 0; 693 | for (b = 0; b < layer1_size; b++) 694 | x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 695 | if (x > closev) { 696 | closev = x; 697 | closeid = d; 698 | } 699 | } 700 | cl[c] = closeid; 701 | } 702 | } 703 | // Save the K-means classes 704 | for (a = 0; a < vocab_size; a++) 705 | fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 706 | free(centcn); 707 | free(cent); 708 | free(cl); 709 | } 710 | fclose(fo); 711 | } 712 | 713 | int ArgPos(char *str, int argc, char **argv) { 714 | int a; 715 | for (a = 1; a < argc; a++) 716 | if (!strcmp(str, argv[a])) { 717 | if (a == argc - 1) { 718 | printf("Argument missing for %s\n", str); 719 | exit(1); 720 | } 721 | return a; 722 | } 723 | return -1; 724 | } 725 | 726 | int main(int argc, char **argv) { 727 | int i; 728 | if (argc == 1) { 729 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 730 | printf("Options:\n"); 731 | printf("Parameters for training:\n"); 732 | printf("\t-train \n"); 733 | printf("\t\tUse text data from to train the model\n"); 734 | printf("\t-output \n"); 735 | printf( 736 | "\t\tUse to save the resulting word vectors / word clusters\n"); 737 | printf("\t-size \n"); 738 | printf("\t\tSet size of word vectors; default is 100\n"); 739 | printf("\t-window \n"); 740 | printf("\t\tSet max skip length between words; default is 5\n"); 741 | printf("\t-sample \n"); 742 | printf( 743 | "\t\tSet threshold for occurrence of words. 
Those that appear with " 744 | "higher frequency in the training data\n"); 745 | printf( 746 | "\t\twill be randomly down-sampled; default is 1e-3, useful range is " 747 | "(0, 1e-5)\n"); 748 | printf("\t-hs \n"); 749 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 750 | printf("\t-negative \n"); 751 | printf( 752 | "\t\tNumber of negative examples; default is 5, common values are 3 - " 753 | "10 (0 = not used)\n"); 754 | printf("\t-threads \n"); 755 | printf("\t\tUse threads (default 12)\n"); 756 | printf("\t-iter \n"); 757 | printf("\t\tRun more training iterations (default 5)\n"); 758 | printf("\t-min-count \n"); 759 | printf( 760 | "\t\tThis will discard words that appear less than times; " 761 | "default is 5\n"); 762 | printf("\t-alpha \n"); 763 | printf( 764 | "\t\tSet the starting learning rate; default is 0.025 for skip-gram " 765 | "and 0.05 for CBOW\n"); 766 | printf("\t-classes \n"); 767 | printf( 768 | "\t\tOutput word classes rather than word vectors; default number of " 769 | "classes is 0 (vectors are written)\n"); 770 | printf("\t-debug \n"); 771 | printf( 772 | "\t\tSet the debug mode (default = 2 = more info during training)\n"); 773 | printf("\t-binary \n"); 774 | printf( 775 | "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 776 | printf("\t-save-vocab \n"); 777 | printf("\t\tThe vocabulary will be saved to \n"); 778 | printf("\t-read-vocab \n"); 779 | printf( 780 | "\t\tThe vocabulary will be read from , not constructed from the " 781 | "training data\n"); 782 | printf("\t-cbow \n"); 783 | printf( 784 | "\t\tUse the continuous bag of words model; default is 1 (use 0 for " 785 | "skip-gram model)\n"); 786 | printf("\t-sentence-vectors \n"); 787 | printf( 788 | "\t\tAssume the first token at the beginning of each line is a " 789 | "sentence ID. This token will be trained\n"); 790 | printf( 791 | "\t\twith full sentence context instead of just the window. 
Use 1 to " 792 | "turn on.\n"); 793 | printf("\nExamples:\n"); 794 | printf( 795 | "./word2vec -train data.txt -output vec.txt -size 200 -window 5 " 796 | "-sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 797 | return 0; 798 | } 799 | output_file[0] = 0; 800 | save_vocab_file[0] = 0; 801 | read_vocab_file[0] = 0; 802 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) 803 | layer1_size = atoi(argv[i + 1]); 804 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) 805 | strcpy(train_file, argv[i + 1]); 806 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) 807 | strcpy(save_vocab_file, argv[i + 1]); 808 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) 809 | strcpy(read_vocab_file, argv[i + 1]); 810 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) 811 | debug_mode = atoi(argv[i + 1]); 812 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) 813 | binary = atoi(argv[i + 1]); 814 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 815 | if (cbow) alpha = 0.05; 816 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 817 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) 818 | strcpy(output_file, argv[i + 1]); 819 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) 820 | window = atoi(argv[i + 1]); 821 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) 822 | sample = atof(argv[i + 1]); 823 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 824 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) 825 | negative = atoi(argv[i + 1]); 826 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) 827 | num_threads = atoi(argv[i + 1]); 828 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 829 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) 830 | min_count = atoi(argv[i + 1]); 831 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) 832 | classes = atoi(argv[i + 1]); 833 | if ((i = ArgPos((char *)"-sentence-vectors", argc, argv)) > 0) 834 | sentence_vectors = atoi(argv[i + 1]); 835 | vocab = 836 | (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 837 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 838 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 839 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 840 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * 841 | MAX_EXP); // Precompute the exp() table 842 | expTable[i] = 843 | expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 844 | } 845 | TrainModel(); 846 | return 0; 847 | } 848 | -------------------------------------------------------------------------------- /word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
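/* A note on the HAVE_CBLAS build option declared below: when compiled with -DHAVE_CBLAS=1 and linked against a BLAS library, the scalar inner loops of the training code can be routed through single-precision BLAS kernels. As a sketch only (the names f, g, l2, neu1, syn1neg are taken from the training loop of the companion word2vec-doc2vec.c; the actual call sites fall outside the excerpt shown here), a dot product such as
   for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
maps to
   f = cblas_sdot(layer1_size, neu1, 1, &syn1neg[l2], 1);
and an AXPY-style update such as
   for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
maps to
   cblas_saxpy(layer1_size, g, neu1, 1, &syn1neg[l2], 1); */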
14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <time.h> 20 | #if defined _WIN32 21 | #include "win32-port.h" 22 | #else 23 | #include <pthread.h> 24 | #endif 25 | 26 | #if HAVE_CBLAS == 1 27 | // CBLAS declaration 28 | extern void cblas_scopy(const int n, const float *x, const int incx, float *y, 29 | const int incy); 30 | extern void cblas_saxpy(const int n, const float alpha, const float *x, 31 | const int incx, float *y, const int incy); 32 | extern float cblas_sdot(const int n, const float *x, const int incx, 33 | const float *y, const int incy); 34 | extern void cblas_sscal(const int n, const float alpha, float *x, 35 | const int incx); 36 | static const float zero = 0; 37 | #endif 38 | 39 | #define MAX_STRING 100 40 | #define EXP_TABLE_SIZE 1000 41 | #define MAX_EXP 6 42 | #define MAX_SENTENCE_LENGTH 1000 43 | #define MAX_CODE_LENGTH 40 44 | 45 | const int vocab_hash_size = 46 | 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary 47 | 48 | typedef float real; // Precision of float numbers 49 | 50 | struct vocab_word { 51 | long long cn; 52 | int *point; 53 | char *word, *code, codelen; 54 | }; 55 | 56 | char train_file[MAX_STRING], output_file[MAX_STRING]; 57 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 58 | struct vocab_word *vocab; 59 | int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, 60 | num_threads = 12, min_reduce = 1; 61 | int *vocab_hash; 62 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 63 | long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, 64 | classes = 0; 65 | real alpha = 0.025, starting_alpha, sample = 1e-3; 66 | real *syn0, *syn1, *syn1neg, *expTable; 67 | clock_t start; 68 | 69 | int hs = 0, negative = 5; 70 | const int table_size = 1e8; 71 | int *table; 72 | 73 | void InitUnigramTable() { 74 | int a, i; 75 | double train_words_pow = 0; 76 | double d1, power = 0.75; 77 | table = (int *)malloc(table_size * sizeof(int)); 78 | for (a = 1; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 79 | i = 1; 80 | d1 = pow(vocab[i].cn, power) / train_words_pow; 81 | for (a = 0; a < table_size; a++) { 82 | table[a] = i; 83 | if (a / (double)table_size > d1) { 84 | i++; 85 | d1 += pow(vocab[i].cn, power) / train_words_pow; 86 | } 87 | if (i >= vocab_size) i = vocab_size - 1; 88 | } 89 | } 90 | 91 | // Reads a single word from a file, assuming space + tab + EOL to be word 92 | // boundaries 93 | void ReadWord(char *word, FILE *fin) { 94 | int a = 0, ch; 95 | while (!feof(fin)) { 96 | ch = fgetc(fin); 97 | if (ch == 13) continue; 98 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 99 | if (a > 0) { 100 | if (ch == '\n') ungetc(ch, fin); 101 | break; 102 | } 103 | if (ch == '\n') { 104 | strcpy(word, (char *)"</s>"); 105 | return; 106 | } else 107 | continue; 108 | } 109 | word[a] = ch; 110 | a++; 111 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 112 | } 113 | word[a] = 0; 114 | } 115 | 116 | // Returns hash value of a word 117 | int GetWordHash(char *word) { 118 | unsigned long long a, hash = 0; 119 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 120 | hash = hash % vocab_hash_size; 121 | return hash; 122 | } 123 | 124 | // Returns position of a word in the vocabulary; if the word is not found, 125 | // returns -1 126 | int SearchVocab(char *word) { 127 | unsigned int hash = GetWordHash(word); 128 | while (1) { 129 | if (vocab_hash[hash] == -1) return -1; 130 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 131
| hash = (hash + 1) % vocab_hash_size; 132 | } 133 | return -1; 134 | } 135 | 136 | // Reads a word and returns its index in the vocabulary 137 | int ReadWordIndex(FILE *fin) { 138 | char word[MAX_STRING]; 139 | ReadWord(word, fin); 140 | if (feof(fin)) return -1; 141 | return SearchVocab(word); 142 | } 143 | 144 | // Adds a word to the vocabulary 145 | int AddWordToVocab(char *word) { 146 | unsigned int hash, length = strlen(word) + 1; 147 | if (length > MAX_STRING) length = MAX_STRING; 148 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 149 | strcpy(vocab[vocab_size].word, word); 150 | vocab[vocab_size].cn = 0; 151 | vocab_size++; 152 | // Reallocate memory if needed 153 | if (vocab_size + 2 >= vocab_max_size) { 154 | vocab_max_size += 1000; 155 | vocab = (struct vocab_word *)realloc( 156 | vocab, vocab_max_size * sizeof(struct vocab_word)); 157 | } 158 | hash = GetWordHash(word); 159 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 160 | vocab_hash[hash] = vocab_size - 1; 161 | return vocab_size - 1; 162 | } 163 | 164 | // Used later for sorting by word counts 165 | int VocabCompare(const void *a, const void *b) { 166 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 167 | } 168 | 169 | // Sorts the vocabulary by frequency using word counts 170 | void SortVocab() { 171 | int a, size; 172 | unsigned int hash; 173 | // Sort the vocabulary and keep </s> at the first position 174 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 175 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 176 | size = vocab_size; 177 | train_words = 0; 178 | for (a = 0; a < size; a++) { 179 | // Words occurring less than min_count times will be discarded from the vocab 180 | if ((vocab[a].cn < min_count) && (a != 0)) { 181 | vocab_size--; 182 | free(vocab[a].word); 183 | } else { 184 | // Hash will be re-computed, as it is no longer valid after the sorting 185 | hash = GetWordHash(vocab[a].word); 186 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 187 | vocab_hash[hash] = a; 188 | train_words += vocab[a].cn; 189 | } 190 | } 191 | vocab = (struct vocab_word *)realloc( 192 | vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 193 | // Allocate memory for the binary tree construction 194 | for (a = 0; a < vocab_size; a++) { 195 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 196 | vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 197 | } 198 | } 199 | 200 | // Reduces the vocabulary by removing infrequent tokens 201 | void ReduceVocab() { 202 | int a, b = 0; 203 | unsigned int hash; 204 | for (a = 0; a < vocab_size; a++) 205 | if (vocab[a].cn > min_reduce) { 206 | vocab[b].cn = vocab[a].cn; 207 | vocab[b].word = vocab[a].word; 208 | b++; 209 | } else 210 | free(vocab[a].word); 211 | vocab_size = b; 212 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 213 | for (a = 0; a < vocab_size; a++) { 214 | // Hash will be re-computed, as it is no longer valid 215 | hash = GetWordHash(vocab[a].word); 216 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 217 | vocab_hash[hash] = a; 218 | } 219 | fflush(stdout); 220 | min_reduce++; 221 | } 222 | 223 | // Create binary Huffman tree using the word counts 224 | // Frequent words will have short unique binary codes 225 | void CreateBinaryTree() { 226 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 227 | char code[MAX_CODE_LENGTH]; 228 | long long *count = (long long *)calloc(vocab_size * 2 + 1,
sizeof(long long)); 229 | long long *binary = 230 | (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 231 | long long *parent_node = 232 | (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 233 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 234 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 235 | pos1 = vocab_size - 1; 236 | pos2 = vocab_size; 237 | // The following algorithm constructs the Huffman tree by adding one node 238 | // at a time 239 | for (a = 0; a < vocab_size - 1; a++) { 240 | // First, find two smallest nodes 'min1, min2' 241 | if (pos1 >= 0) { 242 | if (count[pos1] < count[pos2]) { 243 | min1i = pos1; 244 | pos1--; 245 | } else { 246 | min1i = pos2; 247 | pos2++; 248 | } 249 | } else { 250 | min1i = pos2; 251 | pos2++; 252 | } 253 | if (pos1 >= 0) { 254 | if (count[pos1] < count[pos2]) { 255 | min2i = pos1; 256 | pos1--; 257 | } else { 258 | min2i = pos2; 259 | pos2++; 260 | } 261 | } else { 262 | min2i = pos2; 263 | pos2++; 264 | } 265 | count[vocab_size + a] = count[min1i] + count[min2i]; 266 | parent_node[min1i] = vocab_size + a; 267 | parent_node[min2i] = vocab_size + a; 268 | binary[min2i] = 1; 269 | } 270 | // Now assign binary code to each vocabulary word 271 | for (a = 0; a < vocab_size; a++) { 272 | b = a; 273 | i = 0; 274 | while (1) { 275 | code[i] = binary[b]; 276 | point[i] = b; 277 | i++; 278 | b = parent_node[b]; 279 | if (b == vocab_size * 2 - 2) break; 280 | } 281 | vocab[a].codelen = i; 282 | vocab[a].point[0] = vocab_size - 2; 283 | for (b = 0; b < i; b++) { 284 | vocab[a].code[i - b - 1] = code[b]; 285 | vocab[a].point[i - b] = point[b] - vocab_size; 286 | } 287 | } 288 | free(count); 289 | free(binary); 290 | free(parent_node); 291 | } 292 | 293 | void LearnVocabFromTrainFile() { 294 | char word[MAX_STRING]; 295 | FILE *fin; 296 | long long a, i; 297 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 298 | fin = fopen(train_file, "rb"); 299 | if (fin == NULL) { 300 | printf("ERROR: training data file not found!\n"); 301 | exit(1); 302 | } 303 | vocab_size = 0; 304 | AddWordToVocab((char *)"</s>"); 305 | while (1) { 306 | ReadWord(word, fin); 307 | if (feof(fin)) break; 308 | train_words++; 309 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 310 | printf("%lldK%c", train_words / 1000, 13); 311 | fflush(stdout); 312 | } 313 | i = SearchVocab(word); 314 | if (i == -1) { 315 | a = AddWordToVocab(word); 316 | vocab[a].cn = 1; 317 | } else 318 | vocab[i].cn++; 319 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 320 | } 321 | SortVocab(); 322 | if (debug_mode > 0) { 323 | printf("Vocab size: %lld\n", vocab_size); 324 | printf("Words in train file: %lld\n", train_words); 325 | } 326 | file_size = ftell(fin); 327 | fclose(fin); 328 | } 329 | 330 | void SaveVocab() { 331 | long long i; 332 | FILE *fo = fopen(save_vocab_file, "wb"); 333 | for (i = 0; i < vocab_size; i++) 334 | fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 335 | fclose(fo); 336 | } 337 | 338 | void ReadVocab() { 339 | long long a, i = 0; 340 | char c; 341 | char word[MAX_STRING]; 342 | FILE *fin = fopen(read_vocab_file, "rb"); 343 | if (fin == NULL) { 344 | printf("Vocabulary file not found\n"); 345 | exit(1); 346 | } 347 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 348 | vocab_size = 0; 349 | while (1) { 350 | ReadWord(word, fin); 351 | if (feof(fin)) break; 352 | a = AddWordToVocab(word); 353 | fscanf(fin, "%lld%c", &vocab[a].cn, &c); 354 | i++; 355 | } 356 | SortVocab(); 357 | if (debug_mode > 0) { 358 |
printf("Vocab size: %lld\n", vocab_size); 359 | printf("Words in train file: %lld\n", train_words); 360 | } 361 | fin = fopen(train_file, "rb"); 362 | if (fin == NULL) { 363 | printf("ERROR: training data file not found!\n"); 364 | exit(1); 365 | } 366 | fseek(fin, 0, SEEK_END); 367 | file_size = ftell(fin); 368 | fclose(fin); 369 | } 370 | 371 | void InitNet() { 372 | long long a, b; 373 | unsigned long long next_random = 1; 374 | a = posix_memalign((void **)&syn0, 128, 375 | (long long)vocab_size * layer1_size * sizeof(real)); 376 | if (syn0 == NULL) { 377 | printf("Memory allocation failed\n"); 378 | exit(1); 379 | } 380 | if (hs) { 381 | a = posix_memalign((void **)&syn1, 128, 382 | (long long)vocab_size * layer1_size * sizeof(real)); 383 | if (syn1 == NULL) { 384 | printf("Memory allocation failed\n"); 385 | exit(1); 386 | } 387 | for (a = 0; a < vocab_size; a++) 388 | #if HAVE_CBLAS == 1 389 | cblas_scopy(layer1_size, &zero, 1, syn1 + a * layer1_size, 1); 390 | #else 391 | for (b = 0; b < layer1_size; b++) syn1[a * layer1_size + b] = 0; 392 | #endif 393 | } 394 | if (negative > 0) { 395 | a = posix_memalign((void **)&syn1neg, 128, 396 | (long long)vocab_size * layer1_size * sizeof(real)); 397 | if (syn1neg == NULL) { 398 | printf("Memory allocation failed\n"); 399 | exit(1); 400 | } 401 | for (a = 0; a < vocab_size; a++) 402 | #if HAVE_CBLAS == 1 403 | cblas_scopy(layer1_size, &zero, 0, syn1neg + a * layer1_size, 1); 404 | #else 405 | for (b = 0; b < layer1_size; b++) syn1neg[a * layer1_size + b] = 0; 406 | #endif 407 | } 408 | for (a = 0; a < vocab_size; a++) 409 | for (b = 0; b < layer1_size; b++) { 410 | next_random = next_random * (unsigned long long)25214903917 + 11; 411 | syn0[a * layer1_size + b] = 412 | (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size; 413 | } 414 | CreateBinaryTree(); 415 | } 416 | 417 | void *TrainModelThread(void *id) { 418 | long long a, b, d, cw, word, last_word, sentence_length = 0, 419 | sentence_position = 0; 420 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 421 | long long l1, l2, c, target, label, local_iter = iter; 422 | unsigned long long next_random = (long long)id; 423 | real f, g; 424 | clock_t now; 425 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 426 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 427 | FILE *fi = fopen(train_file, "rb"); 428 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 429 | while (1) { 430 | if (word_count - last_word_count > 10000) { 431 | word_count_actual += word_count - last_word_count; 432 | last_word_count = word_count; 433 | if ((debug_mode > 1)) { 434 | now = clock(); 435 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, 436 | alpha, word_count_actual / (real)(iter * train_words + 1) * 100, 437 | word_count_actual / 438 | ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 439 | fflush(stdout); 440 | } 441 | alpha = starting_alpha * 442 | (1 - word_count_actual / (real)(iter * train_words + 1)); 443 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 444 | } 445 | if (sentence_length == 0) { 446 | while (1) { 447 | word = ReadWordIndex(fi); 448 | if (feof(fi)) break; 449 | if (word == -1) continue; 450 | word_count++; 451 | if (word == 0) break; 452 | // The subsampling randomly discards frequent words while keeping the 453 | // ranking same 454 | if (sample > 0) { 455 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * 456 | (sample * train_words) / 
vocab[word].cn; 457 | next_random = next_random * (unsigned long long)25214903917 + 11; 458 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 459 | } 460 | sen[sentence_length] = word; 461 | sentence_length++; 462 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 463 | } 464 | sentence_position = 0; 465 | } 466 | if (feof(fi) || (word_count > train_words / num_threads)) { 467 | word_count_actual += word_count - last_word_count; 468 | local_iter--; 469 | if (local_iter == 0) break; 470 | word_count = 0; 471 | last_word_count = 0; 472 | sentence_length = 0; 473 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 474 | continue; 475 | } 476 | word = sen[sentence_position]; 477 | if (word == -1) continue; 478 | #if HAVE_CBLAS == 1 479 | cblas_scopy(layer1_size, &zero, 0, neu1, 1); 480 | cblas_scopy(layer1_size, &zero, 0, neu1e, 1); 481 | #else 482 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 483 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 484 | #endif 485 | next_random = next_random * (unsigned long long)25214903917 + 11; 486 | b = next_random % window; 487 | if (cbow) { // train the cbow architecture 488 | // in -> hidden 489 | cw = 0; 490 | for (a = b; a < window * 2 + 1 - b; a++) 491 | if (a != window) { 492 | c = sentence_position - window + a; 493 | if (c < 0) continue; 494 | if (c >= sentence_length) continue; 495 | last_word = sen[c]; 496 | if (last_word == -1) continue; 497 | #if HAVE_CBLAS == 1 498 | cblas_saxpy(layer1_size, 1.0f, syn0 + last_word * layer1_size, 1, 499 | neu1, 1); 500 | #else 501 | for (c = 0; c < layer1_size; c++) 502 | neu1[c] += syn0[c + last_word * layer1_size]; 503 | #endif 504 | cw++; 505 | } 506 | if (cw) { 507 | #if HAVE_CBLAS == 1 508 | cblas_sscal(layer1_size, 1.0f / cw, neu1, 1); 509 | #else 510 | for (c = 0; c < layer1_size; c++) neu1[c] /= cw; 511 | #endif 512 | if (hs) 513 | for (d = 0; d < vocab[word].codelen; d++) { 514 | l2 = vocab[word].point[d] * layer1_size; 515 | #if HAVE_CBLAS == 1 516 | // Propagate hidden -> output 517 | f = cblas_sdot(layer1_size, neu1, 1, syn1 + l2, 1); 518 | #else 519 | // Propagate hidden -> output 520 | f = 0; 521 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 522 | #endif 523 | if (f <= -MAX_EXP) 524 | continue; 525 | else if (f >= MAX_EXP) 526 | continue; 527 | else 528 | f = expTable[(int)((f + MAX_EXP) * 529 | (EXP_TABLE_SIZE / MAX_EXP / 2))]; 530 | // 'g' is the gradient multiplied by the learning rate 531 | g = (1 - vocab[word].code[d] - f) * alpha; 532 | #if HAVE_CBLAS == 1 533 | // Propagate errors output -> hidden 534 | cblas_saxpy(layer1_size, g, syn1 + l2, 1, neu1e, 1); 535 | // Learn weights hidden -> output 536 | cblas_saxpy(layer1_size, g, neu1, 1, syn1 + l2, 1); 537 | #else 538 | // Propagate errors output -> hidden 539 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 540 | // Learn weights hidden -> output 541 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 542 | #endif 543 | } 544 | // NEGATIVE SAMPLING 545 | if (negative > 0) 546 | for (d = 0; d < negative + 1; d++) { 547 | if (d == 0) { 548 | target = word; 549 | label = 1; 550 | } else { 551 | next_random = next_random * (unsigned long long)25214903917 + 11; 552 | target = table[(next_random >> 16) % table_size]; 553 | if (target == word) continue; 554 | label = 0; 555 | } 556 | l2 = target * layer1_size; 557 | #if HAVE_CBLAS == 1 558 | f = cblas_sdot(layer1_size, neu1, 1, syn1neg + l2, 1); 559 | #else 560 | f = 0; 561 | for (c = 0; c < layer1_size; c++) f 
+= neu1[c] * syn1neg[c + l2]; 562 | #endif 563 | if (f > MAX_EXP) 564 | g = (label - 1) * alpha; 565 | else if (f < -MAX_EXP) 566 | g = (label - 0) * alpha; 567 | else 568 | g = (label - expTable[(int)((f + MAX_EXP) * 569 | (EXP_TABLE_SIZE / MAX_EXP / 2))]) * 570 | alpha; 571 | #if HAVE_CBLAS == 1 572 | cblas_saxpy(layer1_size, g, syn1neg + l2, 1, neu1e, 1); 573 | cblas_saxpy(layer1_size, g, neu1, 1, syn1neg + l2, 1); 574 | #else 575 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 576 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 577 | #endif 578 | } 579 | // hidden -> in 580 | for (a = b; a < window * 2 + 1 - b; a++) 581 | if (a != window) { 582 | c = sentence_position - window + a; 583 | if (c < 0) continue; 584 | if (c >= sentence_length) continue; 585 | last_word = sen[c]; 586 | if (last_word == -1) continue; 587 | #if HAVE_CBLAS == 1 588 | cblas_saxpy(layer1_size, 1, neu1e, 1, 589 | syn0 + last_word * layer1_size, 1); 590 | #else 591 | for (c = 0; c < layer1_size; c++) 592 | syn0[c + last_word * layer1_size] += neu1e[c]; 593 | #endif 594 | } 595 | } 596 | } else { // train skip-gram 597 | for (a = b; a < window * 2 + 1 - b; a++) 598 | if (a != window) { 599 | c = sentence_position - window + a; 600 | if (c < 0) continue; 601 | if (c >= sentence_length) continue; 602 | last_word = sen[c]; 603 | if (last_word == -1) continue; 604 | l1 = last_word * layer1_size; 605 | #if HAVE_CBLAS == 1 606 | cblas_scopy(layer1_size, &zero, 0, neu1e, 1); 607 | #else 608 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 609 | #endif 610 | // HIERARCHICAL SOFTMAX 611 | if (hs) 612 | for (d = 0; d < vocab[word].codelen; d++) { 613 | l2 = vocab[word].point[d] * layer1_size; 614 | #if HAVE_CBLAS == 1 615 | // Propagate hidden -> output 616 | f = cblas_sdot(layer1_size, syn0 + l1, 1, syn1 + l2, 1); 617 | #else 618 | // Propagate hidden -> output 619 | f = 0; 620 | for (c = 0; c < layer1_size; c++) 621 | f += syn0[c + l1] * syn1[c + l2]; 622 | #endif 623 | if (f <= -MAX_EXP) 624 | continue; 625 | else if (f >= MAX_EXP) 626 | continue; 627 | else 628 | f = expTable[(int)((f + MAX_EXP) * 629 | (EXP_TABLE_SIZE / MAX_EXP / 2))]; 630 | // 'g' is the gradient multiplied by the learning rate 631 | g = (1 - vocab[word].code[d] - f) * alpha; 632 | #if HAVE_CBLAS == 1 633 | // Propagate errors output -> hidden 634 | cblas_saxpy(layer1_size, g, syn1 + l2, 1, neu1e, 1); 635 | // Learn weights hidden -> output 636 | cblas_saxpy(layer1_size, g, syn0 + l1, 1, syn1 + l2, 1); 637 | #else 638 | // Propagate errors output -> hidden 639 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 640 | // Learn weights hidden -> output 641 | for (c = 0; c < layer1_size; c++) 642 | syn1[c + l2] += g * syn0[c + l1]; 643 | #endif 644 | } 645 | // NEGATIVE SAMPLING 646 | if (negative > 0) 647 | for (d = 0; d < negative + 1; d++) { 648 | if (d == 0) { 649 | target = word; 650 | label = 1; 651 | } else { 652 | next_random = 653 | next_random * (unsigned long long)25214903917 + 11; 654 | target = table[(next_random >> 16) % table_size]; 655 | if (target == word) continue; 656 | label = 0; 657 | } 658 | l2 = target * layer1_size; 659 | #if HAVE_CBLAS == 1 660 | f = cblas_sdot(layer1_size, syn0 + l1, 1, syn1neg + l2, 1); 661 | #else 662 | f = 0; 663 | for (c = 0; c < layer1_size; c++) 664 | f += syn0[c + l1] * syn1neg[c + l2]; 665 | #endif 666 | if (f > MAX_EXP) 667 | g = (label - 1) * alpha; 668 | else if (f < -MAX_EXP) 669 | g = (label - 0) * alpha; 670 | else 671 | g = (label - 
expTable[(int)((f + MAX_EXP) * 672 | (EXP_TABLE_SIZE / MAX_EXP / 2))]) * 673 | alpha; 674 | #if HAVE_CBLAS == 1 675 | cblas_saxpy(layer1_size, g, syn1neg + l2, 1, neu1e, 1); 676 | cblas_saxpy(layer1_size, g, syn0 + l1, 1, syn1neg + l2, 1); 677 | #else 678 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 679 | for (c = 0; c < layer1_size; c++) 680 | syn1neg[c + l2] += g * syn0[c + l1]; 681 | #endif 682 | } 683 | #if HAVE_CBLAS == 1 684 | // Learn weights input -> hidden 685 | cblas_saxpy(layer1_size, 1, neu1e, 1, syn0 + l1, 1); 686 | #else 687 | // Learn weights input -> hidden 688 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 689 | #endif 690 | } 691 | } 692 | sentence_position++; 693 | if (sentence_position >= sentence_length) { 694 | sentence_length = 0; 695 | continue; 696 | } 697 | } 698 | fclose(fi); 699 | free(neu1); 700 | free(neu1e); 701 | pthread_exit(NULL); 702 | } 703 | 704 | void TrainModel() { 705 | long a, b, c, d; 706 | FILE *fo; 707 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 708 | printf("Starting training using file %s\n", train_file); 709 | starting_alpha = alpha; 710 | if (read_vocab_file[0] != 0) 711 | ReadVocab(); 712 | else 713 | LearnVocabFromTrainFile(); 714 | if (save_vocab_file[0] != 0) SaveVocab(); 715 | if (output_file[0] == 0) return; 716 | InitNet(); 717 | if (negative > 0) InitUnigramTable(); 718 | start = clock(); 719 | for (a = 0; a < num_threads; a++) 720 | pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 721 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 722 | fo = fopen(output_file, "wb"); 723 | if (classes == 0) { 724 | // Save the word vectors 725 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 726 | for (a = 0; a < vocab_size; a++) { 727 | fprintf(fo, "%s ", vocab[a].word); 728 | if (binary) 729 | for (b = 0; b < layer1_size; b++) 730 | fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 731 | else 732 | for (b = 0; b < layer1_size; b++) 733 | fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 734 | fprintf(fo, "\n"); 735 | } 736 | } else { 737 | // Run K-means on the word vectors 738 | int clcn = classes, iter = 10, closeid; 739 | int *centcn = (int *)malloc(classes * sizeof(int)); 740 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 741 | real closev, x; 742 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 743 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 744 | for (a = 0; a < iter; a++) { 745 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 746 | for (b = 0; b < clcn; b++) centcn[b] = 1; 747 | for (c = 0; c < vocab_size; c++) { 748 | for (d = 0; d < layer1_size; d++) 749 | cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 750 | centcn[cl[c]]++; 751 | } 752 | for (b = 0; b < clcn; b++) { 753 | closev = 0; 754 | for (c = 0; c < layer1_size; c++) { 755 | cent[layer1_size * b + c] /= centcn[b]; 756 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 757 | } 758 | closev = sqrt(closev); 759 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 760 | } 761 | for (c = 0; c < vocab_size; c++) { 762 | closev = -10; 763 | closeid = 0; 764 | for (d = 0; d < clcn; d++) { 765 | x = 0; 766 | for (b = 0; b < layer1_size; b++) 767 | x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 768 | if (x > closev) { 769 | closev = x; 770 | closeid = d; 771 | } 772 | } 773 | cl[c] = closeid; 774 | } 775 | } 776 | // Save the K-means classes 777 | for (a = 0; a < vocab_size; a++) 778 | 
fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 779 | free(centcn); 780 | free(cent); 781 | free(cl); 782 | } 783 | fclose(fo); 784 | } 785 | 786 | int ArgPos(char *str, int argc, char **argv) { 787 | int a; 788 | for (a = 1; a < argc; a++) 789 | if (!strcmp(str, argv[a])) { 790 | if (a == argc - 1) { 791 | printf("Argument missing for %s\n", str); 792 | exit(1); 793 | } 794 | return a; 795 | } 796 | return -1; 797 | } 798 | 799 | int main(int argc, char **argv) { 800 | int i; 801 | if (argc == 1) { 802 | printf("WORD VECTOR estimation toolkit v 0.1c\n\n"); 803 | printf("Options:\n"); 804 | printf("Parameters for training:\n"); 805 | printf("\t-train \n"); 806 | printf("\t\tUse text data from to train the model\n"); 807 | printf("\t-output \n"); 808 | printf( 809 | "\t\tUse to save the resulting word vectors / word clusters\n"); 810 | printf("\t-size \n"); 811 | printf("\t\tSet size of word vectors; default is 100\n"); 812 | printf("\t-window \n"); 813 | printf("\t\tSet max skip length between words; default is 5\n"); 814 | printf("\t-sample \n"); 815 | printf( 816 | "\t\tSet threshold for occurrence of words. Those that appear with " 817 | "higher frequency in the training data\n"); 818 | printf( 819 | "\t\twill be randomly down-sampled; default is 1e-3, useful range is " 820 | "(0, 1e-5)\n"); 821 | printf("\t-hs \n"); 822 | printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n"); 823 | printf("\t-negative \n"); 824 | printf( 825 | "\t\tNumber of negative examples; default is 5, common values are 3 - " 826 | "10 (0 = not used)\n"); 827 | printf("\t-threads \n"); 828 | printf("\t\tUse threads (default 12)\n"); 829 | printf("\t-iter \n"); 830 | printf("\t\tRun more training iterations (default 5)\n"); 831 | printf("\t-min-count \n"); 832 | printf( 833 | "\t\tThis will discard words that appear less than times; " 834 | "default is 5\n"); 835 | printf("\t-alpha \n"); 836 | printf( 837 | "\t\tSet the starting learning rate; default is 0.025 for skip-gram " 838 | "and 0.05 for CBOW\n"); 839 | printf("\t-classes \n"); 840 | printf( 841 | "\t\tOutput word classes rather than word vectors; default number of " 842 | "classes is 0 (vectors are written)\n"); 843 | printf("\t-debug \n"); 844 | printf( 845 | "\t\tSet the debug mode (default = 2 = more info during training)\n"); 846 | printf("\t-binary \n"); 847 | printf( 848 | "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); 849 | printf("\t-save-vocab \n"); 850 | printf("\t\tThe vocabulary will be saved to \n"); 851 | printf("\t-read-vocab \n"); 852 | printf( 853 | "\t\tThe vocabulary will be read from , not constructed from the " 854 | "training data\n"); 855 | printf("\t-cbow \n"); 856 | printf( 857 | "\t\tUse the continuous bag of words model; default is 1 (use 0 for " 858 | "skip-gram model)\n"); 859 | printf("\nExamples:\n"); 860 | printf( 861 | "./word2vec -train data.txt -output vec.txt -size 200 -window 5 " 862 | "-sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n"); 863 | return 0; 864 | } 865 | output_file[0] = 0; 866 | save_vocab_file[0] = 0; 867 | read_vocab_file[0] = 0; 868 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) 869 | layer1_size = atoi(argv[i + 1]); 870 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) 871 | strcpy(train_file, argv[i + 1]); 872 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) 873 | strcpy(save_vocab_file, argv[i + 1]); 874 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) 875 | strcpy(read_vocab_file, argv[i + 1]); 876 | if ((i = ArgPos((char 
*)"-debug", argc, argv)) > 0) 877 | debug_mode = atoi(argv[i + 1]); 878 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) 879 | binary = atoi(argv[i + 1]); 880 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 881 | if (cbow) alpha = 0.05; 882 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 883 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) 884 | strcpy(output_file, argv[i + 1]); 885 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) 886 | window = atoi(argv[i + 1]); 887 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) 888 | sample = atof(argv[i + 1]); 889 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 890 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) 891 | negative = atoi(argv[i + 1]); 892 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) 893 | num_threads = atoi(argv[i + 1]); 894 | if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]); 895 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) 896 | min_count = atoi(argv[i + 1]); 897 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) 898 | classes = atoi(argv[i + 1]); 899 | vocab = 900 | (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 901 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 902 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 903 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 904 | expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * 905 | MAX_EXP); // Precompute the exp() table 906 | expTable[i] = 907 | expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) 908 | } 909 | TrainModel(); 910 | return 0; 911 | } 912 | --------------------------------------------------------------------------------