├── word2vec ├── tests │ └── __init__.py ├── utils.py ├── __init__.py ├── wordclusters.py ├── io.py ├── scripts_interface.py └── wordvectors.py ├── MANIFEST.in ├── .gitignore ├── word2vec-c ├── demos │ ├── demo-classes.sh │ ├── demo-word.sh │ ├── demo-word-accuracy.sh │ ├── demo-analogy.sh │ ├── demo-phrases.sh │ └── demo-phrase-accuracy.sh ├── makefile ├── distance.c ├── word-analogy.c ├── compute-accuracy.c ├── word2phrase.c └── word2vec.c ├── setup.py ├── README.md ├── examples ├── demo-clusters.ipynb └── path.ipynb └── LICENSE.txt /word2vec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include word2vec-c/makefile 3 | recursive-include word2vec-c *.c 4 | prune bin -------------------------------------------------------------------------------- /word2vec/utils.py: -------------------------------------------------------------------------------- 1 | from numpy import linalg as LA 2 | 3 | 4 | def unitvec(vec): 5 | return (1.0 / LA.norm(vec, ord=2)) * vec 6 | -------------------------------------------------------------------------------- /word2vec/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from io import * 3 | from wordvectors import * 4 | from wordclusters import * 5 | from scripts_interface import * 6 | -------------------------------------------------------------------------------- /word2vec/wordclusters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class WordClusters(object): 5 | 6 | def __init__(self, vocab, clusters): 7 | self.vocab = vocab 8 | self.clusters = clusters 9 | 10 | def __getitem__(self, cluster): 11 | return self.vocab[self.clusters == cluster] 12 | 13 | @classmethod 14 | def from_text(cls, fname): 15 | vocab = np.genfromtxt(fname, dtype=np.object, delimiter=' ', usecols=0) 16 | clusters = np.genfromtxt(fname, dtype=int, delimiter=' ', usecols=1) 17 | return cls(vocab=vocab, clusters=clusters) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | MANIFEST 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | 39 | # IPython Notebook 40 | .ipynb_checkpoints 41 | 42 | examples/text8 43 | examples/*.vec 44 | examples/*.clusters 45 | examples/*.gz 46 | examples/*.txt 47 | examples/*.bin 48 | examples/data/* 49 | -------------------------------------------------------------------------------- /word2vec/io.py: -------------------------------------------------------------------------------- 1 | import word2vec 2 | import numpy as np 3 | 4 | 5 | def load(fname, kind='bin', save_memory=True): 6 | ''' 7 | Loads a word vectors file 8 | ''' 9 | if kind == 'bin': 10 | return word2vec.WordVectors.from_binary(fname, save_memory=save_memory) 11 | 
elif kind == 'txt':
12 |         return word2vec.WordVectors.from_text(fname, save_memory=save_memory)
13 |     elif kind == 'mmap':
14 |         return word2vec.WordVectors.from_mmap(fname)
15 |     else:
16 |         raise Exception('Unknown kind')
17 | 
18 | 
19 | def load_clusters(fname):
20 |     '''
21 |     Loads a word cluster file
22 |     '''
23 |     return word2vec.WordClusters.from_text(fname)
24 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-classes.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | SRC_DIR=../src
3 | BIN_DIR=../bin
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | CLASSES_DATA=$DATA_DIR/classes.txt
7 | 
8 | pushd ${SRC_DIR} && make; popd
9 | 
10 | 
11 | if [ ! -e $CLASSES_DATA ]; then
12 | 
13 |   if [ ! -e $TEXT_DATA ]; then
14 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
15 |     gzip -d $DATA_DIR/text8.gz -f
16 |   fi
17 |   echo -----------------------------------------------------------------------------------------------------
18 |   echo -- Training vectors...
19 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $CLASSES_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
20 | 
21 | fi
22 | 
23 | sort $CLASSES_DATA -k 2 -n > $DATA_DIR/classes.sorted.txt
24 | echo The word classes were saved to file $DATA_DIR/classes.sorted.txt
25 | 
--------------------------------------------------------------------------------
/word2vec-c/makefile:
--------------------------------------------------------------------------------
1 | SCRIPTS_DIR=../scripts
2 | BIN_DIR=../bin
3 | 
4 | CC = gcc
5 | # -Ofast might not work with older versions of gcc; -O2 is used here instead
6 | CFLAGS = -lm -pthread -O2 -Wall -funroll-loops
7 | 
8 | all: word2vec word2phrase w2v-distance w2v-word-analogy w2v-compute-accuracy
9 | 
10 | word2vec : word2vec.c
11 | 	$(CC) word2vec.c -o ${BIN_DIR}/word2vec $(CFLAGS)
12 | word2phrase : word2phrase.c
13 | 	$(CC) word2phrase.c -o ${BIN_DIR}/word2phrase $(CFLAGS)
14 | w2v-distance : distance.c
15 | 	$(CC) distance.c -o ${BIN_DIR}/w2v-distance $(CFLAGS)
16 | w2v-word-analogy : word-analogy.c
17 | 	$(CC) word-analogy.c -o ${BIN_DIR}/w2v-word-analogy $(CFLAGS)
18 | w2v-compute-accuracy : compute-accuracy.c
19 | 	$(CC) compute-accuracy.c -o ${BIN_DIR}/w2v-compute-accuracy $(CFLAGS)
20 | 
21 | clean:
22 | 	cd ${BIN_DIR} && rm -f word2vec word2phrase w2v-distance w2v-word-analogy w2v-compute-accuracy
23 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-word.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | VECTOR_DATA=$DATA_DIR/text8-vector.bin
7 | 
8 | # To train on a custom corpus instead, uncomment and point these at it:
9 | # TEXT_DATA=$DATA_DIR/fashion_blogs.txt; VECTOR_DATA=$DATA_DIR/fashion_blogs.bin
10 | 
11 | pushd ${SRC_DIR} && make; popd
12 | 
13 | if [ ! -e $VECTOR_DATA ]; then
14 | 
15 |   if [ ! -e $TEXT_DATA ]; then
16 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
17 |     gzip -d $DATA_DIR/text8.gz -f
18 |   fi
19 |   echo -----------------------------------------------------------------------------------------------------
20 |   echo -- Training vectors...
21 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
22 | 
23 | fi
24 | 
25 | echo -----------------------------------------------------------------------------------------------------
26 | echo -- distance...
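# w2v-distance is interactive: it reads a word or sentence from stdin and
# prints the 40 nearest neighbors by cosine similarity (type EXIT to quit).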
27 | 
28 | $BIN_DIR/w2v-distance $VECTOR_DATA
29 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from distutils.core import setup
4 | 
5 | '''
6 | To update to a new version:
7 |     1. change version
8 |     2. python setup.py sdist upload
9 | '''
10 | 
11 | DESCRIPTION = 'Google word2vec python wrapper'
12 | 
13 | directory = 'bin'
14 | if not os.path.exists(directory):
15 |     os.makedirs(directory)
16 | 
17 | subprocess.call(['make', '-C', 'word2vec-c'])
18 | 
19 | setup(
20 |     name='word2vec',
21 |     version='0.5.1',
22 |     maintainer='Daniel Rodriguez',
23 |     maintainer_email='df.rodriguez143@gmail.com',
24 |     url='https://github.com/danielfrg/word2vec',
25 |     packages=['word2vec'],
26 |     description=DESCRIPTION,
27 |     license='Apache License Version 2.0, January 2004',
28 |     data_files=[('bin', ['bin/word2vec', 'bin/word2phrase', 'bin/w2v-distance',
29 |                          'bin/w2v-word-analogy', 'bin/w2v-compute-accuracy'])],
30 |     install_requires=[
31 |         'numpy>=1.7.1'
32 |     ],
33 | )
34 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-word-accuracy.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | VECTOR_DATA=$DATA_DIR/text8-vector.bin
7 | 
8 | pushd ${SRC_DIR} && make; popd
9 | 
10 | 
11 | if [ ! -e $VECTOR_DATA ]; then
12 | 
13 |   if [ ! -e $TEXT_DATA ]; then
14 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
15 |     gzip -d $DATA_DIR/text8.gz -f
16 |   fi
17 |   echo -----------------------------------------------------------------------------------------------------
18 |   echo -- Training vectors...
19 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
20 | 
21 | fi
22 | 
23 | echo -----------------------------------------------------------------------------------------------------
24 | echo -- accuracy...
25 | 
26 | # to compute accuracy with the full vocabulary, use: w2v-compute-accuracy $VECTOR_DATA < $DATA_DIR/questions-words.txt
27 | set -x
28 | $BIN_DIR/w2v-compute-accuracy $VECTOR_DATA 30000 < $DATA_DIR/questions-words.txt
29 | 
30 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-analogy.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | VECTOR_DATA=$DATA_DIR/text8-vector.bin
7 | 
8 | pushd ${SRC_DIR} && make; popd
9 | 
10 | echo -----------------------------------------------------------------------------------------------------
11 | echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
12 | echo Example input: paris france berlin
13 | echo -----------------------------------------------------------------------------------------------------
14 | 
15 | if [ ! -e $VECTOR_DATA ]; then
16 | 
17 |   if [ ! -e $TEXT_DATA ]; then
18 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
19 |     gzip -d $DATA_DIR/text8.gz -f
20 |   fi
21 |   echo -----------------------------------------------------------------------------------------------------
22 |   echo -- Training vectors...
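# A quick gloss of the training flags used below: -cbow 0 selects the skip-gram
# model, -size 200 is the vector dimensionality, -window 5 the context window,
# -hs 1 enables hierarchical softmax (-negative 0 turns negative sampling off),
# -sample 1e-3 is the subsampling threshold for frequent words, and -binary 1
# writes the vectors in the binary format the query tools expect.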
23 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
24 | 
25 | fi
26 | 
27 | echo -----------------------------------------------------------------------------------------------------
28 | echo -- analogy...
29 | 
30 | $BIN_DIR/w2v-word-analogy $VECTOR_DATA
31 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-phrases.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | PHRASES_DATA=$DATA_DIR/text8-phrases
7 | PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin
8 | 
9 | pushd ${SRC_DIR} && make; popd
10 | 
11 | if [ ! -e $PHRASES_VECTOR_DATA ]; then
12 | 
13 |   if [ ! -e $PHRASES_DATA ]; then
14 | 
15 |     if [ ! -e $TEXT_DATA ]; then
16 |       wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
17 |       gzip -d $DATA_DIR/text8.gz -f
18 |     fi
19 |     echo -----------------------------------------------------------------------------------------------------
20 |     echo -- Creating phrases...
21 |     time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2
22 | 
23 |   fi
24 | 
25 |   echo -----------------------------------------------------------------------------------------------------
26 |   echo -- Training vectors from phrases...
27 |   time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
28 | 
29 | fi
30 | 
31 | echo -----------------------------------------------------------------------------------------------------
32 | echo -- distance...
33 | 
34 | $BIN_DIR/w2v-distance $PHRASES_VECTOR_DATA
35 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | word2vec
2 | ========
3 | 
4 | Python interface to Google word2vec.
5 | 
6 | Training is done using the original C code plus some patches; the rest of the functionality is pure Python + numpy.
7 | 
8 | ## Installation
9 | 
10 | `pip install word2vec`
11 | 
12 | I highly recommend using the [Anaconda python distribution](http://continuum.io/downloads)
13 | 
14 | ## Usage
15 | 
16 | The easiest way to get started is to look at these examples:
17 | [word2vec](http://nbviewer.ipython.org/urls/raw.github.com/danielfrg/word2vec/master/examples/demo-word.ipynb)
18 | and
19 | [word clusters](http://nbviewer.ipython.org/urls/raw.github.com/danielfrg/word2vec/master/examples/demo-clusters.ipynb)
20 | 
21 | The default functionality from word2vec is also available from the command line:
22 | - word2vec
23 | - word2phrase
24 | - w2v-distance
25 | - w2v-word-analogy
26 | - w2v-compute-accuracy
27 | 
28 | ## Issues
29 | 
30 | Some people have reported that on OS X they needed the fix described here:
31 | [http://stackoverflow.com/questions/15590169/ld-library-not-found-for-lgfortran-mac-symlink-issue](http://stackoverflow.com/questions/15590169/ld-library-not-found-for-lgfortran-mac-symlink-issue)
32 | 
33 | ## Development
34 | 
35 | 1. create a `bin` directory
36 | 2. run the make file inside `word2vec-c`
37 | 3. 
add the new `bin` directory to your `PATH`
38 | 
39 | Tests require `py.test`
40 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-phrase-accuracy.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | PHRASES_DATA=$DATA_DIR/text8-phrases
7 | PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin
8 | 
9 | echo ----------------------------------------------------------------------------------------------------------------
10 | echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
11 | echo To achieve better accuracy, a larger training set is needed
12 | echo ----------------------------------------------------------------------------------------------------------------
13 | 
14 | pushd ${SRC_DIR} && make; popd
15 | 
16 | if [ ! -e $PHRASES_VECTOR_DATA ]; then
17 | 
18 |   if [ ! -e $PHRASES_DATA ]; then
19 | 
20 |     if [ ! -e $TEXT_DATA ]; then
21 |       wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
22 |       gzip -d $DATA_DIR/text8.gz -f
23 |     fi
24 |     echo -----------------------------------------------------------------------------------------------------
25 |     echo -- Creating phrases...
26 |     time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2 -min-count 3
27 | 
28 |   fi
29 | 
30 |   echo -----------------------------------------------------------------------------------------------------
31 |   echo -- Training vectors from phrases...
32 |   time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
33 | 
34 | fi
35 | 
36 | echo -----------------------------------------------------------------------------------------------------
37 | echo -- accuracy...
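# questions-phrases.txt is the phrase analogy test set shipped with the original
# Google word2vec distribution; it is assumed to already be present in $DATA_DIR.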
38 | 
39 | $BIN_DIR/w2v-compute-accuracy $PHRASES_VECTOR_DATA < $DATA_DIR/questions-phrases.txt
40 | 
--------------------------------------------------------------------------------
/word2vec/scripts_interface.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import subprocess
4 | 
5 | realpath = os.path.dirname(os.path.realpath(__file__))
6 | datadir = "../bin"
7 | 
8 | def word2vec(train, output, size=100, window=5, sample=0, hs=1, negative=0, threads=4,
9 |              min_count=5, alpha=0.025, debug=2, binary=1, cbow=0,
10 |              save_vocab=None, read_vocab=None, verbose=False):
11 |     process = [os.path.join(realpath, datadir, 'word2vec')]
12 |     args = ['-train', '-output', '-size', '-window', '-sample', '-hs', '-negative', '-threads',
13 |             '-min-count', '-alpha', '-debug', '-binary', '-cbow']
14 |     values = [train, output, size, window, sample, hs, negative, threads,
15 |               min_count, alpha, debug, binary, cbow]
16 |     for arg, value in zip(args, values):
17 |         process.append(arg)
18 |         process.append(str(value))
19 |     if save_vocab is not None:
20 |         process.append('-save-vocab')
21 |         process.append(str(save_vocab))
22 |     if read_vocab is not None:
23 |         process.append('-read-vocab')
24 |         process.append(str(read_vocab))
25 | 
26 |     proc = subprocess.Popen(process, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
27 | 
28 |     if verbose:
29 |         for line in proc.stdout:
30 |             sys.stdout.write(line)
31 |             sys.stdout.flush()
32 | 
33 |     out, err = proc.communicate()
34 |     if 'ERROR:' in out:
35 |         raise Exception(out)
36 | 
37 | 
38 | def word2clusters(train, output, classes, size=100, window=5, sample=0, hs=1, negative=0, threads=4,
39 |                   min_count=5, alpha=0.025, debug=2, binary=0, cbow=0,
40 |                   save_vocab=None, read_vocab=None, verbose=False):
41 |     process = [os.path.join(realpath, datadir, 'word2vec')]
42 |     args = ['-train', '-output', '-size', '-window', '-sample', '-hs', '-negative', '-threads',
43 |             '-min-count', '-alpha', '-classes', '-debug', '-binary', '-cbow']
44 |     values = [train, output, size, window, sample, hs, negative, threads,
45 |               min_count, alpha, classes, debug, binary, cbow]
46 |     for arg, value in zip(args, values):
47 |         process.append(arg)
48 |         process.append(str(value))
49 |     if save_vocab is not None:
50 |         process.append('-save-vocab')
51 |         process.append(str(save_vocab))
52 |     if read_vocab is not None:
53 |         process.append('-read-vocab')
54 |         process.append(str(read_vocab))
55 | 
56 |     proc = subprocess.Popen(process, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
57 | 
58 |     if verbose:
59 |         for line in proc.stdout:
60 |             sys.stdout.write(line)
61 |             sys.stdout.flush()
62 | 
63 |     out, err = proc.communicate()
64 |     if 'ERROR:' in out:
65 |         raise Exception(out)
66 | 
67 | 
68 | def word2phrase(train, output, min_count=5, threshold=100, debug=2, verbose=False):
69 |     process = [os.path.join(realpath, datadir, 'word2phrase')]
70 |     args = ['-train', '-output', '-min-count', '-threshold', '-debug']
71 |     values = [train, output, min_count, threshold, debug]
72 |     for arg, value in zip(args, values):
73 |         process.append(arg)
74 |         process.append(str(value))
75 | 
76 |     proc = subprocess.Popen(process, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
77 | 
78 |     if verbose:
79 |         for line in proc.stdout:
80 |             sys.stdout.write(line)
81 |             sys.stdout.flush()
82 | 
83 |     out, err = proc.communicate()
84 |     if 'ERROR:' in out:
85 |         raise Exception(out)
86 | 
--------------------------------------------------------------------------------
/examples/demo-clusters.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is equivalent to `demo-classes.sh` from Google." 15 | ] 16 | }, 17 | { 18 | "cell_type": "heading", 19 | "level": 3, 20 | "metadata": {}, 21 | "source": [ 22 | "Grab some data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Download some data, for example: [http://mattmahoney.net/dc/text8.zip](http://mattmahoney.net/dc/text8.zip)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "collapsed": false, 35 | "input": [ 36 | "!wget http://mattmahoney.net/dc/text8.zip -O text8.gz" 37 | ], 38 | "language": "python", 39 | "metadata": {}, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "collapsed": false, 45 | "input": [ 46 | "!gzip -d text8.gz -f" 47 | ], 48 | "language": "python", 49 | "metadata": {}, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "heading", 54 | "level": 3, 55 | "metadata": {}, 56 | "source": [ 57 | "Training" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "import word2vec" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [], 69 | "prompt_number": 1 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Note that this will take a long time depending on the parameters" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "collapsed": false, 81 | "input": [ 82 | "word2vec.word2clusters('data/text8', 'data/text8.clusters', 100)" 83 | ], 84 | "language": "python", 85 | "metadata": {}, 86 | "outputs": [], 87 | "prompt_number": 2 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "That generated a `text8.clusters` cluster containing the word clusters." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "heading", 98 | "level": 3, 99 | "metadata": {}, 100 | "source": [ 101 | "Predictions" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "collapsed": false, 107 | "input": [ 108 | "import word2vec" 109 | ], 110 | "language": "python", 111 | "metadata": {}, 112 | "outputs": [], 113 | "prompt_number": 1 114 | }, 115 | { 116 | "cell_type": "code", 117 | "collapsed": false, 118 | "input": [ 119 | "clusters = word2vec.load_clusters('data/text8.clusters')" 120 | ], 121 | "language": "python", 122 | "metadata": {}, 123 | "outputs": [], 124 | "prompt_number": 2 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "We can see all the words grouped on an specific cluster" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "collapsed": false, 136 | "input": [ 137 | "clusters[20]" 138 | ], 139 | "language": "python", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "metadata": {}, 144 | "output_type": "pyout", 145 | "prompt_number": 3, 146 | "text": [ 147 | "array(['ways', 'manner', 'treated', ..., 'summarise', 'disproof',\n", 148 | " 'discourteous'], dtype=object)" 149 | ] 150 | } 151 | ], 152 | "prompt_number": 3 153 | } 154 | ], 155 | "metadata": {} 156 | } 157 | ] 158 | } -------------------------------------------------------------------------------- /examples/path.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:461b801e463dea39cdf372462e71ac5f6d27f436e2764a8bf7f339c09d059c40" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import word2vec\n", 16 | "import numpy as np\n", 17 | "import networkx as nx" 18 | ], 19 | "language": "python", 20 | "metadata": {}, 21 | "outputs": [], 22 | "prompt_number": 1 23 | }, 24 | { 25 | "cell_type": "code", 26 | "collapsed": false, 27 | "input": [ 28 | "model = word2vec.load('data/text8.bin')" 29 | ], 30 | "language": "python", 31 | "metadata": {}, 32 | "outputs": [], 33 | "prompt_number": 2 34 | }, 35 | { 36 | "cell_type": "code", 37 | "collapsed": false, 38 | "input": [ 39 | "vocab_size = model.vocab.shape[0]" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [], 44 | "prompt_number": 3 45 | }, 46 | { 47 | "cell_type": "code", 48 | "collapsed": false, 49 | "input": [ 50 | "vocab_size" 51 | ], 52 | "language": "python", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "metadata": {}, 57 | "output_type": "pyout", 58 | "prompt_number": 4, 59 | "text": [ 60 | "71291" 61 | ] 62 | } 63 | ], 64 | "prompt_number": 4 65 | }, 66 | { 67 | "cell_type": "code", 68 | "collapsed": false, 69 | "input": [ 70 | "dists = np.zeros((vocab_size, vocab_size))" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [], 75 | "prompt_number": 5 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "n = 10" 82 | ], 83 | "language": "python", 84 | "metadata": {}, 85 | "outputs": [], 86 | "prompt_number": 6 87 | }, 88 | { 89 | "cell_type": "code", 90 | "collapsed": false, 91 | "input": [ 92 | "%%timeit -n1 -r1\n", 93 | "for ix, word in enumerate(model.vocab[:1000]):\n", 94 | " metrics = np.dot(model.l2norm, model.get_vector(word))\n", 95 | " best = np.argsort(metrics)[::-1][1:n + 1]\n", 96 | " dists[ix, best] = metrics[best]" 97 | ], 98 | "language": "python", 99 | "metadata": {}, 100 | 
"outputs": [ 101 | { 102 | "output_type": "stream", 103 | "stream": "stdout", 104 | "text": [ 105 | "1 loops, best of 1: 15.9 s per loop\n" 106 | ] 107 | } 108 | ], 109 | "prompt_number": 7 110 | }, 111 | { 112 | "cell_type": "code", 113 | "collapsed": false, 114 | "input": [ 115 | "model.cosine('dog')" 116 | ], 117 | "language": "python", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "metadata": {}, 122 | "output_type": "pyout", 123 | "prompt_number": 52, 124 | "text": [ 125 | "{'dog': [('catahoula', 0.55066107867304437),\n", 126 | " ('shepherd', 0.54683936460837557),\n", 127 | " ('azawakh', 0.54571415535760059),\n", 128 | " ('haired', 0.53117246665323714),\n", 129 | " ('pyrenean', 0.52469939821586631),\n", 130 | " ('jindo', 0.51431665685657291),\n", 131 | " ('ass', 0.51208124629656004),\n", 132 | " ('mastiff', 0.51106035997299537),\n", 133 | " ('terrier', 0.50580184597076816),\n", 134 | " ('hairless', 0.50463618449995062)]}" 135 | ] 136 | } 137 | ], 138 | "prompt_number": 52 139 | }, 140 | { 141 | "cell_type": "code", 142 | "collapsed": false, 143 | "input": [ 144 | "dists[model.ix('dog'), model.ix('catahoula')]" 145 | ], 146 | "language": "python", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "metadata": {}, 151 | "output_type": "pyout", 152 | "prompt_number": 53, 153 | "text": [ 154 | "0.55066107867304437" 155 | ] 156 | } 157 | ], 158 | "prompt_number": 53 159 | }, 160 | { 161 | "cell_type": "code", 162 | "collapsed": false, 163 | "input": [ 164 | "dists.size" 165 | ], 166 | "language": "python", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "metadata": {}, 171 | "output_type": "pyout", 172 | "prompt_number": 54, 173 | "text": [ 174 | "5082406681" 175 | ] 176 | } 177 | ], 178 | "prompt_number": 54 179 | } 180 | ], 181 | "metadata": {} 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /word2vec-c/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>  // mac os x
19 | 
20 | 
21 | const long long max_size = 2000;         // max length of strings
22 | const long long N = 40;                  // number of closest words that will be shown
23 | const long long max_w = 50;              // max length of vocabulary entries
24 | 
25 | int main(int argc, char **argv) {
26 |   FILE *f;
27 |   char st1[max_size];
28 |   char bestw[N][max_size];
29 |   char file_name[max_size], st[100][max_size];
30 |   float dist, len, bestd[N], vec[max_size];
31 |   long long words, size, a, b, c, d, cn, bi[100];
32 |   char ch;
33 |   float *M;
34 |   char *vocab;
35 |   if (argc < 2) {
36 |     printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
37 |     return 0;
38 |   }
39 |   strcpy(file_name, argv[1]);
40 |   f = fopen(file_name, "rb");
41 |   if (f == NULL) {
42 |     printf("Input file not found\n");
43 |     return -1;
44 |   }
45 |   fscanf(f, "%lld", &words);
46 |   fscanf(f, "%lld", &size);
47 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
48 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
49 |   if (M == NULL) {
50 |     printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
51 |     return -1;
52 |   }
53 |   for (b = 0; b < words; b++) {
54 |     fscanf(f, "%s%c", &vocab[b * max_w], &ch);
55 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
56 |     len = 0;
57 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
58 |     len = sqrt(len);
59 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
60 |   }
61 |   fclose(f);
62 |   while (1) {
63 |     for (a = 0; a < N; a++) bestd[a] = 0;
64 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
65 |     printf("Enter word or sentence (EXIT to break): ");
66 |     a = 0;
67 |     while (1) {
68 |       st1[a] = fgetc(stdin);
69 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
70 |         st1[a] = 0;
71 |         break;
72 |       }
73 |       a++;
74 |     }
75 |     if (!strcmp(st1, "EXIT")) break;
76 |     cn = 0;
77 |     b = 0;
78 |     c = 0;
79 |     while (1) {
80 |       st[cn][b] = st1[c];
81 |       b++;
82 |       c++;
83 |       st[cn][b] = 0;
84 |       if (st1[c] == 0) break;
85 |       if (st1[c] == ' ') {
86 |         cn++;
87 |         b = 0;
88 |         c++;
89 |       }
90 |     }
91 |     cn++;
92 |     for (a = 0; a < cn; a++) {
93 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
94 |       if (b == words) b = -1;
95 |       bi[a] = b;
96 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
97 |       if (b == -1) {
98 |         printf("Out of dictionary word!\n");
99 |         break;
100 |       }
101 |     }
102 |     if (b == -1) continue;
103 |     printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
104 |     for (a = 0; a < size; a++) vec[a] = 0;
105 |     for (b = 0; b < cn; b++) {
106 |       if (bi[b] == -1) continue;
107 |       for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
108 |     }
109 |     len = 0;
110 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
111 |     len = sqrt(len);
112 |     for (a = 0; a < size; a++) vec[a] /= len;
113 |     for (a = 0; a < N; a++) bestd[a] = 0;
114 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
115 |     for (c = 0; c < words; c++) {
116 |       a = 0;
117 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
118 |       if (a == 1) continue;
119 |       dist = 0;
120 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
121 |       for (a = 0; a < N; a++) {
122 |         if (dist > bestd[a]) {
123 |           for (d = N - 1; d > a; d--) {
124 |             bestd[d] = bestd[d - 1];
125 |             strcpy(bestw[d], bestw[d - 1]);
126 |           }
127 |           bestd[a] = dist;
128 |           strcpy(bestw[a], &vocab[c * max_w]);
129 |           break;
130 |         }
131 |       }
132 |     }
133 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
134 |   }
135 |   return 0;
136 | }
137 | 
--------------------------------------------------------------------------------
/word2vec-c/word-analogy.c:
--------------------------------------------------------------------------------
1 | //  Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | //  Licensed under the Apache License, Version 2.0 (the "License");
4 | //  you may not use this file except in compliance with the License.
5 | //  You may obtain a copy of the License at
6 | //
7 | //      http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //  Unless required by applicable law or agreed to in writing, software
10 | //  distributed under the License is distributed on an "AS IS" BASIS,
11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //  See the License for the specific language governing permissions and
13 | //  limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>  // mac os x
19 | 
20 | const long long max_size = 2000;         // max length of strings
21 | const long long N = 40;                  // number of closest words that will be shown
22 | const long long max_w = 50;              // max length of vocabulary entries
23 | 
24 | int main(int argc, char **argv) {
25 |   FILE *f;
26 |   char st1[max_size];
27 |   char bestw[N][max_size];
28 |   char file_name[max_size], st[100][max_size];
29 |   float dist, len, bestd[N], vec[max_size];
30 |   long long words, size, a, b, c, d, cn, bi[100];
31 |   char ch;
32 |   float *M;
33 |   char *vocab;
34 |   if (argc < 2) {
35 |     printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
36 |     return 0;
37 |   }
38 |   strcpy(file_name, argv[1]);
39 |   f = fopen(file_name, "rb");
40 |   if (f == NULL) {
41 |     printf("Input file not found\n");
42 |     return -1;
43 |   }
44 |   fscanf(f, "%lld", &words);
45 |   fscanf(f, "%lld", &size);
46 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
47 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
48 |   if (M == NULL) {
49 |     printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
50 |     return -1;
51 |   }
52 |   for (b = 0; b < words; b++) {
53 |     fscanf(f, "%s%c", &vocab[b * max_w], &ch);
54 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
55 |     len = 0;
56 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
57 |     len = sqrt(len);
58 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
59 |   }
60 |   fclose(f);
61 |   while (1) {
62 |     for (a = 0; a < N; a++) bestd[a] = 0;
63 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
64 |     printf("Enter three words (EXIT to break): ");
65 |     a = 0;
66 |     while (1) {
67 |       st1[a] = fgetc(stdin);
68 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
69 |         st1[a] = 0;
70 |         break;
71 |       }
72 |       a++;
73 |     }
74 |     if (!strcmp(st1, "EXIT")) break;
75 |     cn = 0;
76 |     b = 0;
77 |     c = 0;
78 |     while (1) {
79 |       st[cn][b] = st1[c];
80 |       b++;
81 |       c++;
82 |       st[cn][b] = 0;
83 |       if (st1[c] == 0) break;
84 |       if (st1[c] == ' ') {
85 |         cn++;
86 |         b = 0;
87 |         c++;
88 |       }
89 |     }
90 |     cn++;
91 |     if (cn < 3) {
92 |       printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
93 |       continue;
94 |     }
95 |     for (a = 0; a < cn; a++) {
96 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
97 |       if (b == words) b = 0;
98 |       bi[a] = b;
99 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
100 |       if (b == 0) {
101 |         printf("Out of dictionary word!\n");
102 |         break;
103 |       }
104 |     }
105 |     if (b == 0) continue;
106 |     printf("\n                                              Word              Distance\n------------------------------------------------------------------------\n");
107 |     for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
108 |     len = 0;
109 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
110 |     len = sqrt(len);
111 |     for (a = 0; a < size; a++) vec[a] /= len;
112 |     for (a = 0; a < N; a++) bestd[a] = 0;
113 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
114 |     for (c = 0; c < words; c++) {
115 |       if (c == bi[0]) continue;
116 |       if (c == bi[1]) continue;
117 |       if (c == bi[2]) continue;
118 |       a = 0;
119 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
120 |       if (a == 1) continue;
121 |       dist = 0;
122 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
123 |       for (a = 0; a < N; a++) {
124 |         if (dist > bestd[a]) {
125 |           for (d = N - 1; d > a; d--) {
126 |             bestd[d] = bestd[d - 1];
127 |             strcpy(bestw[d], bestw[d - 1]);
128 |           }
129 |           bestd[a] = dist;
130 |           strcpy(bestw[a], &vocab[c * max_w]);
131 |           break;
132 |         }
133 |       }
134 |     }
135 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
136 |   }
137 |   return 0;
138 | }
139 | 
--------------------------------------------------------------------------------
/word2vec-c/compute-accuracy.c:
--------------------------------------------------------------------------------
1 | //  Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | //  Licensed under the Apache License, Version 2.0 (the "License");
4 | //  you may not use this file except in compliance with the License.
5 | //  You may obtain a copy of the License at
6 | //
7 | //      http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //  Unless required by applicable law or agreed to in writing, software
10 | //  distributed under the License is distributed on an "AS IS" BASIS,
11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //  See the License for the specific language governing permissions and
13 | //  limitations under the License.
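// A note on the expected input (inferred from the evaluation loop below): the
// program reads a questions file (questions-words.txt format) from stdin --
// lines of four words "a b c d" meaning "a is to b as c is to d", grouped into
// sections introduced by ": section-name". The counters treat the first five
// sections as semantic and the rest as syntactic, and all words are uppercased
// before lookup.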
14 | 
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <string.h>
18 | #include <math.h>
19 | #include <stdlib.h>  // mac os x
20 | #include <ctype.h>
21 | 
22 | const long long max_size = 2000;         // max length of strings
23 | const long long N = 1;                   // number of closest words
24 | const long long max_w = 50;              // max length of vocabulary entries
25 | 
26 | int main(int argc, char **argv)
27 | {
28 |   FILE *f;
29 |   char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
30 |   float dist, len, bestd[N], vec[max_size];
31 |   long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
32 |   float *M;
33 |   char *vocab;
34 |   int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
35 |   if (argc < 2) {
36 |     printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
37 |     return 0;
38 |   }
39 |   strcpy(file_name, argv[1]);
40 |   if (argc > 2) threshold = atoi(argv[2]);
41 |   f = fopen(file_name, "rb");
42 |   if (f == NULL) {
43 |     printf("Input file not found\n");
44 |     return -1;
45 |   }
46 |   fscanf(f, "%lld", &words);
47 |   if (threshold) if (words > threshold) words = threshold;
48 |   fscanf(f, "%lld", &size);
49 |   vocab = (char *)malloc(words * max_w * sizeof(char));
50 |   M = (float *)malloc(words * size * sizeof(float));
51 |   if (M == NULL) {
52 |     printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
53 |     return -1;
54 |   }
55 |   for (b = 0; b < words; b++) {
56 |     fscanf(f, "%s%c", &vocab[b * max_w], &ch);
57 |     for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]);
58 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
59 |     len = 0;
60 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
61 |     len = sqrt(len);
62 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
63 |   }
64 |   fclose(f);
65 |   TCN = 0;
66 |   while (1) {
67 |     for (a = 0; a < N; a++) bestd[a] = 0;
68 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
69 |     scanf("%s", st1);
70 |     for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]);
71 |     if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) {
72 |       if (TCN == 0) TCN = 1;
73 |       if (QID != 0) {
74 |         printf("ACCURACY TOP1: %.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
75 |         printf("Total accuracy: %.2f %%   Semantic accuracy: %.2f %%   Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
76 |       }
77 |       QID++;
78 |       scanf("%s", st1);
79 |       if (feof(stdin)) break;
80 |       printf("%s:\n", st1);
81 |       TCN = 0;
82 |       CCN = 0;
83 |       continue;
84 |     }
85 |     if (!strcmp(st1, "EXIT")) break;
86 |     scanf("%s", st2);
87 |     for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]);
88 |     scanf("%s", st3);
89 |     for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]);
90 |     scanf("%s", st4);
91 |     for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]);
92 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break;
93 |     b1 = b;
94 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break;
95 |     b2 = b;
96 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break;
97 |     b3 = b;
98 |     for (a = 0; a < N; a++) bestd[a] = 0;
99 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
100 |     TQ++;
101 |     if (b1 == words) continue;
102 |     if (b2 == words) continue;
103 |     if (b3 == words) continue;
104 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break;
105 |     if (b == words) continue;
106 |     for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size];
107 |     TQS++;
108 |     for (c = 0; c < words; c++) {
109 |       if (c == b1) continue;
110 |       if (c == b2) continue;
111 |       if (c == b3) continue;
112 |       dist = 0;
113 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
114 |       for (a = 0; a < N; a++) {
115 |         if (dist > bestd[a]) {
116 |           for (d = N - 1; d > a; d--) {
117 |             bestd[d] = bestd[d - 1];
118 |             strcpy(bestw[d], bestw[d - 1]);
119 |           }
120 |           bestd[a] = dist;
121 |           strcpy(bestw[a], &vocab[c * max_w]);
122 |           break;
123 |         }
124 |       }
125 |     }
126 |     if (!strcmp(st4, bestw[0])) {
127 |       CCN++;
128 |       CACN++;
129 |       if (QID <= 5) SEAC++; else SYAC++;
130 |     }
131 |     if (QID <= 5) SECN++; else SYCN++;
132 |     TCN++;
133 |     TACN++;
134 |   }
135 |   printf("Questions seen / total: %d %d   %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
136 |   return 0;
137 | }
138 | 
--------------------------------------------------------------------------------
/word2vec/wordvectors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | try:
3 |     from sklearn.externals import joblib
4 | except:
5 |     joblib = None
6 | 
7 | from word2vec.utils import unitvec
8 | 
9 | 
10 | class WordVectors(object):
11 | 
12 |     def __init__(self, vocab, vectors=None, l2norm=None, save_memory=True):
13 |         """
14 |         Initialize a WordVectors class based on vocabulary and vectors
15 | 
16 |         This initializer precomputes the l2norm of the vectors
17 | 
18 |         Parameters
19 |         ----------
20 |         vocab : np.array
21 |             1d array with the vocabulary
22 |         vectors : np.array
23 |             2d array with the vectors calculated by word2vec
24 |         l2norm : np.array
25 |             2d array with the calculated l2norm of the vectors
26 |         save_memory : boolean
27 |             if True, keep only the normalized copy; if False, also keep the original vectors in `self.vectors`
28 |         """
29 |         if vectors is None and l2norm is None:
30 |             raise Exception('Need vectors OR l2norm arguments')
31 | 
32 |         self.vocab = vocab
33 | 
34 |         if l2norm is None:
35 |             if not save_memory:
36 |                 self.vectors = vectors
37 |             self.l2norm = np.vstack([unitvec(vec) for vec in vectors])
38 |         else:
39 |             self.l2norm = l2norm
40 | 
41 |     def ix(self, word):
42 |         """
43 |         Returns the index on self.vocab and self.l2norm for `word`
44 |         """
45 |         temp = np.where(self.vocab == word)[0]
46 |         if temp.size == 0:
47 |             raise KeyError('Word not in vocabulary')
48 |         else:
49 |             return temp[0]
50 | 
51 |     def get_vector(self, word):
52 |         """
53 |         Returns the (l2norm) vector for `word` in the vocabulary
54 |         """
55 |         idx = self.ix(word)
56 |         return self.l2norm[idx]
57 | 
58 |     def __getitem__(self, word):
59 |         return self.get_vector(word)
60 | 
61 |     def generate_response(self, indexes, metric):
62 |         """
63 |         Generates a response as a list of tuples based on the indexes
64 |         Each tuple is: (vocab[i], metric[i])
65 |         """
66 |         return [(word, sim) for word, sim in zip(self.vocab[indexes], metric[indexes])]
67 | 
68 |     def cosine(self, words, n=10):
69 |         """
70 |         Cosine similarity.
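        Because the rows of `self.l2norm` and the target vectors are all
        unit length, ranking by this dot product is equivalent to ranking
        by cosine similarity.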
71 | 
72 |         metric = dot(l2norm_of_vectors, l2norm_of_target_vector)
73 |         Uses a precomputed l2norm of the vectors
74 | 
75 |         Parameters
76 |         ----------
77 |         words : string or list of string
78 |             word(s) in the vocabulary to calculate the vectors
79 |         n : int, optional (default 10)
80 |             number of neighbors to return
81 | 
82 |         Returns
83 |         -------
84 |         dict: of list of tuples
85 | 
86 |         Example
87 |         -------
88 |         >>> model.cosine('black', n=2)
89 | 
90 | 
91 |         {'black': [('white', 0.94757425919916516),
92 |                    ('yellow', 0.94640807944950878)]
93 |         }
94 |         """
95 |         if isinstance(words, basestring):
96 |             words = [words]
97 | 
98 |         targets = np.vstack([self.get_vector(word) for word in words])
99 |         metrics = np.dot(self.l2norm, targets.T)
100 | 
101 |         ans = {}
102 |         for col, word in enumerate(words):
103 |             best = np.argsort(metrics[:, col])[::-1][1:n + 1]
104 |             best = self.generate_response(best, metrics[:, col])
105 |             ans[word] = best
106 | 
107 |         return ans
108 | 
109 |     def _cosine(self, word, n=10):
110 |         """
111 |         Test method for cosine distance using `scipy.spatial.distance.cosine`
112 | 
113 |         Note: This method is **a lot** slower than `self.cosine`
114 |         and the results are almost the same; you should be using `self.cosine`
115 | 
116 |         Requires: `__init__(..., save_memory=False)`
117 | 
118 |         Parameters
119 |         ----------
120 |         word : string
121 |             word in the vocabulary to calculate the vectors
122 |         n : int, optional (default 10)
123 |             number of neighbors to return
124 |         """
125 |         from scipy.spatial import distance
126 | 
127 |         target_vec = self[word]
128 |         metric = np.empty(self.vocab.shape)
129 |         for idx, vector in enumerate(self.vectors):
130 |             metric[idx] = distance.cosine(target_vec, vector)
131 |         best = metric.argsort()[1:n + 1]
132 | 
133 |         return self.generate_response(best, metric)
134 | 
135 |     def analogy(self, pos, neg, n=10):
136 |         """
137 |         Analogy similarity.
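        The query vector is built as the mean of the positive words'
        unit vectors and the negated negative words' unit vectors;
        candidates are then ranked by their dot product with this query.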
138 | 
139 |         Parameters
140 |         ----------
141 |         pos : list
142 |         neg : list
143 | 
144 |         Returns
145 |         -------
146 |         List of tuples, each tuple is  (word, similarity)
147 | 
148 | 
149 |         Example
150 |         -------
151 |         `king - man + woman = queen` will be:
152 |         `pos=['king', 'woman'], neg=['man']`
153 |         """
154 |         words = pos + neg
155 | 
156 |         pos = [(word, 1.0) for word in pos]
157 |         neg = [(word, -1.0) for word in neg]
158 | 
159 |         mean = []
160 |         for word, direction in pos + neg:
161 |             mean.append(direction * unitvec(self.get_vector(word)))
162 |         mean = np.array(mean).mean(axis=0)
163 | 
164 |         similarities = np.dot(self.l2norm, mean)
165 |         best = similarities.argsort()[::-1][1:n + len(words) - 1]
166 |         return self.generate_response(best, similarities)
167 | 
168 |     def to_mmap(self, fname):
169 |         if not joblib:
170 |             raise Exception("sklearn is needed to save as mmap")
171 | 
172 |         joblib.dump(self, fname)
173 | 
174 |     @classmethod
175 |     def from_binary(cls, fname, save_memory=True):
176 |         """
177 |         Create a WordVectors class based on a word2vec binary file
178 | 
179 |         Parameters
180 |         ----------
181 |         fname : path to file
182 |         save_memory : boolean
183 | 
184 |         Returns
185 |         -------
186 |         WordVectors class
187 |         """
188 |         with open(fname) as fin:
189 |             header = fin.readline()
190 |             vocab_size, vector_size = map(int, header.split())
191 |             vocab = []
192 | 
193 |             vectors = np.empty((vocab_size, vector_size), dtype=np.float)
194 |             binary_len = np.dtype(np.float32).itemsize * vector_size
195 |             for line_number in xrange(vocab_size):
196 |                 # mixed text and binary: read text first, then binary
197 |                 word = ''
198 |                 while True:
199 |                     ch = fin.read(1)
200 |                     if ch == ' ':
201 |                         break
202 |                     word += ch
203 |                 vocab.append(word)
204 | 
205 |                 vector = np.fromstring(fin.read(binary_len), np.float32)
206 |                 vectors[line_number] = vector
207 |                 fin.read(1)  # newline
208 |             vocab = np.array(vocab)
209 | 
210 |         return cls(vocab=vocab, vectors=vectors, save_memory=save_memory)
211 | 
212 |     @classmethod
213 |     def from_text(cls, fname, save_memory=True):
214 |         """
215 |         Create a WordVectors class based on a word2vec text file
216 | 
217 |         Parameters
218 |         ----------
219 |         fname : path to file
220 |         save_memory : boolean
221 | 
222 |         Returns
223 |         -------
224 |         WordVectors class
225 |         """
226 |         with open(fname) as f:
227 |             parts = f.readline().strip().split(' ')
228 |             shape = int(parts[0]), int(parts[1])
229 | 
230 |         vocab = np.genfromtxt(fname, dtype=object, delimiter=' ', usecols=0, skip_header=1)
231 | 
232 |         cols = np.arange(1, shape[1] + 1)
233 |         vectors = np.genfromtxt(fname, dtype=float, delimiter=' ', usecols=cols, skip_header=1)
234 | 
235 |         return cls(vocab=vocab, vectors=vectors, save_memory=save_memory)
236 | 
237 |     @classmethod
238 |     def from_mmap(cls, fname):
239 |         """
240 |         Create a WordVectors class from a memory map
241 | 
242 |         Parameters
243 |         ----------
244 |         fname : path to file
245 | 
246 | 
247 |         Returns
248 |         -------
249 |         WordVectors class
250 |         """
251 |         memmaped = joblib.load(fname, mmap_mode='r+')
252 |         return cls(vocab=memmaped.vocab, l2norm=memmaped.l2norm)
253 | 
--------------------------------------------------------------------------------
/word2vec-c/word2phrase.c:
--------------------------------------------------------------------------------
1 | //  Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | //  Licensed under the Apache License, Version 2.0 (the "License");
4 | //  you may not use this file except in compliance with the License.
5 | //  You may obtain a copy of the License at
6 | //
7 | //      http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //  Unless required by applicable law or agreed to in writing, software
10 | //  distributed under the License is distributed on an "AS IS" BASIS,
11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //  See the License for the specific language governing permissions and
13 | //  limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <string.h>
18 | #include <math.h>
19 | #include <pthread.h>
20 | 
21 | #define MAX_STRING 60
22 | 
23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
24 | 
25 | typedef float real;                    // Precision of float numbers
26 | 
27 | struct vocab_word {
28 |   long long cn;
29 |   char *word;
30 | };
31 | 
32 | char train_file[MAX_STRING], output_file[MAX_STRING];
33 | struct vocab_word *vocab;
34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
35 | long long vocab_max_size = 10000, vocab_size = 0;
36 | long long train_words = 0;
37 | real threshold = 100;
38 | 
39 | unsigned long long next_random = 1;
40 | 
41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
42 | void ReadWord(char *word, FILE *fin) {
43 |   int a = 0, ch;
44 |   while (!feof(fin)) {
45 |     ch = fgetc(fin);
46 |     if (ch == 13) continue;
47 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
48 |       if (a > 0) {
49 |         if (ch == '\n') ungetc(ch, fin);
50 |         break;
51 |       }
52 |       if (ch == '\n') {
53 |         strcpy(word, (char *)"</s>");
54 |         return;
55 |       } else continue;
56 |     }
57 |     word[a] = ch;
58 |     a++;
59 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
60 |   }
61 |   word[a] = 0;
62 | }
63 | 
64 | // Returns hash value of a word
65 | int GetWordHash(char *word) {
66 |   unsigned long long a, hash = 1;
67 |   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
68 |   hash = hash % vocab_hash_size;
69 |   return hash;
70 | }
71 | 
72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
73 | int SearchVocab(char *word) {
74 |   unsigned int hash = GetWordHash(word);
75 |   while (1) {
76 |     if (vocab_hash[hash] == -1) return -1;
77 |     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
78 |     hash = (hash + 1) % vocab_hash_size;
79 |   }
80 |   return -1;
81 | }
82 | 
83 | // Reads a word and returns its index in the vocabulary
84 | int ReadWordIndex(FILE *fin) {
85 |   char word[MAX_STRING];
86 |   ReadWord(word, fin);
87 |   if (feof(fin)) return -1;
88 |   return SearchVocab(word);
89 | }
90 | 
91 | // Adds a word to the vocabulary
92 | int AddWordToVocab(char *word) {
93 |   unsigned int hash, length = strlen(word) + 1;
94 |   if (length > MAX_STRING) length = MAX_STRING;
95 |   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
96 |   strcpy(vocab[vocab_size].word, word);
97 |   vocab[vocab_size].cn = 0;
98 |   vocab_size++;
99 |   // Reallocate memory if needed
100 |   if (vocab_size + 2 >= vocab_max_size) {
101 |     vocab_max_size += 10000;
102 |     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
103 |   }
104 |   hash = GetWordHash(word);
105 |   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
106 |   vocab_hash[hash] = vocab_size - 1;
107 |   return vocab_size - 1;
108 | }
109 | 
110 | // Used later for sorting by word counts
111 | int VocabCompare(const void *a, const void *b) {
112 |   return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
113 | }
114 | 
115 | // Sorts the vocabulary by frequency using word counts
116 | void SortVocab() {
117 |   int a;
118 |   unsigned int hash;
119 |   // Sort the vocabulary and keep </s> at the first position
120 |   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
121 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
122 |   for (a = 0; a < vocab_size; a++) {
123 |     // Words occuring less than min_count times will be discarded from the vocab
124 |     if (vocab[a].cn < min_count) {
125 |       vocab_size--;
126 |       free(vocab[vocab_size].word);
127 |     } else {
128 |       // Hash will be re-computed, as after the sorting it is not actual
129 |       hash = GetWordHash(vocab[a].word);
130 |       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
131 |       vocab_hash[hash] = a;
132 |     }
133 |   }
134 |   vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
135 | }
136 | 
137 | // Reduces the vocabulary by removing infrequent tokens
138 | void ReduceVocab() {
139 |   int a, b = 0;
140 |   unsigned int hash;
141 |   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
142 |     vocab[b].cn = vocab[a].cn;
143 |     vocab[b].word = vocab[a].word;
144 |     b++;
145 |   } else free(vocab[a].word);
146 |   vocab_size = b;
147 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
148 |   for (a = 0; a < vocab_size; a++) {
149 |     // Hash will be re-computed, as it is not actual
150 |     hash = GetWordHash(vocab[a].word);
151 |     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
152 |     vocab_hash[hash] = a;
153 |   }
154 |   fflush(stdout);
155 |   min_reduce++;
156 | }
157 | 
158 | void LearnVocabFromTrainFile() {
159 |   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
160 |   FILE *fin;
161 |   long long a, i, start = 1;
162 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
163 |   fin = fopen(train_file, "rb");
164 |   if (fin == NULL) {
165 |     printf("ERROR: training data file not found!\n");
166 |     exit(1);
167 |   }
168 |   vocab_size = 0;
169 |   AddWordToVocab((char *)"</s>");
170 |   while (1) {
171 |     ReadWord(word, fin);
172 |     if (feof(fin)) break;
173 |     if (!strcmp(word, "</s>")) {
174 |       start = 1;
175 |       continue;
176 |     } else start = 0;
177 |     train_words++;
178 |     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
179 |       printf("Words processed: %lldK     Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
180 |       fflush(stdout);
181 |     }
182 |     i = SearchVocab(word);
183 |     if (i == -1) {
184 |       a = AddWordToVocab(word);
185 |       vocab[a].cn = 1;
186 |     } else vocab[i].cn++;
187 |     if (start) continue;
188 |     sprintf(bigram_word, "%s_%s", last_word, word);
189 |     bigram_word[MAX_STRING - 1] = 0;
190 |     strcpy(last_word, word);
191 |     i = SearchVocab(bigram_word);
192 |     if (i == -1) {
193 |       a = AddWordToVocab(bigram_word);
194 |       vocab[a].cn = 1;
195 |     } else vocab[i].cn++;
196 |     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
197 |   }
198 |   SortVocab();
199 |   if (debug_mode > 0) {
200 |     printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
201 |     printf("Words in train file: %lld\n", train_words);
202 |   }
203 |   fclose(fin);
204 | }
205 | 
206 | void TrainModel() {
207 |   long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
208 |   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
209 |   real score;
210 |   FILE *fo, *fin;
211 |   printf("Starting training using file %s\n", train_file);
212 |   LearnVocabFromTrainFile();
213 |   fin = fopen(train_file, "rb");
214 |   fo = fopen(output_file, "wb");
215 |   word[0] = 0;
216 |   while (1) {
217 |     strcpy(last_word, word);
218 |     ReadWord(word, fin);
219 |     if (feof(fin)) break;
220 |     if (!strcmp(word, "</s>")) {
221 |       fprintf(fo, "\n");
222 |       continue;
223 |     }
224 |     cn++;
225 |     if ((debug_mode > 1) && (cn % 100000 == 0)) {
226 |       printf("Words written: %lldK%c", cn / 1000, 13);
227 |       fflush(stdout);
228 |     }
229 |     oov = 0;
230 |     i = SearchVocab(word);
231 |     if (i == -1) oov = 1; else pb = vocab[i].cn;
232 |     if (li == -1) oov = 1;
233 |     li = i;
234 |     sprintf(bigram_word, "%s_%s", last_word, word);
235 |     bigram_word[MAX_STRING - 1] = 0;
236 |     i = SearchVocab(bigram_word);
237 |     if (i == -1) oov = 1; else pab = vocab[i].cn;
238 |     if (pa < min_count) oov = 1;
239 |     if (pb < min_count) oov = 1;
240 |     if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
241 |     if (score > threshold) {
242 |       fprintf(fo, "_%s", word);
243 |       pb = 0;
244 |     } else fprintf(fo, " %s", word);
245 |     pa = pb;
246 |   }
247 |   fclose(fo);
248 |   fclose(fin);
249 | }
250 | 
251 | int ArgPos(char *str, int argc, char **argv) {
252 |   int a;
253 |   for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
254 |     if (a == argc - 1) {
255 |       printf("Argument missing for %s\n", str);
256 |       exit(1);
257 |     }
258 |     return a;
259 |   }
260 |   return -1;
261 | }
262 | 
263 | int main(int argc, char **argv) {
264 |   int i;
265 |   if (argc == 1) {
266 |     printf("WORD2PHRASE tool v0.1a\n\n");
267 |     printf("Options:\n");
268 |     printf("Parameters for training:\n");
269 |     printf("\t-train <file>\n");
270 |     printf("\t\tUse text data from <file> to train the model\n");
271 |     printf("\t-output <file>\n");
272 |     printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
273 |     printf("\t-min-count <int>\n");
274 |     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
275 |     printf("\t-threshold <float>\n");
276 |     printf("\t\tThe <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n");
277 |     printf("\t-debug <int>\n");
278 |     printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
279 |     printf("\nExamples:\n");
280 |     printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
281 |     return 0;
282 |   }
283 |   if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
284 |   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
285 |   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
286 |   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
287 |   if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
288 |   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
289 |   vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
290 |   TrainModel();
291 |   return 0;
292 | }
293 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | 
2 |                                  Apache License
3 |                            Version 2.0, January 2004
4 |                         http://www.apache.org/licenses/
5 | 
6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 |    1. Definitions.
9 | 
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /word2vec-c/word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <pthread.h> 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | FILE *fin; 38 | 39 | char train_file[MAX_STRING], output_file[MAX_STRING]; 40 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 41 | struct vocab_word *vocab; 42 | int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1; 43 | int *vocab_hash; 44 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 45 | long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0; 46 | real alpha = 0.025, starting_alpha, sample = 0; 47 | real *syn0, *syn1, *syn1neg, *expTable; 48 | clock_t start; 49 | 50 | int hs = 1, negative = 0; 51 | const int table_size = 1e8; 52 | int *table; 53 | 54 | void InitUnigramTable() { 55 | int a, i; 56 | long long train_words_pow = 0; 57 | real d1, power = 0.75; 58 | table = (int *)malloc(table_size * sizeof(int)); 59 | if (table == NULL) { 60 | fprintf(stderr, "cannot allocate memory for the table\n"); 61 | exit(1); 62 | } 63 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 64 | i = 0; 65 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 66 | for (a = 0; a < table_size; a++) { 67 | table[a] = i; 68 | if (a / (real)table_size > d1) { 69 | i++; 70 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 71 | } 72 | if (i >= vocab_size) i = vocab_size - 1; 73 | } 74 | } 75 | 76 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 77 | void ReadWord(char *word, FILE *fin) { 78 | int a = 0, ch; 79 | while (!feof(fin)) { 80 | ch = fgetc(fin); 81 | if (ch == 13) continue; 82 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 83 | if (a > 0) { 84 | if (ch == '\n') ungetc(ch, fin); 85 | break; 86 | } 87 | if (ch == '\n') { 88 | strcpy(word, (char
*)""); 89 | return; 90 | } else continue; 91 | } 92 | word[a] = ch; 93 | a++; 94 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 95 | } 96 | word[a] = 0; 97 | } 98 | 99 | // Returns hash value of a word 100 | int GetWordHash(char *word) { 101 | unsigned long long a, hash = 0; 102 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 103 | hash = hash % vocab_hash_size; 104 | return hash; 105 | } 106 | 107 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 108 | int SearchVocab(char *word) { 109 | unsigned int hash = GetWordHash(word); 110 | while (1) { 111 | if (vocab_hash[hash] == -1) return -1; 112 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 113 | hash = (hash + 1) % vocab_hash_size; 114 | } 115 | return -1; 116 | } 117 | 118 | // Reads a word and returns its index in the vocabulary 119 | int ReadWordIndex(FILE *fin) { 120 | char word[MAX_STRING]; 121 | ReadWord(word, fin); 122 | if (feof(fin)) return -1; 123 | return SearchVocab(word); 124 | } 125 | 126 | // Adds a word to the vocabulary 127 | int AddWordToVocab(char *word) { 128 | unsigned int hash, length = strlen(word) + 1; 129 | if (length > MAX_STRING) length = MAX_STRING; 130 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 131 | strcpy(vocab[vocab_size].word, word); 132 | vocab[vocab_size].cn = 0; 133 | vocab_size++; 134 | // Reallocate memory if needed 135 | if (vocab_size + 2 >= vocab_max_size) { 136 | vocab_max_size += 1000; 137 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 138 | } 139 | hash = GetWordHash(word); 140 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 141 | vocab_hash[hash] = vocab_size - 1; 142 | return vocab_size - 1; 143 | } 144 | 145 | // Used later for sorting by word counts 146 | int VocabCompare(const void *a, const void *b) { 147 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 148 | } 149 | 150 | void DestroyVocab() { 151 | int a; 152 | 153 | for (a = 0; a < vocab_size; a++) { 154 | if (vocab[a].word != NULL) { 155 | free(vocab[a].word); 156 | } 157 | if (vocab[a].code != NULL) { 158 | free(vocab[a].code); 159 | } 160 | if (vocab[a].point != NULL) { 161 | free(vocab[a].point); 162 | } 163 | } 164 | free(vocab[vocab_size].word); 165 | free(vocab); 166 | } 167 | 168 | // Sorts the vocabulary by frequency using word counts 169 | void SortVocab() { 170 | int a, size; 171 | unsigned int hash; 172 | // Sort the vocabulary and keep at the first position 173 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 174 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 175 | size = vocab_size; 176 | train_words = 0; 177 | for (a = 1; a < size; a++) { // Skip 178 | // Words occuring less than min_count times will be discarded from the vocab 179 | if (vocab[a].cn < min_count) { 180 | vocab_size--; 181 | free(vocab[a].word); 182 | vocab[a].word = NULL; 183 | } else { 184 | // Hash will be re-computed, as after the sorting it is not actual 185 | hash=GetWordHash(vocab[a].word); 186 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 187 | vocab_hash[hash] = a; 188 | train_words += vocab[a].cn; 189 | } 190 | } 191 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 192 | // Allocate memory for the binary tree construction 193 | for (a = 0; a < vocab_size; a++) { 194 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 195 | 
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 196 | } 197 | } 198 | 199 | // Reduces the vocabulary by removing infrequent tokens 200 | void ReduceVocab() { 201 | int a, b = 0; 202 | unsigned int hash; 203 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 204 | vocab[b].cn = vocab[a].cn; 205 | vocab[b].word = vocab[a].word; 206 | b++; 207 | } else free(vocab[a].word); 208 | vocab_size = b; 209 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 210 | for (a = 0; a < vocab_size; a++) { 211 | // Hash will be re-computed, as it is no longer valid 212 | hash = GetWordHash(vocab[a].word); 213 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 214 | vocab_hash[hash] = a; 215 | } 216 | fflush(stdout); 217 | min_reduce++; 218 | } 219 | 220 | // Create binary Huffman tree using the word counts 221 | // Frequent words will have short unique binary codes 222 | void CreateBinaryTree() { 223 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 224 | char code[MAX_CODE_LENGTH]; 225 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 226 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 227 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 228 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 229 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 230 | pos1 = vocab_size - 1; 231 | pos2 = vocab_size; 232 | // Following algorithm constructs the Huffman tree by adding one node at a time 233 | for (a = 0; a < vocab_size - 1; a++) { 234 | // First, find two smallest nodes 'min1, min2' 235 | if (pos1 >= 0) { 236 | if (count[pos1] < count[pos2]) { 237 | min1i = pos1; 238 | pos1--; 239 | } else { 240 | min1i = pos2; 241 | pos2++; 242 | } 243 | } else { 244 | min1i = pos2; 245 | pos2++; 246 | } 247 | if (pos1 >= 0) { 248 | if (count[pos1] < count[pos2]) { 249 | min2i = pos1; 250 | pos1--; 251 | } else { 252 | min2i = pos2; 253 | pos2++; 254 | } 255 | } else { 256 | min2i = pos2; 257 | pos2++; 258 | } 259 | count[vocab_size + a] = count[min1i] + count[min2i]; 260 | parent_node[min1i] = vocab_size + a; 261 | parent_node[min2i] = vocab_size + a; 262 | binary[min2i] = 1; 263 | } 264 | // Now assign binary code to each vocabulary word 265 | for (a = 0; a < vocab_size; a++) { 266 | b = a; 267 | i = 0; 268 | while (1) { 269 | code[i] = binary[b]; 270 | point[i] = b; 271 | i++; 272 | b = parent_node[b]; 273 | if (b == vocab_size * 2 - 2) break; 274 | } 275 | vocab[a].codelen = i; 276 | vocab[a].point[0] = vocab_size - 2; 277 | for (b = 0; b < i; b++) { 278 | vocab[a].code[i - b - 1] = code[b]; 279 | vocab[a].point[i - b] = point[b] - vocab_size; 280 | } 281 | } 282 | free(count); 283 | free(binary); 284 | free(parent_node); 285 | } 286 | 287 | void LearnVocabFromTrainFile() { 288 | char word[MAX_STRING]; 289 | //FILE *fin; 290 | long long a, i; 291 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 292 | //fin = fopen(train_file, "rb"); 293 | if (fin == NULL) { 294 | printf("ERROR: training data file not found!\n"); 295 | exit(1); 296 | } 297 | rewind(fin); // check the handle before rewinding; rewind(NULL) would crash 298 | vocab_size = 0; 299 | AddWordToVocab((char *)"</s>"); 300 | while (1) { 301 | ReadWord(word, fin); 302 | if (feof(fin)) break; 303 | train_words++; 304 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 305 | printf("%lldK%c", train_words / 1000, 13); 306 | fflush(stdout); 307 | } 308 | i = SearchVocab(word); 309 | if (i == -1) { 310 | a = AddWordToVocab(word);
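// first occurrence of this word: the new entry's count is initialized to 1 just below; repeat occurrences only increment vocab[i].cn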
311 | vocab[a].cn = 1; 312 | } else vocab[i].cn++; 313 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 314 | } 315 | SortVocab(); 316 | if (debug_mode > 0) { 317 | printf("Vocab size: %lld\n", vocab_size); 318 | printf("Words in train file: %lld\n", train_words); 319 | } 320 | file_size = ftell(fin); 321 | //fclose(fin); 322 | } 323 | 324 | void SaveVocab() { 325 | long long i; 326 | FILE *fo = fopen(save_vocab_file, "wb"); 327 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 328 | fclose(fo); 329 | } 330 | 331 | void ReadVocab() { 332 | long long a, i = 0; 333 | char c; 334 | char word[MAX_STRING]; 335 | FILE *fvb = fopen(read_vocab_file, "rb"); 336 | if (fvb == NULL) { 337 | printf("Vocabulary file not found\n"); 338 | exit(1); 339 | } 340 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 341 | vocab_size = 0; 342 | while (1) { 343 | ReadWord(word, fvb); 344 | if (feof(fvb)) break; 345 | a = AddWordToVocab(word); 346 | fscanf(fvb, "%lld%c", &vocab[a].cn, &c); 347 | i++; 348 | } 349 | SortVocab(); 350 | if (debug_mode > 0) { 351 | printf("Vocab size: %lld\n", vocab_size); 352 | printf("Words in train file: %lld\n", train_words); 353 | } 354 | //fin = fopen(train_file, "rb"); 355 | if (fin == NULL) { 356 | printf("ERROR: training data file not found!\n"); 357 | exit(1); 358 | } 359 | rewind(fin); // check the handle before rewinding; rewind(NULL) would crash 360 | fseek(fin, 0, SEEK_END); 361 | file_size = ftell(fin); 362 | //fclose(fin); 363 | } 364 | 365 | void InitNet() { 366 | long long a, b; 367 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 368 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 369 | if (hs) { 370 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 371 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 372 | for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) 373 | syn1[a * layer1_size + b] = 0; 374 | } 375 | if (negative > 0) { 376 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 377 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 378 | for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) 379 | syn1neg[a * layer1_size + b] = 0; 380 | } 381 | for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) 382 | syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size; 383 | CreateBinaryTree(); 384 | } 385 | 386 | void DestroyNet() { 387 | if (syn0 != NULL) { 388 | free(syn0); 389 | } 390 | if (syn1 != NULL) { 391 | free(syn1); 392 | } 393 | if (syn1neg != NULL) { 394 | free(syn1neg); 395 | } 396 | } 397 | 398 | void *TrainModelThread(void *id) { 399 | long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0; 400 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 401 | long long l1, l2, c, target, label; 402 | unsigned long long next_random = (long long)id; 403 | real f, g; 404 | clock_t now; 405 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 406 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 407 | FILE *fi = fin; //fopen(train_file, "rb"); NOTE: all threads share this one handle, which is only safe with -threads 1 408 | if (fi == NULL) { 409 | fprintf(stderr, "no such file or directory: %s\n", train_file); 410 | exit(1); 411 | } 412 | rewind(fi); 413 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 414 | while (1) { 415 | if (word_count - last_word_count > 10000) { 416 | word_count_actual += word_count -
last_word_count; 417 | last_word_count = word_count; 418 | if ((debug_mode > 1)) { 419 | now=clock(); 420 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 421 | word_count_actual / (real)(train_words + 1) * 100, 422 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 423 | fflush(stdout); 424 | } 425 | alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1)); 426 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 427 | } 428 | if (sentence_length == 0) { 429 | while (1) { 430 | word = ReadWordIndex(fi); 431 | if (feof(fi)) break; 432 | if (word == -1) continue; 433 | word_count++; 434 | if (word == 0) break; 435 | // The subsampling randomly discards frequent words while keeping the ranking same 436 | if (sample > 0) { 437 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 438 | next_random = next_random * (unsigned long long)25214903917 + 11; 439 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 440 | } 441 | sen[sentence_length] = word; 442 | sentence_length++; 443 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 444 | } 445 | sentence_position = 0; 446 | } 447 | if (feof(fi)) break; 448 | if (word_count > train_words / num_threads) break; 449 | word = sen[sentence_position]; 450 | if (word == -1) continue; 451 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 452 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 453 | next_random = next_random * (unsigned long long)25214903917 + 11; 454 | b = next_random % window; 455 | if (cbow) { //train the cbow architecture 456 | // in -> hidden 457 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 458 | c = sentence_position - window + a; 459 | if (c < 0) continue; 460 | if (c >= sentence_length) continue; 461 | last_word = sen[c]; 462 | if (last_word == -1) continue; 463 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 464 | } 465 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 466 | f = 0; 467 | l2 = vocab[word].point[d] * layer1_size; 468 | // Propagate hidden -> output 469 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 470 | if (f <= -MAX_EXP) continue; 471 | else if (f >= MAX_EXP) continue; 472 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 473 | // 'g' is the gradient multiplied by the learning rate 474 | g = (1 - vocab[word].code[d] - f) * alpha; 475 | // Propagate errors output -> hidden 476 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 477 | // Learn weights hidden -> output 478 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 479 | } 480 | // NEGATIVE SAMPLING 481 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 482 | if (d == 0) { 483 | target = word; 484 | label = 1; 485 | } else { 486 | next_random = next_random * (unsigned long long)25214903917 + 11; 487 | target = table[(next_random >> 16) % table_size]; 488 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 489 | if (target == word) continue; 490 | label = 0; 491 | } 492 | l2 = target * layer1_size; 493 | f = 0; 494 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 495 | if (f > MAX_EXP) g = (label - 1) * alpha; 496 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 497 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 498 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 499 | for (c = 0; 
c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 500 | } 501 | // hidden -> in 502 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 503 | c = sentence_position - window + a; 504 | if (c < 0) continue; 505 | if (c >= sentence_length) continue; 506 | last_word = sen[c]; 507 | if (last_word == -1) continue; 508 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 509 | } 510 | } else { //train skip-gram 511 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 512 | c = sentence_position - window + a; 513 | if (c < 0) continue; 514 | if (c >= sentence_length) continue; 515 | last_word = sen[c]; 516 | if (last_word == -1) continue; 517 | l1 = last_word * layer1_size; 518 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 519 | // HIERARCHICAL SOFTMAX 520 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 521 | f = 0; 522 | l2 = vocab[word].point[d] * layer1_size; 523 | // Propagate hidden -> output 524 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 525 | if (f <= -MAX_EXP) continue; 526 | else if (f >= MAX_EXP) continue; 527 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 528 | // 'g' is the gradient multiplied by the learning rate 529 | g = (1 - vocab[word].code[d] - f) * alpha; 530 | // Propagate errors output -> hidden 531 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 532 | // Learn weights hidden -> output 533 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 534 | } 535 | // NEGATIVE SAMPLING 536 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 537 | if (d == 0) { 538 | target = word; 539 | label = 1; 540 | } else { 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | target = table[(next_random >> 16) % table_size]; 543 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 544 | if (target == word) continue; 545 | label = 0; 546 | } 547 | l2 = target * layer1_size; 548 | f = 0; 549 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 550 | if (f > MAX_EXP) g = (label - 1) * alpha; 551 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 552 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 553 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 554 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 555 | } 556 | // Learn weights input -> hidden 557 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 558 | } 559 | } 560 | sentence_position++; 561 | if (sentence_position >= sentence_length) { 562 | sentence_length = 0; 563 | continue; 564 | } 565 | } 566 | //fclose(fi); 567 | free(neu1); 568 | free(neu1e); 569 | pthread_exit(NULL); 570 | } 571 | 572 | void TrainModel() { 573 | long a, b, c, d; 574 | FILE *fo; 575 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 576 | if (pt == NULL) { 577 | fprintf(stderr, "cannot allocate memory for threads\n"); 578 | exit(1); 579 | } 580 | printf("Starting training using file %s\n", train_file); 581 | starting_alpha = alpha; 582 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 583 | if (save_vocab_file[0] != 0) SaveVocab(); 584 | if (output_file[0] == 0) return; 585 | InitNet(); 586 | if (negative > 0) InitUnigramTable(); 587 | start = clock(); 588 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 589 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 
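// All training threads have finished. Next, the results are written out: with -classes 0 the word vectors themselves are saved (text or binary, depending on the -binary flag); otherwise K-means is run on the vectors and only each word's cluster id is written.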
590 | fo = fopen(output_file, "wb"); 591 | if (fo == NULL) { 592 | fprintf(stderr, "Cannot open %s for writing\n", output_file); 593 | exit(1); 594 | } 595 | if (classes == 0) { 596 | // Save the word vectors 597 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 598 | for (a = 0; a < vocab_size; a++) { 599 | if (vocab[a].word != NULL) { 600 | fprintf(fo, "%s ", vocab[a].word); 601 | } 602 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 603 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 604 | fprintf(fo, "\n"); 605 | } 606 | } else { 607 | // Run K-means on the word vectors 608 | int clcn = classes, iter = 10, closeid; 609 | int *centcn = (int *)malloc(classes * sizeof(int)); 610 | if (centcn == NULL) { 611 | fprintf(stderr, "cannot allocate memory for centcn\n"); 612 | exit(1); 613 | } 614 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 615 | real closev, x; 616 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 617 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 618 | for (a = 0; a < iter; a++) { 619 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 620 | for (b = 0; b < clcn; b++) centcn[b] = 1; 621 | for (c = 0; c < vocab_size; c++) { 622 | for (d = 0; d < layer1_size; d++) { 623 | cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 624 | } 625 | centcn[cl[c]]++; // count each word once per pass, not once per dimension 626 | } 627 | for (b = 0; b < clcn; b++) { 628 | closev = 0; 629 | for (c = 0; c < layer1_size; c++) { 630 | cent[layer1_size * b + c] /= centcn[b]; 631 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 632 | } 633 | closev = sqrt(closev); 634 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 635 | } 636 | for (c = 0; c < vocab_size; c++) { 637 | closev = -10; 638 | closeid = 0; 639 | for (d = 0; d < clcn; d++) { 640 | x = 0; 641 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 642 | if (x > closev) { 643 | closev = x; 644 | closeid = d; 645 | } 646 | } 647 | cl[c] = closeid; 648 | } 649 | } 650 | // Save the K-means classes 651 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 652 | free(centcn); 653 | free(cent); 654 | free(cl); 655 | } 656 | fclose(fo); 657 | free(table); 658 | free(pt); 659 | DestroyVocab(); 660 | } 661 | 662 | int ArgPos(char *str, int argc, char **argv) { 663 | int a; 664 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 665 | if (a == argc - 1) { 666 | printf("Argument missing for %s\n", str); 667 | exit(1); 668 | } 669 | return a; 670 | } 671 | return -1; 672 | } 673 | 674 | int main(int argc, char **argv) { 675 | int i; 676 | if (argc == 1) { 677 | printf("WORD VECTOR estimation toolkit v 0.1b\n\n"); 678 | printf("Options:\n"); 679 | printf("Parameters for training:\n"); 680 | printf("\t-train <file>\n"); 681 | printf("\t\tUse text data from <file> to train the model\n"); 682 | printf("\t-output <file>\n"); 683 | printf("\t\tUse <file> to save the resulting word vectors / word clusters\n"); 684 | printf("\t-size <int>\n"); 685 | printf("\t\tSet size of word vectors; default is 100\n"); 686 | printf("\t-window <int>\n"); 687 | printf("\t\tSet max skip length between words; default is 5\n"); 688 | printf("\t-sample <float>\n"); 689 | printf("\t\tSet threshold for occurrence of words.
Those that appear with higher frequency"); 690 | printf(" in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5\n"); 691 | printf("\t-hs <int>\n"); 692 | printf("\t\tUse Hierarchical Softmax; default is 1 (0 = not used)\n"); 693 | printf("\t-negative <int>\n"); 694 | printf("\t\tNumber of negative examples; default is 0, common values are 5 - 10 (0 = not used)\n"); 695 | printf("\t-threads <int>\n"); 696 | printf("\t\tUse <int> threads (default 1)\n"); 697 | printf("\t-min-count <int>\n"); 698 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n"); 699 | printf("\t-alpha <float>\n"); 700 | printf("\t\tSet the starting learning rate; default is 0.025\n"); 701 | printf("\t-classes <int>\n"); 702 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 703 | printf("\t-debug <int>\n"); 704 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 705 | printf("\t-binary <int>\n"); 706 | printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n"); 707 | printf("\t-save-vocab <file>\n"); 708 | printf("\t\tThe vocabulary will be saved to <file>\n"); 709 | printf("\t-read-vocab <file>\n"); 710 | printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n"); 711 | printf("\t-cbow <int>\n"); 712 | printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n"); 713 | printf("\nExamples:\n"); 714 | printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n"); 715 | return 0; 716 | } 717 | output_file[0] = 0; 718 | save_vocab_file[0] = 0; 719 | read_vocab_file[0] = 0; 720 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 721 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 722 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 723 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 724 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 725 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 726 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 727 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 728 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 729 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 730 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 731 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 732 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 733 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 734 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 735 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 736 | 737 | fin = fopen(train_file, "rb"); 738 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 739 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 740 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 741 | if (expTable == NULL) { 742 | fprintf(stderr, "out of memory\n"); 743 | exit(1); 744 | } 745 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 746 |
expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute e^x for x evenly spaced in [-MAX_EXP, MAX_EXP] 747 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute the sigmoid f(x) = e^x / (e^x + 1) 748 | } 749 | TrainModel(); 750 | DestroyNet(); 751 | free(vocab_hash); 752 | free(expTable); 753 | return 0; 754 | } 755 | --------------------------------------------------------------------------------
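For reference: the binary output written by word2vec.c above (-binary 1) is a text header "vocab_size layer1_size\n" followed, per word, by the token, one space, layer1_size raw floats (real is typedef'd to float above), and a newline. A minimal standalone reader, sketched for illustration only (not part of the repository; "vectors.bin" is just an illustrative default name):

/* sketch: read back the -binary 1 output format described above */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  long long words, size, i;
  char word[100];
  float *vec;
  FILE *f = fopen(argc > 1 ? argv[1] : "vectors.bin", "rb");
  if (f == NULL) { fprintf(stderr, "cannot open input file\n"); return 1; }
  if (fscanf(f, "%lld %lld", &words, &size) != 2) { fclose(f); return 1; }  // header: vocab size, vector size
  vec = (float *)malloc(size * sizeof(float));
  for (i = 0; i < words; i++) {
    if (fscanf(f, "%99s", word) != 1) break;  // the token; leading whitespace (previous newline) is skipped
    fgetc(f);                                 // consume the single space after the token
    if (fread(vec, sizeof(float), size, f) != (size_t)size) break;  // raw float components
    printf("%s %f ...\n", word, vec[0]);      // e.g. show the first component of each vector
  }
  free(vec);
  fclose(f);
  return 0;
}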