├── word2vec ├── tests │ └── __init__.py ├── utils.py ├── __init__.py ├── wordclusters.py ├── io.py ├── scripts_interface.py └── wordvectors.py ├── MANIFEST.in ├── .gitignore ├── word2vec-c ├── demos │ ├── demo-classes.sh │ ├── demo-word.sh │ ├── demo-word-accuracy.sh │ ├── demo-analogy.sh │ ├── demo-phrases.sh │ └── demo-phrase-accuracy.sh ├── makefile ├── distance.c ├── word-analogy.c ├── compute-accuracy.c ├── word2phrase.c └── word2vec.c ├── setup.py ├── README.md ├── examples ├── demo-clusters.ipynb └── path.ipynb └── LICENSE.txt /word2vec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt 2 | include word2vec-c/makefile 3 | recursive-include word2vec-c *.c 4 | prune bin -------------------------------------------------------------------------------- /word2vec/utils.py: -------------------------------------------------------------------------------- 1 | from numpy import linalg as LA 2 | 3 | 4 | def unitvec(vec): 5 | return (1.0 / LA.norm(vec, ord=2)) * vec 6 | -------------------------------------------------------------------------------- /word2vec/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from io import * 3 | from wordvectors import * 4 | from wordclusters import * 5 | from scripts_interface import * 6 | -------------------------------------------------------------------------------- /word2vec/wordclusters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class WordClusters(object): 5 | 6 | def __init__(self, vocab, clusters): 7 | self.vocab = vocab 8 | self.clusters = clusters 9 | 10 | def __getitem__(self, cluster): 11 | return self.vocab[self.clusters == cluster] 12 | 13 | @classmethod 14 | def from_text(cls, fname): 15 | vocab = np.genfromtxt(fname, dtype=np.object, delimiter=' ', usecols=0) 16 | clusters = np.genfromtxt(fname, dtype=int, delimiter=' ', usecols=1) 17 | return cls(vocab=vocab, clusters=clusters) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | MANIFEST 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | 39 | # IPython Notebook 40 | .ipynb_checkpoints 41 | 42 | examples/text8 43 | examples/*.vec 44 | examples/*.clusters 45 | examples/*.gz 46 | examples/*.txt 47 | examples/*.bin 48 | examples/data/* 49 | -------------------------------------------------------------------------------- /word2vec/io.py: -------------------------------------------------------------------------------- 1 | import word2vec 2 | import numpy as np 3 | 4 | 5 | def load(fname, kind='bin', save_memory=True): 6 | ''' 7 | Loads a word vectors file 8 | ''' 9 | if kind == 'bin': 10 | return word2vec.WordVectors.from_binary(fname, save_memory=save_memory) 11 | 
elif kind == 'txt':
12 |         return word2vec.WordVectors.from_text(fname, save_memory=save_memory)
13 |     elif kind == 'mmap':
14 |         return word2vec.WordVectors.from_mmap(fname)
15 |     else:
16 |         raise Exception('Unknown kind')
17 | 
18 | 
19 | def load_clusters(fname):
20 |     '''
21 |     Loads a word cluster file
22 |     '''
23 |     return word2vec.WordClusters.from_text(fname)
24 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-classes.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | SRC_DIR=../src
3 | BIN_DIR=../bin
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | CLASSES_DATA=$DATA_DIR/classes.txt
7 | 
8 | pushd ${SRC_DIR} && make; popd
9 | 
10 | 
11 | if [ ! -e $CLASSES_DATA ]; then
12 | 
13 |   if [ ! -e $TEXT_DATA ]; then
14 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
15 |     gzip -d $DATA_DIR/text8.gz -f
16 |   fi
17 |   echo -----------------------------------------------------------------------------------------------------
18 |   echo -- Training vectors...
19 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $CLASSES_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
20 | 
21 | fi
22 | 
23 | sort $CLASSES_DATA -k 2 -n > $DATA_DIR/classes.sorted.txt
24 | echo The word classes were saved to file $DATA_DIR/classes.sorted.txt
25 | 
--------------------------------------------------------------------------------
/word2vec-c/makefile:
--------------------------------------------------------------------------------
1 | SCRIPTS_DIR=../scripts
2 | BIN_DIR=../bin
3 | 
4 | CC = gcc
5 | # -Ofast might not work with older versions of gcc; -O2 is used here instead
6 | CFLAGS = -lm -pthread -O2 -Wall -funroll-loops
7 | 
8 | all: word2vec word2phrase w2v-distance w2v-word-analogy w2v-compute-accuracy
9 | 
10 | word2vec : word2vec.c
11 | 	$(CC) word2vec.c -o ${BIN_DIR}/word2vec $(CFLAGS)
12 | word2phrase : word2phrase.c
13 | 	$(CC) word2phrase.c -o ${BIN_DIR}/word2phrase $(CFLAGS)
14 | w2v-distance : distance.c
15 | 	$(CC) distance.c -o ${BIN_DIR}/w2v-distance $(CFLAGS)
16 | w2v-word-analogy : word-analogy.c
17 | 	$(CC) word-analogy.c -o ${BIN_DIR}/w2v-word-analogy $(CFLAGS)
18 | w2v-compute-accuracy : compute-accuracy.c
19 | 	$(CC) compute-accuracy.c -o ${BIN_DIR}/w2v-compute-accuracy $(CFLAGS)
20 | 
21 | clean:
22 | 	cd ${BIN_DIR} && rm -f word2vec word2phrase w2v-distance w2v-word-analogy w2v-compute-accuracy
23 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-word.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | VECTOR_DATA=$DATA_DIR/text8-vector.bin
7 | 
8 | # To train on a custom corpus instead, uncomment and point these at it:
9 | # TEXT_DATA=$DATA_DIR/fashion_blogs.txt; VECTOR_DATA=$DATA_DIR/fashion_blogs.bin
10 | 
11 | pushd ${SRC_DIR} && make; popd
12 | 
13 | if [ ! -e $VECTOR_DATA ]; then
14 | 
15 |   if [ ! -e $TEXT_DATA ]; then
16 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
17 |     gzip -d $DATA_DIR/text8.gz -f
18 |   fi
19 |   echo -----------------------------------------------------------------------------------------------------
20 |   echo -- Training vectors...
21 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
22 | 
23 | fi
24 | 
25 | echo -----------------------------------------------------------------------------------------------------
26 | echo -- distance...
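# w2v-distance is interactive: it reads a word or sentence from stdin and
# prints the 40 nearest neighbors by cosine similarity (type EXIT to quit).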
27 | 
28 | $BIN_DIR/w2v-distance $VECTOR_DATA
29 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | from distutils.core import setup
4 | 
5 | '''
6 | To update to a new version:
7 |     1. change version
8 |     2. python setup.py sdist upload
9 | '''
10 | 
11 | DESCRIPTION = 'Google word2vec python wrapper'
12 | 
13 | directory = 'bin'
14 | if not os.path.exists(directory):
15 |     os.makedirs(directory)
16 | 
17 | subprocess.call(['make', '-C', 'word2vec-c'])
18 | 
19 | setup(
20 |     name='word2vec',
21 |     version='0.5.1',
22 |     maintainer='Daniel Rodriguez',
23 |     maintainer_email='df.rodriguez143@gmail.com',
24 |     url='https://github.com/danielfrg/word2vec',
25 |     packages=['word2vec'],
26 |     description=DESCRIPTION,
27 |     license='Apache License Version 2.0, January 2004',
28 |     data_files=[('bin', ['bin/word2vec', 'bin/word2phrase', 'bin/w2v-distance',
29 |                          'bin/w2v-word-analogy', 'bin/w2v-compute-accuracy'])],
30 |     install_requires=[
31 |         'numpy>=1.7.1'
32 |     ],
33 | )
34 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-word-accuracy.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | VECTOR_DATA=$DATA_DIR/text8-vector.bin
7 | 
8 | pushd ${SRC_DIR} && make; popd
9 | 
10 | 
11 | if [ ! -e $VECTOR_DATA ]; then
12 | 
13 |   if [ ! -e $TEXT_DATA ]; then
14 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
15 |     gzip -d $DATA_DIR/text8.gz -f
16 |   fi
17 |   echo -----------------------------------------------------------------------------------------------------
18 |   echo -- Training vectors...
19 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
20 | 
21 | fi
22 | 
23 | echo -----------------------------------------------------------------------------------------------------
24 | echo -- accuracy...
25 | 
26 | # to compute accuracy with the full vocabulary, use: w2v-compute-accuracy $VECTOR_DATA < $DATA_DIR/questions-words.txt
27 | set -x
28 | $BIN_DIR/w2v-compute-accuracy $VECTOR_DATA 30000 < $DATA_DIR/questions-words.txt
29 | 
30 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-analogy.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | VECTOR_DATA=$DATA_DIR/text8-vector.bin
7 | 
8 | pushd ${SRC_DIR} && make; popd
9 | 
10 | echo -----------------------------------------------------------------------------------------------------
11 | echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
12 | echo Example input: paris france berlin
13 | echo -----------------------------------------------------------------------------------------------------
14 | 
15 | if [ ! -e $VECTOR_DATA ]; then
16 | 
17 |   if [ ! -e $TEXT_DATA ]; then
18 |     wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
19 |     gzip -d $DATA_DIR/text8.gz -f
20 |   fi
21 |   echo -----------------------------------------------------------------------------------------------------
22 |   echo -- Training vectors...
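# A quick gloss of the training flags used below: -cbow 0 selects the skip-gram
# model, -size 200 is the vector dimensionality, -window 5 the context window,
# -hs 1 enables hierarchical softmax (-negative 0 turns negative sampling off),
# -sample 1e-3 is the subsampling threshold for frequent words, and -binary 1
# writes the vectors in the binary format the query tools expect.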
23 |   time $BIN_DIR/word2vec -train $TEXT_DATA -output $VECTOR_DATA -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
24 | 
25 | fi
26 | 
27 | echo -----------------------------------------------------------------------------------------------------
28 | echo -- analogy...
29 | 
30 | $BIN_DIR/w2v-word-analogy $VECTOR_DATA
31 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-phrases.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | PHRASES_DATA=$DATA_DIR/text8-phrases
7 | PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin
8 | 
9 | pushd ${SRC_DIR} && make; popd
10 | 
11 | if [ ! -e $PHRASES_VECTOR_DATA ]; then
12 | 
13 |   if [ ! -e $PHRASES_DATA ]; then
14 | 
15 |     if [ ! -e $TEXT_DATA ]; then
16 |       wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
17 |       gzip -d $DATA_DIR/text8.gz -f
18 |     fi
19 |     echo -----------------------------------------------------------------------------------------------------
20 |     echo -- Creating phrases...
21 |     time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2
22 | 
23 |   fi
24 | 
25 |   echo -----------------------------------------------------------------------------------------------------
26 |   echo -- Training vectors from phrases...
27 |   time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
28 | 
29 | fi
30 | 
31 | echo -----------------------------------------------------------------------------------------------------
32 | echo -- distance...
33 | 
34 | $BIN_DIR/w2v-distance $PHRASES_VECTOR_DATA
35 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | word2vec
2 | ========
3 | 
4 | Python interface to Google word2vec.
5 | 
6 | Training is done using the original C code plus some patches; the rest of the functionality is pure Python + numpy.
7 | 
8 | ## Installation
9 | 
10 | `pip install word2vec`
11 | 
12 | I highly recommend using the [Anaconda python distribution](http://continuum.io/downloads)
13 | 
14 | ## Usage
15 | 
16 | The easiest way to get started is to look at these examples:
17 | [word2vec](http://nbviewer.ipython.org/urls/raw.github.com/danielfrg/word2vec/master/examples/demo-word.ipynb)
18 | and
19 | [word clusters](http://nbviewer.ipython.org/urls/raw.github.com/danielfrg/word2vec/master/examples/demo-clusters.ipynb)
20 | 
21 | The default functionality from word2vec is also available from the command line:
22 | - word2vec
23 | - word2phrase
24 | - w2v-distance
25 | - w2v-word-analogy
26 | - w2v-compute-accuracy
27 | 
28 | ## Issues
29 | 
30 | Some people have reported that on OS X they needed the fix described here:
31 | [http://stackoverflow.com/questions/15590169/ld-library-not-found-for-lgfortran-mac-symlink-issue](http://stackoverflow.com/questions/15590169/ld-library-not-found-for-lgfortran-mac-symlink-issue)
32 | 
33 | ## Development
34 | 
35 | 1. create a `bin` directory
36 | 2. run the make file inside `word2vec-c`
37 | 3. 
add the new `bin` directory to your `PATH`
38 | 
39 | Tests require `py.test`
40 | 
--------------------------------------------------------------------------------
/word2vec-c/demos/demo-phrase-accuracy.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR=../data
2 | BIN_DIR=../bin
3 | SRC_DIR=../src
4 | 
5 | TEXT_DATA=$DATA_DIR/text8
6 | PHRASES_DATA=$DATA_DIR/text8-phrases
7 | PHRASES_VECTOR_DATA=$DATA_DIR/vectors-phrase.bin
8 | 
9 | echo ----------------------------------------------------------------------------------------------------------------
10 | echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
11 | echo To achieve better accuracy, a larger training set is needed
12 | echo ----------------------------------------------------------------------------------------------------------------
13 | 
14 | pushd ${SRC_DIR} && make; popd
15 | 
16 | if [ ! -e $PHRASES_VECTOR_DATA ]; then
17 | 
18 |   if [ ! -e $PHRASES_DATA ]; then
19 | 
20 |     if [ ! -e $TEXT_DATA ]; then
21 |       wget http://mattmahoney.net/dc/text8.zip -O $DATA_DIR/text8.gz
22 |       gzip -d $DATA_DIR/text8.gz -f
23 |     fi
24 |     echo -----------------------------------------------------------------------------------------------------
25 |     echo -- Creating phrases...
26 |     time $BIN_DIR/word2phrase -train $DATA_DIR/text8 -output $PHRASES_DATA -threshold 500 -debug 2 -min-count 3
27 | 
28 |   fi
29 | 
30 |   echo -----------------------------------------------------------------------------------------------------
31 |   echo -- Training vectors from phrases...
32 |   time $BIN_DIR/word2vec -train $PHRASES_DATA -output $PHRASES_VECTOR_DATA -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
33 | 
34 | fi
35 | 
36 | echo -----------------------------------------------------------------------------------------------------
37 | echo -- accuracy...
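# questions-phrases.txt is the phrase analogy test set shipped with the original
# Google word2vec distribution; it is assumed to already be present in $DATA_DIR.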
38 | 
39 | $BIN_DIR/w2v-compute-accuracy $PHRASES_VECTOR_DATA < $DATA_DIR/questions-phrases.txt
40 | 
--------------------------------------------------------------------------------
/word2vec/scripts_interface.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import subprocess
4 | 
5 | realpath = os.path.dirname(os.path.realpath(__file__))
6 | datadir = "../bin"
7 | 
8 | def word2vec(train, output, size=100, window=5, sample=0, hs=1, negative=0, threads=4,
9 |              min_count=5, alpha=0.025, debug=2, binary=1, cbow=0,
10 |              save_vocab=None, read_vocab=None, verbose=False):
11 |     process = [os.path.join(realpath, datadir, 'word2vec')]
12 |     args = ['-train', '-output', '-size', '-window', '-sample', '-hs', '-negative', '-threads',
13 |             '-min-count', '-alpha', '-debug', '-binary', '-cbow']
14 |     values = [train, output, size, window, sample, hs, negative, threads,
15 |               min_count, alpha, debug, binary, cbow]
16 |     for arg, value in zip(args, values):
17 |         process.append(arg)
18 |         process.append(str(value))
19 |     if save_vocab is not None:
20 |         process.append('-save-vocab')
21 |         process.append(str(save_vocab))
22 |     if read_vocab is not None:
23 |         process.append('-read-vocab')
24 |         process.append(str(read_vocab))
25 | 
26 |     proc = subprocess.Popen(process, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
27 | 
28 |     if verbose:
29 |         for line in proc.stdout:
30 |             sys.stdout.write(line)
31 |             sys.stdout.flush()
32 | 
33 |     out, err = proc.communicate()
34 |     if 'ERROR:' in out:
35 |         raise Exception(out)
36 | 
37 | 
38 | def word2clusters(train, output, classes, size=100, window=5, sample=0, hs=1, negative=0, threads=4,
39 |                   min_count=5, alpha=0.025, debug=2, binary=0, cbow=0,
40 |                   save_vocab=None, read_vocab=None, verbose=False):
41 |     process = [os.path.join(realpath, datadir, 'word2vec')]
42 |     args = ['-train', '-output', '-size', '-window', '-sample', '-hs', '-negative', '-threads',
43 |             '-min-count', '-alpha', '-classes', '-debug', '-binary', '-cbow']
44 |     values = [train, output, size, window, sample, hs, negative, threads,
45 |               min_count, alpha, classes, debug, binary, cbow]
46 |     for arg, value in zip(args, values):
47 |         process.append(arg)
48 |         process.append(str(value))
49 |     if save_vocab is not None:
50 |         process.append('-save-vocab')
51 |         process.append(str(save_vocab))
52 |     if read_vocab is not None:
53 |         process.append('-read-vocab')
54 |         process.append(str(read_vocab))
55 | 
56 |     proc = subprocess.Popen(process, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
57 | 
58 |     if verbose:
59 |         for line in proc.stdout:
60 |             sys.stdout.write(line)
61 |             sys.stdout.flush()
62 | 
63 |     out, err = proc.communicate()
64 |     if 'ERROR:' in out:
65 |         raise Exception(out)
66 | 
67 | 
68 | def word2phrase(train, output, min_count=5, threshold=100, debug=2, verbose=False):
69 |     process = [os.path.join(realpath, datadir, 'word2phrase')]
70 |     args = ['-train', '-output', '-min-count', '-threshold', '-debug']
71 |     values = [train, output, min_count, threshold, debug]
72 |     for arg, value in zip(args, values):
73 |         process.append(arg)
74 |         process.append(str(value))
75 | 
76 |     proc = subprocess.Popen(process, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
77 | 
78 |     if verbose:
79 |         for line in proc.stdout:
80 |             sys.stdout.write(line)
81 |             sys.stdout.flush()
82 | 
83 |     out, err = proc.communicate()
84 |     if 'ERROR:' in out:
85 |         raise Exception(out)
86 | 
--------------------------------------------------------------------------------
/examples/demo-clusters.ipynb:
-------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is equivalent to `demo-classes.sh` from Google." 15 | ] 16 | }, 17 | { 18 | "cell_type": "heading", 19 | "level": 3, 20 | "metadata": {}, 21 | "source": [ 22 | "Grab some data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Download some data, for example: [http://mattmahoney.net/dc/text8.zip](http://mattmahoney.net/dc/text8.zip)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "collapsed": false, 35 | "input": [ 36 | "!wget http://mattmahoney.net/dc/text8.zip -O text8.gz" 37 | ], 38 | "language": "python", 39 | "metadata": {}, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "collapsed": false, 45 | "input": [ 46 | "!gzip -d text8.gz -f" 47 | ], 48 | "language": "python", 49 | "metadata": {}, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "heading", 54 | "level": 3, 55 | "metadata": {}, 56 | "source": [ 57 | "Training" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "import word2vec" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [], 69 | "prompt_number": 1 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Note that this will take a long time depending on the parameters" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "collapsed": false, 81 | "input": [ 82 | "word2vec.word2clusters('data/text8', 'data/text8.clusters', 100)" 83 | ], 84 | "language": "python", 85 | "metadata": {}, 86 | "outputs": [], 87 | "prompt_number": 2 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "That generated a `text8.clusters` cluster containing the word clusters." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "heading", 98 | "level": 3, 99 | "metadata": {}, 100 | "source": [ 101 | "Predictions" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "collapsed": false, 107 | "input": [ 108 | "import word2vec" 109 | ], 110 | "language": "python", 111 | "metadata": {}, 112 | "outputs": [], 113 | "prompt_number": 1 114 | }, 115 | { 116 | "cell_type": "code", 117 | "collapsed": false, 118 | "input": [ 119 | "clusters = word2vec.load_clusters('data/text8.clusters')" 120 | ], 121 | "language": "python", 122 | "metadata": {}, 123 | "outputs": [], 124 | "prompt_number": 2 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "We can see all the words grouped on an specific cluster" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "collapsed": false, 136 | "input": [ 137 | "clusters[20]" 138 | ], 139 | "language": "python", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "metadata": {}, 144 | "output_type": "pyout", 145 | "prompt_number": 3, 146 | "text": [ 147 | "array(['ways', 'manner', 'treated', ..., 'summarise', 'disproof',\n", 148 | " 'discourteous'], dtype=object)" 149 | ] 150 | } 151 | ], 152 | "prompt_number": 3 153 | } 154 | ], 155 | "metadata": {} 156 | } 157 | ] 158 | } -------------------------------------------------------------------------------- /examples/path.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:461b801e463dea39cdf372462e71ac5f6d27f436e2764a8bf7f339c09d059c40" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import word2vec\n", 16 | "import numpy as np\n", 17 | "import networkx as nx" 18 | ], 19 | "language": "python", 20 | "metadata": {}, 21 | "outputs": [], 22 | "prompt_number": 1 23 | }, 24 | { 25 | "cell_type": "code", 26 | "collapsed": false, 27 | "input": [ 28 | "model = word2vec.load('data/text8.bin')" 29 | ], 30 | "language": "python", 31 | "metadata": {}, 32 | "outputs": [], 33 | "prompt_number": 2 34 | }, 35 | { 36 | "cell_type": "code", 37 | "collapsed": false, 38 | "input": [ 39 | "vocab_size = model.vocab.shape[0]" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [], 44 | "prompt_number": 3 45 | }, 46 | { 47 | "cell_type": "code", 48 | "collapsed": false, 49 | "input": [ 50 | "vocab_size" 51 | ], 52 | "language": "python", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "metadata": {}, 57 | "output_type": "pyout", 58 | "prompt_number": 4, 59 | "text": [ 60 | "71291" 61 | ] 62 | } 63 | ], 64 | "prompt_number": 4 65 | }, 66 | { 67 | "cell_type": "code", 68 | "collapsed": false, 69 | "input": [ 70 | "dists = np.zeros((vocab_size, vocab_size))" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [], 75 | "prompt_number": 5 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "n = 10" 82 | ], 83 | "language": "python", 84 | "metadata": {}, 85 | "outputs": [], 86 | "prompt_number": 6 87 | }, 88 | { 89 | "cell_type": "code", 90 | "collapsed": false, 91 | "input": [ 92 | "%%timeit -n1 -r1\n", 93 | "for ix, word in enumerate(model.vocab[:1000]):\n", 94 | " metrics = np.dot(model.l2norm, model.get_vector(word))\n", 95 | " best = np.argsort(metrics)[::-1][1:n + 1]\n", 96 | " dists[ix, best] = metrics[best]" 97 | ], 98 | "language": "python", 99 | "metadata": {}, 100 | 
"outputs": [ 101 | { 102 | "output_type": "stream", 103 | "stream": "stdout", 104 | "text": [ 105 | "1 loops, best of 1: 15.9 s per loop\n" 106 | ] 107 | } 108 | ], 109 | "prompt_number": 7 110 | }, 111 | { 112 | "cell_type": "code", 113 | "collapsed": false, 114 | "input": [ 115 | "model.cosine('dog')" 116 | ], 117 | "language": "python", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "metadata": {}, 122 | "output_type": "pyout", 123 | "prompt_number": 52, 124 | "text": [ 125 | "{'dog': [('catahoula', 0.55066107867304437),\n", 126 | " ('shepherd', 0.54683936460837557),\n", 127 | " ('azawakh', 0.54571415535760059),\n", 128 | " ('haired', 0.53117246665323714),\n", 129 | " ('pyrenean', 0.52469939821586631),\n", 130 | " ('jindo', 0.51431665685657291),\n", 131 | " ('ass', 0.51208124629656004),\n", 132 | " ('mastiff', 0.51106035997299537),\n", 133 | " ('terrier', 0.50580184597076816),\n", 134 | " ('hairless', 0.50463618449995062)]}" 135 | ] 136 | } 137 | ], 138 | "prompt_number": 52 139 | }, 140 | { 141 | "cell_type": "code", 142 | "collapsed": false, 143 | "input": [ 144 | "dists[model.ix('dog'), model.ix('catahoula')]" 145 | ], 146 | "language": "python", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "metadata": {}, 151 | "output_type": "pyout", 152 | "prompt_number": 53, 153 | "text": [ 154 | "0.55066107867304437" 155 | ] 156 | } 157 | ], 158 | "prompt_number": 53 159 | }, 160 | { 161 | "cell_type": "code", 162 | "collapsed": false, 163 | "input": [ 164 | "dists.size" 165 | ], 166 | "language": "python", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "metadata": {}, 171 | "output_type": "pyout", 172 | "prompt_number": 54, 173 | "text": [ 174 | "5082406681" 175 | ] 176 | } 177 | ], 178 | "prompt_number": 54 179 | } 180 | ], 181 | "metadata": {} 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /word2vec-c/distance.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>  // mac os x
19 | 
20 | 
21 | const long long max_size = 2000;         // max length of strings
22 | const long long N = 40;                  // number of closest words that will be shown
23 | const long long max_w = 50;              // max length of vocabulary entries
24 | 
25 | int main(int argc, char **argv) {
26 |   FILE *f;
27 |   char st1[max_size];
28 |   char bestw[N][max_size];
29 |   char file_name[max_size], st[100][max_size];
30 |   float dist, len, bestd[N], vec[max_size];
31 |   long long words, size, a, b, c, d, cn, bi[100];
32 |   char ch;
33 |   float *M;
34 |   char *vocab;
35 |   if (argc < 2) {
36 |     printf("Usage: ./distance <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
37 |     return 0;
38 |   }
39 |   strcpy(file_name, argv[1]);
40 |   f = fopen(file_name, "rb");
41 |   if (f == NULL) {
42 |     printf("Input file not found\n");
43 |     return -1;
44 |   }
45 |   fscanf(f, "%lld", &words);
46 |   fscanf(f, "%lld", &size);
47 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
48 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
49 |   if (M == NULL) {
50 |     printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
51 |     return -1;
52 |   }
53 |   for (b = 0; b < words; b++) {
54 |     fscanf(f, "%s%c", &vocab[b * max_w], &ch);
55 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
56 |     len = 0;
57 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
58 |     len = sqrt(len);
59 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
60 |   }
61 |   fclose(f);
62 |   while (1) {
63 |     for (a = 0; a < N; a++) bestd[a] = 0;
64 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
65 |     printf("Enter word or sentence (EXIT to break): ");
66 |     a = 0;
67 |     while (1) {
68 |       st1[a] = fgetc(stdin);
69 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
70 |         st1[a] = 0;
71 |         break;
72 |       }
73 |       a++;
74 |     }
75 |     if (!strcmp(st1, "EXIT")) break;
76 |     cn = 0;
77 |     b = 0;
78 |     c = 0;
79 |     while (1) {
80 |       st[cn][b] = st1[c];
81 |       b++;
82 |       c++;
83 |       st[cn][b] = 0;
84 |       if (st1[c] == 0) break;
85 |       if (st1[c] == ' ') {
86 |         cn++;
87 |         b = 0;
88 |         c++;
89 |       }
90 |     }
91 |     cn++;
92 |     for (a = 0; a < cn; a++) {
93 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
94 |       if (b == words) b = -1;
95 |       bi[a] = b;
96 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
97 |       if (b == -1) {
98 |         printf("Out of dictionary word!\n");
99 |         break;
100 |       }
101 |     }
102 |     if (b == -1) continue;
103 |     printf("\n                                              Word       Cosine distance\n------------------------------------------------------------------------\n");
104 |     for (a = 0; a < size; a++) vec[a] = 0;
105 |     for (b = 0; b < cn; b++) {
106 |       if (bi[b] == -1) continue;
107 |       for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
108 |     }
109 |     len = 0;
110 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
111 |     len = sqrt(len);
112 |     for (a = 0; a < size; a++) vec[a] /= len;
113 |     for (a = 0; a < N; a++) bestd[a] = 0;
114 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
115 |     for (c = 0; c < words; c++) {
116 |       a = 0;
117 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
118 |       if (a == 1) continue;
119 |       dist = 0;
120 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
121 |       for (a = 0; a < N; a++) {
122 |         if (dist > bestd[a]) {
123 |           for (d = N - 1; d > a; d--) {
124 |             bestd[d] = bestd[d - 1];
125 |             strcpy(bestw[d], bestw[d - 1]);
126 |           }
127 |           bestd[a] = dist;
128 |           strcpy(bestw[a], &vocab[c * max_w]);
129 |           break;
130 |         }
131 |       }
132 |     }
133 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
134 |   }
135 |   return 0;
136 | }
137 | 
--------------------------------------------------------------------------------
/word2vec-c/word-analogy.c:
--------------------------------------------------------------------------------
1 | //  Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | //  Licensed under the Apache License, Version 2.0 (the "License");
4 | //  you may not use this file except in compliance with the License.
5 | //  You may obtain a copy of the License at
6 | //
7 | //      http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //  Unless required by applicable law or agreed to in writing, software
10 | //  distributed under the License is distributed on an "AS IS" BASIS,
11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //  See the License for the specific language governing permissions and
13 | //  limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <string.h>
17 | #include <math.h>
18 | #include <stdlib.h>  // mac os x
19 | 
20 | const long long max_size = 2000;         // max length of strings
21 | const long long N = 40;                  // number of closest words that will be shown
22 | const long long max_w = 50;              // max length of vocabulary entries
23 | 
24 | int main(int argc, char **argv) {
25 |   FILE *f;
26 |   char st1[max_size];
27 |   char bestw[N][max_size];
28 |   char file_name[max_size], st[100][max_size];
29 |   float dist, len, bestd[N], vec[max_size];
30 |   long long words, size, a, b, c, d, cn, bi[100];
31 |   char ch;
32 |   float *M;
33 |   char *vocab;
34 |   if (argc < 2) {
35 |     printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
36 |     return 0;
37 |   }
38 |   strcpy(file_name, argv[1]);
39 |   f = fopen(file_name, "rb");
40 |   if (f == NULL) {
41 |     printf("Input file not found\n");
42 |     return -1;
43 |   }
44 |   fscanf(f, "%lld", &words);
45 |   fscanf(f, "%lld", &size);
46 |   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
47 |   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
48 |   if (M == NULL) {
49 |     printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
50 |     return -1;
51 |   }
52 |   for (b = 0; b < words; b++) {
53 |     fscanf(f, "%s%c", &vocab[b * max_w], &ch);
54 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
55 |     len = 0;
56 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
57 |     len = sqrt(len);
58 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
59 |   }
60 |   fclose(f);
61 |   while (1) {
62 |     for (a = 0; a < N; a++) bestd[a] = 0;
63 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
64 |     printf("Enter three words (EXIT to break): ");
65 |     a = 0;
66 |     while (1) {
67 |       st1[a] = fgetc(stdin);
68 |       if ((st1[a] == '\n') || (a >= max_size - 1)) {
69 |         st1[a] = 0;
70 |         break;
71 |       }
72 |       a++;
73 |     }
74 |     if (!strcmp(st1, "EXIT")) break;
75 |     cn = 0;
76 |     b = 0;
77 |     c = 0;
78 |     while (1) {
79 |       st[cn][b] = st1[c];
80 |       b++;
81 |       c++;
82 |       st[cn][b] = 0;
83 |       if (st1[c] == 0) break;
84 |       if (st1[c] == ' ') {
85 |         cn++;
86 |         b = 0;
87 |         c++;
88 |       }
89 |     }
90 |     cn++;
91 |     if (cn < 3) {
92 |       printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
93 |       continue;
94 |     }
95 |     for (a = 0; a < cn; a++) {
96 |       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
97 |       if (b == words) b = 0;
98 |       bi[a] = b;
99 |       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
100 |       if (b == 0) {
101 |         printf("Out of dictionary word!\n");
102 |         break;
103 |       }
104 |     }
105 |     if (b == 0) continue;
106 |     printf("\n                                              Word              Distance\n------------------------------------------------------------------------\n");
107 |     for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
108 |     len = 0;
109 |     for (a = 0; a < size; a++) len += vec[a] * vec[a];
110 |     len = sqrt(len);
111 |     for (a = 0; a < size; a++) vec[a] /= len;
112 |     for (a = 0; a < N; a++) bestd[a] = 0;
113 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
114 |     for (c = 0; c < words; c++) {
115 |       if (c == bi[0]) continue;
116 |       if (c == bi[1]) continue;
117 |       if (c == bi[2]) continue;
118 |       a = 0;
119 |       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
120 |       if (a == 1) continue;
121 |       dist = 0;
122 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
123 |       for (a = 0; a < N; a++) {
124 |         if (dist > bestd[a]) {
125 |           for (d = N - 1; d > a; d--) {
126 |             bestd[d] = bestd[d - 1];
127 |             strcpy(bestw[d], bestw[d - 1]);
128 |           }
129 |           bestd[a] = dist;
130 |           strcpy(bestw[a], &vocab[c * max_w]);
131 |           break;
132 |         }
133 |       }
134 |     }
135 |     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
136 |   }
137 |   return 0;
138 | }
139 | 
--------------------------------------------------------------------------------
/word2vec-c/compute-accuracy.c:
--------------------------------------------------------------------------------
1 | //  Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | //  Licensed under the Apache License, Version 2.0 (the "License");
4 | //  you may not use this file except in compliance with the License.
5 | //  You may obtain a copy of the License at
6 | //
7 | //      http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //  Unless required by applicable law or agreed to in writing, software
10 | //  distributed under the License is distributed on an "AS IS" BASIS,
11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //  See the License for the specific language governing permissions and
13 | //  limitations under the License.
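// A note on the expected input (inferred from the evaluation loop below): the
// program reads a questions file (questions-words.txt format) from stdin --
// lines of four words "a b c d" meaning "a is to b as c is to d", grouped into
// sections introduced by ": section-name". The counters treat the first five
// sections as semantic and the rest as syntactic, and all words are uppercased
// before lookup.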
14 | 
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <string.h>
18 | #include <math.h>
19 | #include <stdlib.h>  // mac os x
20 | #include <ctype.h>
21 | 
22 | const long long max_size = 2000;         // max length of strings
23 | const long long N = 1;                   // number of closest words
24 | const long long max_w = 50;              // max length of vocabulary entries
25 | 
26 | int main(int argc, char **argv)
27 | {
28 |   FILE *f;
29 |   char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
30 |   float dist, len, bestd[N], vec[max_size];
31 |   long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
32 |   float *M;
33 |   char *vocab;
34 |   int TCN, CCN = 0, TACN = 0, CACN = 0, SECN = 0, SYCN = 0, SEAC = 0, SYAC = 0, QID = 0, TQ = 0, TQS = 0;
35 |   if (argc < 2) {
36 |     printf("Usage: ./compute-accuracy <FILE> <threshold>\nwhere FILE contains word projections, and threshold is used to reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30000)\n");
37 |     return 0;
38 |   }
39 |   strcpy(file_name, argv[1]);
40 |   if (argc > 2) threshold = atoi(argv[2]);
41 |   f = fopen(file_name, "rb");
42 |   if (f == NULL) {
43 |     printf("Input file not found\n");
44 |     return -1;
45 |   }
46 |   fscanf(f, "%lld", &words);
47 |   if (threshold) if (words > threshold) words = threshold;
48 |   fscanf(f, "%lld", &size);
49 |   vocab = (char *)malloc(words * max_w * sizeof(char));
50 |   M = (float *)malloc(words * size * sizeof(float));
51 |   if (M == NULL) {
52 |     printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
53 |     return -1;
54 |   }
55 |   for (b = 0; b < words; b++) {
56 |     fscanf(f, "%s%c", &vocab[b * max_w], &ch);
57 |     for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]);
58 |     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
59 |     len = 0;
60 |     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
61 |     len = sqrt(len);
62 |     for (a = 0; a < size; a++) M[a + b * size] /= len;
63 |   }
64 |   fclose(f);
65 |   TCN = 0;
66 |   while (1) {
67 |     for (a = 0; a < N; a++) bestd[a] = 0;
68 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
69 |     scanf("%s", st1);
70 |     for (a = 0; a < strlen(st1); a++) st1[a] = toupper(st1[a]);
71 |     if ((!strcmp(st1, ":")) || (!strcmp(st1, "EXIT")) || feof(stdin)) {
72 |       if (TCN == 0) TCN = 1;
73 |       if (QID != 0) {
74 |         printf("ACCURACY TOP1: %.2f %%  (%d / %d)\n", CCN / (float)TCN * 100, CCN, TCN);
75 |         printf("Total accuracy: %.2f %%   Semantic accuracy: %.2f %%   Syntactic accuracy: %.2f %% \n", CACN / (float)TACN * 100, SEAC / (float)SECN * 100, SYAC / (float)SYCN * 100);
76 |       }
77 |       QID++;
78 |       scanf("%s", st1);
79 |       if (feof(stdin)) break;
80 |       printf("%s:\n", st1);
81 |       TCN = 0;
82 |       CCN = 0;
83 |       continue;
84 |     }
85 |     if (!strcmp(st1, "EXIT")) break;
86 |     scanf("%s", st2);
87 |     for (a = 0; a < strlen(st2); a++) st2[a] = toupper(st2[a]);
88 |     scanf("%s", st3);
89 |     for (a = 0; a < strlen(st3); a++) st3[a] = toupper(st3[a]);
90 |     scanf("%s", st4);
91 |     for (a = 0; a < strlen(st4); a++) st4[a] = toupper(st4[a]);
92 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st1)) break;
93 |     b1 = b;
94 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st2)) break;
95 |     b2 = b;
96 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st3)) break;
97 |     b3 = b;
98 |     for (a = 0; a < N; a++) bestd[a] = 0;
99 |     for (a = 0; a < N; a++) bestw[a][0] = 0;
100 |     TQ++;
101 |     if (b1 == words) continue;
102 |     if (b2 == words) continue;
103 |     if (b3 == words) continue;
104 |     for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st4)) break;
105 |     if (b == words) continue;
106 |     for (a = 0; a < size; a++) vec[a] = (M[a + b2 * size] - M[a + b1 * size]) + M[a + b3 * size];
107 |     TQS++;
108 |     for (c = 0; c < words; c++) {
109 |       if (c == b1) continue;
110 |       if (c == b2) continue;
111 |       if (c == b3) continue;
112 |       dist = 0;
113 |       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
114 |       for (a = 0; a < N; a++) {
115 |         if (dist > bestd[a]) {
116 |           for (d = N - 1; d > a; d--) {
117 |             bestd[d] = bestd[d - 1];
118 |             strcpy(bestw[d], bestw[d - 1]);
119 |           }
120 |           bestd[a] = dist;
121 |           strcpy(bestw[a], &vocab[c * max_w]);
122 |           break;
123 |         }
124 |       }
125 |     }
126 |     if (!strcmp(st4, bestw[0])) {
127 |       CCN++;
128 |       CACN++;
129 |       if (QID <= 5) SEAC++; else SYAC++;
130 |     }
131 |     if (QID <= 5) SECN++; else SYCN++;
132 |     TCN++;
133 |     TACN++;
134 |   }
135 |   printf("Questions seen / total: %d %d   %.2f %% \n", TQS, TQ, TQS/(float)TQ*100);
136 |   return 0;
137 | }
138 | 
--------------------------------------------------------------------------------
/word2vec/wordvectors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | try:
3 |     from sklearn.externals import joblib
4 | except:
5 |     joblib = None
6 | 
7 | from word2vec.utils import unitvec
8 | 
9 | 
10 | class WordVectors(object):
11 | 
12 |     def __init__(self, vocab, vectors=None, l2norm=None, save_memory=True):
13 |         """
14 |         Initialize a WordVectors class based on vocabulary and vectors
15 | 
16 |         This initializer precomputes the l2norm of the vectors
17 | 
18 |         Parameters
19 |         ----------
20 |         vocab : np.array
21 |             1d array with the vocabulary
22 |         vectors : np.array
23 |             2d array with the vectors calculated by word2vec
24 |         l2norm : np.array
25 |             2d array with the calculated l2norm of the vectors
26 |         save_memory : boolean
27 |             if True, keep only the normalized copy; if False, also keep the original vectors in `self.vectors`
28 |         """
29 |         if vectors is None and l2norm is None:
30 |             raise Exception('Need vectors OR l2norm arguments')
31 | 
32 |         self.vocab = vocab
33 | 
34 |         if l2norm is None:
35 |             if not save_memory:
36 |                 self.vectors = vectors
37 |             self.l2norm = np.vstack([unitvec(vec) for vec in vectors])
38 |         else:
39 |             self.l2norm = l2norm
40 | 
41 |     def ix(self, word):
42 |         """
43 |         Returns the index on self.vocab and self.l2norm for `word`
44 |         """
45 |         temp = np.where(self.vocab == word)[0]
46 |         if temp.size == 0:
47 |             raise KeyError('Word not in vocabulary')
48 |         else:
49 |             return temp[0]
50 | 
51 |     def get_vector(self, word):
52 |         """
53 |         Returns the (l2norm) vector for `word` in the vocabulary
54 |         """
55 |         idx = self.ix(word)
56 |         return self.l2norm[idx]
57 | 
58 |     def __getitem__(self, word):
59 |         return self.get_vector(word)
60 | 
61 |     def generate_response(self, indexes, metric):
62 |         """
63 |         Generates a response as a list of tuples based on the indexes
64 |         Each tuple is: (vocab[i], metric[i])
65 |         """
66 |         return [(word, sim) for word, sim in zip(self.vocab[indexes], metric[indexes])]
67 | 
68 |     def cosine(self, words, n=10):
69 |         """
70 |         Cosine similarity.
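        Because the rows of `self.l2norm` and the target vectors are all
        unit length, ranking by this dot product is equivalent to ranking
        by cosine similarity.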
71 | 
72 |         metric = dot(l2norm_of_vectors, l2norm_of_target_vector)
73 |         Uses a precomputed l2norm of the vectors
74 | 
75 |         Parameters
76 |         ----------
77 |         words : string or list of string
78 |             word(s) in the vocabulary to calculate the vectors
79 |         n : int, optional (default 10)
80 |             number of neighbors to return
81 | 
82 |         Returns
83 |         -------
84 |         dict: of list of tuples
85 | 
86 |         Example
87 |         -------
88 |         >>> model.cosine('black', n=2)
89 | 
90 | 
91 |         {'black': [('white', 0.94757425919916516),
92 |                    ('yellow', 0.94640807944950878)]
93 |         }
94 |         """
95 |         if isinstance(words, basestring):
96 |             words = [words]
97 | 
98 |         targets = np.vstack([self.get_vector(word) for word in words])
99 |         metrics = np.dot(self.l2norm, targets.T)
100 | 
101 |         ans = {}
102 |         for col, word in enumerate(words):
103 |             best = np.argsort(metrics[:, col])[::-1][1:n + 1]
104 |             best = self.generate_response(best, metrics[:, col])
105 |             ans[word] = best
106 | 
107 |         return ans
108 | 
109 |     def _cosine(self, word, n=10):
110 |         """
111 |         Test method for cosine distance using `scipy.spatial.distance.cosine`
112 | 
113 |         Note: This method is **a lot** slower than `self.cosine`
114 |         and the results are almost the same; you should be using `self.cosine`
115 | 
116 |         Requires: `__init__(..., save_memory=False)`
117 | 
118 |         Parameters
119 |         ----------
120 |         word : string
121 |             word in the vocabulary to calculate the vectors
122 |         n : int, optional (default 10)
123 |             number of neighbors to return
124 |         """
125 |         from scipy.spatial import distance
126 | 
127 |         target_vec = self[word]
128 |         metric = np.empty(self.vocab.shape)
129 |         for idx, vector in enumerate(self.vectors):
130 |             metric[idx] = distance.cosine(target_vec, vector)
131 |         best = metric.argsort()[1:n + 1]
132 | 
133 |         return self.generate_response(best, metric)
134 | 
135 |     def analogy(self, pos, neg, n=10):
136 |         """
137 |         Analogy similarity.
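        The query vector is built as the mean of the positive words'
        unit vectors and the negated negative words' unit vectors;
        candidates are then ranked by their dot product with this query.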
138 | 
139 |         Parameters
140 |         ----------
141 |         pos : list
142 |         neg : list
143 | 
144 |         Returns
145 |         -------
146 |         List of tuples, each tuple is  (word, similarity)
147 | 
148 | 
149 |         Example
150 |         -------
151 |         `king - man + woman = queen` will be:
152 |         `pos=['king', 'woman'], neg=['man']`
153 |         """
154 |         words = pos + neg
155 | 
156 |         pos = [(word, 1.0) for word in pos]
157 |         neg = [(word, -1.0) for word in neg]
158 | 
159 |         mean = []
160 |         for word, direction in pos + neg:
161 |             mean.append(direction * unitvec(self.get_vector(word)))
162 |         mean = np.array(mean).mean(axis=0)
163 | 
164 |         similarities = np.dot(self.l2norm, mean)
165 |         best = similarities.argsort()[::-1][1:n + len(words) - 1]
166 |         return self.generate_response(best, similarities)
167 | 
168 |     def to_mmap(self, fname):
169 |         if not joblib:
170 |             raise Exception("sklearn is needed to save as mmap")
171 | 
172 |         joblib.dump(self, fname)
173 | 
174 |     @classmethod
175 |     def from_binary(cls, fname, save_memory=True):
176 |         """
177 |         Create a WordVectors class based on a word2vec binary file
178 | 
179 |         Parameters
180 |         ----------
181 |         fname : path to file
182 |         save_memory : boolean
183 | 
184 |         Returns
185 |         -------
186 |         WordVectors class
187 |         """
188 |         with open(fname) as fin:
189 |             header = fin.readline()
190 |             vocab_size, vector_size = map(int, header.split())
191 |             vocab = []
192 | 
193 |             vectors = np.empty((vocab_size, vector_size), dtype=np.float)
194 |             binary_len = np.dtype(np.float32).itemsize * vector_size
195 |             for line_number in xrange(vocab_size):
196 |                 # mixed text and binary: read text first, then binary
197 |                 word = ''
198 |                 while True:
199 |                     ch = fin.read(1)
200 |                     if ch == ' ':
201 |                         break
202 |                     word += ch
203 |                 vocab.append(word)
204 | 
205 |                 vector = np.fromstring(fin.read(binary_len), np.float32)
206 |                 vectors[line_number] = vector
207 |                 fin.read(1)  # newline
208 |             vocab = np.array(vocab)
209 | 
210 |         return cls(vocab=vocab, vectors=vectors, save_memory=save_memory)
211 | 
212 |     @classmethod
213 |     def from_text(cls, fname, save_memory=True):
214 |         """
215 |         Create a WordVectors class based on a word2vec text file
216 | 
217 |         Parameters
218 |         ----------
219 |         fname : path to file
220 |         save_memory : boolean
221 | 
222 |         Returns
223 |         -------
224 |         WordVectors class
225 |         """
226 |         with open(fname) as f:
227 |             parts = f.readline().strip().split(' ')
228 |             shape = int(parts[0]), int(parts[1])
229 | 
230 |         vocab = np.genfromtxt(fname, dtype=object, delimiter=' ', usecols=0, skip_header=1)
231 | 
232 |         cols = np.arange(1, shape[1] + 1)
233 |         vectors = np.genfromtxt(fname, dtype=float, delimiter=' ', usecols=cols, skip_header=1)
234 | 
235 |         return cls(vocab=vocab, vectors=vectors, save_memory=save_memory)
236 | 
237 |     @classmethod
238 |     def from_mmap(cls, fname):
239 |         """
240 |         Create a WordVectors class from a memory map
241 | 
242 |         Parameters
243 |         ----------
244 |         fname : path to file
245 | 
246 | 
247 |         Returns
248 |         -------
249 |         WordVectors class
250 |         """
251 |         memmaped = joblib.load(fname, mmap_mode='r+')
252 |         return cls(vocab=memmaped.vocab, l2norm=memmaped.l2norm)
253 | 
--------------------------------------------------------------------------------
/word2vec-c/word2phrase.c:
--------------------------------------------------------------------------------
1 | //  Copyright 2013 Google Inc. All Rights Reserved.
2 | //
3 | //  Licensed under the Apache License, Version 2.0 (the "License");
4 | //  you may not use this file except in compliance with the License.
5 | //  You may obtain a copy of the License at
6 | //
7 | //      http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //  Unless required by applicable law or agreed to in writing, software
10 | //  distributed under the License is distributed on an "AS IS" BASIS,
11 | //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //  See the License for the specific language governing permissions and
13 | //  limitations under the License.
14 | 
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <string.h>
18 | #include <math.h>
19 | #include <pthread.h>
20 | 
21 | #define MAX_STRING 60
22 | 
23 | const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
24 | 
25 | typedef float real;                    // Precision of float numbers
26 | 
27 | struct vocab_word {
28 |   long long cn;
29 |   char *word;
30 | };
31 | 
32 | char train_file[MAX_STRING], output_file[MAX_STRING];
33 | struct vocab_word *vocab;
34 | int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
35 | long long vocab_max_size = 10000, vocab_size = 0;
36 | long long train_words = 0;
37 | real threshold = 100;
38 | 
39 | unsigned long long next_random = 1;
40 | 
41 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
42 | void ReadWord(char *word, FILE *fin) {
43 |   int a = 0, ch;
44 |   while (!feof(fin)) {
45 |     ch = fgetc(fin);
46 |     if (ch == 13) continue;
47 |     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
48 |       if (a > 0) {
49 |         if (ch == '\n') ungetc(ch, fin);
50 |         break;
51 |       }
52 |       if (ch == '\n') {
53 |         strcpy(word, (char *)"</s>");
54 |         return;
55 |       } else continue;
56 |     }
57 |     word[a] = ch;
58 |     a++;
59 |     if (a >= MAX_STRING - 1) a--;   // Truncate too long words
60 |   }
61 |   word[a] = 0;
62 | }
63 | 
64 | // Returns hash value of a word
65 | int GetWordHash(char *word) {
66 |   unsigned long long a, hash = 1;
67 |   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
68 |   hash = hash % vocab_hash_size;
69 |   return hash;
70 | }
71 | 
72 | // Returns position of a word in the vocabulary; if the word is not found, returns -1
73 | int SearchVocab(char *word) {
74 |   unsigned int hash = GetWordHash(word);
75 |   while (1) {
76 |     if (vocab_hash[hash] == -1) return -1;
77 |     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
78 |     hash = (hash + 1) % vocab_hash_size;
79 |   }
80 |   return -1;
81 | }
82 | 
83 | // Reads a word and returns its index in the vocabulary
84 | int ReadWordIndex(FILE *fin) {
85 |   char word[MAX_STRING];
86 |   ReadWord(word, fin);
87 |   if (feof(fin)) return -1;
88 |   return SearchVocab(word);
89 | }
90 | 
91 | // Adds a word to the vocabulary
92 | int AddWordToVocab(char *word) {
93 |   unsigned int hash, length = strlen(word) + 1;
94 |   if (length > MAX_STRING) length = MAX_STRING;
95 |   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
96 |   strcpy(vocab[vocab_size].word, word);
97 |   vocab[vocab_size].cn = 0;
98 |   vocab_size++;
99 |   // Reallocate memory if needed
100 |   if (vocab_size + 2 >= vocab_max_size) {
101 |     vocab_max_size += 10000;
102 |     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
103 |   }
104 |   hash = GetWordHash(word);
105 |   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
106 |   vocab_hash[hash] = vocab_size - 1;
107 |   return vocab_size - 1;
108 | }
109 | 
110 | // Used later for sorting by word counts
111 | int VocabCompare(const void *a, const void *b) {
112 |   return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
113 | }
114 | 
115 | // Sorts the vocabulary by frequency using word counts
116 | void SortVocab() {
117 |   int a;
118 |   unsigned int hash;
119 |   // Sort the vocabulary and keep </s> at the first position
120 |   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
121 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
122 |   for (a = 0; a < vocab_size; a++) {
123 |     // Words occuring less than min_count times will be discarded from the vocab
124 |     if (vocab[a].cn < min_count) {
125 |       vocab_size--;
126 |       free(vocab[vocab_size].word);
127 |     } else {
128 |       // Hash will be re-computed, as after the sorting it is not actual
129 |       hash = GetWordHash(vocab[a].word);
130 |       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
131 |       vocab_hash[hash] = a;
132 |     }
133 |   }
134 |   vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
135 | }
136 | 
137 | // Reduces the vocabulary by removing infrequent tokens
138 | void ReduceVocab() {
139 |   int a, b = 0;
140 |   unsigned int hash;
141 |   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
142 |     vocab[b].cn = vocab[a].cn;
143 |     vocab[b].word = vocab[a].word;
144 |     b++;
145 |   } else free(vocab[a].word);
146 |   vocab_size = b;
147 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
148 |   for (a = 0; a < vocab_size; a++) {
149 |     // Hash will be re-computed, as it is not actual
150 |     hash = GetWordHash(vocab[a].word);
151 |     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
152 |     vocab_hash[hash] = a;
153 |   }
154 |   fflush(stdout);
155 |   min_reduce++;
156 | }
157 | 
158 | void LearnVocabFromTrainFile() {
159 |   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
160 |   FILE *fin;
161 |   long long a, i, start = 1;
162 |   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
163 |   fin = fopen(train_file, "rb");
164 |   if (fin == NULL) {
165 |     printf("ERROR: training data file not found!\n");
166 |     exit(1);
167 |   }
168 |   vocab_size = 0;
169 |   AddWordToVocab((char *)"</s>");
170 |   while (1) {
171 |     ReadWord(word, fin);
172 |     if (feof(fin)) break;
173 |     if (!strcmp(word, "</s>")) {
174 |       start = 1;
175 |       continue;
176 |     } else start = 0;
177 |     train_words++;
178 |     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
179 |       printf("Words processed: %lldK     Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
180 |       fflush(stdout);
181 |     }
182 |     i = SearchVocab(word);
183 |     if (i == -1) {
184 |       a = AddWordToVocab(word);
185 |       vocab[a].cn = 1;
186 |     } else vocab[i].cn++;
187 |     if (start) continue;
188 |     sprintf(bigram_word, "%s_%s", last_word, word);
189 |     bigram_word[MAX_STRING - 1] = 0;
190 |     strcpy(last_word, word);
191 |     i = SearchVocab(bigram_word);
192 |     if (i == -1) {
193 |       a = AddWordToVocab(bigram_word);
194 |       vocab[a].cn = 1;
195 |     } else vocab[i].cn++;
196 |     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
197 |   }
198 |   SortVocab();
199 |   if (debug_mode > 0) {
200 |     printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
201 |     printf("Words in train file: %lld\n", train_words);
202 |   }
203 |   fclose(fin);
204 | }
205 | 
206 | void TrainModel() {
207 |   long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
208 |   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
209 |   real score;
210 |   FILE *fo, *fin;
211 |   printf("Starting training using file %s\n", train_file);
212 |   LearnVocabFromTrainFile();
213 |   fin = fopen(train_file, "rb");
214 |   fo = fopen(output_file, "wb");
215 |   word[0] = 0;
216 |   while (1) {
217 |     strcpy(last_word, word);
218 |     ReadWord(word, fin);
219 |     if (feof(fin)) break;
220 |     if (!strcmp(word, "</s>")) {
221 |       fprintf(fo, "\n");
222 |       continue;
223 |     }
224 |     cn++;
225 |     if ((debug_mode > 1) && (cn % 100000 == 0)) {
226 |       printf("Words written: %lldK%c", cn / 1000, 13);
227 |       fflush(stdout);
228 |     }
229 |     oov = 0;
230 |     i = SearchVocab(word);
231 |     if (i == -1) oov = 1; else pb = vocab[i].cn;
232 |     if (li == -1) oov = 1;
233 |     li = i;
234 |     sprintf(bigram_word, "%s_%s", last_word, word);
235 |     bigram_word[MAX_STRING - 1] = 0;
236 |     i = SearchVocab(bigram_word);
237 |     if (i == -1) oov = 1; else pab = vocab[i].cn;
238 |     if (pa < min_count) oov = 1;
239 |     if (pb < min_count) oov = 1;
240 |     if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
241 |     if (score > threshold) {
242 |       fprintf(fo, "_%s", word);
243 |       pb = 0;
244 |     } else fprintf(fo, " %s", word);
245 |     pa = pb;
246 |   }
247 |   fclose(fo);
248 |   fclose(fin);
249 | }
250 | 
251 | int ArgPos(char *str, int argc, char **argv) {
252 |   int a;
253 |   for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
254 |     if (a == argc - 1) {
255 |       printf("Argument missing for %s\n", str);
256 |       exit(1);
257 |     }
258 |     return a;
259 |   }
260 |   return -1;
261 | }
262 | 
263 | int main(int argc, char **argv) {
264 |   int i;
265 |   if (argc == 1) {
266 |     printf("WORD2PHRASE tool v0.1a\n\n");
267 |     printf("Options:\n");
268 |     printf("Parameters for training:\n");
269 |     printf("\t-train <file>\n");
270 |     printf("\t\tUse text data from <file> to train the model\n");
271 |     printf("\t-output <file>\n");
272 |     printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
273 |     printf("\t-min-count <int>\n");
274 |     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
275 |     printf("\t-threshold <float>\n");
276 |     printf("\t\tThe <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n");
277 |     printf("\t-debug <int>\n");
278 |     printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
279 |     printf("\nExamples:\n");
280 |     printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
281 |     return 0;
282 |   }
283 |   if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
284 |   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
285 |   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
286 |   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
287 |   if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
288 |   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
289 |   vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
290 |   TrainModel();
291 |   return 0;
292 | }
293 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | 
2 |                                  Apache License
3 |                            Version 2.0, January 2004
4 |                         http://www.apache.org/licenses/
5 | 
6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 |    1. Definitions.
9 | 
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /word2vec-c/word2vec.c: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <pthread.h> 20 | 21 | #define MAX_STRING 100 22 | #define EXP_TABLE_SIZE 1000 23 | #define MAX_EXP 6 24 | #define MAX_SENTENCE_LENGTH 1000 25 | #define MAX_CODE_LENGTH 40 26 | 27 | const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary 28 | 29 | typedef float real; // Precision of float numbers 30 | 31 | struct vocab_word { 32 | long long cn; 33 | int *point; 34 | char *word, *code, codelen; 35 | }; 36 | 37 | FILE *fin; 38 | 39 | char train_file[MAX_STRING], output_file[MAX_STRING]; 40 | char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING]; 41 | struct vocab_word *vocab; 42 | int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1; 43 | int *vocab_hash; 44 | long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100; 45 | long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0; 46 | real alpha = 0.025, starting_alpha, sample = 0; 47 | real *syn0, *syn1, *syn1neg, *expTable; 48 | clock_t start; 49 | 50 | int hs = 1, negative = 0; 51 | const int table_size = 1e8; 52 | int *table; 53 | 54 | void InitUnigramTable() { 55 | int a, i; 56 | long long train_words_pow = 0; 57 | real d1, power = 0.75; 58 | table = (int *)malloc(table_size * sizeof(int)); 59 | if (table == NULL) { 60 | fprintf(stderr, "cannot allocate memory for the table\n"); 61 | exit(1); 62 | } 63 | for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); 64 | i = 0; 65 | d1 = pow(vocab[i].cn, power) / (real)train_words_pow; 66 | for (a = 0; a < table_size; a++) { 67 | table[a] = i; 68 | if (a / (real)table_size > d1) { 69 | i++; 70 | d1 += pow(vocab[i].cn, power) / (real)train_words_pow; 71 | } 72 | if (i >= vocab_size) i = vocab_size - 1; 73 | } 74 | } 75 | 76 | // Reads a single word from a file, assuming space + tab + EOL to be word boundaries 77 | void ReadWord(char *word, FILE *fin) { 78 | int a = 0, ch; 79 | while (!feof(fin)) { 80 | ch = fgetc(fin); 81 | if (ch == 13) continue; 82 | if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { 83 | if (a > 0) { 84 | if (ch == '\n') ungetc(ch, fin); 85 | break; 86 | } 87 | if (ch == '\n') { 88 | strcpy(word, (char
*)""); 89 | return; 90 | } else continue; 91 | } 92 | word[a] = ch; 93 | a++; 94 | if (a >= MAX_STRING - 1) a--; // Truncate too long words 95 | } 96 | word[a] = 0; 97 | } 98 | 99 | // Returns hash value of a word 100 | int GetWordHash(char *word) { 101 | unsigned long long a, hash = 0; 102 | for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; 103 | hash = hash % vocab_hash_size; 104 | return hash; 105 | } 106 | 107 | // Returns position of a word in the vocabulary; if the word is not found, returns -1 108 | int SearchVocab(char *word) { 109 | unsigned int hash = GetWordHash(word); 110 | while (1) { 111 | if (vocab_hash[hash] == -1) return -1; 112 | if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; 113 | hash = (hash + 1) % vocab_hash_size; 114 | } 115 | return -1; 116 | } 117 | 118 | // Reads a word and returns its index in the vocabulary 119 | int ReadWordIndex(FILE *fin) { 120 | char word[MAX_STRING]; 121 | ReadWord(word, fin); 122 | if (feof(fin)) return -1; 123 | return SearchVocab(word); 124 | } 125 | 126 | // Adds a word to the vocabulary 127 | int AddWordToVocab(char *word) { 128 | unsigned int hash, length = strlen(word) + 1; 129 | if (length > MAX_STRING) length = MAX_STRING; 130 | vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); 131 | strcpy(vocab[vocab_size].word, word); 132 | vocab[vocab_size].cn = 0; 133 | vocab_size++; 134 | // Reallocate memory if needed 135 | if (vocab_size + 2 >= vocab_max_size) { 136 | vocab_max_size += 1000; 137 | vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); 138 | } 139 | hash = GetWordHash(word); 140 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 141 | vocab_hash[hash] = vocab_size - 1; 142 | return vocab_size - 1; 143 | } 144 | 145 | // Used later for sorting by word counts 146 | int VocabCompare(const void *a, const void *b) { 147 | return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn; 148 | } 149 | 150 | void DestroyVocab() { 151 | int a; 152 | 153 | for (a = 0; a < vocab_size; a++) { 154 | if (vocab[a].word != NULL) { 155 | free(vocab[a].word); 156 | } 157 | if (vocab[a].code != NULL) { 158 | free(vocab[a].code); 159 | } 160 | if (vocab[a].point != NULL) { 161 | free(vocab[a].point); 162 | } 163 | } 164 | free(vocab[vocab_size].word); 165 | free(vocab); 166 | } 167 | 168 | // Sorts the vocabulary by frequency using word counts 169 | void SortVocab() { 170 | int a, size; 171 | unsigned int hash; 172 | // Sort the vocabulary and keep at the first position 173 | qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); 174 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 175 | size = vocab_size; 176 | train_words = 0; 177 | for (a = 1; a < size; a++) { // Skip 178 | // Words occuring less than min_count times will be discarded from the vocab 179 | if (vocab[a].cn < min_count) { 180 | vocab_size--; 181 | free(vocab[a].word); 182 | vocab[a].word = NULL; 183 | } else { 184 | // Hash will be re-computed, as after the sorting it is not actual 185 | hash=GetWordHash(vocab[a].word); 186 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 187 | vocab_hash[hash] = a; 188 | train_words += vocab[a].cn; 189 | } 190 | } 191 | vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); 192 | // Allocate memory for the binary tree construction 193 | for (a = 0; a < vocab_size; a++) { 194 | vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); 195 | 
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); 196 | } 197 | } 198 | 199 | // Reduces the vocabulary by removing infrequent tokens 200 | void ReduceVocab() { 201 | int a, b = 0; 202 | unsigned int hash; 203 | for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { 204 | vocab[b].cn = vocab[a].cn; 205 | vocab[b].word = vocab[a].word; 206 | b++; 207 | } else free(vocab[a].word); 208 | vocab_size = b; 209 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 210 | for (a = 0; a < vocab_size; a++) { 211 | // Hash will be re-computed, as it is no longer valid 212 | hash = GetWordHash(vocab[a].word); 213 | while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; 214 | vocab_hash[hash] = a; 215 | } 216 | fflush(stdout); 217 | min_reduce++; 218 | } 219 | 220 | // Create binary Huffman tree using the word counts 221 | // Frequent words will have short unique binary codes 222 | void CreateBinaryTree() { 223 | long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; 224 | char code[MAX_CODE_LENGTH]; 225 | long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 226 | long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 227 | long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); 228 | for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; 229 | for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; 230 | pos1 = vocab_size - 1; 231 | pos2 = vocab_size; 232 | // Following algorithm constructs the Huffman tree by adding one node at a time 233 | for (a = 0; a < vocab_size - 1; a++) { 234 | // First, find two smallest nodes 'min1, min2' 235 | if (pos1 >= 0) { 236 | if (count[pos1] < count[pos2]) { 237 | min1i = pos1; 238 | pos1--; 239 | } else { 240 | min1i = pos2; 241 | pos2++; 242 | } 243 | } else { 244 | min1i = pos2; 245 | pos2++; 246 | } 247 | if (pos1 >= 0) { 248 | if (count[pos1] < count[pos2]) { 249 | min2i = pos1; 250 | pos1--; 251 | } else { 252 | min2i = pos2; 253 | pos2++; 254 | } 255 | } else { 256 | min2i = pos2; 257 | pos2++; 258 | } 259 | count[vocab_size + a] = count[min1i] + count[min2i]; 260 | parent_node[min1i] = vocab_size + a; 261 | parent_node[min2i] = vocab_size + a; 262 | binary[min2i] = 1; 263 | } 264 | // Now assign binary code to each vocabulary word 265 | for (a = 0; a < vocab_size; a++) { 266 | b = a; 267 | i = 0; 268 | while (1) { 269 | code[i] = binary[b]; 270 | point[i] = b; 271 | i++; 272 | b = parent_node[b]; 273 | if (b == vocab_size * 2 - 2) break; 274 | } 275 | vocab[a].codelen = i; 276 | vocab[a].point[0] = vocab_size - 2; 277 | for (b = 0; b < i; b++) { 278 | vocab[a].code[i - b - 1] = code[b]; 279 | vocab[a].point[i - b] = point[b] - vocab_size; 280 | } 281 | } 282 | free(count); 283 | free(binary); 284 | free(parent_node); 285 | } 286 | 287 | void LearnVocabFromTrainFile() { 288 | char word[MAX_STRING]; 289 | //FILE *fin; 290 | long long a, i; 291 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 292 | //fin = fopen(train_file, "rb"); 293 | if (fin == NULL) { 294 | printf("ERROR: training data file not found!\n"); 295 | exit(1); 296 | } 297 | rewind(fin); // check the handle before rewinding; rewind(NULL) would crash 298 | vocab_size = 0; 299 | AddWordToVocab((char *)"</s>"); 300 | while (1) { 301 | ReadWord(word, fin); 302 | if (feof(fin)) break; 303 | train_words++; 304 | if ((debug_mode > 1) && (train_words % 100000 == 0)) { 305 | printf("%lldK%c", train_words / 1000, 13); 306 | fflush(stdout); 307 | } 308 | i = SearchVocab(word); 309 | if (i == -1) { 310 | a = AddWordToVocab(word);
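// first occurrence of this word: the new entry's count is initialized to 1 just below; repeat occurrences only increment vocab[i].cn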
311 | vocab[a].cn = 1; 312 | } else vocab[i].cn++; 313 | if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); 314 | } 315 | SortVocab(); 316 | if (debug_mode > 0) { 317 | printf("Vocab size: %lld\n", vocab_size); 318 | printf("Words in train file: %lld\n", train_words); 319 | } 320 | file_size = ftell(fin); 321 | //fclose(fin); 322 | } 323 | 324 | void SaveVocab() { 325 | long long i; 326 | FILE *fo = fopen(save_vocab_file, "wb"); 327 | for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); 328 | fclose(fo); 329 | } 330 | 331 | void ReadVocab() { 332 | long long a, i = 0; 333 | char c; 334 | char word[MAX_STRING]; 335 | FILE *fvb = fopen(read_vocab_file, "rb"); 336 | if (fvb == NULL) { 337 | printf("Vocabulary file not found\n"); 338 | exit(1); 339 | } 340 | for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; 341 | vocab_size = 0; 342 | while (1) { 343 | ReadWord(word, fvb); 344 | if (feof(fvb)) break; 345 | a = AddWordToVocab(word); 346 | fscanf(fvb, "%lld%c", &vocab[a].cn, &c); 347 | i++; 348 | } 349 | SortVocab(); 350 | if (debug_mode > 0) { 351 | printf("Vocab size: %lld\n", vocab_size); 352 | printf("Words in train file: %lld\n", train_words); 353 | } 354 | //fin = fopen(train_file, "rb"); 355 | if (fin == NULL) { 356 | printf("ERROR: training data file not found!\n"); 357 | exit(1); 358 | } 359 | rewind(fin); // check the handle before rewinding; rewind(NULL) would crash 360 | fseek(fin, 0, SEEK_END); 361 | file_size = ftell(fin); 362 | //fclose(fin); 363 | } 364 | 365 | void InitNet() { 366 | long long a, b; 367 | a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); 368 | if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} 369 | if (hs) { 370 | a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); 371 | if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} 372 | for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) 373 | syn1[a * layer1_size + b] = 0; 374 | } 375 | if (negative > 0) { 376 | a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); 377 | if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} 378 | for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) 379 | syn1neg[a * layer1_size + b] = 0; 380 | } 381 | for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) 382 | syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size; 383 | CreateBinaryTree(); 384 | } 385 | 386 | void DestroyNet() { 387 | if (syn0 != NULL) { 388 | free(syn0); 389 | } 390 | if (syn1 != NULL) { 391 | free(syn1); 392 | } 393 | if (syn1neg != NULL) { 394 | free(syn1neg); 395 | } 396 | } 397 | 398 | void *TrainModelThread(void *id) { 399 | long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0; 400 | long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; 401 | long long l1, l2, c, target, label; 402 | unsigned long long next_random = (long long)id; 403 | real f, g; 404 | clock_t now; 405 | real *neu1 = (real *)calloc(layer1_size, sizeof(real)); 406 | real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 407 | FILE *fi = fin; //fopen(train_file, "rb"); NOTE: all threads share this one handle, which is only safe with -threads 1 408 | if (fi == NULL) { 409 | fprintf(stderr, "no such file or directory: %s\n", train_file); 410 | exit(1); 411 | } 412 | rewind(fi); 413 | fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); 414 | while (1) { 415 | if (word_count - last_word_count > 10000) { 416 | word_count_actual += word_count -
last_word_count; 417 | last_word_count = word_count; 418 | if ((debug_mode > 1)) { 419 | now=clock(); 420 | printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, 421 | word_count_actual / (real)(train_words + 1) * 100, 422 | word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); 423 | fflush(stdout); 424 | } 425 | alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1)); 426 | if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; 427 | } 428 | if (sentence_length == 0) { 429 | while (1) { 430 | word = ReadWordIndex(fi); 431 | if (feof(fi)) break; 432 | if (word == -1) continue; 433 | word_count++; 434 | if (word == 0) break; 435 | // The subsampling randomly discards frequent words while keeping the ranking same 436 | if (sample > 0) { 437 | real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; 438 | next_random = next_random * (unsigned long long)25214903917 + 11; 439 | if (ran < (next_random & 0xFFFF) / (real)65536) continue; 440 | } 441 | sen[sentence_length] = word; 442 | sentence_length++; 443 | if (sentence_length >= MAX_SENTENCE_LENGTH) break; 444 | } 445 | sentence_position = 0; 446 | } 447 | if (feof(fi)) break; 448 | if (word_count > train_words / num_threads) break; 449 | word = sen[sentence_position]; 450 | if (word == -1) continue; 451 | for (c = 0; c < layer1_size; c++) neu1[c] = 0; 452 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 453 | next_random = next_random * (unsigned long long)25214903917 + 11; 454 | b = next_random % window; 455 | if (cbow) { //train the cbow architecture 456 | // in -> hidden 457 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 458 | c = sentence_position - window + a; 459 | if (c < 0) continue; 460 | if (c >= sentence_length) continue; 461 | last_word = sen[c]; 462 | if (last_word == -1) continue; 463 | for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; 464 | } 465 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 466 | f = 0; 467 | l2 = vocab[word].point[d] * layer1_size; 468 | // Propagate hidden -> output 469 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; 470 | if (f <= -MAX_EXP) continue; 471 | else if (f >= MAX_EXP) continue; 472 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 473 | // 'g' is the gradient multiplied by the learning rate 474 | g = (1 - vocab[word].code[d] - f) * alpha; 475 | // Propagate errors output -> hidden 476 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 477 | // Learn weights hidden -> output 478 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; 479 | } 480 | // NEGATIVE SAMPLING 481 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 482 | if (d == 0) { 483 | target = word; 484 | label = 1; 485 | } else { 486 | next_random = next_random * (unsigned long long)25214903917 + 11; 487 | target = table[(next_random >> 16) % table_size]; 488 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 489 | if (target == word) continue; 490 | label = 0; 491 | } 492 | l2 = target * layer1_size; 493 | f = 0; 494 | for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; 495 | if (f > MAX_EXP) g = (label - 1) * alpha; 496 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 497 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 498 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 499 | for (c = 0; 
c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; 500 | } 501 | // hidden -> in 502 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 503 | c = sentence_position - window + a; 504 | if (c < 0) continue; 505 | if (c >= sentence_length) continue; 506 | last_word = sen[c]; 507 | if (last_word == -1) continue; 508 | for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; 509 | } 510 | } else { //train skip-gram 511 | for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { 512 | c = sentence_position - window + a; 513 | if (c < 0) continue; 514 | if (c >= sentence_length) continue; 515 | last_word = sen[c]; 516 | if (last_word == -1) continue; 517 | l1 = last_word * layer1_size; 518 | for (c = 0; c < layer1_size; c++) neu1e[c] = 0; 519 | // HIERARCHICAL SOFTMAX 520 | if (hs) for (d = 0; d < vocab[word].codelen; d++) { 521 | f = 0; 522 | l2 = vocab[word].point[d] * layer1_size; 523 | // Propagate hidden -> output 524 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; 525 | if (f <= -MAX_EXP) continue; 526 | else if (f >= MAX_EXP) continue; 527 | else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; 528 | // 'g' is the gradient multiplied by the learning rate 529 | g = (1 - vocab[word].code[d] - f) * alpha; 530 | // Propagate errors output -> hidden 531 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; 532 | // Learn weights hidden -> output 533 | for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; 534 | } 535 | // NEGATIVE SAMPLING 536 | if (negative > 0) for (d = 0; d < negative + 1; d++) { 537 | if (d == 0) { 538 | target = word; 539 | label = 1; 540 | } else { 541 | next_random = next_random * (unsigned long long)25214903917 + 11; 542 | target = table[(next_random >> 16) % table_size]; 543 | if (target == 0) target = next_random % (vocab_size - 1) + 1; 544 | if (target == word) continue; 545 | label = 0; 546 | } 547 | l2 = target * layer1_size; 548 | f = 0; 549 | for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; 550 | if (f > MAX_EXP) g = (label - 1) * alpha; 551 | else if (f < -MAX_EXP) g = (label - 0) * alpha; 552 | else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; 553 | for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; 554 | for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; 555 | } 556 | // Learn weights input -> hidden 557 | for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; 558 | } 559 | } 560 | sentence_position++; 561 | if (sentence_position >= sentence_length) { 562 | sentence_length = 0; 563 | continue; 564 | } 565 | } 566 | //fclose(fi); 567 | free(neu1); 568 | free(neu1e); 569 | pthread_exit(NULL); 570 | } 571 | 572 | void TrainModel() { 573 | long a, b, c, d; 574 | FILE *fo; 575 | pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 576 | if (pt == NULL) { 577 | fprintf(stderr, "cannot allocate memory for threads\n"); 578 | exit(1); 579 | } 580 | printf("Starting training using file %s\n", train_file); 581 | starting_alpha = alpha; 582 | if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); 583 | if (save_vocab_file[0] != 0) SaveVocab(); 584 | if (output_file[0] == 0) return; 585 | InitNet(); 586 | if (negative > 0) InitUnigramTable(); 587 | start = clock(); 588 | for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); 589 | for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); 
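// All training threads have finished. Next, the results are written out: with -classes 0 the word vectors themselves are saved (text or binary, depending on the -binary flag); otherwise K-means is run on the vectors and only each word's cluster id is written.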
590 | fo = fopen(output_file, "wb"); 591 | if (fo == NULL) { 592 | fprintf(stderr, "Cannot open %s for writing\n", output_file); 593 | exit(1); 594 | } 595 | if (classes == 0) { 596 | // Save the word vectors 597 | fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); 598 | for (a = 0; a < vocab_size; a++) { 599 | if (vocab[a].word != NULL) { 600 | fprintf(fo, "%s ", vocab[a].word); 601 | } 602 | if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); 603 | else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); 604 | fprintf(fo, "\n"); 605 | } 606 | } else { 607 | // Run K-means on the word vectors 608 | int clcn = classes, iter = 10, closeid; 609 | int *centcn = (int *)malloc(classes * sizeof(int)); 610 | if (centcn == NULL) { 611 | fprintf(stderr, "cannot allocate memory for centcn\n"); 612 | exit(1); 613 | } 614 | int *cl = (int *)calloc(vocab_size, sizeof(int)); 615 | real closev, x; 616 | real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); 617 | for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; 618 | for (a = 0; a < iter; a++) { 619 | for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; 620 | for (b = 0; b < clcn; b++) centcn[b] = 1; 621 | for (c = 0; c < vocab_size; c++) { 622 | for (d = 0; d < layer1_size; d++) { 623 | cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; 624 | } 625 | centcn[cl[c]]++; // count each word once per pass, not once per dimension 626 | } 627 | for (b = 0; b < clcn; b++) { 628 | closev = 0; 629 | for (c = 0; c < layer1_size; c++) { 630 | cent[layer1_size * b + c] /= centcn[b]; 631 | closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; 632 | } 633 | closev = sqrt(closev); 634 | for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; 635 | } 636 | for (c = 0; c < vocab_size; c++) { 637 | closev = -10; 638 | closeid = 0; 639 | for (d = 0; d < clcn; d++) { 640 | x = 0; 641 | for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; 642 | if (x > closev) { 643 | closev = x; 644 | closeid = d; 645 | } 646 | } 647 | cl[c] = closeid; 648 | } 649 | } 650 | // Save the K-means classes 651 | for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); 652 | free(centcn); 653 | free(cent); 654 | free(cl); 655 | } 656 | fclose(fo); 657 | free(table); 658 | free(pt); 659 | DestroyVocab(); 660 | } 661 | 662 | int ArgPos(char *str, int argc, char **argv) { 663 | int a; 664 | for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { 665 | if (a == argc - 1) { 666 | printf("Argument missing for %s\n", str); 667 | exit(1); 668 | } 669 | return a; 670 | } 671 | return -1; 672 | } 673 | 674 | int main(int argc, char **argv) { 675 | int i; 676 | if (argc == 1) { 677 | printf("WORD VECTOR estimation toolkit v 0.1b\n\n"); 678 | printf("Options:\n"); 679 | printf("Parameters for training:\n"); 680 | printf("\t-train <file>\n"); 681 | printf("\t\tUse text data from <file> to train the model\n"); 682 | printf("\t-output <file>\n"); 683 | printf("\t\tUse <file> to save the resulting word vectors / word clusters\n"); 684 | printf("\t-size <int>\n"); 685 | printf("\t\tSet size of word vectors; default is 100\n"); 686 | printf("\t-window <int>\n"); 687 | printf("\t\tSet max skip length between words; default is 5\n"); 688 | printf("\t-sample <float>\n"); 689 | printf("\t\tSet threshold for occurrence of words.
Those that appear with higher frequency"); 690 | printf(" in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5\n"); 691 | printf("\t-hs <int>\n"); 692 | printf("\t\tUse Hierarchical Softmax; default is 1 (0 = not used)\n"); 693 | printf("\t-negative <int>\n"); 694 | printf("\t\tNumber of negative examples; default is 0, common values are 5 - 10 (0 = not used)\n"); 695 | printf("\t-threads <int>\n"); 696 | printf("\t\tUse <int> threads (default 1)\n"); 697 | printf("\t-min-count <int>\n"); 698 | printf("\t\tThis will discard words that appear less than <int> times; default is 5\n"); 699 | printf("\t-alpha <float>\n"); 700 | printf("\t\tSet the starting learning rate; default is 0.025\n"); 701 | printf("\t-classes <int>\n"); 702 | printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); 703 | printf("\t-debug <int>\n"); 704 | printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); 705 | printf("\t-binary <int>\n"); 706 | printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n"); 707 | printf("\t-save-vocab <file>\n"); 708 | printf("\t\tThe vocabulary will be saved to <file>\n"); 709 | printf("\t-read-vocab <file>\n"); 710 | printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n"); 711 | printf("\t-cbow <int>\n"); 712 | printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n"); 713 | printf("\nExamples:\n"); 714 | printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n"); 715 | return 0; 716 | } 717 | output_file[0] = 0; 718 | save_vocab_file[0] = 0; 719 | read_vocab_file[0] = 0; 720 | if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); 721 | if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); 722 | if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); 723 | if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); 724 | if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); 725 | if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); 726 | if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); 727 | if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); 728 | if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); 729 | if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); 730 | if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); 731 | if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); 732 | if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); 733 | if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); 734 | if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); 735 | if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); 736 | 737 | fin = fopen(train_file, "rb"); 738 | vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 739 | vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); 740 | expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); 741 | if (expTable == NULL) { 742 | fprintf(stderr, "out of memory\n"); 743 | exit(1); 744 | } 745 | for (i = 0; i < EXP_TABLE_SIZE; i++) { 746 |
expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute e^x for x evenly spaced in [-MAX_EXP, MAX_EXP] 747 | expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute the sigmoid f(x) = e^x / (e^x + 1) 748 | } 749 | TrainModel(); 750 | DestroyNet(); 751 | free(vocab_hash); 752 | free(expTable); 753 | return 0; 754 | } 755 | --------------------------------------------------------------------------------
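For reference: the binary output written by word2vec.c above (-binary 1) is a text header "vocab_size layer1_size\n" followed, per word, by the token, one space, layer1_size raw floats (real is typedef'd to float above), and a newline. A minimal standalone reader, sketched for illustration only (not part of the repository; "vectors.bin" is just an illustrative default name):

/* sketch: read back the -binary 1 output format described above */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  long long words, size, i;
  char word[100];
  float *vec;
  FILE *f = fopen(argc > 1 ? argv[1] : "vectors.bin", "rb");
  if (f == NULL) { fprintf(stderr, "cannot open input file\n"); return 1; }
  if (fscanf(f, "%lld %lld", &words, &size) != 2) { fclose(f); return 1; }  // header: vocab size, vector size
  vec = (float *)malloc(size * sizeof(float));
  for (i = 0; i < words; i++) {
    if (fscanf(f, "%99s", word) != 1) break;  // the token; leading whitespace (previous newline) is skipped
    fgetc(f);                                 // consume the single space after the token
    if (fread(vec, sizeof(float), size, f) != (size_t)size) break;  // raw float components
    printf("%s %f ...\n", word, vec[0]);      // e.g. show the first component of each vector
  }
  free(vec);
  fclose(f);
  return 0;
}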