├── document2vec
│   ├── __init__.py
│   ├── corpora.py
│   ├── tests
│   │   └── test_document2vec.py
│   └── document2vec.py
├── models
│   ├── small.w2v
│   └── bigmodel2small.py
├── setup.py
├── .travis.yml
├── .gitignore
├── LICENSE
└── README.md
--------------------------------------------------------------------------------
/document2vec/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/small.w2v:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cemoody/Document2Vec/HEAD/models/small.w2v
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# setuptools (rather than distutils) is needed for install_requires to work,
# and the package must be listed for it to actually be installed.
from setuptools import setup

setup(name='Document2Vec',
      version='0.1',
      description='Finding document vectors from pre-trained word vectors',
      author='Christopher Erick Moody',
      author_email='chrisemoody@gmail.com',
      packages=['document2vec'],
      install_requires=['pandas', 'numpy', 'gensim'],
      url='https://github.com/cemoody/Document2Vec')
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Much of this script was adapted from astropy/astropy.

language: python

env:
  global:
    - NUMPY_VERSION=1.8

matrix:
  include:
    # All the recent versions of Python.
    - python: 2.7
    - python: 3.3
    - python: 3.4

before_install:
  - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
  - chmod +x miniconda.sh
  - ./miniconda.sh -b
  - export PATH=/home/travis/miniconda/bin:$PATH
  - conda update --yes conda

install:
  - conda create --yes -n testing python=$TRAVIS_PYTHON_VERSION
  - source activate testing
  - conda install --yes numpy=$NUMPY_VERSION nose pip numba cython pandas
  - pip install gensim

script:
  - ls
  - pwd
  - env | sort
  - nosetests -v
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/models/bigmodel2small.py:
--------------------------------------------------------------------------------
import numpy as np
from gensim.models import Word2Vec

# Load the full word2vec model and the whitelist of words to keep.
model_w2v = Word2Vec.load("/home/moody/projects/Parachute/data/data-all-02.py2")
words = set([l.strip() for l in open('../data/small.list').readlines()[1:]])

# Collect the row indices of the whitelisted words present in the vocabulary.
indices = []
for w in words:
    v = model_w2v.vocab.get(w, None)
    if v is None:
        continue
    indices.append(v.index)
indices = np.array(indices, dtype=np.int)

# Slice the weight matrices and lookup tables down to the whitelisted subset.
syn0 = model_w2v.syn0[indices]
syn1 = model_w2v.syn1[indices]
index2word = list(np.array(model_w2v.index2word)[indices])
vocab = {k: v for k, v in model_w2v.vocab.items() if k in words}

# Re-index the surviving vocabulary entries against the new, smaller index2word.
for w, v in vocab.items():
    v.index = index2word.index(w)

model_w2v.syn0 = syn0
model_w2v.syn1 = syn1
model_w2v.vocab = vocab
model_w2v.index2word = index2word

model_w2v.save("../data/data-all-02.py2.small")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Christopher Erick Moody

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/document2vec/corpora.py:
--------------------------------------------------------------------------------
import logging
from gensim.corpora import TextCorpus, Dictionary
from gensim.models.doc2vec import LabeledSentence


class SeriesCorpus(TextCorpus):
    def __init__(self, series, vocab=None, stem=False, bigram=None,
                 labels=True):
        """Create a corpus that returns one row at a time out
        of a Pandas Series."""
        self.series = series
        self.metadata = False
        if vocab is not None:
            vocab = set(vocab)
        self.vocab = vocab
        self.labels = labels
        self.kwargs = dict(stem=stem, bigram=bigram)
        logging.info("Building SeriesCorpus")
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        if self.labels:
            for index, line in zip(self.series.index, self.series.values):
                label = ['SENT_%s' % str(index)]
                ls = LabeledSentence(line.split(' '), label)
                yield ls
        else:
            for index, line in zip(self.series.index, self.series.values):
                yield line.split(' ')

    def line_iter(self, line):
        if self.vocab is not None:
            for word in line.split(' '):
                if word in self.vocab:
                    yield word
        else:
            for word in line.split(' '):
                yield word

    def get_texts(self):
        logging.info("Iterating SeriesCorpus")
        for lineno, line in enumerate(self.series.values):
            if self.metadata:
                yield self.line_iter(line), (lineno,)
            else:
                yield self.line_iter(line)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Document2Vec
Finding document vectors from pre-trained word2vec word vectors

![Build Status](https://api.travis-ci.org/cemoody/Document2Vec.svg)
![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)

# How to install
Simply install from the git repo like so:

```bash
pip install -e git+git://github.com/cemoody/Document2Vec.git#egg=Package
# on a shared machine without system-python access add --user
```

# How to use
The word2vec file must be a trained gensim Word2Vec model and cannot be Mikolov's
pre-trained vectors. This is because training a new document vector requires
the syn1 layer, which the C version of word2vec throws away.
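
If you don't already have such a model, here is a minimal sketch of training and
saving one with gensim (the toy corpus, output path, and the `size` value are
illustrative only; `hs=1` is what keeps the syn1 layer around):

```python
from gensim.models import Word2Vec

# Any iterable of tokenized sentences works; this toy corpus is only a placeholder.
sentences = [['i', 'love', 'jackets'], ['blue', 'is', 'my', 'favorite', 'color']]
model = Word2Vec(sentences, size=100, min_count=1, hs=1)  # hs=1 retains syn1
model.save('/tmp/my_word2vec_model')  # pass this path to Document2Vec below
```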

Initialize Document2Vec with pre-trained word vectors from a pre-existing
word2vec training run like so:

```python
from document2vec.document2vec import Document2Vec
from document2vec.corpora import SeriesCorpus
import pandas as pd
# This must be a gensim Word2Vec or Doc2Vec pickle
d2v = Document2Vec("/home/moody/projects/Parachute/data/data-all-02.py2")
sentences = pd.Series(['i love jackets', 'blue is my favorite color'])
corpus = SeriesCorpus(sentences)
doc_vectors = d2v.transform(corpus)
```

And then semantic similarities can be evaluated directly:

```python
from scipy.spatial.distance import cosine
# vector for 'i love jackets'
v0 = doc_vectors[0, :]
# vector for the word 'jackets'
v1 = d2v['jackets']
similarity = 1 - cosine(v0, v1)
print(similarity)  # 0.320
# Of course, the similarity with a word that is literally
# in the sentence is going to be quite high.
# What if we try something similar, like coats?
v2 = d2v['coats']
similarity = 1 - cosine(v0, v2)
print(similarity)  # 0.265
# And if we try something very dissimilar from the sentence,
# like the city of New York, we get a low similarity:
v3 = d2v['new_york']
similarity = 1 - cosine(v0, v3)
print(similarity)  # 0.02
```

# Monitoring training

It can be useful to monitor the training over many iterations
to make sure doc2vec is (at least locally) doing what it should be doing:

```python
from scipy.spatial.distance import cosine

def monitor(model):
    line = '%.5f ' % model.alpha
    for word in ['jackets', 'jacket', 'coats', 'dog']:
        line += '%s : %s ' % (word, 1.0 - cosine(model['SENT_0'], model[word]))
    print(line)

d2v.monitor = monitor
doc_vectors = d2v.transform(corpus)
```

This will print something similar to the following:

```
0.25000 jackets : 0.347975713494 jacket : 0.150385576332 coats : 0.305263268479 dog : 0.121432161320
0.20002 jackets : 0.301431248517 jacket : 0.113824911821 coats : 0.272647329817 dog : 0.125565730551
0.15004 jackets : 0.296385793196 jacket : 0.108801409463 coats : 0.267922727947 dog : 0.126922837909
0.10006 jackets : 0.293973052240 jacket : 0.106190931536 coats : 0.265730524733 dog : 0.126504370045
0.05008 jackets : 0.293425048701 jacket : 0.105495592420 coats : 0.264931351959 dog : 0.125495564005
```

--------------------------------------------------------------------------------
/document2vec/tests/test_document2vec.py:
--------------------------------------------------------------------------------
import unittest
import random
import os.path
import pandas as pd
import numpy as np
from document2vec.document2vec import Document2Vec
from document2vec.corpora import SeriesCorpus
from gensim.models.doc2vec import LabeledLineSentence
from scipy.spatial.distance import cosine

fn = "/home/moody/projects/Parachute/data/data-all-02.py2"
w2v_file = os.path.realpath(fn)


def _generate_corpus(model, n=10):
    docs = {}
    random.seed(0)
    max_nword = len(model.syn0)
    for j in range(n):
        # randint is inclusive on both ends, so stay within the vocabulary
        idxs = [random.randint(0, max_nword - 1) for _ in range(8)]
        words = (model.index2word[idx] for idx in idxs)
        sentence = ' '.join(words)
        docs[j] = sentence
    series = pd.Series(docs)
    corpus = SeriesCorpus(series)
    return corpus


class TestDoc2Vec(unittest.TestCase):
    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_init(self):
        m = Document2Vec()
        assert 'train_lbls' in dir(m)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_load_from_w2v(self):
        model = Document2Vec(w2v_file)
        self.assertIsNot(type(model), None)
        self.assertIs(type(model), Document2Vec)
        self.assertIn('jacket', model.index2word)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_get_vector(self):
        model = Document2Vec(w2v_file)
        v = model.get_vector('the')
        self.assertIs(type(v), np.ndarray)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_word_similarity(self):
        model = Document2Vec(w2v_file)
        sim = model.similarity('blue', 'gold')
        self.assertGreater(sim, 0.3)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_checkpoint(self):
        model = Document2Vec(w2v_file)
        checksum = model.syn0.sum()
        model._build_checkpoint()
        model.syn0 *= 2.0
        new_checksum = model.syn0.sum()
        self.assertNotEqual(new_checksum, checksum)
        model._reset_to_checkpoint()
        new_checksum = model.syn0.sum()
        self.assertEqual(new_checksum, checksum)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_expand_model(self, n=10):
        model = Document2Vec(w2v_file)
        corpus = _generate_corpus(model, n=n)
        shape_before = model.syn0.shape
        model._expand_from(corpus)
        self.assertEqual(shape_before[0] + n, model.syn0.shape[0])
        self.assertIn('SENT_0', model.index2word)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_labeledlinesentence(self):
        model = Document2Vec(w2v_file)
        model.workers = 1
        corpus = _generate_corpus(model)
        fn = '/tmp/tmp_corpus'
        with open(fn, 'w') as fh:
            for line in corpus:
                text = ' '.join([w for w in line.words])
                try:
                    fh.write(text + '\n')
                except:
                    # Skip lines that cannot be written (e.g. encoding errors)
                    continue
        corpus = LabeledLineSentence(fn)
        model.fit_transform(corpus)
        # Get the first word in the corpus
        word = next(corpus.__iter__()).words[0]
        sim = model.similarity('SENT_0', word)
        self.assertGreater(sim, 0.15)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_transform(self):
        """Test that training the model brings the document vector
        closer to the vectors for words in the sentence."""
        model = Document2Vec(w2v_file)
        model.workers = 1
        corpus = _generate_corpus(model)
        vectors = model.transform(corpus)
        # Get the first word in the corpus and check that the trained
        # sentence vector is similar to that word's vector.
        word = next(corpus.__iter__()).words[0]
        sent0_vector = vectors[0, :]
        sim = 1 - cosine(sent0_vector, model[word])
        self.assertGreater(sim, 0.15)
--------------------------------------------------------------------------------
/document2vec/document2vec.py:
--------------------------------------------------------------------------------
import gensim
import math
import copy
import numpy as np
from gensim.models import Doc2Vec, Word2Vec


class Document2Vec(Doc2Vec):
    def __init__(self, filename=None, min_count=1, alpha_initial=0.002,
                 alpha_start=0.0005, alpha_end=0.0002, min_iters=10,
                 monitor=None):
        Doc2Vec.__init__(self)
        if filename is not None:
            self.load_from_pickle(filename)
        self.checkpoint = {}
        self.filename = filename
        self.min_count = min_count
        self.alpha_initial = alpha_initial
        self.alpha_start = alpha_start
        self.alpha_end = alpha_end
        self.min_iters = min_iters
        if monitor is None:
            monitor = lambda *x: None
        self.monitor = monitor
        assert 'train_lbls' in dir(self)

    def load_from_pickle(self, filename):
        """
        This loads a pretrained Word2Vec file into this Doc2Vec class.
        """
        model_w2v = Doc2Vec.load(filename)
        for attr in dir(model_w2v):
            if attr == '__dict__':
                continue
            # Skip methods that we already have in this class
            if attr in dir(self) and callable(getattr(model_w2v, attr)):
                continue
            try:
                setattr(self, attr, getattr(model_w2v, attr))
            except AttributeError:
                continue

    def load_from_w2v(self, filename):
        """
        This loads a pretrained Word2Vec file into this Doc2Vec class.
        """
        model_w2v = Doc2Vec.load_word2vec_format(filename, binary=False)
        self._vocab_from = Word2Vec._vocab_from
        self._prepare_sentences = model_w2v._prepare_sentences
        for attr in dir(model_w2v):
            if attr == '__dict__':
                continue
            if attr in dir(self) and callable(getattr(model_w2v, attr)):
                continue
            try:
                setattr(self, attr, getattr(model_w2v, attr))
            except AttributeError:
                continue

    def get_vector(self, word):
        """Return the vector for a word"""
        return self.syn0[self.vocab[word].index]

    def _build_checkpoint(self):
        """Save the current state of the vectors such that
        we can revert training progress."""
        vars = {}
        variables = ['syn0', 'index2word', 'vocab', 'syn1']
        for name in variables:
            var = getattr(self, name, None)
            if var is not None:
                vars[name] = copy.deepcopy(var)
        self.checkpoint = vars

    def _reset_to_checkpoint(self):
        vars = self.checkpoint
        for name, var in vars.items():
            setattr(self, name, var)

    @staticmethod
    def _make_label(prefix, suffix):
        label = '%s_%s' % (prefix, suffix)
        return label

    def _expand_from(self, corpus, prefix=None, labels=None):
        """
        Pass through the dataset once to add the new labels to the model.
        These labels stand in one for each document/sentence and not
        for new vocabulary.
        """
        if prefix is None:
            prefix = 'SENT'
        num_lines = sum(1 for _ in corpus)
        # Expand syn0
        shape = (self.syn0.shape[0] + num_lines, self.syn0.shape[1])
        syn0 = (np.random.random(shape).astype(self.syn0.dtype) - 0.5)
        syn0 /= self.layer1_size
        syn0[:self.syn0.shape[0]] = self.syn0
        self.syn0 = syn0
        index2word_start = len(self.index2word)
        for j, line_no in enumerate(range(num_lines)):
            # Expand vocab
            newvocab = gensim.models.doc2vec.Vocab()
            newvocab.index = len(self.index2word)
            newvocab.sample_probability = 1.0
            # We insert each sentence at the root of the
            # Huffman tree. It's a hack.
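            # (Assigning a dummy code of all 1s, whose length grows roughly
            # like log2 of the line number, keeps gensim's hierarchical-softmax
            # bookkeeping happy; the sentence label itself is never predicted,
            # so the exact code should not matter.)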
            newvocab.code = [1, ] * int(math.log(line_no + 1, 2) + 1)
            label = Document2Vec._make_label(prefix, str(j))
            self.vocab[label] = newvocab
            # Expand index2word
            self.index2word.append(label)
            assert len(self.index2word) == newvocab.index + 1
        return index2word_start

    def _calc_alpha(self, i, num_iters, initial):
        """Linearly anneal the learning rate from `initial` toward zero
        over `num_iters` iterations."""
        return initial * (num_iters - i) / num_iters + 1e-9 * i / num_iters

    def _fit(self, corpus):
        """
        Given a gensim corpus, train a document vector for each sentence
        and return the new vectors.
        """
        self.index2word_start = self._expand_from(corpus)
        # Freeze the word vectors; only the new sentence labels are trained.
        self.train_words = False
        self.train_lbls = True
        start = self.index2word_start
        self.alpha = self.alpha_initial
        self.monitor(self)
        self.train(corpus)
        for i in range(0, self.min_iters):
            self.alpha = self._calc_alpha(i, self.min_iters,
                                          self.alpha_start)
            self.alpha = max(self.alpha, self.alpha_end)
            self.min_alpha = self.alpha
            self.monitor(self)
            self.train(corpus)
        self.monitor(self)
        return self.syn0[start:]

    def fit(self, *args, **kwargs):
        self._fit(*args, **kwargs)
        return self

    def fit_transform(self, *args, **kwargs):
        return self._fit(*args, **kwargs)

    def transform(self, corpus, **kwargs):
        self._build_checkpoint()
        vectors = self.fit_transform(corpus, **kwargs)
        self._reset_to_checkpoint()
        return vectors
--------------------------------------------------------------------------------