├── document2vec
│   ├── __init__.py
│   ├── corpora.py
│   ├── tests
│   │   └── test_document2vec.py
│   └── document2vec.py
├── models
│   ├── small.w2v
│   └── bigmodel2small.py
├── setup.py
├── .travis.yml
├── .gitignore
├── LICENSE
└── README.md
--------------------------------------------------------------------------------
/document2vec/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/small.w2v:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cemoody/Document2Vec/HEAD/models/small.w2v
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# setuptools (rather than distutils) is needed for install_requires to work,
# and the package must be listed for it to actually be installed.
from setuptools import setup

setup(name='Document2Vec',
      version='0.1',
      description='Finding document vectors from pre-trained word vectors',
      author='Christopher Erick Moody',
      author_email='chrisemoody@gmail.com',
      packages=['document2vec'],
      install_requires=['pandas', 'numpy', 'gensim'],
      url='https://github.com/cemoody/Document2Vec')
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Much of this script was adapted from astropy/astropy.

language: python

env:
  global:
    - NUMPY_VERSION=1.8

matrix:
  include:
    # All the recent versions of Python.
    - python: 2.7
    - python: 3.3
    - python: 3.4

before_install:
  - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
  - chmod +x miniconda.sh
  - ./miniconda.sh -b
  - export PATH=/home/travis/miniconda/bin:$PATH
  - conda update --yes conda

install:
  - conda create --yes -n testing python=$TRAVIS_PYTHON_VERSION
  - source activate testing
  - conda install --yes numpy=$NUMPY_VERSION nose pip numba cython pandas
  - pip install gensim

script:
  - ls
  - pwd
  - env | sort
  - nosetests -v
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/models/bigmodel2small.py:
--------------------------------------------------------------------------------
import numpy as np
from gensim.models import Word2Vec

# Load the full word2vec model and the whitelist of words to keep.
model_w2v = Word2Vec.load("/home/moody/projects/Parachute/data/data-all-02.py2")
words = set([l.strip() for l in open('../data/small.list').readlines()[1:]])

# Collect the row indices of the whitelisted words present in the vocabulary.
indices = []
for w in words:
    v = model_w2v.vocab.get(w, None)
    if v is None:
        continue
    indices.append(v.index)
indices = np.array(indices, dtype=np.int)

# Slice the weight matrices and lookup tables down to the whitelisted subset.
syn0 = model_w2v.syn0[indices]
syn1 = model_w2v.syn1[indices]
index2word = list(np.array(model_w2v.index2word)[indices])
vocab = {k: v for k, v in model_w2v.vocab.items() if k in words}

# Re-index the surviving vocabulary entries against the new, smaller index2word.
for w, v in vocab.items():
    v.index = index2word.index(w)

model_w2v.syn0 = syn0
model_w2v.syn1 = syn1
model_w2v.vocab = vocab
model_w2v.index2word = index2word

model_w2v.save("../data/data-all-02.py2.small")
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Christopher Erick Moody

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/document2vec/corpora.py:
--------------------------------------------------------------------------------
import logging
from gensim.corpora import TextCorpus, Dictionary
from gensim.models.doc2vec import LabeledSentence


class SeriesCorpus(TextCorpus):
    def __init__(self, series, vocab=None, stem=False, bigram=None,
                 labels=True):
        """Create a corpus that returns one row at a time out
        of a Pandas Series."""
        self.series = series
        self.metadata = False
        if vocab is not None:
            vocab = set(vocab)
        self.vocab = vocab
        self.labels = labels
        self.kwargs = dict(stem=stem, bigram=bigram)
        logging.info("Building SeriesCorpus")
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        if self.labels:
            for index, line in zip(self.series.index, self.series.values):
                label = ['SENT_%s' % str(index)]
                ls = LabeledSentence(line.split(' '), label)
                yield ls
        else:
            for index, line in zip(self.series.index, self.series.values):
                yield line.split(' ')

    def line_iter(self, line):
        if self.vocab is not None:
            for word in line.split(' '):
                if word in self.vocab:
                    yield word
        else:
            for word in line.split(' '):
                yield word

    def get_texts(self):
        logging.info("Iterating SeriesCorpus")
        for lineno, line in enumerate(self.series.values):
            if self.metadata:
                yield self.line_iter(line), (lineno,)
            else:
                yield self.line_iter(line)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Document2Vec
Finding document vectors from pre-trained word2vec word vectors

![Build Status](https://api.travis-ci.org/cemoody/Document2Vec.svg)
![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)

# How to install
Simply install from the git repo like so:

```bash
pip install -e git+git://github.com/cemoody/Document2Vec.git#egg=Package
# on a shared machine without system-python access add --user
```

# How to use
The word2vec file must be a trained gensim Word2Vec model and cannot be Mikolov's
pre-trained vectors. This is because training a new document vector requires
the syn1 layer, which the C version of word2vec throws away.
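
If you don't already have such a model, here is a minimal sketch of training and
saving one with gensim (the toy corpus, output path, and the `size` value are
illustrative only; `hs=1` is what keeps the syn1 layer around):

```python
from gensim.models import Word2Vec

# Any iterable of tokenized sentences works; this toy corpus is only a placeholder.
sentences = [['i', 'love', 'jackets'], ['blue', 'is', 'my', 'favorite', 'color']]
model = Word2Vec(sentences, size=100, min_count=1, hs=1)  # hs=1 retains syn1
model.save('/tmp/my_word2vec_model')  # pass this path to Document2Vec below
```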

Initialize Document2Vec with pre-trained word vectors from a pre-existing
word2vec training run like so:

```python
from document2vec.document2vec import Document2Vec
from document2vec.corpora import SeriesCorpus
import pandas as pd
# This must be a gensim Word2Vec or Doc2Vec pickle
d2v = Document2Vec("/home/moody/projects/Parachute/data/data-all-02.py2")
sentences = pd.Series(['i love jackets', 'blue is my favorite color'])
corpus = SeriesCorpus(sentences)
doc_vectors = d2v.transform(corpus)
```

And then semantic similarities can be evaluated directly:

```python
from scipy.spatial.distance import cosine
# vector for 'i love jackets'
v0 = doc_vectors[0, :]
# vector for the word 'jackets'
v1 = d2v['jackets']
similarity = 1 - cosine(v0, v1)
print(similarity)  # 0.320
# Of course, the similarity with a word that is literally
# in the sentence is going to be quite high.
# What if we try something similar, like coats?
v2 = d2v['coats']
similarity = 1 - cosine(v0, v2)
print(similarity)  # 0.265
# And if we try something very dissimilar from the sentence,
# like the city of New York, we get a low similarity:
v3 = d2v['new_york']
similarity = 1 - cosine(v0, v3)
print(similarity)  # 0.02
```

# Monitoring training

It can be useful to monitor the training over many iterations
to make sure doc2vec is (at least locally) doing what it should be doing:

```python
from scipy.spatial.distance import cosine

def monitor(model):
    line = '%.5f ' % model.alpha
    for word in ['jackets', 'jacket', 'coats', 'dog']:
        line += '%s : %s ' % (word, 1.0 - cosine(model['SENT_0'], model[word]))
    print(line)

d2v.monitor = monitor
doc_vectors = d2v.transform(corpus)
```

This will print something similar to the following:

```
0.25000 jackets : 0.347975713494 jacket : 0.150385576332 coats : 0.305263268479 dog : 0.121432161320
0.20002 jackets : 0.301431248517 jacket : 0.113824911821 coats : 0.272647329817 dog : 0.125565730551
0.15004 jackets : 0.296385793196 jacket : 0.108801409463 coats : 0.267922727947 dog : 0.126922837909
0.10006 jackets : 0.293973052240 jacket : 0.106190931536 coats : 0.265730524733 dog : 0.126504370045
0.05008 jackets : 0.293425048701 jacket : 0.105495592420 coats : 0.264931351959 dog : 0.125495564005
```

--------------------------------------------------------------------------------
/document2vec/tests/test_document2vec.py:
--------------------------------------------------------------------------------
import unittest
import random
import os.path
import pandas as pd
import numpy as np
from document2vec.document2vec import Document2Vec
from document2vec.corpora import SeriesCorpus
from gensim.models.doc2vec import LabeledLineSentence
from scipy.spatial.distance import cosine

fn = "/home/moody/projects/Parachute/data/data-all-02.py2"
w2v_file = os.path.realpath(fn)


def _generate_corpus(model, n=10):
    docs = {}
    random.seed(0)
    max_nword = len(model.syn0)
    for j in range(n):
        # randint is inclusive on both ends, so stay within the vocabulary
        idxs = [random.randint(0, max_nword - 1) for _ in range(8)]
        words = (model.index2word[idx] for idx in idxs)
        sentence = ' '.join(words)
        docs[j] = sentence
    series = pd.Series(docs)
    corpus = SeriesCorpus(series)
    return corpus


class TestDoc2Vec(unittest.TestCase):
    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_init(self):
        m = Document2Vec()
        assert 'train_lbls' in dir(m)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_load_from_w2v(self):
        model = Document2Vec(w2v_file)
        self.assertIsNot(type(model), None)
        self.assertIs(type(model), Document2Vec)
        self.assertIn('jacket', model.index2word)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_get_vector(self):
        model = Document2Vec(w2v_file)
        v = model.get_vector('the')
        self.assertIs(type(v), np.ndarray)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_word_similarity(self):
        model = Document2Vec(w2v_file)
        sim = model.similarity('blue', 'gold')
        self.assertGreater(sim, 0.3)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_checkpoint(self):
        model = Document2Vec(w2v_file)
        checksum = model.syn0.sum()
        model._build_checkpoint()
        model.syn0 *= 2.0
        new_checksum = model.syn0.sum()
        self.assertNotEqual(new_checksum, checksum)
        model._reset_to_checkpoint()
        new_checksum = model.syn0.sum()
        self.assertEqual(new_checksum, checksum)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_expand_model(self, n=10):
        model = Document2Vec(w2v_file)
        corpus = _generate_corpus(model, n=n)
        shape_before = model.syn0.shape
        model._expand_from(corpus)
        self.assertEqual(shape_before[0] + n, model.syn0.shape[0])
        self.assertIn('SENT_0', model.index2word)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_labeledlinesentence(self):
        model = Document2Vec(w2v_file)
        model.workers = 1
        corpus = _generate_corpus(model)
        fn = '/tmp/tmp_corpus'
        with open(fn, 'w') as fh:
            for line in corpus:
                text = ' '.join([w for w in line.words])
                try:
                    fh.write(text + '\n')
                except:
                    # Skip lines that cannot be written (e.g. encoding errors)
                    continue
        corpus = LabeledLineSentence(fn)
        model.fit_transform(corpus)
        # Get the first word in the corpus
        word = next(corpus.__iter__()).words[0]
        sim = model.similarity('SENT_0', word)
        self.assertGreater(sim, 0.15)

    @unittest.skipIf(not os.path.exists(w2v_file),
                     "Need the file %s to continue" % w2v_file)
    def test_transform(self):
        """Test that training the model brings the document vector
        closer to the vectors for words in the sentence."""
        model = Document2Vec(w2v_file)
        model.workers = 1
        corpus = _generate_corpus(model)
        vectors = model.transform(corpus)
        # Get the first word in the corpus and check that the trained
        # sentence vector is similar to that word's vector.
        word = next(corpus.__iter__()).words[0]
        sent0_vector = vectors[0, :]
        sim = 1 - cosine(sent0_vector, model[word])
        self.assertGreater(sim, 0.15)
--------------------------------------------------------------------------------
/document2vec/document2vec.py:
--------------------------------------------------------------------------------
import gensim
import math
import copy
import numpy as np
from gensim.models import Doc2Vec, Word2Vec


class Document2Vec(Doc2Vec):
    def __init__(self, filename=None, min_count=1, alpha_initial=0.002,
                 alpha_start=0.0005, alpha_end=0.0002, min_iters=10,
                 monitor=None):
        Doc2Vec.__init__(self)
        if filename is not None:
            self.load_from_pickle(filename)
        self.checkpoint = {}
        self.filename = filename
        self.min_count = min_count
        self.alpha_initial = alpha_initial
        self.alpha_start = alpha_start
        self.alpha_end = alpha_end
        self.min_iters = min_iters
        if monitor is None:
            monitor = lambda *x: None
        self.monitor = monitor
        assert 'train_lbls' in dir(self)

    def load_from_pickle(self, filename):
        """
        This loads a pretrained Word2Vec file into this Doc2Vec class.
        """
        model_w2v = Doc2Vec.load(filename)
        for attr in dir(model_w2v):
            if attr == '__dict__':
                continue
            # Skip methods that we already have in this class
            if attr in dir(self) and callable(getattr(model_w2v, attr)):
                continue
            try:
                setattr(self, attr, getattr(model_w2v, attr))
            except AttributeError:
                continue

    def load_from_w2v(self, filename):
        """
        This loads a pretrained Word2Vec file into this Doc2Vec class.
        """
        model_w2v = Doc2Vec.load_word2vec_format(filename, binary=False)
        self._vocab_from = Word2Vec._vocab_from
        self._prepare_sentences = model_w2v._prepare_sentences
        for attr in dir(model_w2v):
            if attr == '__dict__':
                continue
            if attr in dir(self) and callable(getattr(model_w2v, attr)):
                continue
            try:
                setattr(self, attr, getattr(model_w2v, attr))
            except AttributeError:
                continue

    def get_vector(self, word):
        """Return the vector for a word"""
        return self.syn0[self.vocab[word].index]

    def _build_checkpoint(self):
        """Save the current state of the vectors such that
        we can revert training progress."""
        vars = {}
        variables = ['syn0', 'index2word', 'vocab', 'syn1']
        for name in variables:
            var = getattr(self, name, None)
            if var is not None:
                vars[name] = copy.deepcopy(var)
        self.checkpoint = vars

    def _reset_to_checkpoint(self):
        vars = self.checkpoint
        for name, var in vars.items():
            setattr(self, name, var)

    @staticmethod
    def _make_label(prefix, suffix):
        label = '%s_%s' % (prefix, suffix)
        return label

    def _expand_from(self, corpus, prefix=None, labels=None):
        """
        Pass through the dataset once to add the new labels to the model.
        These labels stand in one for each document/sentence and not
        for new vocabulary.
        """
        if prefix is None:
            prefix = 'SENT'
        num_lines = sum(1 for _ in corpus)
        # Expand syn0
        shape = (self.syn0.shape[0] + num_lines, self.syn0.shape[1])
        syn0 = (np.random.random(shape).astype(self.syn0.dtype) - 0.5)
        syn0 /= self.layer1_size
        syn0[:self.syn0.shape[0]] = self.syn0
        self.syn0 = syn0
        index2word_start = len(self.index2word)
        for j, line_no in enumerate(range(num_lines)):
            # Expand vocab
            newvocab = gensim.models.doc2vec.Vocab()
            newvocab.index = len(self.index2word)
            newvocab.sample_probability = 1.0
            # We insert each sentence at the root of the
            # Huffman tree. It's a hack.
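            # (Assigning a dummy code of all 1s, whose length grows roughly
            # like log2 of the line number, keeps gensim's hierarchical-softmax
            # bookkeeping happy; the sentence label itself is never predicted,
            # so the exact code should not matter.)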
            newvocab.code = [1, ] * int(math.log(line_no + 1, 2) + 1)
            label = Document2Vec._make_label(prefix, str(j))
            self.vocab[label] = newvocab
            # Expand index2word
            self.index2word.append(label)
            assert len(self.index2word) == newvocab.index + 1
        return index2word_start

    def _calc_alpha(self, i, num_iters, initial):
        """Linearly anneal the learning rate from `initial` toward zero
        over `num_iters` iterations."""
        return initial * (num_iters - i) / num_iters + 1e-9 * i / num_iters

    def _fit(self, corpus):
        """
        Given a gensim corpus, train a document vector for each sentence
        and return the new vectors.
        """
        self.index2word_start = self._expand_from(corpus)
        # Freeze the word vectors; only the new sentence labels are trained.
        self.train_words = False
        self.train_lbls = True
        start = self.index2word_start
        self.alpha = self.alpha_initial
        self.monitor(self)
        self.train(corpus)
        for i in range(0, self.min_iters):
            self.alpha = self._calc_alpha(i, self.min_iters,
                                          self.alpha_start)
            self.alpha = max(self.alpha, self.alpha_end)
            self.min_alpha = self.alpha
            self.monitor(self)
            self.train(corpus)
        self.monitor(self)
        return self.syn0[start:]

    def fit(self, *args, **kwargs):
        self._fit(*args, **kwargs)
        return self

    def fit_transform(self, *args, **kwargs):
        return self._fit(*args, **kwargs)

    def transform(self, corpus, **kwargs):
        self._build_checkpoint()
        vectors = self.fit_transform(corpus, **kwargs)
        self._reset_to_checkpoint()
        return vectors
--------------------------------------------------------------------------------