├── test.py
├── movie_review.py
├── README.md
├── Old-version.md
├── ddoc2vec.py
└── ddoc2vecf.py

/test.py:
--------------------------------------------------------------------------------
# NOTE: this file targets Python 2 with Spark 1.x (pyspark.mllib) and a
# pre-1.0 gensim API (tuple-unpacking lambdas, print statements, model.vocab).
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument

from pyspark import SparkContext, SparkConf

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint


conf = (SparkConf()
        .set("spark.driver.maxResultSize", "2g"))

sc = SparkContext(conf=conf)

# Each review becomes (label, tokens); tokenization is a simple lowercase split.
pos = sc.textFile("hdfs:///movie_review/positive").map(lambda s: (True, s.lower().split()))
neg = sc.textFile("hdfs:///movie_review/negative").map(lambda s: (False, s.lower().split()))

if False:  # flip to True to reuse previously pickled document vectors
    docvecs = sc.pickleFile("hdfs:///movie_review/doctags")
else:
    from ddoc2vec import DistDoc2Vec

    # Index every review: (index, label, tokens); negatives come first in the union.
    data = (neg + pos).zipWithIndex().map(lambda (v, i): (i, v[0], v[1]))
    sents = data.map(lambda (a, b, c): c)

    model = Word2Vec(size=100, hs=0, negative=8)
    dd2v = DistDoc2Vec(model, learn_hidden=False, num_partitions=5, num_iterations=10)
    dd2v.build_vocab_from_rdd(sents, reset_hidden=False)

    # Train word2vec on the driver; the collected corpus must fit in driver memory.
    model.train(sents.collect())
    model.save("/root/doc2vec/word2vec_model/review")
    print "*** done training words ****"
    print "*** len(model.vocab): %d ****" % len(model.vocab)

    # Train the per-document vectors (distributed CBOW) with word weights frozen.
    dd2v.train_sentences_cbow(data.map(lambda (i, l, v): TaggedDocument(words=v, tags=[i])))
    # dd2v.saveAsPickleFile("hdfs:///movie_review/docvectors")
    docvecs = dd2v.doctag_syn0

npos = pos.count()
nneg = neg.count()
# The dump is truncated mid-expression here; the ending below is a reconstruction
# (an assumption): negatives were unioned first, so indices below nneg get label 0.0.
reg_data = docvecs.map(lambda (i, v): LabeledPoint(1.0 if i >= nneg else 0.0, v))
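
The dump of test.py cuts off just as `reg_data` is built, but the imports (`LogisticRegressionWithLBFGS`, `LabeledPoint`) make the intent clear: fit a logistic regression classifier on the learned document vectors. Below is a minimal sketch of how that presumably continues; the 80/20 split, iteration count, and accuracy metric are assumptions, not code from the repo.

# Hypothetical continuation (not in the repo): train and evaluate a classifier
# on the LabeledPoint RDD built above, in the same Python 2 / mllib style.
train, test = reg_data.randomSplit([0.8, 0.2], seed=42)  # assumed 80/20 split
lrm = LogisticRegressionWithLBFGS.train(train, iterations=100)
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy = test.map(lambda p: 1.0 if lrm.predict(p.features) == p.label else 0.0).mean()
print "held-out accuracy: %f" % accuracy

The otherwise-unused `LogisticRegressionModel` import suggests the script may also persist and reload the classifier (`lrm.save(sc, path)` / `LogisticRegressionModel.load(sc, path)` in mllib), though the truncated dump does not show this.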