├── test.py
├── movie_review.py
├── README.md
├── Old-version.md
├── ddoc2vec.py
└── ddoc2vecf.py

/test.py:
--------------------------------------------------------------------------------
# NOTE: this file targets Python 2 with Spark 1.x (pyspark.mllib) and a
# pre-1.0 gensim API (tuple-unpacking lambdas, print statements, model.vocab).
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument

from pyspark import SparkContext, SparkConf

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint


conf = (SparkConf()
        .set("spark.driver.maxResultSize", "2g"))

sc = SparkContext(conf=conf)

# Each review becomes (label, tokens); tokenization is a simple lowercase split.
pos = sc.textFile("hdfs:///movie_review/positive").map(lambda s: (True, s.lower().split()))
neg = sc.textFile("hdfs:///movie_review/negative").map(lambda s: (False, s.lower().split()))

if False:  # flip to True to reuse previously pickled document vectors
    docvecs = sc.pickleFile("hdfs:///movie_review/doctags")
else:
    from ddoc2vec import DistDoc2Vec

    # Index every review: (index, label, tokens); negatives come first in the union.
    data = (neg + pos).zipWithIndex().map(lambda (v, i): (i, v[0], v[1]))
    sents = data.map(lambda (a, b, c): c)

    model = Word2Vec(size=100, hs=0, negative=8)
    dd2v = DistDoc2Vec(model, learn_hidden=False, num_partitions=5, num_iterations=10)
    dd2v.build_vocab_from_rdd(sents, reset_hidden=False)

    # Train word2vec on the driver; the collected corpus must fit in driver memory.
    model.train(sents.collect())
    model.save("/root/doc2vec/word2vec_model/review")
    print "*** done training words ****"
    print "*** len(model.vocab): %d ****" % len(model.vocab)

    # Train the per-document vectors (distributed CBOW) with word weights frozen.
    dd2v.train_sentences_cbow(data.map(lambda (i, l, v): TaggedDocument(words=v, tags=[i])))
    # dd2v.saveAsPickleFile("hdfs:///movie_review/docvectors")
    docvecs = dd2v.doctag_syn0

npos = pos.count()
nneg = neg.count()
# The dump is truncated mid-expression here; the ending below is a reconstruction
# (an assumption): negatives were unioned first, so indices below nneg get label 0.0.
reg_data = docvecs.map(lambda (i, v): LabeledPoint(1.0 if i >= nneg else 0.0, v))
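
The dump of test.py cuts off just as `reg_data` is built, but the imports (`LogisticRegressionWithLBFGS`, `LabeledPoint`) make the intent clear: fit a logistic regression classifier on the learned document vectors. Below is a minimal sketch of how that presumably continues; the 80/20 split, iteration count, and accuracy metric are assumptions, not code from the repo.

# Hypothetical continuation (not in the repo): train and evaluate a classifier
# on the LabeledPoint RDD built above, in the same Python 2 / mllib style.
train, test = reg_data.randomSplit([0.8, 0.2], seed=42)  # assumed 80/20 split
lrm = LogisticRegressionWithLBFGS.train(train, iterations=100)
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy = test.map(lambda p: 1.0 if lrm.predict(p.features) == p.label else 0.0).mean()
print "held-out accuracy: %f" % accuracy

The otherwise-unused `LogisticRegressionModel` import suggests the script may also persist and reload the classifier (`lrm.save(sc, path)` / `LogisticRegressionModel.load(sc, path)` in mllib), though the truncated dump does not show this.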