├── word2veckeras
│   ├── __init__.py
│   ├── makefile
│   ├── treebank.py
│   ├── test.txt
│   ├── word2veckeras.py
│   ├── scoreword2veckeras.py
│   └── doc2veckeras.py
├── example
│   ├── test.txt
│   ├── trees
│   │   └── README
│   ├── test-doc2veckeras.py
│   ├── sentence-classify.py
│   ├── treebank-classify.py
│   ├── test-word2veckeras.py
│   ├── test-scoreword2veckeras.py
│   └── demo-scoreword2veckeras.py
├── setup.cfg
├── setup.py
└── README.org

/word2veckeras/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/example/test.txt:
--------------------------------------------------------------------------------
1 | ../word2veckeras/test.txt
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.org
--------------------------------------------------------------------------------
/example/trees/README:
--------------------------------------------------------------------------------
1 | unpack
2 | http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
3 | here
--------------------------------------------------------------------------------
/word2veckeras/makefile:
--------------------------------------------------------------------------------
1 | 
2 | start: test
3 | test: clean
4 | 	(cd .. ; python setup.py test)
5 | 
6 | clean:
7 | 	-@rm -f *.pyc
8 | 
--------------------------------------------------------------------------------
/example/test-doc2veckeras.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy as np
3 | import gensim
4 | from word2veckeras.doc2veckeras import Doc2VecKeras
5 | 
6 | def compare_d2v(d2v1,d2v2):
7 |     return sum([np.linalg.norm(d2v1.docvecs[n]-d2v2.docvecs[n]) for n in range(len(d2v1.docvecs)) ])/len(d2v1.docvecs)
8 | 
9 | input_file = 'test.txt'
10 | doc1=gensim.models.doc2vec.TaggedLineDocument(input_file)
11 | 
12 | parameters = [{'size':[5],'dm':[0,1],'dm_concat':[0,1],'hs':[0,1],'negative':[0,5] }]
13 | from sklearn.grid_search import ParameterGrid
14 | for param in ParameterGrid(parameters):
15 |     if (param['hs']==0 and param['negative']==0) or (param['dm']==0 and param['dm_concat']==0) :
16 |         continue
17 | 
18 |     print param
19 |     dvk=Doc2VecKeras(doc1,**param)
20 |     dv =gensim.models.doc2vec.Doc2Vec(doc1,**param)
21 |     print compare_d2v(dv,dvk)
22 | 
23 | 
24 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from setuptools import find_packages
3 | 
4 | setup(name='word2veckeras',
5 |       version='0.0.5.2',
6 |       description='word2vec based on Keras and gensim',
7 |       author='Hirotaka Niitsuma',
8 |       author_email='hirotaka.niitsuma@gmail.com',
9 |       url='https://github.com/niitsuma/word2vec-keras-in-gensim',
10 |       download_url='https://github.com/niitsuma/word2vec-keras-in-gensim/archive/master.zip',
11 |       license='GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html',
12 |       install_requires=['gensim', 'theano', 'pyyaml', 'six', 'keras<=0.3.1'],
13 |       #install_requires=['gensim', 'theano', 'pyyaml', 'six', 'keras', 'sklearn'],
14 |       # extras_require={
15 |       #     'h5py': ['h5py'],
16 |       # },
17 |       packages=find_packages(),
18 |       test_suite = 'test'
19 |       )
20 | 
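
Note on the dependency pin above: the models in this package are built on the old keras.models.Graph container, which only exists in the Keras 0.3.x series, hence keras<=0.3.1 in install_requires. Below is a minimal environment sanity check; it is a sketch only, and the version comparison and messages are illustrative, not part of the package.

#+BEGIN_SRC python
# Check that the installed Keras still ships the Graph container that
# word2veckeras relies on (it was removed after the 0.3.x releases).
from distutils.version import LooseVersion
import keras

assert LooseVersion(keras.__version__) <= LooseVersion('0.3.1'), \
    'word2veckeras expects keras<=0.3.1, found %s' % keras.__version__

from keras.models import Graph  # ImportError here means the Keras version is too new
print('Keras %s with the Graph API: OK' % keras.__version__)
#+END_SRC
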
--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
1 | * word2vec-keras-in-gensim
2 | 
3 | This package just rewrites the train function of gensim.models.word2vec.Word2Vec and gensim.models.doc2vec.Doc2Vec using Keras + Theano.
4 | 
5 | Like this:
6 | 
7 | #+BEGIN_SRC python
8 | class Word2VecKeras(gensim.models.word2vec.Word2Vec):
9 |     def train(...
10 | #+END_SRC
11 | 
12 | Training can therefore run on a GPU via Theano.
13 | 
14 | * Install
15 | #+BEGIN_SRC bash
16 | pip install word2veckeras
17 | #+END_SRC
18 | 
19 | * Usage
20 | 
21 | Same as gensim.models.word2vec.Word2Vec.
22 | 
23 | ** Example
24 | #+BEGIN_SRC python
25 | vsk = Word2VecKeras(gensim.models.word2vec.LineSentence('test.txt'),iter=100)
26 | print( vsk.most_similar('the', topn=5))
27 | 
28 | from nltk.corpus import brown
29 | brk = Word2VecKeras(brown.sents(),iter=10)
30 | print( brk.most_similar('the', topn=5))
31 | #+END_SRC
32 | 
33 | * Requirements
34 | Keras 0.3.x (the old Graph API) and Theano; newer Keras releases drop the Graph container this package is built on.
35 | #+BEGIN_SRC bash
36 | pip install 'keras<=0.3.1'
37 | #+END_SRC
38 | 
--------------------------------------------------------------------------------
/example/sentence-classify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html
5 | 
6 | import nltk
7 | import gensim
8 | from word2veckeras.doc2veckeras import SentenceClassifier,Doc2VecClassifier
9 | 
10 | genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
11 | n_sample=40
12 | sents_labels= sum([list(zip(nltk.corpus.brown.sents(categories=[genres[i_g]])[:n_sample],[i_g]*n_sample)) for i_g in range(len(genres))],[])
13 | X=[sl[0] for sl in sents_labels]
14 | Y=[sl[1] for sl in sents_labels]
15 | 
16 | clf1=SentenceClassifier( doc2vec=gensim.models.doc2vec.Doc2Vec() )
17 | clf1.fit(X,Y)
18 | print clf1.score(X,Y)
19 | 
20 | from sklearn.grid_search import GridSearchCV,ParameterSampler, ParameterGrid
21 | 
22 | clf2=Doc2VecClassifier()
23 | tuned_parameters = [{'dm':[1],'dm_concat':[0,1],'size': [200,300,400], 'window':[4,8],'min_count':[0,9],'sample':[0,1e-5],'iter':[1]}]
24 | #tuned_parameters = [{'dm':[1],'size': [200,300,400]}]
25 | clf2 = GridSearchCV(clf2, tuned_parameters,cv=3,n_jobs=-1,verbose=1)
26 | clf2.fit(X,Y)
27 | print clf2.best_estimator_
28 | print clf2.best_params_
29 | print clf2.best_score_
30 | print clf2.score(X,Y)
--------------------------------------------------------------------------------
/example/treebank-classify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html
5 | 
6 | import nltk
7 | import gensim
8 | from word2veckeras.doc2veckeras import SentenceClassifier,Doc2VecClassifier
9 | from word2veckeras.treebank import TreeBank
10 | 
11 | treebank=TreeBank('./trees')
12 | Xtest,Ytest=treebank.sents_labels('test',only_root=False,pos_neg_label=False)
13 | 
14 | X,Y =treebank.sents_labels('train',only_root=False,pos_neg_label=False)
15 | 
16 | # X,Y =treebank.sents_labels('dev',only_root=False,pos_neg_label=False)
17 | # n_sample=300
18 | # X=X[:n_sample]
19 | # Y=Y[:n_sample]
20 | 
21 | 
22 | clf1=SentenceClassifier( doc2vec=gensim.models.doc2vec.Doc2Vec() )
23 | clf1.fit(X,Y)
24 | print
clf1.score(Xtest,Ytest) 25 | 26 | 27 | from sklearn.grid_search import GridSearchCV,ParameterSampler, ParameterGrid 28 | 29 | clf2=Doc2VecClassifier() 30 | tuned_parameters = [{'dm':[1],'dm_concat':[0,1],'size': [200,300,400], 'window':[4,8],'min_count':[0,9],'sample':[0,1e-5],'iter':[1]}] 31 | #tuned_parameters = [{'dm':[0,1],'size': [100,200]}] 32 | clf2 = GridSearchCV(clf2, tuned_parameters,cv=3,n_jobs=4,verbose=1) 33 | clf2.fit(X,Y) 34 | print clf2.best_estimator_ 35 | print clf2.best_params_ 36 | print clf2.best_score_ 37 | 38 | print clf2.best_estimator_.fit(X,Y) 39 | print clf2.best_estimator_.score(Xtest,Ytest) 40 | -------------------------------------------------------------------------------- /example/test-word2veckeras.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import gensim 4 | from word2veckeras.word2veckeras import Word2VecKeras 5 | 6 | def compare_w2v(w2v1,w2v2): 7 | s=0.0 8 | count =0 9 | for w in w2v1.vocab: 10 | if w in w2v2.vocab: 11 | d=np.linalg.norm(w2v1[w]-w2v2[w]) 12 | count +=1 13 | s += d 14 | return s/count 15 | 16 | 17 | input_file = 'test.txt' 18 | sents=gensim.models.word2vec.LineSentence(input_file) 19 | 20 | v_iter=1 21 | v_size=5 22 | sg_v=1 23 | topn=4 24 | 25 | vs1 = gensim.models.word2vec.Word2Vec(sents,hs=1,negative=0,sg=sg_v,size=v_size,iter=1) 26 | 27 | print vs1['the'] 28 | vsk1 = Word2VecKeras(sents,hs=1,negative=0,sg=sg_v,size=v_size,iter=1) 29 | print( vsk1.most_similar('the', topn=topn)) 30 | print vsk1['the'] 31 | print np.linalg.norm(vs1.syn0-vsk1.syn0),compare_w2v(vs1,vsk1) 32 | vsk1 = Word2VecKeras(sents,hs=1,negative=0,sg=sg_v,size=v_size,iter=5) 33 | print vsk1['the'] 34 | print( vsk1.most_similar('the', topn=topn)) 35 | print( vs1.most_similar('the', topn=topn)) 36 | print np.linalg.norm(vs1.syn0-vsk1.syn0),compare_w2v(vs1,vsk1) 37 | 38 | 39 | from nltk.corpus import brown 40 | #brown_sents=list(brown.sents())[:2000] 41 | brown_sents=list(brown.sents()) 42 | 43 | #for sg_v in [1,0]: 44 | for sg_v in [0]: 45 | print sg_v 46 | brc = gensim.models.word2vec.Word2Vec(brown_sents,hs=1,negative=0,sg=sg_v,iter=1) 47 | print brc.most_similar_cosmul(positive=['she', 'him'], negative=['he'], topn=topn) 48 | #ns=[1,2,5,10,20] 49 | ns=[100] 50 | for n in ns : 51 | print n 52 | brck = Word2VecKeras(brown_sents,hs=1,negative=0,iter=n,sg=sg_v) 53 | print compare_w2v(brc,brck) 54 | print brck.most_similar_cosmul(positive=['she', 'him'], negative=['he'], topn=topn) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /example/test-scoreword2veckeras.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import gensim 4 | from word2veckeras.scoreword2veckeras import ScoreWord2VecKeras,LineScoredWordSentence,ScoredListSentence 5 | 6 | 7 | def compare_w2v(w2v1,w2v2): 8 | s=0.0 9 | count =0 10 | for w in w2v1.vocab: 11 | if w in w2v2.vocab: 12 | d=np.linalg.norm(w2v1[w]-w2v2[w]) 13 | count +=1 14 | s += d 15 | return s/count 16 | 17 | input_file = 'test.txt' 18 | 19 | scales=[1.0,1.0,1.0] 20 | def dummy_score_vec(word): 21 | return [len(word)*scales[0],ord(word[0])*scales[1],ord(word[-1])*scales[1]] 22 | #return [len(word)/0.2 ] 23 | 24 | v_iter=1 25 | v_size=5 26 | sg_v=1 27 | topn=4 28 | 29 | sws=list(LineScoredWordSentence(input_file,dummy_score_vec)) 30 | svk=ScoreWord2VecKeras(sws,hs=1,negative=0,sg=sg_v,size=v_size,iter=1) 31 | vs = 
gensim.models.word2vec.Word2Vec(gensim.models.word2vec.LineSentence(input_file),hs=1,negative=0,sg=sg_v,size=v_size,iter=1) 32 | 33 | print( svk.most_similar('the', topn=5)) 34 | print( vs.most_similar('the', topn=5)) 35 | print(svk['the']) 36 | print(vs['the']) 37 | 38 | #svk.save_word2vec_format('tmp.vec') 39 | #svk.save('tmp.model') 40 | 41 | #print svk.score_vector_size 42 | 43 | scored_word_list=[ 44 | ['This',[20*0.1,10*0.2]], 45 | ['is',[10*0.1,5*0.2]], 46 | ['a',[30*0.1,10*0.2]], 47 | ['pen',[10*0.1,5*0.2]], 48 | ['.',[3*0.1,5*0.2]], 49 | ] 50 | 51 | scored_word_list=[scored_word_list]*100 52 | #print scored_word_list 53 | svk2=ScoreWord2VecKeras(scored_word_list,iter=3) 54 | print(svk2.most_similar('a',topn=5)) 55 | #svk1.save('tmp.vec') 56 | #svk2.save_word2vec_format('tmp2.vec') 57 | 58 | 59 | 60 | from nltk.corpus import brown 61 | brown_sents=list(brown.sents())[:200] 62 | #brown_sents=list(brown.sents()) 63 | 64 | vs = gensim.models.word2vec.Word2Vec(brown_sents) 65 | svk2=ScoreWord2VecKeras(ScoredListSentence(brown_sents,dummy_score_vec)) 66 | 67 | 68 | print( vs.most_similar('the', topn=5)) 69 | print( svk.most_similar('the', topn=5)) 70 | -------------------------------------------------------------------------------- /example/demo-scoreword2veckeras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html 5 | 6 | ### ScoreWord2Vec stil has bug 7 | 8 | import sys 9 | 10 | import gensim 11 | 12 | from word2veckeras.word2veckeras import Word2VecKeras 13 | from word2veckeras.doc2veckeras import Doc2VecKeras,LabeledListSentence 14 | from word2veckeras.scoreword2veckeras import ScoreWord2VecKeras,LineScoredWordSentence,ScoredListSentence 15 | 16 | def dummy_score_vec_fn(word): 17 | return [len(word)/10.0,ord(word[0])/100.0,ord(word[-1])/100.0] 18 | #return [len(word)/0.2 ] 19 | 20 | 21 | input_file = 'test.txt' 22 | test_docs =gensim.models.doc2vec.TaggedLineDocument(input_file) 23 | test_sents=gensim.models.word2vec.LineSentence(input_file) 24 | 25 | dk0=Doc2VecKeras(test_docs,size=10,iter=3) 26 | #sys.exit() 27 | 28 | test_scorewordsents=LineScoredWordSentence(input_file,dummy_score_vec_fn) 29 | 30 | ### null_word must need for Doc2VecKeras(dm_concat=1) 31 | vck = Word2VecKeras(test_sents,size=10,null_word=1,iter=3,sg=0) 32 | #vck = Word2VecKeras(test_sents,null_word=0,iter=3,sg=0) 33 | dklw=Doc2VecKeras(dm_concat=1) 34 | print vck.syn0[0] 35 | dklw.train_with_word2vec_instance(test_docs,vck,learn_words=True,iter=3) 36 | #dk.train_with_word2vec_instance(test_docs,vck) 37 | print dklw.syn0[0] 38 | dk=Doc2VecKeras(dm_concat=1) 39 | dk.train_with_word2vec_instance(test_docs,vck,learn_words=False,iter=3) 40 | print dk.syn0[0] 41 | 42 | #sys.exit() 43 | 44 | svk=ScoreWord2VecKeras(test_scorewordsents,size=10,null_word=1,iter=3,sg=0) 45 | print svk.syn0[0] 46 | dsk=Doc2VecKeras(dm_concat=1) 47 | dsk.train_with_word2vec_instance(test_docs,svk,learn_words=True,iter=3) 48 | print dsk.syn0[0] 49 | 50 | print(dk0.docvecs.most_similar(0)) 51 | print(dk.docvecs.most_similar(0)) 52 | print(dsk.docvecs.most_similar(0)) 53 | print(dklw.docvecs.most_similar(0)) 54 | 55 | #sys.exit() 56 | 57 | from nltk.corpus import brown 58 | 59 | brown_sents_sub=list(brown.sents()[:100]) 60 | brown_docs_sub=LabeledListSentence(brown_sents_sub) 61 | 
brown_scorewordsents=list(ScoredListSentence(brown_sents_sub,dummy_score_vec_fn)) 62 | 63 | 64 | vck_br = Word2VecKeras(brown_sents_sub,null_word=1,iter=3,sg=0) 65 | vkk_br = Word2VecKeras(brown_sents_sub,null_word=1,iter=3,sg=1) 66 | 67 | dg_br=gensim.models.doc2vec.Doc2Vec(brown_docs_sub) 68 | dk0_br=Doc2VecKeras(brown_docs_sub,iter=3) 69 | 70 | 71 | svk_br=ScoreWord2VecKeras(brown_scorewordsents,null_word=1,iter=3,sg=0) 72 | 73 | dk_br=Doc2VecKeras(dm_concat=1) 74 | dk_br.train_with_word2vec_instance(brown_docs_sub,vck_br,learn_words=False,iter=3) 75 | 76 | dkk_br=Doc2VecKeras(dm_concat=1) 77 | dkk_br.train_with_word2vec_instance(brown_docs_sub,vkk_br,learn_words=False,iter=3,dm=1) 78 | 79 | dsk_br=Doc2VecKeras(dm_concat=1) 80 | dsk_br.train_with_word2vec_instance(brown_docs_sub,svk_br,learn_words=False,iter=3) 81 | 82 | 83 | print(dg_br.docvecs.most_similar(0)) 84 | print(dk0_br.docvecs.most_similar(0)) 85 | print(dk_br.docvecs.most_similar(0)) 86 | print(dkk_br.docvecs.most_similar(0)) 87 | print(dsk_br.docvecs.most_similar(0)) 88 | -------------------------------------------------------------------------------- /word2veckeras/treebank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html 5 | 6 | 7 | import sys 8 | import codecs 9 | 10 | from os import path 11 | import nltk 12 | from nltk.tree import * 13 | 14 | import numpy 15 | import numpy as np 16 | import copy 17 | import random 18 | 19 | import pickle 20 | 21 | import csv 22 | 23 | 24 | def tree2label_sent(tree): 25 | ret=[] 26 | #x=[int(tree.node) ,tree.leaves()] ##nltk ver2 27 | x=[int(tree.label()),tree.leaves()] ##nltk ver3 28 | ret.append(x) 29 | if len(tree)>1 : 30 | for t in tree: 31 | ret1=tree2label_sent(t) 32 | ret.extend(ret1) 33 | return ret 34 | 35 | def label_sents2uni_sent(lss): 36 | lsis_join=[[ls[0],' '.join(ls[1]),i] for (i,ls) in enumerate(lss)] 37 | uss={} 38 | for lsi in lsis_join: 39 | if lsi[1] in uss: 40 | uss[lsi[1]].append(lsi[2]) 41 | else: 42 | uss[lsi[1]]=[lsi[2]] 43 | return uss 44 | 45 | 46 | def trees2label_sents(trees,only_root=False,pos_neg_label=False,remove_double_count_sentence=False): 47 | #print 'trees2label_sents',flag_word_lower,flag_stemmer,flag_remove_double_count_sentence,only_root,pos_neg_label 48 | #sys.exit() 49 | lss=[] 50 | for tree in trees: 51 | lss_tmp=tree2label_sent(tree) 52 | if pos_neg_label and lss_tmp[0][0] == 2 : 53 | continue 54 | if pos_neg_label : 55 | lss_tmp2 = [ [1 if ls[0] > 2 else 0 ,ls[1]] for ls in lss_tmp] 56 | else: 57 | lss_tmp2 =lss_tmp 58 | if len(lss_tmp2) > 0 and only_root: 59 | lss.append(lss_tmp2[0]) 60 | elif len(lss_tmp2) > 0: 61 | lss.extend(lss_tmp2) 62 | if remove_double_count_sentence : 63 | uss=label_sents2uni_sent(lss) 64 | lss_new =[[np.mean([lss[id][0] for id in uss[s]]),lss[uss[s][0]][1] ] for s in uss ] 65 | return lss_new 66 | else: 67 | return lss 68 | 69 | 70 | class TreeBank(): 71 | def __init__(self, 72 | dirpath='trees', 73 | basenames=['train','test','dev'] 74 | ): 75 | argdict= locals() 76 | argdict.pop('argdict',None) 77 | argdict.pop('self',None) 78 | vars(self).update(argdict) 79 | for basename in basenames: 80 | treename='tree_' + basename 81 | vars(self)[treename]=self.load_tree_one(basename) 82 | #print vars(self) 83 | 84 | def load_tree_one(self,basename='dev'): 85 | trees=[] 86 | count=0 87 | infname = basename + '.txt' 88 | 
with codecs.open(path.join(self.dirpath, infname), 'r', 'utf-8') as f : 89 | for line in f.readlines(): 90 | count = count +1 91 | #tree=Tree.parse(line) ##nltk ver2 92 | tree =Tree.fromstring(line) ##nltk ver3 93 | trees.append(tree) 94 | return trees 95 | def labeled_sents(self,basename='dev',only_root=False,pos_neg_label=False,remove_double_count_sentence=False): 96 | #treename='tree_' + basename 97 | return trees2label_sents(vars(self)['tree_' + basename],only_root=only_root,pos_neg_label=pos_neg_label,remove_double_count_sentence=remove_double_count_sentence) 98 | 99 | def sents_labels(self,basename='dev',only_root=0,pos_neg_label=0,remove_double_count_sentence=False): 100 | labeled_sents=self.labeled_sents(basename,only_root=only_root,pos_neg_label=pos_neg_label,remove_double_count_sentence=remove_double_count_sentence) 101 | X=[ls[1] for ls in labeled_sents] 102 | Y=[ls[0] for ls in labeled_sents] 103 | return X,Y 104 | 105 | 106 | if __name__ == "__main__": 107 | treebank=TreeBank('./trees') 108 | print treebank.tree_dev[:3] 109 | # lss=trees2label_sents(treebank.tree_dev[:3]) 110 | # print lss[:3] 111 | lss=treebank.labeled_sents('dev')[:3] ##fine grade 112 | print lss[:3] 113 | # lss=trees2label_sents(treebank.tree_dev[:5],only_root=1,pos_neg_label=1) 114 | # print lss[:3] 115 | 116 | 117 | # lss=trees2label_sents(treebank.tree_dev[:5],only_root=0,pos_neg_label=0) 118 | # print lss[:3] 119 | lss=treebank.labeled_sents('dev',only_root=True,pos_neg_label=True)[:3] ##fine grade 120 | print lss[:3] 121 | 122 | 123 | -------------------------------------------------------------------------------- /word2veckeras/test.txt: -------------------------------------------------------------------------------- 1 | Harbin Institute of Technology (HIT) was founded in 1920. From its beginning, HIT has received preferential support from the central government. In 1954, the Ministry of Higher Education designated, for the first time, six national key universities. HIT was the only one of the six outside of Beijing. In 1984, HIT again found its way onto the list of 15 national key universities to receive special support. In 1996, HIT was among the first group of universities to be included in Project 211. This project targets 100 institutions of higher education in China to receive preferential support for development in order to become world-class universities in the 21st century. In 1999, HIT was listed as one of the top nine key universities in China. This distinction provided HIT with the opportunity to develop into a highly-competitive first-rate university with the assistance of the Ministry of Education and the Heilongjiang Provincial Government. 2 | After nearly 100 years, HIT has developed into a large nationally renowned multi-disciplinary university with science, engineering and research as its core. We have established our own unique programs related to the field of astronautics that are unparalled anywhere in China. We have broadened our established disciplinary programs by utilizing a cross- disciplinary curriculum and as such have formed a comparatively full disciplinary system that consists of key, emerging and supporting programs. HIT now has 21 schools/departments, including 73 undergraduate programs, 147 masters' programs, 81 doctoral programs, 18 post-doctoral research stations, 18 national key disciplines, and 32 national & provincial (ministerial) key labs. 
The university employs 2,944 full-time teachers, among which 884are professors, 1,102 are associate professors, including 22 academicians of the Chinese Academy of Sciences and the Chinese Academy of Engineering. At present, there are 42,695 full-time students including 25,035 undergraduates, 11,794 master degree candidates and 4,387 doctorial degree candidates. We also added the Shenzhen Graduate School and Weihai Campus to the main campus in Harbin (including the Research Academy of Science and Technology and Research Academy of Industrial Technology), forming a pattern of 'one university, three campuses'. 3 | HIT is consistently on the forefront in making innovations in research. For years, HIT has continued to undertake large-scale and highly sophisticated national projects. HIT's ability for scientific research has always been among the strongest in all universities in China. In 2007, HIT funds for scientific research reached 1.1billion RMB. In the comprehensive 10-year evaluation of the '863'project, HIT scientific research programs ranked second among all universities in China. HIT has been making great contributions to China's hi-tech research by creating many new inventions in scientific research fields such as China's first simulation computer, the first intelligent chess-playing computer, the first arc-welding robots, the first world advanced-level system radar, the first CMOS chip IC card with our own patent, the first giant computer-aided real-time 3-D image construction system, the first microcomputer-operated fiber twiner and the first large-scale tank-head forming machine. The famous 'Shenzhou Series Spaceship Project' received massive assistance from HIT in the field of large-scale land-based space simulation equipment, returning cabin deformation and orthopraxy welding technology, 3-axel simulation experimental platform and fault diagnosis. The micro-satellite 'Testing Satellite No.1', constructed mainly by HIT, was the first fully developed and launched satellite by a Chinese university. The technical advancements on the satellite meet international aerospace standards and mark a new chapter in the history of HIT and China's history of astronautics. 4 | HIT students study humanities and social sciences along with basic engineering and science courses for a strong comprehensive base. They go on to learn scientific research methods and laboratory skills which enhance their creativity and innovative abilities. When our students graduate from HIT, they are equipped with strong theoretical knowledge and the ability for practical application. 5 | HIT is famous for its original style of schooling: 'Being strict in qualifications for graduates; making every endeavor in educating students.' Our graduates have been warmly welcomed by employers throughout China; more than 100,000 graduates have stepped into society and many have moved up to high-ranking managerial positions and work as specialists in the fields of science and technology, education, and economics. A number of graduates have assumed leadership positions in the CPC and governments at different levels, or become generals of the PLA, academicians of the Chinese Academy of Sciences and the Chinese Academy of Engineering. 6 | HIT has remained an international university since its foundation. Courses at HIT used to be conducted exclusively in Russian and Japanese. After the reforming and opening to the outside world, HIT has gained greater weight in the world. 
So far, we have signed academic cooperation agreements with 126 institutions of higher education in 24 countries such as the United States, the United Kingdom, France, Germany, Japan and Russia. Cooperation and exchanges are carried out between HIT and these universities though exchanging students, faculty and research staff, holding academic conference and cooperating in scientific research. 7 | Today, all the faculty, students and staff of HIT, are dedicating, with full confidence, their concerted efforts to advance bravely towards the goal of building HIT into a well-known world-class university. -------------------------------------------------------------------------------- /word2veckeras/word2veckeras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html 5 | 6 | import math 7 | from Queue import Queue 8 | 9 | from numpy import zeros, random, sum as np_sum, add as np_add, concatenate, \ 10 | repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \ 11 | sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide 12 | 13 | import gensim.models.word2vec 14 | 15 | from six.moves import xrange, zip 16 | from six import string_types, integer_types, itervalues 17 | 18 | import sys 19 | import random 20 | 21 | import numpy as np 22 | import copy 23 | 24 | import keras.constraints 25 | 26 | from keras.utils.np_utils import accuracy 27 | from keras.models import Graph,Sequential 28 | from keras.layers.core import Dense, Dropout, Activation, Merge, Flatten , Lambda 29 | from keras.layers.embeddings import Embedding 30 | from keras.optimizers import SGD 31 | from keras.objectives import mse 32 | 33 | 34 | def queue_to_list(q,extract_size): 35 | """ Dump a Queue to a list """ 36 | # A new list 37 | l = [] 38 | count=0 39 | while q.qsize() > 0: 40 | count +=1 41 | if count >extract_size: 42 | break 43 | l.append(q.get()) 44 | 45 | return l 46 | 47 | 48 | 49 | def train_sg_pair(model, word, context_index, alpha=None, learn_vectors=True, learn_hidden=True, 50 | context_vectors=None, context_locks=None, 51 | scale=1 52 | ): 53 | 54 | if word not in model.vocab: 55 | return 56 | predict_word = model.vocab[word] # target word (NN output) 57 | if model.hs: 58 | for i,p in enumerate(predict_word.point): 59 | yield context_index,p,predict_word.code[i] 60 | if model.negative: 61 | word_indices = [predict_word.index] 62 | while len(word_indices) < model.negative + 1: 63 | w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) 64 | if w != predict_word.index: 65 | word_indices.append(w) 66 | for i,p in enumerate(word_indices): 67 | yield context_index, p+model.keras_context_negative_base_index, model.neg_labels[i] 68 | 69 | 70 | def train_batch_sg(model, sentences, alpha=None, work=None,sub_batch_size=256,batch_size=256): 71 | 72 | batch_count=0 73 | sub_batch_count=0 74 | train_x0 =np.zeros((batch_size,sub_batch_size),dtype='int32') 75 | train_x1 =np.zeros((batch_size,sub_batch_size),dtype='int32') 76 | train_y =np.zeros((batch_size,sub_batch_size),dtype='int8') 77 | 78 | while 1: 79 | for sentence in sentences: 80 | word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and 81 | model.vocab[w].sample_int > model.random.rand() * 2**32] 82 | for pos, word in enumerate(word_vocabs): 83 | reduced_window = model.random.randint(model.window) # `b` in 
the original word2vec code 84 | 85 | # now go over all words from the (reduced) window, predicting each one in turn 86 | start = max(0, pos - model.window + reduced_window) 87 | #window_length=len(word_vocabs[start:(pos + model.window + 1 - reduced_window)]) 88 | #print window_length, 89 | for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): 90 | # don't train on the `word` itself 91 | if pos2 != pos: 92 | xy_gen=train_sg_pair(model, model.index2word[word.index], word2.index) 93 | for xy in xy_gen : 94 | if xy !=None: 95 | (x0,x1,y)=xy 96 | train_x0[batch_count][sub_batch_count]=x0 97 | train_x1[batch_count][sub_batch_count]=x1 98 | train_y[batch_count][sub_batch_count]=y 99 | sub_batch_count += 1 100 | if sub_batch_count >= sub_batch_size : 101 | batch_count += 1 102 | sub_batch_count=0 103 | if batch_count >= batch_size : 104 | yield { 'index':train_x0, 'point':train_x1, 'code':train_y} 105 | batch_count=0 106 | 107 | 108 | 109 | def build_keras_model_sg(index_size,vector_size, 110 | context_size, 111 | #code_dim, 112 | sub_batch_size=256, 113 | learn_vectors=True,learn_hidden=True, 114 | model=None): 115 | 116 | kerasmodel = Graph() 117 | kerasmodel.add_input(name='point' , input_shape=(1,), dtype=int) 118 | kerasmodel.add_input(name='index' , input_shape=(1,), dtype=int) 119 | kerasmodel.add_node(Embedding(index_size, vector_size, input_length=sub_batch_size,weights=[model.syn0]),name='embedding', input='index') 120 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size,weights=[model.keras_syn1]),name='embedpoint', input='point') 121 | kerasmodel.add_node(Lambda(lambda x:x.sum(2)) , name='merge',inputs=['embedding','embedpoint'], merge_mode='mul') 122 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid', input='merge') 123 | kerasmodel.add_output(name='code',input='sigmoid') 124 | kerasmodel.compile('rmsprop', {'code':'mse'}) 125 | return kerasmodel 126 | 127 | 128 | 129 | def train_cbow_pair(model, word, input_word_indices, l=None, alpha=None, learn_vectors=True, learn_hidden=True): 130 | if model.hs: 131 | for i,p in enumerate(word.point): 132 | yield input_word_indices,[p],[word.code[i]] 133 | if model.negative: 134 | word_indices = [word.index] 135 | while len(word_indices) < model.negative + 1: 136 | w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) 137 | if w != word.index: 138 | word_indices.append(w) 139 | for i,p in enumerate(word_indices): 140 | yield input_word_indices, [p+model.keras_context_negative_base_index], [model.neg_labels[i]] 141 | 142 | 143 | def train_batch_cbow_xy_generator(model, sentences): 144 | for sentence in sentences: 145 | word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32] 146 | for pos, word in enumerate(word_vocabs): 147 | reduced_window = model.random.randint(model.window) # `b` in the original word2vec code 148 | start = max(0, pos - model.window + reduced_window) 149 | window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) 150 | word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] 151 | xy_gen=train_cbow_pair(model, word , word2_indices , None, None) 152 | for xy in xy_gen: 153 | if xy !=None: 154 | yield xy 155 | 156 | def train_batch_cbow(model, sentences, alpha=None, work=None, neu1=None,batch_size=256): 157 | w_len_queue_dict={} 158 | w_len_queue=[] 159 | 160 | while 1: 161 | 
for xy in train_batch_cbow_xy_generator(model, sentences): 162 | if xy != None: 163 | w_len=len(xy[0]) 164 | if w_len>0: 165 | if w_len not in w_len_queue_dict: 166 | w_len_queue_dict[w_len]=Queue() 167 | w_len_queue.append(w_len) 168 | w_len_queue_dict[w_len].put(xy) 169 | for w_len in w_len_queue: 170 | if w_len_queue_dict[w_len].qsize() >= batch_size : 171 | l=queue_to_list(w_len_queue_dict[w_len],batch_size) 172 | train=[[e[i] for e in l] for i in range(3)] 173 | yield { 'index':np.array(train[0]), 174 | 'point':np.array(train[1]), 175 | 'code':np.array(train[2])} 176 | 177 | 178 | def build_keras_model_cbow(index_size,vector_size, 179 | context_size, 180 | #code_dim, 181 | sub_batch_size=1, 182 | model=None,cbow_mean=False): 183 | 184 | kerasmodel = Graph() 185 | kerasmodel.add_input(name='point' , input_shape=(sub_batch_size,), dtype='int') 186 | kerasmodel.add_input(name='index' , input_shape=(1,), dtype='int') 187 | kerasmodel.add_node(Embedding(index_size, vector_size, weights=[model.syn0]),name='embedding', input='index') 188 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size,weights=[model.keras_syn1]),name='embedpoint', input='point') 189 | if cbow_mean: 190 | kerasmodel.add_node(Lambda(lambda x:x.mean(1),output_shape=(vector_size,)),name='average',input='embedding') 191 | else: 192 | kerasmodel.add_node(Lambda(lambda x:x.sum(1),output_shape=(vector_size,)),name='average',input='embedding') 193 | 194 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid',inputs=['average','embedpoint'], merge_mode='dot',dot_axes=-1) 195 | kerasmodel.add_output(name='code',input='sigmoid') 196 | kerasmodel.compile('rmsprop', {'code':'mse'}) 197 | return kerasmodel 198 | 199 | def copy_word2vec_instance_from_to(w2v,w2v_to,sentences=None,documents=None):# ,dm=None, **kwargs): 200 | if hasattr(w2v,'dm'): 201 | if w2v.dm is None : 202 | #if not w2v_to.dm_concat: 203 | w2v_to.sg = w2v.sg 204 | else: 205 | w2v_to.sg=(1+w2v.dm) % 2 206 | else: 207 | w2v_to.sg = w2v.sg 208 | 209 | w2v_to.window = w2v.window 210 | w2v_to.min_count =w2v.min_count 211 | w2v_to.sample =w2v.sample 212 | w2v_to.cbow_mean=w2v.cbow_mean 213 | 214 | w2v_to.negative = w2v.negative 215 | w2v_to.hs=w2v.hs 216 | 217 | w2v_to.alpha = w2v.alpha 218 | 219 | w2v_to.vector_size=w2v.vector_size 220 | 221 | if hasattr(w2v,'dm_concat') and hasattr(w2v_to,'dm_concat'): 222 | if not w2v_to.dm_concat: 223 | w2v_to.layer1_size= w2v.layer1_size 224 | 225 | 226 | w2v_to.raw_vocab=w2v.raw_vocab 227 | w2v_to.index2word=w2v.index2word 228 | w2v_to.sorted_vocab = w2v.sorted_vocab 229 | 230 | w2v_to.vocab=w2v.vocab 231 | 232 | w2v_to.max_vocab_size = w2v.max_vocab_size 233 | 234 | if hasattr(w2v,'dm'): 235 | docs=documents 236 | #w2v_to.build_vocab(docs) 237 | for document_no, document in enumerate(docs): 238 | document_length = len(document.words) 239 | for tag in document.tags: 240 | w2v_to.docvecs.note_doctag(tag, document_no, document_length) 241 | w2v_to.reset_weights() 242 | 243 | w2v_to.syn0=w2v.syn0 244 | 245 | if w2v.hs: 246 | #if not w2v_to.dm_concat: 247 | w2v_to.syn1=w2v.syn1 248 | if w2v.negative: 249 | #if not w2v_to.dm_concat: 250 | w2v_to.syn1neg=w2v.syn1neg 251 | w2v_to.cum_table=w2v.cum_table 252 | 253 | return w2v_to 254 | #w2v_to.train(docs,**kwargs) 255 | #self.train(docs,learn_words=learn_words,**kwargs) 256 | 257 | 258 | def train_prepossess(model): 259 | 260 | vocab_size=len(model.vocab) 261 | 262 | if model.negative>0 and model.hs : 263 | 
model.keras_context_negative_base_index=len(model.vocab) 264 | model.keras_context_index_size=len(model.vocab)*2 265 | model.keras_syn1=np.vstack((model.syn1,model.syn1neg)) 266 | else: 267 | model.keras_context_negative_base_index=0 268 | model.keras_context_index_size=len(model.vocab) 269 | if model.hs : 270 | model.keras_syn1=model.syn1 271 | else: 272 | model.keras_syn1=model.syn1neg 273 | 274 | model.neg_labels = [] 275 | if model.negative > 0: 276 | # precompute negative labels optimization for pure-python training 277 | model.neg_labels = np.zeros(model.negative + 1,dtype='int8') 278 | model.neg_labels[0] = 1 279 | 280 | trim_rule=None 281 | if len(model.vocab) == 0 : #not hasattr(model, 'syn0'): 282 | print 'build_vocab' 283 | model.build_vocab(sentences, trim_rule=trim_rule) 284 | #print model.syn0 285 | 286 | 287 | model.word_context_size_max=0 288 | if model.hs : 289 | model.word_context_size_max += max(len(model.vocab[w].point) for w in model.vocab if hasattr(model.vocab[w],'point')) 290 | if model.negative > 0: 291 | model.word_context_size_max += model.negative + 1 292 | 293 | # sub_batch_size_update=False 294 | # if hasattr(model,'sub_batch_size'): 295 | # if model.sub_batch_size != sub_batch_size : 296 | # sub_batch_size_update=True 297 | # model.sub_batch_size=sub_batch_size 298 | 299 | 300 | class Word2VecKeras(gensim.models.word2vec.Word2Vec): 301 | 302 | def compare_w2v(self,w2v2): 303 | return np.mean([np.linalg.norm(self[w]-w2v2[w]) for w in self.vocab if w in w2v2.vocab]) 304 | 305 | def train(self, sentences, total_words=None, word_count=0, 306 | total_examples=None, queue_factor=2, report_delay=1, 307 | batch_size=128 #512 #256 308 | ,sub_batch_size=16 #32 #128 #128 #256 #128 #512 #256 #1 309 | ): 310 | train_prepossess(self) 311 | 312 | # if self.negative>0 and self.hs : 313 | # self.keras_context_negative_base_index=len(self.vocab) 314 | # self.keras_context_index_size=len(self.vocab)*2 315 | # self.keras_syn1=np.vstack((self.syn1,self.syn1neg)) 316 | # else: 317 | # self.keras_context_negative_base_index=0 318 | # self.keras_context_index_size=len(self.vocab) 319 | # if self.hs : 320 | # self.keras_syn1=self.syn1 321 | # else: 322 | # self.keras_syn1=self.syn1neg 323 | 324 | # self.neg_labels = [] 325 | # if self.negative > 0: 326 | # # precompute negative labels optimization for pure-python training 327 | # self.neg_labels = np.zeros(self.negative + 1,dtype='int8') 328 | # self.neg_labels[0] = 1 329 | 330 | 331 | # trim_rule=None 332 | # if len(self.vocab) == 0 : #not hasattr(self, 'syn0'): 333 | # print 'build_vocab' 334 | # self.build_vocab(sentences, trim_rule=trim_rule) 335 | # #print self.syn0 336 | # word_context_size_max=0 337 | # if self.hs : 338 | # word_context_size_max += max(len(self.vocab[w].point) for w in self.vocab if hasattr(self.vocab[w],'point')) 339 | # if self.negative > 0: 340 | # word_context_size_max += self.negative + 1 341 | 342 | 343 | vocab_size=len(self.vocab) 344 | 345 | sub_batch_size_update=False 346 | if hasattr(self,'sub_batch_size'): 347 | if self.sub_batch_size != sub_batch_size : 348 | sub_batch_size_update=True 349 | self.sub_batch_size=sub_batch_size 350 | 351 | if self.sg: 352 | samples_per_epoch=max(1,int((self.iter*self.window*2*sum(map(len,sentences)))/(sub_batch_size))) 353 | 354 | if not hasattr(self, 'kerasmodel') or sub_batch_size_update: 355 | self.kerasmodel=build_keras_model_sg(index_size=vocab_size,vector_size=self.vector_size, 356 | context_size=self.keras_context_index_size, 357 | #code_dim=vocab_size, 358 | 
sub_batch_size=sub_batch_size, 359 | model=self 360 | ) 361 | 362 | gen=train_batch_sg(self, sentences, sub_batch_size=sub_batch_size,batch_size=batch_size) 363 | self.kerasmodel.nodes['embedding'].set_weights([self.syn0]) 364 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter, verbose=0) 365 | else: 366 | samples_per_epoch=int(sum(map(len,sentences))) 367 | #samples_per_epoch=max(1,int(self.iter*self.window*2*sum(map(len,sentences))/sub_batch_size)) 368 | if not hasattr(self, 'kerasmodel'): 369 | self.kerasmodel=build_keras_model_cbow(index_size=vocab_size,vector_size=self.vector_size, 370 | context_size=self.keras_context_index_size, 371 | #code_dim=vocab_size, 372 | model=self,cbow_mean=self.cbow_mean 373 | ) 374 | gen=train_batch_cbow(self, sentences, self.alpha, work=None,batch_size=batch_size) 375 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter,verbose=0) 376 | self.syn0=self.kerasmodel.nodes['embedding'].get_weights()[0] 377 | if self.negative>0 and self.hs : 378 | syn1tmp=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 379 | self.syn1=syn1tmp[0:len(self.vocab)] 380 | self.syn1neg=syn1tmp[len(self.vocab):2*len(self.vocab)] 381 | elif self.hs: 382 | self.syn1=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 383 | else: 384 | self.syn1neg=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 385 | 386 | 387 | if __name__ == "__main__": 388 | def compare_w2v(w2v1,w2v2): 389 | return np.mean([np.linalg.norm(w2v1[w]-w2v2[w]) for w in w2v1.vocab if w in w2v2.vocab]) 390 | 391 | input_file = 'test.txt' 392 | sents=gensim.models.word2vec.LineSentence(input_file) 393 | 394 | v_iter=1 395 | v_size=5 396 | sg_v=0 397 | topn=4 398 | # hs=1 399 | # negative=0 400 | hs=0 401 | negative=5 402 | 403 | 404 | 405 | vs1 = gensim.models.word2vec.Word2Vec(sents,hs=hs,negative=negative,sg=sg_v,size=v_size,iter=v_iter) 406 | vsk1 = Word2VecKeras(sents,hs=hs,negative=negative,sg=sg_v,size=v_size,iter=v_iter) 407 | print 'compare',vsk1.compare_w2v(vs1) 408 | vsk1.iter=20 409 | vsk1.train(sents,batch_size=100,sub_batch_size=64) 410 | print 'compare',vsk1.compare_w2v(vs1) 411 | print vs1['the'] 412 | print vsk1['the'] 413 | print( vs1.most_similar('the', topn=topn)) 414 | print( vsk1.most_similar('the', topn=topn)) 415 | 416 | sys.exit() 417 | 418 | from nltk.corpus import brown #, movie_reviews, treebank 419 | #print(brown.sents()[0]) 420 | #brown_sents=list(brown.sents()) 421 | #brown_sents=list(brown.sents()[:10000]) 422 | brown_sents=list(brown.sents())[:2000] 423 | 424 | br = gensim.models.word2vec.Word2Vec(brown_sents,hs=1,negative=0,sg=sg_v,iter=v_iter) 425 | brk =Word2VecKeras(brown_sents,hs=1,negative=0,sg=sg_v,iter=v_iter) 426 | 427 | print 'compare',brk.compare_w2v(br) 428 | brk.train(brown_sents) 429 | print 'compare',brk.compare_w2v(br) 430 | print brk.most_similar_cosmul(positive=['she', 'him'], negative=['he'], topn=topn) 431 | 432 | br_dummy = gensim.models.word2vec.Word2Vec(brown_sents,sg=sg_v,iter=1) 433 | copy_word2vec_instance_from_to(brk,br_dummy) 434 | print br_dummy.most_similar_cosmul(positive=['she', 'him'], negative=['he'], topn=topn) 435 | print(br_dummy.most_similar('the', topn=5)) 436 | 437 | print br.most_similar_cosmul(positive=['she', 'him'], negative=['he'], topn=topn) 438 | print brk.most_similar_cosmul(positive=['she', 'him'], negative=['he'], topn=topn) 439 | #print brk.most_similar('the', topn=5) 440 | #print(brk.most_similar('the', topn=5)) 441 | 442 | 443 | 
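
Because Word2VecKeras subclasses gensim.models.word2vec.Word2Vec and writes the trained Keras weights back into syn0/syn1/syn1neg, the usual gensim query and persistence calls keep working on the Keras-trained vectors. A minimal usage sketch under that assumption follows; the file names and hyperparameters are illustrative, and save_word2vec_format/load_word2vec_format are the pre-1.0 gensim API this code targets.

#+BEGIN_SRC python
import gensim
from word2veckeras.word2veckeras import Word2VecKeras

# train skip-gram with hierarchical softmax on the bundled sample corpus
sents = gensim.models.word2vec.LineSentence('test.txt')
model = Word2VecKeras(sents, size=50, sg=1, hs=1, negative=0, iter=5)

# standard gensim queries operate on the Keras-trained vectors
print(model.most_similar('the', topn=5))

# persist in the plain word2vec text format and reload with stock gensim
model.save_word2vec_format('w2v_keras.vec')
w2v = gensim.models.word2vec.Word2Vec.load_word2vec_format('w2v_keras.vec')
print(w2v.most_similar('the', topn=5))
#+END_SRC
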
-------------------------------------------------------------------------------- /word2veckeras/scoreword2veckeras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html 5 | 6 | import sys 7 | import itertools 8 | from Queue import Queue 9 | 10 | 11 | from numpy import zeros, random, sum as np_sum, add as np_add, concatenate, \ 12 | repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \ 13 | sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide 14 | 15 | import gensim.models.word2vec 16 | import gensim.utils 17 | 18 | from six.moves import xrange, zip 19 | from six import string_types, integer_types, itervalues 20 | 21 | 22 | import random 23 | 24 | import numpy as np 25 | import copy 26 | 27 | import keras.constraints 28 | 29 | from keras.utils.np_utils import accuracy 30 | from keras.models import Graph,Sequential 31 | from keras.layers.core import Dense, Dropout, Activation, Merge, Flatten , Lambda, Reshape,RepeatVector,Permute 32 | from keras.layers.embeddings import Embedding 33 | from keras.optimizers import SGD 34 | from keras.objectives import mse 35 | 36 | 37 | from word2veckeras import train_sg_pair, train_cbow_pair, queue_to_list , train_prepossess 38 | 39 | def word_score2scored_word(word,score): 40 | return [word,score] 41 | def scored_word2word(scored_word): 42 | return scored_word[0] 43 | def scored_word2score(scored_word): 44 | return scored_word[1] 45 | 46 | 47 | def train_batch_score_sg(model, scored_word_sentences, 48 | score_vector_size, 49 | alpha=None, work=None, 50 | sub_batch_size=256, 51 | batch_size=256): 52 | 53 | batch_count=0 54 | sub_batch_count=0 55 | train_x0 =np.zeros((batch_size,sub_batch_size),dtype='int32') 56 | train_x1 =np.zeros((batch_size,sub_batch_size),dtype='int32') 57 | train_y0 =np.zeros((batch_size,sub_batch_size),dtype='int8') 58 | train_y1 =np.zeros((batch_size,sub_batch_size,score_vector_size),dtype='float32') 59 | # train_x0=[[0]]*batch_size 60 | # train_x1=[[0]]*batch_size 61 | # train_y0=[[0]]*batch_size 62 | # train_y1=[[0]]*batch_size 63 | while 1: 64 | for scored_word_sentence in scored_word_sentences: 65 | #sentence=[scored_word2word(scored_word) for scored_word in scored_word_sentence] 66 | 67 | word_vocabs = [[model.vocab[w],s] for [w,s] in scored_word_sentence if w in model.vocab and 68 | model.vocab[w].sample_int > model.random.rand() * 2**32] 69 | for pos, scored_word in enumerate(word_vocabs): 70 | reduced_window = model.random.randint(model.window) # `b` in the original word2vec code 71 | word=scored_word2word(scored_word) 72 | # now go over all words from the (reduced) window, predicting each one in turn 73 | start = max(0, pos - model.window + reduced_window) 74 | for pos2, scored_word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): 75 | word2=scored_word2word(scored_word2) 76 | # don't train on the `word` itself 77 | if pos2 != pos: 78 | xy_gen=train_sg_pair(model, model.index2word[word.index], word2.index) #, alpha) 79 | for xy in xy_gen : 80 | if xy !=None: 81 | (x0,x1,y0)=xy 82 | y1=scored_word2score(scored_word) 83 | train_x0[batch_count][sub_batch_count]=x0 84 | train_x1[batch_count][sub_batch_count]=x1 85 | train_y0[batch_count][sub_batch_count]=y0 86 | train_y1[batch_count][sub_batch_count]=y1 87 | sub_batch_count += 1 88 | if sub_batch_count >= 
sub_batch_size : 89 | batch_count += 1 90 | sub_batch_count=0 91 | if batch_count >= batch_size : 92 | yield { 'index':train_x0, 'point':train_x1, 'code':train_y0,'score':train_y1} 93 | batch_count=0 94 | 95 | # train_x0[batch_count]=[x0] 96 | # train_x1[batch_count]=x1 97 | # train_y0[batch_count]=y0 98 | # train_y1[batch_count]=y1 99 | # #print train_x0,train_y1, 100 | # batch_count += 1 101 | # if batch_count >= batch_size : 102 | # #print { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)} 103 | # #yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1,dtype=float32)} 104 | # yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)} 105 | # batch_count=0 106 | 107 | 108 | def train_batch_score_cbow_xy_generator(model, scored_word_sentences): 109 | for scored_word_sentence in scored_word_sentences: 110 | #print scored_word_sentence 111 | scored_word_vocabs = [[model.vocab[w],s] for [w,s] in scored_word_sentence if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32] 112 | for pos, scored_word in enumerate(scored_word_vocabs): 113 | reduced_window = model.random.randint(model.window) # `b` in the original word2vec code 114 | start = max(0, pos - model.window + reduced_window) 115 | window_pos = enumerate(scored_word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) 116 | word2_indices = [scored_word2[0].index for pos2, scored_word2 in window_pos if (scored_word2 is not None and scored_word2[0] is not None and pos2 != pos)] 117 | xy_gen=train_cbow_pair(model, scored_word[0] , word2_indices , None, None) 118 | for xy in xy_gen: 119 | if xy !=None: 120 | xy1=[xy[0],xy[1],xy[2],[scored_word[1]]] 121 | yield xy1 122 | 123 | # if xy !=None: 124 | # xy1=[xy[0],xy[1],xy[2],scored_word[1]] 125 | # yield xy1 126 | 127 | def train_batch_score_cbow(model, scored_word_sentences, alpha=None, work=None, neu1=None,batch_size=100): 128 | w_len_queue_dict={} 129 | w_len_queue=[] 130 | 131 | while 1: 132 | for xy in train_batch_score_cbow_xy_generator(model, scored_word_sentences): 133 | if xy != None : 134 | w_len=len(xy[0]) 135 | if w_len>0: 136 | if w_len not in w_len_queue_dict: 137 | w_len_queue_dict[w_len]=Queue() 138 | w_len_queue.append(w_len) 139 | w_len_queue_dict[w_len].put(xy) 140 | for w_len in w_len_queue: 141 | if w_len_queue_dict[w_len].qsize() >= batch_size : 142 | l=queue_to_list(w_len_queue_dict[w_len],batch_size) 143 | train=[[e[i] for e in l] for i in range(4)] 144 | yield { 'index':np.array(train[0]), 145 | 'point':np.array(train[1]), 146 | 'code':np.array(train[2]), 147 | 'score':np.array(train[3]) 148 | } 149 | w_len_queue=w_len_queue[1:]+[w_len_queue[0]] 150 | 151 | 152 | 153 | def build_keras_model_score_word_sg(index_size,vector_size, 154 | #vocab_size, 155 | context_size, 156 | #code_dim, 157 | score_vector_size, 158 | sub_batch_size=256, 159 | word_vectors=None, 160 | score_vectors=None, 161 | hidden_vectors=None, 162 | model=None 163 | ): 164 | """ 165 | >>> word_vectors=np.array([[1,2,-1,1],[3,4,-1,-2],[5,6,-2,-2]]) 166 | >>> score_vectors=np.array([[10,20,11,21,5,6,7,8],[30,40,33,41,9,8,7,6]]) 167 | >>> hidden_vectors=np.array([[1,0,1,1],[0,1,1,1]]) 168 | >>> sub_batch_size=3 169 | >>> vector_size=4 170 | >>> score_vector_size=2 171 | >>> 
kerasmodel=build_keras_model_score_word_sg(index_size=3,vector_size=vector_size,context_size=2,score_vector_size=score_vector_size,sub_batch_size=sub_batch_size,word_vectors=word_vectors,score_vectors=score_vectors,hidden_vectors=hidden_vectors) 172 | >>> ind=[[0,1,2],[1,2,0]] 173 | >>> ipt=[[1,0,1],[0,1,0]] 174 | >>> tmp1=kerasmodel.predict({'index':np.array(ind),'point':np.array(ipt)}) 175 | >>> tmp3=np.array([[score_vectors[ipt[i][j]].reshape((score_vector_size,vector_size)).dot(word_vectors[ind[i][j]]) for j in range(sub_batch_size) ] for i in range(2)]) 176 | >>> tmp2=np.array([[word_vectors[ind[i][j]].dot(hidden_vectors[ipt[i][j]].T) for j in range(sub_batch_size) ] for i in range(2)]) 177 | >>> np.linalg.norm(1/(1+np.exp(-tmp2))-tmp1['code'])+np.linalg.norm(tmp1['score']-tmp3) < 0.0001 178 | True 179 | """ 180 | 181 | kerasmodel = Graph() 182 | 183 | kerasmodel.add_input(name='point' , input_shape=(sub_batch_size,), dtype=int) 184 | kerasmodel.add_input(name='index' , input_shape=(sub_batch_size,), dtype=int) 185 | if word_vectors is None: 186 | kerasmodel.add_node(Embedding(index_size, vector_size, input_length=sub_batch_size ),name='embedding', input='index') 187 | else: 188 | kerasmodel.add_node(Embedding(index_size, vector_size, input_length=sub_batch_size,weights=[word_vectors]),name='embedding', input='index') 189 | if hidden_vectors is None: 190 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size ),name='embedpoint', input='point') 191 | else: 192 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size,weights=[hidden_vectors]),name='embedpoint', input='point') 193 | kerasmodel.add_node(Lambda(lambda x:x.sum(2)) , name='merge',inputs=['embedding','embedpoint'], merge_mode='mul') 194 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid', input='merge') 195 | kerasmodel.add_output(name='code',input='sigmoid') 196 | 197 | if score_vectors is None: 198 | kerasmodel.add_node(Embedding(context_size, score_vector_size*vector_size, input_length=sub_batch_size, ),name='embedscore', input='point') 199 | else: 200 | kerasmodel.add_node(Embedding(context_size, score_vector_size*vector_size, input_length=sub_batch_size,weights=[score_vectors]),name='embedscore', input='point') 201 | kerasmodel.add_node(Reshape((sub_batch_size,score_vector_size,vector_size,)) , name='score1',input='embedscore') 202 | 203 | kerasmodel.add_node(Flatten(), name='index1',input='embedding') 204 | kerasmodel.add_node(RepeatVector(score_vector_size), name='index2',input='index1') 205 | kerasmodel.add_node(Reshape((score_vector_size,sub_batch_size,vector_size,)) , name='index3',input='index2') 206 | kerasmodel.add_node(Permute((2,1,3,)) , name='index4',input='index3') 207 | 208 | kerasmodel.add_node(Lambda(lambda x:x.sum(-1)) , name='scorenode',inputs=['score1','index4'], merge_mode='mul') 209 | 210 | kerasmodel.add_output(name='score',input='scorenode') 211 | 212 | kerasmodel.compile('rmsprop', {'code':'mse','score':'mse'}) 213 | return kerasmodel 214 | 215 | 216 | def build_keras_model_score_word_cbow(index_size,vector_size, 217 | #vocab_size, 218 | context_size, 219 | #code_dim, 220 | score_vector_size, 221 | sub_batch_size=256, 222 | word_vectors=None, 223 | score_vectors=None, 224 | hidden_vectors=None, 225 | model=None, 226 | cbow_mean=False): 227 | 228 | """ 229 | >>> word_vectors=np.array([[1,3,-1,2],[-2,4,-3,-1],[-3,4,2,-1]]) 230 | >>> score_vectors=np.array([[10,20,11,21,5,6,7,8],[30,40,33,41,9,8,7,6]]) 231 | >>> 
hidden_vectors=np.array([[-1,-1,1,-1],[1,-1,-1,1]]) 232 | >>> sub_batch_size=3 233 | >>> vector_size=4 234 | >>> score_vector_size=2 235 | >>> kerasmodel=build_keras_model_score_word_cbow(index_size=3,vector_size=vector_size,context_size=2,score_vector_size=score_vector_size,sub_batch_size=sub_batch_size,word_vectors=word_vectors,score_vectors=score_vectors,hidden_vectors=hidden_vectors) 236 | >>> ind=[[0,1,0,2,1],[1,2,2,0,0]] 237 | >>> ipt=[[1,0,1],[0,1,0]] 238 | >>> tmp1=kerasmodel.predict({'index':np.array(ind),'point':np.array(ipt)}) 239 | >>> tmp2=np.array([[word_vectors[ind[i]].sum(0).dot(hidden_vectors[ipt[i][j]].T) for j in range(sub_batch_size) ] for i in range(2)]) 240 | >>> tmp3=np.array([[score_vectors[ipt[i][j]].reshape((score_vector_size,vector_size)).dot(word_vectors[ind[i]].sum(0)) for j in range(sub_batch_size) ] for i in range(2)]) 241 | >>> np.linalg.norm(1/(1+np.exp(-tmp2))-tmp1['code'])+np.linalg.norm(tmp1['score']-tmp3) < 0.0001 242 | True 243 | """ 244 | 245 | kerasmodel = Graph() 246 | kerasmodel.add_input(name='point' , input_shape=(sub_batch_size,), dtype=int) 247 | kerasmodel.add_input(name='index' , input_shape=(1,), dtype=int) 248 | if word_vectors is None: 249 | kerasmodel.add_node(Embedding(index_size, vector_size, ),name='embedding', input='index') 250 | else: 251 | kerasmodel.add_node(Embedding(index_size, vector_size, weights=[word_vectors]),name='embedding', input='index') 252 | if hidden_vectors is None: 253 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size ),name='embedpoint', input='point') 254 | else: 255 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size,weights=[hidden_vectors]),name='embedpoint', input='point') 256 | 257 | if cbow_mean: 258 | kerasmodel.add_node(Lambda(lambda x:x.mean(1),output_shape=(vector_size,)),name='average',input='embedding') 259 | else: 260 | kerasmodel.add_node(Lambda(lambda x:x.sum(1) ,output_shape=(vector_size,)),name='average',input='embedding') 261 | 262 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid',inputs=['average','embedpoint'], merge_mode='dot',dot_axes=-1) 263 | kerasmodel.add_output(name='code',input='sigmoid') 264 | 265 | 266 | if score_vectors is None: 267 | kerasmodel.add_node(Embedding(context_size, score_vector_size*vector_size, input_length=sub_batch_size, ),name='embedscore', input='point') 268 | else: 269 | kerasmodel.add_node(Embedding(context_size, score_vector_size*vector_size, input_length=sub_batch_size,weights=[score_vectors]),name='embedscore', input='point') 270 | kerasmodel.add_node(Reshape((sub_batch_size,score_vector_size,vector_size,)) , name='score1',input='embedscore') 271 | #kerasmodel.add_node(Reshape((sub_batch_size,score_vector_size,vector_size,)) , name='scorenode',input='embedscore') 272 | 273 | 274 | ## kerasmodel.add_node(Flatten(), name='index1',input='average') 275 | kerasmodel.add_node(RepeatVector(score_vector_size*sub_batch_size), name='index2',input='average') 276 | kerasmodel.add_node(Reshape((score_vector_size,sub_batch_size,vector_size,)) , name='index3',input='index2') 277 | kerasmodel.add_node(Permute((2,1,3,)) , name='index4',input='index3') 278 | #kerasmodel.add_node(Permute((2,1,3,)) , name='scorenode',input='index3') 279 | 280 | kerasmodel.add_node(Lambda(lambda x:x.sum(-1)) , name='scorenode',inputs=['score1','index4'], merge_mode='mul') 281 | 282 | kerasmodel.add_output(name='score',input='scorenode') 283 | 284 | kerasmodel.compile('rmsprop', {'code':'mse','score':'mse'}) 285 | 
return kerasmodel 286 | 287 | 288 | 289 | class ScoreWord2VecKeras(gensim.models.word2vec.Word2Vec): 290 | 291 | def scan_vocab(self, scored_word_sentences, progress_per=10000, trim_rule=None): 292 | scored_word_sentences1, scored_word_sentences2 =itertools.tee(scored_word_sentences) 293 | 294 | sentences=( 295 | [ 296 | #[scored_word2word(scored_word),scored_word2score(scored_word)] 297 | scored_word2word(scored_word) 298 | for scored_word in scored_word_sentence ] 299 | for scored_word_sentence in scored_word_sentences1) 300 | super(ScoreWord2VecKeras, self).scan_vocab(sentences, progress_per, trim_rule) 301 | 302 | score_vec0=scored_word2score(scored_word_sentences2.next()) 303 | self.score_vector_size=len(score_vec0[1]) 304 | 305 | 306 | def train(self, scored_word_sentences, 307 | learn_doctags=True, learn_words=True, learn_hidden=True,iter=None, 308 | batch_size=128 #128, #512 #256 309 | ,sub_batch_size=128 #16 #32 #128 #128 #256 #128 #512 #256 #1 310 | #total_words=None, word_count=0, 311 | #chunksize=800, 312 | #total_examples=None, queue_factor=2, report_delay=1 313 | ): 314 | train_prepossess(self) 315 | vocab_size=len(self.vocab) 316 | #batch_size=800 ##optimized 1G mem video card 317 | #batch_size=chunksize 318 | samples_per_epoch=int(self.window*2*sum(map(len,scored_word_sentences))) 319 | #print 'samples_per_epoch',samples_per_epoch 320 | if self.sg: 321 | #print 'sg',self.keras_context_index_size,sub_batch_size 322 | self.kerasmodel =build_keras_model_score_word_sg(index_size=vocab_size, 323 | vector_size=self.vector_size, 324 | #vocab_size=vocab_size, 325 | #code_dim=vocab_size, 326 | context_size=self.keras_context_index_size, 327 | score_vector_size=self.score_vector_size, 328 | sub_batch_size=sub_batch_size, 329 | model=self, 330 | word_vectors=self.syn0, 331 | hidden_vectors=self.keras_syn1, 332 | ) 333 | 334 | gen=train_batch_score_sg(self, scored_word_sentences, #self.alpha, work=None, 335 | score_vector_size=self.score_vector_size, 336 | sub_batch_size=sub_batch_size, 337 | batch_size=batch_size) 338 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter,verbose=0) 339 | else: 340 | self.kerasmodel=build_keras_model_score_word_cbow(index_size=vocab_size,vector_size=self.vector_size, 341 | # vocab_size=vocab_size, 342 | # code_dim=vocab_size, 343 | context_size=self.keras_context_index_size, 344 | score_vector_size=self.score_vector_size, 345 | sub_batch_size=1,#sub_batch_size, 346 | model=self, 347 | cbow_mean=self.cbow_mean, 348 | word_vectors=self.syn0, 349 | hidden_vectors=self.keras_syn1, 350 | ) 351 | 352 | #wv0=copy.copy(self.kerasmodel.nodes['embedding'].get_weights()[0][0]) 353 | gen=train_batch_score_cbow(self, scored_word_sentences, self.alpha, work=None,batch_size=batch_size) 354 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter,verbose=0) 355 | 356 | self.syn0=self.kerasmodel.nodes['embedding'].get_weights()[0] 357 | if self.negative>0 and self.hs : 358 | syn1tmp=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 359 | self.syn1=syn1tmp[0:len(self.vocab)] 360 | self.syn1neg=syn1tmp[len(self.vocab):2*len(self.vocab)] 361 | elif self.hs: 362 | self.syn1=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 363 | else: 364 | self.syn1neg=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 365 | 366 | 367 | 368 | 369 | 370 | 371 | class LineScoredWordSentence(object): 372 | def __init__(self, source,score_fn, max_sentence_length=10000, limit=None): 373 | self.source = source 374 
| self.score_fn=score_fn 375 | self.max_sentence_length = max_sentence_length 376 | self.limit = limit 377 | 378 | def __iter__(self): 379 | try: 380 | self.source.seek(0) 381 | for line in itertools.islice(self.source, self.limit): 382 | line = gensim.utils.to_unicode(line).split() 383 | i = 0 384 | while i < len(line): 385 | yield [[w,self.score_fn(w)] for w in line[i : i + self.max_sentence_length]] 386 | i += self.max_sentence_length 387 | except AttributeError: 388 | with gensim.utils.smart_open(self.source) as fin: 389 | for line in itertools.islice(fin, self.limit): 390 | line = gensim.utils.to_unicode(line).split() 391 | i = 0 392 | while i < len(line): 393 | yield [[w,self.score_fn(w)] for w in line[i : i + self.max_sentence_length]] 394 | i += self.max_sentence_length 395 | 396 | class ScoredListSentence(object): 397 | def __init__(self, words_list,score_fn): 398 | """ 399 | words_list like: 400 | words_list = [ 401 | ['human', 'interface', 'computer'], 402 | ['survey', 'user', 'computer', 'system', 'response', 'time'], 403 | ['eps', 'user', 'interface', 'system'], 404 | ] 405 | sentence = LabeledListSentence(words_list) 406 | """ 407 | self.words_list = words_list 408 | self.score_fn=score_fn 409 | 410 | def __getitem__(self, index): 411 | t = [t for t in self] 412 | return t[index] 413 | def __iter__(self): 414 | for i, words in enumerate(self.words_list): 415 | #yield LabeledSentence(words, ['SENT_{0}'.format(i)]) 416 | #yield gensim.models.doc2vec.TaggedDocument(words, [i]) 417 | yield [[w,self.score_fn(w)] for w in words] 418 | 419 | 420 | 421 | 422 | if __name__ == "__main__": 423 | 424 | import doctest 425 | doctest.testmod() 426 | 427 | input_file = 'test.txt' 428 | 429 | scales=[1.0,1.0,1.0] 430 | 431 | def dummy_score_vec(word): 432 | return [len(word)*scales[0],ord(word[0])*scales[1],ord(word[-1])*scales[1]] 433 | #return [len(word)/0.2 ] 434 | 435 | sws=list(LineScoredWordSentence(input_file,dummy_score_vec)) 436 | #print sws[0] 437 | 438 | from word2veckeras import Word2VecKeras 439 | 440 | parameters = [{'size':[5],'hs':[0,1],'negative':[0,5],'sg':[0,1] }] 441 | from sklearn.grid_search import ParameterGrid 442 | for param in ParameterGrid(parameters): 443 | if (param['hs']==0 and param['negative']==0) : 444 | continue 445 | print param 446 | svk=ScoreWord2VecKeras(sws,**param) 447 | vsk = Word2VecKeras(gensim.models.word2vec.LineSentence(input_file),**param) 448 | vs = gensim.models.word2vec.Word2Vec(gensim.models.word2vec.LineSentence(input_file),**param) 449 | print( svk.most_similar('the', topn=5)) 450 | print( vsk.most_similar('the', topn=5)) 451 | print( vs.most_similar('the', topn=5)) 452 | print(svk['the']) 453 | print(vsk['the']) 454 | print(vs['the']) 455 | 456 | # #svk.save_word2vec_format('tmp.vec') 457 | # #svk.save('tmp.model') 458 | 459 | #print svk.score_vector_size 460 | 461 | scored_word_list=[ 462 | ['This',[20*0.1,10*0.2]], 463 | ['is',[10*0.1,5*0.2]], 464 | ['a',[30*0.1,10*0.2]], 465 | ['pen',[10*0.1,5*0.2]], 466 | ['.',[3*0.1,5*0.2]], 467 | ] 468 | 469 | scored_word_list=[scored_word_list]*100 470 | #print scored_word_list 471 | svk2=ScoreWord2VecKeras(scored_word_list,iter=3) 472 | print(svk2.most_similar('a',topn=5)) 473 | #svk1.save('tmp.vec') 474 | #svk2.save_word2vec_format('tmp2.vec') 475 | 476 | 477 | -------------------------------------------------------------------------------- /word2veckeras/doc2veckeras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: 
utf-8 -*- 3 | 4 | # Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html 5 | 6 | import math 7 | import copy 8 | from Queue import Queue 9 | 10 | from numpy import zeros, random, sum as np_sum, add as np_add, concatenate, \ 11 | repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \ 12 | sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide 13 | 14 | import gensim.models.word2vec 15 | import gensim.models.doc2vec 16 | 17 | from six.moves import xrange, zip 18 | from six import string_types, integer_types, itervalues 19 | 20 | import sys 21 | import random 22 | 23 | import numpy as np 24 | import operator 25 | 26 | import keras.constraints 27 | 28 | from keras.utils.np_utils import accuracy 29 | from keras.models import Graph,Sequential 30 | from keras.layers.core import Dense, Dropout, Activation, Merge, Flatten, LambdaMerge,Lambda 31 | from keras.layers.embeddings import Embedding 32 | from keras.optimizers import SGD 33 | from keras.objectives import mse 34 | 35 | from sklearn.base import BaseEstimator,RegressorMixin, ClassifierMixin 36 | from sklearn.linear_model import LogisticRegression,LogisticRegressionCV 37 | 38 | from word2veckeras import train_sg_pair,train_cbow_pair,queue_to_list,train_prepossess 39 | 40 | 41 | def train_batch_dbow(model, 42 | docs, 43 | sub_batch_size=256,batch_size=256 44 | ): 45 | batch_count=0 46 | sub_batch_count=0 47 | train_x0 =np.zeros((batch_size,sub_batch_size),dtype='int32') 48 | train_x1 =np.zeros((batch_size,sub_batch_size),dtype='int32') 49 | train_y =np.zeros((batch_size,sub_batch_size),dtype='int8') 50 | while 1: 51 | for doc in docs: 52 | for doctag_index in doc.tags: 53 | for word in doc.words: 54 | xy_gen=train_sg_pair(model, word, doctag_index,) 55 | for xy in xy_gen : 56 | if xy !=None: 57 | (x0,x1,y)=xy 58 | train_x0[batch_count][sub_batch_count]=x0 59 | train_x1[batch_count][sub_batch_count]=x1 60 | train_y[batch_count][sub_batch_count]=y 61 | sub_batch_count += 1 62 | if sub_batch_count >= sub_batch_size : 63 | batch_count += 1 64 | sub_batch_count=0 65 | if batch_count >= batch_size : 66 | yield { 'index':train_x0, 'point':train_x1, 'code':train_y} 67 | batch_count=0 68 | 69 | def train_batch_dm_xy_generator(model, docs): 70 | for doc in docs: 71 | indexed_doctags = model.docvecs.indexed_doctags(doc.tags) 72 | doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags 73 | 74 | word_vocabs = [model.vocab[w] for w in doc.words if w in model.vocab and 75 | model.vocab[w].sample_int > model.random.rand() * 2**32] 76 | for pos, word in enumerate(word_vocabs): 77 | reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code 78 | start = max(0, pos - model.window + reduced_window) 79 | window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) 80 | word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos] 81 | 82 | xy_gen=train_cbow_pair(model, word, word2_indexes) 83 | x2=doctag_indexes 84 | for xy in xy_gen: 85 | if xy !=None: 86 | yield [xy[0],x2,xy[1],xy[2]] 87 | 88 | def train_batch_dm(model, docs,batch_size=100,sub_batch_size=1,): 89 | w_len_queue_dict={} 90 | w_len_queue=[] 91 | while 1: 92 | for xy in train_batch_dm_xy_generator(model,docs): 93 | if xy != None: 94 | w_len=len(xy[0]) 95 | if w_len>0: 96 | if w_len not in w_len_queue_dict: 97 | w_len_queue_dict[w_len]=Queue() 98 | w_len_queue.append(w_len) 99 | w_len_queue_dict[w_len].put(xy) 100 | for w_len 
in w_len_queue: 101 | if w_len_queue_dict[w_len].qsize() >= batch_size : 102 | l=queue_to_list(w_len_queue_dict[w_len],batch_size) 103 | train=[[e[i] for e in l] for i in range(4)] 104 | yield {'iword':np.array(train[0]), 105 | 'index':np.array(train[1]), 106 | 'point':np.array(train[2]), 107 | 'code':np.array(train[3])} 108 | 109 | def train_document_dm_concat_xy_generator(model, docs): 110 | for doc in docs: 111 | indexed_doctags = model.docvecs.indexed_doctags(doc.tags) 112 | doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags 113 | 114 | word_vocabs = [model.vocab[w] for w in doc.words if w in model.vocab and 115 | model.vocab[w].sample_int > model.random.rand() * 2**32] 116 | null_word = model.vocab['\0'] 117 | pre_pad_count = model.window 118 | post_pad_count = model.window 119 | padded_document_indexes = ( 120 | (pre_pad_count * [null_word.index]) # pre-padding 121 | + [word.index for word in word_vocabs if word is not None] # elide out-of-Vocabulary words 122 | + (post_pad_count * [null_word.index]) # post-padding 123 | ) 124 | 125 | for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count): 126 | word_context_indexes = ( 127 | padded_document_indexes[(pos - pre_pad_count): pos] # preceding words 128 | + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words 129 | ) 130 | predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]] 131 | xy_gen = train_cbow_pair(model, predict_word, word_context_indexes) 132 | for xy in xy_gen: 133 | if xy !=None: 134 | x2=doctag_indexes 135 | xy1=[xy[0],x2,xy[1],xy[2]] 136 | yield xy1 137 | 138 | def train_document_dm_concat(model, docs,batch_size=100): 139 | batch_count=0 140 | train_t=[0]*batch_size 141 | 142 | while 1: 143 | for xy in train_document_dm_concat_xy_generator(model,docs): 144 | train_t[batch_count]=xy 145 | batch_count += 1 146 | if batch_count >= batch_size : 147 | train=[[train_t[j][i] for j in range(batch_size) ] for i in range(4)] 148 | yield {'iword':np.array(train[0]), 149 | 'index':np.array(train[1]), 150 | 'point':np.array(train[2]), 151 | 'code':np.array(train[3])} 152 | batch_count=0 153 | 154 | 155 | def build_keras_model_dbow(index_size,vector_size, 156 | #vocab_size, 157 | context_size, 158 | sub_batch_size=1, 159 | doctag_vectors=None, 160 | hidden_vectors=None, 161 | learn_doctags=True, 162 | learn_hidden=True, 163 | model=None, 164 | ): 165 | 166 | """ 167 | >>> index_size=3 168 | >>> vector_size=2 169 | >>> context_siz=3 170 | >>> sub_batch_size=2 171 | >>> doctag_vectors=np.array([[-1.1,2.2],[-3.2,-4.3],[-1.1,-1.4]],'float32') 172 | >>> hidden_vectors=np.array([[-1,2],[3,4],[5,6]],'float32') 173 | >>> kerasmodel=build_keras_model_dbow(index_size=3,vector_size=2,context_size=3,sub_batch_size=2,doctag_vectors=doctag_vectors,hidden_vectors=hidden_vectors) 174 | >>> ind=[[0,1],[1,0]] 175 | >>> ipt=[[0,1],[1,2]] 176 | >>> tmp1=kerasmodel.predict({'index':np.array(ind),'point':np.array(ipt)})['code'] 177 | >>> tmp2=np.array([np.sum(doctag_vectors[ind[i]]*hidden_vectors[ipt[i]], axis=1) for i in range(2)]) 178 | >>> np.linalg.norm(1/(1+np.exp(-tmp2))-tmp1) < 0.001 179 | True 180 | """ 181 | 182 | kerasmodel = Graph() 183 | kerasmodel.add_input(name='point' , input_shape=(sub_batch_size,), dtype=int) 184 | kerasmodel.add_input(name='index' , input_shape=(sub_batch_size,), dtype=int) 185 | if hidden_vectors is None : 186 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size, ),name='embedpoint', input='point') 
187 | else: 188 | kerasmodel.add_node(Embedding(context_size, vector_size, input_length=sub_batch_size, weights=[hidden_vectors]),name='embedpoint', input='point') 189 | if doctag_vectors is None : 190 | kerasmodel.add_node(Embedding(index_size , vector_size, input_length=sub_batch_size, ),name='embedindex' , input='index') 191 | else: 192 | kerasmodel.add_node(Embedding(index_size , vector_size, input_length=sub_batch_size, weights=[doctag_vectors]),name='embedindex' , input='index') 193 | kerasmodel.add_node(Lambda(lambda x:x.sum(2)) , name='merge',inputs=['embedindex','embedpoint'], merge_mode='mul') 194 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid', input='merge') 195 | kerasmodel.add_output(name='code',input='sigmoid') 196 | kerasmodel.compile('rmsprop', {'code':'mse'}) 197 | return kerasmodel 198 | 199 | 200 | def build_keras_model_dm(index_size,vector_size,vocab_size, 201 | context_size, 202 | maxwords, 203 | cbow_mean=False, 204 | learn_doctags=True, learn_words=True, learn_hidden=True, 205 | model=None , 206 | word_vectors=None,doctag_vectors=None,hidden_vectors=None, 207 | sub_batch_size=1 208 | ): 209 | """ 210 | >>> word_vectors=np.array([[1,2],[3,4],[5,6]]) 211 | >>> doctag_vectors=np.array([[10,20],[30,40]]) 212 | >>> hidden_vectors=np.array([[1,0],[0,1]]) 213 | >>> sub_batch_size=2 214 | >>> kerasmodel=build_keras_model_dm(index_size=2,vector_size=2,vocab_size=3,context_size=2,maxwords=2,sub_batch_size=sub_batch_size,word_vectors=word_vectors,doctag_vectors=doctag_vectors,hidden_vectors=hidden_vectors, learn_words=True ) 215 | >>> ind=[[0],[1]] 216 | >>> iwd=[[1,0],[1,1]] 217 | >>> ipt=[[1,0],[0,1]] 218 | >>> tmp1=kerasmodel.predict({'index':np.array(ind),'iword':np.array(iwd),'point':np.array(ipt)})['code'] 219 | >>> tmp2=np.array([ [(word_vectors[iwd[i]].sum(0)+doctag_vectors[i]).dot(hidden_vectors[j]) for j in ipt[i] ] for i in range(2)]) 220 | >>> np.linalg.norm(1/(1+np.exp(-tmp2))-tmp1) < 0.001 221 | True 222 | """ 223 | kerasmodel = Graph() 224 | 225 | kerasmodel.add_input(name='index',input_shape=(1,) , dtype=int) 226 | if doctag_vectors is None : 227 | kerasmodel.add_node(Embedding(index_size, vector_size,trainable=learn_doctags,input_length=1 ),name='embedindex', input='index') 228 | else: 229 | kerasmodel.add_node(Embedding(index_size, vector_size,trainable=learn_doctags,input_length=1 ,weights=[doctag_vectors]),name='embedindex', input='index') 230 | kerasmodel.add_input(name='iword',input_shape=(maxwords,), dtype=int) 231 | 232 | if word_vectors is None : 233 | kerasmodel.add_node(Embedding(vocab_size, vector_size,trainable=learn_words ,input_length=maxwords ),name='embedword', input='iword') 234 | else: 235 | kerasmodel.add_node(Embedding(vocab_size, vector_size,trainable=learn_words ,input_length=maxwords,weights=[word_vectors ]),name='embedword', input='iword') 236 | 237 | kerasmodel.add_input(name='point',input_shape=(sub_batch_size,) , dtype=int) 238 | if hidden_vectors is None : 239 | kerasmodel.add_node(Embedding(context_size, vector_size,trainable=learn_hidden ,input_length=sub_batch_size ),name='embedpoint', input='point') 240 | else: 241 | kerasmodel.add_node(Embedding(context_size, vector_size,trainable=learn_hidden ,input_length=sub_batch_size ,weights=[hidden_vectors]),name='embedpoint', input='point') 242 | 243 | if cbow_mean: 244 | kerasmodel.add_node(Lambda(lambda x:x.mean(1),output_shape=(vector_size,)), name='merge',inputs=['embedindex','embedword'], merge_mode='concat', concat_axis=1) 245 | else: 246 | 
kerasmodel.add_node(Lambda(lambda x:x.sum(1),output_shape=(vector_size,)), name='merge',inputs=['embedindex','embedword'], merge_mode='concat', concat_axis=1) 247 | 248 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid',inputs=['merge','embedpoint'], merge_mode='dot',dot_axes=-1) 249 | kerasmodel.add_output(name='code',input='sigmoid') 250 | kerasmodel.compile('rmsprop', {'code':'mse'}) 251 | 252 | return kerasmodel 253 | 254 | 255 | def build_keras_model_dm_concat(index_size,vector_size,vocab_size, 256 | #code_dim, 257 | context_size, 258 | window_size, 259 | learn_doctags=True, learn_words=True, learn_hidden=True, 260 | model=None , 261 | word_vectors=None,doctag_vectors=None,hidden_vectors=None 262 | ): 263 | """ 264 | >>> syn0=np.array([[1,-2],[-1,2],[2,-2]],'float32') 265 | >>> word_vectors=syn0 266 | >>> syn1=np.array([[-1,2,1,-5,4,1,-2,3,-4,5],[3,4,-4,1,-2,6,-7,8,9,1],[5,-6,-8,7,6,-1,2,-3,4,5]],'float32') 267 | >>> hidden_vectors=syn1 268 | >>> doctag_vectors=np.array([[-1.1,2.2],[-3.2,-4.3],[-1.1,-1.4]],'float32') 269 | >>> kerasmodel=build_keras_model_dm_concat(index_size=3,vector_size=2,vocab_size=3,context_size=3,window_size=2,word_vectors=word_vectors,doctag_vectors=doctag_vectors,hidden_vectors=hidden_vectors) 270 | >>> ind=[[0],[1]] 271 | >>> iwd=[[0,0,1,2],[1,1,2,0]] 272 | >>> ipt=[[0],[1]] 273 | >>> tmp1=kerasmodel.predict({'index':np.array(ind),'iword':np.array(iwd),'point':np.array(ipt)})['code'] 274 | >>> tmp2=np.array([[np.vstack((doctag_vectors[ind[i]],word_vectors[iwd[i]])).flatten().dot(hidden_vectors[j]) for j in ipt[i] ] for i in range(2)]) 275 | >>> np.linalg.norm(1/(1+np.exp(-tmp2))-tmp1) < 0.001 276 | True 277 | """ 278 | 279 | kerasmodel = Graph() 280 | kerasmodel.add_input(name='iword' , input_shape=(1,), dtype=int) 281 | kerasmodel.add_input(name='index' , input_shape=(1,), dtype=int) 282 | if word_vectors is None: 283 | kerasmodel.add_node(Embedding(vocab_size, vector_size,input_length=2*window_size,trainable=learn_words,),name='embedword', input='iword') 284 | else: 285 | kerasmodel.add_node(Embedding(vocab_size, vector_size,input_length=2*window_size,trainable=learn_words,weights=[word_vectors]),name='embedword', input='iword') 286 | if doctag_vectors is None: 287 | kerasmodel.add_node(Embedding(index_size, vector_size,input_length=1,trainable=learn_doctags,), name='embedindex', input='index') 288 | else: 289 | kerasmodel.add_node(Embedding(index_size, vector_size,input_length=1,trainable=learn_doctags,weights=[doctag_vectors]), name='embedindex', input='index') 290 | 291 | kerasmodel.add_input(name='point',input_shape=(1,) , dtype=int) 292 | if hidden_vectors is None: 293 | kerasmodel.add_node(Embedding(context_size, (2*window_size+1)*vector_size,input_length=1, trainable=learn_hidden,),name='embedpoint', input='point') 294 | else: 295 | kerasmodel.add_node(Embedding(context_size, (2*window_size+1)*vector_size,input_length=1, trainable=learn_hidden,weights=[hidden_vectors]),name='embedpoint', input='point') 296 | 297 | kerasmodel.add_node(Flatten(),name='merge',inputs=['embedindex','embedword'],merge_mode='concat', concat_axis=1) 298 | kerasmodel.add_node(Activation('sigmoid'), name='sigmoid',inputs=['merge','embedpoint'], merge_mode='dot',dot_axes=-1) 299 | kerasmodel.add_output(name='code',input='sigmoid') 300 | kerasmodel.compile('rmsprop', {'code':'mse'}) 301 | return kerasmodel 302 | 303 | 304 | class Doc2VecKeras(gensim.models.doc2vec.Doc2Vec): 305 | def train(self, docs=None, 306 | #batch_size=800, 307 | learn_doctags=True, 
learn_words=True, learn_hidden=True,iter=None, 308 | batch_size=128 #128, #512 #256 309 | ,sub_batch_size=128 #16 #32 #128 #128 #256 #128 #512 #256 #1 310 | ): 311 | 312 | if iter!=None: 313 | self.iter=iter 314 | if docs==None: 315 | docs=self.docvecs 316 | 317 | train_prepossess(self) 318 | 319 | # if self.negative>0 and self.hs : 320 | # self.keras_context_negative_base_index=len(self.vocab) 321 | # self.keras_context_index_size=len(self.vocab)*2 322 | # self.keras_syn1=np.vstack((self.syn1,self.syn1neg)) 323 | # else: 324 | # self.keras_context_negative_base_index=0 325 | # self.keras_context_index_size=len(self.vocab) 326 | # if self.hs : 327 | # self.keras_syn1=self.syn1 328 | # else: 329 | # self.keras_syn1=self.syn1neg 330 | 331 | # self.neg_labels = [] 332 | # if self.negative > 0: 333 | # # precompute negative labels optimization for pure-python training 334 | # self.neg_labels = np.zeros(self.negative + 1,dtype='int8') 335 | # self.neg_labels[0] = 1 336 | 337 | 338 | # word_context_size_max=0 339 | # if self.hs : 340 | # word_context_size_max += max(len(self.vocab[w].point) for w in self.vocab if hasattr(self.vocab[w],'point')) 341 | # if self.negative > 0: 342 | # word_context_size_max += self.negative + 1 343 | 344 | vocab_size=len(self.vocab) 345 | index_size=len(self.docvecs) 346 | 347 | 348 | self.batch_size=batch_size 349 | batch_size=batch_size 350 | if self.sg: 351 | samples_per_epoch=max(1,int((self.word_context_size_max*self.window*2*sum(map(len,docs)))/(sub_batch_size))) 352 | self.kerasmodel=build_keras_model_dbow(index_size=index_size,vector_size=self.vector_size, 353 | context_size=self.keras_context_index_size, 354 | model=self, 355 | learn_doctags=learn_doctags, 356 | learn_hidden=learn_hidden, 357 | hidden_vectors=self.keras_syn1, 358 | doctag_vectors=self.docvecs.doctag_syn0, 359 | sub_batch_size=sub_batch_size 360 | ) 361 | gen=train_batch_dbow(self, docs, sub_batch_size=sub_batch_size,batch_size=batch_size) 362 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter,verbose=0) 363 | else: 364 | if self.dm_concat: 365 | samples_per_epoch=int(self.word_context_size_max*sum(map(len,docs))) 366 | self.kerasmodel=build_keras_model_dm_concat(index_size,self.vector_size,vocab_size, 367 | context_size=self.keras_context_index_size, 368 | window_size=self.window, 369 | model=self, 370 | learn_doctags=learn_doctags, learn_words=learn_words, learn_hidden=learn_hidden, 371 | word_vectors=self.syn0, 372 | hidden_vectors=self.keras_syn1, 373 | doctag_vectors=self.docvecs.doctag_syn0 374 | ) 375 | gen= train_document_dm_concat(self, docs, batch_size=batch_size) 376 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter, verbose=0) 377 | self.syn0=self.kerasmodel.nodes['embedword'].get_weights()[0] 378 | else: 379 | samples_per_epoch=int(self.word_context_size_max*sum(map(len,docs))) 380 | self.kerasmodel=build_keras_model_dm(index_size,self.vector_size,vocab_size, 381 | self.keras_context_index_size, 382 | maxwords=self.window*2+1, 383 | model=self, 384 | learn_doctags=learn_doctags, learn_words=learn_words, learn_hidden=learn_hidden, 385 | word_vectors=self.syn0, 386 | doctag_vectors=self.docvecs.doctag_syn0, 387 | hidden_vectors=self.keras_syn1, 388 | cbow_mean=self.cbow_mean 389 | ) 390 | 391 | gen=train_batch_dm(self, docs, batch_size=batch_size) 392 | self.kerasmodel.fit_generator(gen,samples_per_epoch=samples_per_epoch, nb_epoch=self.iter,verbose=0) 393 | 
self.syn0=self.kerasmodel.nodes['embedword'].get_weights()[0] 394 | self.docvecs.doctag_syn0=self.kerasmodel.nodes['embedindex'].get_weights()[0] 395 | if self.negative>0 and self.hs : 396 | syn1tmp=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 397 | self.syn1=syn1tmp[0:len(self.vocab)] 398 | self.syn1neg=syn1tmp[len(self.vocab):2*len(self.vocab)] 399 | elif self.hs: 400 | self.syn1=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 401 | else: 402 | self.syn1neg=self.kerasmodel.nodes['embedpoint'].get_weights()[0] 403 | 404 | # def infer_vector_keras(self, doc_words, steps=10): 405 | # vocab_size=len(self.vocab) 406 | # docs=LabeledListSentence([doc_words]) 407 | # batch_size=5 408 | # #batch_size=10 409 | # #batch_size=100 410 | # #batch_size=1000 411 | # samples_per_epoch=int(self.window*2*sum(map(len,docs))) 412 | 413 | # count_max= steps * samples_per_epoch/batch_size +steps 414 | # #print 'count_max',count_max 415 | # # print self.kerasmodel_infer.nodes['embedword'].get_weights() 416 | # # print self.kerasmodel_infer.nodes[ 'sigmoid'].get_weights() 417 | 418 | # doctag_vectors = empty((1, self.vector_size), dtype=REAL) 419 | # doctag_vectors[0] = self.seeded_vector(' '.join(doc_words)) 420 | # doctag_locks = ones(1, dtype=REAL) 421 | # doctag_indexes = [0] 422 | # #self.kerasmodel_infer.nodes.get_weights() 423 | # self.kerasmodel_infer.nodes['embedindex'].set_weights([ doctag_vectors]) 424 | # count =0 425 | # if self.sg: 426 | # for g in train_batch_dbow(self, docs, self.alpha, batch_size=batch_size): 427 | # self.kerasmodel_infer.fit(g, nb_epoch=1, verbose=0) 428 | # count +=1 429 | # if count > count_max: 430 | # break 431 | # elif self.dm_concat: 432 | # for g in train_document_dm_concat(self, docs, batch_size=batch_size): 433 | # self.kerasmodel_infer.fit(g, nb_epoch=1, verbose=0) 434 | # count +=1 435 | # if count > count_max: 436 | # break 437 | # else: 438 | # for g in train_batch_dm(self, docs, batch_size=batch_size): 439 | # self.kerasmodel_infer.fit(g, nb_epoch=1, verbose=0) 440 | # count +=1 441 | # if count > count_max: 442 | # break 443 | # vecs=self.kerasmodel_infer.nodes['embedindex'].get_weights()[0] 444 | # return vecs[0] 445 | 446 | 447 | def train_with_word2vec_instance(self,docs,w2v,dm=None, **kwargs): 448 | if self.dm_concat and not w2v.null_word : 449 | raise ValueError("self.dm_concat=1 need Word2Vec(null_word=1)") #KeyError ? 
450 | 451 | if dm == None : 452 | #if not self.dm_concat: 453 | self.sg = w2v.sg 454 | else: 455 | self.sg=(1+dm) % 2 456 | 457 | self.window = w2v.window 458 | self.min_count =w2v.min_count 459 | self.sample =w2v.sample 460 | self.cbow_mean=w2v.cbow_mean 461 | 462 | self.negative = w2v.negative 463 | self.hs=w2v.hs 464 | 465 | #self.alpha = w2v.alpha 466 | 467 | self.vector_size=w2v.vector_size 468 | if not self.dm_concat: 469 | self.layer1_size= w2v.layer1_size 470 | 471 | 472 | self.raw_vocab=w2v.raw_vocab 473 | self.index2word=w2v.index2word 474 | self.sorted_vocab = w2v.sorted_vocab 475 | 476 | self.vocab=w2v.vocab 477 | 478 | self.max_vocab_size = w2v.max_vocab_size 479 | 480 | 481 | #self.build_vocab(docs) 482 | for document_no, document in enumerate(docs): 483 | document_length = len(document.words) 484 | for tag in document.tags: 485 | self.docvecs.note_doctag(tag, document_no, document_length) 486 | self.reset_weights() 487 | 488 | 489 | self.syn0=w2v.syn0 490 | 491 | if w2v.hs: 492 | if not self.dm_concat: 493 | self.syn1=w2v.syn1 494 | if w2v.negative: 495 | if not self.dm_concat: 496 | self.syn1neg=w2v.syn1neg 497 | self.cum_table=w2v.cum_table 498 | 499 | 500 | self.train(docs,**kwargs) 501 | #self.train(docs,learn_words=learn_words,**kwargs) 502 | 503 | 504 | class LabeledListSentence(object): 505 | def __init__(self, words_list): 506 | """ 507 | words_list like: 508 | words_list = [ 509 | ['human', 'interface', 'computer'], 510 | ['survey', 'user', 'computer', 'system', 'response', 'time'], 511 | ['eps', 'user', 'interface', 'system'], 512 | ] 513 | sentence = LabeledListSentence(words_list) 514 | """ 515 | self.words_list = words_list 516 | 517 | def __getitem__(self, index): 518 | t = [t for t in self] 519 | return t[index] 520 | def __iter__(self): 521 | for i, words in enumerate(self.words_list): 522 | #yield LabeledSentence(words, ['SENT_{0}'.format(i)]) 523 | yield gensim.models.doc2vec.TaggedDocument(words, [i]) 524 | 525 | class SentenceClassifier(BaseEstimator, ClassifierMixin): 526 | def __init__(self, 527 | sents_shuffle=False, 528 | doc2vec=gensim.models.doc2vec.Doc2Vec() 529 | ): 530 | argdict= locals() 531 | argdict.pop('argdict',None) 532 | argdict.pop('self',None) 533 | vars(self).update(argdict) 534 | #print argdict 535 | 536 | def fit(self, X, y): 537 | self.sents_train=X 538 | self.Y_train=y 539 | return self 540 | 541 | def doc2vec_set(self,all_docs): 542 | #print 'doc2vec_set,SentenceClassifier' 543 | if hasattr(self.doc2vec, 'syn0'): 544 | self.doc2vec.reset_weights() 545 | #del self.doc2vec.syn0 546 | delattr(self.doc2vec, 'syn0') 547 | self.doc2vec.build_vocab(all_docs) 548 | self.doc2vec.train(all_docs) 549 | 550 | def predict(self,X): 551 | self.sents_test=X 552 | self.sents_all=self.sents_train + self.sents_test 553 | 554 | if self.sents_shuffle : 555 | s_indexs=range(len(self.sents_all)) 556 | random.shuffle(s_indexs) 557 | s_invers_indexs=range(len(s_indexs)) 558 | for n in range(len(s_indexs)): 559 | s_invers_indexs[s_indexs[n]]=n 560 | sents_all=[self.sents_all[n] for n in s_indexs] 561 | else: 562 | sents_all=self.sents_all 563 | all_docs = list(LabeledListSentence(self.sents_all)) 564 | 565 | self.doc2vec_set(all_docs) 566 | #print 'size',self.doc2vec.vector_size 567 | 568 | self.X_train= [self.doc2vec.infer_vector(s) for s in self.sents_train] 569 | self.X_test= [self.doc2vec.infer_vector(s) for s in self.sents_test] 570 | self.logistic =LogisticRegressionCV(class_weight='balanced')#,n_jobs=-1) 571 | 
self.logistic.fit(self.X_train,self.Y_train) 572 | Y_test_predict=self.logistic.predict(self.X_test) 573 | return Y_test_predict 574 | 575 | doc2vec_init_param_dict={ 576 | #'sents_shuffle': False, 577 | 'comment': None, 578 | 'dm': 1, 'dm_mean': 0, 'hs': 1, 'sample': 0, 'seed': 1, 'dbow_words': 0, 'dm_concat': 0, 579 | 'min_count': 5, 'max_vocab_size': None, 'alpha': 0.025, 'dm_tag_count': 1, 580 | 'docvecs_mapfile': None, 581 | 'size': 300, 582 | 'documents': None, 'trim_rule': None, 'workers': 1, 'negative': 0, 'docvecs': None, 'window': 8, 583 | #'kwargs': {}, 584 | 'min_alpha': 0.0001, 585 | 'iter':1 586 | } 587 | 588 | class Doc2VecClassifier(SentenceClassifier): 589 | def __init__(self, 590 | sents_shuffle=False, 591 | documents=None, size=300, alpha=0.025, window=8, min_count=5, 592 | max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001, 593 | dm=1, hs=1, negative=0, dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1, 594 | docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, 595 | iter=1, 596 | #**kwargs 597 | ): 598 | argdict= locals() 599 | argdict.pop('argdict',None) 600 | argdict.pop('self',None) 601 | vars(self).update(doc2vec_init_param_dict) 602 | vars(self).update(argdict) 603 | 604 | def doc2vec_set(self,all_docs): 605 | #print 'doc2vec_set,Doc2VecClassifier' 606 | doc2vec_init_dict={k:vars(self)[k] for k in vars(self).keys() if k in doc2vec_init_param_dict} 607 | doc2vec_init_dict['documents']=all_docs 608 | #print doc2vec_init_dict 609 | self.doc2vec=gensim.models.doc2vec.Doc2Vec(**doc2vec_init_dict) 610 | 611 | 612 | if __name__ == "__main__": 613 | import doctest 614 | doctest.testmod() 615 | 616 | 617 | input_file = 'test.txt' 618 | doc1=gensim.models.doc2vec.TaggedLineDocument(input_file) 619 | for d in doc1: 620 | doc_words1=d.words 621 | break; 622 | d_size=5 623 | dv1=Doc2VecKeras( doc1,size=d_size,dm=0,dm_concat=1,hs=0,negative=5,iter=1) 624 | dvk1=gensim.models.doc2vec.Doc2Vec(doc1,size=d_size,dm=0,dm_concat=1,hs=0,negative=5,iter=1) 625 | 626 | print dv1.docvecs[0] 627 | print dvk1.docvecs[0] 628 | 629 | --------------------------------------------------------------------------------
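
Supplementary note (not part of the repository): the `__main__` blocks above only exercise `ScoreWord2VecKeras` and `Doc2VecKeras` directly, while `Doc2VecKeras.train_with_word2vec_instance` has no demo. The following is a minimal, untested sketch of how that method might be driven from a pre-trained `Word2VecKeras` model. The module import paths are assumed from the package layout, `test.txt` is the small corpus shipped with the examples, and the hyperparameter values (`size=5`, `iter=3`, `null_word=1`) are placeholders rather than recommendations.

# Hypothetical usage sketch, assuming the import paths below match the package layout.
import gensim
from word2veckeras.word2veckeras import Word2VecKeras
from word2veckeras.doc2veckeras import Doc2VecKeras

input_file = 'test.txt'

# Train word vectors first; null_word=1 keeps the model reusable if dm_concat
# is switched on later (train_with_word2vec_instance raises otherwise).
w2v = Word2VecKeras(gensim.models.word2vec.LineSentence(input_file),
                    size=5, iter=3, null_word=1)

# Tag the same corpus line by line and let Doc2VecKeras copy the vocabulary,
# window/hs/negative settings and syn0/syn1 weights from the trained w2v
# before training the document vectors.
docs = gensim.models.doc2vec.TaggedLineDocument(input_file)
d2v = Doc2VecKeras()
d2v.train_with_word2vec_instance(docs, w2v, iter=3)

print(d2v.docvecs[0])
print(d2v.most_similar('the', topn=5))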