├── README ├── utils.py ├── simpleexpe.py ├── evaluation.py ├── model.py ├── expeWAKA.py └── datatools.py
/README:
--------------------------------------------------------------------------------
1 | ** Currently, you need the following version of theano to use the sparse implementation:
2 | https://bitbucket.org/ynd/theano/overview
3 |
4 | #-----------------------------------------------------------------------------------------
5 | SCRIPT DESCRIPTIONS:
6 | #-----------------------------------------------------------------------------------------
7 | ** model.py contains all the tools to build, train and evaluate a model.
8 |
9 | ** datatools.py shows how to build the necessary data files (sparse matrices, dictionaries) from
10 | the nlu/ data.
11 |
12 | ** simpleexpe.py is a simplified version of the training script (only using WordNet3.0 data);
13 | you can directly change the hard-coded parameters and run the code in the WakaBST folder using
14 | one of the two following commands:
15 |
16 | THEANO_FLAGS=floatX=float32,device=gpu python simpleexpe.py
17 | (for gpu use)
18 | or
19 | THEANO_FLAGS=floatX=float32,device=cpu python simpleexpe.py
20 | (for cpu use)
21 |
22 |
23 | ** utils.py is a simple script to load a model and create ranking lists.
24 |
25 | THEANO_FLAGS=floatX=float32,device=[cpu/gpu] python utils.py
26 |
27 |
28 | ** evaluation.py shows how to run the different evaluation procedures (mean rank, WSD, etc.):
29 |
30 | THEANO_FLAGS=floatX=float32,device=[cpu/gpu] python evaluation.py resultfolder idjob evalnumber
31 |
32 | ** expeWAKA.py (sorry for the headache...) is the training script over all the different training sets,
33 | using the jobman interface.
34 |
35 | #------------------------------------------------------------------------------------------
36 | Contact: myusername ..at..
iro.umontreal.ca
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import scipy.sparse
2 | import cPickle
3 | import os
4 | import sys
5 | from model import *
6 |
7 | operator = 'quad'
8 | ndim = 50
9 | nbatches = 50
10 | lrparam = 0.1
11 | lremb = 0.001
12 | nbtest = 100
13 | testall = 5
14 | savepath = 'expe50'
15 | simfnstr = 'dot'
16 | listconcept = [['__u.s._NN_1','__army_NN_1'],['__brain_NN_1'], ['__auto_NN_1'],['__cat_NN_1']]
17 | listrel = [['__eat_VB_1'],['__drive_VB_1'],['__defend_VB_1']]
18 | nbrank = 10
19 | loadmodel = '/mnt/scratch/bengio/glorotxa/data/exp/glorotxa_db/wakabst/24/model.pkl'
20 |
21 |
22 | synset2lemme = cPickle.load(open('synset2lemme.pkl','r'))
23 | lemme2synset = cPickle.load(open('lemme2synset.pkl','r'))
24 | lemme2freq = cPickle.load(open('lemme2freq.pkl','r'))
25 | synset2idx = cPickle.load(open('synset2idx.pkl','r'))
26 | idx2synset = cPickle.load(open('idx2synset.pkl','r'))
27 | lemme2idx = cPickle.load(open('lemme2idx.pkl','r'))
28 | idx2lemme = cPickle.load(open('idx2lemme.pkl','r'))
29 | synset2neg = cPickle.load(open('synset2neg.pkl','r'))
30 | synset2def = cPickle.load(open('synset2def.pkl','r'))
31 | synset2concept = cPickle.load(open('synset2concept.pkl','r'))
32 | concept2synset = cPickle.load(open('concept2synset.pkl','r'))
33 |
34 | f = open(loadmodel)
35 | embeddings = cPickle.load(f)
36 | leftop = cPickle.load(f)
37 | rightop = cPickle.load(f)
38 |
39 | simfn = eval(simfnstr+'sim')
40 |
41 | # similarity functions (one per queried member, see model.py)
42 | sl = SimilarityFunctionleftl(simfn,embeddings,leftop,rightop)
43 | sr = SimilarityFunctionrightl(simfn,embeddings,leftop,rightop)
44 | so = SimilarityFunctionrell(simfn,embeddings,leftop,rightop)
45 | leftopid = Id()
46 | rightopid = Id()
47 | Esim = SimilarityFunctionrightl(L2sim,embeddings,leftopid,rightopid)
48 |
49 | txt = ''
50 | for cc in listconcept:
51 |     txt+='\n'
52 |     txt += getnclosest(nbrank, idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset, Esim, cc, [], typ = 0, emb = True)
53 |     for rr in listrel:
54 |         txt+='\n'
55 |         txt += getnclosest(nbrank,idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset , sl, cc, rr, typ = 1, emb = False)
56 |         txt+='\n'
57 |         txt += getnclosest(nbrank,idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset , sr, cc, rr, typ = 2, emb = False)
58 |     for rr in listconcept:
59 |         txt+='\n'
60 |         txt += getnclosest(nbrank,idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset , so, cc, rr, typ = 3, emb = False)
61 |
62 | print txt
63 |
--------------------------------------------------------------------------------
/simpleexpe.py:
--------------------------------------------------------------------------------
1 | import scipy.sparse
2 | import cPickle
3 | import os
4 | import sys
5 | from model import *
6 |
7 | train = 'train'
8 |
9 | operator = 'quad' # operator type ('quad' = quadratic)
10 | ndim = 50 # embeddings size
11 | nbatches = 10 # number of batches to create out of the whole training set
12 | lrparam = 1 # learning rate of the model parameters
13 | lremb = 0.01 # learning rate of the embeddings
14 | nbtest = 100 # perform ranking over this number of test samples
15 | testall = 10 # number of epochs between two evaluations
16 | savepath = 'simpleexpesave' # name of the folder created to save the model and the ranking lists
17 | simfnstr = 'dot' # the similarity scoring function
18 | listconcept = [['__brain_NN_1'], ['__france_NN_1'], ['__auto_NN_1']] # the lhs and rhs to use for the creation of the ranking lists
19 | listrel = [['_has_part'],['_part_of']] # the relations to use to create the ranking lists
20 | nbrank = 30 # how many words to display when creating the ranking lists
21 | warp = False # maximum number of resampling attempts for the warp update (if False, no warp)
22 | loadmodel = False # path to a model.pkl to load, or False
23 | datpath = '/data/lisa/exp/glorotxa/WakaBST4/' # data path to load from
24 |
25 | print >> sys.stderr, 'train set : ', train
26 | print >> sys.stderr, 'operator : ', operator
27 | print >> sys.stderr, 'ndim : ', ndim
28 | print >> sys.stderr, 'nbbatches : ', nbatches
29 | print >> sys.stderr, 'lrparam : ', lrparam
30 | print >> sys.stderr, 'lremb : ', lremb
31 | print >> sys.stderr, 'nbtest : ', nbtest
32 | print >> sys.stderr, 'testall : ', testall
33 | print >> sys.stderr, 'savepath : ', savepath
34 | print >> sys.stderr, 'simfnstr : ', simfnstr
35 | print >> sys.stderr, 'listconcept : ', listconcept
36 | print >> sys.stderr, 'listrel : ', listrel
37 | print >> sys.stderr, 'nbrank : ', nbrank
38 | print >> sys.stderr, 'warp : ', warp
39 | print >> sys.stderr, 'loadmodel : ', loadmodel
40 |
41 |
42 | if savepath not in os.listdir('.'):
43 |     os.mkdir(savepath)
44 |
45 |
46 | def warpsampling(fft,posl,posr,poso,posln,posrn,poson,N):
47 |     # This simple function does the warp sampling in an inefficient manner.
48 |     # (take the batch, do the forward pass, resample the negative elements associated with cost = 0, re-do the forward pass... etc., up to N times)
49 |     # the resampling is done by shuffling the negative index matrices.
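    # A sketch only (assuming fft was built with ForwardFunction from
    # model.py, so it returns one binary "margin cost > 0" vector per
    # corrupted member); the loop amounts to:
    #     while fewer than N attempts and some triplet still has cost == 0:
    #         outl, outr, outo = fft(posl, posr, poso, posln, posrn, poson)
    #         for the triplets whose cost is 0, shuffle the corresponding
    #         columns of posln / posrn / poson to draw new negative samples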
50 | # fft is the forward function (returning the cost>0 vector) 51 | count_sample = 0 52 | nbup = 0 53 | while count_sample> sys.stderr, '------ Epoch ', ct 221 | print >> sys.stderr, numpy.mean(left+right), numpy.std(left+right),numpy.mean(left),numpy.std(left),numpy.mean(right), numpy.std(right) 222 | print >> sys.stderr, numpy.mean(leftb+rightb), numpy.std(leftb+rightb),numpy.mean(leftb),numpy.std(leftb),numpy.mean(rightb), numpy.std(rightb) 223 | txt = '' 224 | txt += '%s %s %s %s %s %s\n'%(numpy.mean(left+right), numpy.std(left+right),numpy.mean(left),numpy.std(left),numpy.mean(right), numpy.std(right)) 225 | txt += '%s %s %s %s %s %s\n'%(numpy.mean(leftb+rightb), numpy.std(leftb+rightb),numpy.mean(leftb),numpy.std(leftb),numpy.mean(rightb), numpy.std(rightb)) 226 | left = [] 227 | right = [] 228 | leftb = [] 229 | rightb = [] 230 | rel = [] 231 | relb = [] 232 | result = calctestval(sl,sr,idxtl[:nbtest],idxtr[:nbtest],idxto[:nbtest]) 233 | txt += str(result)+'\n' 234 | for cc in listconcept: 235 | txt+='\n' 236 | txt += getnclosest(nbrank, idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset, Esim, cc, [], typ = 0, emb = True) 237 | for rr in listrel: 238 | txt+='\n' 239 | txt += getnclosest(nbrank,idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset , sll, cc, rr, typ = 1, emb = False) 240 | txt+='\n' 241 | txt += getnclosest(nbrank,idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset , srl, cc, rr, typ = 2, emb = False) 242 | for rr in listconcept: 243 | txt +='\n' 244 | txt += getnclosest(nbrank,idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset , srl, cc, rr, typ = 3, emb = False) 245 | f = open(savepath+'/model.pkl','w') 246 | cPickle.dump(embeddings,f,-1) 247 | cPickle.dump(leftop,f,-1) 248 | cPickle.dump(rightop,f,-1) 249 | if simfnstr == 'MLP': 250 | cPickle.dump(MLPout,f,-1) 251 | f.close() 252 | f = open(savepath+'/currentrel.txt','w') 253 | f.write(txt) 254 | f.close() 255 | print result 256 | 257 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse 2 | import cPickle 3 | import os 4 | import sys 5 | from model import * 6 | 7 | 8 | # create or use a folder containing the results with the name: 9 | name = sys.argv[1] 10 | 11 | # take the job id of the experiment folder 12 | id = int(sys.argv[2]) 13 | 14 | # do the evaluation given by the number (0 for all) 15 | evaln = int(sys.argv[3]) 16 | 17 | 18 | def normafunc(x): 19 | return (x-numpy.min(x))/sum((x-numpy.min(x))) 20 | 21 | def softmaxfunc(x): 22 | return numpy.exp(x-numpy.max(x))/sum(numpy.exp(x-numpy.max(x))) 23 | 24 | print name,id,evaln 25 | 26 | try: 27 | os.mkdir(name) 28 | except: 29 | pass 30 | 31 | datpath ='/data/lisa/exp/glorotxa/WakaBST4/' 32 | 33 | synset2idx = cPickle.load(open(datpath+'synset2idx.pkl','r')) 34 | lemme2idx = cPickle.load(open(datpath+'lemme2idx.pkl','r')) 35 | loadmodel = '/data/lisa/exp/glorotxa/WakaBST4/evaluationsave/%s/model.pkl'%id 36 | 37 | 38 | f = open(loadmodel) 39 | embeddings = cPickle.load(f) 40 | leftop = cPickle.load(f) 41 | rightop = cPickle.load(f) 42 | simfn = eval('dotsim') 43 | try: 44 | MLPout = cPickle.load(f) 45 | simfn = MLPout 46 | except: 47 | simfn = eval('dotsim') 48 | 49 | 
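# The WSD blocks below (evaln == 2, 4 and 5) all apply the same decision
# rule: the model scores of the candidate senses of one ambiguous word are
# either used raw, or mapped to probabilities (linearly with normafunc, or
# with softmaxfunc) and multiplied by the empirical sense frequencies, and
# the predicted sense is the argmax (the dumped lists store 1 for an error,
# 0 for a hit). A minimal illustrative helper (hypothetical, not called
# anywhere below):
def wsd_predict(scores, freqs):
    # scores: model scores of the candidate senses of one word
    # freqs: empirical sense frequencies (prior), same length as scores
    soft_p = softmaxfunc(numpy.asarray(scores))
    return numpy.argmax(soft_p * numpy.asarray(freqs))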
#---------------------------------------------------------------------------------------------------- 50 | if evaln == 1 or evaln == 0: 51 | srl = SimilarityFunctionrightl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 52 | sll = SimilarityFunctionleftl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 53 | sol = SimilarityFunctionrell(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 54 | 55 | posl = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-lhs.pkl')),dtype='float32') 56 | posr = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-rhs.pkl')),dtype='float32') 57 | poso = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-rel.pkl')),dtype='float32') 58 | poslc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-corres-lhs.pkl')),dtype='float32') 59 | posrc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-corres-rhs.pkl')),dtype='float32') 60 | posoc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-corres-rel.pkl')),dtype='float32') 61 | 62 | nbtest = 5000 63 | 64 | llX , relX , rrX= calctestscore4(sll,srl,sol,posl[:,:nbtest],posr[:,:nbtest],poso[:,:nbtest],poslc[:,:nbtest],posrc[:,:nbtest],posoc[:,:nbtest]) 65 | f = open(name +'/' + name + '_XWNrank.pkl','w') 66 | cPickle.dump(llX,f,-1) 67 | cPickle.dump(relX,f,-1) 68 | cPickle.dump(rrX,f,-1) 69 | 70 | #---------------------------------------------------------------------------------------------------- 71 | #if evaln == 10 or evaln == 0: 72 | # srl = SimilarityFunctionrightl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 73 | # sll = SimilarityFunctionleftl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 74 | # sol = SimilarityFunctionrell(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 75 | # 76 | # posl = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-lhs.pkl')),dtype='float32') 77 | # posr = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-rhs.pkl')),dtype='float32') 78 | # poso = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-rel.pkl')),dtype='float32') 79 | # poslc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-mod-lhs.pkl')),dtype='float32') 80 | # posrc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-mod-rhs.pkl')),dtype='float32') 81 | # posoc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-mod-rel.pkl')),dtype='float32') 82 | # 83 | # nbtest = 5000 84 | # 85 | # llX , relX , rrX= calctestscore4(sll,srl,sol,posl[:,:nbtest],posr[:,:nbtest],poso[:,:nbtest],poslc[:,:nbtest],posrc[:,:nbtest],posoc[:,:nbtest]) 86 | # f = open(name +'/' + name + '_XWNmodrank.pkl','w') 87 | # cPickle.dump(llX,f,-1) 88 | # cPickle.dump(relX,f,-1) 89 | # cPickle.dump(rrX,f,-1) 90 | 91 | #---------------------------------------------------------------------------------------------------- 92 | #if evaln == 11 or evaln == 0: 93 | # srl = SimilarityFunctionrightl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 94 | # sll = SimilarityFunctionleftl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 95 | # sol = SimilarityFunctionrell(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 96 | # 97 | # posl = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-lhs.pkl')),dtype='float32') 98 | # posr = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-rhs.pkl')),dtype='float32') 99 | # poso = 
scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-lemme-rel.pkl')),dtype='float32') 100 | # poslc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-nmod-lhs.pkl')),dtype='float32') 101 | # posrc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-nmod-rhs.pkl')),dtype='float32') 102 | # posoc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'XWN-nmod-rel.pkl')),dtype='float32') 103 | # 104 | # nbtest = 5000 105 | # 106 | # llX , relX , rrX= calctestscore4(sll,srl,sol,posl[:,:nbtest],posr[:,:nbtest],poso[:,:nbtest],poslc[:,:nbtest],posrc[:,:nbtest],posoc[:,:nbtest]) 107 | # f = open(name +'/' + name + '_XWNnmodrank.pkl','w') 108 | # cPickle.dump(llX,f,-1) 109 | # cPickle.dump(relX,f,-1) 110 | # cPickle.dump(rrX,f,-1) 111 | 112 | 113 | #---------------------------------------------------------------------------------------------------- 114 | if evaln == 2 or evaln == 0: 115 | modelpred = {} 116 | nmodelpred = {} 117 | posl = (cPickle.load(open(datpath+'XWN-WSD-lhs.pkl'))).tocsr() 118 | posr = (cPickle.load(open(datpath+'XWN-WSD-rhs.pkl'))).tocsr() 119 | poso = (cPickle.load(open(datpath+'XWN-WSD-rel.pkl'))).tocsr() 120 | dicto = cPickle.load(open(datpath+'XWN-WSD-dict.pkl')) 121 | lab = cPickle.load(open(datpath+'XWN-WSD-lab.pkl')) 122 | freq = cPickle.load(open(datpath+'XWN-WSD-freq.pkl')) 123 | simifunc = BatchSimilarityFunction(simfn,embeddings,leftop,rightop) 124 | listrank = (simifunc(posl,posr,poso)[0]).flatten() 125 | modelvX = [] 126 | linvX = [] 127 | softvX = [] 128 | for idx,i in enumerate(dicto.keys()): 129 | listtmp = listrank[dicto[i][0]:dicto[i][1]] 130 | labtmp = lab[dicto[i][0]:dicto[i][1]] 131 | assert sum(labtmp)==1 132 | freqtmp = freq[dicto[i][0]:dicto[i][1]] 133 | lin_p = normafunc(listtmp) 134 | soft_p = softmaxfunc(listtmp) 135 | lintmp = lin_p * numpy.asarray(freqtmp) 136 | softtmp = soft_p * numpy.asarray(freqtmp) 137 | if numpy.argsort(listtmp)[-1] != numpy.argsort(labtmp)[-1]: 138 | modelvX +=[1] 139 | else: 140 | modelvX +=[0] 141 | if numpy.argsort(lintmp)[-1] != numpy.argsort(labtmp)[-1]: 142 | linvX +=[1] 143 | else: 144 | linvX +=[0] 145 | if numpy.argsort(softtmp)[-1] != numpy.argsort(labtmp)[-1]: 146 | softvX +=[1] 147 | else: 148 | softvX += [0] 149 | modelpred.update({i:numpy.argsort(softtmp)[-1]}) 150 | bbtr = True 151 | for zz in numpy.argsort(softtmp): 152 | if zz != numpy.argsort(softtmp)[-1] and zz != numpy.argsort(labtmp)[-1]: 153 | bbtr = False 154 | gt = zz 155 | if bbtr: 156 | nmodelpred.update({i:numpy.argsort(labtmp)[-1]}) 157 | else: 158 | nmodelpred.update({i:gt}) 159 | f = open(name +'/' + name + '_XWN-WSD.pkl','w') 160 | g = open(name +'/'+'modelpred.pkl','w') 161 | h = open(name +'/'+'nmodelpred.pkl','w') 162 | cPickle.dump(modelvX,f,-1) 163 | cPickle.dump(linvX,f,-1) 164 | cPickle.dump(softvX,f,-1) 165 | cPickle.dump(modelpred,g,-1) 166 | cPickle.dump(nmodelpred,h,-1) 167 | h.close() 168 | f.close() 169 | g.close() 170 | 171 | #---------------------------------------------------------------------------------------------------- 172 | if evaln == 3 or evaln == 0: 173 | srl = SimilarityFunctionrightl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 174 | sll = SimilarityFunctionleftl(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 175 | sol = SimilarityFunctionrell(simfn,embeddings,leftop,rightop,numpy.max(synset2idx.values())+1,True) 176 | posl = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'Brown-lemme-lhs.pkl')),dtype='float32') 177 | posr = 
scipy.sparse.csr_matrix(cPickle.load(open(datpath+'Brown-lemme-rhs.pkl')),dtype='float32') 178 | poso = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'Brown-lemme-rel.pkl')),dtype='float32') 179 | poslc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'Brown-corres-lhs.pkl')),dtype='float32') 180 | posrc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'Brown-corres-rhs.pkl')),dtype='float32') 181 | posoc = scipy.sparse.csr_matrix(cPickle.load(open(datpath+'Brown-corres-rel.pkl')),dtype='float32') 182 | 183 | nbtest = 5000 184 | 185 | llB , relB , rrB= calctestscore4(sll,srl,sol,posl[:,:nbtest],posr[:,:nbtest],poso[:,:nbtest],poslc[:,:nbtest],posrc[:,:nbtest],posoc[:,:nbtest]) 186 | f = open(name +'/' + name + '_Brownrank.pkl','w') 187 | cPickle.dump(llB,f,-1) 188 | cPickle.dump(relB,f,-1) 189 | cPickle.dump(rrB,f,-1) 190 | 191 | 192 | #---------------------------------------------------------------------------------------------------- 193 | if evaln == 4 or evaln == 0: 194 | posl = (cPickle.load(open(datpath+'Brown-WSD-lhs.pkl'))).tocsr() 195 | posr = (cPickle.load(open(datpath+'Brown-WSD-rhs.pkl'))).tocsr() 196 | poso = (cPickle.load(open(datpath+'Brown-WSD-rel.pkl'))).tocsr() 197 | dicto = cPickle.load(open(datpath+'Brown-WSD-dict.pkl')) 198 | lab = cPickle.load(open(datpath+'Brown-WSD-lab.pkl')) 199 | freq = cPickle.load(open(datpath+'Brown-WSD-freq.pkl')) 200 | simifunc = BatchSimilarityFunction(simfn,embeddings,leftop,rightop) 201 | listrank = (simifunc(posl,posr,poso)[0]).flatten() 202 | modelvB = [] 203 | linvB = [] 204 | softvB = [] 205 | for idx,i in enumerate(dicto.keys()): 206 | listtmp = listrank[dicto[i][0]:dicto[i][1]] 207 | labtmp = lab[dicto[i][0]:dicto[i][1]] 208 | assert sum(labtmp)==1 209 | freqtmp = freq[dicto[i][0]:dicto[i][1]] 210 | lin_p = normafunc(listtmp) 211 | soft_p = softmaxfunc(listtmp) 212 | lintmp = lin_p * numpy.asarray(freqtmp) 213 | softtmp = soft_p * numpy.asarray(freqtmp) 214 | if numpy.argsort(listtmp)[-1] != numpy.argsort(labtmp)[-1]: 215 | modelvB +=[1] 216 | else: 217 | modelvB +=[0] 218 | if numpy.argsort(lintmp)[-1] != numpy.argsort(labtmp)[-1]: 219 | linvB +=[1] 220 | else: 221 | linvB +=[0] 222 | if numpy.argsort(softtmp)[-1] != numpy.argsort(labtmp)[-1]: 223 | softvB +=[1] 224 | else: 225 | softvB += [0] 226 | f = open(name +'/' + name + '_Brown-WSD.pkl','w') 227 | cPickle.dump(modelvB,f,-1) 228 | cPickle.dump(linvB,f,-1) 229 | cPickle.dump(softvB,f,-1) 230 | 231 | #---------------------------------------------------------------------------------------------------- 232 | if evaln == 5 or evaln == 0: 233 | posl = (cPickle.load(open(datpath+'Senseval3-WSD-lhs.pkl'))).tocsr() 234 | posr = (cPickle.load(open(datpath+'Senseval3-WSD-rhs.pkl'))).tocsr() 235 | poso = (cPickle.load(open(datpath+'Senseval3-WSD-rel.pkl'))).tocsr() 236 | dicto = cPickle.load(open(datpath+'Senseval3-WSD-dict.pkl')) 237 | lab = cPickle.load(open(datpath+'Senseval3-WSD-lab.pkl')) 238 | freq = cPickle.load(open(datpath+'Senseval3-WSD-freq.pkl')) 239 | simifunc = BatchSimilarityFunction(simfn,embeddings,leftop,rightop) 240 | listrank = (simifunc(posl,posr,poso)[0]).flatten() 241 | modelvX = [] 242 | linvX = [] 243 | softvX = [] 244 | for idx,i in enumerate(dicto.keys()): 245 | listtmp = listrank[dicto[i][0]:dicto[i][1]] 246 | labtmp = lab[dicto[i][0]:dicto[i][1]] 247 | assert sum(labtmp)==1 248 | freqtmp = freq[dicto[i][0]:dicto[i][1]] 249 | lin_p = normafunc(listtmp) 250 | soft_p = softmaxfunc(listtmp) 251 | lintmp = lin_p * numpy.asarray(freqtmp) 252 | softtmp = 
soft_p * numpy.asarray(freqtmp) 253 | if numpy.argsort(listtmp)[-1] != numpy.argsort(labtmp)[-1]: 254 | modelvX +=[1] 255 | else: 256 | modelvX +=[0] 257 | if numpy.argsort(lintmp)[-1] != numpy.argsort(labtmp)[-1]: 258 | linvX +=[1] 259 | else: 260 | linvX +=[0] 261 | if numpy.argsort(softtmp)[-1] != numpy.argsort(labtmp)[-1]: 262 | softvX +=[1] 263 | else: 264 | softvX += [0] 265 | f = open(name +'/' + name + '_Senseval3-WSD.pkl','w') 266 | cPickle.dump(modelvX,f,-1) 267 | cPickle.dump(linvX,f,-1) 268 | cPickle.dump(softvX,f,-1) 269 | 270 | 271 | #---------------------------------------------------------------------------------------------------- 272 | if evaln == 6 or evaln == 0: 273 | datpath = '' 274 | 275 | # valid set 276 | WNvall = (cPickle.load(open(datpath+'WordNet3.0-val-lhs.pkl','r'))).tocsr() 277 | WNvalr = (cPickle.load(open(datpath+'WordNet3.0-val-rhs.pkl','r'))).tocsr() 278 | WNvalo = (cPickle.load(open(datpath+'WordNet3.0-val-rel.pkl','r'))).tocsr() 279 | 280 | # test set 281 | WNtestl = (cPickle.load(open(datpath+'WordNet3.0-test-lhs.pkl','r'))).tocsr() 282 | WNtestr = (cPickle.load(open(datpath+'WordNet3.0-test-rhs.pkl','r'))).tocsr() 283 | WNtesto = (cPickle.load(open(datpath+'WordNet3.0-test-rel.pkl','r'))).tocsr() 284 | 285 | rows,cols = WNtestl.nonzero() 286 | idxtl = rows[numpy.argsort(cols)] 287 | rows,cols = WNtestr.nonzero() 288 | idxtr = rows[numpy.argsort(cols)] 289 | rows,cols = WNtesto.nonzero() 290 | idxto = rows[numpy.argsort(cols)] 291 | 292 | rows,cols = WNvall.nonzero() 293 | idxvl = rows[numpy.argsort(cols)] 294 | rows,cols = WNvalr.nonzero() 295 | idxvr = rows[numpy.argsort(cols)] 296 | rows,cols = WNvalo.nonzero() 297 | idxvo = rows[numpy.argsort(cols)] 298 | 299 | sl = SimilarityFunctionleft(simfn,embeddings,leftop,rightop,subtensorspec = numpy.max(synset2idx.values())+1) 300 | sr = SimilarityFunctionright(simfn,embeddings,leftop,rightop,subtensorspec = numpy.max(synset2idx.values())+1) 301 | 302 | errlval,errrval = calctestval2(sl,sr,idxtl,idxtr,idxto) 303 | errltes,errrtes = calctestval2(sl,sr,idxvl,idxvr,idxvo) 304 | 305 | f = open(name +'/' + name + '_WN-rank.pkl','w') 306 | cPickle.dump(errlval,f,-1) 307 | cPickle.dump(errrval,f,-1) 308 | cPickle.dump(errltes,f,-1) 309 | cPickle.dump(errrtes,f,-1) 310 | 311 | 312 | 313 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy 4 | import cPickle 5 | 6 | # Similarity functions ---------------------------- 7 | def L1sim(left,right): 8 | return -T.sum(T.sqrt(T.sqr(left-right)),axis=1) 9 | 10 | def L2sim(left,right): 11 | return -T.sqrt(T.sum(T.sqr(left-right),axis=1)) 12 | 13 | def dotsim(left,right): 14 | return T.sum(left*right,axis=1) 15 | 16 | # ------------------------------------------------- 17 | 18 | # Costs ------------------------------------------- 19 | def margincost(pos,neg,marge=1.0): 20 | out = neg - pos + marge 21 | return T.sum(out * (out>0)),out>0 22 | 23 | def validcost(pos,neg): 24 | # Corresponds to the error without marge. 25 | out = neg - pos 26 | return T.sum(out * (out>0)),out>0, T.sum(out * (out<0)) 27 | 28 | # ------------------------------------------------- 29 | 30 | # Activation functions ---------------------------- 31 | def htanh(x): 32 | return -1. * (x<-1.) + x * (x<1.) * (x>=-1.) + 1. * (x>=1) 33 | 34 | def hsigm(x): 35 | return x * (x<1) * (x>0) + 1. 
* (x>=1) 36 | 37 | def rect(x): 38 | return x*(x>0) 39 | 40 | def sigm(x): 41 | return T.nnet.sigmoid(x) 42 | 43 | def tanh(x): 44 | return T.tanh(x) 45 | 46 | def lin(x): 47 | return x 48 | 49 | # ------------------------------------------------- 50 | 51 | 52 | # Layers ------------------------------------------ 53 | class Layer(object): 54 | def __init__(self, rng, act, n_inp, n_out, Winit = None, tag=''): 55 | self.act = eval(act) 56 | self.actstr = act 57 | self.n_inp = n_inp 58 | self.n_out = n_out 59 | # init param 60 | if Winit == None: 61 | wbound = numpy.sqrt(6./(n_inp+n_out)) 62 | W_values = numpy.asarray( rng.uniform( low = -wbound, high = wbound, \ 63 | size = (n_inp, n_out)), dtype = theano.config.floatX) 64 | self.W = theano.shared(value = W_values, name = 'W'+tag) 65 | else: 66 | self.W = theano.shared(value = Winit, name = 'W'+tag) 67 | self.params = [self.W] 68 | def __call__(self,x): 69 | return self.act(T.dot(x, self.W)) 70 | def save(self,path): 71 | f = open(path,'w') 72 | cPickle.dump(self,f,-1) 73 | f.close() 74 | 75 | class Layercomb(object): 76 | def __init__(self, rng, act, n_inp1, n_inp2 , n_out, W1init = None, W2init = None, binit = None): 77 | self.act = eval(act) 78 | self.actstr = act 79 | self.n_inp1 = n_inp1 80 | self.n_inp2 = n_inp2 81 | self.n_out = n_out 82 | self.layer1 = Layer(rng, 'lin', n_inp1, n_out, Winit = W1init, tag = '1') 83 | self.layer2 = Layer(rng, 'lin', n_inp2, n_out, Winit = W2init, tag = '2') 84 | if binit == None: 85 | b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) 86 | self.b = theano.shared(value= b_values, name = 'b') 87 | else: 88 | self.b = theano.shared(value = binit, name = 'b') 89 | self.params = self.layer1.params + self.layer2.params + [self.b] 90 | def __call__(self,x,y): 91 | return self.act(self.layer1(x) + self.layer2(y) + self.b) 92 | def save(self,path): 93 | f = open(path,'w') 94 | cPickle.dump(self,f,-1) 95 | f.close() 96 | 97 | 98 | class MLP(object): 99 | def __init__(self, rng, act, n_inp1, n_inp2, n_hid, n_out, W1init = None, W2init = None, b12init = None, W3init = None, b3init = None): 100 | self.act = eval(act) 101 | self.actstr = act 102 | self.n_inp1 = n_inp1 103 | self.n_inp2 = n_inp2 104 | self.n_hid = n_hid 105 | self.n_out = n_out 106 | self.layer12 = Layercomb(rng, act, n_inp1, n_inp2, n_hid, W1init = W1init, W2init = W2init, binit = b12init) 107 | self.layer3 = Layer(rng, 'lin', n_hid, n_out, Winit = W3init, tag = '3') 108 | if b3init == None: 109 | b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) 110 | self.b = theano.shared(value= b_values, name = 'b') 111 | else: 112 | self.b = theano.shared(value = b3init, name = 'b') 113 | self.params = self.layer12.params + self.layer3.params + [self.b] 114 | def __call__(self,x,y): 115 | return self.layer3(self.layer12(x,y)) + self.b 116 | def save(self,path): 117 | f = open(path,'w') 118 | cPickle.dump(self,f,-1) 119 | f.close() 120 | 121 | class Quadlayer(object): 122 | def __init__(self, rng, n_inp1, n_inp2, n_hid, n_out, W1init = None, b1init = None, W2init = None, b2init = None, W3init = None, b3init = None): 123 | self.n_inp1 = n_inp1 124 | self.n_inp2 = n_inp2 125 | self.n_hid = n_hid 126 | self.n_out = n_out 127 | if W1init == None: 128 | wbound = numpy.sqrt(6./(n_inp1+n_hid)) 129 | W_values = numpy.asarray( rng.uniform( low = -wbound, high = wbound, \ 130 | size = (n_inp1, n_hid)), dtype = theano.config.floatX) 131 | self.W1 = theano.shared(value = W_values, name = 'W1') 132 | else: 133 | self.W1 = theano.shared(value = W1init, 
name = 'W1') 134 | if b1init == None: 135 | b_values = numpy.zeros((n_hid,), dtype= theano.config.floatX) 136 | self.b1 = theano.shared(value= b_values, name = 'b1') 137 | else: 138 | self.b1 = theano.shared(value = b1init, name = 'b1') 139 | if W2init == None: 140 | wbound = numpy.sqrt(6./(n_inp2+n_hid)) 141 | W_values = numpy.asarray( rng.uniform( low = -wbound, high = wbound, \ 142 | size = (n_inp2, n_hid)), dtype = theano.config.floatX) 143 | self.W2 = theano.shared(value = W_values, name = 'W2') 144 | else: 145 | self.W2 = theano.shared(value = W2init, name = 'W2') 146 | if b2init == None: 147 | b_values = numpy.zeros((n_hid,), dtype= theano.config.floatX) 148 | self.b2 = theano.shared(value= b_values, name = 'b2') 149 | else: 150 | self.b2 = theano.shared(value = b2init, name = 'b2') 151 | if W3init == None: 152 | wbound = numpy.sqrt(6./(n_hid+n_out)) 153 | W_values = numpy.asarray( rng.uniform( low = -wbound, high = wbound, \ 154 | size = (n_hid, n_out)), dtype = theano.config.floatX) 155 | self.W3 = theano.shared(value = W_values, name = 'W3') 156 | else: 157 | self.W3 = theano.shared(value = W3init, name = 'W3') 158 | if b3init == None: 159 | b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) 160 | self.b3 = theano.shared(value= b_values, name = 'b3') 161 | else: 162 | self.b3 = theano.shared(value = b3init, name = 'b3') 163 | self.params = [self.W1,self.b1,self.W2,self.b2,self.W3,self.b3] 164 | def __call__(self,x,y): 165 | return T.dot((T.dot(x,self.W1) + self.b1) * (T.dot(y,self.W2) + self.b2), self.W3 ) + self.b3 166 | def save(self,path): 167 | f = open(path,'w') 168 | cPickle.dump(self,f,-1) 169 | f.close() 170 | 171 | class Id(object): 172 | def __init__(self): 173 | self.params = [] 174 | def __call__(self,x,y): 175 | return x 176 | def save(self,path): 177 | pass 178 | 179 | class Embedd(object): 180 | def __init__(self,rng,N,D,Einit = None): 181 | self.N = N 182 | self.D = D 183 | if Einit == None: 184 | wbound = numpy.sqrt(6) 185 | W_values = numpy.asarray( rng.uniform( low = -wbound, high = wbound, \ 186 | size = (D, N)), dtype = theano.config.floatX) 187 | self.E = theano.shared(value = W_values/numpy.sqrt(numpy.sum(W_values * W_values,axis=0)), name = 'E') 188 | self.updates = {self.E:self.E/T.sqrt(T.sum(self.E * self.E,axis=0))} 189 | self.norma = theano.function([],[],updates = self.updates) 190 | def normalize(self): 191 | self.norma() 192 | 193 | 194 | # --------------------------------------- 195 | 196 | 197 | def SimilarityFunctionl(fnsim,embeddings,leftop,rightop): 198 | # Creation of scoring function on sparse matrices lhs,rel,rhs. 
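    # The score of a triplet (lhs, rel, rhs) is
    #     fnsim( leftop(dot(E,lhs), dot(E,rel)), rightop(dot(E,rhs), dot(E,rel)) )
    # where E is the D x N embedding matrix and lhs, rel, rhs are sparse
    # column vectors (one-hot, or mean-pooled bags of words) over the N
    # entities. With the 'quad' operator, leftop and rightop are Quadlayer
    # instances computing dot((dot(x,W1) + b1) * (dot(y,W2) + b2), W3) + b3.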
199 | idxrel = theano.sparse.csr_matrix('idxrel') 200 | idxright = theano.sparse.csr_matrix('idxright') 201 | idxleft = theano.sparse.csr_matrix('idxleft') 202 | lhs = (theano.sparse.dot(embeddings.E,idxleft).T).reshape((1,embeddings.D)) 203 | rhs = (theano.sparse.dot(embeddings.E,idxright).T).reshape((1,embeddings.D)) 204 | rel = (theano.sparse.dot(embeddings.E,idxrel).T).reshape((1,embeddings.D)) 205 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 206 | return theano.function([idxleft,idxright,idxrel],[simi]) 207 | 208 | 209 | # Creation of ccoring function with respect to the complete list of embeddings (or a subtensor of it defined by subtensorspec) 210 | # if adding = True the scoring function has 2 more arguments: idxadd which contains the indexes to add, and sc the scaling: 211 | # example: you want to ask ( [__us_NN_1,__army_NN_1] , [__attack_VB_1], [???,__village_NN_1] ) idxadd represents __village_NN_1 212 | # and sc represent the values of ??? (here 1/2) so that the sum of each member is 1 (to do a mean pool). 213 | 214 | # Ask for the right member 215 | def SimilarityFunctionrightl(fnsim,embeddings,leftop,rightop,subtensorspec = None, adding = False): 216 | # Scoring fuynction with respect to the complete list of embeddings (or a subtensor of it defined by subtensorspec) 217 | # if adding = True the scoring function has 2 more arguments 218 | idxrel = theano.sparse.csr_matrix('idxrel') 219 | idxleft = theano.sparse.csr_matrix('idxleft') 220 | lhs = (theano.sparse.dot(embeddings.E,idxleft).T).reshape((1,embeddings.D)) 221 | if not adding: 222 | if subtensorspec == None: 223 | rhs = embeddings.E.T 224 | else: 225 | rhs = embeddings.E[:,:subtensorspec].T 226 | else: 227 | idxadd = theano.sparse.csr_matrix('idxadd') 228 | sc = T.scalar('sc') 229 | if subtensorspec == None: 230 | rhs = embeddings.E.T * sc + (theano.sparse.dot(embeddings.E,idxadd).T).reshape((1,embeddings.D)) 231 | else: 232 | rhs = embeddings.E[:,:subtensorspec].T * sc + (theano.sparse.dot(embeddings.E,idxadd).T).reshape((1,embeddings.D)) 233 | rel = (theano.sparse.dot(embeddings.E,idxrel).T).reshape((1,embeddings.D)) 234 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 235 | if not adding: 236 | return theano.function([idxleft,idxrel],[simi]) 237 | else: 238 | return theano.function([idxleft,idxrel,idxadd,sc],[simi]) 239 | 240 | # Ask for the left member 241 | def SimilarityFunctionleftl(fnsim,embeddings,leftop,rightop,subtensorspec = None, adding = False): 242 | idxrel = theano.sparse.csr_matrix('idxrel') 243 | idxright = theano.sparse.csr_matrix('idxright') 244 | rhs = (theano.sparse.dot(embeddings.E,idxright).T).reshape((1,embeddings.D)) 245 | if not adding: 246 | if subtensorspec == None: 247 | lhs = embeddings.E.T 248 | else: 249 | lhs = embeddings.E[:,:subtensorspec].T 250 | else: 251 | idxadd = theano.sparse.csr_matrix('idxadd') 252 | sc = T.scalar('sc') 253 | if subtensorspec == None: 254 | lhs = embeddings.E.T * sc + (theano.sparse.dot(embeddings.E,idxadd).T).reshape((1,embeddings.D)) 255 | else: 256 | lhs = embeddings.E[:,:subtensorspec].T * sc + (theano.sparse.dot(embeddings.E,idxadd).T).reshape((1,embeddings.D)) 257 | rel = (theano.sparse.dot(embeddings.E,idxrel).T).reshape((1,embeddings.D)) 258 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 259 | if not adding: 260 | return theano.function([idxright,idxrel],[simi]) 261 | else: 262 | return theano.function([idxright,idxrel,idxadd,sc],[simi]) 263 | 264 | # Ask for the relation member 265 | def 
SimilarityFunctionrell(fnsim,embeddings,leftop,rightop,subtensorspec = None, adding = False): 266 | idxright = theano.sparse.csr_matrix('idxright') 267 | idxleft = theano.sparse.csr_matrix('idxleft') 268 | lhs = (theano.sparse.dot(embeddings.E,idxleft).T).reshape((1,embeddings.D)) 269 | if not adding: 270 | if subtensorspec == None: 271 | rel = embeddings.E.T 272 | else: 273 | rel = embeddings.E[:,:subtensorspec].T 274 | else: 275 | idxadd = theano.sparse.csr_matrix('idxadd') 276 | sc = T.scalar('sc') 277 | if subtensorspec == None: 278 | rel = embeddings.E.T * sc + (theano.sparse.dot(embeddings.E,idxadd).T).reshape((1,embeddings.D)) 279 | else: 280 | rel = embeddings.E[:,:subtensorspec].T * sc + (theano.sparse.dot(embeddings.E,idxadd).T).reshape((1,embeddings.D)) 281 | rhs = (theano.sparse.dot(embeddings.E,idxright).T).reshape((1,embeddings.D)) 282 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 283 | if not adding: 284 | return theano.function([idxleft,idxright],[simi]) 285 | else: 286 | return theano.function([idxleft,idxright,idxadd,sc],[simi]) 287 | 288 | 289 | # Creation of scoring function on indexes (not on sparse matrices) 290 | def SimilarityFunction(fnsim,embeddings,leftop,rightop): 291 | idxrel = T.iscalar('idxrel') 292 | idxright = T.iscalar('idxright') 293 | idxleft = T.iscalar('idxleft') 294 | lhs = (embeddings.E[:,idxleft]).reshape((1,embeddings.D)) 295 | rhs = (embeddings.E[:,idxright]).reshape((1,embeddings.D)) 296 | rel = (embeddings.E[:,idxrel]).reshape((1,embeddings.D)) 297 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 298 | return theano.function([idxleft,idxright,idxrel],[simi]) 299 | 300 | 301 | # Ask for the right member 302 | def SimilarityFunctionright(fnsim,embeddings,leftop,rightop,subtensorspec = None): 303 | idxrel = T.iscalar('idxrel') 304 | idxleft = T.iscalar('idxleft') 305 | lhs = (embeddings.E[:,idxleft]).reshape((1,embeddings.D)) 306 | if subtensorspec != None: 307 | rhs = (embeddings.E[:,:subtensorspec]).T 308 | else: 309 | rhs = embeddings.E.T 310 | rel = (embeddings.E[:,idxrel]).reshape((1,embeddings.D)) 311 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 312 | return theano.function([idxleft,idxrel],[simi]) 313 | 314 | 315 | # Ask for the left member 316 | def SimilarityFunctionleft(fnsim,embeddings,leftop,rightop,subtensorspec = None): 317 | idxrel = T.iscalar('idxrel') 318 | idxright = T.iscalar('idxright') 319 | rhs = (embeddings.E[:,idxright]).reshape((1,embeddings.D)) 320 | if subtensorspec != None: 321 | lhs = (embeddings.E[:,:subtensorspec]).T 322 | else: 323 | lhs = embeddings.E.T 324 | rel = (embeddings.E[:,idxrel]).reshape((1,embeddings.D)) 325 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 326 | return theano.function([idxright,idxrel],[simi]) 327 | 328 | # Ask for the relation member 329 | def SimilarityFunctionrel(fnsim,embeddings,leftop,rightop,subtensorspec = None): 330 | idxright = T.iscalar('idxrel') 331 | idxleft = T.iscalar('idxleft') 332 | lhs = (embeddings.E[:,idxleft]).reshape((1,embeddings.D)) 333 | rel = embeddings.E.T 334 | if subtensorspec != None: 335 | rel = (embeddings.E[:,:subtensorspec]).T 336 | else: 337 | rel = embeddings.E.T 338 | rhs = (embeddings.E[:,idxright]).reshape((1,embeddings.D)) 339 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 340 | return theano.function([idxleft,idxright],[simi]) 341 | 342 | 343 | # get the N most probable words given by a scoring function simfn with part1 and part2 its 2 input members (in a list of synset or lemme or concept). 344 | # typ = 1 : ??? 
part2 part1 (simfn built with SimilarityFunctionleftl) 345 | # typ = 2 : part1 part2 ??? (simfn built with SimilarityFunctionrightl) 346 | # typ = 3 : part1 ??? part2 (simfn built with SimilarityFunctionrell) 347 | # emb = True : distance(part1,???) (you have to build a model with Id() layers with a L2 scoring function, then simfn built from SimilarityFunctionrightl) 348 | def getnclosest(N, idx2lemme, lemme2idx, idx2synset, synset2idx, synset2def, synset2concept, concept2synset, simfn, part1, part2, typ = 1, emb = False): 349 | idx1 = [] 350 | str1 = [] 351 | idx2 = [] 352 | str2 = [] 353 | vec1 = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,1),dtype=theano.config.floatX) 354 | for i in part1: 355 | if i in lemme2idx.keys(): 356 | idx1 += [lemme2idx[i]] 357 | vec1[idx1[-1],0] += 1/float(len(part1)) 358 | str1 += ['-'+i] 359 | elif i in synset2idx.keys(): 360 | idx1 += [synset2idx[i]] 361 | vec1[idx1[-1],0] += 1/float(len(part1)) 362 | str1 += ['-'+synset2concept[i]] 363 | else: 364 | idx1 += [synset2idx[concept2synset[i]]] 365 | vec1[idx1[-1],0] += 1/float(len(part1)) 366 | str1 += ['-'+i] 367 | vec2=scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,1),dtype=theano.config.floatX) 368 | for i in part2: 369 | if i in lemme2idx.keys(): 370 | idx2 += [lemme2idx[i]] 371 | vec2[idx2[-1],0] += 1/float(len(part2)) 372 | str2 += ['-'+i] 373 | elif i in synset2idx.keys(): 374 | idx2 += [synset2idx[i]] 375 | vec2[idx2[-1],0] += 1/float(len(part2)) 376 | str2 += ['-'+synset2concept[i]] 377 | else: 378 | idx2 += [synset2idx[concept2synset[i]]] 379 | vec2[idx2[-1],0] += 1/float(len(part2)) 380 | str2 += ['-'+i] 381 | ll = (simfn(vec1,vec2)[0]).flatten() 382 | llo = numpy.argsort(ll)[::-1] 383 | llt = ll[llo] 384 | tt = '' 385 | txt1 ='' 386 | for i in str1: 387 | txt1 += i 388 | txt2 = '' 389 | for i in str2: 390 | txt2 += i 391 | if emb: 392 | tt += 'Similar to: %s\n'%( txt1 ) 393 | else: 394 | if typ == 1: 395 | tt += '???? %s %s\n'%( txt2, txt1 ) 396 | elif typ == 2: 397 | tt += '%s %s ????\n'%( txt1, txt2 ) 398 | elif typ == 3: 399 | tt += '%s ???? %s\n'%( txt1, txt2 ) 400 | for i in range(N): 401 | if llo[i] in idx2lemme.keys(): 402 | stro = idx2lemme[llo[i]] 403 | elif idx2synset[llo[i]][0] == '_': 404 | stro = llo[i] 405 | else: 406 | stro = synset2concept[idx2synset[llo[i]]] + ' : ' + synset2def[idx2synset[llo[i]]] 407 | tt += 'Rank %s %s %s\n'%(i+1,llt[i],stro) 408 | return tt 409 | 410 | import theano.sparse 411 | import scipy.sparse 412 | 413 | # The training function creation: 414 | # relb = true, negative sample for the realtion member. 415 | 416 | # lrparams = learning rate for all the parameters of the model. 417 | # lrembeddings = learning rate for the embeddings. 418 | # inpposl = sparse matrix of the lhs. 
419 | # inposr = sparse matrix of the rhs 420 | # inposo = sparse matrix of the relation 421 | # inpposln = sparse matrix of the negatif samples for the lhs 422 | # inpposrn = sparse matrix of the negatif samples for the rhs 423 | # inpposon = sparse matrix of the negatif samples for the relation 424 | def TrainFunction(fnsim,embeddings, leftop, rightop, marge = 1.0, relb = True): 425 | # inputs 426 | inpposr = theano.sparse.csr_matrix() 427 | inpposl = theano.sparse.csr_matrix() 428 | inpposo = theano.sparse.csr_matrix() 429 | inpposln = theano.sparse.csr_matrix() 430 | inpposrn = theano.sparse.csr_matrix() 431 | inpposon = theano.sparse.csr_matrix() 432 | lrparams = T.scalar('lrparams') 433 | lrembeddings = T.scalar('lrembeddings') 434 | # graph 435 | lhs = theano.sparse.dot(embeddings.E,inpposl).T 436 | rhs = theano.sparse.dot(embeddings.E,inpposr).T 437 | rel = theano.sparse.dot(embeddings.E,inpposo).T 438 | lhsn = theano.sparse.dot(embeddings.E,inpposln).T 439 | rhsn = theano.sparse.dot(embeddings.E,inpposrn).T 440 | reln = theano.sparse.dot(embeddings.E,inpposon).T 441 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 442 | siminl = fnsim(leftop(lhsn,rel),rightop(rhs,rel)) 443 | siminr = fnsim(leftop(lhs,rel),rightop(rhsn,rel)) 444 | simino = fnsim(leftop(lhs,reln),rightop(rhs,reln)) 445 | costl,outl = margincost(simi,siminl,marge) 446 | costr,outr = margincost(simi,siminr,marge) 447 | costo,outo = margincost(simi,simino,marge) 448 | if relb: 449 | cost = costl + costr + costo 450 | else: 451 | cost = costl + costr 452 | out = T.concatenate([outl,outr,outo]) 453 | if hasattr(fnsim,'params'): 454 | gradientsparams = T.grad(cost, leftop.params + rightop.params + fnsim.params) 455 | updates = dict((i,i-lrparams*j) for i,j in zip(leftop.params + rightop.params + fnsim.params, gradientsparams)) 456 | else: 457 | gradientsparams = T.grad(cost, leftop.params + rightop.params) 458 | updates = dict((i,i-lrparams*j) for i,j in zip(leftop.params + rightop.params, gradientsparams)) 459 | gradientsembeddings = T.grad(cost, embeddings.E) 460 | newE = embeddings.E - lrembeddings * gradientsembeddings 461 | ############### scaling variants 462 | #updates = dict((i,i-lrparams/(1+T.cast(T.sum(out),dtype=theano.config.floatX))*j) for i,j in zip(leftop.params + rightop.params, gradientsparams)) 463 | #maskE = T.vector('maskE') 464 | #newE = (embeddings.E - lrembeddings/(1+maskE*T.cast(T.sum(out),dtype=theano.config.floatX)) * gradientsembeddings) 465 | ############### 466 | #newEnorm = newE / T.sqrt(T.sum(newE*newE,axis=0)) 467 | updates.update({embeddings.E:newE}) 468 | return theano.function([lrparams,lrembeddings,inpposl, inpposr, inpposo, inpposln, inpposrn,inpposon], [cost,costl,costr,costo,T.sum(out),T.sum(outl),T.sum(outr),T.sum(outo),lhs,rhs,rel,simi,siminl,siminr,simino],updates=updates) 469 | 470 | 471 | # Function returning the binary vector representing: cost>0 472 | def ForwardFunction(fnsim,embeddings, leftop, rightop, marge = 1.0): 473 | # inputs 474 | inpposr = theano.sparse.csr_matrix() 475 | inpposl = theano.sparse.csr_matrix() 476 | inpposo = theano.sparse.csr_matrix() 477 | inpposln = theano.sparse.csr_matrix() 478 | inpposrn = theano.sparse.csr_matrix() 479 | inpposon = theano.sparse.csr_matrix() 480 | # graph 481 | lhs = theano.sparse.dot(embeddings.E,inpposl).T 482 | rhs = theano.sparse.dot(embeddings.E,inpposr).T 483 | rel = theano.sparse.dot(embeddings.E,inpposo).T 484 | lhsn = theano.sparse.dot(embeddings.E,inpposln).T 485 | rhsn = theano.sparse.dot(embeddings.E,inpposrn).T 486 
| reln = theano.sparse.dot(embeddings.E,inpposon).T 487 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 488 | siminl = fnsim(leftop(lhsn,rel),rightop(rhs,rel)) 489 | siminr = fnsim(leftop(lhs,rel),rightop(rhsn,rel)) 490 | simino = fnsim(leftop(lhs,reln),rightop(rhs,reln)) 491 | costl,outl = margincost(simi,siminl,marge) 492 | costr,outr = margincost(simi,siminr,marge) 493 | costo,outo = margincost(simi,simino,marge) 494 | return theano.function([inpposl, inpposr, inpposo, inpposln, inpposrn,inpposon], [outl,outr,outo]) 495 | 496 | # Function returning the score over lhs,rhs and rel sparse matrices 497 | def BatchSimilarityFunction(fnsim,embeddings, leftop, rightop): 498 | # inputs 499 | inpposr = theano.sparse.csr_matrix() 500 | inpposl = theano.sparse.csr_matrix() 501 | inpposo = theano.sparse.csr_matrix() 502 | # graph 503 | lhs = theano.sparse.dot(embeddings.E,inpposl).T 504 | rhs = theano.sparse.dot(embeddings.E,inpposr).T 505 | rel = theano.sparse.dot(embeddings.E,inpposo).T 506 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 507 | return theano.function([inpposl, inpposr, inpposo], [simi]) 508 | 509 | 510 | # Function doing the forward on a batch and returning all information (without updating): 511 | def BatchValidFunction(fnsim,embeddings, leftop, rightop): 512 | # inputs 513 | inpposr = theano.sparse.csr_matrix() 514 | inpposl = theano.sparse.csr_matrix() 515 | inpposo = theano.sparse.csr_matrix() 516 | inpposln = theano.sparse.csr_matrix() 517 | inpposrn = theano.sparse.csr_matrix() 518 | inpposon = theano.sparse.csr_matrix() 519 | # graph 520 | lhs = theano.sparse.dot(embeddings.E,inpposl).T 521 | rhs = theano.sparse.dot(embeddings.E,inpposr).T 522 | rel = theano.sparse.dot(embeddings.E,inpposo).T 523 | lhsn = theano.sparse.dot(embeddings.E,inpposln).T 524 | rhsn = theano.sparse.dot(embeddings.E,inpposrn).T 525 | reln = theano.sparse.dot(embeddings.E,inpposon).T 526 | simi = fnsim(leftop(lhs,rel),rightop(rhs,rel)) 527 | siminl = fnsim(leftop(lhsn,rel),rightop(rhs,rel)) 528 | siminr = fnsim(leftop(lhs,rel),rightop(rhsn,rel)) 529 | simino = fnsim(leftop(lhs,reln),rightop(rhs,reln)) 530 | costl,outl,margel = validcost(simi,siminl) 531 | costr,outr,marger = validcost(simi,siminr) 532 | costo,outo,margeo = validcost(simi,simino) 533 | cost = costl + costr + costo 534 | out = T.concatenate([outl,outr,outo]) 535 | return theano.function([inpposl, inpposr, inpposo, inpposln, inpposrn,inpposon], [cost,costl,costr,costo,T.sum(out),T.sum(outl),T.sum(outr),T.sum(outo),margel,marger,margeo,lhs,rhs,rel,simi,siminl,siminr,simino]) 536 | 537 | 538 | # Compute the mean rank of the lhs and rhs, over a list of lhs, rhs and rel indexes. 539 | # Only works when there is one word per member (WordNet) 540 | # sl build with SimilarityFunctionleft 541 | # sr build with SimilarityFunctionright 542 | def calctestval(sl,sr,idxtl,idxtr,idxto): 543 | errl = [] 544 | errr = [] 545 | for l,o,r in zip(idxtl,idxto,idxtr): 546 | errl += [numpy.argsort(numpy.argsort((sl(r,o)[0]).flatten())[::-1]).flatten()[l]] 547 | errr += [numpy.argsort(numpy.argsort((sr(l,o)[0]).flatten())[::-1]).flatten()[r]] 548 | return numpy.mean(errl+errr),numpy.std(errl+errr),numpy.mean(errl),numpy.std(errl),numpy.mean(errr),numpy.std(errr) 549 | 550 | # The same but returns the ranking lists instead of their mean and std. 
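# Note on the rank idiom used in calctestval above and calctestval2 below:
# numpy.argsort(scores) sorts indices by increasing score, [::-1] flips it
# to decreasing score, and a second argsort maps each entity index to its
# rank (0 = best). For instance with scores = [0.1, 0.9, 0.4]: argsort gives
# [0, 2, 1], reversed [1, 2, 0], and argsort of that gives [2, 0, 1],
# i.e. entity 1 (score 0.9) is ranked first.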
551 | def calctestval2(sl,sr,idxtl,idxtr,idxto): 552 | errl = [] 553 | errr = [] 554 | for l,o,r in zip(idxtl,idxto,idxtr): 555 | errl += [numpy.argsort(numpy.argsort((sl(r,o)[0]).flatten())[::-1]).flatten()[l]] 556 | errr += [numpy.argsort(numpy.argsort((sr(l,o)[0]).flatten())[::-1]).flatten()[r]] 557 | return errl,errr 558 | 559 | # Similar but works with sparse index matrices (posl,posr,poso) = (lhs,rhs,rel) 560 | # replace the whole member by one word. 561 | # sl build with SimilarityFunctionleftl 562 | # sr build with SimilarityFunctionrightl 563 | # so build with SimilarityFunctionrell 564 | def calctestscore(sl,sr,so,posl,posr,poso): 565 | errl = [] 566 | errr = [] 567 | erro = [] 568 | for i in range(posl.shape[1]): 569 | rankl = numpy.argsort((sl(posr[:,i],poso[:,i])[0]).flatten()) 570 | for l in posl[:,i].nonzero()[0]: 571 | errl += [numpy.argsort(rankl[::-1]).flatten()[l]] 572 | rankr = numpy.argsort((sr(posl[:,i],poso[:,i])[0]).flatten()) 573 | for r in posr[:,i].nonzero()[0]: 574 | errr += [numpy.argsort(rankr[::-1]).flatten()[r]] 575 | ranko = numpy.argsort((so(posl[:,i],posr[:,i])[0]).flatten()) 576 | for o in poso[:,i].nonzero()[0]: 577 | erro += [numpy.argsort(ranko[::-1]).flatten()[0]] 578 | return numpy.mean(errl+errr+erro),numpy.std(errl+errr+erro),numpy.mean(errl),numpy.std(errl),numpy.mean(errr),numpy.std(errr),numpy.mean(erro),numpy.std(erro) 579 | 580 | import copy 581 | 582 | 583 | # Similar but works with sparse index matrices (posl,posr,poso) = (lhs,rhs,rel) 584 | # AND replace only ONE word per member (does ALL combinations) 585 | # sl build with SimilarityFunctionleftl (with the adding argument = True) 586 | # sr build with SimilarityFunctionrightl (with the adding argument = True) 587 | # so build with SimilarityFunctionrell (with the adding argument = True) 588 | def calctestscore2(sl,sr,so,posl,posr,poso): 589 | errl = [] 590 | errr = [] 591 | erro = [] 592 | for i in range(posl.shape[1]): 593 | lnz = posl[:,i].nonzero()[0] 594 | for j in lnz: 595 | val = posl[j,i] 596 | tmpadd = copy.deepcopy(posl[:,i]) 597 | tmpadd[j,0] = 0.0 598 | rankl = numpy.argsort((sl(posr[:,i],poso[:,i],tmpadd,val)[0]).flatten()) 599 | errl += [numpy.argsort(rankl[::-1]).flatten()[j]] 600 | rnz = posr[:,i].nonzero()[0] 601 | for j in rnz: 602 | val = posr[j,i] 603 | tmpadd = copy.deepcopy(posr[:,i]) 604 | tmpadd[j,0] = 0.0 605 | rankr = numpy.argsort((sr(posl[:,i],poso[:,i],tmpadd,val)[0]).flatten()) 606 | errr += [numpy.argsort(rankr[::-1]).flatten()[j]] 607 | onz = poso[:,i].nonzero()[0] 608 | for j in onz: 609 | val = poso[j,i] 610 | tmpadd = copy.deepcopy(poso[:,i]) 611 | tmpadd[j,0] = 0.0 612 | ranko = numpy.argsort((so(posl[:,i],posr[:,i],tmpadd,val)[0]).flatten()) 613 | erro += [numpy.argsort(ranko[::-1]).flatten()[j]] 614 | return numpy.mean(errl+errr+erro),numpy.std(errl+errr+erro),numpy.mean(errl),numpy.std(errl),numpy.mean(errr),numpy.std(errr),numpy.mean(erro),numpy.std(erro) 615 | 616 | # The same : 617 | # Similar but works with sparse index matrices (posl,posr,poso) = (lhs,rhs,rel) 618 | # AND replace only ONE word per member (does ALL combinations) 619 | # sl build with SimilarityFunctionleftl (with the adding argument = True) 620 | # sr build with SimilarityFunctionrightl (with the adding argument = True) 621 | # so build with SimilarityFunctionrell (with the adding argument = True) 622 | # But compares with the index correspondance sparse matrices: (poslc,posrc,posoc) 623 | # (you give lemmas in input and find the ranking of synsets). 
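# Concretely, for each nonzero word j of a member (with weight
# val = 1/#words in that member): j is zeroed out of a copy of the member
# (tmpadd), every candidate entity is scored as a replacement -- the
# candidate embeddings are scaled by val and the pooled embedding of tmpadd
# is added, so the member still sums to 1 -- and the rank of the correct
# synset, read from the correspondence matrices (poslc/posrc/posoc), is
# recorded.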
624 | def calctestscore3(sl,sr,so,posl,posr,poso,poslc,posrc,posoc): 625 | errl = [] 626 | errr = [] 627 | erro = [] 628 | for i in range(posl.shape[1]): 629 | lnz = posl[:,i].nonzero()[0] 630 | for j in lnz: 631 | val = posl[j,i] 632 | tmpadd = copy.deepcopy(posl[:,i]) 633 | tmpadd[j,0] = 0.0 634 | rankl = numpy.argsort((sl(posr[:,i],poso[:,i],tmpadd,val)[0]).flatten()) 635 | errl += [numpy.argsort(rankl[::-1]).flatten()[poslc[j,i]]] 636 | rnz = posr[:,i].nonzero()[0] 637 | for j in rnz: 638 | val = posr[j,i] 639 | tmpadd = copy.deepcopy(posr[:,i]) 640 | tmpadd[j,0] = 0.0 641 | rankr = numpy.argsort((sr(posl[:,i],poso[:,i],tmpadd,val)[0]).flatten()) 642 | errr += [numpy.argsort(rankr[::-1]).flatten()[posrc[j,i]]] 643 | onz = poso[:,i].nonzero()[0] 644 | for j in onz: 645 | val = poso[j,i] 646 | tmpadd = copy.deepcopy(poso[:,i]) 647 | tmpadd[j,0] = 0.0 648 | ranko = numpy.argsort((so(posl[:,i],posr[:,i],tmpadd,val)[0]).flatten()) 649 | erro += [numpy.argsort(ranko[::-1]).flatten()[posoc[j,i]]] 650 | return numpy.mean(errl+errr+erro),numpy.std(errl+errr+erro),numpy.mean(errl),numpy.std(errl),numpy.mean(errr),numpy.std(errr),numpy.mean(erro),numpy.std(erro) 651 | 652 | 653 | # The same but return ranking lists instead of their mean and std. 654 | def calctestscore4(sl,sr,so,posl,posr,poso,poslc,posrc,posoc): 655 | errl = [] 656 | errr = [] 657 | erro = [] 658 | for i in range(posl.shape[1]): 659 | lnz = posl[:,i].nonzero()[0] 660 | for j in lnz: 661 | val = posl[j,i] 662 | tmpadd = copy.deepcopy(posl[:,i]) 663 | tmpadd[j,0] = 0.0 664 | rankl = numpy.argsort((sl(posr[:,i],poso[:,i],tmpadd,val)[0]).flatten()) 665 | errl += [numpy.argsort(rankl[::-1]).flatten()[poslc[j,i]]] 666 | rnz = posr[:,i].nonzero()[0] 667 | for j in rnz: 668 | val = posr[j,i] 669 | tmpadd = copy.deepcopy(posr[:,i]) 670 | tmpadd[j,0] = 0.0 671 | rankr = numpy.argsort((sr(posl[:,i],poso[:,i],tmpadd,val)[0]).flatten()) 672 | errr += [numpy.argsort(rankr[::-1]).flatten()[posrc[j,i]]] 673 | onz = poso[:,i].nonzero()[0] 674 | for j in onz: 675 | val = poso[j,i] 676 | tmpadd = copy.deepcopy(poso[:,i]) 677 | tmpadd[j,0] = 0.0 678 | ranko = numpy.argsort((so(posl[:,i],posr[:,i],tmpadd,val)[0]).flatten()) 679 | erro += [numpy.argsort(ranko[::-1]).flatten()[posoc[j,i]]] 680 | return errl,errr,erro 681 | -------------------------------------------------------------------------------- /expeWAKA.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse 2 | import cPickle 3 | import os 4 | import sys 5 | from model import * 6 | import time 7 | 8 | def createrandommat(shape): 9 | randommat = scipy.sparse.lil_matrix((shape[0],shape[1]),dtype=theano.config.floatX) 10 | idxr = numpy.asarray(numpy.random.permutation(shape[1]),dtype='int32') 11 | idx = 0 12 | for i in idxr: 13 | if idx == shape[0]: 14 | idx=0 15 | randommat[idx,i]=1 16 | idx+=1 17 | return randommat.tocsr() 18 | 19 | def expeWAKA(state,channel): 20 | state.savepath = channel.remote_path+'/' if hasattr(channel,'remote_path') else channel.path+'/' 21 | datpath = '/mnt/scratch/bengio/glorotxa/data/exp/WakaBST4/' 22 | 23 | state.listconcept = [['__brain_NN_1'], ['__france_NN_1'], ['__auto_NN_1'],['__cat_NN_1'],['__monkey_NN_1'],['__u.s._NN_1','__army_NN_1']] 24 | state.listrel = [['_has_part'],['_part_of'],['__eat_VB_1'],['__drive_VB_1'],['__defend_VB_1'],['__attack_VB_1']] 25 | 26 | dictparam = {} 27 | dictparam.update({ 'operator':state.operator}) 28 | dictparam.update({ 'updateWN' : state.updateWN}) 29 | dictparam.update({ 
'updateWNl' : state.updateWNl}) 30 | dictparam.update({ 'updateWNsl' : state.updateWNsl}) 31 | dictparam.update({ 'updateCN' : state.updateCN}) 32 | dictparam.update({ 'updateWK' : state.updateWK}) 33 | dictparam.update({ 'updateWKs' : state.updateWKs}) 34 | dictparam.update({ 'updateXWN' : state.updateXWN}) 35 | dictparam.update({ 'ndim' : state.ndim}) 36 | dictparam.update({ 'nbbatches' : state.nbatches}) 37 | dictparam.update({ 'lrparam' : state.lrparam}) 38 | dictparam.update({ 'lremb' : state.lremb}) 39 | dictparam.update({ 'nbtest' : state.nbtest}) 40 | dictparam.update({ 'testall' : state.testall}) 41 | dictparam.update({ 'savepath' : state.savepath}) 42 | dictparam.update({ 'simfnstr' : state.simfnstr}) 43 | dictparam.update({ 'listconcept' : state.listconcept}) 44 | dictparam.update({ 'listrel' : state.listrel}) 45 | dictparam.update({ 'nbrank' : state.nbrank}) 46 | dictparam.update({ 'loadmodel' : state.loadmodel}) 47 | dictparam.update({ 'begindeclr' : state.begindeclr}) 48 | dictparam.update({ 'ratdeclr' : state.ratdeclr}) 49 | dictparam.update({ 'totbatch' : state.totbatch}) 50 | dictparam.update({ 'margewn' : state.margewn}) 51 | dictparam.update({ 'margewnl' : state.margewnl}) 52 | dictparam.update({ 'margewnsl' : state.margewnsl}) 53 | dictparam.update({ 'margecn' : state.margecn}) 54 | dictparam.update({ 'margewk' : state.margewk}) 55 | dictparam.update({ 'margewks' : state.margewks}) 56 | dictparam.update({ 'margexwn' : state.margexwn}) 57 | dictparam.update({ 'relb' : state.relb}) 58 | dictparam.update({ 'random' : state.random}) 59 | 60 | 61 | print >> sys.stderr, 'operator : ', state.operator 62 | print >> sys.stderr, 'updateWN : ', state.updateWN 63 | print >> sys.stderr, 'updateWNl : ', state.updateWNl 64 | print >> sys.stderr, 'updateWNsl : ', state.updateWNsl 65 | print >> sys.stderr, 'updateCN : ', state.updateCN 66 | print >> sys.stderr, 'updateWK : ', state.updateWK 67 | print >> sys.stderr, 'updateWKs : ', state.updateWKs 68 | print >> sys.stderr, 'updateXWN : ', state.updateXWN 69 | print >> sys.stderr, 'ndim : ', state.ndim 70 | print >> sys.stderr, 'nbbatches : ', state.nbatches 71 | print >> sys.stderr, 'lrparam : ', state.lrparam 72 | print >> sys.stderr, 'lremb : ', state.lremb 73 | print >> sys.stderr, 'nbtest : ', state.nbtest 74 | print >> sys.stderr, 'testall : ', state.testall 75 | print >> sys.stderr, 'savepath : ', state.savepath 76 | print >> sys.stderr, 'simfnstr : ', state.simfnstr 77 | print >> sys.stderr, 'listconcept : ', state.listconcept 78 | print >> sys.stderr, 'listrel : ', state.listrel 79 | print >> sys.stderr, 'nbrank : ', state.nbrank 80 | print >> sys.stderr, 'loadmodel : ', state.loadmodel 81 | print >> sys.stderr, 'begindeclr: ', state.begindeclr 82 | print >> sys.stderr, 'ratdeclr: ', state.ratdeclr 83 | print >> sys.stderr, 'totbatch: ', state.totbatch 84 | print >> sys.stderr, 'margewn: ', state.margewn 85 | print >> sys.stderr, 'margewnl: ', state.margewnl 86 | print >> sys.stderr, 'margewnsl: ', state.margewnsl 87 | print >> sys.stderr, 'margecn: ', state.margecn 88 | print >> sys.stderr, 'margewk: ', state.margewk 89 | print >> sys.stderr, 'margewks: ', state.margewks 90 | print >> sys.stderr, 'margexwn: ', state.margexwn 91 | print >> sys.stderr, 'relb: ', state.relb 92 | print >> sys.stderr, 'random: ', state.random 93 | 94 | synset2lemme = cPickle.load(open(datpath+'synset2lemme.pkl','r')) 95 | lemme2synset = cPickle.load(open(datpath+'lemme2synset.pkl','r')) 96 | lemme2freq = 
cPickle.load(open(datpath+'lemme2freq.pkl','r')) 97 | synset2idx = cPickle.load(open(datpath+'synset2idx.pkl','r')) 98 | idx2synset = cPickle.load(open(datpath+'idx2synset.pkl','r')) 99 | lemme2idx = cPickle.load(open(datpath+'lemme2idx.pkl','r')) 100 | idx2lemme = cPickle.load(open(datpath+'idx2lemme.pkl','r')) 101 | synset2neg = cPickle.load(open(datpath+'synset2neg.pkl','r')) 102 | synset2def = cPickle.load(open(datpath+'synset2def.pkl','r')) 103 | synset2concept = cPickle.load(open(datpath+'synset2concept.pkl','r')) 104 | concept2synset = cPickle.load(open(datpath+'concept2synset.pkl','r')) 105 | 106 | print '####### WORDNET' 107 | # train set 108 | WNtrainl = (cPickle.load(open(datpath+'WordNet3.0-train-lhs.pkl','r'))).tocsr() 109 | WNtrainr = (cPickle.load(open(datpath+'WordNet3.0-train-rhs.pkl','r'))).tocsr() 110 | WNtraino = (cPickle.load(open(datpath+'WordNet3.0-train-rel.pkl','r'))).tocsr() 111 | 112 | if not state.random: 113 | WNtrainln = (cPickle.load(open(datpath+'WordNet3.0-train-lhs.pkl','r'))).tocsr() 114 | WNtrainrn = (cPickle.load(open(datpath+'WordNet3.0-train-rhs.pkl','r'))).tocsr() 115 | WNtrainon = (cPickle.load(open(datpath+'WordNet3.0-train-rel.pkl','r'))).tocsr() 116 | else: 117 | WNtrainln = createrandommat(WNtrainl.shape) 118 | WNtrainrn = createrandommat(WNtrainl.shape) 119 | WNtrainon = (cPickle.load(open(datpath+'WordNet3.0-train-rel.pkl','r'))).tocsr() 120 | #WNtrainon = createrandommat(WNtrainl.shape) 121 | 122 | numpy.random.seed(111) 123 | order = numpy.random.permutation(WNtrainl.shape[1]) 124 | WNtrainl = WNtrainl[:,order] 125 | WNtrainr = WNtrainr[:,order] 126 | WNtraino = WNtraino[:,order] 127 | WNtrainln = WNtrainln[:,numpy.random.permutation(WNtrainln.shape[1])] 128 | WNtrainrn = WNtrainrn[:,numpy.random.permutation(WNtrainln.shape[1])] 129 | WNtrainon = WNtrainon[:,numpy.random.permutation(WNtrainln.shape[1])] 130 | 131 | # valid set 132 | WNvall = (cPickle.load(open(datpath+'WordNet3.0-val-lhs.pkl','r'))).tocsr() 133 | WNvalr = (cPickle.load(open(datpath+'WordNet3.0-val-rhs.pkl','r'))).tocsr() 134 | WNvalo = (cPickle.load(open(datpath+'WordNet3.0-val-rel.pkl','r'))).tocsr() 135 | numpy.random.seed(222) 136 | order = numpy.random.permutation(WNvall.shape[1]) 137 | WNvall = WNvall[:,order] 138 | WNvalr = WNvalr[:,order] 139 | WNvalo = WNvalo[:,order] 140 | 141 | # test set 142 | WNtestl = (cPickle.load(open(datpath+'WordNet3.0-test-lhs.pkl','r'))).tocsr() 143 | WNtestr = (cPickle.load(open(datpath+'WordNet3.0-test-rhs.pkl','r'))).tocsr() 144 | WNtesto = (cPickle.load(open(datpath+'WordNet3.0-test-rel.pkl','r'))).tocsr() 145 | numpy.random.seed(333) 146 | order = numpy.random.permutation(WNtestl.shape[1]) 147 | WNtestl = WNtestl[:,order] 148 | WNtestr = WNtestr[:,order] 149 | WNtesto = WNtesto[:,order] 150 | 151 | 152 | print '####### WORDNET LEMME' 153 | # train set 154 | WNltrainl = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-lhs.pkl','r'))).tocsr() 155 | WNltrainr = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-rhs.pkl','r'))).tocsr() 156 | WNltraino = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-rel.pkl','r'))).tocsr() 157 | 158 | if not state.random: 159 | WNltrainln = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-lhs.pkl','r'))).tocsr() 160 | WNltrainrn = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-rhs.pkl','r'))).tocsr() 161 | WNltrainon = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-rel.pkl','r'))).tocsr() 162 | else: 163 | WNltrainln = createrandommat(WNltrainl.shape) 164 | WNltrainrn = 
createrandommat(WNltrainl.shape) 165 | WNltrainon = (cPickle.load(open(datpath+'WordNet3.0-lemme-train-rel.pkl','r'))).tocsr() 166 | #WNltrainon = createrandommat(WNltrainl.shape) 167 | 168 | numpy.random.seed(222) 169 | order = numpy.random.permutation(WNltrainl.shape[1]) 170 | WNltrainl = WNltrainl[:,order] 171 | WNltrainr = WNltrainr[:,order] 172 | WNltraino = WNltraino[:,order] 173 | WNltrainln = WNltrainln[:,numpy.random.permutation(WNltrainln.shape[1])] 174 | WNltrainrn = WNltrainrn[:,numpy.random.permutation(WNltrainln.shape[1])] 175 | WNltrainon = WNltrainon[:,numpy.random.permutation(WNltrainln.shape[1])] 176 | 177 | print '####### WORDNET syle' 178 | # train set 179 | WNsltrainl = (cPickle.load(open(datpath+'WordNet3.0-syle-train-lhs.pkl','r'))).tocsr() 180 | WNsltrainr = (cPickle.load(open(datpath+'WordNet3.0-syle-train-rhs.pkl','r'))).tocsr() 181 | WNsltraino = (cPickle.load(open(datpath+'WordNet3.0-syle-train-rel.pkl','r'))).tocsr() 182 | 183 | if not state.random: 184 | WNsltrainln = (cPickle.load(open(datpath+'WordNet3.0-syle-train-lhs.pkl','r'))).tocsr() 185 | WNsltrainrn = (cPickle.load(open(datpath+'WordNet3.0-syle-train-rhs.pkl','r'))).tocsr() 186 | WNsltrainon = (cPickle.load(open(datpath+'WordNet3.0-syle-train-rel.pkl','r'))).tocsr() 187 | else: 188 | WNsltrainln = createrandommat(WNsltrainl.shape) 189 | WNsltrainrn = createrandommat(WNsltrainl.shape) 190 | WNsltrainon = (cPickle.load(open(datpath+'WordNet3.0-syle-train-rel.pkl','r'))).tocsr() 191 | #WNsltrainon = createrandommat(WNsltrainl.shape) 192 | 193 | numpy.random.seed(333) 194 | order = numpy.random.permutation(WNsltrainl.shape[1]) 195 | WNsltrainl = WNsltrainl[:,order] 196 | WNsltrainr = WNsltrainr[:,order] 197 | WNsltraino = WNsltraino[:,order] 198 | WNsltrainln = WNsltrainln[:,numpy.random.permutation(WNsltrainln.shape[1])] 199 | WNsltrainrn = WNsltrainrn[:,numpy.random.permutation(WNsltrainln.shape[1])] 200 | WNsltrainon = WNsltrainon[:,numpy.random.permutation(WNsltrainln.shape[1])] 201 | 202 | print '####### ConceptNet' 203 | CNtrainl = (cPickle.load(open(datpath+'ConceptNet-lhs.pkl','r'))).tocsr() 204 | CNtrainr = (cPickle.load(open(datpath+'ConceptNet-rhs.pkl','r'))).tocsr() 205 | CNtraino = (cPickle.load(open(datpath+'ConceptNet-rel.pkl','r'))).tocsr() 206 | 207 | if not state.random: 208 | CNtrainln = (cPickle.load(open(datpath+'ConceptNet-lhs.pkl','r'))).tocsr() 209 | CNtrainrn = (cPickle.load(open(datpath+'ConceptNet-rhs.pkl','r'))).tocsr() 210 | CNtrainon = (cPickle.load(open(datpath+'ConceptNet-rel.pkl','r'))).tocsr() 211 | else: 212 | CNtrainln = createrandommat(CNtrainl.shape) 213 | CNtrainrn = createrandommat(CNtrainl.shape) 214 | CNtrainon = (cPickle.load(open(datpath+'ConceptNet-rel.pkl','r'))).tocsr() 215 | 216 | numpy.random.seed(444) 217 | order = numpy.random.permutation(CNtrainl.shape[1]) 218 | CNtrainl = CNtrainl[:,order] 219 | CNtrainr = CNtrainr[:,order] 220 | CNtraino = CNtraino[:,order] 221 | CNtrainln = CNtrainln[:,numpy.random.permutation(CNtrainln.shape[1])] 222 | CNtrainrn = CNtrainrn[:,numpy.random.permutation(CNtrainln.shape[1])] 223 | CNtrainon = CNtrainon[:,numpy.random.permutation(CNtrainln.shape[1])] 224 | 225 | print '####### Wikisample' 226 | WKtrainl = (cPickle.load(open(datpath+'Wikisample-lhs.pkl','r'))).tocsr() 227 | WKtrainr = (cPickle.load(open(datpath+'Wikisample-rhs.pkl','r'))).tocsr() 228 | WKtraino = (cPickle.load(open(datpath+'Wikisample-rel.pkl','r'))).tocsr() 229 | 230 | if not state.random: 231 | WKtrainln = 
(cPickle.load(open(datpath+'Wikisample-lhs.pkl','r'))).tocsr() 232 | WKtrainrn = (cPickle.load(open(datpath+'Wikisample-rhs.pkl','r'))).tocsr() 233 | WKtrainon = (cPickle.load(open(datpath+'Wikisample-rel.pkl','r'))).tocsr() 234 | else: 235 | WKtrainln = createrandommat((WNtrainl.shape[0],WNtrainl.shape[1]/state.nbatches*20+11000)) 236 | WKtrainrn = createrandommat((WNtrainl.shape[0],WNtrainl.shape[1]/state.nbatches*20+11000)) 237 | WKtrainon = createrandommat((WNtrainl.shape[0],WNtrainl.shape[1]/state.nbatches*20+11000)) 238 | 239 | numpy.random.seed(555) 240 | order = numpy.random.permutation(WKtrainl.shape[1]) 241 | WKtrainl = WKtrainl[:,order] 242 | WKtrainr = WKtrainr[:,order] 243 | WKtraino = WKtraino[:,order] 244 | WKtrainln = WKtrainln[:,numpy.random.permutation(WKtrainln.shape[1])] 245 | WKtrainrn = WKtrainrn[:,numpy.random.permutation(WKtrainln.shape[1])] 246 | WKtrainon = WKtrainon[:,numpy.random.permutation(WKtrainln.shape[1])] 247 | 248 | WKtrainvall = WKtrainl[:,-10000:] 249 | WKtrainvalr = WKtrainr[:,-10000:] 250 | WKtrainvalo = WKtraino[:,-10000:] 251 | WKtrainvalln = WKtrainln[:,-10000:] 252 | WKtrainvalrn = WKtrainrn[:,-10000:] 253 | WKtrainvalon = WKtrainon[:,-10000:] 254 | 255 | WKtrainl = WKtrainl[:,:-10000] 256 | WKtrainr = WKtrainr[:,:-10000] 257 | WKtraino = WKtraino[:,:-10000] 258 | WKtrainln = WKtrainln[:,:-10000] 259 | WKtrainrn = WKtrainrn[:,:-10000] 260 | WKtrainon = WKtrainon[:,:-10000] 261 | 262 | print '####### Wikisuper' 263 | WKstrainl = (cPickle.load(open(datpath+'Wikisuper-lhs.pkl','r'))).tocsr() 264 | WKstrainr = (cPickle.load(open(datpath+'Wikisuper-rhs.pkl','r'))).tocsr() 265 | WKstraino = (cPickle.load(open(datpath+'Wikisuper-rel.pkl','r'))).tocsr() 266 | 267 | WKstrainln = (cPickle.load(open(datpath+'Wikisuper-lhsn.pkl','r'))).tocsr() 268 | WKstrainrn = (cPickle.load(open(datpath+'Wikisuper-rhsn.pkl','r'))).tocsr() 269 | WKstrainon = (cPickle.load(open(datpath+'Wikisuper-reln.pkl','r'))).tocsr() 270 | 271 | WKstrainvall = WKstrainl[:,-10000:] 272 | WKstrainvalr = WKstrainr[:,-10000:] 273 | WKstrainvalo = WKstraino[:,-10000:] 274 | WKstrainvalln = WKstrainln[:,-10000:] 275 | WKstrainvalrn = WKstrainrn[:,-10000:] 276 | WKstrainvalon = WKstrainon[:,-10000:] 277 | 278 | WKstrainl = WKstrainl[:,:-10000] 279 | WKstrainr = WKstrainr[:,:-10000] 280 | WKstraino = WKstraino[:,:-10000] 281 | WKstrainln = WKstrainln[:,:-10000] 282 | WKstrainrn = WKstrainrn[:,:-10000] 283 | WKstrainon = WKstrainon[:,:-10000] 284 | 285 | numpy.random.seed(666) 286 | order = numpy.random.permutation(WKstrainl.shape[1]) 287 | WKstrainl = WKstrainl[:,order] 288 | WKstrainr = WKstrainr[:,order] 289 | WKstraino = WKstraino[:,order] 290 | WKstrainln = WKstrainln[:,order] 291 | WKstrainrn = WKstrainrn[:,order] 292 | WKstrainon = WKstrainon[:,order] 293 | 294 | print '####### XWN' 295 | XWNtrainl = (cPickle.load(open(datpath+'XWN-lhs.pkl','r'))).tocsr() 296 | XWNtrainr = (cPickle.load(open(datpath+'XWN-rhs.pkl','r'))).tocsr() 297 | XWNtraino = (cPickle.load(open(datpath+'XWN-rel.pkl','r'))).tocsr() 298 | 299 | XWNtrainln = (cPickle.load(open(datpath+'XWN-lhsn.pkl','r'))).tocsr() 300 | XWNtrainrn = (cPickle.load(open(datpath+'XWN-rhsn.pkl','r'))).tocsr() 301 | XWNtrainon = (cPickle.load(open(datpath+'XWN-reln.pkl','r'))).tocsr() 302 | 303 | XWNtrainvall = XWNtrainl[:,-10000:] 304 | XWNtrainvalr = XWNtrainr[:,-10000:] 305 | XWNtrainvalo = XWNtraino[:,-10000:] 306 | XWNtrainvalln = XWNtrainln[:,-10000:] 307 | XWNtrainvalrn = XWNtrainrn[:,-10000:] 308 | XWNtrainvalon = XWNtrainon[:,-10000:] 
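    # ------------------------------------------------------------------
    # Editor's annotation (illustration, not part of the original script):
    # negatives for each corpus are either column-reshuffled copies of the
    # positive matrices (state.random unset) or createrandommat outputs,
    # which activate exactly one random entity per column. A quick check
    # of that invariant, kept commented out here:
    #     m = createrandommat((50, 1000))   # 50 entities, 1000 negatives
    #     assert (numpy.asarray(m.sum(axis=0)).flatten() == 1).all()
    # ------------------------------------------------------------------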
309 | 310 | XWNtrainl = XWNtrainl[:,:-10000] 311 | XWNtrainr = XWNtrainr[:,:-10000] 312 | XWNtraino = XWNtraino[:,:-10000] 313 | XWNtrainln = XWNtrainln[:,:-10000] 314 | XWNtrainrn = XWNtrainrn[:,:-10000] 315 | XWNtrainon = XWNtrainon[:,:-10000] 316 | 317 | numpy.random.seed(777) 318 | order = numpy.random.permutation(XWNtrainl.shape[1]) 319 | XWNtrainl = XWNtrainl[:,order] 320 | XWNtrainr = XWNtrainr[:,order] 321 | XWNtraino = XWNtraino[:,order] 322 | XWNtrainln = XWNtrainln[:,order] 323 | XWNtrainrn = XWNtrainrn[:,order] 324 | XWNtrainon = XWNtrainon[:,order] 325 | 326 | # ------------------ 327 | rows,cols = WNtestl.nonzero() 328 | idxtl = rows[numpy.argsort(cols)] 329 | rows,cols = WNtestr.nonzero() 330 | idxtr = rows[numpy.argsort(cols)] 331 | rows,cols = WNtesto.nonzero() 332 | idxto = rows[numpy.argsort(cols)] 333 | 334 | rows,cols = WNvall.nonzero() 335 | idxvl = rows[numpy.argsort(cols)] 336 | rows,cols = WNvalr.nonzero() 337 | idxvr = rows[numpy.argsort(cols)] 338 | rows,cols = WNvalo.nonzero() 339 | idxvo = rows[numpy.argsort(cols)] 340 | 341 | if not state.loadmodel: 342 | # operators 343 | if state.operator == 'Id': 344 | leftop = Id() 345 | rightop = Id() 346 | elif state.operator == 'linear': 347 | leftop = Layercomb(numpy.random, 'lin', state.ndim, state.ndim, state.ndim) 348 | rightop = Layercomb(numpy.random, 'lin', state.ndim, state.ndim, state.ndim) 349 | elif state.operator == 'mlp': 350 | leftop = MLP(numpy.random, 'sigm', state.ndim, state.ndim, (3*state.ndim)/2, state.ndim) 351 | rightop = MLP(numpy.random, 'sigm', state.ndim, state.ndim, (3*state.ndim)/2, state.ndim) 352 | elif state.operator == 'quad': 353 | leftop = Quadlayer(numpy.random, state.ndim, state.ndim, (3*state.ndim)/2, state.ndim) 354 | rightop = Quadlayer(numpy.random, state.ndim, state.ndim, (3*state.ndim)/2, state.ndim) 355 | if state.simfnstr == 'MLP': 356 | MLPout = MLP(numpy.random, 'sigm', state.ndim, state.ndim, state.ndim, 1) 357 | # embeddings 358 | embeddings = Embedd(numpy.random,numpy.max(lemme2idx.values())+1,state.ndim) 359 | else: 360 | f = open(state.loadmodel) 361 | embeddings = cPickle.load(f) 362 | leftop = cPickle.load(f) 363 | rightop = cPickle.load(f) 364 | if state.simfnstr == 'MLP': 365 | MLPout = cPickle.load(f) 366 | f.close() 367 | dictparam = cPickle.load(open(state.loadmodel[:-4]+'dict.pkl')) 368 | 369 | if state.simfnstr == 'MLP': 370 | simfn = MLPout 371 | else: 372 | simfn = eval(state.simfnstr+'sim') 373 | 374 | 375 | # train function 376 | ftwn = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margewn, relb = state.relb) 377 | ftwnl = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margewnl, relb = state.relb) 378 | ftwnsl = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margewnsl, relb = state.relb) 379 | ftcn = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margecn, relb = state.relb) 380 | ftwk = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margewk, relb = state.relb) 381 | ftwks = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margewks, relb = state.relb) 382 | ftxwn = TrainFunction(simfn,embeddings,leftop,rightop, marge = state.margexwn, relb = state.relb) 383 | vt = BatchValidFunction(simfn,embeddings,leftop,rightop) 384 | 385 | # simi function 386 | # for the right Word Net 387 | sl = SimilarityFunctionleft(simfn,embeddings,leftop,rightop,subtensorspec = numpy.max(synset2idx.values())+1) 388 | sr = SimilarityFunctionright(simfn,embeddings,leftop,rightop,subtensorspec = 
numpy.max(synset2idx.values())+1) 389 | srl = SimilarityFunctionrightl(simfn,embeddings,leftop,rightop) 390 | sll = SimilarityFunctionleftl(simfn,embeddings,leftop,rightop) 391 | sol = SimilarityFunctionrell(simfn,embeddings,leftop,rightop) 392 | leftopid = Id() 393 | rightopid = Id() 394 | Esim = SimilarityFunctionrightl(L2sim,embeddings,leftopid,rightopid) 395 | 396 | if 'epochl' in dictparam.keys(): 397 | ct = dictparam['epochl'][-1] 398 | else: 399 | ct = 0 400 | dictparam['epochl'] = [] 401 | dictparam['lrembl'] = [] 402 | dictparam['lrparaml'] = [] 403 | 404 | dictparam['WNallmean'] = [] 405 | dictparam['WNallstd'] = [] 406 | dictparam['WNleftmean'] = [] 407 | dictparam['WNleftstd'] = [] 408 | dictparam['WNrightmean'] = [] 409 | dictparam['WNrightstd'] = [] 410 | dictparam['WNrelamean'] = [] 411 | dictparam['WNrelastd'] = [] 412 | dictparam['WNallbmean'] = [] 413 | dictparam['WNallbstd'] = [] 414 | dictparam['WNleftbmean'] = [] 415 | dictparam['WNleftbstd'] = [] 416 | dictparam['WNrightbmean'] = [] 417 | dictparam['WNrightbstd'] = [] 418 | dictparam['WNrelabmean'] = [] 419 | dictparam['WNrelabstd'] = [] 420 | 421 | dictparam['WNlallmean'] = [] 422 | dictparam['WNlallstd'] = [] 423 | dictparam['WNlleftmean'] = [] 424 | dictparam['WNlleftstd'] = [] 425 | dictparam['WNlrightmean'] = [] 426 | dictparam['WNlrightstd'] = [] 427 | dictparam['WNlrelamean'] = [] 428 | dictparam['WNlrelastd'] = [] 429 | dictparam['WNlallbmean'] = [] 430 | dictparam['WNlallbstd'] = [] 431 | dictparam['WNlleftbmean'] = [] 432 | dictparam['WNlleftbstd'] = [] 433 | dictparam['WNlrightbmean'] = [] 434 | dictparam['WNlrightbstd'] = [] 435 | dictparam['WNlrelabmean'] = [] 436 | dictparam['WNlrelabstd'] = [] 437 | 438 | dictparam['WNslallmean'] = [] 439 | dictparam['WNslallstd'] = [] 440 | dictparam['WNslleftmean'] = [] 441 | dictparam['WNslleftstd'] = [] 442 | dictparam['WNslrightmean'] = [] 443 | dictparam['WNslrightstd'] = [] 444 | dictparam['WNslrelamean'] = [] 445 | dictparam['WNslrelastd'] = [] 446 | dictparam['WNslallbmean'] = [] 447 | dictparam['WNslallbstd'] = [] 448 | dictparam['WNslleftbmean'] = [] 449 | dictparam['WNslleftbstd'] = [] 450 | dictparam['WNslrightbmean'] = [] 451 | dictparam['WNslrightbstd'] = [] 452 | dictparam['WNslrelabmean'] = [] 453 | dictparam['WNslrelabstd'] = [] 454 | 455 | dictparam['CNallmean'] = [] 456 | dictparam['CNallstd'] = [] 457 | dictparam['CNleftmean'] = [] 458 | dictparam['CNleftstd'] = [] 459 | dictparam['CNrightmean'] = [] 460 | dictparam['CNrightstd'] = [] 461 | dictparam['CNrelamean'] = [] 462 | dictparam['CNrelastd'] = [] 463 | dictparam['CNallbmean'] = [] 464 | dictparam['CNallbstd'] = [] 465 | dictparam['CNleftbmean'] = [] 466 | dictparam['CNleftbstd'] = [] 467 | dictparam['CNrightbmean'] = [] 468 | dictparam['CNrightbstd'] = [] 469 | dictparam['CNrelabmean'] = [] 470 | dictparam['CNrelabstd'] = [] 471 | 472 | dictparam['WKallmean'] = [] 473 | dictparam['WKallstd'] = [] 474 | dictparam['WKleftmean'] = [] 475 | dictparam['WKleftstd'] = [] 476 | dictparam['WKrightmean'] = [] 477 | dictparam['WKrightstd'] = [] 478 | dictparam['WKrelamean'] = [] 479 | dictparam['WKrelastd'] = [] 480 | dictparam['WKallbmean'] = [] 481 | dictparam['WKallbstd'] = [] 482 | dictparam['WKleftbmean'] = [] 483 | dictparam['WKleftbstd'] = [] 484 | dictparam['WKrightbmean'] = [] 485 | dictparam['WKrightbstd'] = [] 486 | dictparam['WKrelabmean'] = [] 487 | dictparam['WKrelabstd'] = [] 488 | 489 | dictparam['WKsallmean'] = [] 490 | dictparam['WKsallstd'] = [] 491 | dictparam['WKsleftmean'] = [] 492 
| dictparam['WKsleftstd'] = [] 493 | dictparam['WKsrightmean'] = [] 494 | dictparam['WKsrightstd'] = [] 495 | dictparam['WKsrelamean'] = [] 496 | dictparam['WKsrelastd'] = [] 497 | dictparam['WKsallbmean'] = [] 498 | dictparam['WKsallbstd'] = [] 499 | dictparam['WKsleftbmean'] = [] 500 | dictparam['WKsleftbstd'] = [] 501 | dictparam['WKsrightbmean'] = [] 502 | dictparam['WKsrightbstd'] = [] 503 | dictparam['WKsrelabmean'] = [] 504 | dictparam['WKsrelabstd'] = [] 505 | 506 | dictparam['XWNallmean'] = [] 507 | dictparam['XWNallstd'] = [] 508 | dictparam['XWNleftmean'] = [] 509 | dictparam['XWNleftstd'] = [] 510 | dictparam['XWNrightmean'] = [] 511 | dictparam['XWNrightstd'] = [] 512 | dictparam['XWNrelamean'] = [] 513 | dictparam['XWNrelastd'] = [] 514 | dictparam['XWNallbmean'] = [] 515 | dictparam['XWNallbstd'] = [] 516 | dictparam['XWNleftbmean'] = [] 517 | dictparam['XWNleftbstd'] = [] 518 | dictparam['XWNrightbmean'] = [] 519 | dictparam['XWNrightbstd'] = [] 520 | dictparam['XWNrelabmean'] = [] 521 | dictparam['XWNrelabstd'] = [] 522 | 523 | dictparam['WNval'] = [] 524 | dictparam['WNtes'] = [] 525 | dictparam['WKval'] = [] 526 | dictparam['WKvalb'] = [] 527 | dictparam['WKvalm'] = [] 528 | dictparam['WKsval'] = [] 529 | dictparam['WKsvalb'] = [] 530 | dictparam['WKsvalm'] = [] 531 | dictparam['XWNval'] = [] 532 | dictparam['XWNvalb'] = [] 533 | dictparam['XWNvalm'] = [] 534 | 535 | 536 | WNleft = [] 537 | WNright = [] 538 | WNrela = [] 539 | WNleftb = [] 540 | WNrightb = [] 541 | WNrelab = [] 542 | 543 | WNlleft = [] 544 | WNlright = [] 545 | WNlrela = [] 546 | WNlleftb = [] 547 | WNlrightb = [] 548 | WNlrelab = [] 549 | 550 | WNslleft = [] 551 | WNslright = [] 552 | WNslrela = [] 553 | WNslleftb = [] 554 | WNslrightb = [] 555 | WNslrelab = [] 556 | 557 | CNleft = [] 558 | CNright = [] 559 | CNrela = [] 560 | CNleftb = [] 561 | CNrightb = [] 562 | CNrelab = [] 563 | 564 | WKleft = [] 565 | WKright = [] 566 | WKrela = [] 567 | WKleftb = [] 568 | WKrightb = [] 569 | WKrelab = [] 570 | 571 | WKsleft = [] 572 | WKsright = [] 573 | WKsrela = [] 574 | WKsleftb = [] 575 | WKsrightb = [] 576 | WKsrelab = [] 577 | 578 | XWNleft = [] 579 | XWNright = [] 580 | XWNrela = [] 581 | XWNleftb = [] 582 | XWNrightb = [] 583 | XWNrelab = [] 584 | 585 | 586 | state.bestWNval = -1 587 | state.bestWNtes = -1 588 | state.bestWKval = -1 589 | state.bestWKvalm = -1 590 | state.bestWKvalb = -1 591 | state.bestWKsval = -1 592 | state.bestWKsvalm = -1 593 | state.bestWKsvalb = -1 594 | state.bestXWNval = -1 595 | state.bestXWNvalm = -1 596 | state.bestXWNvalb = -1 597 | 598 | M = WNtrainl.shape[1]/state.nbatches 599 | WNlbatch = WNltrainl.shape[1] / M 600 | WNlbatchct=0 601 | WNslbatch = WNsltrainl.shape[1] / M 602 | WNslbatchct=0 603 | CNbatch = CNtrainl.shape[1] / M 604 | CNbatchct=0 605 | WKbatch = WKtrainl.shape[1] / M 606 | WKbatchct=0 607 | WKnegbatch = 20 608 | WKnegbatchct = 0 609 | WKsbatch = WKstrainl.shape[1] / M 610 | WKsbatchct=0 611 | XWNbatch = XWNtrainl.shape[1] / M 612 | XWNbatchct=0 613 | 614 | ref = time.time() 615 | print >> sys.stderr, "BEGIN TRAINING" 616 | for ccc in range(state.totbatch): 617 | for i in range(state.nbatches): 618 | if state.updateWN: 619 | if ct > state.begindeclr: 620 | resl = ftwn(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),WNtrainl[:,i*M:(i+1)*M],WNtrainr[:,i*M:(i+1)*M],WNtraino[:,i*M:(i+1)*M],WNtrainln[:,i*M:(i+1)*M],WNtrainrn[:,i*M:(i+1)*M],WNtrainon[:,i*M:(i+1)*M]) 621 | else: 
622 | resl = ftwn(state.lrparam/float(M),state.lremb,WNtrainl[:,i*M:(i+1)*M],WNtrainr[:,i*M:(i+1)*M],WNtraino[:,i*M:(i+1)*M],WNtrainln[:,i*M:(i+1)*M],WNtrainrn[:,i*M:(i+1)*M],WNtrainon[:,i*M:(i+1)*M]) 623 | WNleft += [resl[1]/float(M)] 624 | WNright += [resl[2]/float(M)] 625 | WNrela += [resl[3]/float(M)] 626 | WNleftb += [resl[5]/float(M)] 627 | WNrightb += [resl[6]/float(M)] 628 | WNrelab += [resl[7]/float(M)] 629 | embeddings.norma() 630 | 631 | if state.updateWNl: 632 | if WNlbatchct == WNlbatch: 633 | WNltrainln = WNltrainln[:,numpy.random.permutation(WNltrainln.shape[1])] 634 | WNltrainrn = WNltrainrn[:,numpy.random.permutation(WNltrainln.shape[1])] 635 | WNltrainon = WNltrainon[:,numpy.random.permutation(WNltrainln.shape[1])] 636 | neworder = numpy.random.permutation(WNltrainln.shape[1]) 637 | WNltrainl = WNltrainl[:,neworder] 638 | WNltrainr = WNltrainr[:,neworder] 639 | WNltraino = WNltraino[:,neworder] 640 | WNlbatchct = 0 641 | if ct > state.begindeclr: 642 | resl = ftwnl(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),WNltrainl[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainr[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltraino[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainln[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainrn[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainon[:,WNlbatchct*M:(WNlbatchct+1)*M]) 643 | else: 644 | resl = ftwnl(state.lrparam/float(M),state.lremb,WNltrainl[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainr[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltraino[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainln[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainrn[:,WNlbatchct*M:(WNlbatchct+1)*M],WNltrainon[:,WNlbatchct*M:(WNlbatchct+1)*M]) 645 | WNlleft += [resl[1]/float(M)] 646 | WNlright += [resl[2]/float(M)] 647 | WNlrela += [resl[3]/float(M)] 648 | WNlleftb += [resl[5]/float(M)] 649 | WNlrightb += [resl[6]/float(M)] 650 | WNlrelab += [resl[7]/float(M)] 651 | embeddings.norma() 652 | WNlbatchct += 1 653 | 654 | if state.updateWNsl: 655 | if WNslbatchct == WNslbatch: 656 | WNsltrainln = WNsltrainln[:,numpy.random.permutation(WNsltrainln.shape[1])] 657 | WNsltrainrn = WNsltrainrn[:,numpy.random.permutation(WNsltrainln.shape[1])] 658 | WNsltrainon = WNsltrainon[:,numpy.random.permutation(WNsltrainln.shape[1])] 659 | neworder = numpy.random.permutation(WNsltrainln.shape[1]) 660 | WNsltrainl = WNsltrainl[:,neworder] 661 | WNsltrainr = WNsltrainr[:,neworder] 662 | WNsltraino = WNsltraino[:,neworder] 663 | WNslbatchct = 0 664 | if ct > state.begindeclr: 665 | resl = ftwnsl(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),WNsltrainl[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainr[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltraino[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainln[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainrn[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainon[:,WNslbatchct*M:(WNslbatchct+1)*M]) 666 | else: 667 | resl = ftwnsl(state.lrparam/float(M),state.lremb,WNsltrainl[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainr[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltraino[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainln[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainrn[:,WNslbatchct*M:(WNslbatchct+1)*M],WNsltrainon[:,WNslbatchct*M:(WNslbatchct+1)*M]) 668 | WNslleft += [resl[1]/float(M)] 669 | WNslright += [resl[2]/float(M)] 670 | WNslrela += [resl[3]/float(M)] 671 | WNslleftb += [resl[5]/float(M)] 672 | WNslrightb += [resl[6]/float(M)] 673 | WNslrelab += [resl[7]/float(M)] 
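            # ------------------------------------------------------------------
            # Editor's annotation (not in the original script): every ft* call
            # in this loop applies the same 1/t decay once epoch ct passes
            # state.begindeclr,
            #     lr_emb(ct)   = lremb / (1 + ratdeclr*(ct - begindeclr))
            #     lr_param(ct) = lrparam / ((1 + ratdeclr*(ct - begindeclr)) * M)
            # and lrparam/M, lremb before that. E.g. lremb=0.01, ratdeclr=0.1,
            # begindeclr=100 gives an embedding rate of 0.01/11 ~ 9.1e-4 at
            # epoch 200.
            # ------------------------------------------------------------------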
                embeddings.norma()
                WNslbatchct += 1

            if state.updateCN:
                if CNbatchct == CNbatch:
                    # exhausted the ConceptNet batches: reshuffle positives and negatives
                    CNtrainln = CNtrainln[:,numpy.random.permutation(CNtrainln.shape[1])]
                    CNtrainrn = CNtrainrn[:,numpy.random.permutation(CNtrainln.shape[1])]
                    CNtrainon = CNtrainon[:,numpy.random.permutation(CNtrainln.shape[1])]
                    neworder = numpy.random.permutation(CNtrainln.shape[1])
                    CNtrainl = CNtrainl[:,neworder]
                    CNtrainr = CNtrainr[:,neworder]
                    CNtraino = CNtraino[:,neworder]
                    CNbatchct = 0
                if ct > state.begindeclr:
                    resl = ftcn(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),CNtrainl[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainr[:,CNbatchct*M:(CNbatchct+1)*M],CNtraino[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainln[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainrn[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainon[:,CNbatchct*M:(CNbatchct+1)*M])
                else:
                    resl = ftcn(state.lrparam/float(M),state.lremb,CNtrainl[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainr[:,CNbatchct*M:(CNbatchct+1)*M],CNtraino[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainln[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainrn[:,CNbatchct*M:(CNbatchct+1)*M],CNtrainon[:,CNbatchct*M:(CNbatchct+1)*M])
                CNleft += [resl[1]/float(M)]
                CNright += [resl[2]/float(M)]
                CNrela += [resl[3]/float(M)]
                CNleftb += [resl[5]/float(M)]
                CNrightb += [resl[6]/float(M)]
                CNrelab += [resl[7]/float(M)]
                embeddings.norma()
                CNbatchct += 1

            if state.updateWK:
                if WKnegbatchct == WKnegbatch:
                    WKtrainln = WKtrainln[:,numpy.random.permutation(WKtrainln.shape[1])]
                    WKtrainrn = WKtrainrn[:,numpy.random.permutation(WKtrainln.shape[1])]
                    WKtrainon = WKtrainon[:,numpy.random.permutation(WKtrainln.shape[1])]
                    WKnegbatchct = 0
                if WKbatchct == WKbatch:
                    neworder = numpy.random.permutation(WKtrainl.shape[1])
                    WKtrainl = WKtrainl[:,neworder]
                    WKtrainr = WKtrainr[:,neworder]
                    WKtraino = WKtraino[:,neworder]
                    WKbatchct = 0
                if ct > state.begindeclr:
                    resl = ftwk(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),WKtrainl[:,WKbatchct*M:(WKbatchct+1)*M],WKtrainr[:,WKbatchct*M:(WKbatchct+1)*M],WKtraino[:,WKbatchct*M:(WKbatchct+1)*M],WKtrainln[:,WKnegbatchct*M:(WKnegbatchct+1)*M],WKtrainrn[:,WKnegbatchct*M:(WKnegbatchct+1)*M],WKtrainon[:,WKnegbatchct*M:(WKnegbatchct+1)*M])
                else:
                    resl = ftwk(state.lrparam/float(M),state.lremb,WKtrainl[:,WKbatchct*M:(WKbatchct+1)*M],WKtrainr[:,WKbatchct*M:(WKbatchct+1)*M],WKtraino[:,WKbatchct*M:(WKbatchct+1)*M],WKtrainln[:,WKnegbatchct*M:(WKnegbatchct+1)*M],WKtrainrn[:,WKnegbatchct*M:(WKnegbatchct+1)*M],WKtrainon[:,WKnegbatchct*M:(WKnegbatchct+1)*M])
                WKleft += [resl[1]/float(M)]
                WKright += [resl[2]/float(M)]
                WKrela += [resl[3]/float(M)]
                WKleftb += [resl[5]/float(M)]
                WKrightb += [resl[6]/float(M)]
                WKrelab += [resl[7]/float(M)]
                embeddings.norma()
                WKbatchct += 1
                # negatives advance on their own cycle of WKnegbatch minibatches
                WKnegbatchct += 1

            if state.updateWKs:
                if WKsbatchct == WKsbatch:
                    neworder = numpy.random.permutation(WKstrainln.shape[1])
                    WKstrainln = WKstrainln[:,neworder]
                    WKstrainrn = WKstrainrn[:,neworder]
                    WKstrainon = WKstrainon[:,neworder]
                    WKstrainl = WKstrainl[:,neworder]
                    WKstrainr = WKstrainr[:,neworder]
                    WKstraino = WKstraino[:,neworder]
                    WKsbatchct = 0
                if ct > 
state.begindeclr: 737 | resl = ftwks(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),WKstrainl[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainr[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstraino[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainln[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainrn[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainon[:,WKsbatchct*M:(WKsbatchct+1)*M]) 738 | else: 739 | resl = ftwks(state.lrparam/float(M),state.lremb,WKstrainl[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainr[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstraino[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainln[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainrn[:,WKsbatchct*M:(WKsbatchct+1)*M],WKstrainon[:,WKsbatchct*M:(WKsbatchct+1)*M]) 740 | WKsleft += [resl[1]/float(M)] 741 | WKsright += [resl[2]/float(M)] 742 | WKsrela += [resl[3]/float(M)] 743 | WKsleftb += [resl[5]/float(M)] 744 | WKsrightb += [resl[6]/float(M)] 745 | WKsrelab += [resl[7]/float(M)] 746 | embeddings.norma() 747 | WKsbatchct += 1 748 | 749 | if state.updateXWN: 750 | if XWNbatchct == XWNbatch: 751 | neworder = numpy.random.permutation(XWNtrainln.shape[1]) 752 | XWNtrainln = XWNtrainln[:,neworder] 753 | XWNtrainrn = XWNtrainrn[:,neworder] 754 | XWNtrainon = XWNtrainon[:,neworder] 755 | XWNtrainl = XWNtrainl[:,neworder] 756 | XWNtrainr = XWNtrainr[:,neworder] 757 | XWNtraino = XWNtraino[:,neworder] 758 | XWNbatchct = 0 759 | if ct > state.begindeclr: 760 | resl = ftxwn(state.lrparam/float((1+state.ratdeclr * (ct-state.begindeclr))*float(M)),state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr)),XWNtrainl[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainr[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtraino[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainln[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainrn[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainon[:,XWNbatchct*M:(XWNbatchct+1)*M]) 761 | else: 762 | resl = ftxwn(state.lrparam/float(M),state.lremb,XWNtrainl[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainr[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtraino[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainln[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainrn[:,XWNbatchct*M:(XWNbatchct+1)*M],XWNtrainon[:,XWNbatchct*M:(XWNbatchct+1)*M]) 763 | XWNleft += [resl[1]/float(M)] 764 | XWNright += [resl[2]/float(M)] 765 | XWNrela += [resl[3]/float(M)] 766 | XWNleftb += [resl[5]/float(M)] 767 | XWNrightb += [resl[6]/float(M)] 768 | XWNrelab += [resl[7]/float(M)] 769 | embeddings.norma() 770 | XWNbatchct += 1 771 | 772 | order = numpy.random.permutation(WNtrainl.shape[1]) 773 | WNtrainl = WNtrainl[:,order] 774 | WNtrainr = WNtrainr[:,order] 775 | WNtraino = WNtraino[:,order] 776 | WNtrainln = WNtrainln[:,numpy.random.permutation(WNtrainln.shape[1])] 777 | WNtrainrn = WNtrainrn[:,numpy.random.permutation(WNtrainln.shape[1])] 778 | WNtrainon = WNtrainon[:,numpy.random.permutation(WNtrainln.shape[1])] 779 | ct = ct + 1 780 | print >> sys.stderr, "FINISHED EPOCH %s --- current time: %s"%(ct,time.time()-ref) 781 | if ct/float(state.testall) == ct / state.testall: 782 | txt = '' 783 | txt += '------ Epoch %s ------ lr emb: %s ------ lr param: %s ------ time spent: %s\n'%(ct,state.lremb/float(1+state.ratdeclr*(ct-state.begindeclr)),state.lrparam/float(1+state.ratdeclr*(ct-state.begindeclr)),time.time()-ref) 784 | if state.updateWN: 785 | txt += 'WN\n' 786 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WNleft+WNright+WNrela), numpy.std(WNleft+WNright+WNrela),numpy.mean(WNleft),numpy.std(WNleft),numpy.mean(WNright), numpy.std(WNright),numpy.mean(WNrela), numpy.std(WNrela)) 787 | txt 
+= '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WNleftb+WNrightb+WNrelab), numpy.std(WNleftb+WNrightb+WNrelab),numpy.mean(WNleftb),numpy.std(WNleftb),numpy.mean(WNrightb), numpy.std(WNrightb),numpy.mean(WNrelab), numpy.std(WNrelab)) 788 | if state.updateWNl: 789 | txt += 'WNl\n' 790 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WNlleft+WNlright+WNlrela), numpy.std(WNlleft+WNlright+WNlrela),numpy.mean(WNlleft),numpy.std(WNlleft),numpy.mean(WNlright), numpy.std(WNlright),numpy.mean(WNlrela), numpy.std(WNlrela)) 791 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WNlleftb+WNlrightb+WNlrelab), numpy.std(WNlleftb+WNlrightb+WNlrelab),numpy.mean(WNlleftb),numpy.std(WNlleftb),numpy.mean(WNlrightb), numpy.std(WNlrightb),numpy.mean(WNlrelab), numpy.std(WNlrelab)) 792 | if state.updateWNsl: 793 | txt += 'WNsl\n' 794 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WNslleft+WNslright+WNslrela), numpy.std(WNslleft+WNslright+WNslrela),numpy.mean(WNslleft),numpy.std(WNslleft),numpy.mean(WNslright), numpy.std(WNslright),numpy.mean(WNslrela), numpy.std(WNslrela)) 795 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WNslleftb+WNslrightb+WNslrelab), numpy.std(WNslleftb+WNslrightb+WNslrelab),numpy.mean(WNslleftb),numpy.std(WNslleftb),numpy.mean(WNslrightb), numpy.std(WNslrightb),numpy.mean(WNslrelab), numpy.std(WNslrelab)) 796 | if state.updateCN: 797 | txt += 'CN\n' 798 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(CNleft+CNright+CNrela), numpy.std(CNleft+CNright+CNrela),numpy.mean(CNleft),numpy.std(CNleft),numpy.mean(CNright), numpy.std(CNright),numpy.mean(CNrela), numpy.std(CNrela)) 799 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(CNleftb+CNrightb+CNrelab), numpy.std(CNleftb+CNrightb+CNrelab),numpy.mean(CNleftb),numpy.std(CNleftb),numpy.mean(CNrightb), numpy.std(CNrightb),numpy.mean(CNrelab), numpy.std(CNrelab)) 800 | if state.updateWK: 801 | txt += 'WK\n' 802 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WKleft+WKright+WKrela), numpy.std(WKleft+WKright+WKrela),numpy.mean(WKleft),numpy.std(WKleft),numpy.mean(WKright), numpy.std(WKright),numpy.mean(WKrela), numpy.std(WKrela)) 803 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WKleftb+WKrightb+WKrelab), numpy.std(WKleftb+WKrightb+WKrelab),numpy.mean(WKleftb),numpy.std(WKleftb),numpy.mean(WKrightb), numpy.std(WKrightb),numpy.mean(WKrelab), numpy.std(WKrelab)) 804 | if state.updateWKs: 805 | txt += 'WKs\n' 806 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WKsleft+WKsright+WKsrela), numpy.std(WKsleft+WKsright+WKsrela),numpy.mean(WKsleft),numpy.std(WKsleft),numpy.mean(WKsright), numpy.std(WKsright),numpy.mean(WKsrela), numpy.std(WKsrela)) 807 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(WKsleftb+WKsrightb+WKsrelab), numpy.std(WKsleftb+WKsrightb+WKsrelab),numpy.mean(WKsleftb),numpy.std(WKsleftb),numpy.mean(WKsrightb), numpy.std(WKsrightb),numpy.mean(WKsrelab), numpy.std(WKsrelab)) 808 | if state.updateXWN: 809 | txt += 'XWN\n' 810 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(XWNleft+XWNright+XWNrela), numpy.std(XWNleft+XWNright+XWNrela),numpy.mean(XWNleft),numpy.std(XWNleft),numpy.mean(XWNright), numpy.std(XWNright),numpy.mean(XWNrela), numpy.std(XWNrela)) 811 | txt += '%s %s %s %s %s %s %s %s\n'%(numpy.mean(XWNleftb+XWNrightb+XWNrelab), numpy.std(XWNleftb+XWNrightb+XWNrelab),numpy.mean(XWNleftb),numpy.std(XWNleftb),numpy.mean(XWNrightb), numpy.std(XWNrightb),numpy.mean(XWNrelab), numpy.std(XWNrelab)) 812 | 813 | dictparam['epochl'] += [ct] 814 | if ct > state.begindeclr: 815 | dictparam['lrparaml'] += [state.lrparam/float(1+state.ratdeclr * 
(ct-state.begindeclr))] 816 | dictparam['lrembl'] += [state.lremb/float(1+state.ratdeclr * (ct-state.begindeclr))] 817 | else: 818 | dictparam['lrparaml'] += [state.lrparam] 819 | dictparam['lrembl'] += [state.lremb] 820 | if state.updateWN: 821 | dictparam['WNallmean'] += [numpy.mean(WNleft+WNright+WNrela)] 822 | dictparam['WNallstd'] += [numpy.std(WNleft+WNright+WNrela)] 823 | dictparam['WNleftmean'] += [numpy.mean(WNleft)] 824 | dictparam['WNleftstd'] += [numpy.std(WNleft)] 825 | dictparam['WNrightmean'] += [numpy.mean(WNright)] 826 | dictparam['WNrightstd'] += [numpy.std(WNright)] 827 | dictparam['WNrelamean'] += [numpy.mean(WNrela)] 828 | dictparam['WNrelastd'] += [numpy.std(WNrela)] 829 | dictparam['WNallbmean'] += [numpy.mean(WNleftb+WNrightb+WNrelab)] 830 | dictparam['WNallbstd'] += [numpy.std(WNleftb+WNrightb+WNrelab)] 831 | dictparam['WNleftbmean'] += [numpy.mean(WNleftb)] 832 | dictparam['WNleftbstd'] += [numpy.std(WNleftb)] 833 | dictparam['WNrightbmean'] += [numpy.mean(WNrightb)] 834 | dictparam['WNrightbstd'] += [numpy.std(WNrightb)] 835 | dictparam['WNrelabmean'] += [numpy.mean(WNrelab)] 836 | dictparam['WNrelabstd'] += [numpy.std(WNrelab)] 837 | state.WNallmean = numpy.mean(WNleft+WNright+WNrela) 838 | state.WNallmeanb = numpy.mean(WNleftb+WNrightb+WNrelab) 839 | if state.updateWNl: 840 | dictparam['WNlallmean'] += [numpy.mean(WNlleft+WNlright+WNlrela)] 841 | dictparam['WNlallstd'] += [numpy.std(WNlleft+WNlright+WNlrela)] 842 | dictparam['WNlleftmean'] += [numpy.mean(WNlleft)] 843 | dictparam['WNlleftstd'] += [numpy.std(WNlleft)] 844 | dictparam['WNlrightmean'] += [numpy.mean(WNlright)] 845 | dictparam['WNlrightstd'] += [numpy.std(WNlright)] 846 | dictparam['WNlrelamean'] += [numpy.mean(WNlrela)] 847 | dictparam['WNlrelastd'] += [numpy.std(WNlrela)] 848 | dictparam['WNlallbmean'] += [numpy.mean(WNlleftb+WNlrightb+WNlrelab)] 849 | dictparam['WNlallbstd'] += [numpy.std(WNlleftb+WNlrightb+WNlrelab)] 850 | dictparam['WNlleftbmean'] += [numpy.mean(WNlleftb)] 851 | dictparam['WNlleftbstd'] += [numpy.std(WNlleftb)] 852 | dictparam['WNlrightbmean'] += [numpy.mean(WNlrightb)] 853 | dictparam['WNlrightbstd'] += [numpy.std(WNlrightb)] 854 | dictparam['WNlrelabmean'] += [numpy.mean(WNlrelab)] 855 | dictparam['WNlrelabstd'] += [numpy.std(WNlrelab)] 856 | state.WNlallmean = numpy.mean(WNlleft+WNlright+WNlrela) 857 | state.WNlallmeanb = numpy.mean(WNlleftb+WNlrightb+WNlrelab) 858 | if state.updateWNsl: 859 | dictparam['WNslallmean'] += [numpy.mean(WNslleft+WNslright+WNslrela)] 860 | dictparam['WNslallstd'] += [numpy.std(WNslleft+WNslright+WNslrela)] 861 | dictparam['WNslleftmean'] += [numpy.mean(WNslleft)] 862 | dictparam['WNslleftstd'] += [numpy.std(WNslleft)] 863 | dictparam['WNslrightmean'] += [numpy.mean(WNslright)] 864 | dictparam['WNslrightstd'] += [numpy.std(WNslright)] 865 | dictparam['WNslrelamean'] += [numpy.mean(WNslrela)] 866 | dictparam['WNslrelastd'] += [numpy.std(WNslrela)] 867 | dictparam['WNslallbmean'] += [numpy.mean(WNslleftb+WNslrightb+WNslrelab)] 868 | dictparam['WNslallbstd'] += [numpy.std(WNslleftb+WNslrightb+WNslrelab)] 869 | dictparam['WNslleftbmean'] += [numpy.mean(WNslleftb)] 870 | dictparam['WNslleftbstd'] += [numpy.std(WNslleftb)] 871 | dictparam['WNslrightbmean'] += [numpy.mean(WNslrightb)] 872 | dictparam['WNslrightbstd'] += [numpy.std(WNslrightb)] 873 | dictparam['WNslrelabmean'] += [numpy.mean(WNslrelab)] 874 | dictparam['WNslrelabstd'] += [numpy.std(WNslrelab)] 875 | state.WNslallmean = numpy.mean(WNslleft+WNslright+WNslrela) 876 | state.WNslallmeanb = 
numpy.mean(WNslleftb+WNslrightb+WNslrelab) 877 | if state.updateCN: 878 | dictparam['CNallmean'] += [numpy.mean(CNleft+CNright+CNrela)] 879 | dictparam['CNallstd'] += [numpy.std(CNleft+CNright+CNrela)] 880 | dictparam['CNleftmean'] += [numpy.mean(CNleft)] 881 | dictparam['CNleftstd'] += [numpy.std(CNleft)] 882 | dictparam['CNrightmean'] += [numpy.mean(CNright)] 883 | dictparam['CNrightstd'] += [numpy.std(CNright)] 884 | dictparam['CNrelamean'] += [numpy.mean(CNrela)] 885 | dictparam['CNrelastd'] += [numpy.std(CNrela)] 886 | dictparam['CNallbmean'] += [numpy.mean(CNleftb+CNrightb+CNrelab)] 887 | dictparam['CNallbstd'] += [numpy.std(CNleftb+CNrightb+CNrelab)] 888 | dictparam['CNleftbmean'] += [numpy.mean(CNleftb)] 889 | dictparam['CNleftbstd'] += [numpy.std(CNleftb)] 890 | dictparam['CNrightbmean'] += [numpy.mean(CNrightb)] 891 | dictparam['CNrightbstd'] += [numpy.std(CNrightb)] 892 | dictparam['CNrelabmean'] += [numpy.mean(CNrelab)] 893 | dictparam['CNrelabstd'] += [numpy.std(CNrelab)] 894 | state.CNallmean = numpy.mean(CNleft+CNright+CNrela) 895 | state.CNallmeanb = numpy.mean(CNleftb+CNrightb+CNrelab) 896 | if state.updateWK: 897 | dictparam['WKallmean'] += [numpy.mean(WKleft+WKright+WKrela)] 898 | dictparam['WKallstd'] += [numpy.std(WKleft+WKright+WKrela)] 899 | dictparam['WKleftmean'] += [numpy.mean(WKleft)] 900 | dictparam['WKleftstd'] += [numpy.std(WKleft)] 901 | dictparam['WKrightmean'] += [numpy.mean(WKright)] 902 | dictparam['WKrightstd'] += [numpy.std(WKright)] 903 | dictparam['WKrelamean'] += [numpy.mean(WKrela)] 904 | dictparam['WKrelastd'] += [numpy.std(WKrela)] 905 | dictparam['WKallbmean'] += [numpy.mean(WKleftb+WKrightb+WKrelab)] 906 | dictparam['WKallbstd'] += [numpy.std(WKleftb+WKrightb+WKrelab)] 907 | dictparam['WKleftbmean'] += [numpy.mean(WKleftb)] 908 | dictparam['WKleftbstd'] += [numpy.std(WKleftb)] 909 | dictparam['WKrightbmean'] += [numpy.mean(WKrightb)] 910 | dictparam['WKrightbstd'] += [numpy.std(WKrightb)] 911 | dictparam['WKrelabmean'] += [numpy.mean(WKrelab)] 912 | dictparam['WKrelabstd'] += [numpy.std(WKrelab)] 913 | state.WKallmean = numpy.mean(WKleft+WKright+WKrela) 914 | state.WKallmeanb = numpy.mean(WKleftb+WKrightb+WKrelab) 915 | if state.updateWKs: 916 | dictparam['WKsallmean'] += [numpy.mean(WKsleft+WKsright+WKsrela)] 917 | dictparam['WKsallstd'] += [numpy.std(WKsleft+WKsright+WKsrela)] 918 | dictparam['WKsleftmean'] += [numpy.mean(WKsleft)] 919 | dictparam['WKsleftstd'] += [numpy.std(WKsleft)] 920 | dictparam['WKsrightmean'] += [numpy.mean(WKsright)] 921 | dictparam['WKsrightstd'] += [numpy.std(WKsright)] 922 | dictparam['WKsrelamean'] += [numpy.mean(WKsrela)] 923 | dictparam['WKsrelastd'] += [numpy.std(WKsrela)] 924 | dictparam['WKsallbmean'] += [numpy.mean(WKsleftb+WKsrightb+WKsrelab)] 925 | dictparam['WKsallbstd'] += [numpy.std(WKsleftb+WKsrightb+WKsrelab)] 926 | dictparam['WKsleftbmean'] += [numpy.mean(WKsleftb)] 927 | dictparam['WKsleftbstd'] += [numpy.std(WKsleftb)] 928 | dictparam['WKsrightbmean'] += [numpy.mean(WKsrightb)] 929 | dictparam['WKsrightbstd'] += [numpy.std(WKsrightb)] 930 | dictparam['WKsrelabmean'] += [numpy.mean(WKsrelab)] 931 | dictparam['WKsrelabstd'] += [numpy.std(WKsrelab)] 932 | state.WKsallmean = numpy.mean(WKsleft+WKsright+WKsrela) 933 | state.WKsallmeanb = numpy.mean(WKsleftb+WKsrightb+WKsrelab) 934 | if state.updateXWN: 935 | dictparam['XWNallmean'] += [numpy.mean(XWNleft+XWNright+XWNrela)] 936 | dictparam['XWNallstd'] += [numpy.std(XWNleft+XWNright+XWNrela)] 937 | dictparam['XWNleftmean'] += [numpy.mean(XWNleft)] 938 | 
dictparam['XWNleftstd'] += [numpy.std(XWNleft)] 939 | dictparam['XWNrightmean'] += [numpy.mean(XWNright)] 940 | dictparam['XWNrightstd'] += [numpy.std(XWNright)] 941 | dictparam['XWNrelamean'] += [numpy.mean(XWNrela)] 942 | dictparam['XWNrelastd'] += [numpy.std(XWNrela)] 943 | dictparam['XWNallbmean'] += [numpy.mean(XWNleftb+XWNrightb+XWNrelab)] 944 | dictparam['XWNallbstd'] += [numpy.std(XWNleftb+XWNrightb+XWNrelab)] 945 | dictparam['XWNleftbmean'] += [numpy.mean(XWNleftb)] 946 | dictparam['XWNleftbstd'] += [numpy.std(XWNleftb)] 947 | dictparam['XWNrightbmean'] += [numpy.mean(XWNrightb)] 948 | dictparam['XWNrightbstd'] += [numpy.std(XWNrightb)] 949 | dictparam['XWNrelabmean'] += [numpy.mean(XWNrelab)] 950 | dictparam['XWNrelabstd'] += [numpy.std(XWNrelab)] 951 | state.XWNallmean = numpy.mean(XWNleft+XWNright+XWNrela) 952 | state.XWNallmeanb = numpy.mean(XWNleftb+XWNrightb+XWNrelab) 953 | 954 | WNleft = [] 955 | WNright = [] 956 | WNrela = [] 957 | WNleftb = [] 958 | WNrightb = [] 959 | WNrelab = [] 960 | 961 | WNlleft = [] 962 | WNlright = [] 963 | WNlrela = [] 964 | WNlleftb = [] 965 | WNlrightb = [] 966 | WNlrelab = [] 967 | 968 | WNslleft = [] 969 | WNslright = [] 970 | WNslrela = [] 971 | WNslleftb = [] 972 | WNslrightb = [] 973 | WNslrelab = [] 974 | 975 | CNleft = [] 976 | CNright = [] 977 | CNrela = [] 978 | CNleftb = [] 979 | CNrightb = [] 980 | CNrelab = [] 981 | 982 | WKleft = [] 983 | WKright = [] 984 | WKrela = [] 985 | WKleftb = [] 986 | WKrightb = [] 987 | WKrelab = [] 988 | 989 | WKsleft = [] 990 | WKsright = [] 991 | WKsrela = [] 992 | WKsleftb = [] 993 | WKsrightb = [] 994 | WKsrelab = [] 995 | 996 | XWNleft = [] 997 | XWNright = [] 998 | XWNrela = [] 999 | XWNleftb = [] 1000 | XWNrightb = [] 1001 | XWNrelab = [] 1002 | 1003 | if state.updateWN: 1004 | resultt = calctestval(sl,sr,idxtl[:state.nbtest],idxtr[:state.nbtest],idxto[:state.nbtest]) 1005 | resultv = calctestval(sl,sr,idxvl[:state.nbtest],idxvr[:state.nbtest],idxvo[:state.nbtest]) 1006 | state.WNval = resultv[0] 1007 | if state.bestWNval == -1 or state.WNval> sys.stderr, txt 1125 | ref = time.time() 1126 | state.nbupdates = ct * state.nbatches 1127 | state.nbexamples = ct * state.nbatches * M 1128 | state.nbepochs = ct 1129 | channel.save() 1130 | 1131 | -------------------------------------------------------------------------------- /datatools.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import cPickle 3 | 4 | 5 | ### Create lemmes, synsets, indexes, definitions... 
dictionaries ###
#########################################################################################################

f = open('/data/lisa/data/NLU/wordnet3.0-synsets/filtered-data/WordNet3.0-filtered-lemma2synsets+cnts.txt','r')

dat = f.readlines()
f.close()

lemme2synset = {}
lemme2freq = {}

for idx,i in enumerate(dat):
    lemme,synsets,frequence = i[:-1].split('\t')
    if synsets[0] != '_':
        # WordNet entries carry a trailing separator to strip; ConceptNet-style
        # tokens (leading '_') do not
        synsets = synsets[:-1]
        frequence = frequence[:-1]
    synlist = synsets.split(' ')
    freqlist = list(numpy.asarray(frequence.split(' '),dtype='float64'))
    lemme2synset.update({lemme:synlist})
    lemme2freq.update({lemme:freqlist})

# ConceptNet relation types act as their own single-"synset" entries
for i in ['_PropertyOf','_MadeOf','_DefinedAs','_PartOf','_IsA','_UsedFor','_CapableOfReceivingAction','_LocationOf','_SubeventOf','_LastSubeventOf','_PrerequisiteEventOf','_FirstSubeventOf','_EffectOf','_DesirousEffectOf','_DesireOf','_MotivationOf','_CapableOf']:
    lemme2synset.update({i:[i]})
    lemme2freq.update({i:[1.0]})

f = open('lemme2synset.pkl','w')
g = open('lemme2freq.pkl','w')

cPickle.dump(lemme2synset,f,-1)
cPickle.dump(lemme2freq,g,-1)

f.close()
g.close()

f = open('/data/lisa/data/NLU/wordnet3.0-synsets/filtered-data/WordNet3.0-filtered-synset2lemmas.txt','r')
dat = f.readlines()
f.close()

synset2lemme = {}
synset2idx = {}
idx2synset = {}

for idx,i in enumerate(dat):
    synset,lemmes = i[:-1].split('\t')
    lemmes = lemmes[:-1]
    lemmelist = lemmes.split(' ')
    synset2lemme.update({synset:lemmelist})
    synset2idx.update({synset:idx})
    idx2synset.update({idx:synset})

synsetnb = idx+1

# give the ConceptNet relation types (single leading '_') synset indices too
for j in lemme2synset.keys():
    if j[1]!='_':
        synset2lemme.update({j:[j]})
        synset2idx.update({j:synsetnb})
        idx2synset.update({synsetnb:j})
        synsetnb+=1

f = open('synset2lemme.pkl','w')
g = open('synset2idx.pkl','w')
h = open('idx2synset.pkl','w')


cPickle.dump(synset2lemme,f,-1)
cPickle.dump(synset2idx,g,-1)
cPickle.dump(idx2synset,h,-1)

f.close()
g.close()
h.close()

f = open('/data/lisa/data/NLU/wordnet3.0-synsets/filtered-data/WordNet3.0-filtered-synset2definitions.txt','r')
dat = f.readlines()
f.close()

synset2def = {}
synset2concept = {}

for idx,i in enumerate(dat):
    synset,concept,definition = i[:-1].split('\t')
    synset2def.update({synset:definition})
    synset2concept.update({synset:concept})


f = open('synset2def.pkl','w')
g = open('synset2concept.pkl','w')

cPickle.dump(synset2def,f,-1)
cPickle.dump(synset2concept,g,-1)

f.close()
g.close()

f = open('/data/lisa/data/NLU/wordnet3.0-synsets/filtered-data/WordNet3.0-filtered-synset2negative_synsets.txt','r')
dat = f.readlines()
f.close()

synset2neg = {}

for idx,i in enumerate(dat):
    synset,neg = i[:-1].split('\t')
    neg = neg[:-1]
    synset2neg.update({synset:neg.split(' ')})

f = open('synset2neg.pkl','w')

cPickle.dump(synset2neg,f,-1)

f.close()


f = open('/data/lisa/data/NLU/wordnet3.0-synsets/filtered-data/WordNet3.0-filtered-oldname2synset.txt','r')
dat = f.readlines()
f.close()

concept2synset = {}

for idx,i in enumerate(dat):
    concept,synset = i[:-1].split('\t')
    concept2synset.update({concept:synset})

f = open('concept2synset.pkl','w')

cPickle.dump(concept2synset,f,-1)

f.close()


# index ambiguous lemmas after the synsets; an unambiguous lemma shares the
# index of its unique synset
lemme2idx = {}
idx2lemme = {}

ct = synsetnb
for i in lemme2synset.keys():
    if len(lemme2synset[i])>1:
        lemme2idx.update({i:ct})
        idx2lemme.update({ct:i})
        ct+=1
    else:
        lemme2idx.update({i:synset2idx[lemme2synset[i][0]]})

f = open('lemme2idx.pkl','w')
g = open('idx2lemme.pkl','w')

cPickle.dump(lemme2idx,f,-1)
cPickle.dump(idx2lemme,g,-1)

f.close()
g.close()

#########################################################################################################




def parseline(line):
    lhs,rel,rhs = line.split('\t')
    lhs = lhs.split(' ')
    rhs = rhs.split(' ')
    rel = rel.split(' ')
    return lhs,rel,rhs






### Create WordNet3.0 sparse matrices of the lhs, rel and rhs ###
# 3 cases ->
#   -- synsets only (nolemme == 1)
#   -- combinations with one lemma and the other slots as synsets (nolemme == 2)
#   -- lemmas only (nolemme == 3)
#########################################################################################################


if True:
    import scipy.sparse
    numpy.random.seed(753)
    # ignore the following WordNet relations (too few instances of them)
    speciallist = ['_substance_holonym','_attribute','_substance_meronym','_entailment','_cause']
    for nolemme in [1,2,3]:
        if nolemme==1:
            ll = ['train','val','test']
        else:
            ll = ['train']
        for datatyp in ll:
            f = open('/data/lisa/data/NLU/wordnet3.0-synsets/filtered-data/%s-WordNet3.0-filtered-synsets-relations-anto.txt'%datatyp,'r')

            dat = f.readlines()
            f.close()

            # first pass: count the number of columns to allocate
            ct = 0
            for i in dat:
                lhs,rel,rhs = parseline(i[:-1])
                if rel[0] not in speciallist:
                    if nolemme==1 or nolemme==3:
                        ct += 1
                    if nolemme==2:
                        for j in synset2lemme[lhs[0]]:
                            if len(lemme2synset[j])!=1:
                                ct += 1
                        for j in synset2lemme[rel[0]]:
                            if len(lemme2synset[j])!=1:
                                # relation lemmas are never ambiguous
                                assert False
                                ct += 1
                        for j in synset2lemme[rhs[0]]:
                            if len(lemme2synset[j])!=1:
                                ct += 1

            print len(dat),ct

            if datatyp == 'train':
                posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
                posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
                poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
            else:
                posl = scipy.sparse.lil_matrix((numpy.max(synset2idx.values())+1,ct),dtype='float32')
                posr = scipy.sparse.lil_matrix((numpy.max(synset2idx.values())+1,ct),dtype='float32')
                poso = scipy.sparse.lil_matrix((numpy.max(synset2idx.values())+1,ct),dtype='float32')
            # second pass: fill the matrices
            ct = 0
            for i in dat:
                lhs,rel,rhs = parseline(i[:-1])
                if rel[0] not in speciallist:
                    if nolemme==1:
                        posl[synset2idx[lhs[0]],ct]=1
                        posr[synset2idx[rhs[0]],ct]=1
                        poso[synset2idx[rel[0]],ct]=1
                        ct+=1
                    if nolemme==3:
                        # pick one lemma of each synset uniformly at random
                        posl[lemme2idx[synset2lemme[lhs[0]][numpy.random.permutation(len(synset2lemme[lhs[0]]))[0]]],ct]=1
                        posr[lemme2idx[synset2lemme[rhs[0]][numpy.random.permutation(len(synset2lemme[rhs[0]]))[0]]],ct]=1
                        poso[lemme2idx[synset2lemme[rel[0]][numpy.random.permutation(len(synset2lemme[rel[0]]))[0]]],ct]=1
                        ct+=1
                    if nolemme==2:
                        # one instance per ambiguous lemma; the other slots stay synsets
                        for j in synset2lemme[lhs[0]]:
                            if len(lemme2synset[j])!=1:
                                posl[lemme2idx[j],ct]=1
                                posr[synset2idx[rhs[0]],ct]=1
                                poso[synset2idx[rel[0]],ct]=1
                                ct += 1
                        for j in synset2lemme[rel[0]]:
                            if len(lemme2synset[j])!=1:
                                # relation lemmas are never ambiguous
                                assert False
                                posl[synset2idx[lhs[0]],ct]=1
                                posr[synset2idx[rhs[0]],ct]=1
                                poso[lemme2idx[j],ct]=1
                                ct += 1
                        for j in synset2lemme[rhs[0]]:
                            if len(lemme2synset[j])!=1:
                                posr[lemme2idx[j],ct]=1
                                posl[synset2idx[lhs[0]],ct]=1
                                poso[synset2idx[rel[0]],ct]=1
                                ct += 1

            if nolemme==1:
                f = open('WordNet3.0-%s-lhs.pkl'%datatyp,'w')
                g = open('WordNet3.0-%s-rhs.pkl'%datatyp,'w')
                h = open('WordNet3.0-%s-rel.pkl'%datatyp,'w')
            if nolemme==2:
                f = open('WordNet3.0-syle-%s-lhs.pkl'%datatyp,'w')
                g = open('WordNet3.0-syle-%s-rhs.pkl'%datatyp,'w')
                h = open('WordNet3.0-syle-%s-rel.pkl'%datatyp,'w')
            if nolemme==3:
                f = open('WordNet3.0-lemme-%s-lhs.pkl'%datatyp,'w')
                g = open('WordNet3.0-lemme-%s-rhs.pkl'%datatyp,'w')
                h = open('WordNet3.0-lemme-%s-rel.pkl'%datatyp,'w')

            cPickle.dump(posl.tocsr(),f,-1)
            cPickle.dump(posr.tocsr(),g,-1)
            cPickle.dump(poso.tocsr(),h,-1)

            f.close()
            g.close()
            h.close()

#########################################################################################################




### Create sampled Wikipedia sparse matrices of the lhs, rel and rhs ###
# over the 131 triplet files + the unambiguous wiki.
# For each triplet of the 131 files:
#  - we create one instance with lemmas
#  - we create one instance per ambiguous lemma, replacing it by a synset
#    sampled according to the lemma's sense frequencies (see the sampling
#    sketch below)
# For the unambiguous wiki: synsets only.
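# Editor's sketch (illustration, not part of the original pipeline): the
# ambiguous-lemma sampling used below is inverse transform sampling on the
# cumulative sense frequencies; the original (list(cum >= u)).index(True)
# idiom is equivalent to the argmax form shown here. Frequencies are assumed
# normalized, as in lemme2freq.
_freqs = [0.7, 0.2, 0.1]                  # per-sense frequencies of one lemma
_cum = numpy.cumsum(_freqs)               # [0.7, 0.9, 1.0]
_u = numpy.random.uniform()
_idx = int((_cum >= _u).argmax())         # first sense whose cumsum reaches u
assert 0 <= _idx < len(_freqs)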
292 | ######################################################################################################### 293 | 294 | if True: 295 | totalsize = 0 296 | for nbf in range(131): 297 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data/triplets-file-%s.dat'%nbf,'r') 298 | dat = f.readlines() 299 | f.close() 300 | for i in dat: 301 | totalsize +=1 302 | bb = False 303 | lhs,rel,rhs = parseline(i[:-1]) 304 | if lhs[-1]=='': 305 | lhs = lhs[:-1] 306 | if rhs[-1]=='': 307 | rhs = rhs[:-1] 308 | if rel[-1]=='': 309 | rel = rel[:-1] 310 | for j in lhs: 311 | if len(lemme2synset[j])!=1: 312 | bb = True 313 | totalsize+=1 314 | for j in rhs: 315 | if len(lemme2synset[j])!=1: 316 | bb = True 317 | totalsize+=1 318 | for j in rel: 319 | if len(lemme2synset[j])!=1: 320 | bb = True 321 | totalsize+=1 322 | #assert bb 323 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data-synsets/unambiguous-triplets.dat','r') 324 | dat = f.readlines() 325 | totalsize+=len(dat) 326 | 327 | import scipy.sparse 328 | posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,totalsize),dtype='float32') 329 | posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,totalsize),dtype='float32') 330 | poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,totalsize),dtype='float32') 331 | 332 | numpy.random.seed(888) 333 | ct = 0 334 | for nbf in range(131): 335 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data/triplets-file-%s.dat'%nbf,'r') 336 | dat = f.readlines() 337 | print 'triplets-file-%s.dat'%nbf 338 | for i in dat: 339 | lhs,rel,rhs = parseline(i[:-1]) 340 | if lhs[-1]=='': 341 | lhs = lhs[:-1] 342 | if rhs[-1]=='': 343 | rhs = rhs[:-1] 344 | if rel[-1]=='': 345 | rel = rel[:-1] 346 | for i in lhs: 347 | posl[lemme2idx[i],ct]+=1/float(len(lhs)) 348 | for i in rel: 349 | poso[lemme2idx[i],ct]+=1/float(len(rel)) 350 | for i in rhs: 351 | posr[lemme2idx[i],ct]+=1/float(len(rhs)) 352 | ct+=1 353 | for idxtmp,k in enumerate(lhs): 354 | if len(lemme2synset[k])>1: 355 | listfreqtmp = numpy.cumsum(lemme2freq[k]) 356 | idxcc = (list(listfreqtmp >= numpy.random.uniform())).index(True) 357 | l = lemme2synset[k][idxcc] 358 | for j in list(lhs[:idxtmp])+list(lhs[(idxtmp+1):]): 359 | posl[lemme2idx[j],ct]+=1/float(len(lhs)) 360 | posl[synset2idx[l],ct]+=1/float(len(lhs)) 361 | for j in rel: 362 | poso[lemme2idx[j],ct]+=1/float(len(rel)) 363 | for j in rhs: 364 | posr[lemme2idx[j],ct]+=1/float(len(rhs)) 365 | ct+=1 366 | for idxtmp,k in enumerate(rel): 367 | if len(lemme2synset[k])>1: 368 | listfreqtmp = numpy.cumsum(lemme2freq[k]) 369 | idxcc = (list(listfreqtmp >= numpy.random.uniform())).index(True) 370 | l = lemme2synset[k][idxcc] 371 | for j in list(rel[:idxtmp])+list(rel[(idxtmp+1):]): 372 | poso[lemme2idx[j],ct]+=1/float(len(rel)) 373 | poso[synset2idx[l],ct]+=1/float(len(rel)) 374 | for j in lhs: 375 | posl[lemme2idx[j],ct]+=1/float(len(lhs)) 376 | for j in rhs: 377 | posr[lemme2idx[j],ct]+=1/float(len(rhs)) 378 | ct+=1 379 | for idxtmp,k in enumerate(rhs): 380 | if len(lemme2synset[k])>1: 381 | listfreqtmp = numpy.cumsum(lemme2freq[k]) 382 | idxcc = (list(listfreqtmp >= numpy.random.uniform())).index(True) 383 | l = lemme2synset[k][idxcc] 384 | for j in list(rhs[:idxtmp])+list(rhs[(idxtmp+1):]): 385 | posr[lemme2idx[j],ct]+=1/float(len(rhs)) 386 | posr[synset2idx[l],ct]+=1/float(len(rhs)) 387 | for j in rel: 388 | poso[lemme2idx[j],ct]+=1/float(len(rel)) 389 | for j in lhs: 390 | posl[lemme2idx[j],ct]+=1/float(len(lhs)) 391 | ct+=1 392 | 393 | f = 
open('/data/lisa/data/NLU/converted-wikipedia/nlu-data-synsets/unambiguous-triplets.dat','r') 394 | dat = f.readlines() 395 | for i in dat: 396 | lhs,rel,rhs = parseline(i[:-1]) 397 | if lhs[-1]=='': 398 | lhs = lhs[:-1] 399 | if rhs[-1]=='': 400 | rhs = rhs[:-1] 401 | if rel[-1]=='': 402 | rel = rel[:-1] 403 | for j in lhs: 404 | posl[synset2idx[j],ct]+=1/float(len(lhs)) 405 | for j in rhs: 406 | posr[synset2idx[j],ct]+=1/float(len(rhs)) 407 | for j in rel: 408 | poso[synset2idx[j],ct]+=1/float(len(rel)) 409 | ct += 1 410 | 411 | assert ct == totalsize 412 | print "finished" 413 | numpy.random.seed(999) 414 | neworder = numpy.random.permutation(totalsize) 415 | 416 | poso = (poso.tocsr())[:,neworder] 417 | posl = (posl.tocsr())[:,neworder] 418 | posr = (posr.tocsr())[:,neworder] 419 | 420 | f = open('Wikisample-lhs.pkl','w') 421 | g = open('Wikisample-rhs.pkl','w') 422 | h = open('Wikisample-rel.pkl','w') 423 | 424 | cPickle.dump(posl,f,-1) 425 | cPickle.dump(posr,g,-1) 426 | cPickle.dump(poso,h,-1) 427 | 428 | f.close() 429 | g.close() 430 | h.close() 431 | 432 | 433 | ######################################################################################################### 434 | 435 | 436 | ### Create Wikipedia sparse matrices of the lhs, rel and rhs ### 437 | # choosing one synset randomly (following the frequences) for all ambiguous lemmas 438 | # over the 130 files + the unambiguous wiki 439 | ######################################################################################################### 440 | 441 | 442 | if True: 443 | totalsize = 0 444 | for nbf in range(131): 445 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data/triplets-file-%s.dat'%nbf,'r') 446 | dat = f.readlines() 447 | f.close() 448 | totalsize += len(dat) 449 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data-synsets/unambiguous-triplets.dat','r') 450 | dat = f.readlines() 451 | totalsize+=len(dat) 452 | 453 | import scipy.sparse 454 | posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,totalsize),dtype='float32') 455 | posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,totalsize),dtype='float32') 456 | poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,totalsize),dtype='float32') 457 | 458 | numpy.random.seed(888) 459 | ct = 0 460 | for nbf in range(131): 461 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data/triplets-file-%s.dat'%nbf,'r') 462 | dat = f.readlines() 463 | print 'triplets-file-%s.dat'%nbf 464 | for i in dat: 465 | lhs,rel,rhs = parseline(i[:-1]) 466 | if lhs[-1]=='': 467 | lhs = lhs[:-1] 468 | if rhs[-1]=='': 469 | rhs = rhs[:-1] 470 | if rel[-1]=='': 471 | rel = rel[:-1] 472 | for i in lhs: 473 | if len(lemme2synset[i])>1: 474 | listfreqtmp = numpy.cumsum(lemme2freq[i]) 475 | idxcc = (list(listfreqtmp >= numpy.random.uniform())).index(True) 476 | posl[synset2idx[lemme2synset[i][idxcc]],ct]+=1/float(len(lhs)) 477 | else: 478 | posl[lemme2idx[i],ct]+=1/float(len(lhs)) 479 | for i in rel: 480 | if len(lemme2synset[i])>1: 481 | listfreqtmp = numpy.cumsum(lemme2freq[i]) 482 | idxcc = (list(listfreqtmp >= numpy.random.uniform())).index(True) 483 | poso[synset2idx[lemme2synset[i][idxcc]],ct]+=1/float(len(rel)) 484 | else: 485 | poso[lemme2idx[i],ct]+=1/float(len(rel)) 486 | for i in rhs: 487 | if len(lemme2synset[i])>1: 488 | listfreqtmp = numpy.cumsum(lemme2freq[i]) 489 | idxcc = (list(listfreqtmp >= numpy.random.uniform())).index(True) 490 | posr[synset2idx[lemme2synset[i][idxcc]],ct]+=1/float(len(rhs)) 491 | else: 492 | 
posr[lemme2idx[i],ct]+=1/float(len(rhs)) 493 | ct+=1 494 | 495 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data-synsets/unambiguous-triplets.dat','r') 496 | dat = f.readlines() 497 | for i in dat: 498 | lhs,rel,rhs = parseline(i[:-1]) 499 | if lhs[-1]=='': 500 | lhs = lhs[:-1] 501 | if rhs[-1]=='': 502 | rhs = rhs[:-1] 503 | if rel[-1]=='': 504 | rel = rel[:-1] 505 | for j in lhs: 506 | posl[synset2idx[j],ct]+=1/float(len(lhs)) 507 | for j in rhs: 508 | posr[synset2idx[j],ct]+=1/float(len(rhs)) 509 | for j in rel: 510 | poso[synset2idx[j],ct]+=1/float(len(rel)) 511 | ct += 1 512 | 513 | assert ct == totalsize 514 | print "finished" 515 | numpy.random.seed(999) 516 | neworder = numpy.random.permutation(totalsize) 517 | 518 | poso = (poso.tocsr())[:,neworder] 519 | posl = (posl.tocsr())[:,neworder] 520 | posr = (posr.tocsr())[:,neworder] 521 | 522 | f = open('Wikisamplesy-lhs.pkl','w') 523 | g = open('Wikisamplesy-rhs.pkl','w') 524 | h = open('Wikisamplesy-rel.pkl','w') 525 | 526 | cPickle.dump(posl,f,-1) 527 | cPickle.dump(posr,g,-1) 528 | cPickle.dump(poso,h,-1) 529 | 530 | f.close() 531 | g.close() 532 | h.close() 533 | 534 | 535 | ######################################################################################################### 536 | 537 | 538 | ### Create Supervised Wikipedia (using N. Usunier trick) sparse matrices of the lhs, rel and rhs ### 539 | ######################################################################################################### 540 | 541 | if True: 542 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data-synsets/lessambiguous-triplets.dat','r') 543 | dat = f.readlines() 544 | 545 | ct = 0 546 | 547 | for i in dat: 548 | onlyonesynset = True 549 | lhs,rel,rhs = parseline(i[:-1]) 550 | if lhs[-1]=='': 551 | lhs = lhs[:-1] 552 | if rhs[-1]=='': 553 | rhs = rhs[:-1] 554 | if rel[-1]=='': 555 | rel = rel[:-1] 556 | for j in lhs: 557 | if j[0]!='_': 558 | assert onlyonesynset 559 | onlyonesynset = False 560 | ct+=len(synset2neg[j]) 561 | for j in rhs: 562 | if j[0]!='_': 563 | assert onlyonesynset 564 | onlyonesynset = False 565 | ct+=len(synset2neg[j]) 566 | for j in rel: 567 | if j[0]!='_': 568 | assert onlyonesynset 569 | onlyonesynset = False 570 | ct+=len(synset2neg[j]) 571 | 572 | import scipy.sparse 573 | 574 | posln = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 575 | posrn = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 576 | poson = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 577 | posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 578 | posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 579 | poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 580 | print len(dat),ct 581 | 582 | 583 | f = open('/data/lisa/data/NLU/converted-wikipedia/nlu-data-synsets/lessambiguous-triplets.dat','r') 584 | dat = f.readlines() 585 | 586 | numpy.random.seed(222) 587 | neworder = numpy.random.permutation(len(dat)) 588 | currentidx = 0 589 | numpy.random.seed(666) 590 | for iii in neworder: 591 | i = dat[iii] 592 | onlyonesynset = True 593 | lhs,rel,rhs = parseline(i[:-1]) 594 | if lhs[-1]=='': 595 | lhs = lhs[:-1] 596 | if rhs[-1]=='': 597 | rhs = rhs[:-1] 598 | if rel[-1]=='': 599 | rel = rel[:-1] 600 | for j in lhs: 601 | if j[0]!='_': 602 | assert onlyonesynset 603 | onlyonesynset = False 604 | for ll in synset2neg[j]: 605 | for k in lhs: 606 | if 
k[0]=='_': 607 | posln[lemme2idx[k],currentidx]+=1/float(len(lhs)) 608 | posl[lemme2idx[k],currentidx]+=1/float(len(lhs)) 609 | else: 610 | posln[synset2idx[ll],currentidx]+=1/float(len(lhs)) 611 | posl[synset2idx[k],currentidx]+=1/float(len(lhs)) 612 | for k in rhs: 613 | posrn[lemme2idx[k],currentidx]+=1/float(len(rhs)) 614 | posr[lemme2idx[k],currentidx]+=1/float(len(rhs)) 615 | for k in rel: 616 | poson[lemme2idx[k],currentidx]+=1/float(len(rel)) 617 | poso[lemme2idx[k],currentidx]+=1/float(len(rel)) 618 | currentidx+=1 619 | for j in rhs: 620 | if j[0]!='_': 621 | assert onlyonesynset 622 | onlyonesynset = False 623 | for ll in synset2neg[j]: 624 | for k in rhs: 625 | if k[0]=='_': 626 | posrn[lemme2idx[k],currentidx]+=1/float(len(rhs)) 627 | posr[lemme2idx[k],currentidx]+=1/float(len(rhs)) 628 | else: 629 | posrn[synset2idx[ll],currentidx]+=1/float(len(rhs)) 630 | posr[synset2idx[k],currentidx]+=1/float(len(rhs)) 631 | for k in lhs: 632 | posln[lemme2idx[k],currentidx]+=1/float(len(lhs)) 633 | posl[lemme2idx[k],currentidx]+=1/float(len(lhs)) 634 | for k in rel: 635 | poson[lemme2idx[k],currentidx]+=1/float(len(rel)) 636 | poso[lemme2idx[k],currentidx]+=1/float(len(rel)) 637 | currentidx+=1 638 | for j in rel: 639 | if j[0]!='_': 640 | assert onlyonesynset 641 | onlyonesynset = False 642 | for ll in synset2neg[j]: 643 | for k in rel: 644 | if k[0]=='_': 645 | poson[lemme2idx[k],currentidx]+=1/float(len(rel)) 646 | poso[lemme2idx[k],currentidx]+=1/float(len(rel)) 647 | else: 648 | poson[synset2idx[ll],currentidx]+=1/float(len(rel)) 649 | poso[synset2idx[k],currentidx]+=1/float(len(rel)) 650 | for k in rhs: 651 | posrn[lemme2idx[k],currentidx]+=1/float(len(rhs)) 652 | posr[lemme2idx[k],currentidx]+=1/float(len(rhs)) 653 | for k in lhs: 654 | posln[lemme2idx[k],currentidx]+=1/float(len(lhs)) 655 | posl[lemme2idx[k],currentidx]+=1/float(len(lhs)) 656 | currentidx+=1 657 | 658 | assert currentidx == ct 659 | 660 | #neworder = numpy.random.permutation(ct) 661 | 662 | poso = (poso.tocsr())#[:,neworder] 663 | posl = (posl.tocsr())#[:,neworder] 664 | posr = (posr.tocsr())#[:,neworder] 665 | poson = (poson.tocsr())#[:,neworder] 666 | posln = (posln.tocsr())#[:,neworder] 667 | posrn = (posrn.tocsr())#[:,neworder] 668 | 669 | f = open('Wikisuper-lhs.pkl','w') 670 | g = open('Wikisuper-rhs.pkl','w') 671 | h = open('Wikisuper-rel.pkl','w') 672 | i = open('Wikisuper-lhsn.pkl','w') 673 | j = open('Wikisuper-rhsn.pkl','w') 674 | k = open('Wikisuper-reln.pkl','w') 675 | 676 | cPickle.dump(posl,f,-1) 677 | cPickle.dump(posr,g,-1) 678 | cPickle.dump(poso,h,-1) 679 | cPickle.dump(posln,i,-1) 680 | cPickle.dump(posrn,j,-1) 681 | cPickle.dump(poson,k,-1) 682 | 683 | f.close() 684 | g.close() 685 | h.close() 686 | i.close() 687 | j.close() 688 | k.close() 689 | 690 | ######################################################################################################### 691 | 692 | 693 | ### Create the WSD test set for the Brown corpus ### 694 | ### lhs,rel,rhs -> the data with all the possible synset configurations for unambiguous lemmas 695 | ### dict -> dictionnary of the form {index_beginning:index_end,...} (for each ambiguous lemma to disambiguate). 
696 | ### lab -> vector of length = number of instances to score (1 correspond to the real synset, 0 elsewhere) 697 | ### freq -> vector of frequencies of the synsets in consideration (with respect to the given lemme) 698 | ######################################################################################################### 699 | 700 | if True: 701 | f = open('/data/lisa/data/NLU/semcor3.0/brown-synsets/Brown-filtered-triplets-unambiguous-lemmas.dat','r') 702 | g = open('/data/lisa/data/NLU/semcor3.0/brown-synsets/Brown-filtered-triplets-unambiguous-synsets.dat','r') 703 | 704 | dat1 = f.readlines() 705 | f.close() 706 | dat2 = g.readlines() 707 | g.close() 708 | 709 | missed = 0 710 | ct = 0 711 | for i,k in zip(dat1,dat2): 712 | lhs,rel,rhs = parseline(i[:-1]) 713 | lhsr,relr,rhsr = parseline(k[:-1]) 714 | for j in lhs: 715 | if len(lemme2synset[j])>1: 716 | ct += (len(lemme2synset[j])) 717 | else: 718 | missed += 1 719 | j = rel[0] 720 | if len(lemme2synset[j])>1: 721 | ct += len(lemme2synset[j]) 722 | else: 723 | missed += 1 724 | for j in rhs: 725 | if len(lemme2synset[j])>1: 726 | ct += len(lemme2synset[j]) 727 | else: 728 | missed += 1 729 | print ct,missed 730 | 731 | import scipy.sparse 732 | posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 733 | posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 734 | poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 735 | 736 | idxcurrent = 0 737 | idxex = 0 738 | dictidx ={} 739 | freqlist = [] 740 | label = [] 741 | for i,k in zip(dat1,dat2): 742 | lhs,rel,rhs = parseline(i[:-1]) 743 | lhsr,relr,rhsr = parseline(k[:-1]) 744 | 745 | for idxtmp,k in enumerate(lhs): 746 | if len(lemme2synset[k])>1: 747 | dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),lhsr[idxtmp] in synset2neg.keys())}) 748 | for l,ff in zip(lemme2synset[k],lemme2freq[k]): 749 | for j in list(lhs[:idxtmp])+list(lhs[(idxtmp+1):]): 750 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 751 | posl[synset2idx[l],idxcurrent]+=1/float(len(lhs)) 752 | freqlist+=[ff] 753 | if l == lhsr[idxtmp]: 754 | label += [1] 755 | else: 756 | label += [0] 757 | j = rel[0] 758 | poso[lemme2idx[j],idxcurrent]+=1 759 | for j in rhs: 760 | posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 761 | idxcurrent+=1 762 | idxex+=1 763 | k = rel[0] 764 | if len(lemme2synset[k])>1: 765 | dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),relr[0] in synset2neg.keys())}) 766 | for l,ff in zip(lemme2synset[k],lemme2freq[k]): 767 | for j in lhs: 768 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 769 | poso[synset2idx[l],idxcurrent]+=1 770 | freqlist+=[ff] 771 | if l == relr[0]: 772 | label += [1] 773 | else: 774 | label += [0] 775 | for j in rhs: 776 | posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 777 | idxcurrent+=1 778 | idxex+=1 779 | 780 | for idxtmp,k in enumerate(rhs): 781 | if len(lemme2synset[k])>1: 782 | dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),rhsr[idxtmp] in synset2neg.keys())}) 783 | for l,ff in zip(lemme2synset[k],lemme2freq[k]): 784 | for j in list(rhs[:idxtmp])+list(rhs[(idxtmp+1):]): 785 | posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 786 | posr[synset2idx[l],idxcurrent]+=1/float(len(rhs)) 787 | freqlist+=[ff] 788 | if l == rhsr[idxtmp]: 789 | label += [1] 790 | else: 791 | label += [0] 792 | j = rel[0] 793 | poso[lemme2idx[j],idxcurrent]+=1 794 | for j in lhs: 795 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 796 | 
idxcurrent+=1 797 | idxex+=1 798 | 799 | print idxcurrent,idxex,len(freqlist),len(dictidx),len(label),sum(label) 800 | f = open('Brown-WSD-lhs.pkl','w') 801 | g = open('Brown-WSD-rhs.pkl','w') 802 | h = open('Brown-WSD-rel.pkl','w') 803 | i = open('Brown-WSD-dict.pkl','w') 804 | j = open('Brown-WSD-lab.pkl','w') 805 | k = open('Brown-WSD-freq.pkl','w') 806 | 807 | cPickle.dump(posl,f,-1) 808 | f.close() 809 | cPickle.dump(posr,g,-1) 810 | g.close() 811 | cPickle.dump(poso,h,-1) 812 | h.close() 813 | cPickle.dump(dictidx,i,-1) 814 | i.close() 815 | cPickle.dump(label,j,-1) 816 | j.close() 817 | cPickle.dump(freqlist,k,-1) 818 | k.close() 819 | 820 | ### Also create the normal Brown corpus lhs,rel,rhs sparse matrices ### 821 | ### lemme: only lemmas 822 | ### synset: only synset 823 | ### corres: fill the sparse matrices in the following way -> mat[lemmeidx,instanceidx]=synsetidx 824 | ### to keep track of the correspondences 825 | ######################################################################################################### 826 | 827 | import scipy.sparse 828 | poslS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 829 | posrS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 830 | posoS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 831 | poslL = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 832 | posrL = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 833 | posoL = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 834 | poslLS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 835 | posrLS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 836 | posoLS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(dat1)),dtype='float32') 837 | 838 | idxcurrent = 0 839 | for i,k in zip(dat1,dat2): 840 | lhs,rel,rhs = parseline(i[:-1]) 841 | lhsr,relr,rhsr = parseline(k[:-1]) 842 | for j,k in zip(lhs,lhsr): 843 | poslL[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 844 | poslS[synset2idx[k],idxcurrent]+=1/float(len(lhs)) 845 | poslLS[lemme2idx[j],idxcurrent] = synset2idx[k] 846 | j = rel[0] 847 | k = relr[0] 848 | posoL[lemme2idx[j],idxcurrent]+=1 849 | posoS[synset2idx[k],idxcurrent]+=1 850 | posoLS[lemme2idx[j],idxcurrent] = synset2idx[k] 851 | for j,k in zip(rhs,rhsr): 852 | posrL[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 853 | posrS[synset2idx[k],idxcurrent]+=1/float(len(rhs)) 854 | posrLS[lemme2idx[j],idxcurrent] = synset2idx[k] 855 | idxcurrent+=1 856 | 857 | f = open('Brown-lemme-lhs.pkl','w') 858 | g = open('Brown-lemme-rhs.pkl','w') 859 | h = open('Brown-lemme-rel.pkl','w') 860 | i = open('Brown-synset-lhs.pkl','w') 861 | j = open('Brown-synset-rhs.pkl','w') 862 | k = open('Brown-synset-rel.pkl','w') 863 | l = open('Brown-corres-lhs.pkl','w') 864 | m = open('Brown-corres-rhs.pkl','w') 865 | n = open('Brown-corres-rel.pkl','w') 866 | 867 | cPickle.dump(poslL,f,-1) 868 | f.close() 869 | cPickle.dump(posrL,g,-1) 870 | g.close() 871 | cPickle.dump(posoL,h,-1) 872 | h.close() 873 | cPickle.dump(poslS,i,-1) 874 | i.close() 875 | cPickle.dump(posrS,j,-1) 876 | j.close() 877 | cPickle.dump(posoS,k,-1) 878 | k.close() 879 | cPickle.dump(poslLS,l,-1) 880 | l.close() 881 | cPickle.dump(posrLS,m,-1) 882 | m.close() 883 | cPickle.dump(posoLS,n,-1) 884 | n.close() 885 | 886 | 
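# A commented usage sketch (illustrative only, not part of the original pipeline): the Brown
# matrices dumped above can be reloaded with e.g.
#     poslL = cPickle.load(open('Brown-lemme-lhs.pkl','r'))
#     poslS = cPickle.load(open('Brown-synset-lhs.pkl','r'))
#     poslLS = cPickle.load(open('Brown-corres-lhs.pkl','r'))
# each one being a sparse matrix with one column per Brown triplet; see evaluation.py for the
# evaluation procedures that consume these files.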
#########################################################################################################
887 | 
888 | 
889 | 
890 | ### Create the XWN WSD test set with concept names (asked by Antoine) ###
891 | #########################################################################################################
892 | 
893 | if False:
894 |     g = open('/data/lisa/data/NLU/XWN/extended-wordnet-filtered-synsets.txt','r')
895 |     dd = open('/data/lisa/data/NLU/XWN/extended-wordnet-test.txt','w')
896 | 
897 |     dat = g.readlines()
898 |     g.close()
899 |     numpy.random.seed(468)
900 |     order = numpy.random.permutation(len(dat))
901 |     txt = ''
902 |     missed = 0
903 |     ct = 0
904 |     for ii in range(5000):
905 |         k = dat[order[ii]]
906 |         lhsr,relr,rhsr = parseline(k[:-1])
907 |         bf = True
908 |         for j in lhsr:
909 |             if not bf:
910 |                 txt +=' '
911 |             else:
912 |                 bf = False
913 |             txt+= str(synset2concept[j])
914 |         txt+='\t'
915 |         bf = True
916 |         for j in relr:
917 |             if not bf:
918 |                 txt +=' '
919 |             else:
920 |                 bf = False
921 |             txt+= str(synset2concept[j])
922 |         txt+='\t'
923 |         bf = True
924 |         for j in rhsr:
925 |             if not bf:
926 |                 txt +=' '
927 |             else:
928 |                 bf = False
929 |             txt+= str(synset2concept[j])
930 |         txt+='\n'
931 |     dd.write(txt)
932 |     dd.close()
933 | 
934 | #########################################################################################################
935 | 
936 | 
937 | ### Create the WSD test set for the XWN corpus ###
938 | ### lhs,rel,rhs -> the data with all the possible synset configurations for each ambiguous lemma
939 | ### dict -> dictionary of the form {index_beginning:index_end,...} (for each ambiguous lemma to disambiguate).
940 | ### lab -> vector of length = number of instances to score (1 corresponds to the real synset, 0 elsewhere)
941 | ### freq -> vector of frequencies of the synsets in consideration (with respect to the given lemma)
942 | ### uncomment the marked lines to also create data for the model choice (mod) and for a synset differing from both the model choice and the true label (nmod)
943 | ### this requires a previous evaluation of a trained model (see evaluation.py)
944 | #########################################################################################################
945 | 
946 | if True:
947 |     f = open('/data/lisa/data/NLU/XWN/extended-wordnet-filtered-lemmas.txt','r')
948 |     g = open('/data/lisa/data/NLU/XWN/extended-wordnet-filtered-synsets.txt','r')
949 | 
950 |     dat1 = f.readlines()
951 |     f.close()
952 |     dat2 = g.readlines()
953 |     g.close()
954 |     numpy.random.seed(468)
955 |     order = numpy.random.permutation(len(dat1))
956 | 
957 |     missed = 0
958 |     ct = 0
959 |     for ii in range(5000):
960 |         i = dat1[order[ii]]
961 |         k = dat2[order[ii]]
962 |         lhs,rel,rhs = parseline(i[:-1])
963 |         lhsr,relr,rhsr = parseline(k[:-1])
964 |         for j in lhs:
965 |             if len(lemme2synset[j])>1:
966 |                 ct += (len(lemme2synset[j]))
967 |             else:
968 |                 missed += 1
969 |         for j in rel:
970 |             if len(lemme2synset[j])>1:
971 |                 ct += len(lemme2synset[j])
972 |             else:
973 |                 missed += 1
974 |         for j in rhs:
975 |             if len(lemme2synset[j])>1:
976 |                 ct += len(lemme2synset[j])
977 |             else:
978 |                 missed += 1
979 |     print ct,missed
980 | 
981 |     import scipy.sparse
982 |     posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
983 |     posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
984 |     poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
985 |     #poslm = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32')
986 |     #posrm = 
scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 987 | #posom = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 988 | #poslnm = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 989 | #posrnm = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 990 | #posonm = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 991 | 992 | #llmodel = cPickle.load(open('modelpred.pkl')) 993 | #nllmodel = cPickle.load(open('nmodelpred.pkl')) 994 | idxcurrent = 0 995 | idxex = 0 996 | dictidx ={} 997 | freqlist = [] 998 | label = [] 999 | for ii in range(5000): 1000 | i = dat1[order[ii]] 1001 | k = dat2[order[ii]] 1002 | lhs,rel,rhs = parseline(i[:-1]) 1003 | lhsr,relr,rhsr = parseline(k[:-1]) 1004 | 1005 | for idxtmp,k in enumerate(lhs): 1006 | if len(lemme2synset[k])>1: 1007 | #poslm[lemme2idx[k],ii]=synset2idx[lemme2synset[k][llmodel[idxex]]] 1008 | #poslnm[lemme2idx[k],ii]=synset2idx[lemme2synset[k][nllmodel[idxex]]] 1009 | dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),lhsr[idxtmp] in synset2neg.keys())}) 1010 | for l,ff in zip(lemme2synset[k],lemme2freq[k]): 1011 | for j in list(lhs[:idxtmp])+list(lhs[(idxtmp+1):]): 1012 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 1013 | posl[synset2idx[l],idxcurrent]+=1/float(len(lhs)) 1014 | freqlist+=[ff] 1015 | if l == lhsr[idxtmp]: 1016 | label += [1] 1017 | else: 1018 | label += [0] 1019 | for j in rel: 1020 | poso[lemme2idx[j],idxcurrent]+=1/float(len(rel)) 1021 | for j in rhs: 1022 | posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 1023 | idxcurrent+=1 1024 | idxex+=1 1025 | else: 1026 | pass 1027 | #poslm[lemme2idx[k],ii]=lemme2idx[k] 1028 | #poslnm[lemme2idx[k],ii]=lemme2idx[k] 1029 | 1030 | for idxtmp,k in enumerate(rel): 1031 | if len(lemme2synset[k])>1: 1032 | #posom[lemme2idx[k],ii]=synset2idx[lemme2synset[k][llmodel[idxex]]] 1033 | #posonm[lemme2idx[k],ii]=synset2idx[lemme2synset[k][nllmodel[idxex]]] 1034 | dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),relr[idxtmp] in synset2neg.keys())}) 1035 | for l,ff in zip(lemme2synset[k],lemme2freq[k]): 1036 | for j in lhs: 1037 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 1038 | for j in list(rel[:idxtmp])+list(rel[(idxtmp+1):]): 1039 | poso[lemme2idx[j],idxcurrent]+=1/float(len(rel)) 1040 | poso[synset2idx[l],idxcurrent]+=1/float(len(rel)) 1041 | freqlist+=[ff] 1042 | if l == relr[idxtmp]: 1043 | label += [1] 1044 | else: 1045 | label += [0] 1046 | for j in rhs: 1047 | posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 1048 | idxcurrent+=1 1049 | idxex+=1 1050 | else: 1051 | pass 1052 | #posom[lemme2idx[k],ii]=lemme2idx[k] 1053 | #posonm[lemme2idx[k],ii]=lemme2idx[k] 1054 | 1055 | for idxtmp,k in enumerate(rhs): 1056 | if len(lemme2synset[k])>1: 1057 | #posrm[lemme2idx[k],ii]=synset2idx[lemme2synset[k][llmodel[idxex]]] 1058 | #posrnm[lemme2idx[k],ii]=synset2idx[lemme2synset[k][nllmodel[idxex]]] 1059 | dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),rhsr[idxtmp] in synset2neg.keys())}) 1060 | for l,ff in zip(lemme2synset[k],lemme2freq[k]): 1061 | for j in list(rhs[:idxtmp])+list(rhs[(idxtmp+1):]): 1062 | posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 1063 | posr[synset2idx[l],idxcurrent]+=1/float(len(rhs)) 1064 | freqlist+=[ff] 1065 | if l == rhsr[idxtmp]: 1066 | label += [1] 1067 | else: 1068 | label += [0] 1069 | for j in rel: 1070 | poso[lemme2idx[j],idxcurrent]+=1/float(len(rel)) 1071 | for j in 
lhs: 1072 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 1073 | idxcurrent+=1 1074 | idxex+=1 1075 | else: 1076 | pass 1077 | #posrm[lemme2idx[k],ii]=lemme2idx[k] 1078 | #posrnm[lemme2idx[k],ii]=lemme2idx[k] 1079 | 1080 | print idxcurrent,idxex,len(freqlist),len(dictidx),len(label),sum(label) 1081 | f = open('XWN-WSD-lhs.pkl','w') 1082 | g = open('XWN-WSD-rhs.pkl','w') 1083 | h = open('XWN-WSD-rel.pkl','w') 1084 | i = open('XWN-WSD-dict.pkl','w') 1085 | j = open('XWN-WSD-lab.pkl','w') 1086 | k = open('XWN-WSD-freq.pkl','w') 1087 | #l = open('XWN-mod-lhs.pkl','w') 1088 | #m = open('XWN-mod-rhs.pkl','w') 1089 | #n = open('XWN-mod-rel.pkl','w') 1090 | #o = open('XWN-nmod-lhs.pkl','w') 1091 | #p = open('XWN-nmod-rhs.pkl','w') 1092 | #q = open('XWN-nmod-rel.pkl','w') 1093 | cPickle.dump(posl,f,-1) 1094 | f.close() 1095 | cPickle.dump(posr,g,-1) 1096 | g.close() 1097 | cPickle.dump(poso,h,-1) 1098 | h.close() 1099 | cPickle.dump(dictidx,i,-1) 1100 | i.close() 1101 | cPickle.dump(label,j,-1) 1102 | j.close() 1103 | cPickle.dump(freqlist,k,-1) 1104 | k.close() 1105 | #cPickle.dump(poslm,l,-1) 1106 | #l.close() 1107 | #cPickle.dump(posrm,m,-1) 1108 | #m.close() 1109 | #cPickle.dump(posom,n,-1) 1110 | #n.close() 1111 | #cPickle.dump(poslnm,o,-1) 1112 | #o.close() 1113 | #cPickle.dump(posrnm,p,-1) 1114 | #p.close() 1115 | #cPickle.dump(posonm,q,-1) 1116 | #q.close() 1117 | 1118 | 1119 | ######################################################################################################### 1120 | 1121 | 1122 | ### Create the normal XWN test corpus lhs,rel,rhs sparse matrices ### 1123 | ### lemme: only lemmas 1124 | ### synset: only synset 1125 | ### corres: fill the sparse matrices in the following way -> mat[lemmeidx,instanceidx]=synsetidx 1126 | ### to keep track of the correspondences 1127 | ######################################################################################################### 1128 | 1129 | 1130 | if True: 1131 | import scipy.sparse 1132 | poslS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1133 | posrS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1134 | posoS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1135 | poslL = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1136 | posrL = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1137 | posoL = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1138 | poslLS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1139 | posrLS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1140 | posoLS = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,5000),dtype='float32') 1141 | 1142 | idxcurrent = 0 1143 | for ii in range(5000): 1144 | i = dat1[order[ii]] 1145 | k = dat2[order[ii]] 1146 | lhs,rel,rhs = parseline(i[:-1]) 1147 | lhsr,relr,rhsr = parseline(k[:-1]) 1148 | for j,k in zip(lhs,lhsr): 1149 | poslL[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 1150 | poslS[synset2idx[k],idxcurrent]+=1/float(len(lhs)) 1151 | poslLS[lemme2idx[j],idxcurrent] = synset2idx[k] 1152 | for j,k in zip(rel,relr): 1153 | posoL[lemme2idx[j],idxcurrent]+=1/float(len(rel)) 1154 | posoS[synset2idx[k],idxcurrent]+=1/float(len(rel)) 1155 | posoLS[lemme2idx[j],idxcurrent] = synset2idx[k] 1156 | for j,k in zip(rhs,rhsr): 1157 | posrL[lemme2idx[j],idxcurrent]+=1/float(len(rhs)) 1158 | 
posrS[synset2idx[k],idxcurrent]+=1/float(len(rhs)) 1159 | posrLS[lemme2idx[j],idxcurrent] = synset2idx[k] 1160 | idxcurrent+=1 1161 | 1162 | f = open('XWN-lemme-lhs.pkl','w') 1163 | g = open('XWN-lemme-rhs.pkl','w') 1164 | h = open('XWN-lemme-rel.pkl','w') 1165 | i = open('XWN-synset-lhs.pkl','w') 1166 | j = open('XWN-synset-rhs.pkl','w') 1167 | k = open('XWN-synset-rel.pkl','w') 1168 | l = open('XWN-corres-lhs.pkl','w') 1169 | m = open('XWN-corres-rhs.pkl','w') 1170 | n = open('XWN-corres-rel.pkl','w') 1171 | 1172 | cPickle.dump(poslL,f,-1) 1173 | f.close() 1174 | cPickle.dump(posrL,g,-1) 1175 | g.close() 1176 | cPickle.dump(posoL,h,-1) 1177 | h.close() 1178 | cPickle.dump(poslS,i,-1) 1179 | i.close() 1180 | cPickle.dump(posrS,j,-1) 1181 | j.close() 1182 | cPickle.dump(posoS,k,-1) 1183 | k.close() 1184 | cPickle.dump(poslLS,l,-1) 1185 | l.close() 1186 | cPickle.dump(posrLS,m,-1) 1187 | m.close() 1188 | cPickle.dump(posoLS,n,-1) 1189 | n.close() 1190 | 1191 | ### Also create the training XWN corpus lhs,rel,rhs sparse matrices ### 1192 | # Supervised: instances of the type: 1 synset, others as lemmas 1193 | ######################################################################################################### 1194 | 1195 | ct = 0 1196 | for ii in xrange(5000,len(dat1)): 1197 | i = dat1[order[ii]] 1198 | lhs,rel,rhs = parseline(i[:-1]) 1199 | for j in lhs: 1200 | if len(lemme2synset[j])>1: 1201 | ct += len(lemme2synset[j]) - 1 1202 | for j in rhs: 1203 | if len(lemme2synset[j])>1: 1204 | ct += len(lemme2synset[j]) - 1 1205 | for j in rel: 1206 | if len(lemme2synset[j])>1: 1207 | ct += len(lemme2synset[j]) - 1 1208 | import scipy.sparse 1209 | 1210 | posln = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 1211 | posrn = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 1212 | poson = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 1213 | posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 1214 | posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 1215 | poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32') 1216 | print len(dat1),ct 1217 | 1218 | ct = 0 1219 | import copy 1220 | for ii in xrange(5000,len(dat1)): 1221 | i = dat1[order[ii]] 1222 | k = dat2[order[ii]] 1223 | lhs,rel,rhs = parseline(i[:-1]) 1224 | lhsr,relr,rhsr = parseline(k[:-1]) 1225 | for j,l in zip(lhs,lhsr): 1226 | if len(lemme2synset[j])>1: 1227 | lltmp1 = copy.deepcopy(lhs) 1228 | lltmp1.remove(j) 1229 | lltmp2 = copy.deepcopy(lemme2synset[j]) 1230 | lltmp2.remove(l) 1231 | for bb in lltmp2: 1232 | for vv in lltmp1: 1233 | posl[lemme2idx[vv],ct]+=1/float(len(lhs)) 1234 | posln[lemme2idx[vv],ct]+=1/float(len(lhs)) 1235 | posl[synset2idx[l],ct]+=1/float(len(lhs)) 1236 | posln[synset2idx[bb],ct]+=1/float(len(lhs)) 1237 | for vv in rel: 1238 | poso[lemme2idx[vv],ct]+=1/float(len(rel)) 1239 | poson[lemme2idx[vv],ct]+=1/float(len(rel)) 1240 | for vv in rhs: 1241 | posr[lemme2idx[vv],ct]+=1/float(len(rhs)) 1242 | posrn[lemme2idx[vv],ct]+=1/float(len(rhs)) 1243 | ct += 1 1244 | for j,l in zip(rhs,rhsr): 1245 | if len(lemme2synset[j])>1: 1246 | lltmp1 = copy.deepcopy(rhs) 1247 | lltmp1.remove(j) 1248 | lltmp2 = copy.deepcopy(lemme2synset[j]) 1249 | lltmp2.remove(l) 1250 | for bb in lltmp2: 1251 | for vv in lltmp1: 1252 | posr[lemme2idx[vv],ct]+=1/float(len(rhs)) 1253 | posrn[lemme2idx[vv],ct]+=1/float(len(rhs)) 1254 | 
posr[synset2idx[l],ct]+=1/float(len(rhs))
1255 |                 posrn[synset2idx[bb],ct]+=1/float(len(rhs))
1256 |                 for vv in rel:
1257 |                     poso[lemme2idx[vv],ct]+=1/float(len(rel))
1258 |                     poson[lemme2idx[vv],ct]+=1/float(len(rel))
1259 |                 for vv in lhs:
1260 |                     posl[lemme2idx[vv],ct]+=1/float(len(lhs))
1261 |                     posln[lemme2idx[vv],ct]+=1/float(len(lhs))
1262 |                 ct += 1
1263 |     for j,l in zip(rel,relr):
1264 |         if len(lemme2synset[j])>1:
1265 |             lltmp1 = copy.deepcopy(rel)
1266 |             lltmp1.remove(j)
1267 |             lltmp2 = copy.deepcopy(lemme2synset[j])
1268 |             lltmp2.remove(l)
1269 |             for bb in lltmp2:
1270 |                 for vv in lltmp1:
1271 |                     poso[lemme2idx[vv],ct]+=1/float(len(rel))
1272 |                     poson[lemme2idx[vv],ct]+=1/float(len(rel))
1273 |                 poso[synset2idx[l],ct]+=1/float(len(rel))
1274 |                 poson[synset2idx[bb],ct]+=1/float(len(rel))
1275 |                 for vv in lhs:
1276 |                     posl[lemme2idx[vv],ct]+=1/float(len(lhs))
1277 |                     posln[lemme2idx[vv],ct]+=1/float(len(lhs))
1278 |                 for vv in rhs:
1279 |                     posr[lemme2idx[vv],ct]+=1/float(len(rhs))
1280 |                     posrn[lemme2idx[vv],ct]+=1/float(len(rhs))
1281 |                 ct += 1
1282 |     f = open('XWN-lhs.pkl','w')
1283 |     g = open('XWN-rhs.pkl','w')
1284 |     h = open('XWN-rel.pkl','w')
1285 |     i = open('XWN-lhsn.pkl','w')
1286 |     j = open('XWN-rhsn.pkl','w')
1287 |     k = open('XWN-reln.pkl','w')
1288 | 
1289 |     cPickle.dump(posl,f,-1)
1290 |     cPickle.dump(posr,g,-1)
1291 |     cPickle.dump(poso,h,-1)
1292 |     cPickle.dump(posln,i,-1)
1293 |     cPickle.dump(posrn,j,-1)
1294 |     cPickle.dump(poson,k,-1)
1295 | 
1296 |     f.close()
1297 |     g.close()
1298 |     h.close()
1299 |     i.close()
1300 |     j.close()
1301 |     k.close()
1302 | 
1303 | 
1304 | #########################################################################################################
1305 | 
1306 | 
1307 | ### Create the ConceptNet corpus ###
1308 | # Preprocessing with 3 filters:
1309 | #     1 -> all elements have an unambiguous POS.
1310 | #     2 -> at least one element with unambiguous POS for each member (ignoring lemmas with ambiguous POS).
1311 | #     3 -> at least one element for each member, resolving ambiguous POS by taking the most frequent one.
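#
# A commented illustration of the POS lookup implemented below (the outcome depends on which
# '__word_POS' keys exist in lemme2idx): for each lemmatized word w we test '__w_NN', '__w_VB',
# '__w_JJ' and '__w_RB' against lemme2idx:
#     - exactly one tag matches -> w is kept by filters 1 and 2 (ctit == 1)
#     - several tags match      -> w is dropped by filters 1 and 2, but kept by filter 3
#       (filter 3 checks RB, JJ, VB, NN in that order and keeps the last match, so NN wins)
#     - no tag matches          -> w is dropped by all filters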
1312 | #########################################################################################################
1313 | 
1314 | 
1315 | if True:
1316 |     from nltk.stem.wordnet import WordNetLemmatizer
1317 |     lmtzr = WordNetLemmatizer()
1318 |     f = open('/data/lisa/data/NLU/ConceptNet/predicates_concise_nonkline.txt','r')
1319 |     dat = f.readlines()
1320 |     g = open('/data/lisa/data/NLU/ConceptNet/predicates_concise_nonkline.txt','r')  # NB: same file as f, so every predicate ends up twice in dat
1321 |     dat += g.readlines()
1322 |     f.close()
1323 |     g.close()
1324 |     ex = []
1325 |     for i in dat:
1326 |         print len(ex)
1327 |         rel,dum,couple = (i[1:-3]).partition(' ')  # split the relation name from the two quoted members
1328 |         rel = '_'+rel
1329 |         lcouple = couple[1:-1].split('" "')[:-1]
1330 |         lcouple[0] = lcouple[0].split(' ')
1331 |         left = []
1332 |         booltmp = True
1333 |         for j in range(len(lcouple[0])):
1334 |             lcouple[0][j] = lmtzr.lemmatize(lcouple[0][j])
1335 |             ctit = 0
1336 |             name = ''
1337 |             if '__' + lcouple[0][j] + '_NN' in lemme2idx.keys():
1338 |                 ctit += 1
1339 |                 name ='__' + lcouple[0][j] + '_NN'
1340 |             if '__' + lcouple[0][j] + '_VB' in lemme2idx.keys():
1341 |                 ctit += 1
1342 |                 name = '__' + lcouple[0][j] + '_VB'
1343 |             if '__' + lcouple[0][j] + '_JJ' in lemme2idx.keys():
1344 |                 ctit += 1
1345 |                 name = '__' + lcouple[0][j] + '_JJ'
1346 |             if '__' + lcouple[0][j] + '_RB' in lemme2idx.keys():
1347 |                 ctit += 1
1348 |                 name = '__' + lcouple[0][j] + '_RB'
1349 |             if ctit == 1:  # exactly one POS matched -> unambiguous POS
1350 |                 left += [name]
1351 |             else:
1352 |                 booltmp = False
1353 |         #print 'left',lcouple[0],ctit,left
1354 |         lcouple[1] = lcouple[1].split(' ')
1355 |         right =[]
1356 |         for j in range(len(lcouple[1])):
1357 |             lcouple[1][j] = lmtzr.lemmatize(lcouple[1][j])
1358 |             ctit = 0
1359 |             name = ''
1360 |             if '__' + lcouple[1][j] + '_NN' in lemme2idx.keys():
1361 |                 ctit += 1
1362 |                 name ='__' + lcouple[1][j] + '_NN'
1363 |             if '__' + lcouple[1][j] + '_VB' in lemme2idx.keys():
1364 |                 ctit += 1
1365 |                 name = '__' + lcouple[1][j] + '_VB'
1366 |             if '__' + lcouple[1][j] + '_JJ' in lemme2idx.keys():
1367 |                 ctit += 1
1368 |                 name = '__' + lcouple[1][j] + '_JJ'
1369 |             if '__' + lcouple[1][j] + '_RB' in lemme2idx.keys():
1370 |                 ctit += 1
1371 |                 name = '__' + lcouple[1][j] + '_RB'
1372 |             if ctit == 1:
1373 |                 right += [name]
1374 |             else:
1375 |                 booltmp = False
1376 |         #print 'right',lcouple[1],ctit,right
1377 |         if len(left)>=1 and len(right)>=1 and booltmp:
1378 |             ex += [[left,[rel],right]]
1379 |     print ex
1380 |     print numpy.max(lemme2idx.values())+1,len(ex)
1381 |     import scipy.sparse
1382 |     posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1383 |     posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1384 |     poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1385 |     for ct,i in enumerate(ex):
1386 |         for j in i[0]:
1387 |             posl[lemme2idx[j],ct] += 1/float(len(i[0]))
1388 |         for j in i[1]:
1389 |             poso[lemme2idx[j],ct] += 1/float(len(i[1]))
1390 |         for j in i[2]:
1391 |             posr[lemme2idx[j],ct] += 1/float(len(i[2]))
1392 | 
1393 |     f = open('ConceptNet-lhs.pkl','w')
1394 |     g = open('ConceptNet-rhs.pkl','w')
1395 |     h = open('ConceptNet-rel.pkl','w')
1396 | 
1397 |     cPickle.dump(posl,f,-1)
1398 |     f.close()
1399 |     cPickle.dump(posr,g,-1)
1400 |     g.close()
1401 |     cPickle.dump(poso,h,-1)
1402 |     h.close()
1403 | 
1404 | 
1405 | 
1406 | 
1407 | if True:
1408 |     from nltk.stem.wordnet import WordNetLemmatizer
1409 |     lmtzr = WordNetLemmatizer()
1410 |     f = open('/data/lisa/data/NLU/ConceptNet/predicates_concise_nonkline.txt','r')
1411 |     dat = f.readlines()
1412 |     g = 
open('/data/lisa/data/NLU/ConceptNet/predicates_concise_nonkline.txt','r')
1413 |     dat += g.readlines()
1414 |     f.close()
1415 |     g.close()
1416 |     ex = []
1417 |     for i in dat:
1418 |         print len(ex)
1419 |         rel,dum,couple = (i[1:-3]).partition(' ')
1420 |         rel = '_'+rel
1421 |         lcouple = couple[1:-1].split('" "')[:-1]
1422 |         lcouple[0] = lcouple[0].split(' ')
1423 |         left = []
1424 |         booltmp = True
1425 |         for j in range(len(lcouple[0])):
1426 |             lcouple[0][j] = lmtzr.lemmatize(lcouple[0][j])
1427 |             ctit = 0
1428 |             name = ''
1429 |             if '__' + lcouple[0][j] + '_NN' in lemme2idx.keys():
1430 |                 ctit += 1
1431 |                 name ='__' + lcouple[0][j] + '_NN'
1432 |             if '__' + lcouple[0][j] + '_VB' in lemme2idx.keys():
1433 |                 ctit += 1
1434 |                 name = '__' + lcouple[0][j] + '_VB'
1435 |             if '__' + lcouple[0][j] + '_JJ' in lemme2idx.keys():
1436 |                 ctit += 1
1437 |                 name = '__' + lcouple[0][j] + '_JJ'
1438 |             if '__' + lcouple[0][j] + '_RB' in lemme2idx.keys():
1439 |                 ctit += 1
1440 |                 name = '__' + lcouple[0][j] + '_RB'
1441 |             if ctit == 1:
1442 |                 left += [name]
1443 |             else:
1444 |                 booltmp = False
1445 |         #print 'left',lcouple[0],ctit,left
1446 |         lcouple[1] = lcouple[1].split(' ')
1447 |         right =[]
1448 |         for j in range(len(lcouple[1])):
1449 |             lcouple[1][j] = lmtzr.lemmatize(lcouple[1][j])
1450 |             ctit = 0
1451 |             name = ''
1452 |             if '__' + lcouple[1][j] + '_NN' in lemme2idx.keys():
1453 |                 ctit += 1
1454 |                 name ='__' + lcouple[1][j] + '_NN'
1455 |             if '__' + lcouple[1][j] + '_VB' in lemme2idx.keys():
1456 |                 ctit += 1
1457 |                 name = '__' + lcouple[1][j] + '_VB'
1458 |             if '__' + lcouple[1][j] + '_JJ' in lemme2idx.keys():
1459 |                 ctit += 1
1460 |                 name = '__' + lcouple[1][j] + '_JJ'
1461 |             if '__' + lcouple[1][j] + '_RB' in lemme2idx.keys():
1462 |                 ctit += 1
1463 |                 name = '__' + lcouple[1][j] + '_RB'
1464 |             if ctit == 1:
1465 |                 right += [name]
1466 |             else:
1467 |                 booltmp = False
1468 |         #print 'right',lcouple[1],ctit,right
1469 |         if len(left)>=1 and len(right)>=1:
1470 |             ex += [[left,[rel],right]]
1471 |     print ex
1472 |     print numpy.max(lemme2idx.values())+1,len(ex)
1473 |     import scipy.sparse
1474 |     posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1475 |     posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1476 |     poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1477 |     for ct,i in enumerate(ex):
1478 |         for j in i[0]:
1479 |             posl[lemme2idx[j],ct] += 1/float(len(i[0]))
1480 |         for j in i[1]:
1481 |             poso[lemme2idx[j],ct] += 1/float(len(i[1]))
1482 |         for j in i[2]:
1483 |             posr[lemme2idx[j],ct] += 1/float(len(i[2]))
1484 | 
1485 |     f = open('ConceptNet2-lhs.pkl','w')
1486 |     g = open('ConceptNet2-rhs.pkl','w')
1487 |     h = open('ConceptNet2-rel.pkl','w')
1488 | 
1489 |     cPickle.dump(posl,f,-1)
1490 |     f.close()
1491 |     cPickle.dump(posr,g,-1)
1492 |     g.close()
1493 |     cPickle.dump(poso,h,-1)
1494 |     h.close()
1495 | 
1496 | 
1497 | 
1498 | 
1499 | if True:
1500 |     from nltk.stem.wordnet import WordNetLemmatizer
1501 |     lmtzr = WordNetLemmatizer()
1502 |     f = open('/data/lisa/data/NLU/ConceptNet/predicates_concise_nonkline.txt','r')
1503 |     dat = f.readlines()
1504 |     g = open('/data/lisa/data/NLU/ConceptNet/predicates_concise_nonkline.txt','r')
1505 |     dat += g.readlines()
1506 |     f.close()
1507 |     g.close()
1508 |     ex = []
1509 |     for i in dat:
1510 |         print len(ex)
1511 |         rel,dum,couple = (i[1:-3]).partition(' ')
1512 |         rel = '_'+rel
1513 |         lcouple = couple[1:-1].split('" "')[:-1]
1514 |         lcouple[0] = lcouple[0].split(' ')
1515 |         left = []
1516 |         booltmp 
= True
1517 |         for j in range(len(lcouple[0])):
1518 |             lcouple[0][j] = lmtzr.lemmatize(lcouple[0][j])
1519 |             ctit = 0
1520 |             name = ''
1521 |             if '__' + lcouple[0][j] + '_RB' in lemme2idx.keys():
1522 |                 ctit += 1
1523 |                 name ='__' + lcouple[0][j] + '_RB'
1524 |             if '__' + lcouple[0][j] + '_JJ' in lemme2idx.keys():
1525 |                 ctit += 1
1526 |                 name = '__' + lcouple[0][j] + '_JJ'
1527 |             if '__' + lcouple[0][j] + '_VB' in lemme2idx.keys():
1528 |                 ctit += 1
1529 |                 name = '__' + lcouple[0][j] + '_VB'
1530 |             if '__' + lcouple[0][j] + '_NN' in lemme2idx.keys():
1531 |                 ctit += 1
1532 |                 name = '__' + lcouple[0][j] + '_NN'
1533 |             if ctit > 0:  # at least one POS matched; the last match wins, so NN has priority
1534 |                 left += [name]
1535 |             else:
1536 |                 booltmp = False
1537 |         #print 'left',lcouple[0],ctit,left
1538 |         lcouple[1] = lcouple[1].split(' ')
1539 |         right =[]
1540 |         for j in range(len(lcouple[1])):
1541 |             lcouple[1][j] = lmtzr.lemmatize(lcouple[1][j])
1542 |             ctit = 0
1543 |             name = ''
1544 |             if '__' + lcouple[1][j] + '_RB' in lemme2idx.keys():
1545 |                 ctit += 1
1546 |                 name ='__' + lcouple[1][j] + '_RB'
1547 |             if '__' + lcouple[1][j] + '_JJ' in lemme2idx.keys():
1548 |                 ctit += 1
1549 |                 name = '__' + lcouple[1][j] + '_JJ'
1550 |             if '__' + lcouple[1][j] + '_VB' in lemme2idx.keys():
1551 |                 ctit += 1
1552 |                 name = '__' + lcouple[1][j] + '_VB'
1553 |             if '__' + lcouple[1][j] + '_NN' in lemme2idx.keys():
1554 |                 ctit += 1
1555 |                 name = '__' + lcouple[1][j] + '_NN'
1556 |             if ctit > 0:
1557 |                 right += [name]
1558 |             else:
1559 |                 booltmp = False
1560 |         #print 'right',lcouple[1],ctit,right
1561 |         if len(left)>=1 and len(right)>=1:
1562 |             ex += [[left,[rel],right]]
1563 |     print ex
1564 |     print numpy.max(lemme2idx.values())+1,len(ex)
1565 |     import scipy.sparse
1566 |     posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1567 |     posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1568 |     poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,len(ex)),dtype='float32')
1569 |     for ct,i in enumerate(ex):
1570 |         for j in i[0]:
1571 |             posl[lemme2idx[j],ct] += 1/float(len(i[0]))
1572 |         for j in i[1]:
1573 |             poso[lemme2idx[j],ct] += 1/float(len(i[1]))
1574 |         for j in i[2]:
1575 |             posr[lemme2idx[j],ct] += 1/float(len(i[2]))
1576 | 
1577 |     f = open('ConceptNet3-lhs.pkl','w')
1578 |     g = open('ConceptNet3-rhs.pkl','w')
1579 |     h = open('ConceptNet3-rel.pkl','w')
1580 | 
1581 |     cPickle.dump(posl,f,-1)
1582 |     f.close()
1583 |     cPickle.dump(posr,g,-1)
1584 |     g.close()
1585 |     cPickle.dump(poso,h,-1)
1586 |     h.close()
1587 | 
1588 | 
1589 | #########################################################################################################
1590 | 
1591 | 
1592 | ### Create the Senseval3.0 test set ###
1593 | ### lhs,rel,rhs -> the data with all the possible synset configurations for each ambiguous lemma
1594 | ### dict -> dictionary of the form {index_beginning:index_end,...} (for each ambiguous lemma to disambiguate).
1595 | ### lab -> vector of length = number of instances to score (1 corresponds to the real synset, 0 elsewhere)
1596 | ### freq -> vector of frequencies of the synsets in consideration (with respect to the given lemma)
1597 | #########################################################################################################
1598 | 
1599 | 
1600 | 
1601 | 
1602 | if True:
1603 |     f = open('/data/lisa/data/NLU/senseval3/Senseval3-wn3.0-filtered-triplets-unambiguous-lemmas.dat','r')
1604 |     g = open('/data/lisa/data/NLU/senseval3/Senseval3-wn3.0-filtered-triplets-unambiguous-synsets.dat','r')
1605 | 
1606 |     dat1 = f.readlines()
1607 |     f.close()
1608 |     dat2 = g.readlines()
1609 |     g.close()
1610 | 
1611 |     missed = 0
1612 |     ct = 0
1613 |     for i,k in zip(dat1,dat2):
1614 |         lhs,rel,rhs = parseline(i[:-1])
1615 |         lhsr,relr,rhsr = parseline(k[:-1])
1616 |         for j in lhs:
1617 |             if len(lemme2synset[j])>1:
1618 |                 ct += (len(lemme2synset[j]))
1619 |             else:
1620 |                 missed += 1
1621 |         j = rel[0]
1622 |         if len(lemme2synset[j])>1:
1623 |             ct += len(lemme2synset[j])
1624 |         else:
1625 |             missed += 1
1626 |         for j in rhs:
1627 |             if len(lemme2synset[j])>1:
1628 |                 ct += len(lemme2synset[j])
1629 |             else:
1630 |                 missed += 1
1631 |     print ct,missed
1632 | 
1633 |     import scipy.sparse
1634 |     posl = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
1635 |     posr = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
1636 |     poso = scipy.sparse.lil_matrix((numpy.max(lemme2idx.values())+1,ct),dtype='float32')
1637 | 
1638 |     idxcurrent = 0
1639 |     idxex = 0
1640 |     dictidx ={}
1641 |     freqlist = []
1642 |     label = []
1643 |     for i,k in zip(dat1,dat2):
1644 |         lhs,rel,rhs = parseline(i[:-1])
1645 |         lhsr,relr,rhsr = parseline(k[:-1])
1646 | 
1647 |         for idxtmp,k in enumerate(lhs):
1648 |             if len(lemme2synset[k])>1:
1649 |                 dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),lhsr[idxtmp] in synset2neg.keys())})
1650 |                 for l,ff in zip(lemme2synset[k],lemme2freq[k]):
1651 |                     for j in list(lhs[:idxtmp])+list(lhs[(idxtmp+1):]):
1652 |                         posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs))
1653 |                     posl[synset2idx[l],idxcurrent]+=1/float(len(lhs))
1654 |                     freqlist+=[ff]
1655 |                     if l == lhsr[idxtmp]:
1656 |                         label += [1]
1657 |                     else:
1658 |                         label += [0]
1659 |                     j = rel[0]
1660 |                     poso[lemme2idx[j],idxcurrent]+=1
1661 |                     for j in rhs:
1662 |                         posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs))
1663 |                     idxcurrent+=1
1664 |                 idxex+=1
1665 |         k = rel[0]
1666 |         if len(lemme2synset[k])>1:
1667 |             dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),relr[0] in synset2neg.keys())})
1668 |             for l,ff in zip(lemme2synset[k],lemme2freq[k]):
1669 |                 for j in lhs:
1670 |                     posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs))
1671 |                 poso[synset2idx[l],idxcurrent]+=1
1672 |                 freqlist+=[ff]
1673 |                 if l == relr[0]:
1674 |                     label += [1]
1675 |                 else:
1676 |                     label += [0]
1677 |                 for j in rhs:
1678 |                     posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs))
1679 |                 idxcurrent+=1
1680 |             idxex+=1
1681 | 
1682 |         for idxtmp,k in enumerate(rhs):
1683 |             if len(lemme2synset[k])>1:
1684 |                 dictidx.update({idxex:(idxcurrent,idxcurrent+len(lemme2synset[k]),rhsr[idxtmp] in synset2neg.keys())})
1685 |                 for l,ff in zip(lemme2synset[k],lemme2freq[k]):
1686 |                     for j in list(rhs[:idxtmp])+list(rhs[(idxtmp+1):]):
1687 |                         posr[lemme2idx[j],idxcurrent]+=1/float(len(rhs))
1688 |                     posr[synset2idx[l],idxcurrent]+=1/float(len(rhs))
1689 |                     freqlist+=[ff]
1690 |                     if l == rhsr[idxtmp]:
1691 |                         label += [1]
1692 |                     else:
1693 |                         label += [0]
1694 |                     j = rel[0]
1695 | 
poso[lemme2idx[j],idxcurrent]+=1 1696 | for j in lhs: 1697 | posl[lemme2idx[j],idxcurrent]+=1/float(len(lhs)) 1698 | idxcurrent+=1 1699 | idxex+=1 1700 | 1701 | print idxcurrent,idxex,len(freqlist),len(dictidx),len(label),sum(label) 1702 | f = open('Senseval3-WSD-lhs.pkl','w') 1703 | g = open('Senseval3-WSD-rhs.pkl','w') 1704 | h = open('Senseval3-WSD-rel.pkl','w') 1705 | i = open('Senseval3-WSD-dict.pkl','w') 1706 | j = open('Senseval3-WSD-lab.pkl','w') 1707 | k = open('Senseval3-WSD-freq.pkl','w') 1708 | 1709 | cPickle.dump(posl,f,-1) 1710 | f.close() 1711 | cPickle.dump(posr,g,-1) 1712 | g.close() 1713 | cPickle.dump(poso,h,-1) 1714 | h.close() 1715 | cPickle.dump(dictidx,i,-1) 1716 | i.close() 1717 | cPickle.dump(label,j,-1) 1718 | j.close() 1719 | cPickle.dump(freqlist,k,-1) 1720 | k.close() 1721 | --------------------------------------------------------------------------------