├── data └── .keep ├── .spyderproject ├── .gitattributes ├── .gitignore ├── bow.py ├── std_run.sh ├── const.py ├── save.py ├── reparse.py ├── reparseprops.py ├── train.py ├── README.md ├── vecfromtext.py └── basicgrad.py /data/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.spyderproject: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brmson/Sentence-selection/HEAD/.spyderproject -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | 45 | 46 | ############## our things: 47 | 48 | data/jacana/pokus.txt 49 | data/glovewiki.txt 50 | *.pyc 51 | untitled0.py 52 | /.project 53 | /io.py 54 | *.p 55 | *.POSInput 56 | tests.py 57 | data/w2v.txt 58 | /truth.txt 59 | /res.txt 60 | /unigram-Mb.pickle 61 | 62 | # generated intermediate data 63 | /data 64 | -------------------------------------------------------------------------------- /bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | # -*- coding: utf-8 -*- 3 | 4 | from vecfromtext import getGloveDict 5 | import numpy as np 6 | from const import * 7 | 8 | 9 | def prepForGrad(q,a1,a0,ans1,ans0,glovepath2): 10 | """ Bag-of-words based embedding vectors for each sentence. 11 | Returns a matrix with one sentence per row.
""" 12 | gloveDict=getGloveDict(glovepath2) 13 | qa=np.zeros((len(q),GLOVELEN)) 14 | a1a=np.zeros((len(a1),GLOVELEN)) 15 | a0a=np.zeros((len(a0),GLOVELEN)) 16 | for i in range(0,len(q)): 17 | qa[i][:]=boxSentence(q[i],gloveDict) 18 | print 'questions embedded' 19 | for i in range(0,len(a1)): 20 | a1a[i][:]=boxSentence(a1[i],gloveDict) 21 | print 'true answers embedded' 22 | for i in range(0,len(a0)): 23 | a0a[i][:]=boxSentence(a0[i],gloveDict) 24 | print 'false answers embedded' 25 | return (qa,a1a,a0a) 26 | 27 | # Bag-of-words vector for one sentence: average of the GloVe vectors of its in-dictionary words 28 | def boxSentence(sentence,gloveDict): 29 | i=0 30 | v=np.zeros(GLOVELEN) 31 | for word in sentence: 32 | x=gloveDict.get(word) 33 | if x is not None: 34 | v+=x 35 | i+=1 36 | if i!=0: 37 | v=v/i 38 | return v 39 | -------------------------------------------------------------------------------- /std_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # A script to convert a set of (question, sentence, binLabel) tuples 4 | # (where sentence may be a passage or, with -p, property label) 5 | # to a classifier which attempts to predict binLabel from unseen 6 | # (question, sentences) pairs. 7 | # 8 | # Usage: std_run.sh [-p] TRAINDATAPATH 9 | # 10 | # Example: ./std_run.sh -p ../yodaqa/data/ml/embsel/propdata/ 11 | 12 | if [[ -f resources/glove.6B.50d.txt ]] 13 | then 14 | echo "Dictionary already downloaded" 15 | else 16 | echo "Downloading dictionary" 17 | wget http://pasky.or.cz/dev/brmson/glove.6B.50d.txt.gz 18 | gunzip glove.6B.50d.txt.gz 19 | mkdir -p resources 20 | mv glove.6B.50d.txt resources 21 | fi 22 | 23 | 24 | 25 | props=false 26 | if [ "$1" = "-p" ]; then 27 | props=true 28 | shift 29 | fi 30 | path=$1 31 | 32 | # Convert YodaQA-generated data to Jacana-style data 33 | if [ "$props" = true ] 34 | then 35 | echo 'Running property-reparse' 36 | python reparseprops.py "$path" 37 | else 38 | echo 'Running sentence-reparse' 39 | python reparse.py "$path" 40 | fi 41 | 42 | # Convert Jacana-style data to pickled Python data structures 43 | echo 'Running save.py' 44 | python save.py 45 | 46 | # Train and save a classifier on top of the pickled data 47 | echo 'Running train.py' 48 | python train.py 49 | -------------------------------------------------------------------------------- /const.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | GLOVELEN=50 4 | GLOVEPATH='resources/glove.6B.50d.txt' 5 | #GLOVEPATH='data/glove.6B.100d.txt' 6 | ## 7 | #QPATH='data/jacana/Train1-100.Question.POSInput' 8 | #APATH1='data/jacana/Train1-100.Positive-J.POSInput' 9 | #APATH0='data/jacana/Train1-100.Negative-T.POSInput' 10 | QPATH='data/Qtrain.txt' 11 | APATH1='data/Ptrain.txt' 12 | APATH0='data/Ntrain.txt' 13 | CPATH1="data/Clues1train.txt" 14 | CPATH0="data/Clues0train.txt" 15 | #QPATH='data/jacana/Train1-2393.Question.POSInput' 16 | #APATH1='data/jacana/Train1-2393.Positive-M.POSInput' 17 | #APATH0='data/jacana/Train1-2393.Negative-M.POSInput' 18 | #TQPATH='data/jacana/Test.Question.POSInput' 19 | #TAPATH1='data/jacana/Test.Positive-J.POSInput' 20 | #TAPATH0='data/jacana/Test.Negative-T.POSInput' 21 | TQPATH='data/Qtest.txt' 22 | TAPATH1='data/Ptest.txt' 23 | TAPATH0='data/Ntest.txt' 24 | #APATH0='data/Ntrain.txt'  # duplicate of the APATH0 setting above 25 | TCPATH1="data/Clues1test.txt" 26 | TCPATH0="data/Clues0test.txt" 27 | 28 | #GLOVEPATH='data/w2v.txt' 29 | GLOVEPATH2='data/usedembed.txt' 30 | TGLOVEPATH2='data/tusedembed.txt' 31 | PTQA='data/tqarray.txt' 32 |
PTA1A='data/ta1rray.txt' 33 | PTA0A='data/ta0rray.txt' 34 | PTANS1='data/tans1.txt' 35 | PTANS0='data/tans0.txt' 36 | PQA='data/qarray.txt' 37 | PA1A='data/a1rray.txt' 38 | PA0A='data/a0rray.txt' 39 | PANS1='data/ans1.txt' 40 | PANS0='data/ans0.txt' 41 | LISTPATH="data/trainlist.p" 42 | TLISTPATH="data/testlist.p" 43 | -------------------------------------------------------------------------------- /save.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | input=jacana formatted text files 5 | output=pickled q objects 6 | """ 7 | from basicgrad import ttlist 8 | from vecfromtext import textArrays,shortGlove 9 | from bow import prepForGrad 10 | import pickle 11 | import numpy as np 12 | from const import * 13 | 14 | def trecEval(li,count=True): 15 | truth=open('truth.txt','w') 16 | res=open('res.txt','w') 17 | for i in range(0,len(li)): 18 | for j in range(0,len(li[i].y)): 19 | truth.write(' '.join(map(str,(i,0,j,int(li[i].y[j]),'\n')))) 20 | if (count): 21 | res.write(' '.join(map(str,(i,0,j,1,li[i].tcount[j],'glove','\n')))) 22 | else: 23 | res.write(' '.join(map(str,(i,0,j,1,li[i].t[j],'glove','\n')))) 24 | truth.close() 25 | res.close() 26 | print 'trec_eval created' 27 | return 28 | 29 | def saveQlist(QPATH,APATH1,APATH0,GLOVEPATH,GLOVEPATH2,PLIST,PANS1,PANS0,new_dict=False,c1=False,c0=False): 30 | """From jacana formatted documents of questions, true answers and false answers, 31 | saves a list of Qs to the PLIST path""" 32 | (q,a1,a0,ans1,ans0)=textArrays(QPATH,APATH1,APATH0) 33 | if new_dict==True: 34 | shortGlove(q,a1,a0,GLOVEPATH,GLOVEPATH2) 35 | (qa,a1a,a0a)=prepForGrad(q,a1,a0,ans1,ans0,GLOVEPATH2) 36 | sentences=(q,a1,a0) 37 | li=ttlist(qa,a1a,a0a,ans1,ans0,sentences,c1,c0) 38 | pickle.dump( li, open( PLIST , "wb" ) ) 39 | np.savetxt(PANS1,ans1) 40 | np.savetxt(PANS0,ans0) 41 | return 42 | 43 | saveQlist(QPATH,APATH1,APATH0,GLOVEPATH,GLOVEPATH2,LISTPATH,PANS1,PANS0,new_dict=True,c1=CPATH1,c0=CPATH0) 44 | print 'training data saved' 45 | #saveQlist(TQPATH,TAPATH1,TAPATH0,GLOVEPATH,TGLOVEPATH2,TLISTPATH,PTANS1,PTANS0,new_dict=True,c1=TCPATH1,c0=TCPATH0) 46 | #print 'testing data saved' 47 | -------------------------------------------------------------------------------- /reparse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Usage: reparse.py DATAPATH 4 | 5 | input=yodaqa csv outputs (sentences) 6 | output=jacana formatted files for use in save.py 7 | """ 8 | 9 | import os 10 | import sys 11 | import glob 12 | 13 | QPATH="data/Qtrain.txt" 14 | PPATH="data/Ptrain.txt" 15 | NPATH="data/Ntrain.txt" 16 | CPATH1="data/Clues1train.txt" 17 | CPATH0="data/Clues0train.txt" 18 | #TPATH="data/curated-test" 19 | #TQPATH="data/Qtest.txt" 20 | #TPPATH="data/Ptest.txt" 21 | #TNPATH="data/Ntest.txt" 22 | #TCPATH1="data/Clues1test.txt" 23 | #TCPATH0="data/Clues0test.txt" 24 | 25 | 26 | def reparse(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0): 27 | q=open(QPATH,'w') 28 | p=open(PPATH,'w') 29 | n=open(NPATH,'w') 30 | cp=open(CPATH1,'w') 31 | cn=open(CPATH0,'w') 32 | 33 | qnum=0 34 | for path in glob.glob(PATH + '/*'):  # iterate over every dumped file in DATAPATH, as reparseprops.py does 35 | i=0 36 | p.write("\n") 37 | n.write("\n") 38 | with open(path,'r') as f: 39 | for line in f: 40 | s=line.split(" ") 41 | if(s[0]=="" and i==0): 42 | q.write("\n") 43 | q.write(" ".join(s[1:])) 44 | q.write("\n") 45 | i+=1 46 | qnum+=1 47 | continue 48 | elif(s[0]=="" and i!=0): 49 | continue 50 | if(s[0]=='1'): 51 | p.write(" ".join(s[3:])) 52 |
cp.write(" ".join(s[1:3])+"\n") 53 | else: 54 | n.write(" ".join(s[3:])) 55 | cn.write(" ".join(s[1:3])+"\n") 56 | p.write("\n") 57 | n.write("\n") 58 | print ".", 59 | q.close() 60 | p.close() 61 | n.close() 62 | cp.close() 63 | cn.close() 64 | 65 | 66 | PATH = sys.argv[1] 67 | reparse(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0) 68 | #reparse(TPATH,TQPATH,TPPATH,TNPATH,TCPATH1,TCPATH0) 69 | -------------------------------------------------------------------------------- /reparseprops.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Usage: reparseprops.py DATAPATH 4 | 5 | input=yodaqa csv outputs (properties) 6 | output=jacana formated files for use in save.py 7 | """ 8 | 9 | import os 10 | import sys 11 | import glob 12 | 13 | QPATH="data/Qtrain.txt" 14 | PPATH="data/Ptrain.txt" 15 | NPATH="data/Ntrain.txt" 16 | CPATH1="data/Clues1train.txt" 17 | CPATH0="data/Clues0train.txt" 18 | #TPATH="data/curated-test" 19 | #TQPATH="data/Qtest.txt" 20 | #TPPATH="data/Ptest.txt" 21 | #TNPATH="data/Ntest.txt" 22 | #TCPATH1="data/Clues1test.txt" 23 | #TCPATH0="data/Clues0test.txt" 24 | 25 | def notNumber(s): 26 | try: 27 | float(s) 28 | return False 29 | except ValueError: 30 | return True 31 | 32 | 33 | def reparseProps(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0): 34 | q=open(QPATH,'w') 35 | p=open(PPATH,'w') 36 | n=open(NPATH,'w') 37 | cp=open(CPATH1,'w') 38 | cn=open(CPATH0,'w') 39 | 40 | qnum=0 41 | for path in glob.glob(PATH + '/*'): 42 | i=0 43 | p.write("\n") 44 | n.write("\n") 45 | propdict=dict() 46 | propset=set() 47 | with open(path,'r') as f: 48 | for line in f: 49 | s=line.split(" ") 50 | if(s[0]!=""): 51 | s=line.split(" ") 52 | text=" ".join(s[2:]).lower() 53 | if text in propdict: 54 | if(s[0]=='1'): 55 | propdict[text]='1' 56 | continue 57 | propdict[text]=s[0] 58 | with open(path,'r') as f: 59 | for line in f: 60 | s=line.split(" ") 61 | if(s[0]=="" and i==0): 62 | q.write("\n") 63 | q.write(" ".join(s[1:])) 64 | q.write("\n") 65 | i+=1 66 | qnum+=1 67 | continue 68 | elif(s[0]=="" and i!=0): 69 | continue 70 | if notNumber(s[0]) or notNumber(s[1]): 71 | continue 72 | # print s 73 | text=" ".join(s[2:]).lower() 74 | if text not in propset: 75 | # print text 76 | if(propdict[text]=='1'): 77 | p.write(text) 78 | cp.write(" ".join(s[1:2])+"\n") 79 | else: 80 | n.write(text) 81 | cn.write(" ".join(s[1:2])+"\n") 82 | propset.add(text) 83 | p.write("\n") 84 | n.write("\n") 85 | print ".", 86 | q.close() 87 | p.close() 88 | n.close() 89 | cp.close() 90 | cn.close() 91 | 92 | 93 | PATH = sys.argv[1] 94 | reparseProps(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0) 95 | #reparseProps(TPATH,TQPATH,TPPATH,TNPATH,TCPATH1,TCPATH0) 96 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | input=q objects 5 | output=trained weights 6 | """ 7 | 8 | import time 9 | import random 10 | from basicgrad import mrrcount,mrr,setRes,getInputsClues,testGrad,trainConsts 11 | from const import * 12 | import numpy as np 13 | from sklearn import linear_model 14 | from vecfromtext import loadList,saveMb 15 | from multiprocessing import Pool 16 | 17 | def cross_validate_one(idx): 18 | global gdata 19 | (M,b,trainlist,threads)=gdata 20 | if idx==0: 21 | (M2,b2)=testGrad(M,b,trainlist,idx) 22 | res=0 23 | else: 24 | random.shuffle(trainlist) 25 | 
trainvalborder=len(trainlist)*(threads-2)/(threads-1) 26 | (M2,b2)=testGrad(M,b,trainlist[:trainvalborder],idx) 27 | print 'MMR after unigram learning train(idx=',idx,'):',mrr(M2,b2,trainlist) 28 | res=mrr(M2,b2,trainlist[trainvalborder:]) 29 | print 'MMR after unigram learning val(idx=',idx,'):',res 30 | return (res,M2,b2) 31 | 32 | def cross_validate_all(M,b,trainlist): 33 | global gdata 34 | threads=5 35 | gdata=(M,b,trainlist,threads+1) 36 | i=0 37 | pool = Pool() 38 | mrrs=[] 39 | for res in pool.imap(cross_validate_one,range(threads+1)): 40 | mrr,M,b=res 41 | if i==0: 42 | retM=M 43 | retb=b 44 | i+=1 45 | else: 46 | mrrs.append(mrr) 47 | pool.close() 48 | return (mrrs,sum(mrrs)/threads,retM,retb) 49 | 50 | def trainMb(trainlist,ans1,ans0): 51 | """Unigram training from saved Qlist files, returns Mb weights. 52 | You can play with the learning constants in trainConsts() of basicgrad.py""" 53 | t0=time.time() 54 | M=np.random.normal(0,0.01,(GLOVELEN,GLOVELEN)) 55 | b=-0.0001 56 | # M=np.loadtxt('data/M58prop') 57 | # b=np.loadtxt('data/b58prop') 58 | mrrs,crossmrr,M,b=cross_validate_all(M,b,trainlist) 59 | t1=time.time() 60 | print "time spent training =",t1-t0 61 | print "MRR after crossvalidation=",crossmrr 62 | 63 | # XXX: This has a sideeffect, setting resolutions in trainlist 64 | trainmrr=mrr(M,b,trainlist) 65 | print 'Mb MRR on train:', trainmrr 66 | l,alpha=trainConsts() 67 | results=[crossmrr,mrrs,l,alpha,trainmrr] 68 | return (M,b,results) 69 | 70 | 71 | def trainClues(trainlist,ans1,ans0): 72 | """Logistic regression using Mb probability and clues as input. 73 | requires mrr(M,b,trainlist) called beforehand to work properly""" 74 | (x,y)=getInputsClues(trainlist,ans1,ans0) 75 | clf = linear_model.LogisticRegression(C=1, penalty='l2', tol=1e-5) 76 | clf.fit(x, y) 77 | counttest=clf.predict_proba(x) 78 | setRes(trainlist,ans1,ans0,counttest[:,1]) 79 | mrrt=mrrcount(trainlist,ans1,ans0) 80 | print 'MRR unigram+clues train',mrrt 81 | w=clf.coef_ 82 | w=np.append(w,clf.intercept_); 83 | return w 84 | 85 | def train(LISTPATH,PANS1,PANS0): 86 | (trainlist,ans1,ans0)=loadList(LISTPATH,PANS1,PANS0) 87 | print 'data loaded' 88 | (M,b,results)=trainMb(trainlist,ans1,ans0) 89 | w=trainClues(trainlist,ans1,ans0) 90 | 91 | prop_num=0 92 | for q in trainlist: 93 | prop_num+=len(q.y) 94 | q_num=len(trainlist) 95 | print "trained on",q_num,"questions" 96 | print "trained on",prop_num,"properties" 97 | crossmrr,mrrs,l,alpha,trainmrr=results 98 | results=(q_num,prop_num,crossmrr,mrrs,l,alpha,trainmrr) 99 | return (M,b,w,results) 100 | 101 | 102 | if __name__ == "__main__": 103 | # Seed always to the same number to get reproducible models 104 | np.random.seed(17151713) 105 | 106 | (M, b, w, results) = train(LISTPATH, PANS1, PANS0) 107 | 108 | saveMb(M,b,"data/Mbtemp.txt",results) 109 | np.savetxt('data/weights.txt',w) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Deep Learning for Answer Sentence Selection Reconstruction 2 | ========================================================== 3 | 4 | This work started as an attempt to reproduce Yu et al.'s http://arxiv.org/abs/1412.1632 5 | 6 | Used word embeddings: pre-trained GloVe vectors from http://nlp.stanford.edu/projects/glove/ 7 | 8 | So far implemented: 9 | * Bag of words + basic gradient descent learning classification 10 | * Bag of words + basic gradient descent learning classification + word counts logistic 
regression 11 | 12 | Development Instructions 13 | ------------------------ 14 | 15 | For sentence selection development, the dataset 16 | used is the TREC-based one originally from Wang et al., 2007, in the form 17 | released by Yao et al., 2013, as downloaded from https://code.google.com/p/jacana/ 18 | 19 | Preprocessing (not required): 20 | * Run save.py first with updated filepath constants (const.py) if you have a different dataset (requires jacana formatting) 21 | 22 | Train and test: 23 | * Run train.py to train on the TREC TRAIN dataset and test on the TREC TEST dataset 24 | * train.py generates truth.txt and res.txt; to evaluate them with the official trec_eval tool, run 25 | 26 | trec_eval -a truth.txt res.txt 27 | 28 | TODO: 29 | * CNN instead of bag-of-words unigram averaging for aggregate embeddings. 30 | 31 | Results (evaluated using stock TREC scripts): 32 | 33 | | | MRR | MAP | 34 | |-----------------|--------|--------| 35 | | TRAIN | 0.7312 | 0.6551 | 36 | | TRAIN-ALL | 0.7308 | 0.6566 | 37 | | TRAIN+count | 0.7763 | 0.7165 | 38 | | TRAIN-ALL+count | 0.8128 | 0.7258 | 39 | 40 | 41 | Property selection in yodaqa/moviesC: 42 | ------------------------------------- 43 | 44 | Follow these steps if you want to retrain the currently used weights: 45 | 46 | * Gather input data (labelled tuples) according to the instructions 47 | in YodaQA data/ml/embsel/README.md. 48 | 49 | * Run './std_run.sh -p PATH' (PATH is the directory of dumped yodaqa files). 50 | You can alter the training constants in basicgrad.py and train.py. 51 | 52 | * If you are happy with the results, copy the generated file data/Mbtemp.txt 53 | to yodaqa src/main/resources/cz/brmlab/yodaqa/analysis/rdf/Mbprop.txt 54 | 55 | In summary, use this: 56 | 57 | ./std_run.sh -p ../yodaqa/data/ml/embsel/propdata 58 | cp data/Mbtemp.txt ../yodaqa/src/main/resources/cz/brmlab/yodaqa/analysis/rdf/Mbprop.txt 59 | 60 | ### Snapshot of results based on curated: 61 | 62 | (With a random 1:1 train:test split of the original curated-train.) 63 | 64 | **Used dataset:** 65 | 66 | train questions: 270 train sentences: 19624 (generated with curated-measure.sh train) 67 | test questions: 222 test sentences: 17561 (generated with curated-measure.sh train) 68 | 2.7902739024% of the properties contain correct answers 69 | random test mrr = 0.0475542678953 70 | 71 | **Current results:** 72 | 73 | MMR after unigram learning train: 0.600856454434 74 | MMR after unigram learning test: 0.582881935037 75 | 76 | 77 | Sentence selection on yodaqa/curated: 78 | ------------------------------------- 79 | 80 | Follow these steps if you want to retrain the currently used weights: 81 | 82 | * Gather input data (labelled tuples) according to the instructions 83 | in YodaQA data/ml/embsel/README.md. 84 | 85 | * Run './std_run.sh PATH' (PATH is the directory of dumped yodaqa files). 86 | You can alter the training constants in basicgrad.py and train.py. 87 | 88 | * If you are happy with the results, copy the generated file data/Mbtemp.txt 89 | to yodaqa src/main/resources/cz/brmlab/yodaqa/analysis/passextract/Mb.txt 90 | 91 | In summary, use this (with YodaQA's f/sentence-selection branch): 92 | 93 | ./std_run.sh ../yodaqa/data/ml/embsel/sentdata 94 | cp data/Mbtemp.txt ../yodaqa/src/main/resources/cz/brmlab/yodaqa/analysis/passextract/Mb.txt 95 | 96 | ### Snapshot of results based on curated: 97 | 98 | (With a random 1:1 train:test split of the original curated-train.)
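The MRR figures reported in these snapshots (and printed by train.py) are mean reciprocal rank: for every question, all of its candidate sentences or properties are ranked by the classifier score, and the reciprocal rank of the best-ranked correct candidate is averaged over the questions. The following is only an illustrative sketch of the metric, not the repository's own mrr() from basicgrad.py; the function name and data layout here are hypothetical:

    import numpy as np

    def mean_reciprocal_rank(per_question):
        """per_question: one (scores, labels) pair per question; scores and
        labels are equally long sequences, with labels being 0/1."""
        rr = []
        for scores, labels in per_question:
            order = np.argsort(-np.asarray(scores, dtype=float))  # best score first
            ranked = np.asarray(labels)[order]
            hits = np.nonzero(ranked == 1)[0]
            # reciprocal rank of the first correct candidate, 0 if there is none
            rr.append(1.0 / (hits[0] + 1) if len(hits) else 0.0)
        return float(np.mean(rr))

The "random test mrr" baselines quoted in the dataset summaries are what this metric yields when candidates are ranked randomly.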
99 | 100 | **Used dataset:** 101 | 102 | train questions: 186 train sentences: 43843 (generated with curated-measure.sh train) 103 | test questions: 429 test sentences: 88779 (generated with curated-measure.sh test) 104 | 5.21294450264% of the properties contains correct answers 105 | random test mrr = 0.0760195275186 106 | 107 | **Current results:** 108 | 109 | baseline (clue1+0.25*clue2): 110 | 111 | MRR unigram+clues train 0.249327071552 112 | MRR unigram+clues test 0.29659580682 113 | 114 | glove only: 115 | 116 | MMR after unigram learning train: 0.224787152966 117 | MMR after unigram learning test: 0.222749753007 118 | 119 | glove+clue1: 120 | 121 | MRR unigram+clues train 0.358206351223 122 | MRR unigram+clues test 0.388948882077 123 | 124 | -------------------------------------------------------------------------------- /vecfromtext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import pickle 5 | import re 6 | 7 | def getGloveDict(glovepath2): 8 | """Returns discionary of used words""" 9 | gloveDict = dict() 10 | with open(glovepath2,'r') as f: 11 | for line in f: 12 | word=line.split(' ',1)[0] 13 | gloveDict[word] = np.array(line.split(' ')[1:]).astype(float) 14 | return gloveDict 15 | 16 | 17 | def textArrays(qpath,apath1,apath0): 18 | """ Returns qa text vectors from files with jacana formating. 19 | Text == array of tokens. 20 | It is a tuple of: 21 | * a list of question texts 22 | * a list of texts of all correct answers (across all questions) 23 | * a list of texts of all incorrect answers 24 | * for each question, #of correct answers (used for computing the index in list of all correct answers) 25 | * for each question, #of incorrect answers 26 | """ 27 | questions=[] 28 | with open(qpath,'r') as f: 29 | for line in f: 30 | line=line.lower() 31 | if line[0]!='<': 32 | line=re.sub('[^0-9a-zA-Z]+',' ', line) 33 | x=np.array(line.split(' ')[:-1]) 34 | questions.append(x) 35 | 36 | answers1=[] 37 | i=0 38 | ans1=[] 39 | with open(apath1,'r') as f: 40 | for line in f: 41 | line=line.lower() 42 | if line[0]!='<': 43 | i+=1 44 | line=re.sub('[^0-9a-zA-Z]+',' ', line) 45 | x=np.array(line.split(' ')[:-1]) 46 | answers1.append(x) 47 | elif line[0:2]=='0: 57 | d=0 58 | for sentence in self.atext: 59 | if word in sentence: 60 | d+=1 61 | continue 62 | self.idf[i]+=wc*np.log(N/d) 63 | 64 | 65 | def ttlist(qa,a1a,a0a,ans1,ans0,sentences,c1=False,c0=False): 66 | """Returns list of qs""" 67 | clues1=np.zeros((2,sum(ans1))) 68 | clues0=np.zeros((2,sum(ans0))) 69 | if(c1): 70 | i=0 71 | with open(c1,'r') as f: 72 | for line in f: 73 | s=line.split(" ") 74 | clues1[0,i]=float(s[0]) 75 | # clues1[1,i]=float(s[1]) 76 | i+=1 77 | i=0 78 | with open(c0,'r') as f: 79 | for line in f: 80 | s=line.split(" ") 81 | clues0[0,i]=float(s[0]) 82 | # clues0[1,i]=float(s[1]) 83 | i+=1 84 | 85 | (questions,answers1,answers0)=sentences 86 | li=[] 87 | ones=0 88 | zeros=0 89 | for i in range(0,len(ans1)): 90 | li.append(q(qa[i],a1a[ones:ones+ans1[i]],a0a[zeros:zeros+ans0[i]],questions[i], 91 | answers1[ones:ones+ans1[i]],answers0[zeros:zeros+ans0[i]],clues1[:,ones:ones+ans1[i]],clues0[:,zeros:zeros+ans0[i]])) 92 | ones+=ans1[i] 93 | zeros+=ans0[i] 94 | return li 95 | 96 | def testGrad(M,b,li,idx): 97 | """Updates weights using basic gradient descent""" 98 | l,alpha=trainConsts() 99 | bestmrr=0.0 100 | n_iter = 200 101 | plot = np.zeros(int(n_iter / 5)) 102 | for i in range(0, n_iter): 103 | ggM=0.0 104 | ggb=0.0 
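# Each of the n_iter iterations makes one full pass over the training questions in li:
# ggM and ggb accumulate the gradient of the loss over every question, and a gradient
# step M -= alpha*ggM, b -= alpha*ggb is then applied (alpha comes from trainConsts());
# every 5th iteration the summed loss is stored in plot and printed together with the
# best MRR reached so far (bestmrr).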
105 | if i%5==0: 106 | plot[int(i/5)]=lossAll(li,M,b) 107 | print '[%d/%d] loss function: %.1f (bestMRR %.3f) Thread number %d' % (i, n_iter, plot[int(i/5)], bestmrr, idx) 108 | for q in li: 109 | labels=q.y 110 | # np.transpose(np.array(q.a[:,j],ndmin=2)) 111 | (gM,gb)=grad(labels,q.q,M,q.a,b) 112 | ggM+=gM 113 | ggb+=gb 114 | M=M-alpha*ggM 115 | b=b-alpha*ggb 116 | curmrr=mrr(M,b,li) 117 | if bestmrr