├── data
│   └── .keep
├── .spyderproject
├── .gitattributes
├── .gitignore
├── bow.py
├── std_run.sh
├── const.py
├── save.py
├── reparse.py
├── reparseprops.py
├── train.py
├── README.md
├── vecfromtext.py
└── basicgrad.py
/data/.keep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.spyderproject:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brmson/Sentence-selection/HEAD/.spyderproject
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 |
45 |
46 | ############## our things:
47 |
48 | data/jacana/pokus.txt
49 | data/glovewiki.txt
50 | *.pyc
51 | untitled0.py
52 | /.project
53 | /io.py
54 | *.p
55 | *.POSInput
56 | tests.py
57 | data/w2v.txt
58 | /truth.txt
59 | /res.txt
60 | /unigram-Mb.pickle
61 |
62 | # generated intermediate data
63 | /data
64 |
--------------------------------------------------------------------------------
/bow.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | # -*- coding: utf-8 -*-
3 |
4 | from vecfromtext import getGloveDict
5 | import numpy as np
6 | from const import *
7 |
8 |
9 | def prepForGrad(q,a1,a0,ans1,ans0,glovepath2):
10 | """ Bag-of-words based embedding vectors for each sentence.
11 | Returns a matrix with one sentence per row. """
12 | gloveDict=getGloveDict(glovepath2)
13 | qa=np.zeros((len(q),GLOVELEN))
14 | a1a=np.zeros((len(a1),GLOVELEN))
15 | a0a=np.zeros((len(a0),GLOVELEN))
16 | for i in range(0,len(q)):
17 | qa[i][:]=boxSentence(q[i],gloveDict)
18 | print 'questions embedded'
19 | for i in range(0,len(a1)):
20 | a1a[i][:]=boxSentence(a1[i],gloveDict)
21 | print 'true answers embedded'
22 | for i in range(0,len(a0)):
23 | a0a[i][:]=boxSentence(a0[i],gloveDict)
24 | print 'false answers embedded'
25 | return (qa,a1a,a0a)
26 |
27 | # Bag-of-words vector for a sentence: average of the GloVe vectors of its in-dictionary words
28 | def boxSentence(sentence,gloveDict):
29 | i=0
30 | v=np.zeros(GLOVELEN)
31 | for word in sentence:
32 | x=gloveDict.get(word)
33 | if x is not None:
34 | v+=x
35 | i+=1
36 | if i!=0:
37 | v=v/i
38 | return v
39 |
--------------------------------------------------------------------------------
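boxSentence() above simply averages the GloVe vectors of whatever tokens it finds in the dictionary. A toy check of that behaviour (the two-entry dictionary here is made up; vectors must be GLOVELEN=50-dimensional to match const.py):

    import numpy as np
    from bow import boxSentence

    toy = {'who': np.zeros(50), 'wrote': np.ones(50)}
    v = boxSentence(['who', 'wrote', 'hamlet'], toy)   # 'hamlet' is out-of-dictionary and ignored
    print v[:3]                                        # mean of the two known vectors -> [ 0.5  0.5  0.5]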
/std_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # A script to convert a set of (question, sentence, binLabel) tuples
4 | # (where sentence may be a passage or, with -p, property label)
5 | # to a classifier which attempts to predict binLabel from unseen
6 | # (question, sentences) pairs.
7 | #
8 | # Usage: std_run.sh [-p] TRAINDATAPATH
9 | #
10 | # Example: ./std_run.sh -p ../yodaqa/data/ml/embsel/propdata/
11 |
12 | if [[ -f resources/glove.6B.50d.txt ]]
13 | then
14 | echo "Dictionary allready downloaded"
15 | else
16 | echo "Downloading dictionary"
17 | wget http://pasky.or.cz/dev/brmson/glove.6B.50d.txt.gz
18 | gunzip glove.6B.50d.txt.gz
19 | mkdir -p resources
20 | mv glove.6B.50d.txt resources
21 | fi
22 |
23 |
24 |
25 | props=false
26 | if [ "$1" = "-p" ]; then
27 | props=true
28 | shift
29 | fi
30 | path=$1
31 |
32 | # Convert YodaQA-generated data to Jacana-style data
33 | if [ "$props" = true ]   # the original "[[ props ]]" is always true; test the flag variable instead
34 | then
35 | echo 'Running property-reparse'
36 | python reparseprops.py "$path"
37 | else
38 | echo 'Running sentence-reparse'
39 | python reparse.py "$path"
40 | fi
41 |
42 | # Convert Jacana-style data to pickled Python data structures
43 | echo 'Running save.py'
44 | python save.py
45 |
46 | # Train and save a classifier on top of the pickled data
47 | echo 'Running train.py'
48 | python train.py
49 |
--------------------------------------------------------------------------------
/const.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | GLOVELEN=50
4 | GLOVEPATH='resources/glove.6B.50d.txt'
5 | #GLOVEPATH='data/glove.6B.100d.txt'
6 | ##
7 | #QPATH='data/jacana/Train1-100.Question.POSInput'
8 | #APATH1='data/jacana/Train1-100.Positive-J.POSInput'
9 | #APATH0='data/jacana/Train1-100.Negative-T.POSInput'
10 | QPATH='data/Qtrain.txt'
11 | APATH1='data/Ptrain.txt'
12 | APATH0='data/Ntrain.txt'
13 | CPATH1="data/Clues1train.txt"
14 | CPATH0="data/Clues0train.txt"
15 | #QPATH='data/jacana/Train1-2393.Question.POSInput'
16 | #APATH1='data/jacana/Train1-2393.Positive-M.POSInput'
17 | #APATH0='data/jacana/Train1-2393.Negative-M.POSInput'
18 | #TQPATH='data/jacana/Test.Question.POSInput'
19 | #TAPATH1='data/jacana/Test.Positive-J.POSInput'
20 | #TAPATH0='data/jacana/Test.Negative-T.POSInput'
21 | TQPATH='data/Qtest.txt'
22 | TAPATH1='data/Ptest.txt'
23 | TAPATH0='data/Ntest.txt'
24 | #APATH0='data/Ntrain.txt'   # duplicate of the APATH0 assignment on line 12 above
25 | TCPATH1="data/Clues1test.txt"
26 | TCPATH0="data/Clues0test.txt"
27 |
28 | #GLOVEPATH='data/w2v.txt'
29 | GLOVEPATH2='data/usedembed.txt'
30 | TGLOVEPATH2='data/tusedembed.txt'
31 | PTQA='data/tqarray.txt'
32 | PTA1A='data/ta1rray.txt'
33 | PTA0A='data/ta0rray.txt'
34 | PTANS1='data/tans1.txt'
35 | PTANS0='data/tans0.txt'
36 | PQA='data/qarray.txt'
37 | PA1A='data/a1rray.txt'
38 | PA0A='data/a0rray.txt'
39 | PANS1='data/ans1.txt'
40 | PANS0='data/ans0.txt'
41 | LISTPATH="data/trainlist.p"
42 | TLISTPATH="data/testlist.p"
43 |
--------------------------------------------------------------------------------
/save.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | """
4 | input=jacana formated text files
5 | output=pickled q objects
6 | """
7 | from basicgrad import ttlist
8 | from vecfromtext import textArrays,shortGlove
9 | from bow import prepForGrad
10 | import pickle
11 | import numpy as np
12 | from const import *
13 |
14 | def trecEval(li,count=True):
15 | truth=open('truth.txt','w')
16 | res=open('res.txt','w')
17 | for i in range(0,len(li)):
18 | for j in range(0,len(li[i].y)):
19 | truth.write(' '.join(map(str,(i,0,j,int(li[i].y[j]),'\n'))))
20 | if (count):
21 | res.write(' '.join(map(str,(i,0,j,1,li[i].tcount[j],'glove','\n'))))
22 | else:
23 | res.write(' '.join(map(str,(i,0,j,1,li[i].t[j],'glove','\n'))))
24 | truth.close()
25 | res.close()
26 | print 'trec_eval created'
27 | return
28 |
29 | def saveQlist(QPATH,APATH1,APATH0,GLOVEPATH,GLOVEPATH2,PLIST,PANS1,PANS0,new_dict=False,c1=False,c0=False):
30 | """From jacana formated documents of questions, true answers, false answers
31 | saves list of Qs to PLIST path"""
32 | (q,a1,a0,ans1,ans0)=textArrays(QPATH,APATH1,APATH0)
33 | if new_dict==True:
34 | shortGlove(q,a1,a0,GLOVEPATH,GLOVEPATH2)
35 | (qa,a1a,a0a)=prepForGrad(q,a1,a0,ans1,ans0,GLOVEPATH2)
36 | sentences=(q,a1,a0)
37 | li=ttlist(qa,a1a,a0a,ans1,ans0,sentences,c1,c0)
38 | pickle.dump( li, open( PLIST , "wb" ) )
39 | np.savetxt(PANS1,ans1)
40 | np.savetxt(PANS0,ans0)
41 | return
42 |
43 | saveQlist(QPATH,APATH1,APATH0,GLOVEPATH,GLOVEPATH2,LISTPATH,PANS1,PANS0,new_dict=True,c1=CPATH1,c0=CPATH0)
44 | print 'training data saved'
45 | #saveQlist(TQPATH,TAPATH1,TAPATH0,GLOVEPATH,TGLOVEPATH2,TLISTPATH,PTANS1,PTANS0,new_dict=True,c1=TCPATH1,c0=TCPATH0)
46 | #print 'testing data saved'
47 |
--------------------------------------------------------------------------------
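Once save.py above has produced data/trainlist.p and the ans*.txt files, the pickled list can be read back exactly the way train.py does it; a minimal sketch (attribute names follow the q class in basicgrad.py):

    from vecfromtext import loadList
    from const import LISTPATH, PANS1, PANS0

    trainlist, ans1, ans0 = loadList(LISTPATH, PANS1, PANS0)
    print len(trainlist), 'questions loaded'   # one basicgrad.q object per question
    print trainlist[0].y                       # gold labels: correct answers first, then incorrect
    print trainlist[0].counts                  # question/answer word-overlap counts (see setCounts())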
/reparse.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Usage: reparse.py DATAPATH
4 |
5 | input  = yodaqa CSV outputs (sentences)
6 | output = jacana-formatted files for use in save.py
7 | """
8 |
9 | import os
10 | import sys
11 | import glob
12 |
13 | QPATH="data/Qtrain.txt"
14 | PPATH="data/Ptrain.txt"
15 | NPATH="data/Ntrain.txt"
16 | CPATH1="data/Clues1train.txt"
17 | CPATH0="data/Clues0train.txt"
18 | #TPATH="data/curated-test"
19 | #TQPATH="data/Qtest.txt"
20 | #TPPATH="data/Ptest.txt"
21 | #TNPATH="data/Ntest.txt"
22 | #TCPATH1="data/Clues1test.txt"
23 | #TCPATH0="data/Clues0test.txt"
24 |
25 |
26 | def reparse(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0):
27 | q=open(QPATH,'w')
28 | p=open(PPATH,'w')
29 | n=open(NPATH,'w')
30 | cp=open(CPATH1,'w')
31 | cn=open(CPATH0,'w')
32 |
33 | qnum=0
34 | path=PATH+"/"+file
35 | i=0
36 | p.write("\n")
37 | n.write("\n")
38 | with open(path,'r') as f:
39 | for line in f:
40 | s=line.split(" ")
41 | if(s[0]=="" and i==0):
42 | q.write("\n")
43 | q.write(" ".join(s[1:]))
44 | q.write("
\n")
45 | i+=1
46 | qnum+=1
47 | continue
48 | elif(s[0]=="" and i!=0):
49 | continue
50 | if(s[0]=='1'):
51 | p.write(" ".join(s[3:]))
52 | cp.write(" ".join(s[1:3])+"\n")
53 | else:
54 | n.write(" ".join(s[3:]))
55 | cn.write(" ".join(s[1:3])+"\n")
56 | p.write("
\n")
57 | n.write("\n")
58 | print ".",
59 | q.close()
60 | p.close()
61 | n.close()
62 | cp.close()
63 | cn.close()
64 |
65 |
66 | PATH = sys.argv[1]
67 | reparse(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0)
68 | #reparse(TPATH,TQPATH,TPPATH,TNPATH,TCPATH1,TCPATH0)
69 |
--------------------------------------------------------------------------------
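In reparse() above, `file` is referenced without ever being defined, so the per-file loop header appears to have been lost from this copy. A minimal reconstruction of the intended iteration, assuming the directory walk implied by `path=PATH+"/"+file` (and analogous to the glob loop in reparseprops.py below); `iter_dump_files` is a hypothetical helper, not a function in this repo:

    import os

    def iter_dump_files(PATH):
        """Yield the path of every per-question dump file under PATH (hypothetical helper)."""
        for file in os.listdir(PATH):
            yield PATH + "/" + file

    # reparse() would then wrap its per-file body (from `i=0` down to `print ".",`) in:
    #     for path in iter_dump_files(PATH):
    #         ...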
/reparseprops.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Usage: reparseprops.py DATAPATH
4 |
5 | input  = yodaqa CSV outputs (properties)
6 | output = jacana-formatted files for use in save.py
7 | """
8 |
9 | import os
10 | import sys
11 | import glob
12 |
13 | QPATH="data/Qtrain.txt"
14 | PPATH="data/Ptrain.txt"
15 | NPATH="data/Ntrain.txt"
16 | CPATH1="data/Clues1train.txt"
17 | CPATH0="data/Clues0train.txt"
18 | #TPATH="data/curated-test"
19 | #TQPATH="data/Qtest.txt"
20 | #TPPATH="data/Ptest.txt"
21 | #TNPATH="data/Ntest.txt"
22 | #TCPATH1="data/Clues1test.txt"
23 | #TCPATH0="data/Clues0test.txt"
24 |
25 | def notNumber(s):
26 | try:
27 | float(s)
28 | return False
29 | except ValueError:
30 | return True
31 |
32 |
33 | def reparseProps(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0):
34 | q=open(QPATH,'w')
35 | p=open(PPATH,'w')
36 | n=open(NPATH,'w')
37 | cp=open(CPATH1,'w')
38 | cn=open(CPATH0,'w')
39 |
40 | qnum=0
41 | for path in glob.glob(PATH + '/*'):
42 | i=0
43 | p.write("\n")
44 | n.write("\n")
45 | propdict=dict()
46 | propset=set()
47 | with open(path,'r') as f:
48 | for line in f:
49 | s=line.split(" ")
50 | if(s[0]!=""):
51 | s=line.split(" ")
52 | text=" ".join(s[2:]).lower()
53 | if text in propdict:
54 | if(s[0]=='1'):
55 | propdict[text]='1'
56 | continue
57 | propdict[text]=s[0]
58 | with open(path,'r') as f:
59 | for line in f:
60 | s=line.split(" ")
61 | if(s[0]=="" and i==0):
62 | q.write("\n")
63 | q.write(" ".join(s[1:]))
64 | q.write("
\n")
65 | i+=1
66 | qnum+=1
67 | continue
68 | elif(s[0]=="" and i!=0):
69 | continue
70 | if notNumber(s[0]) or notNumber(s[1]):
71 | continue
72 | # print s
73 | text=" ".join(s[2:]).lower()
74 | if text not in propset:
75 | # print text
76 | if(propdict[text]=='1'):
77 | p.write(text)
78 | cp.write(" ".join(s[1:2])+"\n")
79 | else:
80 | n.write(text)
81 | cn.write(" ".join(s[1:2])+"\n")
82 | propset.add(text)
83 | p.write("
\n")
84 | n.write("\n")
85 | print ".",
86 | q.close()
87 | p.close()
88 | n.close()
89 | cp.close()
90 | cn.close()
91 |
92 |
93 | PATH = sys.argv[1]
94 | reparseProps(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0)
95 | #reparseProps(TPATH,TQPATH,TPPATH,TNPATH,TCPATH1,TCPATH0)
96 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | """
4 | input=q objects
5 | output=trained weights
6 | """
7 |
8 | import time
9 | import random
10 | from basicgrad import mrrcount,mrr,setRes,getInputsClues,testGrad,trainConsts
11 | from const import *
12 | import numpy as np
13 | from sklearn import linear_model
14 | from vecfromtext import loadList,saveMb
15 | from multiprocessing import Pool
16 |
17 | def cross_validate_one(idx):
18 | global gdata
19 | (M,b,trainlist,threads)=gdata
20 | if idx==0:
21 | (M2,b2)=testGrad(M,b,trainlist,idx)
22 | res=0
23 | else:
24 | random.shuffle(trainlist)
25 | trainvalborder=len(trainlist)*(threads-2)/(threads-1)
26 | (M2,b2)=testGrad(M,b,trainlist[:trainvalborder],idx)
27 | print 'MRR after unigram learning train(idx=',idx,'):',mrr(M2,b2,trainlist)
28 | res=mrr(M2,b2,trainlist[trainvalborder:])
29 | print 'MRR after unigram learning val(idx=',idx,'):',res
30 | return (res,M2,b2)
31 |
32 | def cross_validate_all(M,b,trainlist):
33 | global gdata
34 | threads=5
35 | gdata=(M,b,trainlist,threads+1)
36 | i=0
37 | pool = Pool()
38 | mrrs=[]
39 | for res in pool.imap(cross_validate_one,range(threads+1)):
40 | mrr_i,M,b=res  # renamed from 'mrr' to avoid shadowing the imported mrr() function
41 | if i==0:
42 | retM=M
43 | retb=b
44 | i+=1
45 | else:
46 | mrrs.append(mrr_i)
47 | pool.close()
48 | return (mrrs,sum(mrrs)/threads,retM,retb)
49 |
50 | def trainMb(trainlist,ans1,ans0):
51 | """Unigram training from saved Qlist files, returns Mb weights.
52 | You can play with the learning constants in trainConsts() of basicgrad.py"""
53 | t0=time.time()
54 | M=np.random.normal(0,0.01,(GLOVELEN,GLOVELEN))
55 | b=-0.0001
56 | # M=np.loadtxt('data/M58prop')
57 | # b=np.loadtxt('data/b58prop')
58 | mrrs,crossmrr,M,b=cross_validate_all(M,b,trainlist)
59 | t1=time.time()
60 | print "time spent training =",t1-t0
61 | print "MRR after crossvalidation=",crossmrr
62 |
63 | # XXX: This has a side effect, setting resolutions in trainlist
64 | trainmrr=mrr(M,b,trainlist)
65 | print 'Mb MRR on train:', trainmrr
66 | l,alpha=trainConsts()
67 | results=[crossmrr,mrrs,l,alpha,trainmrr]
68 | return (M,b,results)
69 |
70 |
71 | def trainClues(trainlist,ans1,ans0):
72 | """Logistic regression using Mb probability and clues as input.
73 | requires mrr(M,b,trainlist) called beforehand to work properly"""
74 | (x,y)=getInputsClues(trainlist,ans1,ans0)
75 | clf = linear_model.LogisticRegression(C=1, penalty='l2', tol=1e-5)
76 | clf.fit(x, y)
77 | counttest=clf.predict_proba(x)
78 | setRes(trainlist,ans1,ans0,counttest[:,1])
79 | mrrt=mrrcount(trainlist,ans1,ans0)
80 | print 'MRR unigram+clues train',mrrt
81 | w=clf.coef_
82 | w=np.append(w,clf.intercept_)
83 | return w
84 |
85 | def train(LISTPATH,PANS1,PANS0):
86 | (trainlist,ans1,ans0)=loadList(LISTPATH,PANS1,PANS0)
87 | print 'data loaded'
88 | (M,b,results)=trainMb(trainlist,ans1,ans0)
89 | w=trainClues(trainlist,ans1,ans0)
90 |
91 | prop_num=0
92 | for q in trainlist:
93 | prop_num+=len(q.y)
94 | q_num=len(trainlist)
95 | print "trained on",q_num,"questions"
96 | print "trained on",prop_num,"properties"
97 | crossmrr,mrrs,l,alpha,trainmrr=results
98 | results=(q_num,prop_num,crossmrr,mrrs,l,alpha,trainmrr)
99 | return (M,b,w,results)
100 |
101 |
102 | if __name__ == "__main__":
103 | # Seed always to the same number to get reproducible models
104 | np.random.seed(17151713)
105 |
106 | (M, b, w, results) = train(LISTPATH, PANS1, PANS0)
107 |
108 | saveMb(M,b,"data/Mbtemp.txt",results)
109 | np.savetxt('data/weights.txt',w)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Deep Learning for Answer Sentence Selection Reconstruction
2 | ==========================================================
3 |
4 | This work started as an attempt to reproduce Yu et al.'s *Deep Learning for Answer Sentence Selection* (http://arxiv.org/abs/1412.1632).
5 |
6 | Used word embeddings: pre-trained GloVe vectors from http://nlp.stanford.edu/projects/glove/
7 |
8 | So far implemented:
9 | * Bag-of-words sentence embeddings + a basic gradient-descent classifier
10 | * Bag-of-words sentence embeddings + a basic gradient-descent classifier + logistic regression on word-count clues (see the scoring sketch below)
11 |
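The core scorer (see sett() in basicgrad.py, which applies `scipy.special.expit` to `z(q, M, a, b)`) appears to follow Yu et al.'s bilinear bag-of-words model: the probability that an answer is correct is sigma(q^T M a + b), where q and a are the averaged GloVe vectors of the question and the answer. A minimal sketch of that scoring step; `score()` is a hypothetical helper, not a function in this repo:

    import numpy as np
    from scipy.special import expit

    def score(q_vec, a_vec, M, b):
        # sigma(q^T M a + b): probability that the answer embedding a_vec
        # answers the question embedding q_vec (Yu et al. 2014 style)
        return expit(np.dot(q_vec, M).dot(a_vec) + b)

    # toy usage with GLOVELEN=50-dimensional vectors (see const.py)
    rng = np.random.RandomState(0)
    q_vec, a_vec = rng.randn(50), rng.randn(50)
    M, b = rng.normal(0, 0.01, (50, 50)), -0.0001   # same initialisation as trainMb() in train.py
    print score(q_vec, a_vec, M, b)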
12 | Development Instructions
13 | ------------------------
14 |
15 | For sentence selection development, the dataset used is the TREC-based one
16 | originally built by Wang et al., 2007, in the form prepared by Yao et al., 2013,
17 | as downloaded from https://code.google.com/p/jacana/
18 |
19 | Preprocessing (not required):
20 | * Run save.py first with updated filepath constants (const.py) if you have a different dataset (requires jacana formatting)
21 |
22 | Train and test:
23 | * Run train.py for training on the TREC TRAIN dataset and testing on the TREC TEST dataset
24 | * train.py generates truth.txt and res.txt; to evaluate them using the official trec_eval tool (file layouts shown below), run
25 |
26 | trec_eval -a truth.txt res.txt
27 |
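For reference, trecEval() in save.py writes one line per (question, candidate sentence) pair in the standard trec_eval layouts; the scores below are made-up illustrations:

    # truth.txt (qrels):  question-id  iteration  sentence-idx  gold-label
    0 0 0 1
    0 0 1 0
    # res.txt (run file): question-id  iteration  sentence-idx  rank  model-score  run-id
    0 0 0 1 0.87 glove
    0 0 1 1 0.12 glove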
28 | TODO:
29 | * CNN instead of bag of words unigram averaging for aggregate embeddings.
30 |
31 | Results (evaluated using stock TREC scripts):
32 |
33 | | | MRR | MAP |
34 | |-----------------|--------|--------|
35 | | TRAIN | 0.7312 | 0.6551 |
36 | | TRAIN-ALL | 0.7308 | 0.6566 |
37 | | TRAIN+count | 0.7763 | 0.7165 |
38 | | TRAIN-ALL+count | 0.8128 | 0.7258 |
39 |
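Here MRR is the mean reciprocal rank of the first correct sentence, MRR = (1/|Q|) * sum_i 1/rank_i, and MAP is the mean over questions of the average precision across all correct sentences, both as reported by trec_eval.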
40 |
41 | Property selection in yodaqa/moviesC:
42 | -------------------------------------
43 |
44 | Follow these steps if you want to retrain the currently used weights:
45 |
46 | * Gather input data (labelled tuples) according to the instructions
47 | in YodaQA data/ml/embsel/README.md.
48 |
49 | * Run './std_run.sh -p PATH' (PATH is the directory of dumped yodaqa files).
50 | You can alter the training constants in basicgrad.py and train.py.
51 |
52 | * If you are happy with the results, copy the generated file data/Mbtemp.txt
53 | to YodaQA's src/main/resources/cz/brmlab/yodaqa/analysis/rdf/Mbprop.txt
54 |
55 | In summary, use this:
56 |
57 | ./std_run.sh -p ../yodaqa/data/ml/embsel/propdata
58 | cp data/Mbtemp.txt ../yodaqa/src/main/resources/cz/brmlab/yodaqa/analysis/rdf/Mbprop.txt
59 |
60 | ### Snapshot of results based on curated:
61 |
62 | (With a random 1:1 train:test split of the original curated-train.)
63 |
64 | **Used dataset:**
65 |
66 | train questions: 270 train sentences: 19624 (generated with curated-measure.sh train)
67 | test questions: 222 test sentences: 17561 (generated with curated-measure.sh test)
68 | 2.7902739024% of the properties contain correct answers
69 | random test mrr = 0.0475542678953
70 |
71 | **Current results:**
72 |
73 | MRR after unigram learning train: 0.600856454434
74 | MRR after unigram learning test: 0.582881935037
75 |
76 |
77 | Sentence selection on yodaqa/curated:
78 | -------------------------------------
79 |
80 | Follow these steps if you want to retrain the currently used weights:
81 |
82 | * Gather input data (labelled tuples) according to the instructions
83 | in YodaQA data/ml/embsel/README.md.
84 |
85 | * Run './std_run.sh PATH' (PATH is the directory of dumped yodaqa files).
86 | You can alter the training constants in basicgrad.py and train.py.
87 |
88 | * If you are happy with the results, copy the generated file data/Mbtemp.txt
89 | to YodaQA's src/main/resources/cz/brmlab/yodaqa/analysis/passextract/Mb.txt
90 |
91 | In summary, use this (with YodaQA's f/sentence-selection branch):
92 |
93 | ./std_run.sh ../yodaqa/data/ml/embsel/sentdata
94 | cp data/Mbtemp.txt ../yodaqa/src/main/resources/cz/brmlab/yodaqa/analysis/passextract/Mb.txt
95 |
96 | ### Snapshot of results based on curated:
97 |
98 | (With a random 1:1 train:test split of the original curated-train.)
99 |
100 | **Used dataset:**
101 |
102 | train questions: 186 train sentences: 43843 (generated with curated-measure.sh train)
103 | test questions: 429 test sentences: 88779 (generated with curated-measure.sh test)
104 | 5.21294450264% of the properties contain correct answers
105 | random test mrr = 0.0760195275186
106 |
107 | **Current results:**
108 |
109 | baseline (clue1+0.25*clue2):
110 |
111 | MRR unigram+clues train 0.249327071552
112 | MRR unigram+clues test 0.29659580682
113 |
114 | glove only:
115 |
116 | MRR after unigram learning train: 0.224787152966
117 | MRR after unigram learning test: 0.222749753007
118 |
119 | glove+clue1:
120 |
121 | MRR unigram+clues train 0.358206351223
122 | MRR unigram+clues test 0.388948882077
123 |
124 |
--------------------------------------------------------------------------------
/vecfromtext.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import pickle
5 | import re
6 |
7 | def getGloveDict(glovepath2):
8 | """Returns discionary of used words"""
9 | gloveDict = dict()
10 | with open(glovepath2,'r') as f:
11 | for line in f:
12 | word=line.split(' ',1)[0]
13 | gloveDict[word] = np.array(line.split(' ')[1:]).astype(float)
14 | return gloveDict
15 |
16 |
17 | def textArrays(qpath,apath1,apath0):
18 | """ Returns qa text vectors from files with jacana formating.
19 | Text == array of tokens.
20 | It is a tuple of:
21 | * a list of question texts
22 | * a list of texts of all correct answers (across all questions)
23 | * a list of texts of all incorrect answers
24 | * for each question, #of correct answers (used for computing the index in list of all correct answers)
25 | * for each question, #of incorrect answers
26 | """
27 | questions=[]
28 | with open(qpath,'r') as f:
29 | for line in f:
30 | line=line.lower()
31 | if line[0]!='<':
32 | line=re.sub('[^0-9a-zA-Z]+',' ', line)
33 | x=np.array(line.split(' ')[:-1])
34 | questions.append(x)
35 |
36 | answers1=[]
37 | i=0
38 | ans1=[]
39 | with open(apath1,'r') as f:
40 | for line in f:
41 | line=line.lower()
42 | if line[0]!='<':
43 | i+=1
44 | line=re.sub('[^0-9a-zA-Z]+',' ', line)
45 | x=np.array(line.split(' ')[:-1])
46 | answers1.append(x)
47 | elif line[0:2]=='</':  # NOTE: reconstructed; the '</' prefix appears to have been stripped from this literal in the dump
48 | ans1.append(i)
49 | i=0
50 | answers0=[]
51 | i=0
52 | ans0=[]
53 | with open(apath0,'r') as f:
54 | for line in f:
55 | line=line.lower()
56 | if line[0]!='<':
57 | i+=1
58 | line=re.sub('[^0-9a-zA-Z]+',' ', line)
59 | x=np.array(line.split(' ')[:-1])
60 | if len(x)<1:
61 | i-=1
62 | else:
63 | answers0.append(x)
64 | elif line[0:2]=='</':  # NOTE: reconstructed; the '</' prefix appears to have been stripped from this literal in the dump
65 | ans0.append(i)
66 | i=0
67 | return (questions,answers1,answers0,ans1,ans0)
68 |
69 | def shortGlove(questions,answers1,answers0,glovepath_in,glovepath_out):
70 | """ From a full Glove dictionary (glovepath2),
71 | creates smaller Glove-vector file with used words only """
72 | i=0
73 | words=set()
74 | for sentence in questions:
75 | for word in questions[i]:
76 | if word not in words:
77 | words.add(word)
78 | i+=1
79 | i=0
80 | for sentence in answers1:
81 | for word in answers1[i]:
82 | if word not in words:
83 | words.add(word)
84 | i+=1
85 | i=0
86 | for sentence in answers0:
87 | for word in answers0[i]:
88 | if word not in words:
89 | words.add(word)
90 | i+=1
91 | used=open(glovepath_out,'w')
92 | with open(glovepath_in,'r') as f:
93 | for line in f:
94 | word=line.split(' ',1)[0]
95 | if word in words:
96 | # print 'found',word
97 | used.write(line)
98 | words.remove(word)
99 | used.close()
100 | return
101 |
102 | def saveArrays(qa,a1a,a0a,ans1,ans0,pqa,pa1a,pa0a,pans1,pans0):
103 | np.savetxt(pqa,qa)
104 | np.savetxt(pa1a,a1a)
105 | np.savetxt(pa0a,a0a)
106 | np.savetxt(pans1,ans1)
107 | np.savetxt(pans0,ans0)
108 | return
109 |
110 | #results=[q_num,prop_num,crossmrr,mrrs,l,alpha,trainmrr]
111 | def saveMb(M,b,path,results):
112 | np.savetxt(path,M)
113 | m=open(path,'a')
114 | m.write("%f"%b)
115 | m.close()
116 | with open(path, "r+") as f:
117 | old = f.read()
118 | f.seek(0)
119 | f.write("\\\\Weights for property-selection feature, generated with https://github.com/brmson/Sentence-selection\n" +
120 | "\\\\Trained on "+str(results[0])+" questions, "+str(results[1])+" properties\n"+
121 | "\\\\Cross-validation MRRs: "+str(results[3])+"\n"+
122 | "\\\\Mean cross-validation MRR "+str(results[2])+"\n"+
123 | "\\\\MRR on the whole training set: "+str(results[6])+"\n"+
124 | "\\\\Learning constant alpha = "+str(results[5])+"\n"+
125 | "\\\\Regularisation constant l = "+str(results[4])+"\n"+
126 | old)
127 |
128 | def loadArrays(qa,a1a,a0a):
129 | qa=np.loadtxt(qa)
130 | a1a=np.loadtxt(a1a)
131 | a0a=np.loadtxt(a0a)
132 | return (qa,a1a,a0a)
133 |
134 | def loadList(LISTPATH,PANS1,PANS0):
135 | ans1=np.loadtxt(PANS1).astype(int)
136 | ans0=np.loadtxt(PANS0).astype(int)
137 | li = pickle.load( open( LISTPATH, "rb" ) )
138 | return (li,ans1,ans0)
--------------------------------------------------------------------------------
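The per-question counts ans1/ans0 returned by textArrays() above index into the flat answers1/answers0 lists via cumulative offsets; this is exactly what ttlist() in basicgrad.py below does with its `ones`/`zeros` counters. A toy illustration with made-up token lists:

    answers1 = [['a'], ['b'], ['c']]   # all correct answers across all questions, flattened
    ans1     = [2, 1]                  # question 0 has 2 correct answers, question 1 has 1

    ones = 0
    for i, count in enumerate(ans1):
        print i, answers1[ones:ones + count]   # 0 [['a'], ['b']]   then   1 [['c']]
        ones += count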
/basicgrad.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | # -*- coding: utf-8 -*-
3 | """
4 | contains most of the important learning and evaluating functions
5 | """
6 | import numpy as np
7 | #import matplotlib.pyplot as mpl
8 | import scipy.special as s
9 | from const import *
10 |
11 | def trainConsts():
12 | """If you want to experiment with training constants, change them here"""
13 | l=5e-3 #regularisation constant
14 | alpha=1e-7 #learning constant
15 | return(l,alpha)
16 |
17 | class q(object):
18 | """Holds question with all its answers and T/F values as well as counted probabilities"""
19 | q=[]
20 | a=[]
21 | y=[]
22 | t=[]
23 | tcount=[]
24 | clues=[]
25 | qtext=[]
26 | atext=[]
27 | counts=[]
28 | idf=[]
29 | def __init__(self,q,a1,a0,qtext,atext1,atext0,clues1=0,clues0=0):
30 | self.q=np.transpose(np.array(q,ndmin=2)) # question emb. (column)
31 | a1=np.array(a1,ndmin=2) # correct ans. emb. (answers in rows)
32 | a0=np.array(a0,ndmin=2) # incorrect
33 | self.a=np.hstack((np.transpose(a1),np.transpose(a0))) # answer matrix (answer per column, correct come first)
34 | self.y=np.hstack((np.ones(len(a1)),np.zeros(len(a0)))) # answer labels
35 | self.qtext=qtext
36 | self.atext=atext1
37 | self.atext.extend((atext0))
38 | self.setCounts()
39 | self.setClues(clues1,clues0)
40 | def sett(self,M,b):
41 | """ compute answer labels based on model M,b """
42 | self.t=s.expit(z(self.q,M,self.a,b)[0]) # answer labels as estimated by the model
43 | def settcount(self,results):
44 | self.tcount=results
45 | def setClues(self,clues1,clues0):
46 | self.clues=np.hstack((clues1,clues0))
47 | def setCounts(self):
48 | """ compute counts of common words in question and each answer """
49 | N=len(self.y)
50 | self.counts=np.zeros(len(self.y))
51 | self.idf=np.zeros(len(self.y))
52 | for i in range(0,len(self.counts)):
53 | for word in self.qtext:
54 | wc=self.atext[i].tolist().count(word)
55 | self.counts[i]+=wc/len(self.atext[i])
56 | if wc>0:
57 | d=0
58 | for sentence in self.atext:
59 | if word in sentence:
60 | d+=1
61 | continue
62 | self.idf[i]+=wc*np.log(N/d)
63 |
64 |
65 | def ttlist(qa,a1a,a0a,ans1,ans0,sentences,c1=False,c0=False):
66 | """Returns list of qs"""
67 | clues1=np.zeros((2,sum(ans1)))
68 | clues0=np.zeros((2,sum(ans0)))
69 | if(c1):
70 | i=0
71 | with open(c1,'r') as f:
72 | for line in f:
73 | s=line.split(" ")
74 | clues1[0,i]=float(s[0])
75 | # clues1[1,i]=float(s[1])
76 | i+=1
77 | i=0
78 | with open(c0,'r') as f:
79 | for line in f:
80 | s=line.split(" ")
81 | clues0[0,i]=float(s[0])
82 | # clues0[1,i]=float(s[1])
83 | i+=1
84 |
85 | (questions,answers1,answers0)=sentences
86 | li=[]
87 | ones=0
88 | zeros=0
89 | for i in range(0,len(ans1)):
90 | li.append(q(qa[i],a1a[ones:ones+ans1[i]],a0a[zeros:zeros+ans0[i]],questions[i],
91 | answers1[ones:ones+ans1[i]],answers0[zeros:zeros+ans0[i]],clues1[:,ones:ones+ans1[i]],clues0[:,zeros:zeros+ans0[i]]))
92 | ones+=ans1[i]
93 | zeros+=ans0[i]
94 | return li
95 |
96 | def testGrad(M,b,li,idx):
97 | """Updates weights using basic gradient descent"""
98 | l,alpha=trainConsts()
99 | bestmrr=0.0
100 | n_iter = 200
101 | plot = np.zeros(int(n_iter / 5))
102 | for i in range(0, n_iter):
103 | ggM=0.0
104 | ggb=0.0
105 | if i%5==0:
106 | plot[int(i/5)]=lossAll(li,M,b)
107 | print '[%d/%d] loss function: %.1f (bestMRR %.3f) Thread number %d' % (i, n_iter, plot[int(i/5)], bestmrr, idx)
108 | for q in li:
109 | labels=q.y
110 | # np.transpose(np.array(q.a[:,j],ndmin=2))
111 | (gM,gb)=grad(labels,q.q,M,q.a,b)
112 | ggM+=gM
113 | ggb+=gb
114 | M=M-alpha*ggM
115 | b=b-alpha*ggb
116 | curmrr=mrr(M,b,li)
117 | if bestmrr