├── data
│   └── .keep
├── .spyderproject
├── .gitattributes
├── .gitignore
├── bow.py
├── std_run.sh
├── const.py
├── save.py
├── reparse.py
├── reparseprops.py
├── train.py
├── README.md
├── vecfromtext.py
└── basicgrad.py
/data/.keep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.spyderproject:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brmson/Sentence-selection/HEAD/.spyderproject
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 |
45 |
46 | ############## our things:
47 |
48 | data/jacana/pokus.txt
49 | data/glovewiki.txt
50 | *.pyc
51 | untitled0.py
52 | /.project
53 | /io.py
54 | *.p
55 | *.POSInput
56 | tests.py
57 | data/w2v.txt
58 | /truth.txt
59 | /res.txt
60 | /unigram-Mb.pickle
61 |
62 | # generated intermediate data
63 | /data
64 |
--------------------------------------------------------------------------------
/bow.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | # -*- coding: utf-8 -*-
3 |
4 | from vecfromtext import getGloveDict
5 | import numpy as np
6 | from const import *
7 |
8 |
9 | def prepForGrad(q,a1,a0,ans1,ans0,glovepath2):
10 | """ Bag-of-words based embedding vectors for each sentence.
11 | Returns a matrix with one sentence per row. """
12 | gloveDict=getGloveDict(glovepath2)
13 | qa=np.zeros((len(q),GLOVELEN))
14 | a1a=np.zeros((len(a1),GLOVELEN))
15 | a0a=np.zeros((len(a0),GLOVELEN))
16 | for i in range(0,len(q)):
17 | qa[i][:]=boxSentence(q[i],gloveDict)
18 | print 'questions embedded'
19 | for i in range(0,len(a1)):
20 | a1a[i][:]=boxSentence(a1[i],gloveDict)
21 | print 'true answers embedded'
22 | for i in range(0,len(a0)):
23 | a0a[i][:]=boxSentence(a0[i],gloveDict)
24 | print 'false answers embedded'
25 | return (qa,a1a,a0a)
26 |
27 | # Bag-of-words vector for a sentence: average of the GloVe vectors of its in-dictionary words
28 | def boxSentence(sentence,gloveDict):
29 | i=0
30 | v=np.zeros(GLOVELEN)
31 | for word in sentence:
32 | x=gloveDict.get(word)
33 | if x is not None:
34 | v+=x
35 | i+=1
36 | if i!=0:
37 | v=v/i
38 | return v
39 |
--------------------------------------------------------------------------------
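boxSentence() above simply averages the GloVe vectors of whatever tokens it finds in the dictionary. A toy check of that behaviour (the two-entry dictionary here is made up; vectors must be GLOVELEN=50-dimensional to match const.py):

    import numpy as np
    from bow import boxSentence

    toy = {'who': np.zeros(50), 'wrote': np.ones(50)}
    v = boxSentence(['who', 'wrote', 'hamlet'], toy)   # 'hamlet' is out-of-dictionary and ignored
    print v[:3]                                        # mean of the two known vectors -> [ 0.5  0.5  0.5]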
/std_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # A script to convert a set of (question, sentence, binLabel) tuples
4 | # (where sentence may be a passage or, with -p, property label)
5 | # to a classifier which attempts to predict binLabel from unseen
6 | # (question, sentences) pairs.
7 | #
8 | # Usage: std_run.sh [-p] TRAINDATAPATH
9 | #
10 | # Example: ./std_run.sh -p ../yodaqa/data/ml/embsel/propdata/
11 |
12 | if [[ -f resources/glove.6B.50d.txt ]]
13 | then
14 | echo "Dictionary allready downloaded"
15 | else
16 | echo "Downloading dictionary"
17 | wget http://pasky.or.cz/dev/brmson/glove.6B.50d.txt.gz
18 | gunzip glove.6B.50d.txt.gz
19 | mkdir -p resources
20 | mv glove.6B.50d.txt resources
21 | fi
22 |
23 |
24 |
25 | props=false
26 | if [ "$1" = "-p" ]; then
27 | props=true
28 | shift
29 | fi
30 | path=$1
31 |
32 | # Convert YodaQA-generated data to Jacana-style data
33 | if [ "$props" = true ]   # the original "[[ props ]]" is always true; test the flag variable instead
34 | then
35 | echo 'Running property-reparse'
36 | python reparseprops.py "$path"
37 | else
38 | echo 'Running sentence-reparse'
39 | python reparse.py "$path"
40 | fi
41 |
42 | # Convert Jacana-style data to pickled Python data structures
43 | echo 'Running save.py'
44 | python save.py
45 |
46 | # Train and save a classifier on top of the pickled data
47 | echo 'Running train.py'
48 | python train.py
49 |
--------------------------------------------------------------------------------
/const.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | GLOVELEN=50
4 | GLOVEPATH='resources/glove.6B.50d.txt'
5 | #GLOVEPATH='data/glove.6B.100d.txt'
6 | ##
7 | #QPATH='data/jacana/Train1-100.Question.POSInput'
8 | #APATH1='data/jacana/Train1-100.Positive-J.POSInput'
9 | #APATH0='data/jacana/Train1-100.Negative-T.POSInput'
10 | QPATH='data/Qtrain.txt'
11 | APATH1='data/Ptrain.txt'
12 | APATH0='data/Ntrain.txt'
13 | CPATH1="data/Clues1train.txt"
14 | CPATH0="data/Clues0train.txt"
15 | #QPATH='data/jacana/Train1-2393.Question.POSInput'
16 | #APATH1='data/jacana/Train1-2393.Positive-M.POSInput'
17 | #APATH0='data/jacana/Train1-2393.Negative-M.POSInput'
18 | #TQPATH='data/jacana/Test.Question.POSInput'
19 | #TAPATH1='data/jacana/Test.Positive-J.POSInput'
20 | #TAPATH0='data/jacana/Test.Negative-T.POSInput'
21 | TQPATH='data/Qtest.txt'
22 | TAPATH1='data/Ptest.txt'
23 | TAPATH0='data/Ntest.txt'
24 | #APATH0='data/Ntrain.txt'   # duplicate of the APATH0 assignment on line 12 above
25 | TCPATH1="data/Clues1test.txt"
26 | TCPATH0="data/Clues0test.txt"
27 |
28 | #GLOVEPATH='data/w2v.txt'
29 | GLOVEPATH2='data/usedembed.txt'
30 | TGLOVEPATH2='data/tusedembed.txt'
31 | PTQA='data/tqarray.txt'
32 | PTA1A='data/ta1rray.txt'
33 | PTA0A='data/ta0rray.txt'
34 | PTANS1='data/tans1.txt'
35 | PTANS0='data/tans0.txt'
36 | PQA='data/qarray.txt'
37 | PA1A='data/a1rray.txt'
38 | PA0A='data/a0rray.txt'
39 | PANS1='data/ans1.txt'
40 | PANS0='data/ans0.txt'
41 | LISTPATH="data/trainlist.p"
42 | TLISTPATH="data/testlist.p"
43 |
--------------------------------------------------------------------------------
/save.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | """
4 | input=jacana formated text files
5 | output=pickled q objects
6 | """
7 | from basicgrad import ttlist
8 | from vecfromtext import textArrays,shortGlove
9 | from bow import prepForGrad
10 | import pickle
11 | import numpy as np
12 | from const import *
13 |
14 | def trecEval(li,count=True):
15 | truth=open('truth.txt','w')
16 | res=open('res.txt','w')
17 | for i in range(0,len(li)):
18 | for j in range(0,len(li[i].y)):
19 | truth.write(' '.join(map(str,(i,0,j,int(li[i].y[j]),'\n'))))
20 | if (count):
21 | res.write(' '.join(map(str,(i,0,j,1,li[i].tcount[j],'glove','\n'))))
22 | else:
23 | res.write(' '.join(map(str,(i,0,j,1,li[i].t[j],'glove','\n'))))
24 | truth.close()
25 | res.close()
26 | print 'trec_eval created'
27 | return
28 |
29 | def saveQlist(QPATH,APATH1,APATH0,GLOVEPATH,GLOVEPATH2,PLIST,PANS1,PANS0,new_dict=False,c1=False,c0=False):
30 | """From jacana formated documents of questions, true answers, false answers
31 | saves list of Qs to PLIST path"""
32 | (q,a1,a0,ans1,ans0)=textArrays(QPATH,APATH1,APATH0)
33 | if new_dict==True:
34 | shortGlove(q,a1,a0,GLOVEPATH,GLOVEPATH2)
35 | (qa,a1a,a0a)=prepForGrad(q,a1,a0,ans1,ans0,GLOVEPATH2)
36 | sentences=(q,a1,a0)
37 | li=ttlist(qa,a1a,a0a,ans1,ans0,sentences,c1,c0)
38 | pickle.dump( li, open( PLIST , "wb" ) )
39 | np.savetxt(PANS1,ans1)
40 | np.savetxt(PANS0,ans0)
41 | return
42 |
43 | saveQlist(QPATH,APATH1,APATH0,GLOVEPATH,GLOVEPATH2,LISTPATH,PANS1,PANS0,new_dict=True,c1=CPATH1,c0=CPATH0)
44 | print 'training data saved'
45 | #saveQlist(TQPATH,TAPATH1,TAPATH0,GLOVEPATH,TGLOVEPATH2,TLISTPATH,PTANS1,PTANS0,new_dict=True,c1=TCPATH1,c0=TCPATH0)
46 | #print 'testing data saved'
47 |
--------------------------------------------------------------------------------
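Once save.py above has produced data/trainlist.p and the ans*.txt files, the pickled list can be read back exactly the way train.py does it; a minimal sketch (attribute names follow the q class in basicgrad.py):

    from vecfromtext import loadList
    from const import LISTPATH, PANS1, PANS0

    trainlist, ans1, ans0 = loadList(LISTPATH, PANS1, PANS0)
    print len(trainlist), 'questions loaded'   # one basicgrad.q object per question
    print trainlist[0].y                       # gold labels: correct answers first, then incorrect
    print trainlist[0].counts                  # question/answer word-overlap counts (see setCounts())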
/reparse.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Usage: reparse.py DATAPATH
4 |
5 | input  = yodaqa CSV outputs (sentences)
6 | output = jacana-formatted files for use in save.py
7 | """
8 |
9 | import os
10 | import sys
11 | import glob
12 |
13 | QPATH="data/Qtrain.txt"
14 | PPATH="data/Ptrain.txt"
15 | NPATH="data/Ntrain.txt"
16 | CPATH1="data/Clues1train.txt"
17 | CPATH0="data/Clues0train.txt"
18 | #TPATH="data/curated-test"
19 | #TQPATH="data/Qtest.txt"
20 | #TPPATH="data/Ptest.txt"
21 | #TNPATH="data/Ntest.txt"
22 | #TCPATH1="data/Clues1test.txt"
23 | #TCPATH0="data/Clues0test.txt"
24 |
25 |
26 | def reparse(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0):
27 | q=open(QPATH,'w')
28 | p=open(PPATH,'w')
29 | n=open(NPATH,'w')
30 | cp=open(CPATH1,'w')
31 | cn=open(CPATH0,'w')
32 |
33 | qnum=0
34 | path=PATH+"/"+file
35 | i=0
36 | p.write("\n")
37 | n.write("\n")
38 | with open(path,'r') as f:
39 | for line in f:
40 | s=line.split(" ")
41 | if(s[0]=="" and i==0):
42 | q.write("\n")
43 | q.write(" ".join(s[1:]))
44 | q.write("
\n")
45 | i+=1
46 | qnum+=1
47 | continue
48 | elif(s[0]=="" and i!=0):
49 | continue
50 | if(s[0]=='1'):
51 | p.write(" ".join(s[3:]))
52 | cp.write(" ".join(s[1:3])+"\n")
53 | else:
54 | n.write(" ".join(s[3:]))
55 | cn.write(" ".join(s[1:3])+"\n")
56 | p.write("
\n")
57 | n.write("\n")
58 | print ".",
59 | q.close()
60 | p.close()
61 | n.close()
62 | cp.close()
63 | cn.close()
64 |
65 |
66 | PATH = sys.argv[1]
67 | reparse(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0)
68 | #reparse(TPATH,TQPATH,TPPATH,TNPATH,TCPATH1,TCPATH0)
69 |
--------------------------------------------------------------------------------
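In reparse() above, `file` is referenced without ever being defined, so the per-file loop header appears to have been lost from this copy. A minimal reconstruction of the intended iteration, assuming the directory walk implied by `path=PATH+"/"+file` (and analogous to the glob loop in reparseprops.py below); `iter_dump_files` is a hypothetical helper, not a function in this repo:

    import os

    def iter_dump_files(PATH):
        """Yield the path of every per-question dump file under PATH (hypothetical helper)."""
        for file in os.listdir(PATH):
            yield PATH + "/" + file

    # reparse() would then wrap its per-file body (from `i=0` down to `print ".",`) in:
    #     for path in iter_dump_files(PATH):
    #         ...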
/reparseprops.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Usage: reparseprops.py DATAPATH
4 |
5 | input  = yodaqa CSV outputs (properties)
6 | output = jacana-formatted files for use in save.py
7 | """
8 |
9 | import os
10 | import sys
11 | import glob
12 |
13 | QPATH="data/Qtrain.txt"
14 | PPATH="data/Ptrain.txt"
15 | NPATH="data/Ntrain.txt"
16 | CPATH1="data/Clues1train.txt"
17 | CPATH0="data/Clues0train.txt"
18 | #TPATH="data/curated-test"
19 | #TQPATH="data/Qtest.txt"
20 | #TPPATH="data/Ptest.txt"
21 | #TNPATH="data/Ntest.txt"
22 | #TCPATH1="data/Clues1test.txt"
23 | #TCPATH0="data/Clues0test.txt"
24 |
25 | def notNumber(s):
26 | try:
27 | float(s)
28 | return False
29 | except ValueError:
30 | return True
31 |
32 |
33 | def reparseProps(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0):
34 | q=open(QPATH,'w')
35 | p=open(PPATH,'w')
36 | n=open(NPATH,'w')
37 | cp=open(CPATH1,'w')
38 | cn=open(CPATH0,'w')
39 |
40 | qnum=0
41 | for path in glob.glob(PATH + '/*'):
42 | i=0
43 | p.write("\n")
44 | n.write("\n")
45 | propdict=dict()
46 | propset=set()
47 | with open(path,'r') as f:
48 | for line in f:
49 | s=line.split(" ")
50 | if(s[0]!=""):
51 | s=line.split(" ")
52 | text=" ".join(s[2:]).lower()
53 | if text in propdict:
54 | if(s[0]=='1'):
55 | propdict[text]='1'
56 | continue
57 | propdict[text]=s[0]
58 | with open(path,'r') as f:
59 | for line in f:
60 | s=line.split(" ")
61 | if(s[0]=="" and i==0):
62 | q.write("\n")
63 | q.write(" ".join(s[1:]))
64 | q.write("
\n")
65 | i+=1
66 | qnum+=1
67 | continue
68 | elif(s[0]=="" and i!=0):
69 | continue
70 | if notNumber(s[0]) or notNumber(s[1]):
71 | continue
72 | # print s
73 | text=" ".join(s[2:]).lower()
74 | if text not in propset:
75 | # print text
76 | if(propdict[text]=='1'):
77 | p.write(text)
78 | cp.write(" ".join(s[1:2])+"\n")
79 | else:
80 | n.write(text)
81 | cn.write(" ".join(s[1:2])+"\n")
82 | propset.add(text)
83 | p.write("
\n")
84 | n.write("\n")
85 | print ".",
86 | q.close()
87 | p.close()
88 | n.close()
89 | cp.close()
90 | cn.close()
91 |
92 |
93 | PATH = sys.argv[1]
94 | reparseProps(PATH,QPATH,PPATH,NPATH,CPATH1,CPATH0)
95 | #reparseProps(TPATH,TQPATH,TPPATH,TNPATH,TCPATH1,TCPATH0)
96 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | """
4 | input=q objects
5 | output=trained weights
6 | """
7 |
8 | import time
9 | import random
10 | from basicgrad import mrrcount,mrr,setRes,getInputsClues,testGrad,trainConsts
11 | from const import *
12 | import numpy as np
13 | from sklearn import linear_model
14 | from vecfromtext import loadList,saveMb
15 | from multiprocessing import Pool
16 |
17 | def cross_validate_one(idx):
18 | global gdata
19 | (M,b,trainlist,threads)=gdata
20 | if idx==0:
21 | (M2,b2)=testGrad(M,b,trainlist,idx)
22 | res=0
23 | else:
24 | random.shuffle(trainlist)
25 | trainvalborder=len(trainlist)*(threads-2)/(threads-1)
26 | (M2,b2)=testGrad(M,b,trainlist[:trainvalborder],idx)
27 | print 'MRR after unigram learning train(idx=',idx,'):',mrr(M2,b2,trainlist)
28 | res=mrr(M2,b2,trainlist[trainvalborder:])
29 | print 'MRR after unigram learning val(idx=',idx,'):',res
30 | return (res,M2,b2)
31 |
32 | def cross_validate_all(M,b,trainlist):
33 | global gdata
34 | threads=5
35 | gdata=(M,b,trainlist,threads+1)
36 | i=0
37 | pool = Pool()
38 | mrrs=[]
39 | for res in pool.imap(cross_validate_one,range(threads+1)):
40 | mrr_i,M,b=res  # renamed from 'mrr' to avoid shadowing the imported mrr() function
41 | if i==0:
42 | retM=M
43 | retb=b
44 | i+=1
45 | else:
46 | mrrs.append(mrr_i)
47 | pool.close()
48 | return (mrrs,sum(mrrs)/threads,retM,retb)
49 |
50 | def trainMb(trainlist,ans1,ans0):
51 | """Unigram training from saved Qlist files, returns Mb weights.
52 | You can play with the learning constants in trainConsts() of basicgrad.py"""
53 | t0=time.time()
54 | M=np.random.normal(0,0.01,(GLOVELEN,GLOVELEN))
55 | b=-0.0001
56 | # M=np.loadtxt('data/M58prop')
57 | # b=np.loadtxt('data/b58prop')
58 | mrrs,crossmrr,M,b=cross_validate_all(M,b,trainlist)
59 | t1=time.time()
60 | print "time spent training =",t1-t0
61 | print "MRR after crossvalidation=",crossmrr
62 |
63 | # XXX: This has a side effect, setting resolutions in trainlist
64 | trainmrr=mrr(M,b,trainlist)
65 | print 'Mb MRR on train:', trainmrr
66 | l,alpha=trainConsts()
67 | results=[crossmrr,mrrs,l,alpha,trainmrr]
68 | return (M,b,results)
69 |
70 |
71 | def trainClues(trainlist,ans1,ans0):
72 | """Logistic regression using Mb probability and clues as input.
73 | requires mrr(M,b,trainlist) called beforehand to work properly"""
74 | (x,y)=getInputsClues(trainlist,ans1,ans0)
75 | clf = linear_model.LogisticRegression(C=1, penalty='l2', tol=1e-5)
76 | clf.fit(x, y)
77 | counttest=clf.predict_proba(x)
78 | setRes(trainlist,ans1,ans0,counttest[:,1])
79 | mrrt=mrrcount(trainlist,ans1,ans0)
80 | print 'MRR unigram+clues train',mrrt
81 | w=clf.coef_
82 | w=np.append(w,clf.intercept_)
83 | return w
84 |
85 | def train(LISTPATH,PANS1,PANS0):
86 | (trainlist,ans1,ans0)=loadList(LISTPATH,PANS1,PANS0)
87 | print 'data loaded'
88 | (M,b,results)=trainMb(trainlist,ans1,ans0)
89 | w=trainClues(trainlist,ans1,ans0)
90 |
91 | prop_num=0
92 | for q in trainlist:
93 | prop_num+=len(q.y)
94 | q_num=len(trainlist)
95 | print "trained on",q_num,"questions"
96 | print "trained on",prop_num,"properties"
97 | crossmrr,mrrs,l,alpha,trainmrr=results
98 | results=(q_num,prop_num,crossmrr,mrrs,l,alpha,trainmrr)
99 | return (M,b,w,results)
100 |
101 |
102 | if __name__ == "__main__":
103 | # Seed always to the same number to get reproducible models
104 | np.random.seed(17151713)
105 |
106 | (M, b, w, results) = train(LISTPATH, PANS1, PANS0)
107 |
108 | saveMb(M,b,"data/Mbtemp.txt",results)
109 | np.savetxt('data/weights.txt',w)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Deep Learning for Answer Sentence Selection Reconstruction
2 | ==========================================================
3 |
4 | This work started as an attempt to reproduce Yu et al.'s *Deep Learning for Answer Sentence Selection* (http://arxiv.org/abs/1412.1632).
5 |
6 | Used word embeddings: pre-trained GloVe vectors from http://nlp.stanford.edu/projects/glove/
7 |
8 | So far implemented:
9 | * Bag-of-words sentence embeddings + a basic gradient-descent classifier
10 | * Bag-of-words sentence embeddings + a basic gradient-descent classifier + logistic regression on word-count clues (see the scoring sketch below)
11 |
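The core scorer (see sett() in basicgrad.py, which applies `scipy.special.expit` to `z(q, M, a, b)`) appears to follow Yu et al.'s bilinear bag-of-words model: the probability that an answer is correct is sigma(q^T M a + b), where q and a are the averaged GloVe vectors of the question and the answer. A minimal sketch of that scoring step; `score()` is a hypothetical helper, not a function in this repo:

    import numpy as np
    from scipy.special import expit

    def score(q_vec, a_vec, M, b):
        # sigma(q^T M a + b): probability that the answer embedding a_vec
        # answers the question embedding q_vec (Yu et al. 2014 style)
        return expit(np.dot(q_vec, M).dot(a_vec) + b)

    # toy usage with GLOVELEN=50-dimensional vectors (see const.py)
    rng = np.random.RandomState(0)
    q_vec, a_vec = rng.randn(50), rng.randn(50)
    M, b = rng.normal(0, 0.01, (50, 50)), -0.0001   # same initialisation as trainMb() in train.py
    print score(q_vec, a_vec, M, b)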
12 | Development Instructions
13 | ------------------------
14 |
15 | For sentence selection development, the dataset used is the TREC-based one
16 | originally built by Wang et al., 2007, in the form prepared by Yao et al., 2013,
17 | as downloaded from https://code.google.com/p/jacana/
18 |
19 | Preprocessing (not required):
20 | * Run save.py first with updated filepath constants (const.py) if you have a different dataset (requires jacana formatting)
21 |
22 | Train and test:
23 | * Run train.py for training on the TREC TRAIN dataset and testing on the TREC TEST dataset
24 | * train.py generates truth.txt and res.txt; to evaluate them using the official trec_eval tool (file layouts shown below), run
25 |
26 | trec_eval -a truth.txt res.txt
27 |
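For reference, trecEval() in save.py writes one line per (question, candidate sentence) pair in the standard trec_eval layouts; the scores below are made-up illustrations:

    # truth.txt (qrels):  question-id  iteration  sentence-idx  gold-label
    0 0 0 1
    0 0 1 0
    # res.txt (run file): question-id  iteration  sentence-idx  rank  model-score  run-id
    0 0 0 1 0.87 glove
    0 0 1 1 0.12 glove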
28 | TODO:
29 | * CNN instead of bag of words unigram averaging for aggregate embeddings.
30 |
31 | Results (evaluated using stock TREC scripts):
32 |
33 | | | MRR | MAP |
34 | |-----------------|--------|--------|
35 | | TRAIN | 0.7312 | 0.6551 |
36 | | TRAIN-ALL | 0.7308 | 0.6566 |
37 | | TRAIN+count | 0.7763 | 0.7165 |
38 | | TRAIN-ALL+count | 0.8128 | 0.7258 |
39 |
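Here MRR is the mean reciprocal rank of the first correct sentence, MRR = (1/|Q|) * sum_i 1/rank_i, and MAP is the mean over questions of the average precision across all correct sentences, both as reported by trec_eval.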
40 |
41 | Property selection in yodaqa/moviesC:
42 | -------------------------------------
43 |
44 | Follow these steps if you want to retrain the currently used weights:
45 |
46 | * Gather input data (labelled tuples) according to the instructions
47 | in YodaQA data/ml/embsel/README.md.
48 |
49 | * Run './std_run.sh -p PATH' (PATH is the directory of dumped yodaqa files).
50 | You can alter the training constants in basicgrad.py and train.py.
51 |
52 | * If you are happy with the results, copy the generated file data/Mbtemp.txt
53 | to YodaQA's src/main/resources/cz/brmlab/yodaqa/analysis/rdf/Mbprop.txt
54 |
55 | In summary, use this:
56 |
57 | ./std_run.sh -p ../yodaqa/data/ml/embsel/propdata
58 | cp data/Mbtemp.txt ../yodaqa/src/main/resources/cz/brmlab/yodaqa/analysis/rdf/Mbprop.txt
59 |
60 | ### Snapshot of results based on curated:
61 |
62 | (With a random 1:1 train:test split of the original curated-train.)
63 |
64 | **Used dataset:**
65 |
66 | train questions: 270 train sentences: 19624 (generated with curated-measure.sh train)
67 | test questions: 222 test sentences: 17561 (generated with curated-measure.sh test)
68 | 2.7902739024% of the properties contain correct answers
69 | random test mrr = 0.0475542678953
70 |
71 | **Current results:**
72 |
73 | MRR after unigram learning train: 0.600856454434
74 | MRR after unigram learning test: 0.582881935037
75 |
76 |
77 | Sentence selection on yodaqa/curated:
78 | -------------------------------------
79 |
80 | Follow these steps if you want to retrain the currently used weights:
81 |
82 | * Gather input data (labelled tuples) according to the instructions
83 | in YodaQA data/ml/embsel/README.md.
84 |
85 | * Run './std_run.sh PATH' (PATH is the directory of dumped yodaqa files).
86 | You can alter the training constants in basicgrad.py and train.py.
87 |
88 | * If you are happy with the results, copy the generated file data/Mbtemp.txt
89 | to YodaQA's src/main/resources/cz/brmlab/yodaqa/analysis/passextract/Mb.txt
90 |
91 | In summary, use this (with YodaQA's f/sentence-selection branch):
92 |
93 | ./std_run.sh ../yodaqa/data/ml/embsel/sentdata
94 | cp data/Mbtemp.txt ../yodaqa/src/main/resources/cz/brmlab/yodaqa/analysis/passextract/Mb.txt
95 |
96 | ### Snapshot of results based on curated:
97 |
98 | (With a random 1:1 train:test split of the original curated-train.)
99 |
100 | **Used dataset:**
101 |
102 | train questions: 186 train sentences: 43843 (generated with curated-measure.sh train)
103 | test questions: 429 test sentences: 88779 (generated with curated-measure.sh test)
104 | 5.21294450264% of the properties contain correct answers
105 | random test mrr = 0.0760195275186
106 |
107 | **Current results:**
108 |
109 | baseline (clue1+0.25*clue2):
110 |
111 | MRR unigram+clues train 0.249327071552
112 | MRR unigram+clues test 0.29659580682
113 |
114 | glove only:
115 |
116 | MRR after unigram learning train: 0.224787152966
117 | MRR after unigram learning test: 0.222749753007
118 |
119 | glove+clue1:
120 |
121 | MRR unigram+clues train 0.358206351223
122 | MRR unigram+clues test 0.388948882077
123 |
124 |
--------------------------------------------------------------------------------
/vecfromtext.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | import pickle
5 | import re
6 |
7 | def getGloveDict(glovepath2):
8 | """Returns discionary of used words"""
9 | gloveDict = dict()
10 | with open(glovepath2,'r') as f:
11 | for line in f:
12 | word=line.split(' ',1)[0]
13 | gloveDict[word] = np.array(line.split(' ')[1:]).astype(float)
14 | return gloveDict
15 |
16 |
17 | def textArrays(qpath,apath1,apath0):
18 | """ Returns qa text vectors from files with jacana formating.
19 | Text == array of tokens.
20 | It is a tuple of:
21 | * a list of question texts
22 | * a list of texts of all correct answers (across all questions)
23 | * a list of texts of all incorrect answers
24 | * for each question, #of correct answers (used for computing the index in list of all correct answers)
25 | * for each question, #of incorrect answers
26 | """
27 | questions=[]
28 | with open(qpath,'r') as f:
29 | for line in f:
30 | line=line.lower()
31 | if line[0]!='<':
32 | line=re.sub('[^0-9a-zA-Z]+',' ', line)
33 | x=np.array(line.split(' ')[:-1])
34 | questions.append(x)
35 |
36 | answers1=[]
37 | i=0
38 | ans1=[]
39 | with open(apath1,'r') as f:
40 | for line in f:
41 | line=line.lower()
42 | if line[0]!='<':
43 | i+=1
44 | line=re.sub('[^0-9a-zA-Z]+',' ', line)
45 | x=np.array(line.split(' ')[:-1])
46 | answers1.append(x)
47 | elif line[0:2]=='</':  # NOTE: reconstructed; the '</' prefix appears to have been stripped from this literal in the dump
48 | ans1.append(i)
49 | i=0
50 | answers0=[]
51 | i=0
52 | ans0=[]
53 | with open(apath0,'r') as f:
54 | for line in f:
55 | line=line.lower()
56 | if line[0]!='<':
57 | i+=1
58 | line=re.sub('[^0-9a-zA-Z]+',' ', line)
59 | x=np.array(line.split(' ')[:-1])
60 | if len(x)<1:
61 | i-=1
62 | else:
63 | answers0.append(x)
64 | elif line[0:2]=='</':  # NOTE: reconstructed; the '</' prefix appears to have been stripped from this literal in the dump
65 | ans0.append(i)
66 | i=0
67 | return (questions,answers1,answers0,ans1,ans0)
68 |
69 | def shortGlove(questions,answers1,answers0,glovepath_in,glovepath_out):
70 | """ From a full Glove dictionary (glovepath2),
71 | creates smaller Glove-vector file with used words only """
72 | i=0
73 | words=set()
74 | for sentence in questions:
75 | for word in questions[i]:
76 | if word not in words:
77 | words.add(word)
78 | i+=1
79 | i=0
80 | for sentence in answers1:
81 | for word in answers1[i]:
82 | if word not in words:
83 | words.add(word)
84 | i+=1
85 | i=0
86 | for sentence in answers0:
87 | for word in answers0[i]:
88 | if word not in words:
89 | words.add(word)
90 | i+=1
91 | used=open(glovepath_out,'w')
92 | with open(glovepath_in,'r') as f:
93 | for line in f:
94 | word=line.split(' ',1)[0]
95 | if word in words:
96 | # print 'found',word
97 | used.write(line)
98 | words.remove(word)
99 | used.close()
100 | return
101 |
102 | def saveArrays(qa,a1a,a0a,ans1,ans0,pqa,pa1a,pa0a,pans1,pans0):
103 | np.savetxt(pqa,qa)
104 | np.savetxt(pa1a,a1a)
105 | np.savetxt(pa0a,a0a)
106 | np.savetxt(pans1,ans1)
107 | np.savetxt(pans0,ans0)
108 | return
109 |
110 | #results=[q_num,prop_num,crossmrr,mrrs,l,alpha,trainmrr]
111 | def saveMb(M,b,path,results):
112 | np.savetxt(path,M)
113 | m=open(path,'a')
114 | m.write("%f"%b)
115 | m.close()
116 | with open(path, "r+") as f:
117 | old = f.read()
118 | f.seek(0)
119 | f.write("\\\\Weights for property-selection feature, generated with https://github.com/brmson/Sentence-selection\n" +
120 | "\\\\Trained on "+str(results[0])+" questions, "+str(results[1])+" properties\n"+
121 | "\\\\Cross-validation MRRs: "+str(results[3])+"\n"+
122 | "\\\\Mean cross-validation MRR "+str(results[2])+"\n"+
123 | "\\\\MRR on the whole training set: "+str(results[6])+"\n"+
124 | "\\\\Learning constant alpha = "+str(results[5])+"\n"+
125 | "\\\\Regularisation constant l = "+str(results[4])+"\n"+
126 | old)
127 |
128 | def loadArrays(qa,a1a,a0a):
129 | qa=np.loadtxt(qa)
130 | a1a=np.loadtxt(a1a)
131 | a0a=np.loadtxt(a0a)
132 | return (qa,a1a,a0a)
133 |
134 | def loadList(LISTPATH,PANS1,PANS0):
135 | ans1=np.loadtxt(PANS1).astype(int)
136 | ans0=np.loadtxt(PANS0).astype(int)
137 | li = pickle.load( open( LISTPATH, "rb" ) )
138 | return (li,ans1,ans0)
--------------------------------------------------------------------------------
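The per-question counts ans1/ans0 returned by textArrays() above index into the flat answers1/answers0 lists via cumulative offsets; this is exactly what ttlist() in basicgrad.py below does with its `ones`/`zeros` counters. A toy illustration with made-up token lists:

    answers1 = [['a'], ['b'], ['c']]   # all correct answers across all questions, flattened
    ans1     = [2, 1]                  # question 0 has 2 correct answers, question 1 has 1

    ones = 0
    for i, count in enumerate(ans1):
        print i, answers1[ones:ones + count]   # 0 [['a'], ['b']]   then   1 [['c']]
        ones += count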
/basicgrad.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | # -*- coding: utf-8 -*-
3 | """
4 | contains most of the important learning and evaluating functions
5 | """
6 | import numpy as np
7 | #import matplotlib.pyplot as mpl
8 | import scipy.special as s
9 | from const import *
10 |
11 | def trainConsts():
12 | """If you want to experiment with training constants, change them here"""
13 | l=5e-3 #regularisation constant
14 | alpha=1e-7 #learning constant
15 | return(l,alpha)
16 |
17 | class q(object):
18 | """Holds question with all its answers and T/F values as well as counted probabilities"""
19 | q=[]
20 | a=[]
21 | y=[]
22 | t=[]
23 | tcount=[]
24 | clues=[]
25 | qtext=[]
26 | atext=[]
27 | counts=[]
28 | idf=[]
29 | def __init__(self,q,a1,a0,qtext,atext1,atext0,clues1=0,clues0=0):
30 | self.q=np.transpose(np.array(q,ndmin=2)) # question emb. (column)
31 | a1=np.array(a1,ndmin=2) # correct ans. emb. (answers in rows)
32 | a0=np.array(a0,ndmin=2) # incorrect
33 | self.a=np.hstack((np.transpose(a1),np.transpose(a0))) # answer matrix (answer per column, correct come first)
34 | self.y=np.hstack((np.ones(len(a1)),np.zeros(len(a0)))) # answer labels
35 | self.qtext=qtext
36 | self.atext=atext1
37 | self.atext.extend((atext0))
38 | self.setCounts()
39 | self.setClues(clues1,clues0)
40 | def sett(self,M,b):
41 | """ compute answer labels based on model M,b """
42 | self.t=s.expit(z(self.q,M,self.a,b)[0]) # answer labels as estimated by the model
43 | def settcount(self,results):
44 | self.tcount=results
45 | def setClues(self,clues1,clues0):
46 | self.clues=np.hstack((clues1,clues0))
47 | def setCounts(self):
48 | """ compute counts of common words in question and each answer """
49 | N=len(self.y)
50 | self.counts=np.zeros(len(self.y))
51 | self.idf=np.zeros(len(self.y))
52 | for i in range(0,len(self.counts)):
53 | for word in self.qtext:
54 | wc=self.atext[i].tolist().count(word)
55 | self.counts[i]+=wc/len(self.atext[i])
56 | if wc>0:
57 | d=0
58 | for sentence in self.atext:
59 | if word in sentence:
60 | d+=1
61 | continue
62 | self.idf[i]+=wc*np.log(N/d)
63 |
64 |
65 | def ttlist(qa,a1a,a0a,ans1,ans0,sentences,c1=False,c0=False):
66 | """Returns list of qs"""
67 | clues1=np.zeros((2,sum(ans1)))
68 | clues0=np.zeros((2,sum(ans0)))
69 | if(c1):
70 | i=0
71 | with open(c1,'r') as f:
72 | for line in f:
73 | s=line.split(" ")
74 | clues1[0,i]=float(s[0])
75 | # clues1[1,i]=float(s[1])
76 | i+=1
77 | i=0
78 | with open(c0,'r') as f:
79 | for line in f:
80 | s=line.split(" ")
81 | clues0[0,i]=float(s[0])
82 | # clues0[1,i]=float(s[1])
83 | i+=1
84 |
85 | (questions,answers1,answers0)=sentences
86 | li=[]
87 | ones=0
88 | zeros=0
89 | for i in range(0,len(ans1)):
90 | li.append(q(qa[i],a1a[ones:ones+ans1[i]],a0a[zeros:zeros+ans0[i]],questions[i],
91 | answers1[ones:ones+ans1[i]],answers0[zeros:zeros+ans0[i]],clues1[:,ones:ones+ans1[i]],clues0[:,zeros:zeros+ans0[i]]))
92 | ones+=ans1[i]
93 | zeros+=ans0[i]
94 | return li
95 |
96 | def testGrad(M,b,li,idx):
97 | """Updates weights using basic gradient descent"""
98 | l,alpha=trainConsts()
99 | bestmrr=0.0
100 | n_iter = 200
101 | plot = np.zeros(int(n_iter / 5))
102 | for i in range(0, n_iter):
103 | ggM=0.0
104 | ggb=0.0
105 | if i%5==0:
106 | plot[int(i/5)]=lossAll(li,M,b)
107 | print '[%d/%d] loss function: %.1f (bestMRR %.3f) Thread number %d' % (i, n_iter, plot[int(i/5)], bestmrr, idx)
108 | for q in li:
109 | labels=q.y
110 | # np.transpose(np.array(q.a[:,j],ndmin=2))
111 | (gM,gb)=grad(labels,q.q,M,q.a,b)
112 | ggM+=gM
113 | ggb+=gb
114 | M=M-alpha*ggM
115 | b=b-alpha*ggb
116 | curmrr=mrr(M,b,li)
117 | if bestmrr