├── FocusedLabeling ├── process_inference.lua ├── generate_inference_data.py ├── infer_crf.lua ├── test_crf.lua └── train_crf.lua ├── src ├── model │ ├── TripleScore.lua │ ├── BatchDot.lua │ ├── BiRNN.lua │ ├── BiRNNSelect.lua │ ├── Linear.lua │ ├── model_utils.lua │ ├── CRF.lua │ └── BiGRU.lua ├── data │ ├── Vocab.lua │ ├── SeqLabelRankLoader.lua │ ├── SeqLabelingLoader.lua │ ├── SeqMultiLabelLoader.lua │ ├── RankingDataLoader.lua │ └── SeqRankingLoader.lua ├── optim │ ├── AdaGrad.lua │ └── SGD.lua └── py_module │ ├── QAData.py │ ├── freebase.py │ └── virtuoso.py ├── EntityTypeVec ├── process_inference.lua ├── test_ent_typevec.lua ├── infer_ent_typevec.lua └── train_ent_typevec.lua ├── vocab └── create_vocab.lua ├── process.lua ├── RelationRNN ├── process_inference.lua ├── infer_rel_rnn.lua └── train_rel_rnn.lua ├── init.lua ├── .gitignore ├── Inference ├── valid │ └── run.sh ├── generate_score_data.py ├── test │ └── run.sh ├── joint_predict.py ├── query_candidates.py └── joint_disambiguation.py ├── KnowledgeBase ├── convert.py └── type.top-500.pkl ├── data_preprocess.sh ├── Virtuoso.md ├── README.md └── SimpleQuestions ├── generate_training_data.py └── PreprocessData └── process_rawdata.py /FocusedLabeling/process_inference.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text('Options') 5 | cmd:option('-testSplit','valid','use which data split set') 6 | cmd:text() 7 | 8 | local opt = cmd:parse(arg) 9 | 10 | local wordVocab = torch.load('../vocab/vocab.word.t7') 11 | 12 | local txtPath = string.format('inference-data/label.%s.txt', opt.testSplit) 13 | local thPath = string.format('inference-data/label.%s.t7', opt.testSplit) 14 | 15 | createSeqLabelingData(txtPath, thPath, wordVocab, 1) -------------------------------------------------------------------------------- /src/model/TripleScore.lua: -------------------------------------------------------------------------------- 1 | function TripleScore(negBatchSize) 2 | local tarVec = nn.Identity()() 3 | local posVec = nn.Identity()() 4 | local negMat = nn.Identity()() 5 | 6 | local scoreVecPos = BatchDot() ({tarVec, posVec}) 7 | local scoreMatPos = nn.Replicate(negBatchSize) (scoreVecPos) 8 | 9 | local tarMat = nn.Replicate(negBatchSize) (tarVec) 10 | local scoreMatNeg = BatchDot() ({tarMat, negMat}) 11 | 12 | return nn.gModule({tarVec, posVec, negMat}, {scoreMatPos, scoreMatNeg}) 13 | end 14 | -------------------------------------------------------------------------------- /EntityTypeVec/process_inference.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text('Options') 5 | cmd:option('-testSplit','valid','use which data split set') 6 | cmd:text() 7 | 8 | local opt = cmd:parse(arg) 9 | 10 | local wordVocab = torch.load('../vocab/vocab.word.t7') 11 | local relationVocab = torch.load('../vocab/vocab.rel.t7') 12 | 13 | local txtPath = string.format('../Inference/FB5M-ngram/type.multi.%s.txt', opt.testSplit) 14 | local thPath = string.format('inference-data/ent.%s.t7', opt.testSplit) 15 | 16 | createSeqLabelRankData(txtPath, thPath, wordVocab, 501) -------------------------------------------------------------------------------- /vocab/create_vocab.lua: -------------------------------------------------------------------------------- 1 | require '..' 
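-- create_vocab.lua builds the vocabularies used throughout the pipeline: the word vocabulary from the
-- GloVe-based word list (word.glove100k.txt, with unknown and padding tokens added) and the Freebase
-- relation/entity vocabularies from the FB5M lists in ../KnowledgeBase, each saved as a .t7 file
-- (vocab.word.t7, vocab.rel.t7, vocab.ent.t7) that process.lua and the inference scripts load.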
2 | 3 | function createWordVocab() 4 | local wordVocab = Vocab('word.glove100k.txt') 5 | wordVocab:add_unk_token() 6 | wordVocab:add_pad_token() 7 | 8 | torch.save('vocab.word.t7', wordVocab) 9 | end 10 | 11 | function createFBVocab() 12 | local vocabPath = '../KnowledgeBase' 13 | 14 | local relVocab = Vocab(vocabPath..'/FB5M.rel.txt') 15 | relVocab:add_unk_token() 16 | 17 | local entVocab = Vocab(vocabPath..'/FB5M.ent.txt') 18 | entVocab:add_unk_token() 19 | 20 | torch.save('vocab.rel.t7', relVocab) 21 | torch.save('vocab.ent.t7', entVocab) 22 | end 23 | 24 | createWordVocab() 25 | createFBVocab() 26 | -------------------------------------------------------------------------------- /src/model/BatchDot.lua: -------------------------------------------------------------------------------- 1 | local BatchDot, parent = torch.class('BatchDot', 'nn.Module') 2 | 3 | function BatchDot:__init() 4 | parent.__init(self) 5 | self.gradInput = {torch.Tensor(), torch.Tensor()} 6 | self._viewSize = torch.LongStorage() 7 | end 8 | 9 | function BatchDot:updateOutput(input) 10 | self.output = torch.cmul(input[1], input[2]):sum(input[1]:dim()) 11 | return self.output 12 | end 13 | 14 | function BatchDot:updateGradInput(input, gradOutput) 15 | expandGradOutput = torch.expand(gradOutput, input[1]:size()) 16 | self.gradInput[1] = torch.cmul(expandGradOutput, input[2]) 17 | self.gradInput[2] = torch.cmul(expandGradOutput, input[1]) 18 | return self.gradInput 19 | end 20 | -------------------------------------------------------------------------------- /process.lua: -------------------------------------------------------------------------------- 1 | require '.' 2 | 3 | function trainData() 4 | local wordVocab = torch.load('vocab/vocab.word.t7') 5 | local entVocab = torch.load('vocab/vocab.ent.t7') 6 | local relVocab = torch.load('vocab/vocab.rel.t7') 7 | 8 | trainDir = 'SimpleQuestions/trainingData' 9 | 10 | -- focused labeling 11 | createSeqLabelingData(trainDir..'/data.train.focused_labeling', 'data/train.focused_labeling.t7', wordVocab, 128) 12 | 13 | -- entity network 14 | createSeqMultiLabelData(trainDir..'/data.train.entity_typevec', 'data/train.entity_typevec.t7', wordVocab, 501, 256) 15 | 16 | -- relation network 17 | createSeqRankingData(trainDir..'/data.train.relation_ranking', 'data/train.relation_ranking.t7', wordVocab, relVocab, 256) 18 | end 19 | 20 | trainData() 21 | -------------------------------------------------------------------------------- /src/model/BiRNN.lua: -------------------------------------------------------------------------------- 1 | local BiRNN, parent = torch.class('BiRNN', 'nn.Module') 2 | 3 | -- initialize the module 4 | function BiRNN:__init(config) 5 | parent.__init(self) 6 | 7 | -- set cuda streams 8 | self.nStream = 2 9 | if cutorch then 10 | self.streamList = {1, 2} 11 | if cutorch.getNumStreams() < self.nStream then cutorch.reserveStreams(self.nStream) end 12 | end 13 | end 14 | 15 | function BiRNN:traverseOrder(seqLen, streamIdx) 16 | if streamIdx == 1 then 17 | return 1, seqLen, 1 18 | else 19 | return seqLen, 1, -1 20 | end 21 | end 22 | 23 | function BiRNN:setAttr(attr, val) 24 | 25 | end 26 | 27 | function BiRNN:evaluate() 28 | self.train = false 29 | if cutorch.getNumStreams() < self.nStream then cutorch.reserveStreams(self.nStream) end 30 | end -------------------------------------------------------------------------------- /RelationRNN/process_inference.lua: -------------------------------------------------------------------------------- 1 | require '..' 
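-- process_inference.lua converts the candidate-relation text files (rel.single.*.txt and rel.multi.*.txt,
-- written by Inference/generate_score_data.py) into torch .t7 ranking data with batch size 1,
-- the format expected by infer_rel_rnn.lua.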
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text('Options') 5 | cmd:option('-testSplit','valid','use which data split set') 6 | cmd:text() 7 | 8 | local opt = cmd:parse(arg) 9 | 10 | local wordVocab = torch.load('../vocab/vocab.word.t7') 11 | local relationVocab = torch.load('../vocab/vocab.rel.t7') 12 | 13 | local txtSPath = string.format('../Inference/valid/rel.single.%s.txt', opt.testSplit) 14 | local txtMPath = string.format('../Inference/valid/rel.multi.%s.txt', opt.testSplit) 15 | 16 | local thSPath = string.format('inference-data/rel.single.%s.t7', opt.testSplit) 17 | local thMPath = string.format('inference-data/rel.multi.%s.t7', opt.testSplit) 18 | 19 | createRankingData(txtSPath, thSPath, wordVocab, relationVocab, 1) 20 | createRankingData(txtMPath, thMPath, wordVocab, relationVocab, 1) 21 | -------------------------------------------------------------------------------- /init.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'nn' 3 | require 'nngraph' 4 | require 'logroll' 5 | 6 | local ok, err = pcall( function () require 'cutorch' end ) 7 | if ok then 8 | require 'cunn' 9 | require 'cudnn' 10 | end 11 | 12 | include('src/model/CRF.lua') 13 | include('src/model/BiRNN.lua') 14 | include('src/model/BiGRU.lua') 15 | include('src/model/BiRNNSelect.lua') 16 | include('src/model/Linear.lua') 17 | include('src/model/BatchDot.lua') 18 | include('src/model/TripleScore.lua') 19 | include('src/model/model_utils.lua') 20 | 21 | include('src/optim/AdaGrad.lua') 22 | include('src/optim/SGD.lua') 23 | 24 | include('src/data/RankingDataLoader.lua') 25 | include('src/data/SeqMultiLabelLoader.lua') 26 | include('src/data/SeqLabelingLoader.lua') 27 | include('src/data/SeqRankingLoader.lua') 28 | include('src/data/SeqLabelRankLoader.lua') 29 | include('src/data/Vocab.lua') 30 | -------------------------------------------------------------------------------- /FocusedLabeling/generate_inference_data.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import io 3 | import cPickle as pickle 4 | import argparse 5 | 6 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 7 | from QAData import * 8 | import virtuoso 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser(description='generate_inference_data.py') 12 | parser.add_argument('--split', default='valid', type=str, help="which data split to consider") 13 | args = parser.parse_args() 14 | 15 | data_list = pickle.load(file('../SimpleQuestions/PreprocessData/QAData.{}.pkl'.format(args.split), 'rb')) 16 | if not os.path.exists('inference-data'): 17 | os.mkdir('inference-data') 18 | 19 | with io.open('inference-data/label.{}.txt'.format(args.split), 'w', encoding='utf8') as fo: 20 | for data in data_list: 21 | if data.text_attention_indices: 22 | fo.write(u'%s\t%s\n' % (data.question, 23 | ' '.join([str(index) for index in data.text_attention_indices]))) 24 | else: 25 | fo.write(u'%s\t%s\n' % (data.question, 26 | ' '.join(['0' for _ in data.question.strip().split()]))) -------------------------------------------------------------------------------- /src/model/BiRNNSelect.lua: -------------------------------------------------------------------------------- 1 | local BiRNNSelect, parent = torch.class('BiRNNSelect', 'nn.Module') 2 | 3 | function BiRNNSelect:__init() 4 | parent.__init(self) 5 | self.output = torch.Tensor() 6 | self.gradInput = torch.Tensor() 
7 | end 8 | 9 | function BiRNNSelect:updateOutput(input) 10 | local seqLen = input:size(1) 11 | local batchSize = input:size(2) 12 | local doubleSize = input:size(3) 13 | local hiddenSize = doubleSize / 2 14 | 15 | self.output:resize(batchSize, hiddenSize * 2) 16 | 17 | local fLeft, fRight = 1, hiddenSize 18 | local bLeft, bRight = hiddenSize+1, doubleSize 19 | 20 | self.output[{{},{fLeft, fRight}}]:copy(input[{{seqLen}, {},{fLeft, fRight}}]) 21 | self.output[{{},{bLeft, bRight}}]:copy(input[{{ 1}, {},{bLeft, bRight}}]) 22 | 23 | return self.output 24 | end 25 | 26 | function BiRNNSelect:updateGradInput(input, gradOutput) 27 | local seqLen = input:size(1) 28 | local doubleSize = input:size(3) 29 | local hiddenSize = doubleSize / 2 30 | 31 | self.gradInput:resizeAs(input) 32 | self.gradInput:zero() 33 | 34 | local fLeft, fRight = 1, hiddenSize 35 | local bLeft, bRight = hiddenSize+1, doubleSize 36 | 37 | self.gradInput[{{seqLen}, {},{fLeft, fRight}}]:copy(gradOutput[{{},{fLeft, fRight}}]) 38 | self.gradInput[{{ 1}, {},{bLeft, bRight}}]:copy(gradOutput[{{},{bLeft, bRight}}]) 39 | 40 | return self.gradInput 41 | end -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project specific 2 | *.pkl 3 | *.t7 4 | KnowledgeBase 5 | SimpleQuestions 6 | RawData 7 | tmp 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | -------------------------------------------------------------------------------- /Inference/valid/run.sh: -------------------------------------------------------------------------------- 1 | 2 | predict () { 3 | echo "$1 $2" 4 | cp $1/score.valid.multi.label.FB5M score.multi.valid.FB5M 5 | cp $2/score.valid.label.FB5M score.ent.valid.FB5M 6 | python ../joint_disambiguation.py multi.valid.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 7 | } 8 | 9 | predict_symbol () { 10 | echo "symbol $1 $2" 11 | cp $1/score.valid.multi.label.anonymous.FB5M score.multi.valid.FB5M 12 | cp $2/score.valid.label.FB5M score.ent.valid.FB5M 13 | python ../joint_disambiguation.py multi.valid.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 14 | } 15 | 16 | predict "../../RelationRNN" "../../EntityTypeVec" 17 | predict "../../RelationLTGCNN" "../../EntityTypeVec" 18 | predict "../../RelationAverage" "../../EntityTypeVec" 19 | predict_symbol "../../RelationLTGCNN" "../../EntityTypeVec" 20 | 21 | predict "../../RelationRNN" "../../EntityRNN/TransE" 22 | predict "../../RelationLTGCNN" "../../EntityRNN/TransE" 23 | predict "../../RelationAverage" "../../EntityRNN/TransE" 24 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/TransE" 25 | 26 | predict "../../RelationRNN" "../../EntityRNN/Random" 27 | predict "../../RelationLTGCNN" "../../EntityRNN/Random" 28 | predict "../../RelationAverage" "../../EntityRNN/Random" 29 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/Random" 30 | 31 | predict "../../RelationRNN" "../../EntityAverage" 32 | predict "../../RelationLTGCNN" "../../EntityAverage" 33 | predict "../../RelationAverage" "../../EntityAverage" 34 | predict_symbol "../../RelationLTGCNN" "../../EntityAverage" 35 | -------------------------------------------------------------------------------- /KnowledgeBase/convert.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import cPickle as pickle 3 | 4 | def www2fb(in_str): 5 | out_str = 'fb:%s' % (in_str.split('www.freebase.com/')[-1].replace('/', '.')) 6 | return out_str 7 | 8 | def main(): 9 | in_fn = sys.argv[1] 10 | db = in_fn.split('-')[-1].split('.')[0] 11 | 12 | out_fn = '%s.core.txt' % (db) 13 | ent_fn = '%s.ent.pkl' % (db) 14 | rel_fn = '%s.rel.pkl' % (db) 15 | 16 | ent_dict = {} 17 | rel_dict = {} 18 | triple_dict = {} 19 | 20 | with file(in_fn, 'rb') as fi: 21 | for line in fi: 22 | fields = line.strip().split('\t') 23 | sub = www2fb(fields[0]) 24 | rel = www2fb(fields[1]) 25 | objs = fields[2].split() 26 | if ent_dict.has_key(sub): 27 | ent_dict[sub] += 1 28 | else: 29 | 
ent_dict[sub] = 1 30 | if rel_dict.has_key(rel): 31 | rel_dict[rel] += 1 32 | else: 33 | rel_dict[rel] = 1 34 | for obj in objs: 35 | obj = www2fb(obj) 36 | triple_dict[(sub, rel, obj)] = 1 37 | if ent_dict.has_key(obj): 38 | ent_dict[obj] += 1 39 | else: 40 | ent_dict[obj] = 1 41 | 42 | pickle.dump(ent_dict, file(ent_fn, 'wb')) 43 | with file('%s.ent.txt' % (db), 'wb') as fo: 44 | for k, v in sorted(ent_dict.items(), key = lambda kv: kv[1], reverse = True): 45 | print >> fo, k 46 | 47 | pickle.dump(rel_dict, file(rel_fn, 'wb')) 48 | with file('%s.rel.txt' % (db), 'wb') as fo: 49 | for k, v in sorted(rel_dict.items(), key = lambda kv: kv[1], reverse = True): 50 | print >> fo, k 51 | 52 | with file(out_fn, 'wb') as fo: 53 | for (sub, rel, obj) in triple_dict.keys(): 54 | print >> fo, '<%s>\t<%s>\t<%s>\t.' % (sub, rel, obj) 55 | print len(triple_dict) 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /FocusedLabeling/infer_crf.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Comandline Options') 8 | 9 | cmd:option('-wordVocab','../vocab/vocab.word.t7','training data file') 10 | cmd:option('-testData','inference-data/label.valid.t7','data file to predict') 11 | cmd:option('-modelFile','model/model.BiGRU','path to the trained model') 12 | 13 | cmd:option('-useGPU',1,'which GPU is used for computation') 14 | 15 | cmd:text() 16 | 17 | ----------------------------- Basic Options ----------------------------- 18 | 19 | local opt = cmd:parse(arg) 20 | 21 | local wordVocab = torch.load(opt.wordVocab) 22 | 23 | if opt.useGPU > 0 then 24 | require 'cutorch' 25 | require 'cunn' 26 | cutorch.setDevice(opt.useGPU) 27 | torch.setdefaulttensortype('torch.CudaTensor') 28 | end 29 | 30 | ----------------------------- Data Loader ----------------------------- 31 | local loader = SeqLabelingLoader(opt.testData, flog) 32 | 33 | -------------------------- Load & Init Models ------------------------- 34 | local model = torch.load(opt.modelFile) 35 | local seqModel = model.seqModel 36 | local linearCRF = model.linearCRF 37 | seqModel:evaluate() 38 | linearCRF:evaluate() 39 | 40 | ----------------------------- Prediction ----------------------------- 41 | local maxIters = loader.numBatch 42 | 43 | local fields = stringx.split(opt.testData, '.') 44 | local split = fields[#fields-1] 45 | local file = io.open(string.format("label.result.%s", split), 'w') 46 | 47 | for i = 1, maxIters do 48 | xlua.progress(i, maxIters) 49 | 50 | ----------------------- load minibatch ------------------------ 51 | local seq, _ = loader:nextBatch(1) 52 | local currSeqLen = seq:size(1) 53 | local seqVec = seqModel:forward(seq) 54 | local predict = linearCRF:forward(seqVec) 55 | 56 | for i = 1, currSeqLen do 57 | file:write(predict[{i,1}]-0.999, ' ') 58 | end 59 | file:write('\n') 60 | end 61 | file:close() -------------------------------------------------------------------------------- /data_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOTDIR=`pwd` 3 | KBPATH=${ROOTDIR}/KnowledgeBase/VirtuosoKB/ 4 | 5 | # 1. 
download SimpleQuestions v2 6 | echo "====> Step 1: download raw data" 7 | mkdir -p ${ROOTDIR}/RawData 8 | cd ${ROOTDIR}/RawData 9 | 10 | wget https://www.dropbox.com/s/tohrsllcfy7rch4/SimpleQuestions_v2.tgz 11 | tar -xzf SimpleQuestions_v2.tgz 12 | 13 | wget https://www.dropbox.com/s/dt4i1a1wayks43n/FB5M-extra.tar.gz 14 | tar -xzf FB5M-extra.tar.gz 15 | 16 | # 2. create KB data 17 | echo "====> Step 2: create KB data" 18 | cd ${ROOTDIR}/KnowledgeBase 19 | python convert.py ${ROOTDIR}/RawData/SimpleQuestions_v2/freebase-subsets/freebase-FB5M.txt 20 | 21 | mv FB5M.core.txt ${KBPATH}/data/ 22 | mv ${ROOTDIR}/RawData/FB5M.*.txt ${KBPATH}/data/ 23 | 24 | # 3. load data into knowledge base 25 | echo "====> Step 3: load data into knowledge base" 26 | cd ${KBPATH} 27 | ./bin/virtuoso-t +foreground +configfile var/lib/virtuoso/db/virtuoso.ini & # start the server 28 | serverPID=$! 29 | sleep 10 30 | 31 | ./bin/isql 1111 dba dba exec="ld_dir_all('./data', '*', 'fb:');" 32 | 33 | pids=() 34 | for i in `seq 1 4`; do 35 | ./bin/isql 1111 dba dba exec="rdf_loader_run();" & 36 | pids+=($!) 37 | done 38 | for pid in ${pids[@]}; do 39 | wait $pid 40 | done 41 | 42 | # 4. create Vocabs 43 | echo "====> Step 4: create Vocabs" 44 | cd ${ROOTDIR}/vocab 45 | th create_vocab.lua 46 | 47 | # 5. create training data 48 | echo "====> Step 5: create training data (this will take some time)" 49 | 50 | # 5.1. QAData.pkl 51 | cd ${ROOTDIR}/SimpleQuestions/PreprocessData 52 | python process_rawdata.py ${ROOTDIR}/RawData/SimpleQuestions_v2/annotated_fb_data_train.txt 6 53 | python process_rawdata.py ${ROOTDIR}/RawData/SimpleQuestions_v2/annotated_fb_data_valid.txt 6 54 | python process_rawdata.py ${ROOTDIR}/RawData/SimpleQuestions_v2/annotated_fb_data_test.txt 6 55 | 56 | # 5.2. create train data in .txt format 57 | cd ${ROOTDIR}/SimpleQuestions 58 | python generate_training_data.py 59 | 60 | # 5.3. convert .txt data to .t7 format 61 | cd ${ROOTDIR} 62 | mkdir ${ROOTDIR}/data 63 | th process.lua 64 | -------------------------------------------------------------------------------- /EntityTypeVec/test_ent_typevec.lua: -------------------------------------------------------------------------------- 1 | require '..' 
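-- test_ent_typevec.lua evaluates a trained entity type-vector model on a SeqMultiLabel test set:
-- predicted type probabilities are thresholded at 0.5 and type-level precision, recall and F1 are printed.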
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Testing a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Command-line Options') 8 | 9 | cmd:option('-testData','data/valid.torch','test data file') 10 | cmd:option('-modelFile','model.BiGRU','filename for loading trained model') 11 | 12 | cmd:option('-useGPU',1,'which GPU is used for computation') 13 | 14 | cmd:text() 15 | 16 | ----------------------------- Basic Options ----------------------------- 17 | 18 | local opt = cmd:parse(arg) 19 | local flog = logroll.print_logger() 20 | 21 | if opt.useGPU > 0 then 22 | require 'cutorch' 23 | require 'cunn' 24 | cutorch.setDevice(opt.useGPU) 25 | torch.setdefaulttensortype('torch.CudaTensor') 26 | flog.info(string.rep('-', 50)) 27 | flog.info('Set default tensor type to CudaTensor') 28 | end 29 | 30 | ----------------------------- Data Loader ----------------------------- 31 | local loader = SeqMultiLabelLoader(opt.testData, flog) 32 | 33 | -------------------------- Load & Init Models ------------------------- 34 | cutorch.reserveStreams(2) 35 | local model = torch.load(opt.modelFile) 36 | model:evaluate() 37 | 38 | ----------------------------- Prediction ----------------------------- 39 | local maxIters = loader.numBatch 40 | flog.info(string.rep('-', 40)) 41 | flog.info('Begin Prediction') 42 | 43 | local sumPred, sumCorr, sumTrue = 0, 0, 0 44 | 45 | for i = 1, maxIters do 46 | xlua.progress(i, maxIters) 47 | 48 | ----------------------- load minibatch ------------------------ 49 | local seq, labels = loader:nextBatch() 50 | local currSeqLen = seq:size(1) 51 | 52 | local predict = model:forward(seq) 53 | local hardPred = torch.ge(predict, 0.5) 54 | sumCorr = sumCorr + torch.cmul(hardPred:type(torch.type(labels)), labels):sum() 55 | sumTrue = sumTrue + labels:sum() 56 | sumPred = sumPred + hardPred:sum() 57 | 58 | end 59 | 60 | local p, r = sumCorr / sumPred, sumCorr / sumTrue 61 | print(p, r, 2 * p * r / (p + r)) 62 | -------------------------------------------------------------------------------- /Inference/generate_score_data.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import cPickle as pickle 3 | 4 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 5 | import QAData 6 | 7 | if __name__ == '__main__': 8 | if len(sys.argv) < 2: 9 | print 'usage: python generate_score_data.py cpickle_data' 10 | sys.exit(-1) 11 | 12 | suffix = sys.argv[1].split('.')[-2] 13 | single_rel_file = file('rel.single.%s.txt'%(suffix), 'wb') 14 | multi_rel_file = file('rel.multi.%s.txt'%(suffix), 'wb') 15 | multi_ent_file = file('ent.multi.%s.txt'%(suffix), 'wb') 16 | multi_type_file = file('type.multi.%s.txt'%(suffix), 'wb') 17 | 18 | data_list = pickle.load(file(sys.argv[1], 'rb')) 19 | single_rel_data = [] 20 | multi_rel_data = [] 21 | print >> sys.stderr, 'Finish loading QAData' 22 | 23 | count = 0 24 | for data in data_list: 25 | if hasattr(data, 'cand_sub') and hasattr(data, 'cand_rel') and len(data.cand_rel) > 0 and data.relation in data.cand_rel and data.subject in data.cand_sub: 26 | # if data.subject in data.cand_sub: 27 | question = data.question 28 | # Case 1: single candidate subject 29 | if len(data.cand_sub) == 1: 30 | print >> single_rel_file, '%s\t%s\t%s' % (question, data.relation, '\t'.join(data.cand_rel)) 31 | single_rel_data.append(data) 32 | # Case 2: multiple candidate subjects 33 | elif len(data.cand_sub) > 1: 34 | print >> 
multi_rel_file, '%s\t%s\t%s' % (question, data.relation, '\t'.join(data.cand_rel)) 35 | print >> multi_ent_file, '%s\t%s\t%s' % (question, data.subject, '\t'.join(data.cand_sub)) 36 | print >> multi_type_file, '%s\t%d\t%s' % (question, data.cand_sub.index(data.subject), '\t'.join([' '.join([str(t) for t in st]) for st in data.sub_types])) 37 | multi_rel_data.append(data) 38 | else: 39 | count += 1 40 | 41 | single_rel_file.close() 42 | multi_rel_file.close() 43 | multi_ent_file.close() 44 | multi_type_file.close() 45 | 46 | pickle.dump(single_rel_data, file('single.%s.cpickle'%(suffix), 'wb')) 47 | pickle.dump(multi_rel_data, file('multi.%s.cpickle'%(suffix), 'wb')) 48 | print >> sys.stderr, count 49 | -------------------------------------------------------------------------------- /EntityTypeVec/infer_ent_typevec.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | require 'SeqLabelRankLoader' 3 | 4 | local cmd = torch.CmdLine() 5 | cmd:text('Comandline Options') 6 | cmd:option('-testData','inference-data/ent.valid.t7','training data file') 7 | cmd:option('-modelFile','model.BiGRU','filename for loading trained model') 8 | cmd:option('-useGPU',0,'which GPU is used for computation') 9 | 10 | cmd:text() 11 | 12 | ----------------------------- Basic Options ----------------------------- 13 | 14 | local opt = cmd:parse(arg) 15 | local flog = logroll.print_logger() 16 | 17 | if opt.useGPU > 0 then 18 | cutorch.setDevice(opt.useGPU) 19 | torch.setdefaulttensortype('torch.CudaTensor') 20 | flog.info(string.rep('-', 50)) 21 | flog.info('Set default tensor type to CudaTensor') 22 | end 23 | 24 | ----------------------------- Data Loader ----------------------------- 25 | local fields = stringx.split(opt.testData, '.') 26 | local split = fields[#fields-1] 27 | local loader = SeqLabelRankLoader(opt.testData, flog) 28 | local score_file = io.open(string.format('score.ent.multi.%s', split), 'w') 29 | local rank_file = io.open(string.format('rank.ent.multi.%s', split), 'w') 30 | 31 | -------------------------- Load & Init Models ------------------------- 32 | cutorch.reserveStreams(2) 33 | local model = torch.load(opt.modelFile) 34 | model:evaluate() 35 | 36 | ----------------------------- Prediction ----------------------------- 37 | local maxIters = loader.numBatch 38 | flog.info(string.rep('-', 40)) 39 | flog.info('Begin Prediction') 40 | 41 | for i = 1, maxIters do 42 | xlua.progress(i, maxIters) 43 | 44 | ----------------------- load minibatch ------------------------ 45 | local seq, posIdx, candi = loader:nextBatch(1) 46 | local currSeqLen = seq:size(1) 47 | local numCandi = candi:size(1) 48 | 49 | local predict = model:forward(seq) 50 | predict:maskedSelect(torch.lt(predict, 0.5)):zero() 51 | local repPred = predict:expandAs(candi) 52 | 53 | candi = candi:cuda() 54 | local scores = torch.cmul(repPred, candi):sum(2):view(numCandi) 55 | 56 | local _, argSort = torch.sort(scores, 1, true) 57 | rank_file:write(posIdx, '\t') 58 | for i = 1, numCandi do 59 | rank_file:write(argSort[i], ' ') 60 | end 61 | rank_file:write('\n') 62 | 63 | for i = 1, numCandi do 64 | score_file:write(scores[i], ' ') 65 | end 66 | score_file:write('\n') 67 | end 68 | rank_file:close() 69 | score_file:close() 70 | -------------------------------------------------------------------------------- /Virtuoso.md: -------------------------------------------------------------------------------- 1 | This File provides instruction on how to build and config **Virtuoso**, a 
triple-storage software the package relies on. 2 | 3 | 4 | 5 | ##### 1. Download source code from github 6 | 7 | ```shell 8 | cd tmp 9 | git clone https://github.com/openlink/virtuoso-opensource.git 10 | ``` 11 | 12 | 13 | 14 | ##### 2. Configure and compile the source code to specific path 15 | 16 | To build Virtuoso on systems other than `Linux 64-bit`, please refer to the [virtuoso building doc](https://github.com/openlink/virtuoso-opensource) 17 | 18 | ```shell 19 | cd virtuoso-opensource 20 | 21 | # generate makefile 22 | sh autogen.sh 23 | 24 | # PKGPATH is the root directory you put this package in 25 | PKGPATH="put your path here" 26 | 27 | # ultimate install path 28 | INSTALLPATH=${PKGPATH}/KnowledgeBase/VirtuosoKG 29 | mkdir -p ${INSTALLPATH} 30 | 31 | # flags for Linux 64-bit 32 | CFLAGS="-O2 -m64" 33 | export CFLAGS 34 | 35 | # configurate 36 | ./configure --prefix=${INSTALLPATH} 37 | 38 | # compile (compiling will take quite a while) 39 | make 40 | 41 | # install 42 | make install 43 | ``` 44 | 45 | 46 | 47 | ##### 3. Edit the .ini config file of Virtuoso KB 48 | 49 | Here, we config virtuoso in the following way so that a proper performance can be achieved. 50 | 51 | ```shell 52 | # create a folder to store data to be loaded 53 | cd ${INSTALLPATH} 54 | mkdir data 55 | 56 | # edit the .ini config 57 | vi var/lib/virtuoso/db/virtuoso.ini 58 | 59 | # all changes necessary to make are under the [Parameters] section 60 | 61 | # 1. DirsAllowed : directory from which data is allowed to be loaded. 62 | # So we need to append our created data directory after the default value. 63 | 64 | # default 65 | DirsAllowed = ., ${INSTALLPATH}/share/virtuoso/vad 66 | # modified 67 | DirsAllowed = ., ${INSTALLPATH}/share/virtuoso/vad, ${INSTALLPATH}/data 68 | 69 | # 2. MaxQueryMem : maximum memory virtuoso can use to handle queries. 70 | # Intuitively, the larger the MaxQueryMem, the potentially faster the query. 71 | # The recommemded value is 1/2 to 2/3 of the whole memory on the machine. 72 | 73 | # default 74 | MaxQueryMem = 2G 75 | # modified : for our experiment, on a 6-core machine with 32G memory. 76 | MaxQueryMem = 16G 77 | 78 | # 3. VectorSize : initial parallel query operations size. 79 | # Intuitively, the larger the VectorSize, the potentially faster the query. 80 | 81 | # default 82 | VectorSize = 1000 83 | # modified : for our experiment, on a 6-core machine with 32G memory. 84 | VectorSize = 10000 85 | ``` 86 | 87 | -------------------------------------------------------------------------------- /FocusedLabeling/test_crf.lua: -------------------------------------------------------------------------------- 1 | require '..' 
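-- test_crf.lua evaluates the BiGRU-CRF focused-labeling model: it reports token-level precision,
-- recall and F1 for the focus label (label 2) as well as the fraction of questions whose
-- predicted label sequence matches the gold sequence exactly.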
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Testing a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Command-line Options') 8 | 9 | cmd:option('-wordVocab','../vocab/vocab.word.t7','word vocabulary file') 10 | cmd:option('-testData','../data/valid.t7','test data file') 11 | cmd:option('-modelFile','model.BiGRU','filename for loading trained model') 12 | 13 | cmd:option('-useGPU',1,'which GPU is used for computation') 14 | 15 | cmd:text() 16 | 17 | ----------------------------- Basic Options ----------------------------- 18 | 19 | local opt = cmd:parse(arg) 20 | local flog = logroll.print_logger() 21 | 22 | local wordVocab = torch.load(opt.wordVocab) 23 | 24 | if opt.useGPU > 0 then 25 | require 'cutorch' 26 | require 'cunn' 27 | cutorch.setDevice(opt.useGPU) 28 | torch.setdefaulttensortype('torch.CudaTensor') 29 | flog.info(string.rep('-', 50)) 30 | flog.info('Set default tensor type to CudaTensor') 31 | end 32 | 33 | ----------------------------- Data Loader ----------------------------- 34 | local loader = SeqLabelingLoader(opt.testData, flog) 35 | 36 | -------------------------- Load & Init Models ------------------------- 37 | local model = torch.load(opt.modelFile) 38 | local seqModel = model.seqModel 39 | local linearCRF = model.linearCRF 40 | seqModel:evaluate() 41 | linearCRF:evaluate() 42 | 43 | ----------------------------- Prediction ----------------------------- 44 | local maxIters = loader.numBatch 45 | flog.info(string.rep('-', 40)) 46 | flog.info('Begin Prediction') 47 | 48 | local sumPred, sumCorr, sumTrue = 0, 0, 0 49 | local count = 0 50 | 51 | for i = 1, maxIters do 52 | xlua.progress(i, maxIters) 53 | 54 | ----------------------- load minibatch ------------------------ 55 | local seq, labels = loader:nextBatch() 56 | local currSeqLen = seq:size(1) 57 | local seqVec = seqModel:forward(seq) 58 | local predict = linearCRF:forward(seqVec) 59 | 60 | if torch.sum(torch.ne(predict, labels)) == 0 then 61 | count = count + 1 62 | end 63 | local maskPred = torch.eq(predict, 2) 64 | local maskTrue = torch.eq(labels, 2) 65 | sumCorr = sumCorr + torch.eq(predict:type(torch.type(labels)), labels):cmul(maskTrue):sum() 66 | sumTrue = sumTrue + maskTrue:sum() 67 | sumPred = sumPred + maskPred:sum() 68 | -- for i = 1, currSeqLen do 69 | -- print(string.format("%15s\t%1d\t%1d", wordVocab:token(seq[{i,1}]), predict[{i,1}], labels[{i,1}])) 70 | -- end 71 | end 72 | 73 | local p, r = sumCorr / sumPred, sumCorr / sumTrue 74 | print(p, r, 2 * p * r / (p + r)) 75 | print(count / loader.numBatch) 76 | -------------------------------------------------------------------------------- /src/data/Vocab.lua: -------------------------------------------------------------------------------- 1 | local Vocab = torch.class('Vocab') 2 | 3 | function Vocab:__init(path) 4 | self.size = 0 5 | self._index = {} 6 | self._tokens = {} 7 | 8 | local file = io.open(path) 9 | while true do 10 | local line = file:read() 11 | if line == nil then break end 12 | self.size = self.size + 1 13 | self._tokens[self.size] = line 14 | self._index[line] = self.size 15 | end 16 | file:close() 17 | 18 | print('vocab size: '..self.size) 19 | end 20 | 21 | function Vocab:contains(w) 22 | if not self._index[w] then return false end 23 | return true 24 | end 25 | 26 | function Vocab:add(w) 27 | if self._index[w] ~= nil then 28 | return self._index[w] 29 | end 30 | self.size = self.size + 1 31 | self._tokens[self.size] = w 32 | self._index[w] = self.size 33 | return self.size 34 
| end 35 | 36 | function Vocab:index(w) 37 | local index = self._index[w] 38 | if index == nil then 39 | if self.unk_index == nil then 40 | error('Token not in vocabulary and no UNK token defined: ' .. w) 41 | end 42 | return self.unk_index 43 | end 44 | return index 45 | end 46 | 47 | function Vocab:token(i) 48 | if i < 1 or i > self.size then 49 | error('Index ' .. i .. ' out of bounds') 50 | end 51 | return self._tokens[i] 52 | end 53 | 54 | function Vocab:map(tokens) 55 | local len = #tokens 56 | local output = torch.IntTensor(len) 57 | for i = 1, len do 58 | output[i] = self:index(tokens[i]) 59 | end 60 | return output 61 | end 62 | 63 | function Vocab:add_unk_token() 64 | if self.unk_token ~= nil then return end 65 | self.unk_index = self:add('') 66 | print('vocab size: '..self.size) 67 | end 68 | 69 | function Vocab:add_pad_token() 70 | if self.pad_token ~= nil then return end 71 | self.pad_index = self:add('') 72 | print('vocab size: '..self.size) 73 | end 74 | 75 | function Vocab:add_ent_token() 76 | if self.ent_token ~= nil then return end 77 | self.ent_index = self:add('') 78 | print('vocab size: '..self.size) 79 | end 80 | 81 | function Vocab:add_start_token() 82 | if self.start_token ~= nil then return end 83 | self.start_index = self:add('') 84 | print('vocab size: '..self.size) 85 | end 86 | 87 | function Vocab:add_end_token() 88 | if self.end_token ~= nil then return end 89 | self.end_index = self:add('') 90 | print('vocab size: '..self.size) 91 | end 92 | 93 | function Vocab:add_space_token() 94 | if self.space_token ~= nil then return end 95 | self.space_index = self:add('<_>') 96 | print('vocab size: '..self.size) 97 | end 98 | -------------------------------------------------------------------------------- /Inference/test/run.sh: -------------------------------------------------------------------------------- 1 | 2 | predict () { 3 | echo "predict $1 $2 $3 $4" 4 | cp $1/score.test.multi.label.FB5M score.multi.valid.FB5M 5 | cp $2/score.test.label.FB5M score.ent.valid.FB5M 6 | #python ../joint_predict.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M $3 $4 7 | python ../joint_disambiguation.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 8 | } 9 | 10 | predict_symbol () { 11 | echo "predict symbol $1 $2 $3 $4" 12 | cp $1/score.test.multi.label.anonymous.FB5M score.multi.valid.FB5M 13 | cp $2/score.test.label.FB5M score.ent.valid.FB5M 14 | #python ../joint_predict.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M $3 $4 15 | python ../joint_disambiguation.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 16 | } 17 | 18 | predict "../../RelationRNN" "../../EntityTypeVec" 0.85 0.0 19 | predict "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.0 20 | predict "../../RelationAverage" "../../EntityTypeVec" 0.85 0.0 21 | predict_symbol "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.0 22 | 23 | predict "../../RelationRNN" "../../EntityTypeVec" 0.90 0.95 24 | predict "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.85 25 | predict "../../RelationAverage" "../../EntityTypeVec" 0.90 0.85 26 | predict_symbol "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.85 27 | 28 | predict "../../RelationRNN" "../../EntityRNN/TransE" 0.60 0.0 29 | predict "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.55 0.0 30 | predict "../../RelationAverage" "../../EntityRNN/TransE" 0.60 0.0 31 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.60 0.0 32 | 33 | predict "../../RelationRNN" "../../EntityRNN/TransE" 
0.90 0.95 34 | predict "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.50 0.85 35 | predict "../../RelationAverage" "../../EntityRNN/TransE" 0.95 0.95 36 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.65 0.95 37 | 38 | predict "../../RelationRNN" "../../EntityRNN/Random" 0.75 0.0 39 | predict "../../RelationLTGCNN" "../../EntityRNN/Random" 0.70 0.0 40 | predict "../../RelationAverage" "../../EntityRNN/Random" 0.70 0.0 41 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/Random" 0.65 0.0 42 | 43 | predict "../../RelationRNN" "../../EntityRNN/Random" 0.60 0.95 44 | predict "../../RelationLTGCNN" "../../EntityRNN/Random" 0.70 0.85 45 | predict "../../RelationAverage" "../../EntityRNN/Random" 0.95 0.95 46 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/Random" 0.95 0.95 47 | 48 | predict "../../RelationRNN" "../../EntityAverage" 0.60 0.0 49 | predict "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.0 50 | predict "../../RelationAverage" "../../EntityAverage" 0.55 0.0 51 | predict_symbol "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.0 52 | 53 | predict "../../RelationRNN" "../../EntityAverage" 0.65 0.95 54 | predict "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.85 55 | predict "../../RelationAverage" "../../EntityAverage" 0.95 0.95 56 | predict_symbol "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.85 57 | -------------------------------------------------------------------------------- /src/optim/AdaGrad.lua: -------------------------------------------------------------------------------- 1 | -- For this AdaGrad implementation, it supports both (optional) traditional momentum 2 | -- and Nesterov Accelerated Gradient (nag). However, both styles of momentum apply the 3 | -- same value to all parameters. 
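-- For reference, the per-parameter update performed in updateParams below is
--   histGradSquare = histGradSquare + grad .* grad
--   param          = param - lr * grad ./ sqrt(histGradSquare)
-- and, when classic momentum is enabled, the scaled step is first accumulated into a velocity buffer:
--   velocity = momentum * velocity - lr * grad ./ sqrt(histGradSquare)
--   param    = param + velocity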
4 | 5 | local AdaGrad = torch.class('AdaGrad') 6 | 7 | function AdaGrad:__init(gradTab, config) 8 | self.lr = config.lr 9 | self.histGradSquare = {} 10 | for i, grad in pairs(gradTab) do 11 | self.histGradSquare[i] = grad:clone():fill(1e-4) 12 | end 13 | if config.momentum then 14 | self.momentum = config.momentum 15 | self.velocity = {} 16 | for i, grad in pairs(gradTab) do 17 | self.velocity[i] = grad:clone():fill(0) 18 | end 19 | elseif config.nag then 20 | self.nag = config.nag 21 | self.const_1 = self.nag * self.nag -- NAG from "advances in optimizing recurrent networks" 22 | self.const_2 = self.nag + 1 -- NAG from "advances in optimizing recurrent networks" 23 | self.velocity = {} 24 | for i, grad in pairs(gradTab) do 25 | self.velocity[i] = grad:clone():fill(0) 26 | end 27 | end 28 | if config.logger then 29 | config.logger.info(string.rep('-', 50)) 30 | config.logger.info(string.format('AdaGrad Configurations:')) 31 | for i = 1, #self.lr do 32 | config.logger.info(string.format(' learning rate [%1d] : %f', i , self.lr[i])) 33 | end 34 | if self.momentum then 35 | config.logger.info(string.format(' classic momentum : %f', self.momentum)) 36 | elseif self.nag then 37 | config.logger.info(string.format(' Nesterov momentum : %f', self.nag)) 38 | end 39 | end 40 | end 41 | 42 | function AdaGrad:updateParams(paramsTab, gradTab) 43 | if self.momentum then 44 | for i = 1, #paramsTab do 45 | self.histGradSquare[i]:addcmul(1, gradTab[i], gradTab[i]) 46 | self.velocity[i]:mul(self.momentum):addcdiv(-self.lr[i], gradTab[i], torch.sqrt(self.histGradSquare[i])) 47 | paramsTab[i]:add(self.velocity[i]) 48 | end 49 | elseif self.nag then 50 | for i = 1, #paramsTab do 51 | self.histGradSquare[i]:addcmul(1, gradTab[i], gradTab[i]) 52 | self.velocity[i]:mul(self.const_1):addcdiv(-self.lr[i]*self.const_2, gradTab[i], torch.sqrt(self.histGradSquare[i])) 53 | paramsTab[i]:add(self.velocity[i]) 54 | end 55 | else 56 | for i = 1, #paramsTab do 57 | self.histGradSquare[i]:addcmul(1, gradTab[i], gradTab[i]) 58 | paramsTab[i]:addcdiv(-self.lr[i], gradTab[i], torch.sqrt(self.histGradSquare[i])) 59 | end 60 | end 61 | end 62 | 63 | function AdaGrad:updateMomentum(rate) 64 | if self.momentum then 65 | self.momentum = rate 66 | elseif self.nag then 67 | self.nag = rate 68 | end 69 | end 70 | 71 | function AdaGrad:effectiveGradNorm(gradTab) 72 | for i = 1, #gradTab do 73 | print(string.format('effective norm %d: %f', i, torch.cdiv(gradTab[i], torch.sqrt(self.histGradSquare[i])):norm())) 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /src/py_module/QAData.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import preprocessing 3 | 4 | def fb2www(in_data): 5 | if type(in_data) == type(' '): 6 | out_data = in_data.replace('.', '/').replace('fb:', 'www.freebase.com/') 7 | elif type(in_data) == type([]): 8 | out_data = [data.replace('.', '/').replace('fb:', 'www.freebase.com/') for data in in_data] 9 | return out_data 10 | 11 | class QAData(object): 12 | """docstring for QAData""" 13 | def __init__(self, data_tuple): 14 | super(QAData, self).__init__() 15 | self.question = data_tuple[0] 16 | self.subject = data_tuple[1] 17 | self.relation = data_tuple[2] 18 | self.object = data_tuple[3] 19 | self.num_text_token = int(data_tuple[4]) 20 | 21 | def add_candidate(self, sub, rels, types = None): 22 | if not hasattr(self, 'cand_sub'): 23 | self.cand_sub = [] 24 | if not hasattr(self, 'cand_rel'): 
25 | self.cand_rel = [] 26 | if not hasattr(self, 'sub_rels'): 27 | self.sub_rels = [] 28 | self.cand_sub.append(sub) 29 | self.sub_rels.append(rels) 30 | self.cand_rel.extend(rels) 31 | if types: 32 | if not hasattr(self, 'sub_types'): 33 | self.sub_types = [] 34 | self.sub_types.append(types) 35 | 36 | def remove_duplicate(self): 37 | self.cand_rel = list(set(self.cand_rel)) 38 | 39 | def make_score_mat(self): 40 | # make candidate unique rels 41 | self.num_sub = len(self.cand_sub) 42 | self.num_rel = len(self.cand_rel) 43 | self.rel_dict = {self.cand_rel[i]:i for i in range(self.num_rel)} 44 | 45 | # establish score matrix 46 | self.score_mat = np.zeros((self.num_sub, self.num_rel)) 47 | for i in range(self.num_sub): 48 | for rel in self.sub_rels[i]: 49 | self.score_mat[i, self.rel_dict[rel]] = 1 50 | 51 | def fill_rel_score(self, scores): 52 | self.score_mat = self.score_mat * scores 53 | 54 | def fill_ent_score(self, scores): 55 | self.ent_score = preprocessing.scale(scores) 56 | 57 | # def top_sub_rel(self): 58 | # # sub_score = preprocessing.scale(np.sum(self.score_mat, 1)) 59 | # # sub_score += self.ent_score 60 | # sub_score = np.sum(self.score_mat, 1) 61 | 62 | # top_subid = np.argmax(sub_score) 63 | # top_relid = np.argmax(self.score_mat[top_subid]) 64 | # self.pred_sub = self.cand_sub[top_subid] 65 | # self.pred_rel = self.cand_rel[top_relid] 66 | # return self.cand_sub[top_subid], self.cand_rel[top_relid] 67 | 68 | def top_sub_rel(self): 69 | sub_score = np.sum(self.score_mat, 1) 70 | top_subscore = np.max(sub_score) 71 | top_subids = [] 72 | for subid in np.argsort(sub_score)[::-1]: 73 | if sub_score[subid] < top_subscore: 74 | break 75 | top_subids.append(subid) 76 | 77 | top_relid = np.argmax(self.score_mat[top_subids[0]]) 78 | 79 | return [self.cand_sub[subid] for subid in top_subids], self.cand_rel[top_relid] 80 | -------------------------------------------------------------------------------- /src/optim/SGD.lua: -------------------------------------------------------------------------------- 1 | -- For this SGD implementation, it supports both (optional) traditional momentum 2 | -- and Nesterov Accelerated Gradient (nag). However, both styles of momentum apply the 3 | -- same value to all parameters. 
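-- For reference, the NAG-style update performed in updateParams below (with mu = config.nag)
-- follows the formulation in "Advances in optimizing recurrent networks":
--   velocity = mu^2 * velocity - (1 + mu) * lr * grad
--   param    = param + velocity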
4 | 5 | local SGD = torch.class('SGD') 6 | 7 | function SGD:__init(gradTab, config) 8 | self.lr = config.lr 9 | if config.momentum then 10 | self.momentum = config.momentum 11 | self.velocity = {} 12 | for i, grad in pairs(gradTab) do 13 | self.velocity[i] = grad:clone():fill(0) 14 | end 15 | elseif config.nag then 16 | self.nag = config.nag 17 | self.const_1 = self.nag * self.nag -- NAG from "advances in optimizing recurrent networks" 18 | self.const_2 = self.nag + 1 -- NAG from "advances in optimizing recurrent networks" 19 | self.velocity = {} 20 | for i, grad in pairs(gradTab) do 21 | self.velocity[i] = grad:clone():fill(0) 22 | end 23 | end 24 | if config.annealing then 25 | self.annealing = config.annealing 26 | self.masterLr = {} 27 | for i = 1, #self.lr do 28 | self.masterLr[i] = self.lr[i] 29 | end 30 | end 31 | self.count = 0 32 | if config.logger then 33 | config.logger.info(string.rep('-', 50)) 34 | config.logger.info(string.format('SGD Configurations:')) 35 | for i = 1, #self.lr do 36 | config.logger.info(string.format(' learning rate [%1d] : %f', i , self.lr[i])) 37 | end 38 | if self.momentum then 39 | config.logger.info(string.format(' classic momentum : %f', self.momentum)) 40 | elseif self.nag then 41 | config.logger.info(string.format(' Nesterov momentum : %f', self.nag)) 42 | end 43 | if self.annealing then 44 | config.logger.info(string.format(' Annearling rate : %f', self.annealing)) 45 | end 46 | end 47 | end 48 | 49 | function SGD:updateParams(paramsTab, gradTab) 50 | self.count = self.count + 1 51 | if self.annealing then 52 | for i = 1, #self.masterLr do 53 | self.lr[i] = self.masterLr[i] / (1 + self.annealing * math.sqrt(self.count)) 54 | end 55 | end 56 | -- print (self.lr) 57 | if self.momentum and self.momentum > 0 then 58 | for i = 1, #paramsTab do 59 | self.velocity[i]:mul(self.momentum):add(-self.lr[i], gradTab[i]) 60 | paramsTab[i]:add(self.velocity[i]) 61 | end 62 | elseif self.nag and self.nag > 0 then 63 | for i = 1, #paramsTab do 64 | self.velocity[i]:mul(self.const_1):add(-self.lr[i]*self.const_2, gradTab[i]) 65 | paramsTab[i]:add(self.velocity[i]) 66 | end 67 | else 68 | for i = 1, #paramsTab do 69 | paramsTab[i]:add(-self.lr[i], gradTab[i]) 70 | end 71 | end 72 | end 73 | 74 | function SGD:updateMomentum(rate) 75 | if self.momentum then 76 | self.momentum = rate 77 | elseif self.nag then 78 | self.nag = rate 79 | end 80 | end 81 | 82 | function SGD:effectiveGradNorm(gradTab) 83 | for i = 1, #gradTab do 84 | print(string.format('effective norm %d: %f', i, gradTab[i]:norm())) 85 | end 86 | end 87 | -------------------------------------------------------------------------------- /RelationRNN/infer_rel_rnn.lua: -------------------------------------------------------------------------------- 1 | require '..' 
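-- infer_rel_rnn.lua scores every candidate relation for a question as the dot product between the
-- question encoding (seqModel) and the relation embedding (relEmbed, with dropout disabled at test time),
-- then writes per-question score.rel.* and rank.rel.* files, one line per question.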
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Testing a Recurrent Neural Network to embed a sentence') 6 | cmd:text() 7 | cmd:text('Options') 8 | cmd:option('-useGPU',1,'whether to use gpu for computation') 9 | cmd:option('-modelFile','model.rel.stackBiRNN','file path for saved model') 10 | cmd:option('-testData','inference-data/rel.single.valid.t7','run test on which data set') 11 | cmd:text() 12 | 13 | -- parse input params 14 | local opt = cmd:parse(arg) 15 | local flog = logroll.print_logger() 16 | 17 | if opt.useGPU > 0 then 18 | require 'cutorch' 19 | require 'cunn' 20 | cutorch.setDevice(opt.useGPU) 21 | torch.setdefaulttensortype('torch.CudaTensor') 22 | end 23 | 24 | -- load all models 25 | local fields = stringx.split(opt.testData, '.') 26 | local ncand = fields[#fields-2] 27 | local split = fields[#fields-1] 28 | 29 | local model = torch.load(opt.modelFile) 30 | 31 | -- init data loader and output files 32 | local loader = RankingDataLoader(opt.testData, flog) 33 | local score_file = io.open(string.format('score.rel.%s.%s', ncand, split), 'w') 34 | local rank_file = io.open(string.format('rank.rel.%s.%s', ncand, split), 'w') 35 | 36 | -- extract sub models 37 | local relEmbed = model.relEmbed 38 | local seqModel = model.seqModel 39 | local scoreModel = model.scoreModel 40 | local negRelDrop = model.negRelDrop 41 | 42 | seqModel:evaluate() 43 | negRelDrop:evaluate() 44 | 45 | -- core testing loop 46 | for i = 1, loader.numBatch do 47 | xlua.progress(i, loader.numBatch) 48 | ----------------------- load minibatch ------------------------ 49 | local seq, pos, neg = loader:nextBatch(1) 50 | neg = neg:view(-1) 51 | local currSeqLen = seq:size(1) 52 | local loss = 0 53 | 54 | ------------------------ forward pass ------------------------- 55 | -- sequence vectors [n_batch x n_dim] 56 | local seqVec = seqModel:forward(seq) 57 | 58 | -- negative matrix [n_neg x n_batch x n_dim] 59 | -- local negMat = relEmbed:forward(neg) 60 | 61 | local tmp = relEmbed:forward(neg) 62 | local negMat = negRelDrop:forward(tmp) 63 | 64 | -- sequence matrix [n_neg x n_batch x n_dim] 65 | local seqMat = torch.repeatTensor(seqVec, negMat:size(1), 1) 66 | 67 | if opt.useGPU > 0 then 68 | scores = torch.cmul(seqMat, negMat):sum(2):view(-1) 69 | else 70 | scores = torch.mm(seqMat, negMat:t()):diag() 71 | end 72 | 73 | -- write to rank file 74 | if scores:size(1) > 1 then 75 | local _, argSort = scores:sort(1, true) 76 | 77 | rank_file:write(pos[1], '\t') 78 | for i = 1, argSort:size(1) do 79 | rank_file:write(neg[argSort[i]], ' ') 80 | end 81 | rank_file:write('\n') 82 | 83 | -- write to score file 84 | local topIndices = {} 85 | for i = 1, argSort:size(1) do 86 | topIndices[argSort[i]] = 1 87 | end 88 | for i = 1, scores:size(1) do 89 | if topIndices[i] then 90 | score_file:write(scores[i], ' ') 91 | else 92 | score_file:write(0, ' ') 93 | end 94 | end 95 | score_file:write('\n') 96 | else 97 | rank_file:write(pos[1], '\t') 98 | rank_file:write(neg[1]) 99 | rank_file:write('\n') 100 | score_file:write(scores[1]) 101 | score_file:write('\n') 102 | end 103 | 104 | collectgarbage() 105 | end 106 | score_file:close() 107 | rank_file:close() 108 | -------------------------------------------------------------------------------- /src/data/SeqLabelRankLoader.lua: -------------------------------------------------------------------------------- 1 | -- file to define the class SeqLabelRankLoader 2 | -- SeqLabelRankLoader:nextBatch() return a batch of 3 | 4 | local SeqLabelRankLoader = 
torch.class('SeqLabelRankLoader') 5 | 6 | function SeqLabelRankLoader:__init(datafile, logger) 7 | -- sequence & pos match 8 | local data = torch.load(datafile) 9 | self.candidates = data.candidates 10 | self.sequences = data.sequences 11 | self.posIndex = data.posIndex 12 | 13 | -- additional variables 14 | self.batchSize = self.sequences[1]:size(2) 15 | self.numBatch = #self.sequences 16 | self.currIdx = 1 17 | self.indices = randperm(self.numBatch) 18 | 19 | if torch.Tensor():type() == 'torch.CudaTensor' then 20 | for i = 1, self.numBatch do 21 | self.candidates[i] = self.candidates[i]:cuda() 22 | self.sequences[i] = self.sequences[i]:cuda() 23 | end 24 | end 25 | 26 | if logger then 27 | self.logger = logger 28 | self.logger.info(string.rep('-', 50)) 29 | self.logger.info(string.format('SeqLabelRankLoader Configurations:')) 30 | self.logger.info(string.format(' number of batch : %d', self.numBatch)) 31 | self.logger.info(string.format(' data batch size : %d', self.batchSize)) 32 | end 33 | end 34 | 35 | -- sequences[dataIdx]: 2-D LongTensor, [seqLen x batchSize] 36 | -- posIndex[dataIdx]: 2-D LongTensor, [batchSize x numLabel] 37 | function SeqLabelRankLoader:nextBatch(circular) 38 | if self.currIdx > self.numBatch then 39 | self.currIdx = 1 40 | self.indices = randperm(self.numBatch) 41 | end 42 | local dataIdx 43 | if circular then 44 | dataIdx = self.currIdx 45 | else 46 | dataIdx = self.indices[self.currIdx] 47 | end 48 | self.currIdx = self.currIdx + 1 49 | 50 | return self.sequences[dataIdx], self.posIndex[dataIdx], self.candidates[dataIdx] 51 | end 52 | 53 | -- create torch-format data for SeqLabelRankLoader 54 | function createSeqLabelRankData(dataPath, savePath, wordVocab, numLabel) 55 | -- class variables 56 | local candidates = {} 57 | local sequences = {} 58 | local posIndex = {} 59 | 60 | -- read data fileh 61 | local file = io.open(dataPath, 'r') 62 | local batchIdx = 0 63 | local line 64 | 65 | while true do 66 | line = file:read() 67 | if line == nil then break end 68 | batchIdx = batchIdx + 1 69 | print ('batch '..batchIdx) 70 | local fields = stringx.split(line, '\t') 71 | 72 | -- fields[1]: language sequence 73 | local tokens = stringx.split(fields[1]) 74 | sequences[batchIdx] = torch.LongTensor(#tokens, 1) 75 | 76 | for i = 1, #tokens do 77 | local token = tokens[i] 78 | sequences[batchIdx][{i, 1}] = wordVocab:index(token) 79 | end 80 | 81 | -- fields[2]: correct label 82 | posIndex[batchIdx] = tonumber(fields[2]) + 1 83 | 84 | -- fields[3:] 85 | local numCandi = #fields - 2 86 | candidates[batchIdx] = torch.zeros(numCandi, numLabel) 87 | 88 | for candiIdx = 1, numCandi do 89 | local labels = stringx.split(fields[candiIdx+2]) 90 | for i = 1, #labels do 91 | index = tonumber(labels[i]) + 1 92 | candidates[batchIdx][{candiIdx, index}] = 1 93 | end 94 | end 95 | 96 | end 97 | file:close() 98 | 99 | local data = {} 100 | data.candidates = candidates 101 | data.sequences = sequences 102 | data.posIndex = posIndex 103 | 104 | torch.save(savePath, data) 105 | end 106 | -------------------------------------------------------------------------------- /src/data/SeqLabelingLoader.lua: -------------------------------------------------------------------------------- 1 | local SeqLabelingLoader = torch.class('SeqLabelingLoader') 2 | 3 | function SeqLabelingLoader:__init(datafile, logger) 4 | -- class variables 5 | local data = torch.load(datafile) 6 | self.sequences = data.seq 7 | self.seqLabels = data.label 8 | 9 | -- additional variables 10 | self.batchSize = 
self.sequences[1]:size(2) 11 | self.numBatch = #self.sequences 12 | self.currIdx = 1 13 | self.indices = randperm(self.numBatch) 14 | 15 | if torch.Tensor():type() == 'torch.CudaTensor' then 16 | for i = 1, self.numBatch do 17 | self.seqLabels[i] = self.seqLabels[i]:cuda() 18 | self.sequences[i] = self.sequences[i]:cuda() 19 | end 20 | end 21 | 22 | if logger then 23 | self.logger = logger 24 | self.logger.info(string.rep('-', 50)) 25 | self.logger.info(string.format('SeqLabelingLoader Configurations:')) 26 | self.logger.info(string.format(' number of batch: %d', self.numBatch)) 27 | self.logger.info(string.format(' data batch size: %d', self.batchSize)) 28 | end 29 | end 30 | 31 | function SeqLabelingLoader:nextBatch(circular) 32 | if self.currIdx > self.numBatch then 33 | self.currIdx = 1 34 | self.indices = randperm(self.numBatch) 35 | end 36 | local dataIdx 37 | if circular then 38 | dataIdx = self.currIdx 39 | else 40 | dataIdx = self.indices[self.currIdx] 41 | end 42 | self.currIdx = self.currIdx + 1 43 | return self.sequences[dataIdx], self.seqLabels[dataIdx] 44 | end 45 | 46 | -- create torch-format data for SeqLabelingLoader 47 | function createSeqLabelingData(dataPath, savePath, wordVocab, batchSize, noneLabel, trueLabel) 48 | -- class variable 49 | local sequences = {} 50 | local seqLabels = {} 51 | 52 | local noneLabel = noneLabel or 1 53 | local trueLabel = trueLabel or 2 54 | 55 | -- read data fileh 56 | local file = io.open(dataPath, 'r') 57 | local batchIdx = 0 -- the index of sequence batches 58 | local seqIdx = 0 -- sequence index within each batch 59 | local line 60 | 61 | while true do 62 | line = file:read() 63 | if line == nil then break end 64 | local fields = stringx.split(line, '\t') 65 | 66 | -- fields[1]: language sequence 67 | local tokens = stringx.split(fields[1]) 68 | 69 | -- fields[2]: label labels 70 | local labels = stringx.split(fields[2]) 71 | 72 | -- allocate tensor memory 73 | if seqIdx % batchSize == 0 then 74 | print('batch: '..batchIdx) 75 | seqIdx = 1 76 | batchIdx = batchIdx + 1 77 | sequences[batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 78 | seqLabels[batchIdx] = torch.DoubleTensor(#tokens, batchSize):fill(noneLabel) 79 | else 80 | seqIdx = seqIdx + 1 81 | end 82 | 83 | -- parse tokens into table 84 | for i = 1, #tokens do 85 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(tokens[i]) 86 | end 87 | 88 | -- parse labels into table 89 | if #labels == #tokens then 90 | for i = 1, #labels do 91 | seqLabels[batchIdx][{i, seqIdx}] = tonumber(labels[i]) 92 | end 93 | else 94 | for i = 1, #labels do 95 | seqLabels[batchIdx][{tonumber(labels[i]) + 1, seqIdx}] = trueLabel 96 | end 97 | end 98 | end 99 | file:close() 100 | 101 | local data = {} 102 | data.seq = sequences 103 | data.label = seqLabels 104 | 105 | torch.save(savePath, data) 106 | end -------------------------------------------------------------------------------- /src/data/SeqMultiLabelLoader.lua: -------------------------------------------------------------------------------- 1 | local SeqMultiLabelLoader = torch.class('SeqMultiLabelLoader') 2 | 3 | function SeqMultiLabelLoader:__init(datafile, logger) 4 | -- sequence & pos match 5 | local data = torch.load(datafile) 6 | self.sequences = data.sequences 7 | self.seqLabels = data.seqLabels 8 | if data.seqLength ~= nil then 9 | self.seqLength = data.seqLength 10 | end 11 | 12 | -- additional variables 13 | self.batchSize = self.sequences[1]:size(2) 14 | self.numBatch = #self.sequences 15 | self.currIdx = 1 16 | 
self.indices = randperm(self.numBatch) 17 | 18 | if torch.Tensor():type() == 'torch.CudaTensor' then 19 | for i = 1, self.numBatch do 20 | self.sequences[i] = self.sequences[i]:cuda() 21 | self.seqLabels[i] = self.seqLabels[i]:cuda() 22 | if self.seqLength ~= nil then 23 | self.seqLength[i] = self.seqLength[i]:cuda() 24 | end 25 | end 26 | end 27 | 28 | if logger then 29 | self.logger = logger 30 | self.logger.info(string.rep('-', 50)) 31 | self.logger.info(string.format('SeqMultiLabelLoader Configurations:')) 32 | self.logger.info(string.format(' number of batch : %d', self.numBatch)) 33 | self.logger.info(string.format(' data batch size : %d', self.batchSize)) 34 | end 35 | end 36 | 37 | function SeqMultiLabelLoader:nextBatch(circular) 38 | if self.currIdx > self.numBatch then 39 | self.currIdx = 1 40 | self.indices = randperm(self.numBatch) 41 | end 42 | local dataIdx 43 | if circular then 44 | dataIdx = self.currIdx 45 | else 46 | dataIdx = self.indices[self.currIdx] 47 | end 48 | self.currIdx = self.currIdx + 1 49 | 50 | if self.seqLength ~= nil then 51 | return self.sequences[dataIdx], self.seqLabels[dataIdx], self.seqLength[dataIdx] 52 | else 53 | return self.sequences[dataIdx], self.seqLabels[dataIdx] 54 | end 55 | end 56 | 57 | function createSeqMultiLabelData(dataPath, savePath, wordVocab, numLabel, batchSize) 58 | -- class variables 59 | local seqLabels = {} 60 | local sequences = {} 61 | local seqLength = {} 62 | 63 | -- read data fileh 64 | local file = io.open(dataPath, 'r') 65 | local batchIdx = 0 -- the index of sequence batches 66 | local seqIdx = 0 -- sequence index within each batch 67 | local line 68 | 69 | while true do 70 | line = file:read() 71 | if line == nil then break end 72 | local fields = stringx.split(line, '\t') 73 | 74 | -- fields[1]: language sequence 75 | local tokens = stringx.split(fields[1]) 76 | -- allocate tensor memory 77 | if seqIdx % batchSize == 0 then 78 | print('batch: '..batchIdx) 79 | seqIdx = 1 80 | batchIdx = batchIdx + 1 81 | sequences[batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 82 | seqLength[batchIdx] = torch.LongTensor(batchSize):fill(0) 83 | seqLabels[batchIdx] = torch.zeros(batchSize, numLabel) 84 | else 85 | seqIdx = seqIdx + 1 86 | end 87 | 88 | -- parse each token in sequence 89 | for i = 1, #tokens do 90 | local token = tokens[i] 91 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(token) 92 | end 93 | seqLength[batchIdx][seqIdx] = #tokens 94 | 95 | -- fields[2]: labels 96 | local labels = stringx.split(fields[2]) 97 | for i = 1, #labels do 98 | index = tonumber(labels[i]) + 1 99 | seqLabels[batchIdx][{seqIdx, index}] = 1 100 | end 101 | 102 | end 103 | file:close() 104 | 105 | local data = {} 106 | data.seqLabels = seqLabels 107 | data.sequences = sequences 108 | data.seqLength = seqLength 109 | 110 | torch.save(savePath, data) 111 | end 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CFO 2 | Code repo for [Conditional Focused Neural Question Answering with Large-scale Knowledge Bases](https://www.aclweb.org/anthology/P/P16/P16-1076.pdf) 3 | 4 | # Installation and Preprocessing 5 | 1. Refer to Virtuoso.md to install and confiture the software 6 | 2. Make sure [torch7](http://torch.ch/) is installed together with the following dependencies 7 | - logroll: `luarocks install logroll` 8 | - nngraph: `luarocks install nngraph` 9 | 3. 
After the installation and configuration of **Virtuoso**, run `bash data_preprocess.sh` to finish preprocessing 10 | 11 | # Training 12 | 13 | 1. Focused Lableing 14 | 15 | ``` 16 | cd FocusedLabeling 17 | th train_crf.lua 18 | ``` 19 | 20 | 2. Entity Type Vector 21 | 22 | ``` 23 | cd EntityTypeVec 24 | th train_ent_typevec.lua 25 | ``` 26 | 27 | 3. RNN based Relation Network 28 | 29 | ``` 30 | cd RelationRNN 31 | th train_rel_rnn.lua 32 | ``` 33 | 34 | # Inference 35 | In the following, define `SPLIT='valid' or 'test'`. 36 | 37 | 1. Run focused labeling on validation/test data 38 | ``` 39 | cd FocusedLabeling 40 | 41 | python generate_inference_data.py --split ${SPLIT} 42 | 43 | th process_inference.lua -testSplit ${SPLIT} 44 | th infer_crf.lua \ 45 | -testData inference-data/label.${SPLIT}.t7 \ 46 | -modelFile "path-to-pretrained-model" 47 | ``` 48 | - `python generate_inference_data.py --split ${SPLIT}` will create the file `label.${SPLIT}.txt` in the folder `FocusedLabeling/inference-data`; 49 | - `th process_inference.lua` will turn the text file `label.${SPLIT}.txt` into `label.${SPLIT}.t7` in torch format (both in the folder `FocusedLabeling/inference-data`); 50 | - `th infer_crf.lua ...` will generate the file `label.result.${SPLIT}` in the folder `FocusedLabeling`. 51 | 52 | 2. Query candidates based on focused labeling 53 | 54 | ``` 55 | cd Inference 56 | mkdir ${SPLIT} && cd ${SPLIT} 57 | python ../query_candidates.py 6 \ 58 | ../../PreprocessData/QAData.${SPLIT}.pkl \ 59 | ../../FocusedLabeling/label.result.${SPLIT} \ 60 | ../../KnowledgeBase/type.top-500.pkl 61 | ``` 62 | This step will generate the file `QAData.label.${SPLIT}.cpickle` in the folder `Inference/${SPLIT}`. 63 | 64 | 3. Generate score data based on the query results 65 | 66 | ``` 67 | cd Inference/${SPLIT} 68 | python ../generate_score_data.py QAData.label.${SPLIT}.cpickle 69 | ``` 70 | 71 | This step will generate the following files in the same folder `Inference/${SPLIT}`: 72 | 73 | - `rel.single.${SPLIT}.txt` (candidate relations for those with only a single candidate subject) 74 | - `rel.multi.${SPLIT}.txt` (candidate relations for those with only multiple candidate subject) 75 | - `type.multi.${SPLIT}.txt` (candidate entities for those with multiple candidate subjects) 76 | - `single.${SPLIT}.cpickle` 77 | - `multi.${SPLIT}.cpickle` 78 | 79 | 4. Run relation inference 80 | 81 | ``` 82 | cd RelationRNN 83 | mkdir inference-data 84 | th process_inference.lua -testSplit ${SPLIT} 85 | th infer_rel_rnn.lua -testData inference-data/rel.single.${SPLIT}.t7 86 | th infer_rel_rnn.lua -testData inference-data/rel.multi.${SPLIT}.t7 87 | ``` 88 | 89 | This step will generate the files `score.rel.single.${SPLIT}` and `score.rel.multi.${SPLIT}` in the folder `RelationRNN`. 90 | 91 | 5. Run entity inference 92 | 93 | ``` 94 | cd EntityTypeVec 95 | mkdir inference-data 96 | th process_inference.lua -testSplit ${SPLIT} 97 | th infer_ent_typevec.lua -testData inference-data/ent.${SPLIT}.t7 98 | ``` 99 | 100 | This step will generate the file `score.ent.multi.multi.${SPLIT}` in the folder `EntityTypeVec`. 101 | 102 | 6. 
Run joint disambiguation 103 | 104 | ``` 105 | cd Inference/${SPLIT} 106 | python ../joint_disambiguation.py multi.${SPLIT}.cpickle \ 107 | ../../RelationRNN/score.rel.multi.${SPLIT} \ 108 | ../../EntityTypeVec/score.ent.multi.multi.${SPLIT} 109 | ``` 110 | 111 | -------------------------------------------------------------------------------- /src/data/RankingDataLoader.lua: -------------------------------------------------------------------------------- 1 | local RankingDataLoader = torch.class('RankingDataLoader') 2 | 3 | function RankingDataLoader:__init(datafile, logger) 4 | -- class variables 5 | local data = torch.load(datafile) 6 | self.sequences = data.seq 7 | self.seqLengths = data.len 8 | self.posMatches = data.pos 9 | self.negMatches = data.neg 10 | 11 | -- additional variables 12 | self.batchSize = self.sequences[1]:size(2) 13 | self.numBatch = #self.sequences 14 | self.negSize = self.negMatches[1]:size(1) 15 | self.currIdx = 1 16 | self.indices = randperm(self.numBatch) 17 | 18 | if torch.Tensor():type() == 'torch.CudaTensor' then 19 | for i = 1, self.numBatch do 20 | self.posMatches[i] = self.posMatches[i]:cuda() 21 | self.negMatches[i] = self.negMatches[i]:cuda() 22 | self.seqLengths[i] = self.seqLengths[i]:cuda() 23 | self.sequences[i] = self.sequences[i]:cuda() 24 | end 25 | end 26 | 27 | if logger then 28 | self.logger = logger 29 | self.logger.info(string.rep('-', 50)) 30 | self.logger.info(string.format('RankingDataLoader Configurations:')) 31 | self.logger.info(string.format(' number of batch: %d', self.numBatch)) 32 | self.logger.info(string.format(' data batch size: %d', self.batchSize)) 33 | self.logger.info(string.format(' neg sample size: %d', self.negSize)) 34 | end 35 | end 36 | 37 | function RankingDataLoader:nextBatch(circular) 38 | if self.currIdx > self.numBatch then 39 | self.currIdx = 1 40 | self.indices = randperm(self.numBatch) 41 | end 42 | local dataIdx 43 | if circular then 44 | dataIdx = self.currIdx 45 | else 46 | dataIdx = self.indices[self.currIdx] 47 | end 48 | self.currIdx = self.currIdx + 1 49 | return self.sequences[dataIdx], self.posMatches[dataIdx], self.negMatches[dataIdx], self.seqLengths[dataIdx] 50 | end 51 | 52 | function createRankingData(dataPath, savePath, wordVocab, fbVocab, batchSize) 53 | -- class variables 54 | local posMatches = {} 55 | local negMatches = {} 56 | local seqLengths = {} 57 | local sequences = {} 58 | 59 | -- read data fileh 60 | local file = io.open(dataPath, 'r') 61 | local batchIdx = 0 -- the index of sequence batches 62 | local seqIdx = 0 -- sequence index within each batch 63 | local line 64 | 65 | while true do 66 | line = file:read() 67 | if line == nil then break end 68 | local fields = stringx.split(line, '\t') 69 | 70 | -- fields[1]: language sequence 71 | local tokens = stringx.split(fields[1]) 72 | -- allocate tensor memory 73 | if seqIdx % batchSize == 0 then 74 | seqIdx = 1 75 | batchIdx = batchIdx + 1 76 | sequences[batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 77 | seqLengths[batchIdx] = torch.LongTensor(batchSize):fill(0) 78 | posMatches[batchIdx] = torch.LongTensor(batchSize):fill(0) 79 | negMatches[batchIdx] = torch.LongTensor(#fields-2, batchSize):fill(0) 80 | else 81 | seqIdx = seqIdx + 1 82 | end 83 | 84 | -- parse each token in sequence 85 | for i = 1, #tokens do 86 | local token = tokens[i] 87 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(token) 88 | end 89 | seqLengths[batchIdx][seqIdx] = #tokens 90 | 91 | -- fields[2]: positive match 92 | 
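-- Each line of the input file is expected to be tab-separated as
--   question tokens <TAB> positive_id <TAB> negative_id_1 <TAB> ... <TAB> negative_id_k
-- so fields[3..#fields] are pre-mined negatives; unlike SeqRankingLoader below,
-- negatives are read from the file here rather than sampled at training time.
-- Resulting batch layout (sizes follow the allocations above):
--   sequences[b]  : (#tokens x batchSize)      word indices, padded with pad_index
--   posMatches[b] : (batchSize)                fbVocab ids of the positive match
--   negMatches[b] : ((#fields-2) x batchSize)  fbVocab ids of the negatives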
posMatches[batchIdx][seqIdx] = fbVocab:index(fields[2]) 93 | 94 | -- fields[3-#fields]: negative match 95 | for i = 3, #fields do 96 | negMatches[batchIdx][{i-2, seqIdx}] = fbVocab:index(fields[i]) 97 | end 98 | end 99 | file:close() 100 | 101 | local data = {} 102 | data.pos = posMatches 103 | data.neg = negMatches 104 | data.len = seqLengths 105 | data.seq = sequences 106 | 107 | torch.save(savePath, data) 108 | end 109 | -------------------------------------------------------------------------------- /SimpleQuestions/generate_training_data.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import io 3 | import cPickle as pickle 4 | 5 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 6 | from QAData import * 7 | import virtuoso 8 | 9 | def focused_labeling_data(data_list): 10 | with io.open('trainingData/data.train.focused_labeling', 'w', encoding='utf8') as fo: 11 | for data in data_list: 12 | if data.text_attention_indices: 13 | fo.write(u'%s\t%s\n' % (data.question, ' '.join([str(index) for index in data.text_attention_indices]))) 14 | 15 | def relation_ranking_data(data_list): 16 | fo = io.open('trainingData/data.train.relation_ranking', 'w', encoding='utf8') 17 | 18 | # Main Loop 19 | data_turple = [] 20 | data_num = 0 21 | for data in data_list: 22 | question = data.question 23 | pos_rel = data.relation 24 | 25 | # this condition will filter out any question that has only one word 26 | if len(question.split()) > 1: 27 | data_turple.append((question, pos_rel)) 28 | data_num += 1 29 | 30 | # will choose to output data according to indices 31 | chosen_num = data_num - (data_num % 256) 32 | chosen_indices = np.sort(np.random.permutation(data_num)[:chosen_num]) 33 | 34 | chosen_indices_idx = 0 35 | # for each data triple in data_turple list 36 | for idx in range(len(data_turple)): 37 | question = data_turple[idx][0] 38 | pos_rel = data_turple[idx][1] 39 | if idx == chosen_indices[chosen_indices_idx]: 40 | fo.write(u'%s\t%s\n' % (question, pos_rel)) 41 | chosen_indices_idx += 1 42 | 43 | fo.close() 44 | 45 | def entity_ranking_data(data_list): 46 | fo = io.open('trainingData/data.train.entity_ranking', 'w', encoding='utf8') 47 | 48 | # Main Loop 49 | data_turple = [] 50 | data_num = 0 51 | for data in data_list: 52 | pos_sub = data.subject 53 | pos_rel = data.relation 54 | question = data.question 55 | 56 | # this condition will filter out any question that has only one word 57 | if len(question.split()) > 1: 58 | data_turple.append((question, pos_sub, pos_rel)) 59 | data_num += 1 60 | 61 | # will choose to output data according to indices 62 | chosen_num = data_num - (data_num % 256) 63 | chosen_indices = np.sort(np.random.permutation(data_num)[:chosen_num]) 64 | 65 | chosen_indices_idx = 0 66 | # for each data triple in data_turple list 67 | for idx in range(len(data_turple)): 68 | question = data_turple[idx][0] 69 | pos_sub = data_turple[idx][1] 70 | pos_rel = data_turple[idx][2] 71 | if idx == chosen_indices[chosen_indices_idx]: 72 | fo.write(u'%s\t%s\t%s\n' % (question, pos_sub, pos_rel)) 73 | chosen_indices_idx += 1 74 | 75 | fo.close() 76 | 77 | def entity_typevec_data(data_list): 78 | type_dict = pickle.load(file('../KnowledgeBase/type.top-500.pkl', 'rb')) 79 | with io.open('trainingData/data.train.entity_typevec', 'w', encoding='utf8') as fo: 80 | for data in data_list: 81 | sub = data.subject 82 | question = data.question 83 | types = 
virtuoso.id_query_type(sub) 84 | types = [t for t in types if type_dict.has_key(t)] 85 | if len(types) > 0: 86 | fo.write(u'%s\t%s\n' % (question, ' '.join([str(type_dict[t]) for t in types]))) 87 | else: 88 | fo.write(u'%s\t%d\n' % (question, len(type_dict))) 89 | 90 | 91 | if __name__ == '__main__': 92 | data_list = pickle.load(file('PreprocessData/QAData.train.pkl', 'rb')) 93 | if not os.path.exists('trainingData'): 94 | os.mkdir('trainingData') 95 | print >> sys.stderr, 'focused_labeling_data' 96 | focused_labeling_data(data_list) 97 | print >> sys.stderr, 'relation_ranking_data' 98 | relation_ranking_data(data_list) 99 | # print >> sys.stderr, 'entity_ranking_data' 100 | # entity_ranking_data(data_list) 101 | print >> sys.stderr, 'entity_typevec_data' 102 | entity_typevec_data(data_list) 103 | -------------------------------------------------------------------------------- /src/py_module/freebase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import urllib 4 | import re 5 | 6 | api_key = 'AIzaSyAsmMIiVDkF2Vfjt3cDwSHCmHF7QTS0_kY' 7 | 8 | def suggest_id(query_string): 9 | service_url = 'https://www.googleapis.com/freebase/v1/search' 10 | params = { 11 | 'query': query_string, 12 | 'key': api_key 13 | } 14 | url = service_url + '?' + urllib.urlencode(params) 15 | response = json.loads(urllib.urlopen(url).read()) 16 | 17 | suggested_entity = [] 18 | for result in response['result']: 19 | if result['mid'].startswith('/m/'): 20 | suggested_entity.append('fb:m.' + str(result['mid'].split('/m/')[-1])) 21 | 22 | return suggested_entity 23 | 24 | def mid2name(entity_mid): 25 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 26 | query = [{'id': None, 'mid': entity_mid, 'name': None}] 27 | 28 | params = { 29 | 'query': json.dumps(query), 30 | 'key': api_key 31 | } 32 | 33 | url = service_url + '?' + urllib.urlencode(params) 34 | response = json.loads(urllib.urlopen(url).read()) 35 | 36 | if response['result'][0].has_key('name') and response['result'][0]['name']: 37 | return response['result'][0]['name'].encode('utf-8') 38 | else: 39 | return None 40 | 41 | def mid2id(entity_mid): 42 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 43 | query = [{'mid': entity_mid, 'id': None}] 44 | 45 | params = { 46 | 'query': json.dumps(query), 47 | 'key': api_key 48 | } 49 | 50 | url = service_url + '?' + urllib.urlencode(params) 51 | response = json.loads(urllib.urlopen(url).read()) 52 | 53 | if response['result'][0].has_key('id'): 54 | return response['result'][0]['id'] 55 | else: 56 | return None 57 | 58 | 59 | def id2mid(entity_id): 60 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 61 | query = [{'id': entity_id, 'mid': None}] 62 | 63 | params = { 64 | 'query': json.dumps(query), 65 | 'key': api_key 66 | } 67 | 68 | url = service_url + '?' + urllib.urlencode(params) 69 | response = json.loads(urllib.urlopen(url).read()) 70 | 71 | if response['result'][0].has_key('mid'): 72 | return response['result'][0]['mid'] 73 | else: 74 | return None 75 | 76 | def name2mids(entity_name): 77 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 78 | query = [{'name': entity_name, 'mid': None, '/common/topic/alias': []}] 79 | 80 | params = { 81 | 'query': json.dumps(query), 82 | 'key': api_key 83 | } 84 | 85 | url = service_url + '?' 
+ urllib.urlencode(params) 86 | response = json.loads(urllib.urlopen(url).read()) 87 | 88 | mid_list = [] 89 | for res in response['result']: 90 | if res.has_key('mid'): 91 | mid_list.append(str(res['mid'])) 92 | return mid_list 93 | 94 | def unquotekey(key, encoding=None): 95 | """ 96 | unquote a namespace key and turn it into a unicode string 97 | """ 98 | 99 | valid_always = string.ascii_letters + string.digits 100 | 101 | output = [] 102 | i = 0 103 | while i < len(key): 104 | if key[i] in valid_always: 105 | output.append(key[i]) 106 | i += 1 107 | elif key[i] in '_-' and i != 0 and i != len(key): 108 | output.append(key[i]) 109 | i += 1 110 | elif key[i] == '$' and i+4 < len(key): 111 | # may raise ValueError if there are invalid characters 112 | output.append(unichr(int(key[i+1:i+5],16))) 113 | i += 5 114 | else: 115 | raise ValueError, "unquote key saw invalid character '%s' at position %d" % (key[i], i) 116 | 117 | ustr = u''.join(output) 118 | 119 | if encoding is None: 120 | return ustr 121 | 122 | return ustr.encode(encoding) 123 | 124 | # used to escape strings for sparql query 125 | def escape_string(s): 126 | escape_map = { 127 | '"' : '\\"', 128 | '\r': '\\r', 129 | '\n': '\\n', 130 | '\t': '\\t', 131 | '\b': '\\b', 132 | '\f': '\\f' 133 | } 134 | s = s.replace('\\','\\u005c\\u005c') 135 | for key, value in escape_map.items(): 136 | s = s.replace(key,value) 137 | return '"' + s + '"' 138 | 139 | # used to escape strings for sparql query 140 | def unescape_string(s): 141 | unescape_map = { 142 | '\\"': '"' , 143 | '\\r': '\r', 144 | '\\n': '\n', 145 | '\\t': '\t', 146 | '\\b': '\b', 147 | '\\f': '\f' 148 | } 149 | # strip the quote " on both sides 150 | s = s[1:-1] 151 | for key, value in unescape_map.items(): 152 | s = s.replace(key,value) 153 | s = s.replace('\\u005c\\u005c', '\\') 154 | return s 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/model/Linear.lua: -------------------------------------------------------------------------------- 1 | local Linear, parent = torch.class('Linear', 'nn.Module') 2 | 3 | function Linear:__init(inputSize, outputSize, bias) 4 | parent.__init(self) 5 | local bias = ((bias == nil) and true) or bias 6 | self.weight = torch.Tensor(outputSize, inputSize) 7 | self.gradWeight = torch.Tensor(outputSize, inputSize) 8 | if bias then 9 | self.bias = torch.Tensor(outputSize) 10 | self.gradBias = torch.Tensor(outputSize) 11 | end 12 | self:reset() 13 | end 14 | 15 | function Linear:reset(stdv) 16 | if stdv then 17 | stdv = stdv * math.sqrt(3) 18 | else 19 | stdv = 1./math.sqrt(self.weight:size(2)) 20 | end 21 | if nn.oldSeed then 22 | for i=1,self.weight:size(1) do 23 | self.weight:select(1, i):apply(function() 24 | return torch.uniform(-stdv, stdv) 25 | end) 26 | end 27 | if self.bias then 28 | for i=1,self.bias:nElement() do 29 | self.bias[i] = torch.uniform(-stdv, stdv) 30 | end 31 | end 32 | else 33 | self.weight:uniform(-stdv, stdv) 34 | if self.bias then self.bias:uniform(-stdv, stdv) end 35 | end 36 | return self 37 | end 38 | 39 | function Linear:updateOutput(input) 40 | if input:dim() == 1 then 41 | self.output:resize(self.weight:size(1)) 42 | if self.bias then self.output:copy(self.bias) else self.output:zero() end 43 | self.output:addmv(1, self.weight, input) 44 | elseif input:dim() == 2 then 45 | local nframe = input:size(1) 46 | local nElement = self.output:nElement() 47 | self.output:resize(nframe, self.weight:size(1)) 48 | if self.output:nElement() ~= nElement then 49 | 
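-- After a resize to a different element count the storage may hold garbage
-- (e.g. NaNs), so it is zeroed before the matrix product below writes into it.
-- self.addBuffer is a ones-vector of length nframe; addr() uses it to add the
-- bias to every row of the mini-batch in a single outer-product call.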
self.output:zero() 50 | end 51 | self.addBuffer = self.addBuffer or input.new() 52 | if self.addBuffer:nElement() ~= nframe then 53 | self.addBuffer:resize(nframe):fill(1) 54 | end 55 | self.output:addmm(0, self.output, 1, input, self.weight:t()) 56 | if self.bias then self.output:addr(1, self.addBuffer, self.bias) end 57 | elseif input:dim() >= 3 then 58 | -- computation happens in 2D views 59 | local dInput = input:view(-1, self.weight:size(2)) 60 | local nframe = dInput:size(1) 61 | self.output:resize(nframe, self.weight:size(1)) 62 | if self.output:nElement() ~= nElement then 63 | self.output:zero() 64 | end 65 | self.addBuffer = self.addBuffer or input.new() 66 | if self.addBuffer:nElement() ~= nframe then 67 | self.addBuffer:resize(nframe):fill(1) 68 | end 69 | self.output:addmm(0, self.output, 1, dInput, self.weight:t()) 70 | if self.bias then self.output:addr(1, self.addBuffer, self.bias) end 71 | 72 | -- re-view output according to the input size 73 | local sizes = input:size() 74 | sizes[input:dim()] = self.weight:size(1) 75 | self.output = self.output:view(sizes) 76 | else 77 | error('input must be 1D, 2D or 3D Tensor') 78 | end 79 | 80 | return self.output 81 | end 82 | 83 | function Linear:updateGradInput(input, gradOutput) 84 | if self.gradInput then 85 | 86 | local nElement = self.gradInput:nElement() 87 | self.gradInput:resizeAs(input) 88 | if self.gradInput:nElement() ~= nElement then 89 | self.gradInput:zero() 90 | end 91 | if input:dim() == 1 then 92 | self.gradInput:addmv(0, 1, self.weight:t(), gradOutput) 93 | elseif input:dim() == 2 then 94 | self.gradInput:addmm(0, 1, gradOutput, self.weight) 95 | elseif input:dim() >= 3 then 96 | local dGradInput = self.gradInput:view(-1, self.weight:size(2)) 97 | local dGradOutput = gradOutput:view(-1, self.weight:size(1)) 98 | dGradInput:addmm(0, 1, dGradOutput, self.weight) 99 | end 100 | 101 | return self.gradInput 102 | end 103 | end 104 | 105 | function Linear:accGradParameters(input, gradOutput, scale) 106 | scale = scale or 1 107 | if input:dim() == 1 then 108 | self.gradWeight:addr(scale, gradOutput, input) 109 | if self.bias then self.gradBias:add(scale, gradOutput) end 110 | elseif input:dim() == 2 then 111 | self.gradWeight:addmm(scale, gradOutput:t(), input) 112 | if self.bias then 113 | self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer) 114 | end 115 | elseif input:dim() == 3 then 116 | local dGradOutput = gradOutput:view(-1, self.weight:size(1)) 117 | local dInput = input:view(-1, self.weight:size(2)) 118 | self.gradWeight:addmm(scale, dGradOutput:t(), dInput) 119 | if self.bias then 120 | self.gradBias:addmv(scale, dGradOutput:t(), self.addBuffer) 121 | end 122 | end 123 | end 124 | 125 | -- we do not need to accumulate parameters when sharing 126 | Linear.sharedAccUpdateGradParameters = Linear.accUpdateGradParameters 127 | 128 | 129 | function Linear:__tostring__() 130 | return torch.type(self) .. 131 | string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1)) .. 
132 | (self.bias == nil and ' without bias' or '') 133 | end 134 | -------------------------------------------------------------------------------- /Inference/joint_predict.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import glob 3 | import cPickle as pickle 4 | import numpy as np 5 | from sklearn import preprocessing 6 | 7 | sys.path.append(os.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 8 | from QAData import * 9 | 10 | def predict_func(data, rel_scores, ent_scores, alpha, top_rel_ratio): 11 | rel_scores = np.array(rel_scores) 12 | ent_scores = np.array(ent_scores) 13 | 14 | ent_threshold = np.min(ent_scores) 15 | top_sub_ids = np.where(ent_scores >= ent_threshold)[0] 16 | 17 | rel_threshold = top_rel_ratio * (np.max(rel_scores) - np.min(rel_scores)) + np.min(rel_scores) 18 | top_rel_ids = np.where(rel_scores >= rel_threshold)[0] 19 | 20 | rel_id_dict = {data.cand_rel[rel_id]:i for i, rel_id in enumerate(top_rel_ids)} 21 | 22 | score_mat = np.zeros((top_sub_ids.shape[0], top_rel_ids.shape[0])) 23 | 24 | for row_idx, sub_id in enumerate(top_sub_ids): 25 | for rel in data.sub_rels[sub_id]: 26 | if rel_id_dict.has_key(rel): 27 | col_idx = rel_id_dict[rel] 28 | #score_mat[row_idx, col_idx] = rel_scores[top_rel_ids[col_idx]] 29 | score_mat[row_idx, col_idx] = 1 30 | 31 | # compute all the terms 32 | ent_scores = ent_scores[top_sub_ids] 33 | rel_scores = rel_scores[top_rel_ids] 34 | score_mat = np.exp(score_mat * alpha + ent_scores.reshape(score_mat.shape[0], 1) * (1 - alpha)) 35 | 36 | # normalization 37 | score_mat /= np.sum(score_mat, 0) 38 | 39 | score_mat *= np.exp(rel_scores) 40 | 41 | top_sub_id, top_rel_id = np.unravel_index(np.argmax(score_mat), score_mat.shape) 42 | 43 | return [data.cand_sub[top_sub_ids[top_sub_id]]], data.cand_rel[top_rel_ids[top_rel_id]] 44 | 45 | if __name__ == '__main__': 46 | # Parse input argument 47 | if len(sys.argv) == 5: 48 | data_fn = sys.argv[1] 49 | rel_score_fn = sys.argv[2] 50 | ent_score_fn = sys.argv[3] 51 | alpha = float(sys.argv[4]) 52 | top_rel_ratio = 0.0 53 | elif len(sys.argv) == 6: 54 | data_fn = sys.argv[1] 55 | rel_score_fn = sys.argv[2] 56 | ent_score_fn = sys.argv[3] 57 | alpha = float(sys.argv[4]) 58 | top_rel_ratio = float(sys.argv[5]) 59 | else: 60 | print 'Wrong arguments. 
Usage: ' 61 | print ' python joint_disambiguation.py cpickle_file rel_score_file ent_score_file alpha [[rel_ratio]]' 62 | sys.exit(1) 63 | 64 | chosen_subs = 0 65 | total_subs = 0 66 | 67 | # Error information 68 | error_dir = './error_analysis' 69 | if not os.path.exists(error_dir): 70 | os.makedirs(error_dir) 71 | category = data_fn.split('.')[0] 72 | f_0_0 = file(os.path.join(error_dir, 'sub_cor_rel_cor.%s.txt'%(category)), 'wb') 73 | f_0_1 = file(os.path.join(error_dir, 'sub_cor_rel_err.%s.txt'%(category)), 'wb') 74 | f_1_0 = file(os.path.join(error_dir, 'sub_err_rel_cor.%s.txt'%(category)), 'wb') 75 | f_1_1 = file(os.path.join(error_dir, 'sub_err_rel_err.%s.txt'%(category)), 'wb') 76 | 77 | # Further disambiguation 78 | suffix = sys.argv[1].split('.')[-2] 79 | 80 | # Load cPickle file into data 81 | data_list = pickle.load(file(data_fn, 'rb')) 82 | print >> sys.stderr, 'finish loading cpickle file %d' % (len(data_list)) 83 | 84 | rel_score_list = file(rel_score_fn, 'rb').readlines() 85 | if ent_score_fn: 86 | ent_score_list = file(ent_score_fn, 'rb').readlines() 87 | 88 | # Count the totol number of data 89 | corr_mat = np.zeros((2,2)) 90 | 91 | for idx, data in enumerate(data_list): 92 | rel_scores = [float(score) for score in rel_score_list[idx].strip().split(' ')] 93 | if ent_score_fn: 94 | ent_scores = [float(score) for score in ent_score_list[idx].strip().split(' ')] 95 | top_sub, top_rel = predict_func(data, rel_scores, ent_scores, alpha, top_rel_ratio) 96 | else: 97 | top_sub, top_rel = rel_based(data, rel_scores) 98 | 99 | if len(top_sub) == 1 and top_sub[0] == data.subject: 100 | if top_rel == data.relation: 101 | corr_mat[0,0] += 1 102 | print >> f_0_0, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 103 | else: 104 | corr_mat[0,1] += 1 105 | print >> f_0_1, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 106 | else: 107 | if top_rel == data.relation: 108 | corr_mat[1,0] += 1 109 | print >> f_1_0, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 110 | else: 111 | corr_mat[1,1] += 1 112 | print >> f_1_1, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 113 | 114 | 115 | print alpha 116 | print corr_mat / len(data_list) 117 | print corr_mat 118 | 119 | f_0_0.close() 120 | f_0_1.close() 121 | f_1_0.close() 122 | f_1_1.close() 123 | -------------------------------------------------------------------------------- /src/data/SeqRankingLoader.lua: -------------------------------------------------------------------------------- 1 | local SeqRankingLoader = torch.class('SeqRankingLoader') 2 | 3 | function SeqRankingLoader:__init(datafile, negSize, negRange, logger) 4 | -- sequence & pos match 5 | local data = torch.load(datafile) 6 | self.sequences = data.seq 7 | self.posMatches = data.pos 8 | if data.len ~= nil then 9 | self.seqLengths = data.len 10 | end 11 | 12 | -- for negative sampling 13 | self.negSize = negSize 14 | self.negRange = negRange 15 | 16 | -- additional variables 17 | self.batchSize = self.sequences[1]:size(2) 18 | self.numBatch = #self.sequences 19 | self.currIdx = 1 20 | self.indices = randperm(self.numBatch) 21 | 22 | -- allocate memory 23 | self._negMatch = torch.LongTensor(self.negSize, self.batchSize) 24 | self._posMatch = torch.LongTensor(1, self.batchSize):expand(self.negSize, self.batchSize) 25 | 26 | if torch.Tensor():type() == 'torch.CudaTensor' 
then 27 | for i = 1, self.numBatch do 28 | self.sequences[i] = self.sequences[i]:cuda() 29 | self.posMatches[i] = self.posMatches[i]:cuda() 30 | if self.seqLengths ~= nil then 31 | self.seqLengths[i] = self.seqLengths[i]:cuda() 32 | end 33 | end 34 | self.negMatch = torch.CudaTensor(self.negSize, self.batchSize) 35 | else 36 | self.negMatch = self._negMatch 37 | end 38 | 39 | if logger then 40 | self.logger = logger 41 | self.logger.info(string.rep('-', 50)) 42 | self.logger.info(string.format('SeqRankingLoader Configurations:')) 43 | self.logger.info(string.format(' number of batch : %d', self.numBatch)) 44 | self.logger.info(string.format(' data batch size : %d', self.batchSize)) 45 | self.logger.info(string.format(' neg sample size : %d', self.negSize)) 46 | self.logger.info(string.format(' neg sample range: %d', self.negRange)) 47 | end 48 | end 49 | 50 | function SeqRankingLoader:setNegSize(negSize) 51 | self.negSize = negSize 52 | 53 | -- allocate memory 54 | self._negMatch = torch.LongTensor(self.negSize, self.batchSize) 55 | self._posMatch = torch.LongTensor(1, self.batchSize):expand(self.negSize, self.batchSize) 56 | 57 | if torch.Tensor():type() == 'torch.CudaTensor' then 58 | self.negMatch = torch.CudaTensor(self.negSize, self.batchSize) 59 | end 60 | end 61 | 62 | function SeqRankingLoader:nextBatch(circular) 63 | if self.currIdx > self.numBatch then 64 | self.currIdx = 1 65 | self.indices = randperm(self.numBatch) 66 | end 67 | local dataIdx 68 | if circular then 69 | dataIdx = self.currIdx 70 | else 71 | dataIdx = self.indices[self.currIdx] 72 | end 73 | self.currIdx = self.currIdx + 1 74 | 75 | self._posMatch:storage():copy(self.posMatches[dataIdx]:storage()) 76 | self._negMatch:random(1, self.negRange) 77 | 78 | while torch.sum(torch.eq(self._negMatch, self._posMatch)) > 0 do 79 | self._negMatch:maskedFill(torch.eq(self._negMatch, self._posMatch), math.random(1, self.negRange)) 80 | end 81 | 82 | if torch.Tensor():type() == 'torch.CudaTensor' then 83 | self.negMatch:copy(self._negMatch) 84 | end 85 | if self.seqLengths ~= nil then 86 | return self.sequences[dataIdx], self.posMatches[dataIdx], self.negMatch, self.seqLengths[dataIdx] 87 | else 88 | return self.sequences[dataIdx], self.posMatches[dataIdx], self.negMatch 89 | end 90 | 91 | end 92 | 93 | function createSeqRankingData(dataPath, savePath, wordVocab, fbVocab, batchSize) 94 | -- class variables 95 | local posMatches = {} 96 | local seqLengths = {} 97 | local sequences = {} 98 | 99 | -- read data fileh 100 | local file = io.open(dataPath, 'r') 101 | local batchIdx = 0 -- the index of sequence batches 102 | local seqIdx = 0 -- sequence index within each batch 103 | local line 104 | 105 | while true do 106 | line = file:read() 107 | if line == nil then break end 108 | local fields = stringx.split(line, '\t') 109 | 110 | -- fields[1]: language sequence 111 | local tokens = stringx.split(fields[1]) 112 | 113 | -- allocate tensor memory 114 | if seqIdx % batchSize == 0 then 115 | print('batch: '..batchIdx) 116 | seqIdx = 1 117 | batchIdx = batchIdx + 1 118 | posMatches[batchIdx] = torch.LongTensor(batchSize):fill(0) 119 | seqLengths[batchIdx] = torch.LongTensor(batchSize):fill(0) 120 | sequences [batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 121 | else 122 | seqIdx = seqIdx + 1 123 | end 124 | 125 | -- parse each token in sequence 126 | for i = 1, #tokens do 127 | local token = tokens[i] 128 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(token) 129 | end 130 | 
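-- The relation-ranking file stores positives only, one "question <TAB> fb:relation"
-- pair per line; negatives are not kept here but drawn uniformly from [1, negRange]
-- in nextBatch() above, with accidental collisions with the positive id resampled.
-- The true token count of each question is recorded alongside the padded matrix.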
seqLengths[batchIdx][seqIdx] = #tokens 131 | 132 | -- fields[2]: positive match 133 | posMatches[batchIdx][seqIdx] = fbVocab:index(fields[2]) 134 | 135 | end 136 | file:close() 137 | 138 | local data = {} 139 | data.pos = posMatches 140 | data.len = seqLengths 141 | data.seq = sequences 142 | 143 | torch.save(savePath, data) 144 | end 145 | -------------------------------------------------------------------------------- /FocusedLabeling/train_crf.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Comandline Options') 8 | 9 | cmd:option('-wordVocabSize',100002,'number of words in dictionary') 10 | cmd:option('-wordEmbedDim',300,'size of word embedding') 11 | cmd:option('-wordEmbedPath','../embedding/word.100k.glove.t7','pretained word embedding path') 12 | 13 | cmd:option('-hiddenSize',256,'size of BiGRU unit') 14 | cmd:option('-outputType',1,'output type of each rnn layer') 15 | cmd:option('-numLayer',2,'number of BiGRU layers') 16 | cmd:option('-maxSeqLen',40,'number of steps the BiGRU needs to unroll') 17 | 18 | cmd:option('-numClass',2,'number of classes in classification') 19 | 20 | cmd:option('-trainData','../data/train.focused_labeling.t7','training data file') 21 | 22 | cmd:option('-initRange',0.08,'the range of uniformly initialize parameters') 23 | cmd:option('-momentumEpoch',1,'after which epoch, the model starts to increase momentum') 24 | cmd:option('-maxEpochs',100,'number of full passes through the training data') 25 | 26 | cmd:option('-printEvery',50,'the frequency (# minibatches) of logging loss information') 27 | cmd:option('-logFile','logs/log.BiGRU','log file to record training information') 28 | cmd:option('-saveEvery',10,'the frequency (# epochs) of automatic saving trained models') 29 | cmd:option('-saveFile','model.BiGRU','filename for saving trained model') 30 | 31 | cmd:option('-useGPU',1,'which GPU is used for computation') 32 | 33 | cmd:text() 34 | 35 | ----------------------------- Basic Options ----------------------------- 36 | 37 | local opt = cmd:parse(arg) 38 | local flog = logroll.file_logger(opt.logFile) 39 | -- local flog = logroll.print_logger() 40 | 41 | if opt.useGPU > 0 then 42 | require 'cutorch' 43 | require 'cunn' 44 | cutorch.setDevice(opt.useGPU) 45 | torch.setdefaulttensortype('torch.CudaTensor') 46 | flog.info(string.rep('-', 50)) 47 | flog.info('Set default tensor type to CudaTensor') 48 | torch.manualSeed(1) 49 | cutorch.manualSeed(1) 50 | end 51 | 52 | ----------------------------- Data Loader ----------------------------- 53 | local loader = SeqLabelingLoader(opt.trainData, flog) 54 | 55 | ----------------------------- Init Models ----------------------------- 56 | -- Init word embedding model 57 | local wordEmbed = cudacheck(nn.LookupTable(opt.wordVocabSize, opt.wordEmbedDim)) 58 | -- loadPretrainedEmbed(wordEmbed, opt.wordEmbedPath) 59 | 60 | -- Init Stacked BiGRU 61 | local rnnconfig = { 62 | hiddenSize = opt.hiddenSize, 63 | maxSeqLen = opt.maxSeqLen, 64 | maxBatch = loader.batchSize, 65 | logger = flog 66 | } 67 | local RNN = {} 68 | for l = 1, opt.numLayer do 69 | rnnconfig.inputSize = l == 1 and opt.wordEmbedDim or opt.hiddenSize * 2 70 | RNN[l] = BiGRU(rnnconfig) 71 | end 72 | 73 | -- Init linear project model 74 | local linear = Linear(opt.hiddenSize*2, opt.numClass) 75 | 76 | -- Init the linear CRF 77 | local 
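-- The CRF sits on top of the per-token class scores from the stacked BiGRU and
-- Linear below. During training it is called as linearCRF:forward({seqVec, labels})
-- and returns a probability score for the gold label sequence (averaged into
-- avgProb further down); in evaluate() mode a plain forward(seqVec) decodes a
-- predicted label sequence, as used in the precision/recall logging block.
-- Rough tensor shapes, assuming the options above:
--   seq    : (seqLen x batchSize)             word indices from SeqLabelingLoader
--   seqVec : (seqLen x batchSize x numClass)  per-token scores after Linear
--   labels : (seqLen x batchSize)             1 = outside mention, 2 = subject token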
linearCRF = CRF(opt.numClass, opt.maxSeqLen, loader.batchSize) 78 | 79 | local seqModel = nn.Sequential() 80 | seqModel:add(wordEmbed) 81 | for l = 1, opt.numLayer do 82 | seqModel:add(nn.Dropout(0.7)) 83 | seqModel:add(RNN[l]) 84 | end 85 | seqModel:add(linear) 86 | 87 | local model = {} 88 | model.seqModel = seqModel 89 | model.linearCRF = linearCRF 90 | 91 | ----------------------------- Optimization ----------------------------- 92 | -- Create tables to hold params and grads 93 | local optimParams, optimGrads = {}, {} 94 | for l = 1, opt.numLayer do 95 | optimParams[l], optimGrads[l] = RNN[l]:getParameters() 96 | end 97 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = linear:getParameters() 98 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = linearCRF:getParameters() 99 | for i = 1, #optimParams do 100 | optimParams[i]:uniform(-opt.initRange, opt.initRange) 101 | end 102 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = wordEmbed:getParameters() 103 | print(optimParams, optimGrads) 104 | 105 | -- Configurations for Optimizer 106 | local optimConf = {lr = {}, logger = flog} 107 | for l = 1, #optimParams do optimConf['lr'][l] = 2e-2 end 108 | local optimizer = AdaGrad(optimGrads, optimConf) 109 | 110 | ----------------------------- Training ----------------------------- 111 | 112 | local avgProb = 0 113 | 114 | local maxIters = opt.maxEpochs * loader.numBatch 115 | flog.info(string.rep('-', 40)) 116 | flog.info('Begin Training') 117 | 118 | for i = 1, maxIters do 119 | xlua.progress(i, maxIters) 120 | 121 | ----------------------- clean gradients ----------------------- 122 | for i = 1, #optimGrads do optimGrads[i]:zero() end 123 | 124 | ----------------------- load minibatch ------------------------ 125 | local seq, labels = loader:nextBatch() 126 | local currSeqLen = seq:size(1) 127 | 128 | ------------------------ forward pass ------------------------- 129 | local seqVec = seqModel:forward(seq) 130 | local prob = linearCRF:forward({seqVec, labels}) 131 | avgProb = avgProb + torch.mean(prob) 132 | 133 | ------------------------ backward pass ------------------------ 134 | local d_seqVec = linearCRF:backward({seqVec, labels}) 135 | seqModel:backward(seq, d_seqVec) 136 | 137 | ----------------------- parameter update ---------------------- 138 | -- optim for rnn, projection 139 | for l = 1, opt.numLayer do optimGrads[l]:clamp(-10, 10) end 140 | optimizer:updateParams(optimParams, optimGrads) 141 | 142 | -- Logging 143 | if i % loader.numBatch == 0 then 144 | flog.info(string.format("finish epoch %d", i / loader.numBatch)) 145 | end 146 | 147 | ------------------------ training info ------------------------ 148 | if i % opt.printEvery == 0 then 149 | linearCRF:evaluate() 150 | local pred = linearCRF:forward(seqVec) 151 | local maskPred = torch.eq(pred, 2) 152 | local maskTrue = torch.eq(labels, 2) 153 | local corr = torch.eq(pred:type(torch.type(labels)), labels):cmul(maskTrue):sum() 154 | 155 | local p, r = corr / maskPred:sum(), corr / maskTrue:sum() 156 | flog.info(string.format("iter %4d, avg prob = %5f, p = %3f, r = %3f, F1 = %3f", i, avgProb / opt.printEvery, p, r, 2 * p * r / (p + r))) 157 | linearCRF:training() 158 | avgProb = 0 159 | end 160 | 161 | 162 | if i % (loader.numBatch * opt.saveEvery) == 0 then 163 | local epoch = i / loader.numBatch 164 | print('Saving model after epoch ' .. 
epoch) 165 | torch.save(opt.saveFile..'.'..opt.useGPU..'.'..epoch, model) 166 | end 167 | end 168 | -------------------------------------------------------------------------------- /EntityTypeVec/train_ent_typevec.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Comandline Options') 8 | 9 | cmd:option('-wordVocabSize',100003,'number of words in dictionary') 10 | cmd:option('-wordEmbedDim',300,'size of word embedding') 11 | cmd:option('-wordEmbedPath','../embedding/word.100k.glove.t7','pretained word embedding path') 12 | 13 | cmd:option('-hiddenSize',256,'size of BiGRU unit') 14 | cmd:option('-outputType',1,'output type of each rnn layer') 15 | cmd:option('-numLayer',2,'number of BiGRU layers') 16 | cmd:option('-maxSeqLen',200,'number of steps the BiGRU needs to unroll') 17 | 18 | cmd:option('-numClass',1,'number of classes in classification') 19 | 20 | cmd:option('-trainData','../data/train.entity_typevec.t7','training data file') 21 | 22 | cmd:option('-optMethod','adamomentum','the optimization method used') 23 | cmd:option('-initRange',0.08,'the range of uniformly initialize parameters') 24 | cmd:option('-momentumEpoch',1,'after which epoch, the model starts to increase momentum') 25 | cmd:option('-maxEpochs',500,'number of full passes through the training data') 26 | 27 | cmd:option('-printEvery',100,'the frequency (# minibatches) of logging loss information') 28 | cmd:option('-logFile','logs/log.BiGRU','log file to record training information') 29 | cmd:option('-saveEvery',100,'the frequency (# epochs) of automatic saving trained models') 30 | cmd:option('-saveFile','model.BiGRU','filename for saving trained model') 31 | 32 | cmd:option('-useGPU',1,'which GPU is used for computation') 33 | 34 | cmd:text() 35 | 36 | ----------------------------- Basic Options ----------------------------- 37 | 38 | local opt = cmd:parse(arg) 39 | -- local flog = logroll.file_logger(opt.logFile) 40 | local flog = logroll.print_logger() 41 | 42 | if opt.useGPU > 0 then 43 | cutorch.setDevice(opt.useGPU) 44 | torch.setdefaulttensortype('torch.CudaTensor') 45 | end 46 | 47 | ----------------------------- Data Loader ----------------------------- 48 | local loader = SeqMultiLabelLoader(opt.trainData, flog) 49 | 50 | ----------------------------- Init Models ----------------------------- 51 | -- Init word embedding model 52 | local wordEmbed = cudacheck(nn.LookupTable(opt.wordVocabSize, opt.wordEmbedDim)) 53 | -- loadPretrainedEmbed(wordEmbed, opt.wordEmbedPath) 54 | 55 | -- Init Stacked BiGRU 56 | local rnnconfig = { 57 | hiddenSize = opt.hiddenSize, 58 | maxSeqLen = opt.maxSeqLen, 59 | maxBatch = loader.batchSize, 60 | logger = flog 61 | } 62 | 63 | local RNN = {} 64 | for l = 1, opt.numLayer do 65 | rnnconfig.inputSize = l == 1 and opt.wordEmbedDim or opt.hiddenSize * 2 66 | RNN[l] = BiGRU(rnnconfig) 67 | end 68 | 69 | -- Init the Classification Criterion 70 | local criterion = nn.BCECriterion() 71 | 72 | local selectLayer = BiRNNSelect() 73 | local linearLayer = nn.Linear(2 * opt.hiddenSize, 501) 74 | 75 | local model = nn.Sequential() 76 | model:add(wordEmbed) 77 | for l = 1, opt.numLayer do 78 | model:add(nn.Dropout(0.3)) 79 | model:add(RNN[l]) 80 | end 81 | model:add(selectLayer) 82 | model:add(linearLayer) 83 | model:add(nn.Sigmoid()) 84 | 85 | ----------------------------- Optimization 
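-- Parameters collected below come from the two BiGRU layers, the word-embedding
-- LookupTable and the final Linear layer. For reference, the model above maps a
-- question to a 501-dim multi-hot type vector (embeddings -> dropout+BiGRU x2 ->
-- BiRNNSelect -> Linear -> Sigmoid) trained with BCECriterion; assuming
-- type.top-500.pkl indexes its 500 types from 0, target slots 1..500 are those
-- types and slot 501 is the fallback used when a subject has none of them.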
----------------------------- 86 | -- Create tables to hold params and grads 87 | local optimParams, optimGrads = {}, {} 88 | for l = 1, opt.numLayer do 89 | optimParams[l], optimGrads[l] = RNN[l]:getParameters() 90 | optimParams[l]:uniform(-opt.initRange, opt.initRange) 91 | end 92 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = wordEmbed:getParameters() 93 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = linearLayer:getParameters() 94 | 95 | 96 | -- Configurations for Optimizer 97 | local optimizer 98 | if opt.optMethod == 'adamomentum' then 99 | local optimConf = {lr = {}, momentum = 0.9, logger = flog} 100 | for l = 1, #optimParams do optimConf['lr'][l] = 1e-2 end 101 | optimizer = AdaGrad(optimGrads, optimConf) 102 | elseif opt.optMethod == 'adagrad' then 103 | local optimConf = {lr = {}, logger = flog} 104 | for l = 1, #optimParams do optimConf['lr'][l] = 2e-2 end 105 | optimizer = AdaGrad(optimGrads, optimConf) 106 | elseif opt.optMethod == 'momentum' then 107 | local optimConf = {lr = {}, momentum = 0.9, annealing = 0.01, logger = flog} 108 | for l = 1, #optimParams do optimConf['lr'][l] = 3e-1 end 109 | optimizer = SGD(optimGrads, optimConf) 110 | elseif opt.optMethod == 'SGD' then 111 | local optimConf = {lr = {}, annealing = 0.01, logger = flog} 112 | for l = 1, #optimParams do optimConf['lr'][l] = 5e-3 end 113 | optimizer = SGD(optimGrads, optimConf) 114 | else 115 | print ('Error: optMethod not match') 116 | os.exit(-1) 117 | end 118 | 119 | local lrWrd = 1e-4 120 | 121 | ----------------------------- Training ----------------------------- 122 | local sumLoss = 0 123 | local sumCorr = 0 124 | local sumTrue = 0 125 | local maxIters = opt.maxEpochs * loader.numBatch 126 | flog.info(string.rep('-', 40)) 127 | flog.info('Begin Training') 128 | 129 | for i = 1, maxIters do 130 | xlua.progress(i, maxIters) 131 | 132 | ----------------------- clean gradients ----------------------- 133 | for i = 1, #optimGrads do optimGrads[i]:zero() end 134 | 135 | ----------------------- load minibatch ------------------------ 136 | local seq, labels = loader:nextBatch() 137 | local currSeqLen = seq:size(1) 138 | 139 | ------------------------ forward pass ------------------------- 140 | local predict = model:forward(seq) 141 | 142 | -------------------------- criterion -------------------------- 143 | local loss = criterion:forward(predict, labels) 144 | sumLoss = sumLoss + loss 145 | 146 | local hardPred = torch.ge(predict, 0.5) 147 | sumCorr = sumCorr + torch.cmul(hardPred:type(torch.type(labels)), labels):sum() 148 | sumTrue = sumTrue + labels:sum() 149 | 150 | ------------------------ backward pass ------------------------ 151 | local d_predict = criterion:backward(predict, labels) 152 | model:backward(seq, d_predict) 153 | 154 | ----------------------- parameter update ---------------------- 155 | -- optim for rnn, projection 156 | for l = 1, opt.numLayer do optimGrads[l]:clamp(-10, 10) end 157 | optimizer:updateParams(optimParams, optimGrads) 158 | 159 | -- Logging 160 | if i % loader.numBatch == 0 then 161 | flog.info(string.format("finish epoch %d", i / loader.numBatch)) 162 | end 163 | 164 | ------------------------ training info ------------------------ 165 | if i % opt.printEvery == 0 then 166 | flog.info(string.format("iter %4d, loss = %5f, corr = %5f", 167 | i, sumLoss / opt.printEvery, sumCorr / sumTrue)) 168 | sumLoss, sumCorr, sumTrue = 0, 0, 0 169 | end 170 | if i % (loader.numBatch * opt.saveEvery) == 0 then 171 | local epoch = i / loader.numBatch 172 | 
print('Saving model after epoch ' .. epoch) 173 | torch.save(opt.saveFile..'.'..opt.useGPU..'.'..epoch, model) 174 | end 175 | end 176 | -------------------------------------------------------------------------------- /SimpleQuestions/PreprocessData/process_rawdata.py: -------------------------------------------------------------------------------- 1 | # This tool preprocess the original simple question dataset in 5 aspects: 2 | # 1. change triple information in to fb:... format 3 | # 2. replace the escape ('//') simbol in original question 4 | # 3. tokenize the question 5 | # 4. change the tokenized question into lower cases 6 | # 5. add another fields which indicates the token number of the question 7 | 8 | import multiprocessing as mp 9 | import sys, os, io, re 10 | import cPickle as pickle 11 | from nltk import word_tokenize 12 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), 'src/py_module' )) 13 | import QAData 14 | import virtuoso 15 | 16 | split = None 17 | 18 | def extract(line): 19 | fields = line.strip().split('\t') 20 | sub = 'fb:' + fields[0].split('www.freebase.com/')[-1].replace('/','.') 21 | rel = 'fb:' + fields[1].split('www.freebase.com/')[-1].replace('/','.') 22 | obj = 'fb:' + fields[2].split('www.freebase.com/')[-1].replace('/','.') 23 | if sub == 'fb:m.07s9rl0': 24 | sub = 'fb:m.02822' 25 | if obj == 'fb:m.07s9rl0': 26 | obj = 'fb:m.02822' 27 | question = fields[-1].replace('\\\\','') 28 | tokens = word_tokenize(question) 29 | return ' '.join(tokens).lower(), sub, rel, obj, len(tokens) 30 | 31 | def get_indices(src_list, pattern_list): 32 | indices = None 33 | for i in range(len(src_list)): 34 | match = 1 35 | for j in range(len(pattern_list)): 36 | if src_list[i+j] != pattern_list[j]: 37 | match = 0 38 | break 39 | if match: 40 | indices = range(i, i + len(pattern_list)) 41 | break 42 | return indices 43 | 44 | def query_golden_subs(data): 45 | golden_subs = [] 46 | if data.text_subject: 47 | # extract fields needed 48 | relation = data.relation 49 | subject = data.subject 50 | text_subject = data.text_subject 51 | 52 | # query name / alias by subject (id) 53 | candi_sub_list = virtuoso.str_query_id(text_subject) 54 | 55 | # add candidates to data 56 | for candi_sub in candi_sub_list: 57 | candi_rel_list = virtuoso.id_query_out_rel(candi_sub) 58 | if relation in candi_rel_list: 59 | golden_subs.append(candi_sub) 60 | 61 | if len(golden_subs) == 0: 62 | golden_subs = [data.subject] 63 | 64 | return golden_subs 65 | 66 | def reverse_link(question, subject): 67 | # get question tokens 68 | tokens = question.split() 69 | 70 | # init default value of returned variables 71 | text_subject = None 72 | text_attention_indices = None 73 | 74 | # query name / alias by node_id (subject) 75 | res_list = virtuoso.id_query_str(subject) 76 | 77 | # sorted by length 78 | for res in sorted(res_list, key = lambda res: len(res), reverse = True): 79 | pattern = r'(^|\s)(%s)($|\s)' % (re.escape(res)) 80 | if re.search(pattern, question): 81 | text_subject = res 82 | text_attention_indices = get_indices(tokens, res.split()) 83 | break 84 | 85 | return text_subject, text_attention_indices 86 | 87 | def form_anonymous_quesion(data): 88 | anonymous_question = None 89 | if data.text_attention_indices: 90 | anonymous_tokens = [] 91 | tokens = data.question.split() 92 | anonymous_tokens.extend(tokens[:data.text_attention_indices[0]]) 93 | anonymous_tokens.append('X') 94 | 
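        # finally append everything after the mention: the subject span located by
        # reverse_link() is collapsed into the single placeholder token 'X', e.g.
        # (hypothetical) "who directed gone with the wind" -> "who directed X"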
anonymous_tokens.extend(tokens[data.text_attention_indices[-1]+1:]) 95 | anonymous_question = ' '.join(anonymous_tokens) 96 | 97 | return anonymous_question 98 | 99 | def form_type_based_question(data): 100 | typed_question = None 101 | num_type_token = -1 102 | if data.text_attention_indices and data.sub_ntp: 103 | tokens = data.question.split() 104 | new_tokens = [] 105 | new_tokens.extend(tokens[:data.text_attention_indices[0]]) 106 | new_tokens.append(data.sub_ntp) 107 | new_tokens.extend(tokens[data.text_attention_indices[-1]+1:]) 108 | typed_question = ' '.join(new_tokens) 109 | num_type_token = len(new_tokens) 110 | 111 | return typed_question, num_type_token 112 | 113 | def knowledge_graph_attributes(data_list, pid = 0): 114 | # Open log file 115 | log_file = file('logs/log.%s.%d.txt'%(split, pid), 'wb') 116 | 117 | succ_att_link = 0 118 | qadata_list = [] 119 | for data_index, data_tuple in enumerate(data_list): 120 | # Step-1: create QAData instance 121 | data = QAData.QAData(data_tuple) 122 | 123 | # Step-2: reverse linking 124 | data.text_subject, data.text_attention_indices = reverse_link(data.question, data.subject) 125 | 126 | # Step-3: create anonymous question for LTG-CNN+ 127 | if split == 'train': 128 | data.anonymous_question = form_anonymous_quesion(data) 129 | 130 | qadata_list.append(data) 131 | 132 | # logging 133 | if data.text_subject: 134 | succ_att_link += 1 135 | print >> log_file, '[%d] attention: %f' % (data_index, succ_att_link / float(data_index+1)) 136 | 137 | pickle.dump(qadata_list, file('temp.%s.pkl'%(pid), 'wb')) 138 | log_file.close() 139 | 140 | def process(num_process, data_list): 141 | # Make dir 142 | if not os.path.exists('logs'): 143 | os.mkdir('logs') 144 | 145 | # Split workload 146 | length = len(data_list) 147 | data_per_p = (length + num_process - 1) / num_process 148 | 149 | # Spawn processes 150 | processes = [ 151 | mp.Process( 152 | target = knowledge_graph_attributes, 153 | args = ( 154 | data_list[pid*data_per_p:(pid+1)*data_per_p], 155 | pid 156 | ) 157 | ) 158 | for pid in range(num_process) 159 | ] 160 | 161 | # Run processes 162 | for p in processes: 163 | p.start() 164 | 165 | # Exit the completed processes 166 | for p in processes: 167 | p.join() 168 | 169 | if __name__ == '__main__': 170 | 171 | if len(sys.argv) != 3: 172 | print 'python preprocess.py input_file num_process' 173 | sys.exit(-1) 174 | 175 | in_file_path = sys.argv[1] 176 | num_process = int(sys.argv[2]) 177 | 178 | split = in_file_path.split('_')[-1].split('.')[0] 179 | 180 | in_file = io.open(in_file_path, 'r', encoding='utf8') 181 | 182 | data_list = [] 183 | for line in in_file: 184 | question, sub, rel, obj, length = extract(line) 185 | data_list.append((question, sub, rel, obj, length)) 186 | 187 | process(num_process, sorted(data_list, key = lambda data: data[-1], reverse = True)) 188 | 189 | # Merge all data [this will preserve the order] 190 | new_data_list = [] 191 | for p in range(num_process): 192 | temp_fn = 'temp.%d.pkl'%(p) 193 | new_data_list.extend(pickle.load(file(temp_fn, 'rb'))) 194 | os.remove(temp_fn) 195 | 196 | pickle.dump(new_data_list, file('QAData.%s.pkl'%(split), 'wb')) 197 | 198 | in_file.close() 199 | -------------------------------------------------------------------------------- /Inference/query_candidates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, re 3 | import multiprocessing as mp 4 | import cPickle as pickle 5 | import numpy as np 6 | 7 | 
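# Given the CRF attention scores from FocusedLabeling, this script extracts a
# subject span per question (relaxing the begin/end threshold from 0.5 towards 0
# until Virtuoso returns at least one id for the span text), attaches each
# candidate subject's outgoing relations (plus its top-500 type ids when a type
# dict is supplied), and writes QAData.label.<split>.cpickle.
# Usage, as in the README (run from Inference/<split>):
#   python ../query_candidates.py 6 ../../PreprocessData/QAData.valid.pkl \
#     ../../FocusedLabeling/label.result.valid ../../KnowledgeBase/type.top-500.pkl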
sys.path.append(os.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 8 | from QAData import * 9 | import virtuoso 10 | import freebase 11 | 12 | type_dict = None 13 | stop_words = ['of', 'on', 'the', 'off', 'in', 'for', 'with', 'a', 'an', 'did', 'does', 'good', 'or', 'not', \ 14 | "'", '?', '!', ':', ','] 15 | 16 | def generate_ngrams(tokens, min_len, max_len): 17 | ngrams = [] 18 | num_token = len(tokens) 19 | assert(num_token >= max_len) 20 | for num in range(min_len, max_len+1): 21 | for i in range(num_token-num+1): 22 | ngram = ' '.join(tokens[i:i+num]) 23 | if not ngram in stop_words: 24 | ngrams.append(ngram) 25 | return list(set(ngrams)) 26 | 27 | def beg_end_indices(scores, threshold): 28 | seq_len = len(scores) 29 | max_idx = np.argmax(scores) 30 | beg_idx = max_idx 31 | end_idx = max_idx 32 | for i in range(max_idx-1,-1,-1): 33 | if np.abs(scores[i+1] - scores[i]) / scores[i+1] > threshold: 34 | break 35 | beg_idx = i 36 | for i in range(max_idx+1,seq_len,1): 37 | if np.abs(scores[i-1] - scores[i]) / scores[i-1] > threshold: 38 | break 39 | end_idx = i 40 | return beg_idx, end_idx 41 | 42 | def form_anonymous_quesion(question, beg_idx, end_idx): 43 | anonymous_tokens = [] 44 | tokens = question.split() 45 | anonymous_tokens.extend(tokens[:beg_idx]) 46 | anonymous_tokens.append('X') 47 | anonymous_tokens.extend(tokens[end_idx+1:]) 48 | anonymous_question = ' '.join(anonymous_tokens) 49 | 50 | return anonymous_question 51 | 52 | def query_candidate(data_list, pred_list, pid = 0): 53 | log_file = open('logs/log.%d.txt'%(pid), 'wb') 54 | new_data_list = [] 55 | 56 | succ_match = 0 57 | data_index = 0 58 | for pred, data in zip(pred_list, data_list): 59 | # incremnt data_index 60 | data_index += 1 61 | 62 | # extract scores 63 | scores = [float(score) for score in pred.strip().split()] 64 | 65 | # extract fields needed 66 | relation = data.relation 67 | subject = data.subject 68 | question = data.question 69 | tokens = question.split() 70 | 71 | # query name / alias by subject (id) 72 | candi_sub_list = [] 73 | for threshold in np.arange(0.5, 0.0, -0.095): 74 | beg_idx, end_idx = beg_end_indices(scores, threshold) 75 | sub_text = ' '.join(tokens[beg_idx:end_idx+1]) 76 | candi_sub_list.extend(virtuoso.str_query_id(sub_text)) 77 | if len(candi_sub_list) > 0: 78 | break 79 | 80 | # # using freebase suggest 81 | # if len(candi_sub_list) == 0: 82 | # beg_idx, end_idx = beg_end_indices(scores, 0.2) 83 | # sub_text = ' '.join(tokens[beg_idx:end_idx+1]) 84 | # sub_text = re.sub(r'\s(\w+)\s(n?\'[tsd])\s', r' \1\2 ', sub_text) 85 | # suggest_subs = [] 86 | # for trial in range(3): 87 | # try: 88 | # suggest_subs = freebase.suggest_id(sub_text) 89 | # break 90 | # except: 91 | # print >> sys.stderr, 'freebase suggest_id error: trial = %d, sub_text = %s' % (trial, sub_text) 92 | # candi_sub_list.extend(suggest_subs) 93 | # if data.subject not in candi_sub_list: 94 | # print >> log_file, '%s\t\t%s\t\t%s\t\t%d' % (sub_text, data.text_subject, fb2www(data.subject), len(candi_sub_list)) 95 | 96 | # if potential subject founded 97 | if len(candi_sub_list) > 0: 98 | # add candidates to data 99 | for candi_sub in candi_sub_list: 100 | candi_rel_list = virtuoso.id_query_out_rel(candi_sub) 101 | if len(candi_rel_list) > 0: 102 | if type_dict: 103 | candi_type_list = [type_dict[t] for t in virtuoso.id_query_type(candi_sub) if type_dict.has_key(t)] 104 | if len(candi_type_list) == 0: 105 | candi_type_list.append(len(type_dict)) 106 | data.add_candidate(candi_sub, 
candi_rel_list, candi_type_list) 107 | else: 108 | data.add_candidate(candi_sub, candi_rel_list) 109 | data.anonymous_question = form_anonymous_quesion(question, beg_idx, end_idx) 110 | 111 | # make score mat 112 | if hasattr(data, 'cand_sub') and hasattr(data, 'cand_rel'): 113 | # remove duplicate relations 114 | data.remove_duplicate() 115 | 116 | # append to new_data_list 117 | new_data_list.append(data) 118 | 119 | # loging information 120 | if subject in candi_sub_list: 121 | succ_match += 1 122 | 123 | if data_index % 100 == 0: 124 | print >> sys.stderr, '[%d] %d / %d' % (pid, data_index, len(data_list)) 125 | 126 | print >> log_file, '%d / %d = %f ' % (succ_match, data_index+1, succ_match / float(data_index+1)) 127 | 128 | log_file.close() 129 | pickle.dump(new_data_list, file('temp.%d.cpickle'%(pid),'wb')) 130 | 131 | if __name__ == '__main__': 132 | # Check number of argv 133 | if len(sys.argv) == 4: 134 | # Parse input argument 135 | num_process = int(sys.argv[1]) 136 | data_list = pickle.load(file(sys.argv[2], 'rb')) 137 | pred_list = file(sys.argv[3], 'rb').readlines() 138 | elif len(sys.argv) == 5: 139 | # Parse input argument 140 | num_process = int(sys.argv[1]) 141 | data_list = pickle.load(file(sys.argv[2], 'rb')) 142 | pred_list = file(sys.argv[3], 'rb').readlines() 143 | type_dict = pickle.load(file(sys.argv[4], 'rb')) 144 | else: 145 | print 'usage: python query_candidate_relation.py num_processes QAData_cpickle_file attention_score_file [[type_dict]]' 146 | sys.exit(-1) 147 | 148 | suffix = sys.argv[2].split('.')[-2] 149 | 150 | assert(len(data_list) == len(pred_list)) 151 | 152 | # Create log directory 153 | log_dir = './logs' 154 | if not os.path.exists(log_dir): 155 | os.makedirs(log_dir) 156 | 157 | # Allocate dataload 158 | length = len(data_list) 159 | data_per_p = (length + num_process - 1) / num_process 160 | 161 | # Spawn processes 162 | processes = [ 163 | mp.Process( 164 | target = query_candidate, 165 | args = (data_list[pid*data_per_p:(pid+1)*data_per_p], 166 | pred_list[pid*data_per_p:(pid+1)*data_per_p], 167 | pid) 168 | ) 169 | for pid in range(num_process) 170 | ] 171 | 172 | # Run processes 173 | for p in processes: 174 | p.start() 175 | 176 | # Exit the completed processes 177 | for p in processes: 178 | p.join() 179 | 180 | # Merge all data [this will preserve the order] 181 | new_data_list = [] 182 | for p in range(num_process): 183 | temp_fn = 'temp.%d.cpickle'%(p) 184 | new_data_list.extend(pickle.load(file(temp_fn, 'rb'))) 185 | 186 | pickle.dump(new_data_list, file('QAData.label.%s.cpickle'%(suffix), 'wb')) 187 | 188 | # Remove temp data 189 | for p in range(num_process): 190 | temp_fn = 'temp.%d.cpickle'%(p) 191 | os.remove(temp_fn) 192 | -------------------------------------------------------------------------------- /Inference/joint_disambiguation.py: -------------------------------------------------------------------------------- 1 | import os, sys, re 2 | import glob 3 | import cPickle as pickle 4 | import numpy as np 5 | from sklearn import preprocessing 6 | 7 | sys.path.append(os.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 8 | from QAData import * 9 | 10 | def top_sub_rel(data, rel_scores, ent_scores, alpha, rel_ratio): 11 | rel_scores = np.array(rel_scores) 12 | ent_scores = np.array(ent_scores) 13 | 14 | #ent_threshold = np.min(ent_scores) 15 | #top_sub_ids = np.where(ent_scores >= ent_threshold)[0] 16 | top_sub_ids = np.arange(ent_scores.shape[0]) 17 | 18 | rel_threshold = rel_ratio * 
(np.max(rel_scores) - np.min(rel_scores)) + np.min(rel_scores) 19 | top_rel_ids = np.where(rel_scores >= rel_threshold)[0] 20 | #top_rel_ids = np.arange(rel_scores.shape[0]) 21 | 22 | # dict for top relation column idx 23 | rel_id_dict = {data.cand_rel[rel_id]:i for i, rel_id in enumerate(top_rel_ids)} 24 | 25 | score_mat = np.zeros((top_sub_ids.shape[0], top_rel_ids.shape[0])) 26 | 27 | # fill the score matrix 28 | for row_idx, sub_id in enumerate(top_sub_ids): 29 | for rel in data.sub_rels[sub_id]: 30 | if rel_id_dict.has_key(rel): 31 | col_idx = rel_id_dict[rel] 32 | #score_mat[row_idx, col_idx] = rel_scores[top_rel_ids[col_idx]] 33 | score_mat[row_idx, col_idx] = 1 34 | 35 | # compute all the terms 36 | ent_scores = ent_scores[top_sub_ids] 37 | rel_scores = rel_scores[top_rel_ids] 38 | 39 | # u(s,r,q) = alpha * I(s->r) + (1 - alpha) * g(q)^T E(s) 40 | score_mat = np.exp(score_mat * alpha + ent_scores.reshape(score_mat.shape[0], 1) * (1 - alpha)) 41 | 42 | # p(s|q,r) propto u(s,r,q) 43 | score_mat /= np.sum(score_mat, 0) 44 | 45 | # p(s|q,r) * p(r|q) 46 | score_mat *= np.exp(rel_scores) 47 | 48 | #max_score = np.max(score_mat) 49 | #if np.where(score_mat == max_score)[0].shape[0] > 1: 50 | # print np.where(score_mat == max_score)[0].shape[0] 51 | 52 | top_sub_id, top_rel_id = np.unravel_index(np.argmax(score_mat), score_mat.shape) 53 | 54 | return [data.cand_sub[top_sub_ids[top_sub_id]]], data.cand_rel[top_rel_ids[top_rel_id]] 55 | 56 | def math(data, rel_scores, ent_scores, alpha = 0.5): 57 | rel_id_dict = {data.cand_rel[i]:i for i in range(len(data.cand_rel))} 58 | #rel_scores = preprocessing.scale(rel_scores) 59 | #ent_scores = preprocessing.scale(ent_scores) 60 | 61 | score_mat = np.zeros((len(data.cand_sub), len(data.cand_rel))) 62 | for i in range(len(data.cand_sub)): 63 | for rel in data.sub_rels[i]: 64 | j = rel_id_dict[rel] 65 | score_mat[i, j] = rel_scores[j] 66 | 67 | # compute all the terms 68 | score_mat = np.exp(score_mat * alpha + np.array(ent_scores).reshape(score_mat.shape[0], 1) * (1 - alpha)) 69 | 70 | # normalization 71 | score_mat /= np.sum(score_mat, 0) 72 | 73 | score_mat *= np.exp(rel_scores) 74 | 75 | top_sub_id, top_rel_id = np.unravel_index(np.argmax(score_mat), score_mat.shape) 76 | 77 | return [data.cand_sub[top_sub_id]], data.cand_rel[top_rel_id] 78 | 79 | def weighted_avg(data, rel_scores, ent_scores, alpha = 0.2): 80 | rel_id_dict = {data.cand_rel[i]:i for i in range(len(data.cand_rel))} 81 | # rel_scores = preprocessing.scale(rel_scores) 82 | # ent_scores = preprocessing.scale(ent_scores) 83 | 84 | score_mat = np.zeros((len(data.cand_sub), len(data.cand_rel))) 85 | for i in range(len(data.cand_sub)): 86 | for rel in data.sub_rels[i]: 87 | j = rel_id_dict[rel] 88 | score_mat[i, j] = rel_scores[j] 89 | 90 | sub_scores = alpha * np.array(ent_scores) + (1 - alpha) * np.sum(score_mat, 1) 91 | top_sub_score = np.max(sub_scores) 92 | top_sub_ids = [] 93 | for sub_id in np.argsort(sub_scores)[::-1]: 94 | if sub_scores[sub_id] < top_sub_score: 95 | break 96 | top_sub_ids.append(sub_id) 97 | 98 | top_rel = data.cand_rel[np.argmax(score_mat[top_sub_ids[0]])] 99 | top_subs = [data.cand_sub[sub_id] for sub_id in top_sub_ids] 100 | return top_subs, top_rel 101 | 102 | def rel_based(data, rel_scores): 103 | rel_scores = np.array(rel_scores) 104 | top_rel_ids = np.argsort(rel_scores) 105 | # rel_scores[top_rel_ids[:-2]] = 0 106 | # reverse rel->id dict 107 | rel_id_dict = {data.cand_rel[i]:i for i in range(len(data.cand_rel))} 108 | 109 | score_mat = 
np.zeros((len(data.cand_sub), len(data.cand_rel))) 110 | for i in range(len(data.cand_sub)): 111 | for rel in data.sub_rels[i]: 112 | j = rel_id_dict[rel] 113 | score_mat[i, j] = rel_scores[j] 114 | 115 | sub_score = np.sum(score_mat, 1) 116 | top_subscore = np.max(sub_score) 117 | top_subid = np.argmax(sub_score) 118 | top_relid = np.argmax(score_mat[top_subid]) 119 | 120 | return [data.cand_sub[top_subid]], data.cand_rel[top_relid] 121 | 122 | if __name__ == '__main__': 123 | # Parse input argument 124 | if len(sys.argv) == 3: 125 | data_fn = sys.argv[1] 126 | rel_score_fn = sys.argv[2] 127 | ent_score_fn = None 128 | elif len(sys.argv) == 4: 129 | data_fn = sys.argv[1] 130 | rel_score_fn = sys.argv[2] 131 | ent_score_fn = sys.argv[3] 132 | else: 133 | print 'Wrong arguments. Usage: ' 134 | print ' python joint_disambiguation.py cpickle_file rel_score_file ent_score_file' 135 | sys.exit(1) 136 | 137 | chosen_subs = 0 138 | total_subs = 0 139 | 140 | count_multi = 0 141 | 142 | # Error information 143 | error_dir = './error_analysis' 144 | if not os.path.exists(error_dir): 145 | os.makedirs(error_dir) 146 | category = data_fn.split('.')[0] 147 | 148 | # Load cPickle file into data 149 | data_list = pickle.load(file(data_fn, 'rb')) 150 | print >> sys.stderr, 'finish loading cpickle file %d' % (len(data_list)) 151 | 152 | rel_score_list = file(rel_score_fn, 'rb').readlines() 153 | if ent_score_fn: 154 | ent_score_list = file(ent_score_fn, 'rb').readlines() 155 | 156 | # Count the totol number of data 157 | for rel_ratio in [0, 0.75, 0.85, 0.95]: 158 | #for rel_ratio in [0]: 159 | print '=' * 120 160 | #for alpha in np.arange(0.05,1.00,0.05): 161 | for alpha in np.arange(0.05,1.01,0.05): 162 | # Rescore for each data in data_list 163 | corr_mat = np.zeros((2,2)) 164 | 165 | count = 0 166 | for idx, data in enumerate(data_list): 167 | rel_scores = [float(score) for score in rel_score_list[idx].strip().split(' ')] 168 | ent_scores = [float(score) for score in ent_score_list[idx].strip().split(' ')] 169 | # top_sub, top_rel = rel_based(data, rel_scores) 170 | # top_sub, top_rel = weighted_avg(data, rel_scores, ent_scores) 171 | # top_sub, top_rel = math(data, rel_scores, ent_scores, alpha) 172 | top_sub, top_rel = top_sub_rel(data, rel_scores, ent_scores, alpha, rel_ratio) 173 | 174 | if len(top_sub) == 1 and top_sub[0] == data.subject: 175 | if top_rel == data.relation: 176 | corr_mat[0,0] += 1 177 | else: 178 | corr_mat[0,1] += 1 179 | else: 180 | if top_rel == data.relation: 181 | corr_mat[1,0] += 1 182 | else: 183 | corr_mat[1,1] += 1 184 | 185 | print '%4.3f, %4.3f, %d' % (alpha, rel_ratio, corr_mat[0,0]) 186 | -------------------------------------------------------------------------------- /src/model/model_utils.lua: -------------------------------------------------------------------------------- 1 | -- cuda utils 2 | function cudacheck(input) 3 | if torch.Tensor():type() == 'torch.CudaTensor' then 4 | input = input:cuda() 5 | end 6 | return input 7 | end 8 | 9 | function range(b, e) 10 | local result = cudacheck(torch.LongTensor.range(torch.LongTensor(e-b+1),b,e)) 11 | return result 12 | end 13 | 14 | function randperm(up) 15 | local result = cudacheck(torch.LongTensor.randperm(torch.LongTensor(up),up)) 16 | return result 17 | end 18 | 19 | -- loading embedding 20 | function loadPretrainedEmbed (model, embedPath, renorm) 21 | local pretrainedEmbed = torch.load(embedPath) 22 | assert(model.weight:size(2) == pretrainedEmbed:size(2), 'Embedding size does not match') 23 | 
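-- note: only the first pretrainedEmbed:size(1) rows are overwritten below, so any extra
-- vocabulary entries (e.g. unk/pad tokens added on top of the pretrained word list) keep
-- whatever initialization they already have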
model.weight:narrow(1, 1, pretrainedEmbed:size(1)):copy(pretrainedEmbed) 24 | if renorm then 25 | model.weight:renorm(2, 2, 1) 26 | end 27 | end 28 | 29 | -- flatten parameters 30 | function flatten(parameters) 31 | 32 | -- returns true if tensor occupies a contiguous region of memory (no holes) 33 | local function isCompact(tensor) 34 | local sortedStride, perm = torch.sort( 35 | torch.LongTensor(tensor:nDimension()):set(tensor:stride()), 1, true) 36 | local sortedSize = torch.LongTensor(tensor:nDimension()):set( 37 | tensor:size()):index(1, perm) 38 | local nRealDim = torch.clamp(sortedStride, 0, 1):sum() 39 | sortedStride = sortedStride:narrow(1, 1, nRealDim):clone() 40 | sortedSize = sortedSize:narrow(1, 1, nRealDim):clone() 41 | local t = tensor.new():set(tensor:storage(), 1, 42 | sortedSize:storage(), 43 | sortedStride:storage()) 44 | return t:isContiguous() 45 | end 46 | 47 | if not parameters or #parameters == 0 then 48 | return torch.Tensor() 49 | end 50 | local Tensor = parameters[1].new 51 | 52 | -- 1. construct the set of all unique storages referenced by parameter tensors 53 | local storages = {} 54 | local nParameters = 0 55 | local parameterMeta = {} 56 | for k = 1,#parameters do 57 | local param = parameters[k] 58 | local storage = parameters[k]:storage() 59 | local storageKey = torch.pointer(storage) 60 | 61 | if not storages[storageKey] then 62 | storages[storageKey] = {storage, nParameters} 63 | nParameters = nParameters + storage:size() 64 | end 65 | 66 | parameterMeta[k] = {storageOffset = param:storageOffset() + 67 | storages[storageKey][2], 68 | size = param:size(), 69 | stride = param:stride()} 70 | end 71 | 72 | -- 2. construct a single tensor that will hold all the parameters 73 | local flatParameters = Tensor(nParameters):zero() 74 | 75 | -- 3. determine if there are elements in the storage that none of the 76 | -- parameter tensors reference ('holes') 77 | local tensorsCompact = true 78 | for k = 1,#parameters do 79 | local meta = parameterMeta[k] 80 | local tmp = Tensor():set( 81 | flatParameters:storage(), meta.storageOffset, meta.size, meta.stride 82 | ) 83 | tmp:fill(1) 84 | tensorsCompact = tensorsCompact and isCompact(tmp) 85 | end 86 | 87 | local maskParameters = flatParameters:byte():clone() 88 | local compactOffsets = flatParameters:long():cumsum(1) 89 | local nUsedParameters = compactOffsets[-1] 90 | 91 | -- 4. copy storages into the flattened parameter tensor 92 | for _, storageAndOffset in pairs(storages) do 93 | local storage, offset = table.unpack(storageAndOffset) 94 | flatParameters[{{offset+1,offset+storage:size()}}]:copy(Tensor():set(storage)) 95 | end 96 | 97 | -- 5. allow garbage collection 98 | storages = nil 99 | for k = 1,#parameters do 100 | parameters[k]:set(Tensor()) 101 | end 102 | 103 | -- 6. compact the flattened parameters if there were holes 104 | if nUsedParameters ~= nParameters then 105 | assert(tensorsCompact, "Cannot gather tensors that are not compact") 106 | 107 | flatParameters = Tensor(nUsedParameters):copy(flatParameters:maskedSelect(maskParameters)) 108 | for k = 1,#parameters do 109 | parameterMeta[k].storageOffset = compactOffsets[parameterMeta[k].storageOffset] 110 | end 111 | end 112 | 113 | -- 7. 
fix up the parameter tensors to point at the flattened parameters 114 | for k = 1,#parameters do 115 | parameters[k]:set(flatParameters:storage(), 116 | parameterMeta[k].storageOffset, 117 | parameterMeta[k].size, 118 | parameterMeta[k].stride) 119 | end 120 | 121 | return flatParameters 122 | end 123 | 124 | -- clone utils 125 | function combineParameters(...) 126 | --[[ like module:getParameters, but operates on many modules ]]-- 127 | 128 | -- get parameters 129 | local networks = {...} 130 | local parameters = {} 131 | local gradParameters = {} 132 | for i = 1, #networks do 133 | local net_params, net_grads = networks[i]:parameters() 134 | 135 | if net_params then 136 | for _, p in pairs(net_params) do 137 | parameters[#parameters + 1] = p 138 | end 139 | for _, g in pairs(net_grads) do 140 | gradParameters[#gradParameters + 1] = g 141 | end 142 | end 143 | end 144 | 145 | -- flatten parameters and gradients 146 | local flatParameters = flatten(parameters) 147 | local flatGradParameters = flatten(gradParameters) 148 | 149 | assert(flatParameters:nElement() == flatGradParameters:nElement(), 150 | 'check that you are sharing parameters and gradParameters') 151 | if parameters then 152 | for i = 1, #parameters do 153 | assert(parameters[i]:storageOffset() == gradParameters[i]:storageOffset(), 154 | 'misaligned parameter at ' .. tostring(i)) 155 | end 156 | end 157 | 158 | -- return new flat vector that contains all discrete parameters 159 | return flatParameters, flatGradParameters 160 | end 161 | 162 | function cloneManyTimes(net, T) 163 | local clones = {} 164 | local params, gradParams 165 | if net.parameters then 166 | params, gradParams = net:parameters() 167 | if params == nil then 168 | params = {} 169 | end 170 | end 171 | local paramsNoGrad 172 | if net.parametersNoGrad then 173 | paramsNoGrad = net:parametersNoGrad() 174 | end 175 | local mem = torch.MemoryFile("w"):binary() 176 | mem:writeObject(net) 177 | for t = 1, T do 178 | -- We need to use a new reader for each clone. 179 | -- We don't want to use the pointers to already read objects. 180 | local reader = torch.MemoryFile(mem:storage(), "r"):binary() 181 | local clone = reader:readObject() 182 | reader:close() 183 | if net.parameters then 184 | local cloneParams, cloneGradParams = clone:parameters() 185 | local cloneParamsNoGrad 186 | for i = 1, #params do 187 | cloneParams[i]:set(params[i]) 188 | cloneGradParams[i]:set(gradParams[i]) 189 | end 190 | if paramsNoGrad then 191 | cloneParamsNoGrad = clone:parametersNoGrad() 192 | for i =1,#paramsNoGrad do 193 | cloneParamsNoGrad[i]:set(paramsNoGrad[i]) 194 | end 195 | end 196 | end 197 | clones[t] = clone 198 | collectgarbage() 199 | end 200 | mem:close() 201 | return clones 202 | end 203 | -------------------------------------------------------------------------------- /RelationRNN/train_rel_rnn.lua: -------------------------------------------------------------------------------- 1 | require '..' 
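-- Objective sketch (informal): for each question q with gold relation r+ and a batch of
-- sampled negative relations r-, the training loop below minimizes a margin ranking loss
-- of the form max(0, margin - score(q, r+) + score(q, r-)) over the negatives, where
-- score(q, r) is the TripleScore similarity between the BiGRU question embedding and the
-- relation embedding. Minimal illustration of the criterion, assuming the stock Torch7
-- 'nn' package:
--   local crit = nn.MarginRankingCriterion(0.1)
--   local pos, neg = torch.Tensor{1.2}, torch.Tensor{0.9}
--   print(crit:forward({pos, neg}, torch.Tensor{1}))  --> 0, since pos - neg >= margin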
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to embed a sentence') 6 | cmd:text() 7 | cmd:text('Options') 8 | 9 | cmd:option('-vocabSize',100002,'number of words in dictionary') 10 | 11 | cmd:option('-relSize',7524,'number of relations in dictionary') 12 | cmd:option('-relEmbedSize',256,'size of rel embedding') 13 | 14 | cmd:option('-wrdEmbedSize',300,'size of word embedding') 15 | cmd:option('-wrdEmbedPath','../embedding/word.100k.glove.t7','pretained word embedding path') 16 | 17 | cmd:option('-numLayer',2,'number of RNN layers') 18 | cmd:option('-maxSeqLen',40,'number of timesteps to unroll to') 19 | cmd:option('-hiddenSize',256,'size of RNN internal state') 20 | cmd:option('-dropoutRate',0.5,'dropout rate') 21 | 22 | cmd:option('-negSize',1024,'number of negtive samples for each iteration') 23 | cmd:option('-maxEpochs',1000,'number of full passes through the training data') 24 | cmd:option('-initRange',0.08,'the range of uniformly initialize parameters') 25 | cmd:option('-costMargin',0.1,'the margin used in the ranking cost') 26 | cmd:option('-useGPU',1,'whether to use gpu for computation') 27 | 28 | cmd:option('-printEvery',100,'how many steps/minibatches between printing out the loss') 29 | cmd:option('-saveEvery',100,'how many epochs between auto save trained models') 30 | cmd:option('-saveFile','model.rel.stackBiRNN','filename to autosave the model (protos) to') 31 | cmd:option('-logFile','logs/rel.stackBiRNN.log','log file to record training information') 32 | cmd:option('-dataFile', '../data/train.relation_ranking.t7','training data file') 33 | 34 | cmd:option('-seed',123,'torch manual random number generator seed') 35 | cmd:text() 36 | 37 | ----------------------------- parse params ----------------------------- 38 | 39 | local opt = cmd:parse(arg) 40 | -- local flog = logroll.file_logger(opt.logFile) 41 | local flog = logroll.print_logger() 42 | if opt.useGPU > 0 then 43 | cutorch.setDevice(opt.useGPU) 44 | torch.setdefaulttensortype('torch.CudaTensor') 45 | end 46 | 47 | ----------------------------- define loader ----------------------------- 48 | local loader = SeqRankingLoader(opt.dataFile, opt.negSize, opt.relSize, flog) 49 | 50 | ----------------------------- define models ----------------------------- 51 | -- word embedding model 52 | local wordEmbed = cudacheck(nn.LookupTable(opt.vocabSize, opt.wrdEmbedSize)) 53 | -- loadPretrainedEmbed(wordEmbed, opt.wrdEmbedPath) 54 | 55 | -- rel embedding model 56 | -- local relEmbed = torch.load('../TransE/model.60').RelEmbed 57 | local relEmbed = cudacheck(nn.LookupTable(opt.relSize, opt.relEmbedSize)) 58 | relEmbed.weight:uniform(-opt.initRange, opt.initRange) 59 | relEmbed.weight:renorm(2, 2, 1) 60 | 61 | local posRelDrop = nn.Dropout(0.3) 62 | local negRelDrop = nn.Dropout(0.3) 63 | 64 | -- multi-layer (stacked) Bi-RNN 65 | local config = {} 66 | config.hiddenSize = opt.hiddenSize 67 | config.maxSeqLen = opt.maxSeqLen 68 | config.maxBatch = 256 69 | config.logger = flog 70 | 71 | local RNN = {} 72 | for l = 1, opt.numLayer do 73 | config.inputSize = l == 1 and opt.wrdEmbedSize or opt.hiddenSize * 2 74 | RNN[l] = BiGRU(config) 75 | end 76 | 77 | local selectLayer = BiRNNSelect() 78 | local linearLayer = nn.Linear(2 * opt.hiddenSize, opt.relEmbedSize) 79 | 80 | local seqModel = nn.Sequential() 81 | seqModel:add(wordEmbed) 82 | for l = 1, opt.numLayer do 83 | seqModel:add(nn.Dropout(opt.dropoutRate)) 84 | seqModel:add(RNN[l]) 85 | end 86 | seqModel:add(selectLayer) 87 | 
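-- shape sketch for the pipeline assembled here: word ids [seqLen x batch]
--   -> LookupTable embeddings [seqLen x batch x wrdEmbedSize]
--   -> stacked BiGRU outputs [seqLen x batch x 2*hiddenSize]
--   -> BiRNNSelect reduces the sequence to one [batch x 2*hiddenSize] vector
--      (presumably the boundary hidden states of the two directions), which the
--      Linear added next projects down to relEmbedSize for scoring against relations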
seqModel:add(linearLayer) 88 | 89 | -- ranking score model 90 | local scoreModel = TripleScore(opt.negSize) 91 | 92 | -- put all models together 93 | local model = {} 94 | model.seqModel = seqModel 95 | model.relEmbed = relEmbed 96 | model.posRelDrop = posRelDrop 97 | model.negRelDrop = negRelDrop 98 | model.scoreModel = scoreModel 99 | 100 | -- margin ranking criterion 101 | local criterion = nn.MarginRankingCriterion(opt.costMargin) 102 | 103 | -- put together parms and grad pointers in optimParams and optimGrad tables 104 | local optimParams, optimGrad = {}, {} 105 | for l = 1, opt.numLayer do 106 | local rnnParams, rnnGrad = RNN[l]:getParameters() 107 | rnnParams:uniform(-opt.initRange, opt.initRange) 108 | optimParams[l], optimGrad[l] = rnnParams, rnnGrad 109 | end 110 | optimParams[#optimParams+1], optimGrad[#optimGrad+1] = linearLayer:getParameters() 111 | 112 | -- optimization configurations [subject to change] 113 | local lrWrd, lrRel = 1e-3, 3e-4 114 | 115 | local optimConf = {['lr'] = {}, ['momentum'] = 0.3} 116 | -- local optimConf = {['lr'] = {}} 117 | for l = 1, #optimParams do optimConf['lr'][l] = 1e-3 end 118 | local optimizer = AdaGrad(optimGrad, optimConf) 119 | 120 | -- prepare for training 121 | local sumLoss, epochLoss = 0, 0 122 | local maxIters = opt.maxEpochs * loader.numBatch 123 | local ones = torch.ones(loader.batchSize, loader.negSize) 124 | 125 | -- core training loop 126 | for i = 1, maxIters do 127 | xlua.progress(i, maxIters) 128 | -- in the beginning of each loop, clean the grad_params 129 | relEmbed:zeroGradParameters() 130 | wordEmbed:zeroGradParameters() 131 | for i = 1, #optimGrad do optimGrad[i]:zero() end 132 | 133 | ----------------------- load minibatch ------------------------ 134 | local seq, pos, negs = loader:nextBatch() 135 | local currSeqLen = seq:size(1) 136 | local loss = 0 137 | 138 | ------------------------ forward pass ------------------------- 139 | -- sequence vectors [n_batch x n_dim] 140 | local seqVec = seqModel:forward(seq) 141 | 142 | -- positive vectors [n_batch x n_dim] 143 | local posVec = relEmbed:forward(pos):clone() 144 | local posDropVec = posRelDrop:forward(posVec) 145 | 146 | -- negative matrix [n_neg x n_batch x n_dim] 147 | local negMat = relEmbed:forward(negs) 148 | local negDropMat = negRelDrop:forward(negMat) 149 | 150 | -- scores table {[1] = postive_scores, [2] = negative_scores} 151 | -- local scores = scoreModel:forward({seqVec, posVec, negMat}) 152 | local scores = scoreModel:forward({seqVec, posDropVec, negDropMat}) 153 | local loss = criterion:forward(scores, ones) 154 | 155 | -- d_scores table {[1] = d_postive_scores, [2] = d_negative_scores} 156 | local d_scores = criterion:backward(scores, ones) 157 | 158 | -- d_seqVec [n_batch x n_dim], d_posVec [n_batch x n_dim], d_negMat [n_neg x n_batch x n_dim] 159 | -- local d_seqVec, d_posVec, d_negMat = unpack(scoreModel:backward({seqVec, posVec, negMat}, d_scores)) 160 | local d_seqVec, d_posDropVec, d_negDropMat = unpack(scoreModel:backward({seqVec, posDropVec, negDropMat}, d_scores)) 161 | 162 | local d_negMat = negRelDrop:backward(negMat, d_negDropMat) 163 | 164 | local d_posVec = posRelDrop:backward(posVec, d_posDropVec) 165 | 166 | -- grad due to negative matrix 167 | relEmbed:backward(negs, d_negMat) 168 | 169 | -- grad due to positive vectors 170 | relEmbed:backward(pos, d_posVec) 171 | 172 | -- grad to the sequence model 173 | -- seqModel:backward(dropedSeq, d_seqVec) 174 | seqModel:backward(seq, d_seqVec) 175 | 176 | ----------------------- parameter 
update ---------------------- 177 | -- sgd with scheduled anealing 178 | relEmbed:updateParameters(lrRel / (1 + 0.0001 * i)) 179 | 180 | -- renorm rel embeding into normal ball 181 | relEmbed.weight:renorm(2, 2, 1) 182 | 183 | -- sgd with scheduled anealing (override with sparse update) 184 | wordEmbed:updateParameters(lrWrd / (1 + 0.0001 * i)) 185 | 186 | -- adagrad for rnn, projection 187 | for l = 1, opt.numLayer do optimGrad[l]:clamp(-10, 10) end 188 | optimizer:updateParams(optimParams, optimGrad) 189 | 190 | -- accumulate loss 191 | sumLoss = sumLoss + loss 192 | epochLoss = epochLoss + loss 193 | 194 | -- scheduled anealing the momentum rate after each epoch 195 | if i % loader.numBatch == 0 then 196 | flog.info(string.format('epoch %3d, loss %6.8f', i / loader.numBatch, epochLoss / loader.numBatch / loader.negSize)) 197 | epochLoss = 0 198 | if i / loader.numBatch >= 10 then 199 | optimizer:updateMomentum(math.min(optimizer.momentum + 0.3, 0.99)) 200 | end 201 | end 202 | 203 | ------------------------ training info ------------------------ 204 | if i % opt.printEvery == 0 then 205 | flog.info(string.format("iter %4d, loss = %6.8f", i, sumLoss / opt.printEvery / opt.negSize)) 206 | sumLoss = 0 207 | end 208 | if i % (loader.numBatch * opt.saveEvery) == 0 then 209 | -- save model after each epoch 210 | local epoch = i / loader.numBatch 211 | print('saving model after epoch', epoch) 212 | torch.save(opt.saveFile..'.'..opt.useGPU..'.'..epoch, model) 213 | end 214 | end 215 | -------------------------------------------------------------------------------- /src/py_module/virtuoso.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import urllib, json 4 | import freebase 5 | 6 | # Setting global variables 7 | data_source = 'fb:' 8 | query_url = 'http://localhost:8890/sparql/' 9 | 10 | # HTTP URL is constructed accordingly with JSON query results format in mind. 
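# Usage sketch (the mid and predicate below are illustrative, not taken from this repo):
#   import virtuoso
#   rels  = virtuoso.id_query_out_rel('fb:m.0abc12')   # relations leaving the entity
#   names = virtuoso.id_query_name('fb:m.0abc12')      # processed / tokenized names
# Each helper builds a small SPARQL SELECT along the lines of
#   SELECT ?name WHERE { <fb:m.0abc12> <fb:type.object.name> ?name }
# (the predicate here is only an example), posts it to query_url asking for JSON
# results, and returns the de-duplicated binding values as a plain Python list.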
11 | def sparql_query(query, URL, format='application/json'): 12 | 13 | params={ 14 | 'default-graph': '', 15 | 'should-sponge': 'soft', 16 | 'query': query.encode('utf8'), 17 | 'debug': 'on', 18 | 'timeout': '', 19 | 'format': format, 20 | 'save': 'display', 21 | 'fname': '' 22 | } 23 | 24 | encoded_query = urllib.urlencode(params) 25 | http_response = urllib.urlopen(URL, encoded_query).read() 26 | 27 | try: 28 | json_response = json.loads(http_response) 29 | return json_response 30 | except: 31 | print >> sys.stderr, 'json load error' 32 | print >> sys.stderr, http_response 33 | return None 34 | 35 | # Using freebase mid to query its types 36 | def id_query_type(node_id): 37 | query = ''' 38 | SELECT ?type WHERE {<%s> ?type} 39 | ''' % (node_id) 40 | json_response = sparql_query(query, query_url) 41 | 42 | try: 43 | type_list = [item['type']['value'] for item in json_response['results']['bindings']] 44 | return list(set(type_list)) 45 | except: 46 | return [] 47 | 48 | # Using freebase mid to query its original cased name 49 | def id_query_en_name(node_id): 50 | query = ''' 51 | SELECT ?name WHERE {<%s> ?name} 52 | ''' % (node_id) 53 | json_response = sparql_query(query, query_url) 54 | 55 | try: 56 | name_list = [item['name']['value'] for item in json_response['results']['bindings']] 57 | return list(set(name_list)) 58 | except: 59 | return [] 60 | 61 | # Using freebase mid to query its original cased alias 62 | def id_query_en_alias(node_id): 63 | query = ''' 64 | SELECT ?alias WHERE {<%s> ?alias} 65 | ''' % (node_id) 66 | json_response = sparql_query(query, query_url) 67 | 68 | try: 69 | alias_list = [item['alias']['value'] for item in json_response['results']['bindings']] 70 | return list(set(alias_list)) 71 | except: 72 | return [] 73 | 74 | # Using freebase mid to query its processed & tokenized name 75 | def id_query_name(node_id): 76 | query = ''' 77 | SELECT ?name WHERE {<%s> ?name} 78 | ''' % (node_id) 79 | json_response = sparql_query(query, query_url) 80 | 81 | try: 82 | name_list = [item['name']['value'] for item in json_response['results']['bindings']] 83 | return list(set(name_list)) 84 | except: 85 | return [] 86 | 87 | # Using freebase mid to query its processed & tokenized alias 88 | def id_query_alias(node_id): 89 | query = ''' 90 | SELECT ?alias WHERE {<%s> ?alias} 91 | ''' % (node_id) 92 | json_response = sparql_query(query, query_url) 93 | 94 | try: 95 | alias_list = [item['alias']['value'] for item in json_response['results']['bindings']] 96 | return list(set(alias_list)) 97 | except: 98 | return [] 99 | 100 | # Using freebase mid to query its processed & tokenized name & alias 101 | def id_query_str(node_id): 102 | query = ''' 103 | SELECT ?str WHERE { {<%s> ?str} UNION {<%s> ?str} } 104 | ''' % (node_id, node_id) 105 | json_response = sparql_query(query, query_url) 106 | 107 | try: 108 | name_list = [item['str']['value'] for item in json_response['results']['bindings']] 109 | return list(set(name_list)) 110 | except: 111 | return [] 112 | # Using freebase mid to query all relations coming out of the entity 113 | def id_query_out_rel(node_id, unique = True): 114 | query = ''' 115 | SELECT ?relation WHERE {<%s> ?relation ?object} 116 | ''' % (node_id) 117 | json_response = sparql_query(query, query_url) 118 | 119 | try: 120 | relations = [str(item['relation']['value']) for item in json_response['results']['bindings']] 121 | return list(set(relations)) 122 | except: 123 | return [] 124 | 125 | # Using freebase mid to query all relations coming into the entity 126 | 
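# Illustration of the out/in distinction (the entity and relations are hypothetical):
#   id_query_out_rel(mid) matches {<mid> ?relation ?object}, i.e. relations where the
#   entity appears as the subject (e.g. a person's 'place_of_birth');
#   id_query_in_rel(mid)  matches {?subject ?relation <mid>}, i.e. relations where the
#   entity appears as the object of some other node's triple.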
def id_query_in_rel(node_id, unique = True): 127 | query = ''' 128 | SELECT ?relation WHERE {?subject ?relation <%s>} 129 | ''' % (node_id) 130 | json_response = sparql_query(query, query_url) 131 | 132 | try: 133 | relations = [str(item['relation']['value']) for item in json_response['results']['bindings']] 134 | return list(set(relations)) 135 | except: 136 | return [] 137 | 138 | 139 | # Using the name of an entity to query its freebase mid 140 | def name_query_id(name): 141 | query = ''' 142 | SELECT ?node_id WHERE {?node_id "%s"} 143 | ''' % (name) 144 | json_response = sparql_query(query, query_url) 145 | 146 | try: 147 | node_id_list = [str(item['node_id']['value']) for item in json_response['results']['bindings']] 148 | return list(set(node_id_list)) 149 | except: 150 | return [] 151 | 152 | # Using the alias of an entity to query its freebase mid 153 | def alias_query_id(alias): 154 | query = ''' 155 | SELECT ?node_id WHERE {?node_id "%s"} 156 | ''' % (alias) 157 | json_response = sparql_query(query, query_url) 158 | 159 | try: 160 | node_id_list = [str(item['node_id']['value']) for item in json_response['results']['bindings']] 161 | return list(set(node_id_list)) 162 | except: 163 | return [] 164 | 165 | # Using the alias/name of an entity to query its freebase mid 166 | def str_query_id(string): 167 | query = ''' 168 | SELECT ?node_id WHERE { {?node_id "%s"} UNION {?node_id "%s"} } 169 | ''' % (string, string) 170 | json_response = sparql_query(query, query_url) 171 | 172 | try: 173 | node_id_list = [str(item['node_id']['value']) for item in json_response['results']['bindings']] 174 | return list(set(node_id_list)) 175 | except: 176 | return [] 177 | 178 | # Using freebase mid to query all object coming out of the entity 179 | def id_query_in_entity(node_id, unique = True): 180 | query = ''' 181 | SELECT ?subject WHERE {?subject ?relation <%s>} 182 | ''' % (node_id) 183 | json_response = sparql_query(query, query_url) 184 | 185 | try: 186 | subjects = [str(item['subject']['value']) for item in json_response['results']['bindings']] 187 | return list(set(subjects)) 188 | except: 189 | return [] 190 | 191 | # Using freebase mid to query all relation coming into the entity 192 | def id_query_out_entity(node_id, unique = True): 193 | query = ''' 194 | SELECT ?object WHERE {<%s> ?relation ?object} 195 | ''' % (node_id) 196 | json_response = sparql_query(query, query_url) 197 | 198 | try: 199 | objects = [str(item['object']['value']) for item in json_response['results']['bindings']] 200 | return list(set(objects)) 201 | except: 202 | return [] 203 | 204 | # Using the subject and relation to query the corresponding object 205 | def query_object(subject, relation): 206 | query = ''' 207 | SELECT ?object WHERE {<%s> <%s> ?object} 208 | ''' % (subject, relation) 209 | json_response = sparql_query(query, query_url) 210 | 211 | try: 212 | return [str(item['object']['value']) for item in json_response['results']['bindings']] 213 | except: 214 | return [] 215 | 216 | # Using the object and relation to query the corresponding subject 217 | def query_subject(obj, relation): 218 | query = ''' 219 | SELECT ?subject WHERE {?subject <%s> <%s>} 220 | ''' % (relation, obj) 221 | json_response = sparql_query(query, query_url) 222 | 223 | try: 224 | return [str(item['subject']['value']) for item in json_response['results']['bindings']] 225 | except: 226 | return [] 227 | 228 | # Using the subject and object to query the corresponding relation 229 | def query_relation(sub, obj): 230 | query = ''' 231 | 
SELECT ?relation WHERE {<%s> ?relation <%s>} 232 | ''' % (sub, obj) 233 | json_response = sparql_query(query, query_url) 234 | 235 | try: 236 | objects = [str(item['relation']['value']) for item in json_response['results']['bindings']] 237 | return list(set(objects)) 238 | except: 239 | return [] 240 | 241 | # Check whether a node is a CVT node 242 | def check_cvt(node_id): 243 | query = ''' 244 | SELECT ?tag WHERE {<%s> ?tag} 245 | ''' % (node_id) 246 | json_response = sparql_query(query, query_url) 247 | ret = [str(item['tag']['value']) for item in json_response['results']['bindings']] 248 | 249 | if len(ret) == 1 and ret[0] == 'true': 250 | return True 251 | else: 252 | return False 253 | -------------------------------------------------------------------------------- /src/model/CRF.lua: -------------------------------------------------------------------------------- 1 | local CRF, parent = torch.class('CRF', 'nn.Module') 2 | 3 | -- initialize the module 4 | function CRF:__init(numClass, maxSeqLen, maxBatch) 5 | self.numClass = numClass 6 | self.maxSeqLen = maxSeqLen 7 | self.maxBatch = maxBatch 8 | 9 | -- pairwire parameter 10 | self.weight = torch.rand(self.numClass, self.numClass) 11 | self.gradWeight = torch.zeros(self.numClass, self.numClass) 12 | 13 | -- state memory 14 | self.alpha = torch.zeros(self.maxSeqLen, self.maxBatch, self.numClass) 15 | self.beta = torch.zeros(self.maxSeqLen, self.maxBatch, self.numClass) 16 | 17 | self.partition = torch.zeros(self.maxBatch) 18 | 19 | self.marginalU = torch.zeros(self.maxSeqLen, self.maxBatch, self.numClass) 20 | self.marginalP = torch.zeros(self.maxSeqLen - 1, self.maxBatch, self.numClass, self.numClass) 21 | 22 | self.output = torch.Tensor(maxSeqLen, self.maxBatch) 23 | self.gradInput = torch.Tensor(maxSeqLen, self.maxBatch, self.numClass) 24 | 25 | -- working memory 26 | self.tempMat = torch.zeros(self.maxBatch, self.numClass, self.numClass) 27 | self.maxVec = torch.zeros(self.maxBatch, self.numClass) 28 | 29 | self.uFactor = torch.zeros(self.maxBatch) 30 | self.pFactor = torch.zeros(self.maxBatch) 31 | self.flatLabelPair = torch.zeros(self.maxSeqLen, self.maxBatch) 32 | 33 | self.tempGradWeight = torch.zeros(self.maxSeqLen * self.maxBatch, self.numClass * self.numClass) 34 | 35 | -- helper structures 36 | self.stridePartitionVec = torch.LongStorage({0, 1, 0}) 37 | self.stridePartitionMat = torch.LongStorage({0, 1, 0, 0}) 38 | self.strideWeight = torch.LongStorage({0, 0, self.numClass, 1}) 39 | 40 | self.fullVecSize = torch.LongStorage({self.maxSeqLen, self.maxBatch, self.numClass}) 41 | self.fullMatSize = torch.LongStorage({self.maxSeqLen, self.maxBatch, self.numClass, self.numClass}) 42 | self.pairMatSize = torch.LongStorage({self.maxSeqLen - 1, self.maxBatch, self.numClass, self.numClass}) 43 | self.stepMatSize = torch.LongStorage({self.maxBatch, self.numClass, self.numClass}) 44 | 45 | -- set training flag 46 | self.train = true 47 | end 48 | 49 | function CRF:viterbi(input) 50 | local unary = input 51 | local seqLen, batchSize = unary:size(1), unary:size(2) 52 | 53 | self.fullMatSize[1], self.fullMatSize[2] = seqLen, batchSize 54 | self.fullVecSize[1], self.fullVecSize[2] = seqLen, batchSize 55 | self.stepMatSize[1] = batchSize 56 | 57 | -- resize tensor 58 | self.alpha:resize(self.fullVecSize):zero() 59 | self.beta:resize (self.fullVecSize):zero() 60 | 61 | self.tempMat:resize(self.stepMatSize) 62 | 63 | -- replicates 64 | local batchWeight = self.weight:view(1, self.numClass, self.numClass):expand(self.stepMatSize) 65 | 66 | 
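-- What the loop below does (informal sketch of max-product / Viterbi decoding): at each
-- position it combines the unary scores with the best accumulated score from the previous
-- position and, except at the final step, the pairwise transition weights; the running
-- maxima are stored in self.alpha and the arg-max back-pointers in self.beta, which the
-- second loop then follows backwards to fill self.output with the decoded label indices.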
local repUnary = unary:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 67 | local repAlpha = self.alpha:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 68 | local repBeta = self.beta:view (seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 69 | 70 | for i = 1, seqLen do 71 | self.tempMat:copy(repUnary[i]) 72 | if i ~= seqLen then 73 | self.tempMat:add(batchWeight) 74 | end 75 | if i ~= 1 then 76 | self.tempMat:add(repAlpha[i-1]) 77 | end 78 | 79 | val, idx = torch.max(self.tempMat, 2) 80 | self.alpha[i], self.beta[i] = val, idx:typeAs(self.beta[i]) 81 | end 82 | 83 | self.output:resize(seqLen, batchSize, 1):zero() 84 | 85 | self.output[seqLen] = self.beta[{seqLen, {}, 1}] 86 | for i = seqLen - 1, 1, -1 do 87 | self.output[i] = self.beta[i]:gather(2, self.output[i+1]) 88 | end 89 | 90 | self.output = self.output:view(seqLen, batchSize) 91 | 92 | return self.output 93 | end 94 | 95 | function CRF:forwardbackward(input) 96 | local unary, label = unpack(input) 97 | local seqLen, batchSize = unary:size(1), unary:size(2) 98 | 99 | self.pairMatSize[1], self.pairMatSize[2] = seqLen - 1, batchSize 100 | self.fullMatSize[1], self.fullMatSize[2] = seqLen, batchSize 101 | self.fullVecSize[1], self.fullVecSize[2] = seqLen, batchSize 102 | self.stepMatSize[1] = batchSize 103 | 104 | -- resize tensor 105 | self.alpha:resize(self.fullVecSize):zero() 106 | self.beta:resize(self.fullVecSize):zero() 107 | 108 | self.marginalU:resize(self.fullVecSize) 109 | self.marginalP:resize(self.pairMatSize) 110 | 111 | self.partition:resize(batchSize) 112 | 113 | self.tempMat:resize(self.stepMatSize) 114 | self.maxVec:resize (batchSize, self.numClass) 115 | 116 | -- replicates 117 | local fullPartitionVec = self.partition.new(self.partition:storage(), self.partition:storageOffset(), self.fullVecSize, self.stridePartitionVec) 118 | local pairPartitionMat = self.partition.new(self.partition:storage(), self.partition:storageOffset(), self.pairMatSize, self.stridePartitionMat) 119 | 120 | local pairWeight = self.weight.new(self.weight:storage(), self.weight:storageOffset(), self.pairMatSize, self.strideWeight) 121 | 122 | local batchWeight = self.weight:view(1, self.numClass, self.numClass):expand(self.stepMatSize) 123 | local transWeight = batchWeight:transpose(2,3) 124 | 125 | local repUnary = unary:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 126 | local repAlpha = self.alpha:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 127 | local repBeta = self.beta:view (seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 128 | 129 | local repMaxVec = self.maxVec:view(batchSize, 1, self.numClass):expand(self.stepMatSize) 130 | 131 | -- forward recursion [alpha] 132 | for i = 1, seqLen do 133 | self.tempMat:copy(repUnary[i]) 134 | if i ~= seqLen then 135 | self.tempMat:add(batchWeight) 136 | end 137 | if i ~= 1 then 138 | self.tempMat:add(repAlpha[i-1]) 139 | end 140 | 141 | -- log sum exp 142 | self.maxVec:max(self.tempMat, 2) 143 | self.tempMat:add(-1, repMaxVec):exp() 144 | self.alpha[i]:sum(self.tempMat, 2):log() 145 | self.alpha[i]:add(self.maxVec) 146 | end 147 | 148 | -- backward recursion [beta] 149 | for i = seqLen, 1, -1 do 150 | self.tempMat:copy(repUnary[i]) 151 | if i ~= 1 then 152 | self.tempMat:add(transWeight) 153 | end 154 | if i ~= seqLen then 155 | self.tempMat:add(repBeta[i+1]) 156 | end 157 | 158 | -- log sum exp 159 | self.maxVec:max(self.tempMat, 2) 160 | self.tempMat:add(-1, repMaxVec):exp() 161 | 
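-- (stable log-sum-exp: logsumexp(x) = max(x) + log(sum(exp(x - max(x)))); the max
--  subtracted before the exp above is added back onto self.beta[i] just below)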
self.beta[i]:sum(self.tempMat, 2):log() 162 | self.beta[i]:add(self.maxVec) 163 | end 164 | 165 | self.partition:copy(self.alpha[{seqLen, {}, 1}]) 166 | 167 | -- marginals 168 | self.marginalU:copy(unary) 169 | if seqLen >= 2 then 170 | self.marginalU[{{2, seqLen}}]:add(self.alpha[{{1, seqLen - 1}}]) 171 | self.marginalU[{{1, seqLen - 1}}]:add(self.beta [{{2, seqLen}}]) 172 | end 173 | self.marginalU:add(-1, fullPartitionVec) 174 | self.marginalU:exp() 175 | 176 | if seqLen >= 2 then 177 | self.marginalP:add(repUnary[{{1, seqLen - 1}}], repUnary[{{2, seqLen}}]:transpose(3,4)) 178 | self.marginalP:add(pairWeight) 179 | if seqLen > 2 then 180 | self.marginalP[{{2, seqLen - 1}}]:add(repAlpha[{{1, seqLen - 2}}]) 181 | self.marginalP[{{1, seqLen - 2}}]:add(repBeta [{{3, seqLen}}]:transpose(3,4)) 182 | end 183 | self.marginalP:add(-1, pairPartitionMat) 184 | self.marginalP:exp() 185 | end 186 | 187 | -- empirical probability 188 | self.output:resize(batchSize):zero() 189 | self.uFactor:resize(batchSize):zero() 190 | self.pFactor:resize(batchSize):zero() 191 | self.flatLabelPair:resize(seqLen - 1, batchSize):zero() 192 | 193 | self.uFactor:view(batchSize, 1):sum(unary:view(-1, self.numClass):gather(2, label:view(-1, 1)):view(seqLen, batchSize), 1) 194 | 195 | if seqLen >= 2 then 196 | self.flatLabelPair = (label[{{1, seqLen - 1}}] - 1) * self.numClass + label[{{2, seqLen}}] 197 | self.pFactor:view(batchSize, 1):sum(self.weight:view(1, -1):gather(2, self.flatLabelPair:view(1, -1)):view(seqLen-1, batchSize), 1) 198 | end 199 | 200 | self.output:add(self.uFactor, self.pFactor) 201 | self.output:add(-1, self.partition) 202 | self.output:exp() 203 | 204 | return self.output 205 | end 206 | 207 | function CRF:updateOutput(input) 208 | if self.train then 209 | return self:forwardbackward(input) 210 | else 211 | return self:viterbi(input) 212 | end 213 | end 214 | 215 | function CRF:backward(input) 216 | local unary, label = unpack(input) 217 | local seqLen, batchSize = unary:size(1), unary:size(2) 218 | 219 | self.gradInput:resizeAs(unary):zero() 220 | 221 | self.gradInput:view(-1, self.numClass):scatter(2, label:view(-1, 1), -1) 222 | self.gradInput:add(self.marginalU) 223 | 224 | if seqLen >= 2 then 225 | self.tempGradWeight:resize((seqLen - 1) * batchSize, self.numClass * self.numClass) 226 | 227 | self.tempGradWeight:scatter(2, self.flatLabelPair:view(-1, 1), -1) 228 | self.tempGradWeight:add(self.marginalP) 229 | 230 | self.gradWeight:view(1, self.numClass * self.numClass):sum(self.tempGradWeight, 1) 231 | end 232 | 233 | return self.gradInput 234 | end 235 | 236 | function CRF:parameters() 237 | return {self.weight}, {self.gradWeight} 238 | end 239 | -------------------------------------------------------------------------------- /src/model/BiGRU.lua: -------------------------------------------------------------------------------- 1 | local BiGRU, parent = torch.class('BiGRU', 'BiRNN') 2 | 3 | -- initialize the module 4 | function BiGRU:__init(config) 5 | parent.__init(self) 6 | 7 | -- config the model 8 | self.inputSize = config.inputSize 9 | self.hiddenSize = config.hiddenSize 10 | self.maxSeqLen = config.maxSeqLen or 200 11 | self.maxBatch = config.maxBatch or 128 12 | 13 | -- allocate weights memory 14 | self.weight = torch.Tensor(self.inputSize, self.hiddenSize*6):uniform(-1.0, 1.0) 15 | self.gradWeight = torch.Tensor(self.inputSize, self.hiddenSize*6):zero() 16 | 17 | self.bias = torch.Tensor(self.hiddenSize*6):uniform(-1.0, 1.0) 18 | self.gradBias = torch.Tensor(self.hiddenSize*6):zero() 19 
| 20 | self.recWeight_G = torch.Tensor(2, self.hiddenSize, self.hiddenSize*2):uniform(-1.0, 1.0) 21 | self.gradRecWeight_G = torch.Tensor(2, self.hiddenSize, self.hiddenSize*2):zero() 22 | 23 | self.recWeight_H = torch.Tensor(2, self.hiddenSize, self.hiddenSize):uniform(-1.0, 1.0) 24 | self.gradRecWeight_H = torch.Tensor(2, self.hiddenSize, self.hiddenSize):zero() 25 | 26 | -- allocate working memory 27 | self.gates = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*6):zero() 28 | self.resetH = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 29 | self.comple = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 30 | self.hidden = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 31 | 32 | self.gradGates = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*6):zero() 33 | self.gradInput = torch.Tensor(self.maxSeqLen, self.maxBatch, self.inputSize *2):zero() 34 | self.gradResetH = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 35 | 36 | self.buffer = torch.ones(self.maxSeqLen*self.maxBatch) 37 | 38 | -- logging information 39 | if config.logger then 40 | config.logger.info(string.rep('-', 50)) 41 | config.logger.info('BiGRU Configuration:') 42 | config.logger.info(string.format(' inputSize : %5d', self.inputSize)) 43 | config.logger.info(string.format(' hiddenSize : %5d', self.hiddenSize)) 44 | config.logger.info(string.format(' maxSeqLen : %5d', self.maxSeqLen)) 45 | config.logger.info(string.format(' maxBatch : %5d', self.maxBatch)) 46 | end 47 | 48 | end 49 | 50 | function BiGRU:updateOutput(input) 51 | assert(self.inputSize==input:size(3), 'Input size not match') 52 | local seqLen, batchSize = input:size(1), input:size(2) 53 | 54 | self.gates:resize (seqLen, batchSize, self.hiddenSize*6) 55 | self.resetH:resize(seqLen, batchSize, self.hiddenSize*2) 56 | self.comple:resize(seqLen, batchSize, self.hiddenSize*2) 57 | self.hidden:resize(seqLen, batchSize, self.hiddenSize*2) 58 | 59 | self.buffer:resize(seqLen*batchSize) 60 | 61 | self.comple:fill(1) 62 | 63 | local denseInput = input:view(seqLen*batchSize, self.inputSize) 64 | local denseGates = self.gates:view(seqLen*batchSize, self.hiddenSize*6) 65 | 66 | denseGates:addr(0, 1, self.buffer, self.bias) 67 | denseGates:addmm(1, denseInput, self.weight) 68 | 69 | for i = 1, self.nStream do 70 | -- set stream: stream 1 deals with forward-GRU & stream 2 deals with backward-GRU 71 | if cutorch then cutorch.setStream(i) end 72 | 73 | -- get traverse order (depends on the stream) 74 | local begIdx, endIdx, stride = self:traverseOrder(seqLen, i) 75 | 76 | -- compute stream memory offset 77 | local left, right = (i-1)*self.hiddenSize, i*self.hiddenSize 78 | 79 | local prevHidden 80 | 81 | for seqIdx = begIdx, endIdx, stride do 82 | -- get current memory 83 | local currGates = self.gates [{seqIdx, {}, {3*left+1, 3*right}}] 84 | local currResetH = self.resetH[{seqIdx, {}, { left+1, right}}] 85 | local currComple = self.comple[{seqIdx, {}, { left+1, right}}] 86 | local currHidden = self.hidden[{seqIdx, {}, { left+1, right}}] 87 | 88 | -- decompose currGates 89 | local preGateAct = currGates[{{}, { 1, self.hiddenSize}}] 90 | local resetGate = currGates[{{}, { self.hiddenSize+1, 2*self.hiddenSize}}] 91 | local updateGate = currGates[{{}, {2*self.hiddenSize+1, 3*self.hiddenSize}}] 92 | local bothGates = currGates[{{}, { self.hiddenSize+1, 3*self.hiddenSize}}] 93 | 94 | -- recurrent connection 95 | if seqIdx ~= begIdx then 96 | bothGates:addmm(1, 
prevHidden, self.recWeight_G[i]) 97 | end 98 | 99 | -- inplace non-linearity for reset & update (both) gates 100 | -- bothGates.nn.Sigmoid_forward(bothGates, bothGates) 101 | bothGates.THNN.Sigmoid_updateOutput(bothGates:cdata(), bothGates:cdata()) 102 | 103 | -- reset prev hidden 104 | if seqIdx ~= begIdx then 105 | currResetH:cmul(resetGate, prevHidden) 106 | preGateAct:addmm(1, currResetH, self.recWeight_H[i]) 107 | end 108 | -- preGateAct.nn.Tanh_forward(preGateAct, preGateAct) 109 | preGateAct.THNN.Tanh_updateOutput(preGateAct:cdata(), preGateAct:cdata()) 110 | 111 | -- complementary gate 112 | currComple:add(-1, updateGate) 113 | 114 | -- currect hidden 115 | currHidden:cmul(preGateAct, currComple) 116 | if seqIdx ~= begIdx then 117 | currHidden:addcmul(1, prevHidden, updateGate) 118 | end 119 | 120 | -- set prev hidden 121 | prevHidden = currHidden 122 | end 123 | end 124 | 125 | if cutorch then 126 | -- set back the stream to default stream (0): 127 | cutorch.setStream(0) 128 | 129 | -- 0 is default stream, let 0 wait for the 2 streams to complete before doing anything further 130 | cutorch.streamWaitFor(0, self.streamList) 131 | end 132 | 133 | self.output = self.hidden 134 | return self.output 135 | end 136 | 137 | function BiGRU:updateGradInput(input, gradOutput) 138 | assert(self.hiddenSize*2==gradOutput:size(gradOutput:nDimension()), 'gradOutput size not match') 139 | assert(input:size(1)==gradOutput:size(1) and input:size(2)==gradOutput:size(2), 'gradOutput and input size not match') 140 | 141 | local seqLen, batchSize = input:size(1), input:size(2) 142 | 143 | self.gradInput:resize (seqLen, batchSize, self.inputSize) 144 | self.gradGates:resize (seqLen, batchSize, self.hiddenSize*6) 145 | self.gradResetH:resize(seqLen, batchSize, self.hiddenSize*2) 146 | 147 | self.gradGates[1]:fill(0) 148 | self.gradGates[seqLen]:fill(0) 149 | 150 | for i = 1, self.nStream do 151 | -- set stream: stream 1 deals with forward-GRU & stream 2 deals with backward-GRU 152 | if cutorch then cutorch.setStream(i) end 153 | 154 | -- get traverse order (depends on the stream) 155 | local begIdx, endIdx, stride = self:traverseOrder(seqLen, i) 156 | 157 | -- compute stream memory offset 158 | local left, right = (i-1)*self.hiddenSize, i*self.hiddenSize 159 | 160 | local prevHidden, prevGradOutput 161 | 162 | for seqIdx = endIdx, begIdx, -stride do 163 | -- get current memory 164 | local currGates = self.gates [{seqIdx, {}, {3*left+1, 3*right}}] 165 | local currResetH = self.resetH[{seqIdx, {}, { left+1, right}}] 166 | local currComple = self.comple[{seqIdx, {}, { left+1, right}}] 167 | local currHidden = self.hidden[{seqIdx, {}, { left+1, right}}] 168 | 169 | local currGradGates = self.gradGates [{seqIdx, {}, {3*left+1, 3*right}}] 170 | local currGradResetH = self.gradResetH[{seqIdx, {}, { left+1, right}}] 171 | local currGradOutput = gradOutput [{seqIdx, {}, { left+1, right}}] 172 | 173 | -- decompose currGates 174 | local preGateAct = currGates[{{}, { 1, self.hiddenSize}}] 175 | local resetGate = currGates[{{}, { self.hiddenSize+1, 2*self.hiddenSize}}] 176 | local updateGate = currGates[{{}, {2*self.hiddenSize+1, 3*self.hiddenSize}}] 177 | 178 | local gradPreGateAct = currGradGates[{{}, { 1, self.hiddenSize}}] 179 | local gradResetGate = currGradGates[{{}, { self.hiddenSize+1, 2*self.hiddenSize}}] 180 | local gradUpdateGate = currGradGates[{{}, {2*self.hiddenSize+1, 3*self.hiddenSize}}] 181 | local gradBothGates = currGradGates[{{}, { self.hiddenSize+1, 3*self.hiddenSize}}] 182 | 183 | -- pre-gate 
input: d_h[t] / d_title{h}[t] 184 | gradPreGateAct:cmul(currGradOutput, currComple) 185 | -- gradPreGateAct.nn.Tanh_backward(gradPreGateAct, preGateAct, gradPreGateAct) -- inplace 186 | gradPreGateAct.THNN.Tanh_updateGradInput(preGateAct:cdata(), gradPreGateAct:cdata(), gradPreGateAct:cdata(), preGateAct:cdata()) -- inplace 187 | 188 | -- related to prev hidden 189 | if seqIdx ~= begIdx then 190 | -- set prev hidden 191 | prevHidden = self.hidden[{seqIdx-stride, {}, {left+1, right}}] 192 | 193 | -- reset prev hidden: d_h[t] / d_hat{h}[t] 194 | currGradResetH:mm(gradPreGateAct, self.recWeight_H[i]:t()) 195 | 196 | -- reset gate: d_h[t] / d_r[t] 197 | gradResetGate:cmul(currGradResetH, prevHidden) 198 | -- gradResetGate.nn.Sigmoid_backward(gradResetGate, resetGate, gradResetGate) -- inplace 199 | gradResetGate.THNN.Sigmoid_updateGradInput(resetGate:cdata(), gradResetGate:cdata(), gradResetGate:cdata(), resetGate:cdata()) -- inplace 200 | 201 | -- update gate: d_h[t] / d_z[t] 202 | gradUpdateGate:cmul(currGradOutput, prevHidden) 203 | end 204 | 205 | -- update gate: d_h[t] / d_z[t] 206 | gradUpdateGate:addcmul(-1, currGradOutput, preGateAct) 207 | -- gradUpdateGate.nn.Sigmoid_backward(gradUpdateGate, updateGate, gradUpdateGate) -- inplace 208 | gradUpdateGate.THNN.Sigmoid_updateGradInput(updateGate:cdata(), gradUpdateGate:cdata(), gradUpdateGate:cdata(), updateGate:cdata()) -- inplace 209 | 210 | -- d_h[t] / d_recWeight_H 211 | self.gradRecWeight_H[i]:addmm(1, currResetH:t(), gradPreGateAct) 212 | 213 | if seqIdx ~= begIdx then 214 | -- set prev grad hidden/output 215 | prevGradOutput = gradOutput[{seqIdx-stride, {}, {left+1, right}}] 216 | 217 | -- prev hidden: d_h[t] / d_h[t-1] 218 | prevGradOutput:addmm(1, gradBothGates, self.recWeight_G[i]:t()) 219 | prevGradOutput:addcmul(1, currGradOutput, updateGate) 220 | prevGradOutput:addcmul(1, currGradResetH, resetGate) 221 | 222 | -- d_h[t] / d_recWeight_G 223 | self.gradRecWeight_G[i]:addmm(1, prevHidden:t(), gradBothGates) 224 | end 225 | end 226 | end 227 | 228 | if cutorch then 229 | -- set back the stream to default stream (0): 230 | cutorch.setStream(0) 231 | 232 | -- 0 is default stream, let 0 wait for the 2 streams to complete before doing anything further 233 | cutorch.streamWaitFor(0, self.streamList) 234 | end 235 | 236 | local denseInput = input:view(seqLen*batchSize, self.inputSize) 237 | local denseGradInput = self.gradInput:view(seqLen*batchSize, self.inputSize) 238 | local denseGradGates = self.gradGates:view(seqLen*batchSize, self.hiddenSize*6) 239 | 240 | -- d_E / d_input 241 | denseGradInput:mm(denseGradGates, self.weight:t()) 242 | 243 | -- d_E / d_W 244 | self.gradWeight:addmm(1, denseInput:t(), denseGradGates) 245 | 246 | -- d_E / d_b 247 | self.gradBias:addmv(1, denseGradGates:t(), self.buffer) 248 | 249 | return self.gradInput 250 | end 251 | 252 | function BiGRU:parameters() 253 | return {self.weight, self.recWeight_G, self.recWeight_H, self.bias}, {self.gradWeight, self.gradRecWeight_G, self.gradRecWeight_H, self.gradBias} 254 | end 255 | -------------------------------------------------------------------------------- /KnowledgeBase/type.top-500.pkl: -------------------------------------------------------------------------------- 1 | (dp1 2 | S'fb:film.film' 3 | p2 4 | I16 5 | sS'fb:music.genre' 6 | p3 7 | I395 8 | sS'fb:base.tagit.man_made_thing' 9 | p4 10 | I225 11 | sS'fb:user.doconnor.pets.topic' 12 | p5 13 | I319 14 | sS'fb:award.hall_of_fame_inductee' 15 | p6 16 | I216 17 | sS'fb:user.narphorium.people.nndb_person' 
18 | p7 19 | I75 20 | sS'fb:music.multipart_release' 21 | p8 22 | I211 23 | sS'fb:base.academia.topic' 24 | p9 25 | I380 26 | sS'fb:base.wfilmbase.topic' 27 | p10 28 | I262 29 | sS'fb:base.cars_refactor.model' 30 | p11 31 | I236 32 | sS'fb:book.magazine' 33 | p12 34 | I315 35 | sS'fb:book.book_character' 36 | p13 37 | I190 38 | sS'fb:freebase.type_profile' 39 | p14 40 | I204 41 | sS'fb:base.webvideo.topic' 42 | p15 43 | I391 44 | sS'fb:base.prison.topic' 45 | p16 46 | I415 47 | sS'fb:base.aareas.schema.earth.citytown' 48 | p17 49 | I193 50 | sS'fb:internet.website_owner' 51 | p18 52 | I306 53 | sS'fb:base.ontologies.ontology_instance' 54 | p19 55 | I117 56 | sS'fb:travel.accommodation' 57 | p20 58 | I384 59 | sS'fb:influence.influence_node' 60 | p21 61 | I44 62 | sS'fb:base.type_ontology.abstract' 63 | p22 64 | I4 65 | sS'fb:base.americancomedy.topic' 66 | p23 67 | I256 68 | sS'fb:location.hud_foreclosure_area' 69 | p24 70 | I57 71 | sS'fb:user.alexander.philosophy.philosopher' 72 | p25 73 | I357 74 | sS'fb:base.consumermedical.medical_term' 75 | p26 76 | I218 77 | sS'fb:food.food' 78 | p27 79 | I197 80 | sS'fb:fictional_universe.person_in_fiction' 81 | p28 82 | I165 83 | sS'fb:education.field_of_study' 84 | p29 85 | I271 86 | sS'fb:sports.boxer' 87 | p30 88 | I189 89 | sS'fb:base.newyorkcity.topic' 90 | p31 91 | I367 92 | sS'fb:boats.ship' 93 | p32 94 | I259 95 | sS'fb:base.adultentertainment.adult_entertainer' 96 | p33 97 | I314 98 | sS'fb:user.robert.us_congress.topic' 99 | p34 100 | I389 101 | sS'fb:base.horseracing.topic' 102 | p35 103 | I212 104 | sS'fb:base.worldwartwo.topic' 105 | p36 106 | I268 107 | sS'fb:base.blackhistorymonth.topic' 108 | p37 109 | I180 110 | sS'fb:music.release_track' 111 | p38 112 | I15 113 | sS'fb:user.alexander.misc.murdered_person' 114 | p39 115 | I255 116 | sS'fb:base.hindisoundtracks.topic' 117 | p40 118 | I455 119 | sS'fb:business.business_location' 120 | p41 121 | I270 122 | sS'fb:medicine.risk_factor' 123 | p42 124 | I477 125 | sS'fb:royalty.monarch' 126 | p43 127 | I253 128 | sS'fb:base.type_ontology.animate' 129 | p44 130 | I6 131 | sS'fb:book.periodical' 132 | p45 133 | I133 134 | sS'fb:music.producer' 135 | p46 136 | I104 137 | sS'fb:aviation.aircraft_owner' 138 | p47 139 | I361 140 | sS'fb:dining.restaurant' 141 | p48 142 | I407 143 | sS'fb:periodicals.newspaper_circulation_area' 144 | p49 145 | I140 146 | sS'fb:medicine.condition_prevention_factors' 147 | p50 148 | I456 149 | sS'fb:base.allthingsnewyork.topic' 150 | p51 151 | I115 152 | sS'fb:sports.drafted_athlete' 153 | p52 154 | I290 155 | sS'fb:base.fictionaluniverse.topic' 156 | p53 157 | I497 158 | sS'fb:food.dish' 159 | p54 160 | I355 161 | sS'fb:base.cannes.topic' 162 | p55 163 | I202 164 | sS'fb:book.literary_series' 165 | p56 166 | I481 167 | sS'fb:biology.animal_breed' 168 | p57 169 | I459 170 | sS'fb:base.zxspectrum.zx_spectrum_program' 171 | p58 172 | I448 173 | sS'fb:military.battle' 174 | p59 175 | I171 176 | sS'fb:sports.sports_official' 177 | p60 178 | I353 179 | sS'fb:film.cinematographer' 180 | p61 181 | I93 182 | sS'fb:law.inventor' 183 | p62 184 | I235 185 | sS'fb:media_common.quotation' 186 | p63 187 | I272 188 | sS'fb:music.release' 189 | p64 190 | I20 191 | sS'fb:astronomy.star' 192 | p65 193 | I258 194 | sS'fb:medicine.disease' 195 | p66 196 | I185 197 | sS'fb:film.film_screening_venue' 198 | p67 199 | I431 200 | sS'fb:broadcast.tv_station' 201 | p68 202 | I474 203 | sS'fb:media_common.adapted_work' 204 | p69 205 | I148 206 | sS'fb:music.soundtrack' 207 | p70 208 | I98 
209 | sS'fb:award.award_nominated_work' 210 | p71 211 | I45 212 | sS'fb:theater.theater' 213 | p72 214 | I486 215 | sS'fb:theater.theater_actor' 216 | p73 217 | I126 218 | sS'fb:book.written_work' 219 | p74 220 | I28 221 | sS'fb:astronomy.celestial_object' 222 | p75 223 | I58 224 | sS'fb:base.scotland.topic' 225 | p76 226 | I487 227 | sS'fb:type.content' 228 | p77 229 | I17 230 | sS'fb:base.aareas.schema.au.local_government_area' 231 | p78 232 | I493 233 | sS'fb:people.deceased_person' 234 | p79 235 | I8 236 | sS'fb:broadcast.broadcast' 237 | p80 238 | I146 239 | sS'fb:sports.sports_team' 240 | p81 241 | I167 242 | sS'fb:fictional_universe.fictional_character_creator' 243 | p82 244 | I254 245 | sS'fb:olympics.olympic_athlete' 246 | p83 247 | I96 248 | sS'fb:dining.chef' 249 | p84 250 | I452 251 | sS'fb:biology.deceased_organism' 252 | p85 253 | I398 254 | sS'fb:architecture.museum' 255 | p86 256 | I241 257 | sS'fb:base.argumentmaps.idea' 258 | p87 259 | I441 260 | sS'fb:government.government_agency' 261 | p88 262 | I308 263 | sS'fb:base.litcentral.topic' 264 | p89 265 | I416 266 | sS'fb:base.schemastaging.context_name' 267 | p90 268 | I137 269 | sS'fb:media_common.creative_work' 270 | p91 271 | I22 272 | sS'fb:visual_art.visual_artist' 273 | p92 274 | I82 275 | sS'fb:business.industry' 276 | p93 277 | I293 278 | sS'fb:base.lgbtfilms.topic' 279 | p94 280 | I265 281 | sS'fb:biology.animal' 282 | p95 283 | I374 284 | sS'fb:broadcast.artist' 285 | p96 286 | I108 287 | sS'fb:base.crime.lawyer' 288 | p97 289 | I168 290 | sS'fb:base.popstra.celebrity' 291 | p98 292 | I150 293 | sS'fb:user.maxim75.default_domain.dbpedia_import' 294 | p99 295 | I103 296 | sS'fb:base.foodrecipes.topic' 297 | p100 298 | I339 299 | sS'fb:sports.sports_award_winner' 300 | p101 301 | I401 302 | sS'fb:film.editor' 303 | p102 304 | I92 305 | sS'fb:film.film_location' 306 | p103 307 | I260 308 | sS'fb:music.lyricist' 309 | p104 310 | I87 311 | sS'fb:base.argumentmaps.innovator' 312 | p105 313 | I417 314 | sS'fb:military.military_commander' 315 | p106 316 | I440 317 | sS'fb:tv.tv_personality' 318 | p107 319 | I97 320 | sS'fb:organization.organization' 321 | p108 322 | I32 323 | sS'fb:music.songwriter' 324 | p109 325 | I242 326 | sS'fb:base.skosbase.topic' 327 | p110 328 | I162 329 | sS'fb:film.film_set_designer' 330 | p111 331 | I228 332 | sS'fb:cvg.cvg_designer' 333 | p112 334 | I465 335 | sS'fb:visual_art.art_subject' 336 | p113 337 | I307 338 | sS'fb:people.family_member' 339 | p114 340 | I145 341 | sS'fb:projects.project_participant' 342 | p115 343 | I282 344 | sS'fb:music.artist' 345 | p116 346 | I23 347 | sS'fb:medicine.symptom' 348 | p117 349 | I403 350 | sS'fb:cricket.cricket_bowler' 351 | p118 352 | I333 353 | sS'fb:music.concert_film' 354 | p119 355 | I352 356 | sS'fb:user.alexbl.honorary_title.titled_person' 357 | p120 358 | I301 359 | sS'fb:american_football.football_coach' 360 | p121 361 | I317 362 | sS'fb:geography.body_of_water' 363 | p122 364 | I107 365 | sS'fb:cvg.cvg_publisher' 366 | p123 367 | I388 368 | sS'fb:american_football.football_player' 369 | p124 370 | I76 371 | sS'fb:user.micahsaul.advertising.advertiser' 372 | p125 373 | I489 374 | sS'fb:base.aareas.schema.administrative_area' 375 | p126 376 | I78 377 | sS'fb:education.educational_institution_campus' 378 | p127 379 | I77 380 | sS'fb:base.moscratch.topic' 381 | p128 382 | I488 383 | sS'fb:base.gayporn.topic' 384 | p129 385 | I377 386 | sS'fb:base.movies1001.topic' 387 | p130 388 | I385 389 | sS'fb:tennis.tennis_tournament_champion' 390 | p131 
391 | I408 392 | sS'fb:location.australian_local_government_area' 393 | p132 394 | I490 395 | sS'fb:cricket.cricket_player' 396 | p133 397 | I179 398 | sS'fb:base.uncommon.topic' 399 | p134 400 | I291 401 | sS'fb:music.composer' 402 | p135 403 | I49 404 | sS'fb:business.consumer_product' 405 | p136 406 | I31 407 | sS'fb:religion.deity' 408 | p137 409 | I485 410 | sS'fb:music.conductor' 411 | p138 412 | I288 413 | sS'fb:tv.tv_soundtrack' 414 | p139 415 | I330 416 | sS'fb:user.alust.default_domain.processed_with_review_queue' 417 | p140 418 | I42 419 | sS'fb:base.computerscience.topic' 420 | p141 421 | I413 422 | sS'fb:base.toronto.topic' 423 | p142 424 | I313 425 | sS'fb:theater.musical_soundtrack' 426 | p143 427 | I454 428 | sS'fb:sports.pro_athlete' 429 | p144 430 | I14 431 | sS'fb:base.todolists.topic' 432 | p145 433 | I161 434 | sS'fb:geography.lake' 435 | p146 436 | I178 437 | sS'fb:cvg.cvg_developer' 438 | p147 439 | I305 440 | sS'fb:location.us_county' 441 | p148 442 | I187 443 | sS'fb:base.vermont.topic' 444 | p149 445 | I360 446 | sS'fb:base.tagit.concept' 447 | p150 448 | I109 449 | sS'fb:base.animal_synopses.animal_synopsis' 450 | p151 451 | I445 452 | sS'fb:book.journal' 453 | p152 454 | I400 455 | sS'fb:music.record_label' 456 | p153 457 | I239 458 | sS'fb:royalty.chivalric_order_member' 459 | p154 460 | I136 461 | sS'fb:cvg.musical_game_song' 462 | p155 463 | I494 464 | sS'fb:comic_books.comic_book_series' 465 | p156 466 | I473 467 | sS'fb:architecture.venue' 468 | p157 469 | I163 470 | sS'fb:base.tagasauris.organic_object' 471 | p158 472 | I213 473 | sS'fb:base.aareas.schema.us.county' 474 | p159 475 | I188 476 | sS'fb:biology.organism_classification' 477 | p160 478 | I118 479 | sS'fb:base.sundance.topic' 480 | p161 481 | I419 482 | sS'fb:cvg.video_game_soundtrack' 483 | p162 484 | I378 485 | sS'fb:event.disaster' 486 | p163 487 | I427 488 | sS'fb:book.book' 489 | p164 490 | I29 491 | sS'fb:book.publishing_company' 492 | p165 493 | I390 494 | sS'fb:theater.theatrical_lyricist' 495 | p166 496 | I480 497 | sS'fb:base.foodrecipes.recipe_ingredient' 498 | p167 499 | I372 500 | sS'fb:education.educational_institution' 501 | p168 502 | I54 503 | sS'fb:people.measured_person' 504 | p169 505 | I12 506 | sS'fb:biology.pedigreed_animal' 507 | p170 508 | I220 509 | sS'fb:military.military_unit' 510 | p171 511 | I244 512 | sS'fb:base.animemanga.topic' 513 | p172 514 | I402 515 | sS'fb:base.rosenbaum.topic' 516 | p173 517 | I396 518 | sS'fb:user.micahsaul.advertising.advertised_thing' 519 | p174 520 | I351 521 | sS'fb:base.biblioness.bibs_topic' 522 | p175 523 | I350 524 | sS'fb:time.recurring_event' 525 | p176 526 | I286 527 | sS'fb:base.reviews.review' 528 | p177 529 | I457 530 | sS'fb:tv.tv_character' 531 | p178 532 | I125 533 | sS'fb:computer.software_developer' 534 | p179 535 | I492 536 | sS'fb:organization.membership_organization' 537 | p180 538 | I491 539 | sS'fb:base.duiattorneys.topic' 540 | p181 541 | I484 542 | sS'fb:base.holocaust.topic' 543 | p182 544 | I469 545 | sS'fb:user.tsegaran.random.taxonomy_subject' 546 | p183 547 | I302 548 | sS'fb:film.film_production_designer' 549 | p184 550 | I130 551 | sS'fb:base.biblioness.bibs_location' 552 | p185 553 | I261 554 | sS'fb:base.tagit.place' 555 | p186 556 | I329 557 | sS'fb:base.activism.topic' 558 | p187 559 | I237 560 | sS'fb:award.award_winning_work' 561 | p188 562 | I55 563 | sS'fb:law.judge' 564 | p189 565 | I334 566 | sS'fb:religion.place_of_worship' 567 | p190 568 | I217 569 | sS'fb:business.defunct_company' 570 | p191 571 
| I251 572 | sS'fb:base.petbreeds.topic' 573 | p192 574 | I471 575 | sS'fb:book.book_subject' 576 | p193 577 | I80 578 | sS'fb:theater.theater_director' 579 | p194 580 | I311 581 | sS'fb:media_common.media_genre' 582 | p195 583 | I327 584 | sS'fb:basketball.basketball_player' 585 | p196 586 | I110 587 | sS'fb:medicine.drug' 588 | p197 589 | I156 590 | sS'fb:medicine.icd_9_cm_classification' 591 | p198 592 | I336 593 | sS'fb:base.ottawa.topic' 594 | p199 595 | I392 596 | sS'fb:book.translated_work' 597 | p200 598 | I304 599 | sS'fb:base.engineering.engineering_person' 600 | p201 601 | I434 602 | sS'fb:tv.tv_writer' 603 | p202 604 | I124 605 | sS'fb:music.engineer' 606 | p203 607 | I273 608 | sS'fb:geography.island' 609 | p204 610 | I246 611 | sS'fb:architecture.house' 612 | p205 613 | I447 614 | sS'fb:basketball.basketball_coach' 615 | p206 616 | I346 617 | sS'fb:medicine.manufactured_drug_form' 618 | p207 619 | I113 620 | sS'fb:people.person' 621 | p208 622 | I7 623 | sS'fb:protected_sites.listed_site' 624 | p209 625 | I90 626 | sS'fb:base.americancivilwar.military_unit' 627 | p210 628 | I383 629 | sS'fb:music.composition' 630 | p211 631 | I48 632 | sS'fb:award.award_nominee' 633 | p212 634 | I27 635 | sS'fb:base.morelaw.canadian_lawyer' 636 | p213 637 | I463 638 | sS'fb:base.schemastaging.theater_production_extra' 639 | p214 640 | I425 641 | sS'fb:business.brand' 642 | p215 643 | I269 644 | sS'fb:base.ttiff.topic' 645 | p216 646 | I364 647 | sS'fb:base.prison.prisoner' 648 | p217 649 | I472 650 | sS'fb:base.karlovyvaryinternationalfilmfestival.topic' 651 | p218 652 | I194 653 | sS'fb:base.ukparliament.topic' 654 | p219 655 | I453 656 | sS'fb:base.argumentmaps.topic' 657 | p220 658 | I300 659 | sS'fb:award.competitor' 660 | p221 661 | I279 662 | sS'fb:education.university' 663 | p222 664 | I86 665 | sS'fb:visual_art.artwork' 666 | p223 667 | I119 668 | sS'fb:base.yalebase.person' 669 | p224 670 | I284 671 | sS'fb:base.usnris.nris_listing' 672 | p225 673 | I101 674 | sS'fb:base.marchmadness.topic' 675 | p226 676 | I451 677 | sS'fb:aviation.airport' 678 | p227 679 | I160 680 | sS'fb:architecture.architectural_structure_owner' 681 | p228 682 | I442 683 | sS'fb:base.academyawards.topic' 684 | p229 685 | I338 686 | sS'fb:biology.organism' 687 | p230 688 | I195 689 | sS'fb:symbols.name_source' 690 | p231 691 | I157 692 | sS'fb:sports.sports_facility' 693 | p232 694 | I277 695 | sS'fb:transportation.bridge' 696 | p233 697 | I257 698 | sS'fb:base.rosetta.languoid' 699 | p234 700 | I149 701 | sS'fb:base.moscratch.shce021709' 702 | p235 703 | I495 704 | sS'fb:base.x2010fifaworldcupsouthafrica.topic' 705 | p236 706 | I414 707 | sS'fb:sports.sports_team_location' 708 | p237 709 | I295 710 | sS'fb:film.film_costumer_designer' 711 | p238 712 | I177 713 | sS'fb:tv.tv_program_guest' 714 | p239 715 | I198 716 | sS'fb:organization.organization_sector' 717 | p240 718 | I341 719 | sS'fb:film.film_casting_director' 720 | p241 721 | I275 722 | sS'fb:sports.golfer' 723 | p242 724 | I233 725 | sS'fb:base.skosbase.skos_concept' 726 | p243 727 | I276 728 | sS'fb:aviation.aircraft_model' 729 | p244 730 | I373 731 | sS'fb:base.setrakian.topic' 732 | p245 733 | I412 734 | sS'fb:geography.geographical_feature' 735 | p246 736 | I70 737 | sS'fb:law.invention' 738 | p247 739 | I393 740 | sS'fb:user.benvvalk.default_domain.moby_output_descriptor' 741 | p248 742 | I418 743 | sS'fb:tv.tv_actor' 744 | p249 745 | I38 746 | sS'fb:medicine.notable_person_with_medical_condition' 747 | p250 748 | I287 749 | 
sS'fb:freebase.user_profile' 750 | p251 751 | I370 752 | sS'fb:music.musical_group' 753 | p252 754 | I52 755 | sS'fb:film.person_or_entity_appearing_in_film' 756 | p253 757 | I50 758 | sS'fb:comic_books.comic_book_character' 759 | p254 760 | I181 761 | sS'fb:medicine.hospital' 762 | p255 763 | I475 764 | sS'fb:base.nobelprizes.nobel_prize_winner' 765 | p256 766 | I432 767 | sS'fb:computer.software' 768 | p257 769 | I186 770 | sS'fb:base.popstra.sww_base' 771 | p258 772 | I142 773 | sS'fb:fictional_universe.fictional_setting' 774 | p259 775 | I387 776 | sS'fb:base.kwebbase.kwtopic' 777 | p260 778 | I285 779 | sS'fb:base.culturalevent.event' 780 | p261 781 | I116 782 | sS'fb:base.type_ontology.agent' 783 | p262 784 | I2 785 | sS'fb:fictional_universe.work_of_fiction' 786 | p263 787 | I88 788 | sS'fb:book.published_work' 789 | p264 790 | I192 791 | sS'fb:military.military_person' 792 | p265 793 | I67 794 | sS'fb:freebase.equivalent_topic' 795 | p266 796 | I422 797 | sS'fb:media_common.quotation_subject' 798 | p267 799 | I375 800 | sS'fb:book.newspaper' 801 | p268 802 | I176 803 | sS'fb:base.wikipedia_infobox.video_game' 804 | p269 805 | I95 806 | sS'fb:soccer.football_team_manager' 807 | p270 808 | I274 809 | sS'fb:interests.collectable_item' 810 | p271 811 | I482 812 | sS'fb:base.thoroughbredracing.thoroughbred_racehorse' 813 | p272 814 | I232 815 | sS'fb:book.short_story' 816 | p273 817 | I312 818 | sS'fb:tv.tv_series_episode' 819 | p274 820 | I59 821 | sS'fb:geography.river' 822 | p275 823 | I129 824 | sS'fb:education.academic' 825 | p276 826 | I105 827 | sS'fb:tv.tv_program_creator' 828 | p277 829 | I144 830 | sS'fb:base.jewlib.topic' 831 | p278 832 | I323 833 | sS'fb:music.single' 834 | p279 835 | I26 836 | sS'fb:film.writer' 837 | p280 838 | I41 839 | sS'fb:user.sandos.common_sense.pet' 840 | p281 841 | I439 842 | sS'fb:education.school' 843 | p282 844 | I111 845 | sS'fb:base.crime.convicted_criminal' 846 | p283 847 | I294 848 | sS'fb:location.capital_of_administrative_division' 849 | p284 850 | I151 851 | sS'fb:music.featured_artist' 852 | p285 853 | I182 854 | sS'fb:location.statistical_region' 855 | p286 856 | I21 857 | sS'fb:media_common.cataloged_instance' 858 | p287 859 | I39 860 | sS'fb:exhibitions.exhibition_subject' 861 | p288 862 | I424 863 | sS'fb:broadcast.content' 864 | p289 865 | I234 866 | sS'fb:base.summermovies2009.topic' 867 | p290 868 | I411 869 | sS'fb:theater.theater_character' 870 | p291 871 | I359 872 | sS'fb:base.fight.sports_official' 873 | p292 874 | I344 875 | sS'fb:music.album' 876 | p293 877 | I10 878 | sS'fb:base.wfilmbase.film' 879 | p294 880 | I263 881 | sS'fb:government.governmental_jurisdiction' 882 | p295 883 | I219 884 | sS'fb:base.zxspectrum.topic' 885 | p296 886 | I325 887 | sS'fb:military.military_conflict' 888 | p297 889 | I120 890 | sS'fb:internet.website' 891 | p298 892 | I322 893 | sS'fb:film.production_company' 894 | p299 895 | I498 896 | sS'fb:award.competition' 897 | p300 898 | I326 899 | sS'fb:sports.school_sports_team' 900 | p301 901 | I460 902 | sS'fb:base.schemastaging.person_extra' 903 | p302 904 | I206 905 | sS'fb:base.sfiff.topic' 906 | p303 907 | I164 908 | sS'fb:base.schemastaging.drug_extra' 909 | p304 910 | I354 911 | sS'fb:location.hud_county_place' 912 | p305 913 | I66 914 | sS'fb:business.business_operation' 915 | p306 916 | I62 917 | sS'fb:language.human_language' 918 | p307 919 | I143 920 | sS'fb:business.issuer' 921 | p308 922 | I174 923 | sS'fb:government.political_party' 924 | p309 925 | I247 926 | 
sS'fb:architecture.structure' 927 | p310 928 | I53 929 | sS'fb:organization.organization_member' 930 | p311 931 | I196 932 | sS'fb:type.user' 933 | p312 934 | I371 935 | sS'fb:business.issue' 936 | p313 937 | I208 938 | sS'fb:base.kwebbase.kwconnection' 939 | p314 940 | I281 941 | sS'fb:film.film_character' 942 | p315 943 | I63 944 | sS'fb:type.type' 945 | p316 946 | I205 947 | sS'fb:soccer.football_player' 948 | p317 949 | I35 950 | sS'fb:architecture.architect' 951 | p318 952 | I154 953 | sS'fb:transportation.road' 954 | p319 955 | I222 956 | sS'fb:medicine.physician' 957 | p320 958 | I243 959 | sS'fb:user.sandos.common_sense.common_sense_organism' 960 | p321 961 | I438 962 | sS'fb:film.film_subject' 963 | p322 964 | I191 965 | sS'fb:sports.sports_championship_event' 966 | p323 967 | I406 968 | sS'fb:type.namespace' 969 | p324 970 | I297 971 | sS'fb:base.nobelprizes.topic' 972 | p325 973 | I342 974 | sS'fb:music.release_component' 975 | p326 976 | I127 977 | sS'fb:projects.project_focus' 978 | p327 979 | I61 980 | sS'fb:base.crime.topic' 981 | p328 982 | I139 983 | sS'fb:film.music_contributor' 984 | p329 985 | I99 986 | sS'fb:business.employer' 987 | p330 988 | I36 989 | sS'fb:music.guitarist' 990 | p331 991 | I173 992 | sS'fb:base.fblinux.topic' 993 | p332 994 | I496 995 | sS'fb:base.schemastaging.tv_actor_extra' 996 | p333 997 | I343 998 | sS'fb:tennis.tennis_player' 999 | p334 1000 | I199 1001 | sS'fb:business.product_category' 1002 | p335 1003 | I345 1004 | sS'fb:base.skosbase.vocabulary_equivalent_topic' 1005 | p336 1006 | I152 1007 | sS'fb:theater.theater_producer' 1008 | p337 1009 | I362 1010 | sS'fb:base.pornactresses.topic' 1011 | p338 1012 | I468 1013 | sS'fb:government.u_s_congressperson' 1014 | p339 1015 | I122 1016 | sS'fb:cvg.game_version' 1017 | p340 1018 | I89 1019 | sS'fb:location.dated_location' 1020 | p341 1021 | I19 1022 | sS'fb:film.actor' 1023 | p342 1024 | I11 1025 | sS'fb:fictional_universe.fictional_character' 1026 | p343 1027 | I46 1028 | sS'fb:base.americancivilwar.topic' 1029 | p344 1030 | I223 1031 | sS'fb:martial_arts.martial_artist' 1032 | p345 1033 | I159 1034 | sS'fb:wine.wine' 1035 | p346 1036 | I252 1037 | sS'fb:user.zsi_editorial.editorial.base_topic' 1038 | p347 1039 | I278 1040 | sS'fb:base.popstra.topic' 1041 | p348 1042 | I141 1043 | sS'fb:base.schemastaging.non_profit_extra' 1044 | p349 1045 | I250 1046 | sS'fb:book.book_edition' 1047 | p350 1048 | I51 1049 | sS'fb:religion.religious_leader' 1050 | p351 1051 | I356 1052 | sS'fb:base.consumermedical.disease' 1053 | p352 1054 | I221 1055 | sS'fb:symbols.namesake' 1056 | p353 1057 | I132 1058 | sS'fb:user.skud.names.topic' 1059 | p354 1060 | I324 1061 | sS'fb:user.micahsaul.advertising.ad_campaign' 1062 | p355 1063 | I470 1064 | sS'fb:location.australian_suburb' 1065 | p356 1066 | I464 1067 | sS'fb:people.ethnicity' 1068 | p357 1069 | I292 1070 | sS'fb:base.filmnoir.topic' 1071 | p358 1072 | I349 1073 | sS'fb:tv.tv_producer' 1074 | p359 1075 | I112 1076 | sS'fb:base.schemastaging.aircraft_model_extra' 1077 | p360 1078 | I436 1079 | sS'fb:base.fight.topic' 1080 | p361 1081 | I310 1082 | sS'fb:base.washingtondc.topic' 1083 | p362 1084 | I368 1085 | sS'fb:base.x2010fifaworldcupsouthafrica.world_cup_participant' 1086 | p363 1087 | I426 1088 | sS'fb:award.award_category' 1089 | p364 1090 | I331 1091 | sS'fb:base.vancouver.topic' 1092 | p365 1093 | I320 1094 | sS'fb:base.thoroughbredracing.topic' 1095 | p366 1096 | I203 1097 | sS'fb:biology.gene' 1098 | p367 1099 | I369 1100 | 
sS'fb:base.aareas.schema.england.civil_parish' 1101 | p368 1102 | I267 1103 | sS'fb:base.schemastaging.government_position_held_extra' 1104 | p369 1105 | I435 1106 | sS'fb:award.ranked_item' 1107 | p370 1108 | I85 1109 | sS'fb:base.type_ontology.physically_instantiable' 1110 | p371 1111 | I3 1112 | sS'fb:film.producer' 1113 | p372 1114 | I43 1115 | sS'fb:common.topic' 1116 | p373 1117 | I0 1118 | sS'fb:geography.mountain' 1119 | p374 1120 | I135 1121 | sS'fb:theater.theater_production' 1122 | p375 1123 | I332 1124 | sS'fb:people.cause_of_death' 1125 | p376 1126 | I443 1127 | sS'fb:base.ireland.topic' 1128 | p377 1129 | I478 1130 | sS'fb:base.horseracing.racehorse' 1131 | p378 1132 | I238 1133 | sS'fb:protected_sites.protected_site' 1134 | p379 1135 | I231 1136 | sS'fb:automotive.model' 1137 | p380 1138 | I227 1139 | sS'fb:music.recording' 1140 | p381 1141 | I13 1142 | sS'fb:film.director' 1143 | p382 1144 | I40 1145 | sS'fb:organization.organization_founder' 1146 | p383 1147 | I91 1148 | sS'fb:soccer.football_team' 1149 | p384 1150 | I335 1151 | sS'fb:base.australianpolitics.topic' 1152 | p385 1153 | I467 1154 | sS'fb:astronomy.star_system_body' 1155 | p386 1156 | I69 1157 | sS'fb:architecture.building' 1158 | p387 1159 | I73 1160 | sS'fb:astronomy.astronomical_discovery' 1161 | p388 1162 | I64 1163 | sS'fb:base.saints.topic' 1164 | p389 1165 | I429 1166 | sS'fb:opera.opera' 1167 | p390 1168 | I420 1169 | sS'fb:medicine.drug_formulation' 1170 | p391 1171 | I81 1172 | sS'fb:base.fashionmodels.fashion_model' 1173 | p392 1174 | I423 1175 | sS'fb:base.adultentertainment.topic' 1176 | p393 1177 | I266 1178 | sS'fb:travel.travel_destination' 1179 | p394 1180 | I172 1181 | sS'fb:architecture.skyscraper' 1182 | p395 1183 | I309 1184 | sS'fb:base.yalebase.topic' 1185 | p396 1186 | I264 1187 | sS'fb:book.author' 1188 | p397 1189 | I25 1190 | sS'fb:computer.computer_scientist' 1191 | p398 1192 | I499 1193 | sS'fb:biology.owned_animal' 1194 | p399 1195 | I224 1196 | sS'fb:base.americancomedy.comedian' 1197 | p400 1198 | I316 1199 | sS'fb:base.myspace.myspace_user' 1200 | p401 1201 | I106 1202 | sS'fb:film.film_story_contributor' 1203 | p402 1204 | I79 1205 | sS'fb:sports.sports_team_coach' 1206 | p403 1207 | I404 1208 | sS'fb:astronomy.asteroid' 1209 | p404 1210 | I65 1211 | sS'fb:base.argumentmaps.original_idea' 1212 | p405 1213 | I405 1214 | sS'fb:music.writer' 1215 | p406 1216 | I47 1217 | sS'fb:base.plopquiz.topic' 1218 | p407 1219 | I379 1220 | sS'fb:medicine.drug_ingredient' 1221 | p408 1222 | I229 1223 | sS'fb:media_common.adaptation' 1224 | p409 1225 | I102 1226 | sS'fb:astronomy.orbital_relationship' 1227 | p410 1228 | I60 1229 | sS'fb:user.doconnor.pets.pet' 1230 | p411 1231 | I446 1232 | sS'fb:baseball.baseball_player' 1233 | p412 1234 | I71 1235 | sS'fb:base.atheism.atheist' 1236 | p413 1237 | I394 1238 | sS'fb:book.periodical_publisher' 1239 | p414 1240 | I366 1241 | sS'fb:base.berlininternationalfilmfestival.topic' 1242 | p415 1243 | I134 1244 | sS'fb:base.ovguide.topic' 1245 | p416 1246 | I184 1247 | sS'fb:government.government_office_or_title' 1248 | p417 1249 | I483 1250 | sS'fb:base.schemastaging.athlete_extra' 1251 | p418 1252 | I94 1253 | sS'fb:medicine.medical_treatment' 1254 | p419 1255 | I215 1256 | sS'fb:base.objectionablecontent.flagged_content' 1257 | p420 1258 | I348 1259 | sS'fb:tv.tv_director' 1260 | p421 1261 | I170 1262 | sS'fb:base.schemastaging.food_extra' 1263 | p422 1264 | I458 1265 | sS'fb:base.schemastaging.topic' 1266 | p423 1267 | I131 1268 | sS'fb:tv.tv_network' 
1269 | p424 1270 | I462 1271 | sS'fb:business.consumer_company' 1272 | p425 1273 | I397 1274 | sS'fb:base.argentina.topic' 1275 | p426 1276 | I430 1277 | sS'fb:base.gayporn.gay_porn' 1278 | p427 1279 | I381 1280 | sS'fb:people.place_of_interment' 1281 | p428 1282 | I382 1283 | sS'fb:metropolitan_transit.transit_line' 1284 | p429 1285 | I358 1286 | sS'fb:travel.tourist_attraction' 1287 | p430 1288 | I114 1289 | sS'fb:base.schemastaging.contact_product' 1290 | p431 1291 | I461 1292 | sS'fb:base.performer.topic' 1293 | p432 1294 | I433 1295 | sS'fb:user.zsi_editorial.editorial.topic' 1296 | p433 1297 | I245 1298 | sS'fb:theater.play' 1299 | p434 1300 | I248 1301 | sS'fb:organization.non_profit_organization' 1302 | p435 1303 | I175 1304 | sS'fb:award.award_winner' 1305 | p436 1306 | I30 1307 | sS'fb:internet.social_network_user' 1308 | p437 1309 | I56 1310 | sS'fb:celebrities.celebrity' 1311 | p438 1312 | I226 1313 | sS'fb:user.tsegaran.random.topic' 1314 | p439 1315 | I476 1316 | sS'fb:time.event' 1317 | p440 1318 | I83 1319 | sS'fb:base.saints.saint' 1320 | p441 1321 | I328 1322 | sS'fb:astronomy.astronomer' 1323 | p442 1324 | I410 1325 | sS'fb:location.census_designated_place' 1326 | p443 1327 | I158 1328 | sS'fb:location.citytown' 1329 | p444 1330 | I24 1331 | sS'fb:base.schemastaging.organization_extra' 1332 | p445 1333 | I155 1334 | sS'fb:base.americancivilwar.regiment' 1335 | p446 1336 | I363 1337 | sS'fb:base.engineering.topic' 1338 | p447 1339 | I296 1340 | sS'fb:base.activism.activist' 1341 | p448 1342 | I210 1343 | sS'fb:music.group_member' 1344 | p449 1345 | I33 1346 | sS'fb:food.ingredient' 1347 | p450 1348 | I289 1349 | sS'fb:location.location' 1350 | p451 1351 | I9 1352 | sS'fb:government.political_district' 1353 | p452 1354 | I337 1355 | sS'fb:location.neighborhood' 1356 | p453 1357 | I166 1358 | sS'fb:base.rosetta.local_name' 1359 | p454 1360 | I450 1361 | sS'fb:base.frameline.topic' 1362 | p455 1363 | I321 1364 | sS'fb:film.film_crewmember' 1365 | p456 1366 | I121 1367 | sS'fb:location.administrative_division' 1368 | p457 1369 | I74 1370 | sS'fb:user.jg.default_domain.racehorse' 1371 | p458 1372 | I299 1373 | sS'fb:metropolitan_transit.transit_stop' 1374 | p459 1375 | I318 1376 | sS'fb:location.postal_code' 1377 | p460 1378 | I138 1379 | sS'fb:base.folklore.topic' 1380 | p461 1381 | I399 1382 | sS'fb:user.doconnor.pets.horse' 1383 | p462 1384 | I444 1385 | sS'fb:base.cinemainspector.person_sign' 1386 | p463 1387 | I428 1388 | sS'fb:common.resource' 1389 | p464 1390 | I214 1391 | sS'fb:base.type_ontology.inanimate' 1392 | p465 1393 | I5 1394 | sS'fb:theater.theatrical_composer' 1395 | p466 1396 | I466 1397 | sS'fb:base.rugby.rugby_player' 1398 | p467 1399 | I200 1400 | sS'fb:royalty.noble_person' 1401 | p468 1402 | I123 1403 | sS'fb:government.politician' 1404 | p469 1405 | I37 1406 | sS'fb:common.image' 1407 | p470 1408 | I18 1409 | sS'fb:broadcast.radio_station' 1410 | p471 1411 | I169 1412 | sS'fb:base.services.topic' 1413 | p472 1414 | I365 1415 | sS'fb:business.board_member' 1416 | p473 1417 | I128 1418 | sS'fb:base.tagit.organic_thing' 1419 | p474 1420 | I207 1421 | sS'fb:ice_hockey.hockey_player' 1422 | p475 1423 | I153 1424 | sS'fb:base.tagit.topic' 1425 | p476 1426 | I386 1427 | sS'fb:aviation.airline' 1428 | p477 1429 | I376 1430 | sS'fb:base.type_ontology.non_agent' 1431 | p478 1432 | I1 1433 | sS'fb:organization.endowed_organization' 1434 | p479 1435 | I249 1436 | sS'fb:media_common.netflix_title' 1437 | p480 1438 | I34 1439 | sS'fb:type.property' 1440 | p481 1441 
| I298 1442 | sS'fb:user.sandos.common_sense.topic' 1443 | p482 1444 | I437 1445 | sS'fb:base.disneyana.topic' 1446 | p483 1447 | I479 1448 | sS'fb:education.school_district' 1449 | p484 1450 | I449 1451 | sS'fb:base.schemastaging.sports_team_extra' 1452 | p485 1453 | I240 1454 | sS'fb:location.uk_civil_parish' 1455 | p486 1456 | I209 1457 | sS'fb:base.yemebase.topic' 1458 | p487 1459 | I409 1460 | sS'fb:user.narphorium.people.topic' 1461 | p488 1462 | I72 1463 | sS'fb:organization.organization_scope' 1464 | p489 1465 | I280 1466 | sS'fb:cvg.computer_videogame' 1467 | p490 1468 | I84 1469 | sS'fb:travel.hotel' 1470 | p491 1471 | I421 1472 | sS'fb:base.catalog.cataloged_composition' 1473 | p492 1474 | I347 1475 | sS'fb:base.usnris.topic' 1476 | p493 1477 | I100 1478 | sS'fb:common.webpage' 1479 | p494 1480 | I201 1481 | sS'fb:tv.tv_series_season' 1482 | p495 1483 | I183 1484 | sS'fb:film.film_art_director' 1485 | p496 1486 | I147 1487 | sS'fb:tv.tv_program' 1488 | p497 1489 | I68 1490 | sS'fb:chemistry.chemical_compound' 1491 | p498 1492 | I230 1493 | sS'fb:people.profession' 1494 | p499 1495 | I283 1496 | sS'fb:base.disaster2.topic' 1497 | p500 1498 | I340 1499 | sS'fb:base.fashionmodels.topic' 1500 | p501 1501 | I303 1502 | s. --------------------------------------------------------------------------------
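/KnowledgeBase/type.top-500.pkl above is a protocol-0 (plain-text) pickle of a single dict mapping what appear to be the 500 most frequent Freebase type URIs (e.g. 'fb:people.person') to integer indices in [0, 499]; these indices seem to line up with the 501-dimensional multi-label type targets built elsewhere in the repo (500 types plus one extra slot). Below is a minimal inspection sketch, not part of the repository; the script assumes the repository root as the working directory, and the printed values are taken from the dump above.

    import pickle

    # type.top-500.pkl is a protocol-0 pickle written by Python 2; it loads as a
    # plain {type_uri: index} dict in both Python 2 and Python 3 (ASCII keys only).
    with open('KnowledgeBase/type.top-500.pkl', 'rb') as f:
        type2idx = pickle.load(f)

    print(len(type2idx))                  # 500 distinct Freebase types
    print(type2idx['fb:people.person'])   # 7, per the dump above
    print(type2idx['fb:common.topic'])    # 0, per the dump above

    # Invert the mapping when an index needs to be turned back into a type URI.
    idx2type = {idx: t for t, idx in type2idx.items()}
    print(idx2type[0])                    # 'fb:common.topic'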
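For reference, the backward pass at the end of src/model/BiGRU.lua above (the gradPreGateAct / gradResetGate / gradUpdateGate updates shown before /KnowledgeBase/type.top-500.pkl) differentiates the standard GRU recurrence. The equations below are a reconstruction from those gradient terms, with symbol names chosen for exposition rather than taken from the code; the truncated comment fragment "d_title{h}[t]" at the start of the excerpt reads as "d_tilde{h}[t]", i.e. the candidate activation.

    z_t         = \sigma(W_z x_t + U_z h_{t-1} + b_z)
    r_t         = \sigma(W_r x_t + U_r h_{t-1} + b_r)
    \tilde{h}_t = \tanh(W_h x_t + U_h (r_t \odot h_{t-1}) + b_h)
    h_t         = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t

Under this formulation, \partial h_t / \partial z_t = h_{t-1} - \tilde{h}_t (the two gradUpdateGate updates), \partial h_t / \partial \tilde{h}_t = 1 - z_t (the currComple factor), and \partial h_t / \partial h_{t-1} accumulates the three terms added to prevGradOutput: the gate pre-activation path through recWeight_G, the direct z_t \odot h_{t-1} path, and the r_t \odot h_{t-1} path inside the tanh. The equations describe one direction; the two directions' gate pre-activations are packed side by side, which appears to be why the dense gate buffer is hiddenSize*6 wide.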