├── FocusedLabeling ├── process_inference.lua ├── generate_inference_data.py ├── infer_crf.lua ├── test_crf.lua └── train_crf.lua ├── src ├── model │ ├── TripleScore.lua │ ├── BatchDot.lua │ ├── BiRNN.lua │ ├── BiRNNSelect.lua │ ├── Linear.lua │ ├── model_utils.lua │ ├── CRF.lua │ └── BiGRU.lua ├── data │ ├── Vocab.lua │ ├── SeqLabelRankLoader.lua │ ├── SeqLabelingLoader.lua │ ├── SeqMultiLabelLoader.lua │ ├── RankingDataLoader.lua │ └── SeqRankingLoader.lua ├── optim │ ├── AdaGrad.lua │ └── SGD.lua └── py_module │ ├── QAData.py │ ├── freebase.py │ └── virtuoso.py ├── EntityTypeVec ├── process_inference.lua ├── test_ent_typevec.lua ├── infer_ent_typevec.lua └── train_ent_typevec.lua ├── vocab └── create_vocab.lua ├── process.lua ├── RelationRNN ├── process_inference.lua ├── infer_rel_rnn.lua └── train_rel_rnn.lua ├── init.lua ├── .gitignore ├── Inference ├── valid │ └── run.sh ├── generate_score_data.py ├── test │ └── run.sh ├── joint_predict.py ├── query_candidates.py └── joint_disambiguation.py ├── KnowledgeBase ├── convert.py └── type.top-500.pkl ├── data_preprocess.sh ├── Virtuoso.md ├── README.md └── SimpleQuestions ├── generate_training_data.py └── PreprocessData └── process_rawdata.py /FocusedLabeling/process_inference.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text('Options') 5 | cmd:option('-testSplit','valid','use which data split set') 6 | cmd:text() 7 | 8 | local opt = cmd:parse(arg) 9 | 10 | local wordVocab = torch.load('../vocab/vocab.word.t7') 11 | 12 | local txtPath = string.format('inference-data/label.%s.txt', opt.testSplit) 13 | local thPath = string.format('inference-data/label.%s.t7', opt.testSplit) 14 | 15 | createSeqLabelingData(txtPath, thPath, wordVocab, 1) -------------------------------------------------------------------------------- /src/model/TripleScore.lua: -------------------------------------------------------------------------------- 1 | function TripleScore(negBatchSize) 2 | local tarVec = nn.Identity()() 3 | local posVec = nn.Identity()() 4 | local negMat = nn.Identity()() 5 | 6 | local scoreVecPos = BatchDot() ({tarVec, posVec}) 7 | local scoreMatPos = nn.Replicate(negBatchSize) (scoreVecPos) 8 | 9 | local tarMat = nn.Replicate(negBatchSize) (tarVec) 10 | local scoreMatNeg = BatchDot() ({tarMat, negMat}) 11 | 12 | return nn.gModule({tarVec, posVec, negMat}, {scoreMatPos, scoreMatNeg}) 13 | end 14 | -------------------------------------------------------------------------------- /EntityTypeVec/process_inference.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text('Options') 5 | cmd:option('-testSplit','valid','use which data split set') 6 | cmd:text() 7 | 8 | local opt = cmd:parse(arg) 9 | 10 | local wordVocab = torch.load('../vocab/vocab.word.t7') 11 | local relationVocab = torch.load('../vocab/vocab.rel.t7') 12 | 13 | local txtPath = string.format('../Inference/FB5M-ngram/type.multi.%s.txt', opt.testSplit) 14 | local thPath = string.format('inference-data/ent.%s.t7', opt.testSplit) 15 | 16 | createSeqLabelRankData(txtPath, thPath, wordVocab, 501) -------------------------------------------------------------------------------- /vocab/create_vocab.lua: -------------------------------------------------------------------------------- 1 | require '..' 
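-- create_vocab.lua builds the vocabularies used throughout the pipeline: the word vocabulary from the
-- GloVe-based word list (word.glove100k.txt, with unknown and padding tokens added) and the Freebase
-- relation/entity vocabularies from the FB5M lists in ../KnowledgeBase, each saved as a .t7 file
-- (vocab.word.t7, vocab.rel.t7, vocab.ent.t7) that process.lua and the inference scripts load.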
2 | 3 | function createWordVocab() 4 | local wordVocab = Vocab('word.glove100k.txt') 5 | wordVocab:add_unk_token() 6 | wordVocab:add_pad_token() 7 | 8 | torch.save('vocab.word.t7', wordVocab) 9 | end 10 | 11 | function createFBVocab() 12 | local vocabPath = '../KnowledgeBase' 13 | 14 | local relVocab = Vocab(vocabPath..'/FB5M.rel.txt') 15 | relVocab:add_unk_token() 16 | 17 | local entVocab = Vocab(vocabPath..'/FB5M.ent.txt') 18 | entVocab:add_unk_token() 19 | 20 | torch.save('vocab.rel.t7', relVocab) 21 | torch.save('vocab.ent.t7', entVocab) 22 | end 23 | 24 | createWordVocab() 25 | createFBVocab() 26 | -------------------------------------------------------------------------------- /src/model/BatchDot.lua: -------------------------------------------------------------------------------- 1 | local BatchDot, parent = torch.class('BatchDot', 'nn.Module') 2 | 3 | function BatchDot:__init() 4 | parent.__init(self) 5 | self.gradInput = {torch.Tensor(), torch.Tensor()} 6 | self._viewSize = torch.LongStorage() 7 | end 8 | 9 | function BatchDot:updateOutput(input) 10 | self.output = torch.cmul(input[1], input[2]):sum(input[1]:dim()) 11 | return self.output 12 | end 13 | 14 | function BatchDot:updateGradInput(input, gradOutput) 15 | expandGradOutput = torch.expand(gradOutput, input[1]:size()) 16 | self.gradInput[1] = torch.cmul(expandGradOutput, input[2]) 17 | self.gradInput[2] = torch.cmul(expandGradOutput, input[1]) 18 | return self.gradInput 19 | end 20 | -------------------------------------------------------------------------------- /process.lua: -------------------------------------------------------------------------------- 1 | require '.' 2 | 3 | function trainData() 4 | local wordVocab = torch.load('vocab/vocab.word.t7') 5 | local entVocab = torch.load('vocab/vocab.ent.t7') 6 | local relVocab = torch.load('vocab/vocab.rel.t7') 7 | 8 | trainDir = 'SimpleQuestions/trainingData' 9 | 10 | -- focused labeling 11 | createSeqLabelingData(trainDir..'/data.train.focused_labeling', 'data/train.focused_labeling.t7', wordVocab, 128) 12 | 13 | -- entity network 14 | createSeqMultiLabelData(trainDir..'/data.train.entity_typevec', 'data/train.entity_typevec.t7', wordVocab, 501, 256) 15 | 16 | -- relation network 17 | createSeqRankingData(trainDir..'/data.train.relation_ranking', 'data/train.relation_ranking.t7', wordVocab, relVocab, 256) 18 | end 19 | 20 | trainData() 21 | -------------------------------------------------------------------------------- /src/model/BiRNN.lua: -------------------------------------------------------------------------------- 1 | local BiRNN, parent = torch.class('BiRNN', 'nn.Module') 2 | 3 | -- initialize the module 4 | function BiRNN:__init(config) 5 | parent.__init(self) 6 | 7 | -- set cuda streams 8 | self.nStream = 2 9 | if cutorch then 10 | self.streamList = {1, 2} 11 | if cutorch.getNumStreams() < self.nStream then cutorch.reserveStreams(self.nStream) end 12 | end 13 | end 14 | 15 | function BiRNN:traverseOrder(seqLen, streamIdx) 16 | if streamIdx == 1 then 17 | return 1, seqLen, 1 18 | else 19 | return seqLen, 1, -1 20 | end 21 | end 22 | 23 | function BiRNN:setAttr(attr, val) 24 | 25 | end 26 | 27 | function BiRNN:evaluate() 28 | self.train = false 29 | if cutorch.getNumStreams() < self.nStream then cutorch.reserveStreams(self.nStream) end 30 | end -------------------------------------------------------------------------------- /RelationRNN/process_inference.lua: -------------------------------------------------------------------------------- 1 | require '..' 
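-- process_inference.lua converts the candidate-relation text files (rel.single.*.txt and rel.multi.*.txt,
-- written by Inference/generate_score_data.py) into torch .t7 ranking data with batch size 1,
-- the format expected by infer_rel_rnn.lua.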
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text('Options') 5 | cmd:option('-testSplit','valid','use which data split set') 6 | cmd:text() 7 | 8 | local opt = cmd:parse(arg) 9 | 10 | local wordVocab = torch.load('../vocab/vocab.word.t7') 11 | local relationVocab = torch.load('../vocab/vocab.rel.t7') 12 | 13 | local txtSPath = string.format('../Inference/valid/rel.single.%s.txt', opt.testSplit) 14 | local txtMPath = string.format('../Inference/valid/rel.multi.%s.txt', opt.testSplit) 15 | 16 | local thSPath = string.format('inference-data/rel.single.%s.t7', opt.testSplit) 17 | local thMPath = string.format('inference-data/rel.multi.%s.t7', opt.testSplit) 18 | 19 | createRankingData(txtSPath, thSPath, wordVocab, relationVocab, 1) 20 | createRankingData(txtMPath, thMPath, wordVocab, relationVocab, 1) 21 | -------------------------------------------------------------------------------- /init.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'nn' 3 | require 'nngraph' 4 | require 'logroll' 5 | 6 | local ok, err = pcall( function () require 'cutorch' end ) 7 | if ok then 8 | require 'cunn' 9 | require 'cudnn' 10 | end 11 | 12 | include('src/model/CRF.lua') 13 | include('src/model/BiRNN.lua') 14 | include('src/model/BiGRU.lua') 15 | include('src/model/BiRNNSelect.lua') 16 | include('src/model/Linear.lua') 17 | include('src/model/BatchDot.lua') 18 | include('src/model/TripleScore.lua') 19 | include('src/model/model_utils.lua') 20 | 21 | include('src/optim/AdaGrad.lua') 22 | include('src/optim/SGD.lua') 23 | 24 | include('src/data/RankingDataLoader.lua') 25 | include('src/data/SeqMultiLabelLoader.lua') 26 | include('src/data/SeqLabelingLoader.lua') 27 | include('src/data/SeqRankingLoader.lua') 28 | include('src/data/SeqLabelRankLoader.lua') 29 | include('src/data/Vocab.lua') 30 | -------------------------------------------------------------------------------- /FocusedLabeling/generate_inference_data.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import io 3 | import cPickle as pickle 4 | import argparse 5 | 6 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 7 | from QAData import * 8 | import virtuoso 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser(description='generate_inference_data.py') 12 | parser.add_argument('--split', default='valid', type=str, help="which data split to consider") 13 | args = parser.parse_args() 14 | 15 | data_list = pickle.load(file('../SimpleQuestions/PreprocessData/QAData.{}.pkl'.format(args.split), 'rb')) 16 | if not os.path.exists('inference-data'): 17 | os.mkdir('inference-data') 18 | 19 | with io.open('inference-data/label.{}.txt'.format(args.split), 'w', encoding='utf8') as fo: 20 | for data in data_list: 21 | if data.text_attention_indices: 22 | fo.write(u'%s\t%s\n' % (data.question, 23 | ' '.join([str(index) for index in data.text_attention_indices]))) 24 | else: 25 | fo.write(u'%s\t%s\n' % (data.question, 26 | ' '.join(['0' for _ in data.question.strip().split()]))) -------------------------------------------------------------------------------- /src/model/BiRNNSelect.lua: -------------------------------------------------------------------------------- 1 | local BiRNNSelect, parent = torch.class('BiRNNSelect', 'nn.Module') 2 | 3 | function BiRNNSelect:__init() 4 | parent.__init(self) 5 | self.output = torch.Tensor() 6 | self.gradInput = torch.Tensor() 
7 | end 8 | 9 | function BiRNNSelect:updateOutput(input) 10 | local seqLen = input:size(1) 11 | local batchSize = input:size(2) 12 | local doubleSize = input:size(3) 13 | local hiddenSize = doubleSize / 2 14 | 15 | self.output:resize(batchSize, hiddenSize * 2) 16 | 17 | local fLeft, fRight = 1, hiddenSize 18 | local bLeft, bRight = hiddenSize+1, doubleSize 19 | 20 | self.output[{{},{fLeft, fRight}}]:copy(input[{{seqLen}, {},{fLeft, fRight}}]) 21 | self.output[{{},{bLeft, bRight}}]:copy(input[{{ 1}, {},{bLeft, bRight}}]) 22 | 23 | return self.output 24 | end 25 | 26 | function BiRNNSelect:updateGradInput(input, gradOutput) 27 | local seqLen = input:size(1) 28 | local doubleSize = input:size(3) 29 | local hiddenSize = doubleSize / 2 30 | 31 | self.gradInput:resizeAs(input) 32 | self.gradInput:zero() 33 | 34 | local fLeft, fRight = 1, hiddenSize 35 | local bLeft, bRight = hiddenSize+1, doubleSize 36 | 37 | self.gradInput[{{seqLen}, {},{fLeft, fRight}}]:copy(gradOutput[{{},{fLeft, fRight}}]) 38 | self.gradInput[{{ 1}, {},{bLeft, bRight}}]:copy(gradOutput[{{},{bLeft, bRight}}]) 39 | 40 | return self.gradInput 41 | end -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project specific 2 | *.pkl 3 | *.t7 4 | KnowledgeBase 5 | SimpleQuestions 6 | RawData 7 | tmp 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | -------------------------------------------------------------------------------- /Inference/valid/run.sh: -------------------------------------------------------------------------------- 1 | 2 | predict () { 3 | echo "$1 $2" 4 | cp $1/score.valid.multi.label.FB5M score.multi.valid.FB5M 5 | cp $2/score.valid.label.FB5M score.ent.valid.FB5M 6 | python ../joint_disambiguation.py multi.valid.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 7 | } 8 | 9 | predict_symbol () { 10 | echo "symbol $1 $2" 11 | cp $1/score.valid.multi.label.anonymous.FB5M score.multi.valid.FB5M 12 | cp $2/score.valid.label.FB5M score.ent.valid.FB5M 13 | python ../joint_disambiguation.py multi.valid.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 14 | } 15 | 16 | predict "../../RelationRNN" "../../EntityTypeVec" 17 | predict "../../RelationLTGCNN" "../../EntityTypeVec" 18 | predict "../../RelationAverage" "../../EntityTypeVec" 19 | predict_symbol "../../RelationLTGCNN" "../../EntityTypeVec" 20 | 21 | predict "../../RelationRNN" "../../EntityRNN/TransE" 22 | predict "../../RelationLTGCNN" "../../EntityRNN/TransE" 23 | predict "../../RelationAverage" "../../EntityRNN/TransE" 24 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/TransE" 25 | 26 | predict "../../RelationRNN" "../../EntityRNN/Random" 27 | predict "../../RelationLTGCNN" "../../EntityRNN/Random" 28 | predict "../../RelationAverage" "../../EntityRNN/Random" 29 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/Random" 30 | 31 | predict "../../RelationRNN" "../../EntityAverage" 32 | predict "../../RelationLTGCNN" "../../EntityAverage" 33 | predict "../../RelationAverage" "../../EntityAverage" 34 | predict_symbol "../../RelationLTGCNN" "../../EntityAverage" 35 | -------------------------------------------------------------------------------- /KnowledgeBase/convert.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import cPickle as pickle 3 | 4 | def www2fb(in_str): 5 | out_str = 'fb:%s' % (in_str.split('www.freebase.com/')[-1].replace('/', '.')) 6 | return out_str 7 | 8 | def main(): 9 | in_fn = sys.argv[1] 10 | db = in_fn.split('-')[-1].split('.')[0] 11 | 12 | out_fn = '%s.core.txt' % (db) 13 | ent_fn = '%s.ent.pkl' % (db) 14 | rel_fn = '%s.rel.pkl' % (db) 15 | 16 | ent_dict = {} 17 | rel_dict = {} 18 | triple_dict = {} 19 | 20 | with file(in_fn, 'rb') as fi: 21 | for line in fi: 22 | fields = line.strip().split('\t') 23 | sub = www2fb(fields[0]) 24 | rel = www2fb(fields[1]) 25 | objs = fields[2].split() 26 | if ent_dict.has_key(sub): 27 | ent_dict[sub] += 1 28 | else: 29 | 
ent_dict[sub] = 1 30 | if rel_dict.has_key(rel): 31 | rel_dict[rel] += 1 32 | else: 33 | rel_dict[rel] = 1 34 | for obj in objs: 35 | obj = www2fb(obj) 36 | triple_dict[(sub, rel, obj)] = 1 37 | if ent_dict.has_key(obj): 38 | ent_dict[obj] += 1 39 | else: 40 | ent_dict[obj] = 1 41 | 42 | pickle.dump(ent_dict, file(ent_fn, 'wb')) 43 | with file('%s.ent.txt' % (db), 'wb') as fo: 44 | for k, v in sorted(ent_dict.items(), key = lambda kv: kv[1], reverse = True): 45 | print >> fo, k 46 | 47 | pickle.dump(rel_dict, file(rel_fn, 'wb')) 48 | with file('%s.rel.txt' % (db), 'wb') as fo: 49 | for k, v in sorted(rel_dict.items(), key = lambda kv: kv[1], reverse = True): 50 | print >> fo, k 51 | 52 | with file(out_fn, 'wb') as fo: 53 | for (sub, rel, obj) in triple_dict.keys(): 54 | print >> fo, '<%s>\t<%s>\t<%s>\t.' % (sub, rel, obj) 55 | print len(triple_dict) 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /FocusedLabeling/infer_crf.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Comandline Options') 8 | 9 | cmd:option('-wordVocab','../vocab/vocab.word.t7','training data file') 10 | cmd:option('-testData','inference-data/label.valid.t7','data file to predict') 11 | cmd:option('-modelFile','model/model.BiGRU','path to the trained model') 12 | 13 | cmd:option('-useGPU',1,'which GPU is used for computation') 14 | 15 | cmd:text() 16 | 17 | ----------------------------- Basic Options ----------------------------- 18 | 19 | local opt = cmd:parse(arg) 20 | 21 | local wordVocab = torch.load(opt.wordVocab) 22 | 23 | if opt.useGPU > 0 then 24 | require 'cutorch' 25 | require 'cunn' 26 | cutorch.setDevice(opt.useGPU) 27 | torch.setdefaulttensortype('torch.CudaTensor') 28 | end 29 | 30 | ----------------------------- Data Loader ----------------------------- 31 | local loader = SeqLabelingLoader(opt.testData, flog) 32 | 33 | -------------------------- Load & Init Models ------------------------- 34 | local model = torch.load(opt.modelFile) 35 | local seqModel = model.seqModel 36 | local linearCRF = model.linearCRF 37 | seqModel:evaluate() 38 | linearCRF:evaluate() 39 | 40 | ----------------------------- Prediction ----------------------------- 41 | local maxIters = loader.numBatch 42 | 43 | local fields = stringx.split(opt.testData, '.') 44 | local split = fields[#fields-1] 45 | local file = io.open(string.format("label.result.%s", split), 'w') 46 | 47 | for i = 1, maxIters do 48 | xlua.progress(i, maxIters) 49 | 50 | ----------------------- load minibatch ------------------------ 51 | local seq, _ = loader:nextBatch(1) 52 | local currSeqLen = seq:size(1) 53 | local seqVec = seqModel:forward(seq) 54 | local predict = linearCRF:forward(seqVec) 55 | 56 | for i = 1, currSeqLen do 57 | file:write(predict[{i,1}]-0.999, ' ') 58 | end 59 | file:write('\n') 60 | end 61 | file:close() -------------------------------------------------------------------------------- /data_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOTDIR=`pwd` 3 | KBPATH=${ROOTDIR}/KnowledgeBase/VirtuosoKB/ 4 | 5 | # 1. 
download SimpleQuestions v2 6 | echo "====> Step 1: download raw data" 7 | mkdir -p ${ROOTDIR}/RawData 8 | cd ${ROOTDIR}/RawData 9 | 10 | wget https://www.dropbox.com/s/tohrsllcfy7rch4/SimpleQuestions_v2.tgz 11 | tar -xzf SimpleQuestions_v2.tgz 12 | 13 | wget https://www.dropbox.com/s/dt4i1a1wayks43n/FB5M-extra.tar.gz 14 | tar -xzf FB5M-extra.tar.gz 15 | 16 | # 2. create KB data 17 | echo "====> Step 2: create KB data" 18 | cd ${ROOTDIR}/KnowledgeBase 19 | python convert.py ${ROOTDIR}/RawData/SimpleQuestions_v2/freebase-subsets/freebase-FB5M.txt 20 | 21 | mv FB5M.core.txt ${KBPATH}/data/ 22 | mv ${ROOTDIR}/RawData/FB5M.*.txt ${KBPATH}/data/ 23 | 24 | # 3. load data into knowledge base 25 | echo "====> Step 3: load data into knowledge base" 26 | cd ${KBPATH} 27 | ./bin/virtuoso-t +foreground +configfile var/lib/virtuoso/db/virtuoso.ini & # start the server 28 | serverPID=$! 29 | sleep 10 30 | 31 | ./bin/isql 1111 dba dba exec="ld_dir_all('./data', '*', 'fb:');" 32 | 33 | pids=() 34 | for i in `seq 1 4`; do 35 | ./bin/isql 1111 dba dba exec="rdf_loader_run();" & 36 | pids+=($!) 37 | done 38 | for pid in ${pids[@]}; do 39 | wait $pid 40 | done 41 | 42 | # 4. create Vocabs 43 | echo "====> Step 4: create Vocabs" 44 | cd ${ROOTDIR}/vocab 45 | th create_vocab.lua 46 | 47 | # 5. create training data 48 | echo "====> Step 5: create training data (this will take some time)" 49 | 50 | # 5.1. QAData.pkl 51 | cd ${ROOTDIR}/SimpleQuestions/PreprocessData 52 | python process_rawdata.py ${ROOTDIR}/RawData/SimpleQuestions_v2/annotated_fb_data_train.txt 6 53 | python process_rawdata.py ${ROOTDIR}/RawData/SimpleQuestions_v2/annotated_fb_data_valid.txt 6 54 | python process_rawdata.py ${ROOTDIR}/RawData/SimpleQuestions_v2/annotated_fb_data_test.txt 6 55 | 56 | # 5.2. create train data in .txt format 57 | cd ${ROOTDIR}/SimpleQuestions 58 | python generate_training_data.py 59 | 60 | # 5.3. convert .txt data to .t7 format 61 | cd ${ROOTDIR} 62 | mkdir ${ROOTDIR}/data 63 | th process.lua 64 | -------------------------------------------------------------------------------- /EntityTypeVec/test_ent_typevec.lua: -------------------------------------------------------------------------------- 1 | require '..' 
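-- test_ent_typevec.lua evaluates a trained entity type-vector model on a SeqMultiLabel test set:
-- predicted type probabilities are thresholded at 0.5 and type-level precision, recall and F1 are printed.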
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Testing a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Command-line Options') 8 | 9 | cmd:option('-testData','data/valid.torch','test data file') 10 | cmd:option('-modelFile','model.BiGRU','filename for loading trained model') 11 | 12 | cmd:option('-useGPU',1,'which GPU is used for computation') 13 | 14 | cmd:text() 15 | 16 | ----------------------------- Basic Options ----------------------------- 17 | 18 | local opt = cmd:parse(arg) 19 | local flog = logroll.print_logger() 20 | 21 | if opt.useGPU > 0 then 22 | require 'cutorch' 23 | require 'cunn' 24 | cutorch.setDevice(opt.useGPU) 25 | torch.setdefaulttensortype('torch.CudaTensor') 26 | flog.info(string.rep('-', 50)) 27 | flog.info('Set default tensor type to CudaTensor') 28 | end 29 | 30 | ----------------------------- Data Loader ----------------------------- 31 | local loader = SeqMultiLabelLoader(opt.testData, flog) 32 | 33 | -------------------------- Load & Init Models ------------------------- 34 | cutorch.reserveStreams(2) 35 | local model = torch.load(opt.modelFile) 36 | model:evaluate() 37 | 38 | ----------------------------- Prediction ----------------------------- 39 | local maxIters = loader.numBatch 40 | flog.info(string.rep('-', 40)) 41 | flog.info('Begin Prediction') 42 | 43 | local sumPred, sumCorr, sumTrue = 0, 0, 0 44 | 45 | for i = 1, maxIters do 46 | xlua.progress(i, maxIters) 47 | 48 | ----------------------- load minibatch ------------------------ 49 | local seq, labels = loader:nextBatch() 50 | local currSeqLen = seq:size(1) 51 | 52 | local predict = model:forward(seq) 53 | local hardPred = torch.ge(predict, 0.5) 54 | sumCorr = sumCorr + torch.cmul(hardPred:type(torch.type(labels)), labels):sum() 55 | sumTrue = sumTrue + labels:sum() 56 | sumPred = sumPred + hardPred:sum() 57 | 58 | end 59 | 60 | local p, r = sumCorr / sumPred, sumCorr / sumTrue 61 | print(p, r, 2 * p * r / (p + r)) 62 | -------------------------------------------------------------------------------- /Inference/generate_score_data.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import cPickle as pickle 3 | 4 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 5 | import QAData 6 | 7 | if __name__ == '__main__': 8 | if len(sys.argv) < 2: 9 | print 'usage: python generate_score_data.py cpickle_data' 10 | sys.exit(-1) 11 | 12 | suffix = sys.argv[1].split('.')[-2] 13 | single_rel_file = file('rel.single.%s.txt'%(suffix), 'wb') 14 | multi_rel_file = file('rel.multi.%s.txt'%(suffix), 'wb') 15 | multi_ent_file = file('ent.multi.%s.txt'%(suffix), 'wb') 16 | multi_type_file = file('type.multi.%s.txt'%(suffix), 'wb') 17 | 18 | data_list = pickle.load(file(sys.argv[1], 'rb')) 19 | single_rel_data = [] 20 | multi_rel_data = [] 21 | print >> sys.stderr, 'Finish loading QAData' 22 | 23 | count = 0 24 | for data in data_list: 25 | if hasattr(data, 'cand_sub') and hasattr(data, 'cand_rel') and len(data.cand_rel) > 0 and data.relation in data.cand_rel and data.subject in data.cand_sub: 26 | # if data.subject in data.cand_sub: 27 | question = data.question 28 | # Case 1: single candidate subject 29 | if len(data.cand_sub) == 1: 30 | print >> single_rel_file, '%s\t%s\t%s' % (question, data.relation, '\t'.join(data.cand_rel)) 31 | single_rel_data.append(data) 32 | # Case 2: multiple candidate subjects 33 | elif len(data.cand_sub) > 1: 34 | print >> 
multi_rel_file, '%s\t%s\t%s' % (question, data.relation, '\t'.join(data.cand_rel)) 35 | print >> multi_ent_file, '%s\t%s\t%s' % (question, data.subject, '\t'.join(data.cand_sub)) 36 | print >> multi_type_file, '%s\t%d\t%s' % (question, data.cand_sub.index(data.subject), '\t'.join([' '.join([str(t) for t in st]) for st in data.sub_types])) 37 | multi_rel_data.append(data) 38 | else: 39 | count += 1 40 | 41 | single_rel_file.close() 42 | multi_rel_file.close() 43 | multi_ent_file.close() 44 | multi_type_file.close() 45 | 46 | pickle.dump(single_rel_data, file('single.%s.cpickle'%(suffix), 'wb')) 47 | pickle.dump(multi_rel_data, file('multi.%s.cpickle'%(suffix), 'wb')) 48 | print >> sys.stderr, count 49 | -------------------------------------------------------------------------------- /EntityTypeVec/infer_ent_typevec.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | require 'SeqLabelRankLoader' 3 | 4 | local cmd = torch.CmdLine() 5 | cmd:text('Comandline Options') 6 | cmd:option('-testData','inference-data/ent.valid.t7','training data file') 7 | cmd:option('-modelFile','model.BiGRU','filename for loading trained model') 8 | cmd:option('-useGPU',0,'which GPU is used for computation') 9 | 10 | cmd:text() 11 | 12 | ----------------------------- Basic Options ----------------------------- 13 | 14 | local opt = cmd:parse(arg) 15 | local flog = logroll.print_logger() 16 | 17 | if opt.useGPU > 0 then 18 | cutorch.setDevice(opt.useGPU) 19 | torch.setdefaulttensortype('torch.CudaTensor') 20 | flog.info(string.rep('-', 50)) 21 | flog.info('Set default tensor type to CudaTensor') 22 | end 23 | 24 | ----------------------------- Data Loader ----------------------------- 25 | local fields = stringx.split(opt.testData, '.') 26 | local split = fields[#fields-1] 27 | local loader = SeqLabelRankLoader(opt.testData, flog) 28 | local score_file = io.open(string.format('score.ent.multi.%s', split), 'w') 29 | local rank_file = io.open(string.format('rank.ent.multi.%s', split), 'w') 30 | 31 | -------------------------- Load & Init Models ------------------------- 32 | cutorch.reserveStreams(2) 33 | local model = torch.load(opt.modelFile) 34 | model:evaluate() 35 | 36 | ----------------------------- Prediction ----------------------------- 37 | local maxIters = loader.numBatch 38 | flog.info(string.rep('-', 40)) 39 | flog.info('Begin Prediction') 40 | 41 | for i = 1, maxIters do 42 | xlua.progress(i, maxIters) 43 | 44 | ----------------------- load minibatch ------------------------ 45 | local seq, posIdx, candi = loader:nextBatch(1) 46 | local currSeqLen = seq:size(1) 47 | local numCandi = candi:size(1) 48 | 49 | local predict = model:forward(seq) 50 | predict:maskedSelect(torch.lt(predict, 0.5)):zero() 51 | local repPred = predict:expandAs(candi) 52 | 53 | candi = candi:cuda() 54 | local scores = torch.cmul(repPred, candi):sum(2):view(numCandi) 55 | 56 | local _, argSort = torch.sort(scores, 1, true) 57 | rank_file:write(posIdx, '\t') 58 | for i = 1, numCandi do 59 | rank_file:write(argSort[i], ' ') 60 | end 61 | rank_file:write('\n') 62 | 63 | for i = 1, numCandi do 64 | score_file:write(scores[i], ' ') 65 | end 66 | score_file:write('\n') 67 | end 68 | rank_file:close() 69 | score_file:close() 70 | -------------------------------------------------------------------------------- /Virtuoso.md: -------------------------------------------------------------------------------- 1 | This File provides instruction on how to build and config **Virtuoso**, a 
triple-storage software the package relies on. 2 | 3 | 4 | 5 | ##### 1. Download source code from github 6 | 7 | ```shell 8 | cd tmp 9 | git clone https://github.com/openlink/virtuoso-opensource.git 10 | ``` 11 | 12 | 13 | 14 | ##### 2. Configure and compile the source code to specific path 15 | 16 | To build Virtuoso on systems other than `Linux 64-bit`, please refer to the [virtuoso building doc](https://github.com/openlink/virtuoso-opensource) 17 | 18 | ```shell 19 | cd virtuoso-opensource 20 | 21 | # generate makefile 22 | sh autogen.sh 23 | 24 | # PKGPATH is the root directory you put this package in 25 | PKGPATH="put your path here" 26 | 27 | # ultimate install path 28 | INSTALLPATH=${PKGPATH}/KnowledgeBase/VirtuosoKG 29 | mkdir -p ${INSTALLPATH} 30 | 31 | # flags for Linux 64-bit 32 | CFLAGS="-O2 -m64" 33 | export CFLAGS 34 | 35 | # configurate 36 | ./configure --prefix=${INSTALLPATH} 37 | 38 | # compile (compiling will take quite a while) 39 | make 40 | 41 | # install 42 | make install 43 | ``` 44 | 45 | 46 | 47 | ##### 3. Edit the .ini config file of Virtuoso KB 48 | 49 | Here, we config virtuoso in the following way so that a proper performance can be achieved. 50 | 51 | ```shell 52 | # create a folder to store data to be loaded 53 | cd ${INSTALLPATH} 54 | mkdir data 55 | 56 | # edit the .ini config 57 | vi var/lib/virtuoso/db/virtuoso.ini 58 | 59 | # all changes necessary to make are under the [Parameters] section 60 | 61 | # 1. DirsAllowed : directory from which data is allowed to be loaded. 62 | # So we need to append our created data directory after the default value. 63 | 64 | # default 65 | DirsAllowed = ., ${INSTALLPATH}/share/virtuoso/vad 66 | # modified 67 | DirsAllowed = ., ${INSTALLPATH}/share/virtuoso/vad, ${INSTALLPATH}/data 68 | 69 | # 2. MaxQueryMem : maximum memory virtuoso can use to handle queries. 70 | # Intuitively, the larger the MaxQueryMem, the potentially faster the query. 71 | # The recommemded value is 1/2 to 2/3 of the whole memory on the machine. 72 | 73 | # default 74 | MaxQueryMem = 2G 75 | # modified : for our experiment, on a 6-core machine with 32G memory. 76 | MaxQueryMem = 16G 77 | 78 | # 3. VectorSize : initial parallel query operations size. 79 | # Intuitively, the larger the VectorSize, the potentially faster the query. 80 | 81 | # default 82 | VectorSize = 1000 83 | # modified : for our experiment, on a 6-core machine with 32G memory. 84 | VectorSize = 10000 85 | ``` 86 | 87 | -------------------------------------------------------------------------------- /FocusedLabeling/test_crf.lua: -------------------------------------------------------------------------------- 1 | require '..' 
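-- test_crf.lua evaluates the BiGRU-CRF focused-labeling model: it reports token-level precision,
-- recall and F1 for the focus label (label 2) as well as the fraction of questions whose
-- predicted label sequence matches the gold sequence exactly.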
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Testing a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Command-line Options') 8 | 9 | cmd:option('-wordVocab','../vocab/vocab.word.t7','word vocabulary file') 10 | cmd:option('-testData','../data/valid.t7','test data file') 11 | cmd:option('-modelFile','model.BiGRU','filename for loading trained model') 12 | 13 | cmd:option('-useGPU',1,'which GPU is used for computation') 14 | 15 | cmd:text() 16 | 17 | ----------------------------- Basic Options ----------------------------- 18 | 19 | local opt = cmd:parse(arg) 20 | local flog = logroll.print_logger() 21 | 22 | local wordVocab = torch.load(opt.wordVocab) 23 | 24 | if opt.useGPU > 0 then 25 | require 'cutorch' 26 | require 'cunn' 27 | cutorch.setDevice(opt.useGPU) 28 | torch.setdefaulttensortype('torch.CudaTensor') 29 | flog.info(string.rep('-', 50)) 30 | flog.info('Set default tensor type to CudaTensor') 31 | end 32 | 33 | ----------------------------- Data Loader ----------------------------- 34 | local loader = SeqLabelingLoader(opt.testData, flog) 35 | 36 | -------------------------- Load & Init Models ------------------------- 37 | local model = torch.load(opt.modelFile) 38 | local seqModel = model.seqModel 39 | local linearCRF = model.linearCRF 40 | seqModel:evaluate() 41 | linearCRF:evaluate() 42 | 43 | ----------------------------- Prediction ----------------------------- 44 | local maxIters = loader.numBatch 45 | flog.info(string.rep('-', 40)) 46 | flog.info('Begin Prediction') 47 | 48 | local sumPred, sumCorr, sumTrue = 0, 0, 0 49 | local count = 0 50 | 51 | for i = 1, maxIters do 52 | xlua.progress(i, maxIters) 53 | 54 | ----------------------- load minibatch ------------------------ 55 | local seq, labels = loader:nextBatch() 56 | local currSeqLen = seq:size(1) 57 | local seqVec = seqModel:forward(seq) 58 | local predict = linearCRF:forward(seqVec) 59 | 60 | if torch.sum(torch.ne(predict, labels)) == 0 then 61 | count = count + 1 62 | end 63 | local maskPred = torch.eq(predict, 2) 64 | local maskTrue = torch.eq(labels, 2) 65 | sumCorr = sumCorr + torch.eq(predict:type(torch.type(labels)), labels):cmul(maskTrue):sum() 66 | sumTrue = sumTrue + maskTrue:sum() 67 | sumPred = sumPred + maskPred:sum() 68 | -- for i = 1, currSeqLen do 69 | -- print(string.format("%15s\t%1d\t%1d", wordVocab:token(seq[{i,1}]), predict[{i,1}], labels[{i,1}])) 70 | -- end 71 | end 72 | 73 | local p, r = sumCorr / sumPred, sumCorr / sumTrue 74 | print(p, r, 2 * p * r / (p + r)) 75 | print(count / loader.numBatch) 76 | -------------------------------------------------------------------------------- /src/data/Vocab.lua: -------------------------------------------------------------------------------- 1 | local Vocab = torch.class('Vocab') 2 | 3 | function Vocab:__init(path) 4 | self.size = 0 5 | self._index = {} 6 | self._tokens = {} 7 | 8 | local file = io.open(path) 9 | while true do 10 | local line = file:read() 11 | if line == nil then break end 12 | self.size = self.size + 1 13 | self._tokens[self.size] = line 14 | self._index[line] = self.size 15 | end 16 | file:close() 17 | 18 | print('vocab size: '..self.size) 19 | end 20 | 21 | function Vocab:contains(w) 22 | if not self._index[w] then return false end 23 | return true 24 | end 25 | 26 | function Vocab:add(w) 27 | if self._index[w] ~= nil then 28 | return self._index[w] 29 | end 30 | self.size = self.size + 1 31 | self._tokens[self.size] = w 32 | self._index[w] = self.size 33 | return self.size 34 
| end 35 | 36 | function Vocab:index(w) 37 | local index = self._index[w] 38 | if index == nil then 39 | if self.unk_index == nil then 40 | error('Token not in vocabulary and no UNK token defined: ' .. w) 41 | end 42 | return self.unk_index 43 | end 44 | return index 45 | end 46 | 47 | function Vocab:token(i) 48 | if i < 1 or i > self.size then 49 | error('Index ' .. i .. ' out of bounds') 50 | end 51 | return self._tokens[i] 52 | end 53 | 54 | function Vocab:map(tokens) 55 | local len = #tokens 56 | local output = torch.IntTensor(len) 57 | for i = 1, len do 58 | output[i] = self:index(tokens[i]) 59 | end 60 | return output 61 | end 62 | 63 | function Vocab:add_unk_token() 64 | if self.unk_token ~= nil then return end 65 | self.unk_index = self:add('') 66 | print('vocab size: '..self.size) 67 | end 68 | 69 | function Vocab:add_pad_token() 70 | if self.pad_token ~= nil then return end 71 | self.pad_index = self:add('') 72 | print('vocab size: '..self.size) 73 | end 74 | 75 | function Vocab:add_ent_token() 76 | if self.ent_token ~= nil then return end 77 | self.ent_index = self:add('') 78 | print('vocab size: '..self.size) 79 | end 80 | 81 | function Vocab:add_start_token() 82 | if self.start_token ~= nil then return end 83 | self.start_index = self:add('') 84 | print('vocab size: '..self.size) 85 | end 86 | 87 | function Vocab:add_end_token() 88 | if self.end_token ~= nil then return end 89 | self.end_index = self:add('') 90 | print('vocab size: '..self.size) 91 | end 92 | 93 | function Vocab:add_space_token() 94 | if self.space_token ~= nil then return end 95 | self.space_index = self:add('<_>') 96 | print('vocab size: '..self.size) 97 | end 98 | -------------------------------------------------------------------------------- /Inference/test/run.sh: -------------------------------------------------------------------------------- 1 | 2 | predict () { 3 | echo "predict $1 $2 $3 $4" 4 | cp $1/score.test.multi.label.FB5M score.multi.valid.FB5M 5 | cp $2/score.test.label.FB5M score.ent.valid.FB5M 6 | #python ../joint_predict.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M $3 $4 7 | python ../joint_disambiguation.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 8 | } 9 | 10 | predict_symbol () { 11 | echo "predict symbol $1 $2 $3 $4" 12 | cp $1/score.test.multi.label.anonymous.FB5M score.multi.valid.FB5M 13 | cp $2/score.test.label.FB5M score.ent.valid.FB5M 14 | #python ../joint_predict.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M $3 $4 15 | python ../joint_disambiguation.py multi.test.cpickle score.multi.valid.FB5M score.ent.valid.FB5M 16 | } 17 | 18 | predict "../../RelationRNN" "../../EntityTypeVec" 0.85 0.0 19 | predict "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.0 20 | predict "../../RelationAverage" "../../EntityTypeVec" 0.85 0.0 21 | predict_symbol "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.0 22 | 23 | predict "../../RelationRNN" "../../EntityTypeVec" 0.90 0.95 24 | predict "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.85 25 | predict "../../RelationAverage" "../../EntityTypeVec" 0.90 0.85 26 | predict_symbol "../../RelationLTGCNN" "../../EntityTypeVec" 0.85 0.85 27 | 28 | predict "../../RelationRNN" "../../EntityRNN/TransE" 0.60 0.0 29 | predict "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.55 0.0 30 | predict "../../RelationAverage" "../../EntityRNN/TransE" 0.60 0.0 31 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.60 0.0 32 | 33 | predict "../../RelationRNN" "../../EntityRNN/TransE" 
0.90 0.95 34 | predict "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.50 0.85 35 | predict "../../RelationAverage" "../../EntityRNN/TransE" 0.95 0.95 36 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/TransE" 0.65 0.95 37 | 38 | predict "../../RelationRNN" "../../EntityRNN/Random" 0.75 0.0 39 | predict "../../RelationLTGCNN" "../../EntityRNN/Random" 0.70 0.0 40 | predict "../../RelationAverage" "../../EntityRNN/Random" 0.70 0.0 41 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/Random" 0.65 0.0 42 | 43 | predict "../../RelationRNN" "../../EntityRNN/Random" 0.60 0.95 44 | predict "../../RelationLTGCNN" "../../EntityRNN/Random" 0.70 0.85 45 | predict "../../RelationAverage" "../../EntityRNN/Random" 0.95 0.95 46 | predict_symbol "../../RelationLTGCNN" "../../EntityRNN/Random" 0.95 0.95 47 | 48 | predict "../../RelationRNN" "../../EntityAverage" 0.60 0.0 49 | predict "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.0 50 | predict "../../RelationAverage" "../../EntityAverage" 0.55 0.0 51 | predict_symbol "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.0 52 | 53 | predict "../../RelationRNN" "../../EntityAverage" 0.65 0.95 54 | predict "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.85 55 | predict "../../RelationAverage" "../../EntityAverage" 0.95 0.95 56 | predict_symbol "../../RelationLTGCNN" "../../EntityAverage" 0.65 0.85 57 | -------------------------------------------------------------------------------- /src/optim/AdaGrad.lua: -------------------------------------------------------------------------------- 1 | -- For this AdaGrad implementation, it supports both (optional) traditional momentum 2 | -- and Nesterov Accelerated Gradient (nag). However, both styles of momentum apply the 3 | -- same value to all parameters. 
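-- For reference, the per-parameter update performed in updateParams below is
--   histGradSquare = histGradSquare + grad .* grad
--   param          = param - lr * grad ./ sqrt(histGradSquare)
-- and, when classic momentum is enabled, the scaled step is first accumulated into a velocity buffer:
--   velocity = momentum * velocity - lr * grad ./ sqrt(histGradSquare)
--   param    = param + velocity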
4 | 5 | local AdaGrad = torch.class('AdaGrad') 6 | 7 | function AdaGrad:__init(gradTab, config) 8 | self.lr = config.lr 9 | self.histGradSquare = {} 10 | for i, grad in pairs(gradTab) do 11 | self.histGradSquare[i] = grad:clone():fill(1e-4) 12 | end 13 | if config.momentum then 14 | self.momentum = config.momentum 15 | self.velocity = {} 16 | for i, grad in pairs(gradTab) do 17 | self.velocity[i] = grad:clone():fill(0) 18 | end 19 | elseif config.nag then 20 | self.nag = config.nag 21 | self.const_1 = self.nag * self.nag -- NAG from "advances in optimizing recurrent networks" 22 | self.const_2 = self.nag + 1 -- NAG from "advances in optimizing recurrent networks" 23 | self.velocity = {} 24 | for i, grad in pairs(gradTab) do 25 | self.velocity[i] = grad:clone():fill(0) 26 | end 27 | end 28 | if config.logger then 29 | config.logger.info(string.rep('-', 50)) 30 | config.logger.info(string.format('AdaGrad Configurations:')) 31 | for i = 1, #self.lr do 32 | config.logger.info(string.format(' learning rate [%1d] : %f', i , self.lr[i])) 33 | end 34 | if self.momentum then 35 | config.logger.info(string.format(' classic momentum : %f', self.momentum)) 36 | elseif self.nag then 37 | config.logger.info(string.format(' Nesterov momentum : %f', self.nag)) 38 | end 39 | end 40 | end 41 | 42 | function AdaGrad:updateParams(paramsTab, gradTab) 43 | if self.momentum then 44 | for i = 1, #paramsTab do 45 | self.histGradSquare[i]:addcmul(1, gradTab[i], gradTab[i]) 46 | self.velocity[i]:mul(self.momentum):addcdiv(-self.lr[i], gradTab[i], torch.sqrt(self.histGradSquare[i])) 47 | paramsTab[i]:add(self.velocity[i]) 48 | end 49 | elseif self.nag then 50 | for i = 1, #paramsTab do 51 | self.histGradSquare[i]:addcmul(1, gradTab[i], gradTab[i]) 52 | self.velocity[i]:mul(self.const_1):addcdiv(-self.lr[i]*self.const_2, gradTab[i], torch.sqrt(self.histGradSquare[i])) 53 | paramsTab[i]:add(self.velocity[i]) 54 | end 55 | else 56 | for i = 1, #paramsTab do 57 | self.histGradSquare[i]:addcmul(1, gradTab[i], gradTab[i]) 58 | paramsTab[i]:addcdiv(-self.lr[i], gradTab[i], torch.sqrt(self.histGradSquare[i])) 59 | end 60 | end 61 | end 62 | 63 | function AdaGrad:updateMomentum(rate) 64 | if self.momentum then 65 | self.momentum = rate 66 | elseif self.nag then 67 | self.nag = rate 68 | end 69 | end 70 | 71 | function AdaGrad:effectiveGradNorm(gradTab) 72 | for i = 1, #gradTab do 73 | print(string.format('effective norm %d: %f', i, torch.cdiv(gradTab[i], torch.sqrt(self.histGradSquare[i])):norm())) 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /src/py_module/QAData.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import preprocessing 3 | 4 | def fb2www(in_data): 5 | if type(in_data) == type(' '): 6 | out_data = in_data.replace('.', '/').replace('fb:', 'www.freebase.com/') 7 | elif type(in_data) == type([]): 8 | out_data = [data.replace('.', '/').replace('fb:', 'www.freebase.com/') for data in in_data] 9 | return out_data 10 | 11 | class QAData(object): 12 | """docstring for QAData""" 13 | def __init__(self, data_tuple): 14 | super(QAData, self).__init__() 15 | self.question = data_tuple[0] 16 | self.subject = data_tuple[1] 17 | self.relation = data_tuple[2] 18 | self.object = data_tuple[3] 19 | self.num_text_token = int(data_tuple[4]) 20 | 21 | def add_candidate(self, sub, rels, types = None): 22 | if not hasattr(self, 'cand_sub'): 23 | self.cand_sub = [] 24 | if not hasattr(self, 'cand_rel'): 
25 | self.cand_rel = [] 26 | if not hasattr(self, 'sub_rels'): 27 | self.sub_rels = [] 28 | self.cand_sub.append(sub) 29 | self.sub_rels.append(rels) 30 | self.cand_rel.extend(rels) 31 | if types: 32 | if not hasattr(self, 'sub_types'): 33 | self.sub_types = [] 34 | self.sub_types.append(types) 35 | 36 | def remove_duplicate(self): 37 | self.cand_rel = list(set(self.cand_rel)) 38 | 39 | def make_score_mat(self): 40 | # make candidate unique rels 41 | self.num_sub = len(self.cand_sub) 42 | self.num_rel = len(self.cand_rel) 43 | self.rel_dict = {self.cand_rel[i]:i for i in range(self.num_rel)} 44 | 45 | # establish score matrix 46 | self.score_mat = np.zeros((self.num_sub, self.num_rel)) 47 | for i in range(self.num_sub): 48 | for rel in self.sub_rels[i]: 49 | self.score_mat[i, self.rel_dict[rel]] = 1 50 | 51 | def fill_rel_score(self, scores): 52 | self.score_mat = self.score_mat * scores 53 | 54 | def fill_ent_score(self, scores): 55 | self.ent_score = preprocessing.scale(scores) 56 | 57 | # def top_sub_rel(self): 58 | # # sub_score = preprocessing.scale(np.sum(self.score_mat, 1)) 59 | # # sub_score += self.ent_score 60 | # sub_score = np.sum(self.score_mat, 1) 61 | 62 | # top_subid = np.argmax(sub_score) 63 | # top_relid = np.argmax(self.score_mat[top_subid]) 64 | # self.pred_sub = self.cand_sub[top_subid] 65 | # self.pred_rel = self.cand_rel[top_relid] 66 | # return self.cand_sub[top_subid], self.cand_rel[top_relid] 67 | 68 | def top_sub_rel(self): 69 | sub_score = np.sum(self.score_mat, 1) 70 | top_subscore = np.max(sub_score) 71 | top_subids = [] 72 | for subid in np.argsort(sub_score)[::-1]: 73 | if sub_score[subid] < top_subscore: 74 | break 75 | top_subids.append(subid) 76 | 77 | top_relid = np.argmax(self.score_mat[top_subids[0]]) 78 | 79 | return [self.cand_sub[subid] for subid in top_subids], self.cand_rel[top_relid] 80 | -------------------------------------------------------------------------------- /src/optim/SGD.lua: -------------------------------------------------------------------------------- 1 | -- For this SGD implementation, it supports both (optional) traditional momentum 2 | -- and Nesterov Accelerated Gradient (nag). However, both styles of momentum apply the 3 | -- same value to all parameters. 
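-- For reference, the NAG-style update performed in updateParams below (with mu = config.nag)
-- follows the formulation in "Advances in optimizing recurrent networks":
--   velocity = mu^2 * velocity - (1 + mu) * lr * grad
--   param    = param + velocity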
4 | 5 | local SGD = torch.class('SGD') 6 | 7 | function SGD:__init(gradTab, config) 8 | self.lr = config.lr 9 | if config.momentum then 10 | self.momentum = config.momentum 11 | self.velocity = {} 12 | for i, grad in pairs(gradTab) do 13 | self.velocity[i] = grad:clone():fill(0) 14 | end 15 | elseif config.nag then 16 | self.nag = config.nag 17 | self.const_1 = self.nag * self.nag -- NAG from "advances in optimizing recurrent networks" 18 | self.const_2 = self.nag + 1 -- NAG from "advances in optimizing recurrent networks" 19 | self.velocity = {} 20 | for i, grad in pairs(gradTab) do 21 | self.velocity[i] = grad:clone():fill(0) 22 | end 23 | end 24 | if config.annealing then 25 | self.annealing = config.annealing 26 | self.masterLr = {} 27 | for i = 1, #self.lr do 28 | self.masterLr[i] = self.lr[i] 29 | end 30 | end 31 | self.count = 0 32 | if config.logger then 33 | config.logger.info(string.rep('-', 50)) 34 | config.logger.info(string.format('SGD Configurations:')) 35 | for i = 1, #self.lr do 36 | config.logger.info(string.format(' learning rate [%1d] : %f', i , self.lr[i])) 37 | end 38 | if self.momentum then 39 | config.logger.info(string.format(' classic momentum : %f', self.momentum)) 40 | elseif self.nag then 41 | config.logger.info(string.format(' Nesterov momentum : %f', self.nag)) 42 | end 43 | if self.annealing then 44 | config.logger.info(string.format(' Annearling rate : %f', self.annealing)) 45 | end 46 | end 47 | end 48 | 49 | function SGD:updateParams(paramsTab, gradTab) 50 | self.count = self.count + 1 51 | if self.annealing then 52 | for i = 1, #self.masterLr do 53 | self.lr[i] = self.masterLr[i] / (1 + self.annealing * math.sqrt(self.count)) 54 | end 55 | end 56 | -- print (self.lr) 57 | if self.momentum and self.momentum > 0 then 58 | for i = 1, #paramsTab do 59 | self.velocity[i]:mul(self.momentum):add(-self.lr[i], gradTab[i]) 60 | paramsTab[i]:add(self.velocity[i]) 61 | end 62 | elseif self.nag and self.nag > 0 then 63 | for i = 1, #paramsTab do 64 | self.velocity[i]:mul(self.const_1):add(-self.lr[i]*self.const_2, gradTab[i]) 65 | paramsTab[i]:add(self.velocity[i]) 66 | end 67 | else 68 | for i = 1, #paramsTab do 69 | paramsTab[i]:add(-self.lr[i], gradTab[i]) 70 | end 71 | end 72 | end 73 | 74 | function SGD:updateMomentum(rate) 75 | if self.momentum then 76 | self.momentum = rate 77 | elseif self.nag then 78 | self.nag = rate 79 | end 80 | end 81 | 82 | function SGD:effectiveGradNorm(gradTab) 83 | for i = 1, #gradTab do 84 | print(string.format('effective norm %d: %f', i, gradTab[i]:norm())) 85 | end 86 | end 87 | -------------------------------------------------------------------------------- /RelationRNN/infer_rel_rnn.lua: -------------------------------------------------------------------------------- 1 | require '..' 
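-- infer_rel_rnn.lua scores every candidate relation for a question as the dot product between the
-- question encoding (seqModel) and the relation embedding (relEmbed, with dropout disabled at test time),
-- then writes per-question score.rel.* and rank.rel.* files, one line per question.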
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Testing a Recurrent Neural Network to embed a sentence') 6 | cmd:text() 7 | cmd:text('Options') 8 | cmd:option('-useGPU',1,'whether to use gpu for computation') 9 | cmd:option('-modelFile','model.rel.stackBiRNN','file path for saved model') 10 | cmd:option('-testData','inference-data/rel.single.valid.t7','run test on which data set') 11 | cmd:text() 12 | 13 | -- parse input params 14 | local opt = cmd:parse(arg) 15 | local flog = logroll.print_logger() 16 | 17 | if opt.useGPU > 0 then 18 | require 'cutorch' 19 | require 'cunn' 20 | cutorch.setDevice(opt.useGPU) 21 | torch.setdefaulttensortype('torch.CudaTensor') 22 | end 23 | 24 | -- load all models 25 | local fields = stringx.split(opt.testData, '.') 26 | local ncand = fields[#fields-2] 27 | local split = fields[#fields-1] 28 | 29 | local model = torch.load(opt.modelFile) 30 | 31 | -- init data loader and output files 32 | local loader = RankingDataLoader(opt.testData, flog) 33 | local score_file = io.open(string.format('score.rel.%s.%s', ncand, split), 'w') 34 | local rank_file = io.open(string.format('rank.rel.%s.%s', ncand, split), 'w') 35 | 36 | -- extract sub models 37 | local relEmbed = model.relEmbed 38 | local seqModel = model.seqModel 39 | local scoreModel = model.scoreModel 40 | local negRelDrop = model.negRelDrop 41 | 42 | seqModel:evaluate() 43 | negRelDrop:evaluate() 44 | 45 | -- core testing loop 46 | for i = 1, loader.numBatch do 47 | xlua.progress(i, loader.numBatch) 48 | ----------------------- load minibatch ------------------------ 49 | local seq, pos, neg = loader:nextBatch(1) 50 | neg = neg:view(-1) 51 | local currSeqLen = seq:size(1) 52 | local loss = 0 53 | 54 | ------------------------ forward pass ------------------------- 55 | -- sequence vectors [n_batch x n_dim] 56 | local seqVec = seqModel:forward(seq) 57 | 58 | -- negative matrix [n_neg x n_batch x n_dim] 59 | -- local negMat = relEmbed:forward(neg) 60 | 61 | local tmp = relEmbed:forward(neg) 62 | local negMat = negRelDrop:forward(tmp) 63 | 64 | -- sequence matrix [n_neg x n_batch x n_dim] 65 | local seqMat = torch.repeatTensor(seqVec, negMat:size(1), 1) 66 | 67 | if opt.useGPU > 0 then 68 | scores = torch.cmul(seqMat, negMat):sum(2):view(-1) 69 | else 70 | scores = torch.mm(seqMat, negMat:t()):diag() 71 | end 72 | 73 | -- write to rank file 74 | if scores:size(1) > 1 then 75 | local _, argSort = scores:sort(1, true) 76 | 77 | rank_file:write(pos[1], '\t') 78 | for i = 1, argSort:size(1) do 79 | rank_file:write(neg[argSort[i]], ' ') 80 | end 81 | rank_file:write('\n') 82 | 83 | -- write to score file 84 | local topIndices = {} 85 | for i = 1, argSort:size(1) do 86 | topIndices[argSort[i]] = 1 87 | end 88 | for i = 1, scores:size(1) do 89 | if topIndices[i] then 90 | score_file:write(scores[i], ' ') 91 | else 92 | score_file:write(0, ' ') 93 | end 94 | end 95 | score_file:write('\n') 96 | else 97 | rank_file:write(pos[1], '\t') 98 | rank_file:write(neg[1]) 99 | rank_file:write('\n') 100 | score_file:write(scores[1]) 101 | score_file:write('\n') 102 | end 103 | 104 | collectgarbage() 105 | end 106 | score_file:close() 107 | rank_file:close() 108 | -------------------------------------------------------------------------------- /src/data/SeqLabelRankLoader.lua: -------------------------------------------------------------------------------- 1 | -- file to define the class SeqLabelRankLoader 2 | -- SeqLabelRankLoader:nextBatch() return a batch of 3 | 4 | local SeqLabelRankLoader = 
torch.class('SeqLabelRankLoader') 5 | 6 | function SeqLabelRankLoader:__init(datafile, logger) 7 | -- sequence & pos match 8 | local data = torch.load(datafile) 9 | self.candidates = data.candidates 10 | self.sequences = data.sequences 11 | self.posIndex = data.posIndex 12 | 13 | -- additional variables 14 | self.batchSize = self.sequences[1]:size(2) 15 | self.numBatch = #self.sequences 16 | self.currIdx = 1 17 | self.indices = randperm(self.numBatch) 18 | 19 | if torch.Tensor():type() == 'torch.CudaTensor' then 20 | for i = 1, self.numBatch do 21 | self.candidates[i] = self.candidates[i]:cuda() 22 | self.sequences[i] = self.sequences[i]:cuda() 23 | end 24 | end 25 | 26 | if logger then 27 | self.logger = logger 28 | self.logger.info(string.rep('-', 50)) 29 | self.logger.info(string.format('SeqLabelRankLoader Configurations:')) 30 | self.logger.info(string.format(' number of batch : %d', self.numBatch)) 31 | self.logger.info(string.format(' data batch size : %d', self.batchSize)) 32 | end 33 | end 34 | 35 | -- sequences[dataIdx]: 2-D LongTensor, [seqLen x batchSize] 36 | -- posIndex[dataIdx]: 2-D LongTensor, [batchSize x numLabel] 37 | function SeqLabelRankLoader:nextBatch(circular) 38 | if self.currIdx > self.numBatch then 39 | self.currIdx = 1 40 | self.indices = randperm(self.numBatch) 41 | end 42 | local dataIdx 43 | if circular then 44 | dataIdx = self.currIdx 45 | else 46 | dataIdx = self.indices[self.currIdx] 47 | end 48 | self.currIdx = self.currIdx + 1 49 | 50 | return self.sequences[dataIdx], self.posIndex[dataIdx], self.candidates[dataIdx] 51 | end 52 | 53 | -- create torch-format data for SeqLabelRankLoader 54 | function createSeqLabelRankData(dataPath, savePath, wordVocab, numLabel) 55 | -- class variables 56 | local candidates = {} 57 | local sequences = {} 58 | local posIndex = {} 59 | 60 | -- read data fileh 61 | local file = io.open(dataPath, 'r') 62 | local batchIdx = 0 63 | local line 64 | 65 | while true do 66 | line = file:read() 67 | if line == nil then break end 68 | batchIdx = batchIdx + 1 69 | print ('batch '..batchIdx) 70 | local fields = stringx.split(line, '\t') 71 | 72 | -- fields[1]: language sequence 73 | local tokens = stringx.split(fields[1]) 74 | sequences[batchIdx] = torch.LongTensor(#tokens, 1) 75 | 76 | for i = 1, #tokens do 77 | local token = tokens[i] 78 | sequences[batchIdx][{i, 1}] = wordVocab:index(token) 79 | end 80 | 81 | -- fields[2]: correct label 82 | posIndex[batchIdx] = tonumber(fields[2]) + 1 83 | 84 | -- fields[3:] 85 | local numCandi = #fields - 2 86 | candidates[batchIdx] = torch.zeros(numCandi, numLabel) 87 | 88 | for candiIdx = 1, numCandi do 89 | local labels = stringx.split(fields[candiIdx+2]) 90 | for i = 1, #labels do 91 | index = tonumber(labels[i]) + 1 92 | candidates[batchIdx][{candiIdx, index}] = 1 93 | end 94 | end 95 | 96 | end 97 | file:close() 98 | 99 | local data = {} 100 | data.candidates = candidates 101 | data.sequences = sequences 102 | data.posIndex = posIndex 103 | 104 | torch.save(savePath, data) 105 | end 106 | -------------------------------------------------------------------------------- /src/data/SeqLabelingLoader.lua: -------------------------------------------------------------------------------- 1 | local SeqLabelingLoader = torch.class('SeqLabelingLoader') 2 | 3 | function SeqLabelingLoader:__init(datafile, logger) 4 | -- class variables 5 | local data = torch.load(datafile) 6 | self.sequences = data.seq 7 | self.seqLabels = data.label 8 | 9 | -- additional variables 10 | self.batchSize = 
self.sequences[1]:size(2) 11 | self.numBatch = #self.sequences 12 | self.currIdx = 1 13 | self.indices = randperm(self.numBatch) 14 | 15 | if torch.Tensor():type() == 'torch.CudaTensor' then 16 | for i = 1, self.numBatch do 17 | self.seqLabels[i] = self.seqLabels[i]:cuda() 18 | self.sequences[i] = self.sequences[i]:cuda() 19 | end 20 | end 21 | 22 | if logger then 23 | self.logger = logger 24 | self.logger.info(string.rep('-', 50)) 25 | self.logger.info(string.format('SeqLabelingLoader Configurations:')) 26 | self.logger.info(string.format(' number of batch: %d', self.numBatch)) 27 | self.logger.info(string.format(' data batch size: %d', self.batchSize)) 28 | end 29 | end 30 | 31 | function SeqLabelingLoader:nextBatch(circular) 32 | if self.currIdx > self.numBatch then 33 | self.currIdx = 1 34 | self.indices = randperm(self.numBatch) 35 | end 36 | local dataIdx 37 | if circular then 38 | dataIdx = self.currIdx 39 | else 40 | dataIdx = self.indices[self.currIdx] 41 | end 42 | self.currIdx = self.currIdx + 1 43 | return self.sequences[dataIdx], self.seqLabels[dataIdx] 44 | end 45 | 46 | -- create torch-format data for SeqLabelingLoader 47 | function createSeqLabelingData(dataPath, savePath, wordVocab, batchSize, noneLabel, trueLabel) 48 | -- class variable 49 | local sequences = {} 50 | local seqLabels = {} 51 | 52 | local noneLabel = noneLabel or 1 53 | local trueLabel = trueLabel or 2 54 | 55 | -- read data fileh 56 | local file = io.open(dataPath, 'r') 57 | local batchIdx = 0 -- the index of sequence batches 58 | local seqIdx = 0 -- sequence index within each batch 59 | local line 60 | 61 | while true do 62 | line = file:read() 63 | if line == nil then break end 64 | local fields = stringx.split(line, '\t') 65 | 66 | -- fields[1]: language sequence 67 | local tokens = stringx.split(fields[1]) 68 | 69 | -- fields[2]: label labels 70 | local labels = stringx.split(fields[2]) 71 | 72 | -- allocate tensor memory 73 | if seqIdx % batchSize == 0 then 74 | print('batch: '..batchIdx) 75 | seqIdx = 1 76 | batchIdx = batchIdx + 1 77 | sequences[batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 78 | seqLabels[batchIdx] = torch.DoubleTensor(#tokens, batchSize):fill(noneLabel) 79 | else 80 | seqIdx = seqIdx + 1 81 | end 82 | 83 | -- parse tokens into table 84 | for i = 1, #tokens do 85 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(tokens[i]) 86 | end 87 | 88 | -- parse labels into table 89 | if #labels == #tokens then 90 | for i = 1, #labels do 91 | seqLabels[batchIdx][{i, seqIdx}] = tonumber(labels[i]) 92 | end 93 | else 94 | for i = 1, #labels do 95 | seqLabels[batchIdx][{tonumber(labels[i]) + 1, seqIdx}] = trueLabel 96 | end 97 | end 98 | end 99 | file:close() 100 | 101 | local data = {} 102 | data.seq = sequences 103 | data.label = seqLabels 104 | 105 | torch.save(savePath, data) 106 | end -------------------------------------------------------------------------------- /src/data/SeqMultiLabelLoader.lua: -------------------------------------------------------------------------------- 1 | local SeqMultiLabelLoader = torch.class('SeqMultiLabelLoader') 2 | 3 | function SeqMultiLabelLoader:__init(datafile, logger) 4 | -- sequence & pos match 5 | local data = torch.load(datafile) 6 | self.sequences = data.sequences 7 | self.seqLabels = data.seqLabels 8 | if data.seqLength ~= nil then 9 | self.seqLength = data.seqLength 10 | end 11 | 12 | -- additional variables 13 | self.batchSize = self.sequences[1]:size(2) 14 | self.numBatch = #self.sequences 15 | self.currIdx = 1 16 | 
self.indices = randperm(self.numBatch) 17 | 18 | if torch.Tensor():type() == 'torch.CudaTensor' then 19 | for i = 1, self.numBatch do 20 | self.sequences[i] = self.sequences[i]:cuda() 21 | self.seqLabels[i] = self.seqLabels[i]:cuda() 22 | if self.seqLength ~= nil then 23 | self.seqLength[i] = self.seqLength[i]:cuda() 24 | end 25 | end 26 | end 27 | 28 | if logger then 29 | self.logger = logger 30 | self.logger.info(string.rep('-', 50)) 31 | self.logger.info(string.format('SeqMultiLabelLoader Configurations:')) 32 | self.logger.info(string.format(' number of batch : %d', self.numBatch)) 33 | self.logger.info(string.format(' data batch size : %d', self.batchSize)) 34 | end 35 | end 36 | 37 | function SeqMultiLabelLoader:nextBatch(circular) 38 | if self.currIdx > self.numBatch then 39 | self.currIdx = 1 40 | self.indices = randperm(self.numBatch) 41 | end 42 | local dataIdx 43 | if circular then 44 | dataIdx = self.currIdx 45 | else 46 | dataIdx = self.indices[self.currIdx] 47 | end 48 | self.currIdx = self.currIdx + 1 49 | 50 | if self.seqLength ~= nil then 51 | return self.sequences[dataIdx], self.seqLabels[dataIdx], self.seqLength[dataIdx] 52 | else 53 | return self.sequences[dataIdx], self.seqLabels[dataIdx] 54 | end 55 | end 56 | 57 | function createSeqMultiLabelData(dataPath, savePath, wordVocab, numLabel, batchSize) 58 | -- class variables 59 | local seqLabels = {} 60 | local sequences = {} 61 | local seqLength = {} 62 | 63 | -- read data fileh 64 | local file = io.open(dataPath, 'r') 65 | local batchIdx = 0 -- the index of sequence batches 66 | local seqIdx = 0 -- sequence index within each batch 67 | local line 68 | 69 | while true do 70 | line = file:read() 71 | if line == nil then break end 72 | local fields = stringx.split(line, '\t') 73 | 74 | -- fields[1]: language sequence 75 | local tokens = stringx.split(fields[1]) 76 | -- allocate tensor memory 77 | if seqIdx % batchSize == 0 then 78 | print('batch: '..batchIdx) 79 | seqIdx = 1 80 | batchIdx = batchIdx + 1 81 | sequences[batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 82 | seqLength[batchIdx] = torch.LongTensor(batchSize):fill(0) 83 | seqLabels[batchIdx] = torch.zeros(batchSize, numLabel) 84 | else 85 | seqIdx = seqIdx + 1 86 | end 87 | 88 | -- parse each token in sequence 89 | for i = 1, #tokens do 90 | local token = tokens[i] 91 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(token) 92 | end 93 | seqLength[batchIdx][seqIdx] = #tokens 94 | 95 | -- fields[2]: labels 96 | local labels = stringx.split(fields[2]) 97 | for i = 1, #labels do 98 | index = tonumber(labels[i]) + 1 99 | seqLabels[batchIdx][{seqIdx, index}] = 1 100 | end 101 | 102 | end 103 | file:close() 104 | 105 | local data = {} 106 | data.seqLabels = seqLabels 107 | data.sequences = sequences 108 | data.seqLength = seqLength 109 | 110 | torch.save(savePath, data) 111 | end 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CFO 2 | Code repo for [Conditional Focused Neural Question Answering with Large-scale Knowledge Bases](https://www.aclweb.org/anthology/P/P16/P16-1076.pdf) 3 | 4 | # Installation and Preprocessing 5 | 1. Refer to Virtuoso.md to install and confiture the software 6 | 2. Make sure [torch7](http://torch.ch/) is installed together with the following dependencies 7 | - logroll: `luarocks install logroll` 8 | - nngraph: `luarocks install nngraph` 9 | 3. 
After the installation and configuration of **Virtuoso**, run `bash data_preprocess.sh` to finish preprocessing 10 | 11 | # Training 12 | 13 | 1. Focused Lableing 14 | 15 | ``` 16 | cd FocusedLabeling 17 | th train_crf.lua 18 | ``` 19 | 20 | 2. Entity Type Vector 21 | 22 | ``` 23 | cd EntityTypeVec 24 | th train_ent_typevec.lua 25 | ``` 26 | 27 | 3. RNN based Relation Network 28 | 29 | ``` 30 | cd RelationRNN 31 | th train_rel_rnn.lua 32 | ``` 33 | 34 | # Inference 35 | In the following, define `SPLIT='valid' or 'test'`. 36 | 37 | 1. Run focused labeling on validation/test data 38 | ``` 39 | cd FocusedLabeling 40 | 41 | python generate_inference_data.py --split ${SPLIT} 42 | 43 | th process_inference.lua -testSplit ${SPLIT} 44 | th infer_crf.lua \ 45 | -testData inference-data/label.${SPLIT}.t7 \ 46 | -modelFile "path-to-pretrained-model" 47 | ``` 48 | - `python generate_inference_data.py --split ${SPLIT}` will create the file `label.${SPLIT}.txt` in the folder `FocusedLabeling/inference-data`; 49 | - `th process_inference.lua` will turn the text file `label.${SPLIT}.txt` into `label.${SPLIT}.t7` in torch format (both in the folder `FocusedLabeling/inference-data`); 50 | - `th infer_crf.lua ...` will generate the file `label.result.${SPLIT}` in the folder `FocusedLabeling`. 51 | 52 | 2. Query candidates based on focused labeling 53 | 54 | ``` 55 | cd Inference 56 | mkdir ${SPLIT} && cd ${SPLIT} 57 | python ../query_candidates.py 6 \ 58 | ../../PreprocessData/QAData.${SPLIT}.pkl \ 59 | ../../FocusedLabeling/label.result.${SPLIT} \ 60 | ../../KnowledgeBase/type.top-500.pkl 61 | ``` 62 | This step will generate the file `QAData.label.${SPLIT}.cpickle` in the folder `Inference/${SPLIT}`. 63 | 64 | 3. Generate score data based on the query results 65 | 66 | ``` 67 | cd Inference/${SPLIT} 68 | python ../generate_score_data.py QAData.label.${SPLIT}.cpickle 69 | ``` 70 | 71 | This step will generate the following files in the same folder `Inference/${SPLIT}`: 72 | 73 | - `rel.single.${SPLIT}.txt` (candidate relations for those with only a single candidate subject) 74 | - `rel.multi.${SPLIT}.txt` (candidate relations for those with only multiple candidate subject) 75 | - `type.multi.${SPLIT}.txt` (candidate entities for those with multiple candidate subjects) 76 | - `single.${SPLIT}.cpickle` 77 | - `multi.${SPLIT}.cpickle` 78 | 79 | 4. Run relation inference 80 | 81 | ``` 82 | cd RelationRNN 83 | mkdir inference-data 84 | th process_inference.lua -testSplit ${SPLIT} 85 | th infer_rel_rnn.lua -testData inference-data/rel.single.${SPLIT}.t7 86 | th infer_rel_rnn.lua -testData inference-data/rel.multi.${SPLIT}.t7 87 | ``` 88 | 89 | This step will generate the files `score.rel.single.${SPLIT}` and `score.rel.multi.${SPLIT}` in the folder `RelationRNN`. 90 | 91 | 5. Run entity inference 92 | 93 | ``` 94 | cd EntityTypeVec 95 | mkdir inference-data 96 | th process_inference.lua -testSplit ${SPLIT} 97 | th infer_ent_typevec.lua -testData inference-data/ent.${SPLIT}.t7 98 | ``` 99 | 100 | This step will generate the file `score.ent.multi.multi.${SPLIT}` in the folder `EntityTypeVec`. 101 | 102 | 6. 
Run joint disambiguation 103 | 104 | ``` 105 | cd Inference/${SPLIT} 106 | python ../joint_disambiguation.py multi.${SPLIT}.cpickle \ 107 | ../../RelationRNN/score.rel.multi.${SPLIT} \ 108 | ../../EntityTypeVec/score.ent.multi.multi.${SPLIT} 109 | ``` 110 | 111 | -------------------------------------------------------------------------------- /src/data/RankingDataLoader.lua: -------------------------------------------------------------------------------- 1 | local RankingDataLoader = torch.class('RankingDataLoader') 2 | 3 | function RankingDataLoader:__init(datafile, logger) 4 | -- class variables 5 | local data = torch.load(datafile) 6 | self.sequences = data.seq 7 | self.seqLengths = data.len 8 | self.posMatches = data.pos 9 | self.negMatches = data.neg 10 | 11 | -- additional variables 12 | self.batchSize = self.sequences[1]:size(2) 13 | self.numBatch = #self.sequences 14 | self.negSize = self.negMatches[1]:size(1) 15 | self.currIdx = 1 16 | self.indices = randperm(self.numBatch) 17 | 18 | if torch.Tensor():type() == 'torch.CudaTensor' then 19 | for i = 1, self.numBatch do 20 | self.posMatches[i] = self.posMatches[i]:cuda() 21 | self.negMatches[i] = self.negMatches[i]:cuda() 22 | self.seqLengths[i] = self.seqLengths[i]:cuda() 23 | self.sequences[i] = self.sequences[i]:cuda() 24 | end 25 | end 26 | 27 | if logger then 28 | self.logger = logger 29 | self.logger.info(string.rep('-', 50)) 30 | self.logger.info(string.format('RankingDataLoader Configurations:')) 31 | self.logger.info(string.format(' number of batch: %d', self.numBatch)) 32 | self.logger.info(string.format(' data batch size: %d', self.batchSize)) 33 | self.logger.info(string.format(' neg sample size: %d', self.negSize)) 34 | end 35 | end 36 | 37 | function RankingDataLoader:nextBatch(circular) 38 | if self.currIdx > self.numBatch then 39 | self.currIdx = 1 40 | self.indices = randperm(self.numBatch) 41 | end 42 | local dataIdx 43 | if circular then 44 | dataIdx = self.currIdx 45 | else 46 | dataIdx = self.indices[self.currIdx] 47 | end 48 | self.currIdx = self.currIdx + 1 49 | return self.sequences[dataIdx], self.posMatches[dataIdx], self.negMatches[dataIdx], self.seqLengths[dataIdx] 50 | end 51 | 52 | function createRankingData(dataPath, savePath, wordVocab, fbVocab, batchSize) 53 | -- class variables 54 | local posMatches = {} 55 | local negMatches = {} 56 | local seqLengths = {} 57 | local sequences = {} 58 | 59 | -- read data fileh 60 | local file = io.open(dataPath, 'r') 61 | local batchIdx = 0 -- the index of sequence batches 62 | local seqIdx = 0 -- sequence index within each batch 63 | local line 64 | 65 | while true do 66 | line = file:read() 67 | if line == nil then break end 68 | local fields = stringx.split(line, '\t') 69 | 70 | -- fields[1]: language sequence 71 | local tokens = stringx.split(fields[1]) 72 | -- allocate tensor memory 73 | if seqIdx % batchSize == 0 then 74 | seqIdx = 1 75 | batchIdx = batchIdx + 1 76 | sequences[batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 77 | seqLengths[batchIdx] = torch.LongTensor(batchSize):fill(0) 78 | posMatches[batchIdx] = torch.LongTensor(batchSize):fill(0) 79 | negMatches[batchIdx] = torch.LongTensor(#fields-2, batchSize):fill(0) 80 | else 81 | seqIdx = seqIdx + 1 82 | end 83 | 84 | -- parse each token in sequence 85 | for i = 1, #tokens do 86 | local token = tokens[i] 87 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(token) 88 | end 89 | seqLengths[batchIdx][seqIdx] = #tokens 90 | 91 | -- fields[2]: positive match 92 | 
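-- Each line of the input file is expected to be tab-separated as
--   question tokens <TAB> positive_id <TAB> negative_id_1 <TAB> ... <TAB> negative_id_k
-- so fields[3..#fields] are pre-mined negatives; unlike SeqRankingLoader below,
-- negatives are read from the file here rather than sampled at training time.
-- Resulting batch layout (sizes follow the allocations above):
--   sequences[b]  : (#tokens x batchSize)      word indices, padded with pad_index
--   posMatches[b] : (batchSize)                fbVocab ids of the positive match
--   negMatches[b] : ((#fields-2) x batchSize)  fbVocab ids of the negatives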
posMatches[batchIdx][seqIdx] = fbVocab:index(fields[2]) 93 | 94 | -- fields[3-#fields]: negative match 95 | for i = 3, #fields do 96 | negMatches[batchIdx][{i-2, seqIdx}] = fbVocab:index(fields[i]) 97 | end 98 | end 99 | file:close() 100 | 101 | local data = {} 102 | data.pos = posMatches 103 | data.neg = negMatches 104 | data.len = seqLengths 105 | data.seq = sequences 106 | 107 | torch.save(savePath, data) 108 | end 109 | -------------------------------------------------------------------------------- /SimpleQuestions/generate_training_data.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import io 3 | import cPickle as pickle 4 | 5 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 6 | from QAData import * 7 | import virtuoso 8 | 9 | def focused_labeling_data(data_list): 10 | with io.open('trainingData/data.train.focused_labeling', 'w', encoding='utf8') as fo: 11 | for data in data_list: 12 | if data.text_attention_indices: 13 | fo.write(u'%s\t%s\n' % (data.question, ' '.join([str(index) for index in data.text_attention_indices]))) 14 | 15 | def relation_ranking_data(data_list): 16 | fo = io.open('trainingData/data.train.relation_ranking', 'w', encoding='utf8') 17 | 18 | # Main Loop 19 | data_turple = [] 20 | data_num = 0 21 | for data in data_list: 22 | question = data.question 23 | pos_rel = data.relation 24 | 25 | # this condition will filter out any question that has only one word 26 | if len(question.split()) > 1: 27 | data_turple.append((question, pos_rel)) 28 | data_num += 1 29 | 30 | # will choose to output data according to indices 31 | chosen_num = data_num - (data_num % 256) 32 | chosen_indices = np.sort(np.random.permutation(data_num)[:chosen_num]) 33 | 34 | chosen_indices_idx = 0 35 | # for each data triple in data_turple list 36 | for idx in range(len(data_turple)): 37 | question = data_turple[idx][0] 38 | pos_rel = data_turple[idx][1] 39 | if idx == chosen_indices[chosen_indices_idx]: 40 | fo.write(u'%s\t%s\n' % (question, pos_rel)) 41 | chosen_indices_idx += 1 42 | 43 | fo.close() 44 | 45 | def entity_ranking_data(data_list): 46 | fo = io.open('trainingData/data.train.entity_ranking', 'w', encoding='utf8') 47 | 48 | # Main Loop 49 | data_turple = [] 50 | data_num = 0 51 | for data in data_list: 52 | pos_sub = data.subject 53 | pos_rel = data.relation 54 | question = data.question 55 | 56 | # this condition will filter out any question that has only one word 57 | if len(question.split()) > 1: 58 | data_turple.append((question, pos_sub, pos_rel)) 59 | data_num += 1 60 | 61 | # will choose to output data according to indices 62 | chosen_num = data_num - (data_num % 256) 63 | chosen_indices = np.sort(np.random.permutation(data_num)[:chosen_num]) 64 | 65 | chosen_indices_idx = 0 66 | # for each data triple in data_turple list 67 | for idx in range(len(data_turple)): 68 | question = data_turple[idx][0] 69 | pos_sub = data_turple[idx][1] 70 | pos_rel = data_turple[idx][2] 71 | if idx == chosen_indices[chosen_indices_idx]: 72 | fo.write(u'%s\t%s\t%s\n' % (question, pos_sub, pos_rel)) 73 | chosen_indices_idx += 1 74 | 75 | fo.close() 76 | 77 | def entity_typevec_data(data_list): 78 | type_dict = pickle.load(file('../KnowledgeBase/type.top-500.pkl', 'rb')) 79 | with io.open('trainingData/data.train.entity_typevec', 'w', encoding='utf8') as fo: 80 | for data in data_list: 81 | sub = data.subject 82 | question = data.question 83 | types = 
virtuoso.id_query_type(sub) 84 | types = [t for t in types if type_dict.has_key(t)] 85 | if len(types) > 0: 86 | fo.write(u'%s\t%s\n' % (question, ' '.join([str(type_dict[t]) for t in types]))) 87 | else: 88 | fo.write(u'%s\t%d\n' % (question, len(type_dict))) 89 | 90 | 91 | if __name__ == '__main__': 92 | data_list = pickle.load(file('PreprocessData/QAData.train.pkl', 'rb')) 93 | if not os.path.exists('trainingData'): 94 | os.mkdir('trainingData') 95 | print >> sys.stderr, 'focused_labeling_data' 96 | focused_labeling_data(data_list) 97 | print >> sys.stderr, 'relation_ranking_data' 98 | relation_ranking_data(data_list) 99 | # print >> sys.stderr, 'entity_ranking_data' 100 | # entity_ranking_data(data_list) 101 | print >> sys.stderr, 'entity_typevec_data' 102 | entity_typevec_data(data_list) 103 | -------------------------------------------------------------------------------- /src/py_module/freebase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import urllib 4 | import re 5 | 6 | api_key = 'AIzaSyAsmMIiVDkF2Vfjt3cDwSHCmHF7QTS0_kY' 7 | 8 | def suggest_id(query_string): 9 | service_url = 'https://www.googleapis.com/freebase/v1/search' 10 | params = { 11 | 'query': query_string, 12 | 'key': api_key 13 | } 14 | url = service_url + '?' + urllib.urlencode(params) 15 | response = json.loads(urllib.urlopen(url).read()) 16 | 17 | suggested_entity = [] 18 | for result in response['result']: 19 | if result['mid'].startswith('/m/'): 20 | suggested_entity.append('fb:m.' + str(result['mid'].split('/m/')[-1])) 21 | 22 | return suggested_entity 23 | 24 | def mid2name(entity_mid): 25 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 26 | query = [{'id': None, 'mid': entity_mid, 'name': None}] 27 | 28 | params = { 29 | 'query': json.dumps(query), 30 | 'key': api_key 31 | } 32 | 33 | url = service_url + '?' + urllib.urlencode(params) 34 | response = json.loads(urllib.urlopen(url).read()) 35 | 36 | if response['result'][0].has_key('name') and response['result'][0]['name']: 37 | return response['result'][0]['name'].encode('utf-8') 38 | else: 39 | return None 40 | 41 | def mid2id(entity_mid): 42 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 43 | query = [{'mid': entity_mid, 'id': None}] 44 | 45 | params = { 46 | 'query': json.dumps(query), 47 | 'key': api_key 48 | } 49 | 50 | url = service_url + '?' + urllib.urlencode(params) 51 | response = json.loads(urllib.urlopen(url).read()) 52 | 53 | if response['result'][0].has_key('id'): 54 | return response['result'][0]['id'] 55 | else: 56 | return None 57 | 58 | 59 | def id2mid(entity_id): 60 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 61 | query = [{'id': entity_id, 'mid': None}] 62 | 63 | params = { 64 | 'query': json.dumps(query), 65 | 'key': api_key 66 | } 67 | 68 | url = service_url + '?' + urllib.urlencode(params) 69 | response = json.loads(urllib.urlopen(url).read()) 70 | 71 | if response['result'][0].has_key('mid'): 72 | return response['result'][0]['mid'] 73 | else: 74 | return None 75 | 76 | def name2mids(entity_name): 77 | service_url = 'https://www.googleapis.com/freebase/v1/mqlread' 78 | query = [{'name': entity_name, 'mid': None, '/common/topic/alias': []}] 79 | 80 | params = { 81 | 'query': json.dumps(query), 82 | 'key': api_key 83 | } 84 | 85 | url = service_url + '?' 
+ urllib.urlencode(params) 86 | response = json.loads(urllib.urlopen(url).read()) 87 | 88 | mid_list = [] 89 | for res in response['result']: 90 | if res.has_key('mid'): 91 | mid_list.append(str(res['mid'])) 92 | return mid_list 93 | 94 | def unquotekey(key, encoding=None): 95 | """ 96 | unquote a namespace key and turn it into a unicode string 97 | """ 98 | 99 | valid_always = string.ascii_letters + string.digits 100 | 101 | output = [] 102 | i = 0 103 | while i < len(key): 104 | if key[i] in valid_always: 105 | output.append(key[i]) 106 | i += 1 107 | elif key[i] in '_-' and i != 0 and i != len(key): 108 | output.append(key[i]) 109 | i += 1 110 | elif key[i] == '$' and i+4 < len(key): 111 | # may raise ValueError if there are invalid characters 112 | output.append(unichr(int(key[i+1:i+5],16))) 113 | i += 5 114 | else: 115 | raise ValueError, "unquote key saw invalid character '%s' at position %d" % (key[i], i) 116 | 117 | ustr = u''.join(output) 118 | 119 | if encoding is None: 120 | return ustr 121 | 122 | return ustr.encode(encoding) 123 | 124 | # used to escape strings for sparql query 125 | def escape_string(s): 126 | escape_map = { 127 | '"' : '\\"', 128 | '\r': '\\r', 129 | '\n': '\\n', 130 | '\t': '\\t', 131 | '\b': '\\b', 132 | '\f': '\\f' 133 | } 134 | s = s.replace('\\','\\u005c\\u005c') 135 | for key, value in escape_map.items(): 136 | s = s.replace(key,value) 137 | return '"' + s + '"' 138 | 139 | # used to escape strings for sparql query 140 | def unescape_string(s): 141 | unescape_map = { 142 | '\\"': '"' , 143 | '\\r': '\r', 144 | '\\n': '\n', 145 | '\\t': '\t', 146 | '\\b': '\b', 147 | '\\f': '\f' 148 | } 149 | # strip the quote " on both sides 150 | s = s[1:-1] 151 | for key, value in unescape_map.items(): 152 | s = s.replace(key,value) 153 | s = s.replace('\\u005c\\u005c', '\\') 154 | return s 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/model/Linear.lua: -------------------------------------------------------------------------------- 1 | local Linear, parent = torch.class('Linear', 'nn.Module') 2 | 3 | function Linear:__init(inputSize, outputSize, bias) 4 | parent.__init(self) 5 | local bias = ((bias == nil) and true) or bias 6 | self.weight = torch.Tensor(outputSize, inputSize) 7 | self.gradWeight = torch.Tensor(outputSize, inputSize) 8 | if bias then 9 | self.bias = torch.Tensor(outputSize) 10 | self.gradBias = torch.Tensor(outputSize) 11 | end 12 | self:reset() 13 | end 14 | 15 | function Linear:reset(stdv) 16 | if stdv then 17 | stdv = stdv * math.sqrt(3) 18 | else 19 | stdv = 1./math.sqrt(self.weight:size(2)) 20 | end 21 | if nn.oldSeed then 22 | for i=1,self.weight:size(1) do 23 | self.weight:select(1, i):apply(function() 24 | return torch.uniform(-stdv, stdv) 25 | end) 26 | end 27 | if self.bias then 28 | for i=1,self.bias:nElement() do 29 | self.bias[i] = torch.uniform(-stdv, stdv) 30 | end 31 | end 32 | else 33 | self.weight:uniform(-stdv, stdv) 34 | if self.bias then self.bias:uniform(-stdv, stdv) end 35 | end 36 | return self 37 | end 38 | 39 | function Linear:updateOutput(input) 40 | if input:dim() == 1 then 41 | self.output:resize(self.weight:size(1)) 42 | if self.bias then self.output:copy(self.bias) else self.output:zero() end 43 | self.output:addmv(1, self.weight, input) 44 | elseif input:dim() == 2 then 45 | local nframe = input:size(1) 46 | local nElement = self.output:nElement() 47 | self.output:resize(nframe, self.weight:size(1)) 48 | if self.output:nElement() ~= nElement then 49 | 
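-- After a resize to a different element count the storage may hold garbage
-- (e.g. NaNs), so it is zeroed before the matrix product below writes into it.
-- self.addBuffer is a ones-vector of length nframe; addr() uses it to add the
-- bias to every row of the mini-batch in a single outer-product call.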
self.output:zero() 50 | end 51 | self.addBuffer = self.addBuffer or input.new() 52 | if self.addBuffer:nElement() ~= nframe then 53 | self.addBuffer:resize(nframe):fill(1) 54 | end 55 | self.output:addmm(0, self.output, 1, input, self.weight:t()) 56 | if self.bias then self.output:addr(1, self.addBuffer, self.bias) end 57 | elseif input:dim() >= 3 then 58 | -- computation happens in 2D views 59 | local dInput = input:view(-1, self.weight:size(2)) 60 | local nframe = dInput:size(1) 61 | self.output:resize(nframe, self.weight:size(1)) 62 | if self.output:nElement() ~= nElement then 63 | self.output:zero() 64 | end 65 | self.addBuffer = self.addBuffer or input.new() 66 | if self.addBuffer:nElement() ~= nframe then 67 | self.addBuffer:resize(nframe):fill(1) 68 | end 69 | self.output:addmm(0, self.output, 1, dInput, self.weight:t()) 70 | if self.bias then self.output:addr(1, self.addBuffer, self.bias) end 71 | 72 | -- re-view output according to the input size 73 | local sizes = input:size() 74 | sizes[input:dim()] = self.weight:size(1) 75 | self.output = self.output:view(sizes) 76 | else 77 | error('input must be 1D, 2D or 3D Tensor') 78 | end 79 | 80 | return self.output 81 | end 82 | 83 | function Linear:updateGradInput(input, gradOutput) 84 | if self.gradInput then 85 | 86 | local nElement = self.gradInput:nElement() 87 | self.gradInput:resizeAs(input) 88 | if self.gradInput:nElement() ~= nElement then 89 | self.gradInput:zero() 90 | end 91 | if input:dim() == 1 then 92 | self.gradInput:addmv(0, 1, self.weight:t(), gradOutput) 93 | elseif input:dim() == 2 then 94 | self.gradInput:addmm(0, 1, gradOutput, self.weight) 95 | elseif input:dim() >= 3 then 96 | local dGradInput = self.gradInput:view(-1, self.weight:size(2)) 97 | local dGradOutput = gradOutput:view(-1, self.weight:size(1)) 98 | dGradInput:addmm(0, 1, dGradOutput, self.weight) 99 | end 100 | 101 | return self.gradInput 102 | end 103 | end 104 | 105 | function Linear:accGradParameters(input, gradOutput, scale) 106 | scale = scale or 1 107 | if input:dim() == 1 then 108 | self.gradWeight:addr(scale, gradOutput, input) 109 | if self.bias then self.gradBias:add(scale, gradOutput) end 110 | elseif input:dim() == 2 then 111 | self.gradWeight:addmm(scale, gradOutput:t(), input) 112 | if self.bias then 113 | self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer) 114 | end 115 | elseif input:dim() == 3 then 116 | local dGradOutput = gradOutput:view(-1, self.weight:size(1)) 117 | local dInput = input:view(-1, self.weight:size(2)) 118 | self.gradWeight:addmm(scale, dGradOutput:t(), dInput) 119 | if self.bias then 120 | self.gradBias:addmv(scale, dGradOutput:t(), self.addBuffer) 121 | end 122 | end 123 | end 124 | 125 | -- we do not need to accumulate parameters when sharing 126 | Linear.sharedAccUpdateGradParameters = Linear.accUpdateGradParameters 127 | 128 | 129 | function Linear:__tostring__() 130 | return torch.type(self) .. 131 | string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1)) .. 
132 | (self.bias == nil and ' without bias' or '') 133 | end 134 | -------------------------------------------------------------------------------- /Inference/joint_predict.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import glob 3 | import cPickle as pickle 4 | import numpy as np 5 | from sklearn import preprocessing 6 | 7 | sys.path.append(os.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 8 | from QAData import * 9 | 10 | def predict_func(data, rel_scores, ent_scores, alpha, top_rel_ratio): 11 | rel_scores = np.array(rel_scores) 12 | ent_scores = np.array(ent_scores) 13 | 14 | ent_threshold = np.min(ent_scores) 15 | top_sub_ids = np.where(ent_scores >= ent_threshold)[0] 16 | 17 | rel_threshold = top_rel_ratio * (np.max(rel_scores) - np.min(rel_scores)) + np.min(rel_scores) 18 | top_rel_ids = np.where(rel_scores >= rel_threshold)[0] 19 | 20 | rel_id_dict = {data.cand_rel[rel_id]:i for i, rel_id in enumerate(top_rel_ids)} 21 | 22 | score_mat = np.zeros((top_sub_ids.shape[0], top_rel_ids.shape[0])) 23 | 24 | for row_idx, sub_id in enumerate(top_sub_ids): 25 | for rel in data.sub_rels[sub_id]: 26 | if rel_id_dict.has_key(rel): 27 | col_idx = rel_id_dict[rel] 28 | #score_mat[row_idx, col_idx] = rel_scores[top_rel_ids[col_idx]] 29 | score_mat[row_idx, col_idx] = 1 30 | 31 | # compute all the terms 32 | ent_scores = ent_scores[top_sub_ids] 33 | rel_scores = rel_scores[top_rel_ids] 34 | score_mat = np.exp(score_mat * alpha + ent_scores.reshape(score_mat.shape[0], 1) * (1 - alpha)) 35 | 36 | # normalization 37 | score_mat /= np.sum(score_mat, 0) 38 | 39 | score_mat *= np.exp(rel_scores) 40 | 41 | top_sub_id, top_rel_id = np.unravel_index(np.argmax(score_mat), score_mat.shape) 42 | 43 | return [data.cand_sub[top_sub_ids[top_sub_id]]], data.cand_rel[top_rel_ids[top_rel_id]] 44 | 45 | if __name__ == '__main__': 46 | # Parse input argument 47 | if len(sys.argv) == 5: 48 | data_fn = sys.argv[1] 49 | rel_score_fn = sys.argv[2] 50 | ent_score_fn = sys.argv[3] 51 | alpha = float(sys.argv[4]) 52 | top_rel_ratio = 0.0 53 | elif len(sys.argv) == 6: 54 | data_fn = sys.argv[1] 55 | rel_score_fn = sys.argv[2] 56 | ent_score_fn = sys.argv[3] 57 | alpha = float(sys.argv[4]) 58 | top_rel_ratio = float(sys.argv[5]) 59 | else: 60 | print 'Wrong arguments. 
Usage: ' 61 | print ' python joint_disambiguation.py cpickle_file rel_score_file ent_score_file alpha [[rel_ratio]]' 62 | sys.exit(1) 63 | 64 | chosen_subs = 0 65 | total_subs = 0 66 | 67 | # Error information 68 | error_dir = './error_analysis' 69 | if not os.path.exists(error_dir): 70 | os.makedirs(error_dir) 71 | category = data_fn.split('.')[0] 72 | f_0_0 = file(os.path.join(error_dir, 'sub_cor_rel_cor.%s.txt'%(category)), 'wb') 73 | f_0_1 = file(os.path.join(error_dir, 'sub_cor_rel_err.%s.txt'%(category)), 'wb') 74 | f_1_0 = file(os.path.join(error_dir, 'sub_err_rel_cor.%s.txt'%(category)), 'wb') 75 | f_1_1 = file(os.path.join(error_dir, 'sub_err_rel_err.%s.txt'%(category)), 'wb') 76 | 77 | # Further disambiguation 78 | suffix = sys.argv[1].split('.')[-2] 79 | 80 | # Load cPickle file into data 81 | data_list = pickle.load(file(data_fn, 'rb')) 82 | print >> sys.stderr, 'finish loading cpickle file %d' % (len(data_list)) 83 | 84 | rel_score_list = file(rel_score_fn, 'rb').readlines() 85 | if ent_score_fn: 86 | ent_score_list = file(ent_score_fn, 'rb').readlines() 87 | 88 | # Count the totol number of data 89 | corr_mat = np.zeros((2,2)) 90 | 91 | for idx, data in enumerate(data_list): 92 | rel_scores = [float(score) for score in rel_score_list[idx].strip().split(' ')] 93 | if ent_score_fn: 94 | ent_scores = [float(score) for score in ent_score_list[idx].strip().split(' ')] 95 | top_sub, top_rel = predict_func(data, rel_scores, ent_scores, alpha, top_rel_ratio) 96 | else: 97 | top_sub, top_rel = rel_based(data, rel_scores) 98 | 99 | if len(top_sub) == 1 and top_sub[0] == data.subject: 100 | if top_rel == data.relation: 101 | corr_mat[0,0] += 1 102 | print >> f_0_0, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 103 | else: 104 | corr_mat[0,1] += 1 105 | print >> f_0_1, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 106 | else: 107 | if top_rel == data.relation: 108 | corr_mat[1,0] += 1 109 | print >> f_1_0, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 110 | else: 111 | corr_mat[1,1] += 1 112 | print >> f_1_1, '%s\t%s\t%s\t%s\t%s' % (data.question, fb2www(data.subject), fb2www(top_sub), data.relation, top_rel) 113 | 114 | 115 | print alpha 116 | print corr_mat / len(data_list) 117 | print corr_mat 118 | 119 | f_0_0.close() 120 | f_0_1.close() 121 | f_1_0.close() 122 | f_1_1.close() 123 | -------------------------------------------------------------------------------- /src/data/SeqRankingLoader.lua: -------------------------------------------------------------------------------- 1 | local SeqRankingLoader = torch.class('SeqRankingLoader') 2 | 3 | function SeqRankingLoader:__init(datafile, negSize, negRange, logger) 4 | -- sequence & pos match 5 | local data = torch.load(datafile) 6 | self.sequences = data.seq 7 | self.posMatches = data.pos 8 | if data.len ~= nil then 9 | self.seqLengths = data.len 10 | end 11 | 12 | -- for negative sampling 13 | self.negSize = negSize 14 | self.negRange = negRange 15 | 16 | -- additional variables 17 | self.batchSize = self.sequences[1]:size(2) 18 | self.numBatch = #self.sequences 19 | self.currIdx = 1 20 | self.indices = randperm(self.numBatch) 21 | 22 | -- allocate memory 23 | self._negMatch = torch.LongTensor(self.negSize, self.batchSize) 24 | self._posMatch = torch.LongTensor(1, self.batchSize):expand(self.negSize, self.batchSize) 25 | 26 | if torch.Tensor():type() == 'torch.CudaTensor' 
then 27 | for i = 1, self.numBatch do 28 | self.sequences[i] = self.sequences[i]:cuda() 29 | self.posMatches[i] = self.posMatches[i]:cuda() 30 | if self.seqLengths ~= nil then 31 | self.seqLengths[i] = self.seqLengths[i]:cuda() 32 | end 33 | end 34 | self.negMatch = torch.CudaTensor(self.negSize, self.batchSize) 35 | else 36 | self.negMatch = self._negMatch 37 | end 38 | 39 | if logger then 40 | self.logger = logger 41 | self.logger.info(string.rep('-', 50)) 42 | self.logger.info(string.format('SeqRankingLoader Configurations:')) 43 | self.logger.info(string.format(' number of batch : %d', self.numBatch)) 44 | self.logger.info(string.format(' data batch size : %d', self.batchSize)) 45 | self.logger.info(string.format(' neg sample size : %d', self.negSize)) 46 | self.logger.info(string.format(' neg sample range: %d', self.negRange)) 47 | end 48 | end 49 | 50 | function SeqRankingLoader:setNegSize(negSize) 51 | self.negSize = negSize 52 | 53 | -- allocate memory 54 | self._negMatch = torch.LongTensor(self.negSize, self.batchSize) 55 | self._posMatch = torch.LongTensor(1, self.batchSize):expand(self.negSize, self.batchSize) 56 | 57 | if torch.Tensor():type() == 'torch.CudaTensor' then 58 | self.negMatch = torch.CudaTensor(self.negSize, self.batchSize) 59 | end 60 | end 61 | 62 | function SeqRankingLoader:nextBatch(circular) 63 | if self.currIdx > self.numBatch then 64 | self.currIdx = 1 65 | self.indices = randperm(self.numBatch) 66 | end 67 | local dataIdx 68 | if circular then 69 | dataIdx = self.currIdx 70 | else 71 | dataIdx = self.indices[self.currIdx] 72 | end 73 | self.currIdx = self.currIdx + 1 74 | 75 | self._posMatch:storage():copy(self.posMatches[dataIdx]:storage()) 76 | self._negMatch:random(1, self.negRange) 77 | 78 | while torch.sum(torch.eq(self._negMatch, self._posMatch)) > 0 do 79 | self._negMatch:maskedFill(torch.eq(self._negMatch, self._posMatch), math.random(1, self.negRange)) 80 | end 81 | 82 | if torch.Tensor():type() == 'torch.CudaTensor' then 83 | self.negMatch:copy(self._negMatch) 84 | end 85 | if self.seqLengths ~= nil then 86 | return self.sequences[dataIdx], self.posMatches[dataIdx], self.negMatch, self.seqLengths[dataIdx] 87 | else 88 | return self.sequences[dataIdx], self.posMatches[dataIdx], self.negMatch 89 | end 90 | 91 | end 92 | 93 | function createSeqRankingData(dataPath, savePath, wordVocab, fbVocab, batchSize) 94 | -- class variables 95 | local posMatches = {} 96 | local seqLengths = {} 97 | local sequences = {} 98 | 99 | -- read data fileh 100 | local file = io.open(dataPath, 'r') 101 | local batchIdx = 0 -- the index of sequence batches 102 | local seqIdx = 0 -- sequence index within each batch 103 | local line 104 | 105 | while true do 106 | line = file:read() 107 | if line == nil then break end 108 | local fields = stringx.split(line, '\t') 109 | 110 | -- fields[1]: language sequence 111 | local tokens = stringx.split(fields[1]) 112 | 113 | -- allocate tensor memory 114 | if seqIdx % batchSize == 0 then 115 | print('batch: '..batchIdx) 116 | seqIdx = 1 117 | batchIdx = batchIdx + 1 118 | posMatches[batchIdx] = torch.LongTensor(batchSize):fill(0) 119 | seqLengths[batchIdx] = torch.LongTensor(batchSize):fill(0) 120 | sequences [batchIdx] = torch.LongTensor(#tokens, batchSize):fill(wordVocab.pad_index) 121 | else 122 | seqIdx = seqIdx + 1 123 | end 124 | 125 | -- parse each token in sequence 126 | for i = 1, #tokens do 127 | local token = tokens[i] 128 | sequences[batchIdx][{i, seqIdx}] = wordVocab:index(token) 129 | end 130 | 
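-- The relation-ranking file stores positives only, one "question <TAB> fb:relation"
-- pair per line; negatives are not kept here but drawn uniformly from [1, negRange]
-- in nextBatch() above, with accidental collisions with the positive id resampled.
-- The true token count of each question is recorded alongside the padded matrix.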
seqLengths[batchIdx][seqIdx] = #tokens 131 | 132 | -- fields[2]: positive match 133 | posMatches[batchIdx][seqIdx] = fbVocab:index(fields[2]) 134 | 135 | end 136 | file:close() 137 | 138 | local data = {} 139 | data.pos = posMatches 140 | data.len = seqLengths 141 | data.seq = sequences 142 | 143 | torch.save(savePath, data) 144 | end 145 | -------------------------------------------------------------------------------- /FocusedLabeling/train_crf.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Comandline Options') 8 | 9 | cmd:option('-wordVocabSize',100002,'number of words in dictionary') 10 | cmd:option('-wordEmbedDim',300,'size of word embedding') 11 | cmd:option('-wordEmbedPath','../embedding/word.100k.glove.t7','pretained word embedding path') 12 | 13 | cmd:option('-hiddenSize',256,'size of BiGRU unit') 14 | cmd:option('-outputType',1,'output type of each rnn layer') 15 | cmd:option('-numLayer',2,'number of BiGRU layers') 16 | cmd:option('-maxSeqLen',40,'number of steps the BiGRU needs to unroll') 17 | 18 | cmd:option('-numClass',2,'number of classes in classification') 19 | 20 | cmd:option('-trainData','../data/train.focused_labeling.t7','training data file') 21 | 22 | cmd:option('-initRange',0.08,'the range of uniformly initialize parameters') 23 | cmd:option('-momentumEpoch',1,'after which epoch, the model starts to increase momentum') 24 | cmd:option('-maxEpochs',100,'number of full passes through the training data') 25 | 26 | cmd:option('-printEvery',50,'the frequency (# minibatches) of logging loss information') 27 | cmd:option('-logFile','logs/log.BiGRU','log file to record training information') 28 | cmd:option('-saveEvery',10,'the frequency (# epochs) of automatic saving trained models') 29 | cmd:option('-saveFile','model.BiGRU','filename for saving trained model') 30 | 31 | cmd:option('-useGPU',1,'which GPU is used for computation') 32 | 33 | cmd:text() 34 | 35 | ----------------------------- Basic Options ----------------------------- 36 | 37 | local opt = cmd:parse(arg) 38 | local flog = logroll.file_logger(opt.logFile) 39 | -- local flog = logroll.print_logger() 40 | 41 | if opt.useGPU > 0 then 42 | require 'cutorch' 43 | require 'cunn' 44 | cutorch.setDevice(opt.useGPU) 45 | torch.setdefaulttensortype('torch.CudaTensor') 46 | flog.info(string.rep('-', 50)) 47 | flog.info('Set default tensor type to CudaTensor') 48 | torch.manualSeed(1) 49 | cutorch.manualSeed(1) 50 | end 51 | 52 | ----------------------------- Data Loader ----------------------------- 53 | local loader = SeqLabelingLoader(opt.trainData, flog) 54 | 55 | ----------------------------- Init Models ----------------------------- 56 | -- Init word embedding model 57 | local wordEmbed = cudacheck(nn.LookupTable(opt.wordVocabSize, opt.wordEmbedDim)) 58 | -- loadPretrainedEmbed(wordEmbed, opt.wordEmbedPath) 59 | 60 | -- Init Stacked BiGRU 61 | local rnnconfig = { 62 | hiddenSize = opt.hiddenSize, 63 | maxSeqLen = opt.maxSeqLen, 64 | maxBatch = loader.batchSize, 65 | logger = flog 66 | } 67 | local RNN = {} 68 | for l = 1, opt.numLayer do 69 | rnnconfig.inputSize = l == 1 and opt.wordEmbedDim or opt.hiddenSize * 2 70 | RNN[l] = BiGRU(rnnconfig) 71 | end 72 | 73 | -- Init linear project model 74 | local linear = Linear(opt.hiddenSize*2, opt.numClass) 75 | 76 | -- Init the linear CRF 77 | local 
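-- The CRF sits on top of the per-token class scores from the stacked BiGRU and
-- Linear below. During training it is called as linearCRF:forward({seqVec, labels})
-- and returns a probability score for the gold label sequence (averaged into
-- avgProb further down); in evaluate() mode a plain forward(seqVec) decodes a
-- predicted label sequence, as used in the precision/recall logging block.
-- Rough tensor shapes, assuming the options above:
--   seq    : (seqLen x batchSize)             word indices from SeqLabelingLoader
--   seqVec : (seqLen x batchSize x numClass)  per-token scores after Linear
--   labels : (seqLen x batchSize)             1 = outside mention, 2 = subject token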
linearCRF = CRF(opt.numClass, opt.maxSeqLen, loader.batchSize) 78 | 79 | local seqModel = nn.Sequential() 80 | seqModel:add(wordEmbed) 81 | for l = 1, opt.numLayer do 82 | seqModel:add(nn.Dropout(0.7)) 83 | seqModel:add(RNN[l]) 84 | end 85 | seqModel:add(linear) 86 | 87 | local model = {} 88 | model.seqModel = seqModel 89 | model.linearCRF = linearCRF 90 | 91 | ----------------------------- Optimization ----------------------------- 92 | -- Create tables to hold params and grads 93 | local optimParams, optimGrads = {}, {} 94 | for l = 1, opt.numLayer do 95 | optimParams[l], optimGrads[l] = RNN[l]:getParameters() 96 | end 97 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = linear:getParameters() 98 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = linearCRF:getParameters() 99 | for i = 1, #optimParams do 100 | optimParams[i]:uniform(-opt.initRange, opt.initRange) 101 | end 102 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = wordEmbed:getParameters() 103 | print(optimParams, optimGrads) 104 | 105 | -- Configurations for Optimizer 106 | local optimConf = {lr = {}, logger = flog} 107 | for l = 1, #optimParams do optimConf['lr'][l] = 2e-2 end 108 | local optimizer = AdaGrad(optimGrads, optimConf) 109 | 110 | ----------------------------- Training ----------------------------- 111 | 112 | local avgProb = 0 113 | 114 | local maxIters = opt.maxEpochs * loader.numBatch 115 | flog.info(string.rep('-', 40)) 116 | flog.info('Begin Training') 117 | 118 | for i = 1, maxIters do 119 | xlua.progress(i, maxIters) 120 | 121 | ----------------------- clean gradients ----------------------- 122 | for i = 1, #optimGrads do optimGrads[i]:zero() end 123 | 124 | ----------------------- load minibatch ------------------------ 125 | local seq, labels = loader:nextBatch() 126 | local currSeqLen = seq:size(1) 127 | 128 | ------------------------ forward pass ------------------------- 129 | local seqVec = seqModel:forward(seq) 130 | local prob = linearCRF:forward({seqVec, labels}) 131 | avgProb = avgProb + torch.mean(prob) 132 | 133 | ------------------------ backward pass ------------------------ 134 | local d_seqVec = linearCRF:backward({seqVec, labels}) 135 | seqModel:backward(seq, d_seqVec) 136 | 137 | ----------------------- parameter update ---------------------- 138 | -- optim for rnn, projection 139 | for l = 1, opt.numLayer do optimGrads[l]:clamp(-10, 10) end 140 | optimizer:updateParams(optimParams, optimGrads) 141 | 142 | -- Logging 143 | if i % loader.numBatch == 0 then 144 | flog.info(string.format("finish epoch %d", i / loader.numBatch)) 145 | end 146 | 147 | ------------------------ training info ------------------------ 148 | if i % opt.printEvery == 0 then 149 | linearCRF:evaluate() 150 | local pred = linearCRF:forward(seqVec) 151 | local maskPred = torch.eq(pred, 2) 152 | local maskTrue = torch.eq(labels, 2) 153 | local corr = torch.eq(pred:type(torch.type(labels)), labels):cmul(maskTrue):sum() 154 | 155 | local p, r = corr / maskPred:sum(), corr / maskTrue:sum() 156 | flog.info(string.format("iter %4d, avg prob = %5f, p = %3f, r = %3f, F1 = %3f", i, avgProb / opt.printEvery, p, r, 2 * p * r / (p + r))) 157 | linearCRF:training() 158 | avgProb = 0 159 | end 160 | 161 | 162 | if i % (loader.numBatch * opt.saveEvery) == 0 then 163 | local epoch = i / loader.numBatch 164 | print('Saving model after epoch ' .. 
epoch) 165 | torch.save(opt.saveFile..'.'..opt.useGPU..'.'..epoch, model) 166 | end 167 | end 168 | -------------------------------------------------------------------------------- /EntityTypeVec/train_ent_typevec.lua: -------------------------------------------------------------------------------- 1 | require '..' 2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to classify a sequence of words') 6 | cmd:text() 7 | cmd:text('Comandline Options') 8 | 9 | cmd:option('-wordVocabSize',100003,'number of words in dictionary') 10 | cmd:option('-wordEmbedDim',300,'size of word embedding') 11 | cmd:option('-wordEmbedPath','../embedding/word.100k.glove.t7','pretained word embedding path') 12 | 13 | cmd:option('-hiddenSize',256,'size of BiGRU unit') 14 | cmd:option('-outputType',1,'output type of each rnn layer') 15 | cmd:option('-numLayer',2,'number of BiGRU layers') 16 | cmd:option('-maxSeqLen',200,'number of steps the BiGRU needs to unroll') 17 | 18 | cmd:option('-numClass',1,'number of classes in classification') 19 | 20 | cmd:option('-trainData','../data/train.entity_typevec.t7','training data file') 21 | 22 | cmd:option('-optMethod','adamomentum','the optimization method used') 23 | cmd:option('-initRange',0.08,'the range of uniformly initialize parameters') 24 | cmd:option('-momentumEpoch',1,'after which epoch, the model starts to increase momentum') 25 | cmd:option('-maxEpochs',500,'number of full passes through the training data') 26 | 27 | cmd:option('-printEvery',100,'the frequency (# minibatches) of logging loss information') 28 | cmd:option('-logFile','logs/log.BiGRU','log file to record training information') 29 | cmd:option('-saveEvery',100,'the frequency (# epochs) of automatic saving trained models') 30 | cmd:option('-saveFile','model.BiGRU','filename for saving trained model') 31 | 32 | cmd:option('-useGPU',1,'which GPU is used for computation') 33 | 34 | cmd:text() 35 | 36 | ----------------------------- Basic Options ----------------------------- 37 | 38 | local opt = cmd:parse(arg) 39 | -- local flog = logroll.file_logger(opt.logFile) 40 | local flog = logroll.print_logger() 41 | 42 | if opt.useGPU > 0 then 43 | cutorch.setDevice(opt.useGPU) 44 | torch.setdefaulttensortype('torch.CudaTensor') 45 | end 46 | 47 | ----------------------------- Data Loader ----------------------------- 48 | local loader = SeqMultiLabelLoader(opt.trainData, flog) 49 | 50 | ----------------------------- Init Models ----------------------------- 51 | -- Init word embedding model 52 | local wordEmbed = cudacheck(nn.LookupTable(opt.wordVocabSize, opt.wordEmbedDim)) 53 | -- loadPretrainedEmbed(wordEmbed, opt.wordEmbedPath) 54 | 55 | -- Init Stacked BiGRU 56 | local rnnconfig = { 57 | hiddenSize = opt.hiddenSize, 58 | maxSeqLen = opt.maxSeqLen, 59 | maxBatch = loader.batchSize, 60 | logger = flog 61 | } 62 | 63 | local RNN = {} 64 | for l = 1, opt.numLayer do 65 | rnnconfig.inputSize = l == 1 and opt.wordEmbedDim or opt.hiddenSize * 2 66 | RNN[l] = BiGRU(rnnconfig) 67 | end 68 | 69 | -- Init the Classification Criterion 70 | local criterion = nn.BCECriterion() 71 | 72 | local selectLayer = BiRNNSelect() 73 | local linearLayer = nn.Linear(2 * opt.hiddenSize, 501) 74 | 75 | local model = nn.Sequential() 76 | model:add(wordEmbed) 77 | for l = 1, opt.numLayer do 78 | model:add(nn.Dropout(0.3)) 79 | model:add(RNN[l]) 80 | end 81 | model:add(selectLayer) 82 | model:add(linearLayer) 83 | model:add(nn.Sigmoid()) 84 | 85 | ----------------------------- Optimization 
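-- Parameters collected below come from the two BiGRU layers, the word-embedding
-- LookupTable and the final Linear layer. For reference, the model above maps a
-- question to a 501-dim multi-hot type vector (embeddings -> dropout+BiGRU x2 ->
-- BiRNNSelect -> Linear -> Sigmoid) trained with BCECriterion; assuming
-- type.top-500.pkl indexes its 500 types from 0, target slots 1..500 are those
-- types and slot 501 is the fallback used when a subject has none of them.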
----------------------------- 86 | -- Create tables to hold params and grads 87 | local optimParams, optimGrads = {}, {} 88 | for l = 1, opt.numLayer do 89 | optimParams[l], optimGrads[l] = RNN[l]:getParameters() 90 | optimParams[l]:uniform(-opt.initRange, opt.initRange) 91 | end 92 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = wordEmbed:getParameters() 93 | optimParams[#optimParams+1], optimGrads[#optimGrads+1] = linearLayer:getParameters() 94 | 95 | 96 | -- Configurations for Optimizer 97 | local optimizer 98 | if opt.optMethod == 'adamomentum' then 99 | local optimConf = {lr = {}, momentum = 0.9, logger = flog} 100 | for l = 1, #optimParams do optimConf['lr'][l] = 1e-2 end 101 | optimizer = AdaGrad(optimGrads, optimConf) 102 | elseif opt.optMethod == 'adagrad' then 103 | local optimConf = {lr = {}, logger = flog} 104 | for l = 1, #optimParams do optimConf['lr'][l] = 2e-2 end 105 | optimizer = AdaGrad(optimGrads, optimConf) 106 | elseif opt.optMethod == 'momentum' then 107 | local optimConf = {lr = {}, momentum = 0.9, annealing = 0.01, logger = flog} 108 | for l = 1, #optimParams do optimConf['lr'][l] = 3e-1 end 109 | optimizer = SGD(optimGrads, optimConf) 110 | elseif opt.optMethod == 'SGD' then 111 | local optimConf = {lr = {}, annealing = 0.01, logger = flog} 112 | for l = 1, #optimParams do optimConf['lr'][l] = 5e-3 end 113 | optimizer = SGD(optimGrads, optimConf) 114 | else 115 | print ('Error: optMethod not match') 116 | os.exit(-1) 117 | end 118 | 119 | local lrWrd = 1e-4 120 | 121 | ----------------------------- Training ----------------------------- 122 | local sumLoss = 0 123 | local sumCorr = 0 124 | local sumTrue = 0 125 | local maxIters = opt.maxEpochs * loader.numBatch 126 | flog.info(string.rep('-', 40)) 127 | flog.info('Begin Training') 128 | 129 | for i = 1, maxIters do 130 | xlua.progress(i, maxIters) 131 | 132 | ----------------------- clean gradients ----------------------- 133 | for i = 1, #optimGrads do optimGrads[i]:zero() end 134 | 135 | ----------------------- load minibatch ------------------------ 136 | local seq, labels = loader:nextBatch() 137 | local currSeqLen = seq:size(1) 138 | 139 | ------------------------ forward pass ------------------------- 140 | local predict = model:forward(seq) 141 | 142 | -------------------------- criterion -------------------------- 143 | local loss = criterion:forward(predict, labels) 144 | sumLoss = sumLoss + loss 145 | 146 | local hardPred = torch.ge(predict, 0.5) 147 | sumCorr = sumCorr + torch.cmul(hardPred:type(torch.type(labels)), labels):sum() 148 | sumTrue = sumTrue + labels:sum() 149 | 150 | ------------------------ backward pass ------------------------ 151 | local d_predict = criterion:backward(predict, labels) 152 | model:backward(seq, d_predict) 153 | 154 | ----------------------- parameter update ---------------------- 155 | -- optim for rnn, projection 156 | for l = 1, opt.numLayer do optimGrads[l]:clamp(-10, 10) end 157 | optimizer:updateParams(optimParams, optimGrads) 158 | 159 | -- Logging 160 | if i % loader.numBatch == 0 then 161 | flog.info(string.format("finish epoch %d", i / loader.numBatch)) 162 | end 163 | 164 | ------------------------ training info ------------------------ 165 | if i % opt.printEvery == 0 then 166 | flog.info(string.format("iter %4d, loss = %5f, corr = %5f", 167 | i, sumLoss / opt.printEvery, sumCorr / sumTrue)) 168 | sumLoss, sumCorr, sumTrue = 0, 0, 0 169 | end 170 | if i % (loader.numBatch * opt.saveEvery) == 0 then 171 | local epoch = i / loader.numBatch 172 | 
print('Saving model after epoch ' .. epoch) 173 | torch.save(opt.saveFile..'.'..opt.useGPU..'.'..epoch, model) 174 | end 175 | end 176 | -------------------------------------------------------------------------------- /SimpleQuestions/PreprocessData/process_rawdata.py: -------------------------------------------------------------------------------- 1 | # This tool preprocess the original simple question dataset in 5 aspects: 2 | # 1. change triple information in to fb:... format 3 | # 2. replace the escape ('//') simbol in original question 4 | # 3. tokenize the question 5 | # 4. change the tokenized question into lower cases 6 | # 5. add another fields which indicates the token number of the question 7 | 8 | import multiprocessing as mp 9 | import sys, os, io, re 10 | import cPickle as pickle 11 | from nltk import word_tokenize 12 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), 'src/py_module' )) 13 | import QAData 14 | import virtuoso 15 | 16 | split = None 17 | 18 | def extract(line): 19 | fields = line.strip().split('\t') 20 | sub = 'fb:' + fields[0].split('www.freebase.com/')[-1].replace('/','.') 21 | rel = 'fb:' + fields[1].split('www.freebase.com/')[-1].replace('/','.') 22 | obj = 'fb:' + fields[2].split('www.freebase.com/')[-1].replace('/','.') 23 | if sub == 'fb:m.07s9rl0': 24 | sub = 'fb:m.02822' 25 | if obj == 'fb:m.07s9rl0': 26 | obj = 'fb:m.02822' 27 | question = fields[-1].replace('\\\\','') 28 | tokens = word_tokenize(question) 29 | return ' '.join(tokens).lower(), sub, rel, obj, len(tokens) 30 | 31 | def get_indices(src_list, pattern_list): 32 | indices = None 33 | for i in range(len(src_list)): 34 | match = 1 35 | for j in range(len(pattern_list)): 36 | if src_list[i+j] != pattern_list[j]: 37 | match = 0 38 | break 39 | if match: 40 | indices = range(i, i + len(pattern_list)) 41 | break 42 | return indices 43 | 44 | def query_golden_subs(data): 45 | golden_subs = [] 46 | if data.text_subject: 47 | # extract fields needed 48 | relation = data.relation 49 | subject = data.subject 50 | text_subject = data.text_subject 51 | 52 | # query name / alias by subject (id) 53 | candi_sub_list = virtuoso.str_query_id(text_subject) 54 | 55 | # add candidates to data 56 | for candi_sub in candi_sub_list: 57 | candi_rel_list = virtuoso.id_query_out_rel(candi_sub) 58 | if relation in candi_rel_list: 59 | golden_subs.append(candi_sub) 60 | 61 | if len(golden_subs) == 0: 62 | golden_subs = [data.subject] 63 | 64 | return golden_subs 65 | 66 | def reverse_link(question, subject): 67 | # get question tokens 68 | tokens = question.split() 69 | 70 | # init default value of returned variables 71 | text_subject = None 72 | text_attention_indices = None 73 | 74 | # query name / alias by node_id (subject) 75 | res_list = virtuoso.id_query_str(subject) 76 | 77 | # sorted by length 78 | for res in sorted(res_list, key = lambda res: len(res), reverse = True): 79 | pattern = r'(^|\s)(%s)($|\s)' % (re.escape(res)) 80 | if re.search(pattern, question): 81 | text_subject = res 82 | text_attention_indices = get_indices(tokens, res.split()) 83 | break 84 | 85 | return text_subject, text_attention_indices 86 | 87 | def form_anonymous_quesion(data): 88 | anonymous_question = None 89 | if data.text_attention_indices: 90 | anonymous_tokens = [] 91 | tokens = data.question.split() 92 | anonymous_tokens.extend(tokens[:data.text_attention_indices[0]]) 93 | anonymous_tokens.append('X') 94 | 
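        # finally append everything after the mention: the subject span located by
        # reverse_link() is collapsed into the single placeholder token 'X', e.g.
        # (hypothetical) "who directed gone with the wind" -> "who directed X"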
anonymous_tokens.extend(tokens[data.text_attention_indices[-1]+1:]) 95 | anonymous_question = ' '.join(anonymous_tokens) 96 | 97 | return anonymous_question 98 | 99 | def form_type_based_question(data): 100 | typed_question = None 101 | num_type_token = -1 102 | if data.text_attention_indices and data.sub_ntp: 103 | tokens = data.question.split() 104 | new_tokens = [] 105 | new_tokens.extend(tokens[:data.text_attention_indices[0]]) 106 | new_tokens.append(data.sub_ntp) 107 | new_tokens.extend(tokens[data.text_attention_indices[-1]+1:]) 108 | typed_question = ' '.join(new_tokens) 109 | num_type_token = len(new_tokens) 110 | 111 | return typed_question, num_type_token 112 | 113 | def knowledge_graph_attributes(data_list, pid = 0): 114 | # Open log file 115 | log_file = file('logs/log.%s.%d.txt'%(split, pid), 'wb') 116 | 117 | succ_att_link = 0 118 | qadata_list = [] 119 | for data_index, data_tuple in enumerate(data_list): 120 | # Step-1: create QAData instance 121 | data = QAData.QAData(data_tuple) 122 | 123 | # Step-2: reverse linking 124 | data.text_subject, data.text_attention_indices = reverse_link(data.question, data.subject) 125 | 126 | # Step-3: create anonymous question for LTG-CNN+ 127 | if split == 'train': 128 | data.anonymous_question = form_anonymous_quesion(data) 129 | 130 | qadata_list.append(data) 131 | 132 | # logging 133 | if data.text_subject: 134 | succ_att_link += 1 135 | print >> log_file, '[%d] attention: %f' % (data_index, succ_att_link / float(data_index+1)) 136 | 137 | pickle.dump(qadata_list, file('temp.%s.pkl'%(pid), 'wb')) 138 | log_file.close() 139 | 140 | def process(num_process, data_list): 141 | # Make dir 142 | if not os.path.exists('logs'): 143 | os.mkdir('logs') 144 | 145 | # Split workload 146 | length = len(data_list) 147 | data_per_p = (length + num_process - 1) / num_process 148 | 149 | # Spawn processes 150 | processes = [ 151 | mp.Process( 152 | target = knowledge_graph_attributes, 153 | args = ( 154 | data_list[pid*data_per_p:(pid+1)*data_per_p], 155 | pid 156 | ) 157 | ) 158 | for pid in range(num_process) 159 | ] 160 | 161 | # Run processes 162 | for p in processes: 163 | p.start() 164 | 165 | # Exit the completed processes 166 | for p in processes: 167 | p.join() 168 | 169 | if __name__ == '__main__': 170 | 171 | if len(sys.argv) != 3: 172 | print 'python preprocess.py input_file num_process' 173 | sys.exit(-1) 174 | 175 | in_file_path = sys.argv[1] 176 | num_process = int(sys.argv[2]) 177 | 178 | split = in_file_path.split('_')[-1].split('.')[0] 179 | 180 | in_file = io.open(in_file_path, 'r', encoding='utf8') 181 | 182 | data_list = [] 183 | for line in in_file: 184 | question, sub, rel, obj, length = extract(line) 185 | data_list.append((question, sub, rel, obj, length)) 186 | 187 | process(num_process, sorted(data_list, key = lambda data: data[-1], reverse = True)) 188 | 189 | # Merge all data [this will preserve the order] 190 | new_data_list = [] 191 | for p in range(num_process): 192 | temp_fn = 'temp.%d.pkl'%(p) 193 | new_data_list.extend(pickle.load(file(temp_fn, 'rb'))) 194 | os.remove(temp_fn) 195 | 196 | pickle.dump(new_data_list, file('QAData.%s.pkl'%(split), 'wb')) 197 | 198 | in_file.close() 199 | -------------------------------------------------------------------------------- /Inference/query_candidates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os, sys, re 3 | import multiprocessing as mp 4 | import cPickle as pickle 5 | import numpy as np 6 | 7 | 
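# Given the CRF attention scores from FocusedLabeling, this script extracts a
# subject span per question (relaxing the begin/end threshold from 0.5 towards 0
# until Virtuoso returns at least one id for the span text), attaches each
# candidate subject's outgoing relations (plus its top-500 type ids when a type
# dict is supplied), and writes QAData.label.<split>.cpickle.
# Usage, as in the README (run from Inference/<split>):
#   python ../query_candidates.py 6 ../../PreprocessData/QAData.valid.pkl \
#     ../../FocusedLabeling/label.result.valid ../../KnowledgeBase/type.top-500.pkl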
sys.path.append(os.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 8 | from QAData import * 9 | import virtuoso 10 | import freebase 11 | 12 | type_dict = None 13 | stop_words = ['of', 'on', 'the', 'off', 'in', 'for', 'with', 'a', 'an', 'did', 'does', 'good', 'or', 'not', \ 14 | "'", '?', '!', ':', ','] 15 | 16 | def generate_ngrams(tokens, min_len, max_len): 17 | ngrams = [] 18 | num_token = len(tokens) 19 | assert(num_token >= max_len) 20 | for num in range(min_len, max_len+1): 21 | for i in range(num_token-num+1): 22 | ngram = ' '.join(tokens[i:i+num]) 23 | if not ngram in stop_words: 24 | ngrams.append(ngram) 25 | return list(set(ngrams)) 26 | 27 | def beg_end_indices(scores, threshold): 28 | seq_len = len(scores) 29 | max_idx = np.argmax(scores) 30 | beg_idx = max_idx 31 | end_idx = max_idx 32 | for i in range(max_idx-1,-1,-1): 33 | if np.abs(scores[i+1] - scores[i]) / scores[i+1] > threshold: 34 | break 35 | beg_idx = i 36 | for i in range(max_idx+1,seq_len,1): 37 | if np.abs(scores[i-1] - scores[i]) / scores[i-1] > threshold: 38 | break 39 | end_idx = i 40 | return beg_idx, end_idx 41 | 42 | def form_anonymous_quesion(question, beg_idx, end_idx): 43 | anonymous_tokens = [] 44 | tokens = question.split() 45 | anonymous_tokens.extend(tokens[:beg_idx]) 46 | anonymous_tokens.append('X') 47 | anonymous_tokens.extend(tokens[end_idx+1:]) 48 | anonymous_question = ' '.join(anonymous_tokens) 49 | 50 | return anonymous_question 51 | 52 | def query_candidate(data_list, pred_list, pid = 0): 53 | log_file = open('logs/log.%d.txt'%(pid), 'wb') 54 | new_data_list = [] 55 | 56 | succ_match = 0 57 | data_index = 0 58 | for pred, data in zip(pred_list, data_list): 59 | # incremnt data_index 60 | data_index += 1 61 | 62 | # extract scores 63 | scores = [float(score) for score in pred.strip().split()] 64 | 65 | # extract fields needed 66 | relation = data.relation 67 | subject = data.subject 68 | question = data.question 69 | tokens = question.split() 70 | 71 | # query name / alias by subject (id) 72 | candi_sub_list = [] 73 | for threshold in np.arange(0.5, 0.0, -0.095): 74 | beg_idx, end_idx = beg_end_indices(scores, threshold) 75 | sub_text = ' '.join(tokens[beg_idx:end_idx+1]) 76 | candi_sub_list.extend(virtuoso.str_query_id(sub_text)) 77 | if len(candi_sub_list) > 0: 78 | break 79 | 80 | # # using freebase suggest 81 | # if len(candi_sub_list) == 0: 82 | # beg_idx, end_idx = beg_end_indices(scores, 0.2) 83 | # sub_text = ' '.join(tokens[beg_idx:end_idx+1]) 84 | # sub_text = re.sub(r'\s(\w+)\s(n?\'[tsd])\s', r' \1\2 ', sub_text) 85 | # suggest_subs = [] 86 | # for trial in range(3): 87 | # try: 88 | # suggest_subs = freebase.suggest_id(sub_text) 89 | # break 90 | # except: 91 | # print >> sys.stderr, 'freebase suggest_id error: trial = %d, sub_text = %s' % (trial, sub_text) 92 | # candi_sub_list.extend(suggest_subs) 93 | # if data.subject not in candi_sub_list: 94 | # print >> log_file, '%s\t\t%s\t\t%s\t\t%d' % (sub_text, data.text_subject, fb2www(data.subject), len(candi_sub_list)) 95 | 96 | # if potential subject founded 97 | if len(candi_sub_list) > 0: 98 | # add candidates to data 99 | for candi_sub in candi_sub_list: 100 | candi_rel_list = virtuoso.id_query_out_rel(candi_sub) 101 | if len(candi_rel_list) > 0: 102 | if type_dict: 103 | candi_type_list = [type_dict[t] for t in virtuoso.id_query_type(candi_sub) if type_dict.has_key(t)] 104 | if len(candi_type_list) == 0: 105 | candi_type_list.append(len(type_dict)) 106 | data.add_candidate(candi_sub, 
candi_rel_list, candi_type_list) 107 | else: 108 | data.add_candidate(candi_sub, candi_rel_list) 109 | data.anonymous_question = form_anonymous_quesion(question, beg_idx, end_idx) 110 | 111 | # make score mat 112 | if hasattr(data, 'cand_sub') and hasattr(data, 'cand_rel'): 113 | # remove duplicate relations 114 | data.remove_duplicate() 115 | 116 | # append to new_data_list 117 | new_data_list.append(data) 118 | 119 | # loging information 120 | if subject in candi_sub_list: 121 | succ_match += 1 122 | 123 | if data_index % 100 == 0: 124 | print >> sys.stderr, '[%d] %d / %d' % (pid, data_index, len(data_list)) 125 | 126 | print >> log_file, '%d / %d = %f ' % (succ_match, data_index+1, succ_match / float(data_index+1)) 127 | 128 | log_file.close() 129 | pickle.dump(new_data_list, file('temp.%d.cpickle'%(pid),'wb')) 130 | 131 | if __name__ == '__main__': 132 | # Check number of argv 133 | if len(sys.argv) == 4: 134 | # Parse input argument 135 | num_process = int(sys.argv[1]) 136 | data_list = pickle.load(file(sys.argv[2], 'rb')) 137 | pred_list = file(sys.argv[3], 'rb').readlines() 138 | elif len(sys.argv) == 5: 139 | # Parse input argument 140 | num_process = int(sys.argv[1]) 141 | data_list = pickle.load(file(sys.argv[2], 'rb')) 142 | pred_list = file(sys.argv[3], 'rb').readlines() 143 | type_dict = pickle.load(file(sys.argv[4], 'rb')) 144 | else: 145 | print 'usage: python query_candidate_relation.py num_processes QAData_cpickle_file attention_score_file [[type_dict]]' 146 | sys.exit(-1) 147 | 148 | suffix = sys.argv[2].split('.')[-2] 149 | 150 | assert(len(data_list) == len(pred_list)) 151 | 152 | # Create log directory 153 | log_dir = './logs' 154 | if not os.path.exists(log_dir): 155 | os.makedirs(log_dir) 156 | 157 | # Allocate dataload 158 | length = len(data_list) 159 | data_per_p = (length + num_process - 1) / num_process 160 | 161 | # Spawn processes 162 | processes = [ 163 | mp.Process( 164 | target = query_candidate, 165 | args = (data_list[pid*data_per_p:(pid+1)*data_per_p], 166 | pred_list[pid*data_per_p:(pid+1)*data_per_p], 167 | pid) 168 | ) 169 | for pid in range(num_process) 170 | ] 171 | 172 | # Run processes 173 | for p in processes: 174 | p.start() 175 | 176 | # Exit the completed processes 177 | for p in processes: 178 | p.join() 179 | 180 | # Merge all data [this will preserve the order] 181 | new_data_list = [] 182 | for p in range(num_process): 183 | temp_fn = 'temp.%d.cpickle'%(p) 184 | new_data_list.extend(pickle.load(file(temp_fn, 'rb'))) 185 | 186 | pickle.dump(new_data_list, file('QAData.label.%s.cpickle'%(suffix), 'wb')) 187 | 188 | # Remove temp data 189 | for p in range(num_process): 190 | temp_fn = 'temp.%d.cpickle'%(p) 191 | os.remove(temp_fn) 192 | -------------------------------------------------------------------------------- /Inference/joint_disambiguation.py: -------------------------------------------------------------------------------- 1 | import os, sys, re 2 | import glob 3 | import cPickle as pickle 4 | import numpy as np 5 | from sklearn import preprocessing 6 | 7 | sys.path.append(os.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'src/py_module' )) 8 | from QAData import * 9 | 10 | def top_sub_rel(data, rel_scores, ent_scores, alpha, rel_ratio): 11 | rel_scores = np.array(rel_scores) 12 | ent_scores = np.array(ent_scores) 13 | 14 | #ent_threshold = np.min(ent_scores) 15 | #top_sub_ids = np.where(ent_scores >= ent_threshold)[0] 16 | top_sub_ids = np.arange(ent_scores.shape[0]) 17 | 18 | rel_threshold = rel_ratio * 
(np.max(rel_scores) - np.min(rel_scores)) + np.min(rel_scores) 19 | top_rel_ids = np.where(rel_scores >= rel_threshold)[0] 20 | #top_rel_ids = np.arange(rel_scores.shape[0]) 21 | 22 | # dict for top relation column idx 23 | rel_id_dict = {data.cand_rel[rel_id]:i for i, rel_id in enumerate(top_rel_ids)} 24 | 25 | score_mat = np.zeros((top_sub_ids.shape[0], top_rel_ids.shape[0])) 26 | 27 | # fill the score matrix 28 | for row_idx, sub_id in enumerate(top_sub_ids): 29 | for rel in data.sub_rels[sub_id]: 30 | if rel_id_dict.has_key(rel): 31 | col_idx = rel_id_dict[rel] 32 | #score_mat[row_idx, col_idx] = rel_scores[top_rel_ids[col_idx]] 33 | score_mat[row_idx, col_idx] = 1 34 | 35 | # compute all the terms 36 | ent_scores = ent_scores[top_sub_ids] 37 | rel_scores = rel_scores[top_rel_ids] 38 | 39 | # u(s,r,q) = alpha * I(s->r) + (1 - alpha) * g(q)^T E(s) 40 | score_mat = np.exp(score_mat * alpha + ent_scores.reshape(score_mat.shape[0], 1) * (1 - alpha)) 41 | 42 | # p(s|q,r) propto u(s,r,q) 43 | score_mat /= np.sum(score_mat, 0) 44 | 45 | # p(s|q,r) * p(r|q) 46 | score_mat *= np.exp(rel_scores) 47 | 48 | #max_score = np.max(score_mat) 49 | #if np.where(score_mat == max_score)[0].shape[0] > 1: 50 | # print np.where(score_mat == max_score)[0].shape[0] 51 | 52 | top_sub_id, top_rel_id = np.unravel_index(np.argmax(score_mat), score_mat.shape) 53 | 54 | return [data.cand_sub[top_sub_ids[top_sub_id]]], data.cand_rel[top_rel_ids[top_rel_id]] 55 | 56 | def math(data, rel_scores, ent_scores, alpha = 0.5): 57 | rel_id_dict = {data.cand_rel[i]:i for i in range(len(data.cand_rel))} 58 | #rel_scores = preprocessing.scale(rel_scores) 59 | #ent_scores = preprocessing.scale(ent_scores) 60 | 61 | score_mat = np.zeros((len(data.cand_sub), len(data.cand_rel))) 62 | for i in range(len(data.cand_sub)): 63 | for rel in data.sub_rels[i]: 64 | j = rel_id_dict[rel] 65 | score_mat[i, j] = rel_scores[j] 66 | 67 | # compute all the terms 68 | score_mat = np.exp(score_mat * alpha + np.array(ent_scores).reshape(score_mat.shape[0], 1) * (1 - alpha)) 69 | 70 | # normalization 71 | score_mat /= np.sum(score_mat, 0) 72 | 73 | score_mat *= np.exp(rel_scores) 74 | 75 | top_sub_id, top_rel_id = np.unravel_index(np.argmax(score_mat), score_mat.shape) 76 | 77 | return [data.cand_sub[top_sub_id]], data.cand_rel[top_rel_id] 78 | 79 | def weighted_avg(data, rel_scores, ent_scores, alpha = 0.2): 80 | rel_id_dict = {data.cand_rel[i]:i for i in range(len(data.cand_rel))} 81 | # rel_scores = preprocessing.scale(rel_scores) 82 | # ent_scores = preprocessing.scale(ent_scores) 83 | 84 | score_mat = np.zeros((len(data.cand_sub), len(data.cand_rel))) 85 | for i in range(len(data.cand_sub)): 86 | for rel in data.sub_rels[i]: 87 | j = rel_id_dict[rel] 88 | score_mat[i, j] = rel_scores[j] 89 | 90 | sub_scores = alpha * np.array(ent_scores) + (1 - alpha) * np.sum(score_mat, 1) 91 | top_sub_score = np.max(sub_scores) 92 | top_sub_ids = [] 93 | for sub_id in np.argsort(sub_scores)[::-1]: 94 | if sub_scores[sub_id] < top_sub_score: 95 | break 96 | top_sub_ids.append(sub_id) 97 | 98 | top_rel = data.cand_rel[np.argmax(score_mat[top_sub_ids[0]])] 99 | top_subs = [data.cand_sub[sub_id] for sub_id in top_sub_ids] 100 | return top_subs, top_rel 101 | 102 | def rel_based(data, rel_scores): 103 | rel_scores = np.array(rel_scores) 104 | top_rel_ids = np.argsort(rel_scores) 105 | # rel_scores[top_rel_ids[:-2]] = 0 106 | # reverse rel->id dict 107 | rel_id_dict = {data.cand_rel[i]:i for i in range(len(data.cand_rel))} 108 | 109 | score_mat = 
np.zeros((len(data.cand_sub), len(data.cand_rel))) 110 | for i in range(len(data.cand_sub)): 111 | for rel in data.sub_rels[i]: 112 | j = rel_id_dict[rel] 113 | score_mat[i, j] = rel_scores[j] 114 | 115 | sub_score = np.sum(score_mat, 1) 116 | top_subscore = np.max(sub_score) 117 | top_subid = np.argmax(sub_score) 118 | top_relid = np.argmax(score_mat[top_subid]) 119 | 120 | return [data.cand_sub[top_subid]], data.cand_rel[top_relid] 121 | 122 | if __name__ == '__main__': 123 | # Parse input argument 124 | if len(sys.argv) == 3: 125 | data_fn = sys.argv[1] 126 | rel_score_fn = sys.argv[2] 127 | ent_score_fn = None 128 | elif len(sys.argv) == 4: 129 | data_fn = sys.argv[1] 130 | rel_score_fn = sys.argv[2] 131 | ent_score_fn = sys.argv[3] 132 | else: 133 | print 'Wrong arguments. Usage: ' 134 | print ' python joint_disambiguation.py cpickle_file rel_score_file ent_score_file' 135 | sys.exit(1) 136 | 137 | chosen_subs = 0 138 | total_subs = 0 139 | 140 | count_multi = 0 141 | 142 | # Error information 143 | error_dir = './error_analysis' 144 | if not os.path.exists(error_dir): 145 | os.makedirs(error_dir) 146 | category = data_fn.split('.')[0] 147 | 148 | # Load cPickle file into data 149 | data_list = pickle.load(file(data_fn, 'rb')) 150 | print >> sys.stderr, 'finish loading cpickle file %d' % (len(data_list)) 151 | 152 | rel_score_list = file(rel_score_fn, 'rb').readlines() 153 | if ent_score_fn: 154 | ent_score_list = file(ent_score_fn, 'rb').readlines() 155 | 156 | # Count the totol number of data 157 | for rel_ratio in [0, 0.75, 0.85, 0.95]: 158 | #for rel_ratio in [0]: 159 | print '=' * 120 160 | #for alpha in np.arange(0.05,1.00,0.05): 161 | for alpha in np.arange(0.05,1.01,0.05): 162 | # Rescore for each data in data_list 163 | corr_mat = np.zeros((2,2)) 164 | 165 | count = 0 166 | for idx, data in enumerate(data_list): 167 | rel_scores = [float(score) for score in rel_score_list[idx].strip().split(' ')] 168 | ent_scores = [float(score) for score in ent_score_list[idx].strip().split(' ')] 169 | # top_sub, top_rel = rel_based(data, rel_scores) 170 | # top_sub, top_rel = weighted_avg(data, rel_scores, ent_scores) 171 | # top_sub, top_rel = math(data, rel_scores, ent_scores, alpha) 172 | top_sub, top_rel = top_sub_rel(data, rel_scores, ent_scores, alpha, rel_ratio) 173 | 174 | if len(top_sub) == 1 and top_sub[0] == data.subject: 175 | if top_rel == data.relation: 176 | corr_mat[0,0] += 1 177 | else: 178 | corr_mat[0,1] += 1 179 | else: 180 | if top_rel == data.relation: 181 | corr_mat[1,0] += 1 182 | else: 183 | corr_mat[1,1] += 1 184 | 185 | print '%4.3f, %4.3f, %d' % (alpha, rel_ratio, corr_mat[0,0]) 186 | -------------------------------------------------------------------------------- /src/model/model_utils.lua: -------------------------------------------------------------------------------- 1 | -- cuda utils 2 | function cudacheck(input) 3 | if torch.Tensor():type() == 'torch.CudaTensor' then 4 | input = input:cuda() 5 | end 6 | return input 7 | end 8 | 9 | function range(b, e) 10 | local result = cudacheck(torch.LongTensor.range(torch.LongTensor(e-b+1),b,e)) 11 | return result 12 | end 13 | 14 | function randperm(up) 15 | local result = cudacheck(torch.LongTensor.randperm(torch.LongTensor(up),up)) 16 | return result 17 | end 18 | 19 | -- loading embedding 20 | function loadPretrainedEmbed (model, embedPath, renorm) 21 | local pretrainedEmbed = torch.load(embedPath) 22 | assert(model.weight:size(2) == pretrainedEmbed:size(2), 'Embedding size does not match') 23 | 
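-- note: only the first pretrainedEmbed:size(1) rows are overwritten below, so any extra
-- vocabulary entries (e.g. unk/pad tokens added on top of the pretrained word list) keep
-- whatever initialization they already have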
model.weight:narrow(1, 1, pretrainedEmbed:size(1)):copy(pretrainedEmbed) 24 | if renorm then 25 | model.weight:renorm(2, 2, 1) 26 | end 27 | end 28 | 29 | -- flatten parameters 30 | function flatten(parameters) 31 | 32 | -- returns true if tensor occupies a contiguous region of memory (no holes) 33 | local function isCompact(tensor) 34 | local sortedStride, perm = torch.sort( 35 | torch.LongTensor(tensor:nDimension()):set(tensor:stride()), 1, true) 36 | local sortedSize = torch.LongTensor(tensor:nDimension()):set( 37 | tensor:size()):index(1, perm) 38 | local nRealDim = torch.clamp(sortedStride, 0, 1):sum() 39 | sortedStride = sortedStride:narrow(1, 1, nRealDim):clone() 40 | sortedSize = sortedSize:narrow(1, 1, nRealDim):clone() 41 | local t = tensor.new():set(tensor:storage(), 1, 42 | sortedSize:storage(), 43 | sortedStride:storage()) 44 | return t:isContiguous() 45 | end 46 | 47 | if not parameters or #parameters == 0 then 48 | return torch.Tensor() 49 | end 50 | local Tensor = parameters[1].new 51 | 52 | -- 1. construct the set of all unique storages referenced by parameter tensors 53 | local storages = {} 54 | local nParameters = 0 55 | local parameterMeta = {} 56 | for k = 1,#parameters do 57 | local param = parameters[k] 58 | local storage = parameters[k]:storage() 59 | local storageKey = torch.pointer(storage) 60 | 61 | if not storages[storageKey] then 62 | storages[storageKey] = {storage, nParameters} 63 | nParameters = nParameters + storage:size() 64 | end 65 | 66 | parameterMeta[k] = {storageOffset = param:storageOffset() + 67 | storages[storageKey][2], 68 | size = param:size(), 69 | stride = param:stride()} 70 | end 71 | 72 | -- 2. construct a single tensor that will hold all the parameters 73 | local flatParameters = Tensor(nParameters):zero() 74 | 75 | -- 3. determine if there are elements in the storage that none of the 76 | -- parameter tensors reference ('holes') 77 | local tensorsCompact = true 78 | for k = 1,#parameters do 79 | local meta = parameterMeta[k] 80 | local tmp = Tensor():set( 81 | flatParameters:storage(), meta.storageOffset, meta.size, meta.stride 82 | ) 83 | tmp:fill(1) 84 | tensorsCompact = tensorsCompact and isCompact(tmp) 85 | end 86 | 87 | local maskParameters = flatParameters:byte():clone() 88 | local compactOffsets = flatParameters:long():cumsum(1) 89 | local nUsedParameters = compactOffsets[-1] 90 | 91 | -- 4. copy storages into the flattened parameter tensor 92 | for _, storageAndOffset in pairs(storages) do 93 | local storage, offset = table.unpack(storageAndOffset) 94 | flatParameters[{{offset+1,offset+storage:size()}}]:copy(Tensor():set(storage)) 95 | end 96 | 97 | -- 5. allow garbage collection 98 | storages = nil 99 | for k = 1,#parameters do 100 | parameters[k]:set(Tensor()) 101 | end 102 | 103 | -- 6. compact the flattened parameters if there were holes 104 | if nUsedParameters ~= nParameters then 105 | assert(tensorsCompact, "Cannot gather tensors that are not compact") 106 | 107 | flatParameters = Tensor(nUsedParameters):copy(flatParameters:maskedSelect(maskParameters)) 108 | for k = 1,#parameters do 109 | parameterMeta[k].storageOffset = compactOffsets[parameterMeta[k].storageOffset] 110 | end 111 | end 112 | 113 | -- 7. 
fix up the parameter tensors to point at the flattened parameters 114 | for k = 1,#parameters do 115 | parameters[k]:set(flatParameters:storage(), 116 | parameterMeta[k].storageOffset, 117 | parameterMeta[k].size, 118 | parameterMeta[k].stride) 119 | end 120 | 121 | return flatParameters 122 | end 123 | 124 | -- clone utils 125 | function combineParameters(...) 126 | --[[ like module:getParameters, but operates on many modules ]]-- 127 | 128 | -- get parameters 129 | local networks = {...} 130 | local parameters = {} 131 | local gradParameters = {} 132 | for i = 1, #networks do 133 | local net_params, net_grads = networks[i]:parameters() 134 | 135 | if net_params then 136 | for _, p in pairs(net_params) do 137 | parameters[#parameters + 1] = p 138 | end 139 | for _, g in pairs(net_grads) do 140 | gradParameters[#gradParameters + 1] = g 141 | end 142 | end 143 | end 144 | 145 | -- flatten parameters and gradients 146 | local flatParameters = flatten(parameters) 147 | local flatGradParameters = flatten(gradParameters) 148 | 149 | assert(flatParameters:nElement() == flatGradParameters:nElement(), 150 | 'check that you are sharing parameters and gradParameters') 151 | if parameters then 152 | for i = 1, #parameters do 153 | assert(parameters[i]:storageOffset() == gradParameters[i]:storageOffset(), 154 | 'misaligned parameter at ' .. tostring(i)) 155 | end 156 | end 157 | 158 | -- return new flat vector that contains all discrete parameters 159 | return flatParameters, flatGradParameters 160 | end 161 | 162 | function cloneManyTimes(net, T) 163 | local clones = {} 164 | local params, gradParams 165 | if net.parameters then 166 | params, gradParams = net:parameters() 167 | if params == nil then 168 | params = {} 169 | end 170 | end 171 | local paramsNoGrad 172 | if net.parametersNoGrad then 173 | paramsNoGrad = net:parametersNoGrad() 174 | end 175 | local mem = torch.MemoryFile("w"):binary() 176 | mem:writeObject(net) 177 | for t = 1, T do 178 | -- We need to use a new reader for each clone. 179 | -- We don't want to use the pointers to already read objects. 180 | local reader = torch.MemoryFile(mem:storage(), "r"):binary() 181 | local clone = reader:readObject() 182 | reader:close() 183 | if net.parameters then 184 | local cloneParams, cloneGradParams = clone:parameters() 185 | local cloneParamsNoGrad 186 | for i = 1, #params do 187 | cloneParams[i]:set(params[i]) 188 | cloneGradParams[i]:set(gradParams[i]) 189 | end 190 | if paramsNoGrad then 191 | cloneParamsNoGrad = clone:parametersNoGrad() 192 | for i =1,#paramsNoGrad do 193 | cloneParamsNoGrad[i]:set(paramsNoGrad[i]) 194 | end 195 | end 196 | end 197 | clones[t] = clone 198 | collectgarbage() 199 | end 200 | mem:close() 201 | return clones 202 | end 203 | -------------------------------------------------------------------------------- /RelationRNN/train_rel_rnn.lua: -------------------------------------------------------------------------------- 1 | require '..' 
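-- Objective sketch (informal): for each question q with gold relation r+ and a batch of
-- sampled negative relations r-, the training loop below minimizes a margin ranking loss
-- of the form max(0, margin - score(q, r+) + score(q, r-)) over the negatives, where
-- score(q, r) is the TripleScore similarity between the BiGRU question embedding and the
-- relation embedding. Minimal illustration of the criterion, assuming the stock Torch7
-- 'nn' package:
--   local crit = nn.MarginRankingCriterion(0.1)
--   local pos, neg = torch.Tensor{1.2}, torch.Tensor{0.9}
--   print(crit:forward({pos, neg}, torch.Tensor{1}))  --> 0, since pos - neg >= margin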
2 | 3 | local cmd = torch.CmdLine() 4 | cmd:text() 5 | cmd:text('Training a Recurrent Neural Network to embed a sentence') 6 | cmd:text() 7 | cmd:text('Options') 8 | 9 | cmd:option('-vocabSize',100002,'number of words in dictionary') 10 | 11 | cmd:option('-relSize',7524,'number of relations in dictionary') 12 | cmd:option('-relEmbedSize',256,'size of rel embedding') 13 | 14 | cmd:option('-wrdEmbedSize',300,'size of word embedding') 15 | cmd:option('-wrdEmbedPath','../embedding/word.100k.glove.t7','pretained word embedding path') 16 | 17 | cmd:option('-numLayer',2,'number of RNN layers') 18 | cmd:option('-maxSeqLen',40,'number of timesteps to unroll to') 19 | cmd:option('-hiddenSize',256,'size of RNN internal state') 20 | cmd:option('-dropoutRate',0.5,'dropout rate') 21 | 22 | cmd:option('-negSize',1024,'number of negtive samples for each iteration') 23 | cmd:option('-maxEpochs',1000,'number of full passes through the training data') 24 | cmd:option('-initRange',0.08,'the range of uniformly initialize parameters') 25 | cmd:option('-costMargin',0.1,'the margin used in the ranking cost') 26 | cmd:option('-useGPU',1,'whether to use gpu for computation') 27 | 28 | cmd:option('-printEvery',100,'how many steps/minibatches between printing out the loss') 29 | cmd:option('-saveEvery',100,'how many epochs between auto save trained models') 30 | cmd:option('-saveFile','model.rel.stackBiRNN','filename to autosave the model (protos) to') 31 | cmd:option('-logFile','logs/rel.stackBiRNN.log','log file to record training information') 32 | cmd:option('-dataFile', '../data/train.relation_ranking.t7','training data file') 33 | 34 | cmd:option('-seed',123,'torch manual random number generator seed') 35 | cmd:text() 36 | 37 | ----------------------------- parse params ----------------------------- 38 | 39 | local opt = cmd:parse(arg) 40 | -- local flog = logroll.file_logger(opt.logFile) 41 | local flog = logroll.print_logger() 42 | if opt.useGPU > 0 then 43 | cutorch.setDevice(opt.useGPU) 44 | torch.setdefaulttensortype('torch.CudaTensor') 45 | end 46 | 47 | ----------------------------- define loader ----------------------------- 48 | local loader = SeqRankingLoader(opt.dataFile, opt.negSize, opt.relSize, flog) 49 | 50 | ----------------------------- define models ----------------------------- 51 | -- word embedding model 52 | local wordEmbed = cudacheck(nn.LookupTable(opt.vocabSize, opt.wrdEmbedSize)) 53 | -- loadPretrainedEmbed(wordEmbed, opt.wrdEmbedPath) 54 | 55 | -- rel embedding model 56 | -- local relEmbed = torch.load('../TransE/model.60').RelEmbed 57 | local relEmbed = cudacheck(nn.LookupTable(opt.relSize, opt.relEmbedSize)) 58 | relEmbed.weight:uniform(-opt.initRange, opt.initRange) 59 | relEmbed.weight:renorm(2, 2, 1) 60 | 61 | local posRelDrop = nn.Dropout(0.3) 62 | local negRelDrop = nn.Dropout(0.3) 63 | 64 | -- multi-layer (stacked) Bi-RNN 65 | local config = {} 66 | config.hiddenSize = opt.hiddenSize 67 | config.maxSeqLen = opt.maxSeqLen 68 | config.maxBatch = 256 69 | config.logger = flog 70 | 71 | local RNN = {} 72 | for l = 1, opt.numLayer do 73 | config.inputSize = l == 1 and opt.wrdEmbedSize or opt.hiddenSize * 2 74 | RNN[l] = BiGRU(config) 75 | end 76 | 77 | local selectLayer = BiRNNSelect() 78 | local linearLayer = nn.Linear(2 * opt.hiddenSize, opt.relEmbedSize) 79 | 80 | local seqModel = nn.Sequential() 81 | seqModel:add(wordEmbed) 82 | for l = 1, opt.numLayer do 83 | seqModel:add(nn.Dropout(opt.dropoutRate)) 84 | seqModel:add(RNN[l]) 85 | end 86 | seqModel:add(selectLayer) 87 | 
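-- shape sketch for the pipeline assembled here: word ids [seqLen x batch]
--   -> LookupTable embeddings [seqLen x batch x wrdEmbedSize]
--   -> stacked BiGRU outputs [seqLen x batch x 2*hiddenSize]
--   -> BiRNNSelect reduces the sequence to one [batch x 2*hiddenSize] vector
--      (presumably the boundary hidden states of the two directions), which the
--      Linear added next projects down to relEmbedSize for scoring against relations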
seqModel:add(linearLayer) 88 | 89 | -- ranking score model 90 | local scoreModel = TripleScore(opt.negSize) 91 | 92 | -- put all models together 93 | local model = {} 94 | model.seqModel = seqModel 95 | model.relEmbed = relEmbed 96 | model.posRelDrop = posRelDrop 97 | model.negRelDrop = negRelDrop 98 | model.scoreModel = scoreModel 99 | 100 | -- margin ranking criterion 101 | local criterion = nn.MarginRankingCriterion(opt.costMargin) 102 | 103 | -- put together parms and grad pointers in optimParams and optimGrad tables 104 | local optimParams, optimGrad = {}, {} 105 | for l = 1, opt.numLayer do 106 | local rnnParams, rnnGrad = RNN[l]:getParameters() 107 | rnnParams:uniform(-opt.initRange, opt.initRange) 108 | optimParams[l], optimGrad[l] = rnnParams, rnnGrad 109 | end 110 | optimParams[#optimParams+1], optimGrad[#optimGrad+1] = linearLayer:getParameters() 111 | 112 | -- optimization configurations [subject to change] 113 | local lrWrd, lrRel = 1e-3, 3e-4 114 | 115 | local optimConf = {['lr'] = {}, ['momentum'] = 0.3} 116 | -- local optimConf = {['lr'] = {}} 117 | for l = 1, #optimParams do optimConf['lr'][l] = 1e-3 end 118 | local optimizer = AdaGrad(optimGrad, optimConf) 119 | 120 | -- prepare for training 121 | local sumLoss, epochLoss = 0, 0 122 | local maxIters = opt.maxEpochs * loader.numBatch 123 | local ones = torch.ones(loader.batchSize, loader.negSize) 124 | 125 | -- core training loop 126 | for i = 1, maxIters do 127 | xlua.progress(i, maxIters) 128 | -- in the beginning of each loop, clean the grad_params 129 | relEmbed:zeroGradParameters() 130 | wordEmbed:zeroGradParameters() 131 | for i = 1, #optimGrad do optimGrad[i]:zero() end 132 | 133 | ----------------------- load minibatch ------------------------ 134 | local seq, pos, negs = loader:nextBatch() 135 | local currSeqLen = seq:size(1) 136 | local loss = 0 137 | 138 | ------------------------ forward pass ------------------------- 139 | -- sequence vectors [n_batch x n_dim] 140 | local seqVec = seqModel:forward(seq) 141 | 142 | -- positive vectors [n_batch x n_dim] 143 | local posVec = relEmbed:forward(pos):clone() 144 | local posDropVec = posRelDrop:forward(posVec) 145 | 146 | -- negative matrix [n_neg x n_batch x n_dim] 147 | local negMat = relEmbed:forward(negs) 148 | local negDropMat = negRelDrop:forward(negMat) 149 | 150 | -- scores table {[1] = postive_scores, [2] = negative_scores} 151 | -- local scores = scoreModel:forward({seqVec, posVec, negMat}) 152 | local scores = scoreModel:forward({seqVec, posDropVec, negDropMat}) 153 | local loss = criterion:forward(scores, ones) 154 | 155 | -- d_scores table {[1] = d_postive_scores, [2] = d_negative_scores} 156 | local d_scores = criterion:backward(scores, ones) 157 | 158 | -- d_seqVec [n_batch x n_dim], d_posVec [n_batch x n_dim], d_negMat [n_neg x n_batch x n_dim] 159 | -- local d_seqVec, d_posVec, d_negMat = unpack(scoreModel:backward({seqVec, posVec, negMat}, d_scores)) 160 | local d_seqVec, d_posDropVec, d_negDropMat = unpack(scoreModel:backward({seqVec, posDropVec, negDropMat}, d_scores)) 161 | 162 | local d_negMat = negRelDrop:backward(negMat, d_negDropMat) 163 | 164 | local d_posVec = posRelDrop:backward(posVec, d_posDropVec) 165 | 166 | -- grad due to negative matrix 167 | relEmbed:backward(negs, d_negMat) 168 | 169 | -- grad due to positive vectors 170 | relEmbed:backward(pos, d_posVec) 171 | 172 | -- grad to the sequence model 173 | -- seqModel:backward(dropedSeq, d_seqVec) 174 | seqModel:backward(seq, d_seqVec) 175 | 176 | ----------------------- parameter 
update ---------------------- 177 | -- sgd with scheduled anealing 178 | relEmbed:updateParameters(lrRel / (1 + 0.0001 * i)) 179 | 180 | -- renorm rel embeding into normal ball 181 | relEmbed.weight:renorm(2, 2, 1) 182 | 183 | -- sgd with scheduled anealing (override with sparse update) 184 | wordEmbed:updateParameters(lrWrd / (1 + 0.0001 * i)) 185 | 186 | -- adagrad for rnn, projection 187 | for l = 1, opt.numLayer do optimGrad[l]:clamp(-10, 10) end 188 | optimizer:updateParams(optimParams, optimGrad) 189 | 190 | -- accumulate loss 191 | sumLoss = sumLoss + loss 192 | epochLoss = epochLoss + loss 193 | 194 | -- scheduled anealing the momentum rate after each epoch 195 | if i % loader.numBatch == 0 then 196 | flog.info(string.format('epoch %3d, loss %6.8f', i / loader.numBatch, epochLoss / loader.numBatch / loader.negSize)) 197 | epochLoss = 0 198 | if i / loader.numBatch >= 10 then 199 | optimizer:updateMomentum(math.min(optimizer.momentum + 0.3, 0.99)) 200 | end 201 | end 202 | 203 | ------------------------ training info ------------------------ 204 | if i % opt.printEvery == 0 then 205 | flog.info(string.format("iter %4d, loss = %6.8f", i, sumLoss / opt.printEvery / opt.negSize)) 206 | sumLoss = 0 207 | end 208 | if i % (loader.numBatch * opt.saveEvery) == 0 then 209 | -- save model after each epoch 210 | local epoch = i / loader.numBatch 211 | print('saving model after epoch', epoch) 212 | torch.save(opt.saveFile..'.'..opt.useGPU..'.'..epoch, model) 213 | end 214 | end 215 | -------------------------------------------------------------------------------- /src/py_module/virtuoso.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import urllib, json 4 | import freebase 5 | 6 | # Setting global variables 7 | data_source = 'fb:' 8 | query_url = 'http://localhost:8890/sparql/' 9 | 10 | # HTTP URL is constructed accordingly with JSON query results format in mind. 
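# Usage sketch (the mid and predicate below are illustrative, not taken from this repo):
#   import virtuoso
#   rels  = virtuoso.id_query_out_rel('fb:m.0abc12')   # relations leaving the entity
#   names = virtuoso.id_query_name('fb:m.0abc12')      # processed / tokenized names
# Each helper builds a small SPARQL SELECT along the lines of
#   SELECT ?name WHERE { <fb:m.0abc12> <fb:type.object.name> ?name }
# (the predicate here is only an example), posts it to query_url asking for JSON
# results, and returns the de-duplicated binding values as a plain Python list.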
11 | def sparql_query(query, URL, format='application/json'): 12 | 13 | params={ 14 | 'default-graph': '', 15 | 'should-sponge': 'soft', 16 | 'query': query.encode('utf8'), 17 | 'debug': 'on', 18 | 'timeout': '', 19 | 'format': format, 20 | 'save': 'display', 21 | 'fname': '' 22 | } 23 | 24 | encoded_query = urllib.urlencode(params) 25 | http_response = urllib.urlopen(URL, encoded_query).read() 26 | 27 | try: 28 | json_response = json.loads(http_response) 29 | return json_response 30 | except: 31 | print >> sys.stderr, 'json load error' 32 | print >> sys.stderr, http_response 33 | return None 34 | 35 | # Using freebase mid to query its types 36 | def id_query_type(node_id): 37 | query = ''' 38 | SELECT ?type WHERE {<%s> ?type} 39 | ''' % (node_id) 40 | json_response = sparql_query(query, query_url) 41 | 42 | try: 43 | type_list = [item['type']['value'] for item in json_response['results']['bindings']] 44 | return list(set(type_list)) 45 | except: 46 | return [] 47 | 48 | # Using freebase mid to query its original cased name 49 | def id_query_en_name(node_id): 50 | query = ''' 51 | SELECT ?name WHERE {<%s> ?name} 52 | ''' % (node_id) 53 | json_response = sparql_query(query, query_url) 54 | 55 | try: 56 | name_list = [item['name']['value'] for item in json_response['results']['bindings']] 57 | return list(set(name_list)) 58 | except: 59 | return [] 60 | 61 | # Using freebase mid to query its original cased alias 62 | def id_query_en_alias(node_id): 63 | query = ''' 64 | SELECT ?alias WHERE {<%s> ?alias} 65 | ''' % (node_id) 66 | json_response = sparql_query(query, query_url) 67 | 68 | try: 69 | alias_list = [item['alias']['value'] for item in json_response['results']['bindings']] 70 | return list(set(alias_list)) 71 | except: 72 | return [] 73 | 74 | # Using freebase mid to query its processed & tokenized name 75 | def id_query_name(node_id): 76 | query = ''' 77 | SELECT ?name WHERE {<%s> ?name} 78 | ''' % (node_id) 79 | json_response = sparql_query(query, query_url) 80 | 81 | try: 82 | name_list = [item['name']['value'] for item in json_response['results']['bindings']] 83 | return list(set(name_list)) 84 | except: 85 | return [] 86 | 87 | # Using freebase mid to query its processed & tokenized alias 88 | def id_query_alias(node_id): 89 | query = ''' 90 | SELECT ?alias WHERE {<%s> ?alias} 91 | ''' % (node_id) 92 | json_response = sparql_query(query, query_url) 93 | 94 | try: 95 | alias_list = [item['alias']['value'] for item in json_response['results']['bindings']] 96 | return list(set(alias_list)) 97 | except: 98 | return [] 99 | 100 | # Using freebase mid to query its processed & tokenized name & alias 101 | def id_query_str(node_id): 102 | query = ''' 103 | SELECT ?str WHERE { {<%s> ?str} UNION {<%s> ?str} } 104 | ''' % (node_id, node_id) 105 | json_response = sparql_query(query, query_url) 106 | 107 | try: 108 | name_list = [item['str']['value'] for item in json_response['results']['bindings']] 109 | return list(set(name_list)) 110 | except: 111 | return [] 112 | # Using freebase mid to query all relations coming out of the entity 113 | def id_query_out_rel(node_id, unique = True): 114 | query = ''' 115 | SELECT ?relation WHERE {<%s> ?relation ?object} 116 | ''' % (node_id) 117 | json_response = sparql_query(query, query_url) 118 | 119 | try: 120 | relations = [str(item['relation']['value']) for item in json_response['results']['bindings']] 121 | return list(set(relations)) 122 | except: 123 | return [] 124 | 125 | # Using freebase mid to query all relations coming into the entity 126 | 
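# Illustration of the out/in distinction (the entity and relations are hypothetical):
#   id_query_out_rel(mid) matches {<mid> ?relation ?object}, i.e. relations where the
#   entity appears as the subject (e.g. a person's 'place_of_birth');
#   id_query_in_rel(mid)  matches {?subject ?relation <mid>}, i.e. relations where the
#   entity appears as the object of some other node's triple.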
def id_query_in_rel(node_id, unique = True): 127 | query = ''' 128 | SELECT ?relation WHERE {?subject ?relation <%s>} 129 | ''' % (node_id) 130 | json_response = sparql_query(query, query_url) 131 | 132 | try: 133 | relations = [str(item['relation']['value']) for item in json_response['results']['bindings']] 134 | return list(set(relations)) 135 | except: 136 | return [] 137 | 138 | 139 | # Using the name of an entity to query its freebase mid 140 | def name_query_id(name): 141 | query = ''' 142 | SELECT ?node_id WHERE {?node_id "%s"} 143 | ''' % (name) 144 | json_response = sparql_query(query, query_url) 145 | 146 | try: 147 | node_id_list = [str(item['node_id']['value']) for item in json_response['results']['bindings']] 148 | return list(set(node_id_list)) 149 | except: 150 | return [] 151 | 152 | # Using the alias of an entity to query its freebase mid 153 | def alias_query_id(alias): 154 | query = ''' 155 | SELECT ?node_id WHERE {?node_id "%s"} 156 | ''' % (alias) 157 | json_response = sparql_query(query, query_url) 158 | 159 | try: 160 | node_id_list = [str(item['node_id']['value']) for item in json_response['results']['bindings']] 161 | return list(set(node_id_list)) 162 | except: 163 | return [] 164 | 165 | # Using the alias/name of an entity to query its freebase mid 166 | def str_query_id(string): 167 | query = ''' 168 | SELECT ?node_id WHERE { {?node_id "%s"} UNION {?node_id "%s"} } 169 | ''' % (string, string) 170 | json_response = sparql_query(query, query_url) 171 | 172 | try: 173 | node_id_list = [str(item['node_id']['value']) for item in json_response['results']['bindings']] 174 | return list(set(node_id_list)) 175 | except: 176 | return [] 177 | 178 | # Using freebase mid to query all object coming out of the entity 179 | def id_query_in_entity(node_id, unique = True): 180 | query = ''' 181 | SELECT ?subject WHERE {?subject ?relation <%s>} 182 | ''' % (node_id) 183 | json_response = sparql_query(query, query_url) 184 | 185 | try: 186 | subjects = [str(item['subject']['value']) for item in json_response['results']['bindings']] 187 | return list(set(subjects)) 188 | except: 189 | return [] 190 | 191 | # Using freebase mid to query all relation coming into the entity 192 | def id_query_out_entity(node_id, unique = True): 193 | query = ''' 194 | SELECT ?object WHERE {<%s> ?relation ?object} 195 | ''' % (node_id) 196 | json_response = sparql_query(query, query_url) 197 | 198 | try: 199 | objects = [str(item['object']['value']) for item in json_response['results']['bindings']] 200 | return list(set(objects)) 201 | except: 202 | return [] 203 | 204 | # Using the subject and relation to query the corresponding object 205 | def query_object(subject, relation): 206 | query = ''' 207 | SELECT ?object WHERE {<%s> <%s> ?object} 208 | ''' % (subject, relation) 209 | json_response = sparql_query(query, query_url) 210 | 211 | try: 212 | return [str(item['object']['value']) for item in json_response['results']['bindings']] 213 | except: 214 | return [] 215 | 216 | # Using the object and relation to query the corresponding subject 217 | def query_subject(obj, relation): 218 | query = ''' 219 | SELECT ?subject WHERE {?subject <%s> <%s>} 220 | ''' % (relation, obj) 221 | json_response = sparql_query(query, query_url) 222 | 223 | try: 224 | return [str(item['subject']['value']) for item in json_response['results']['bindings']] 225 | except: 226 | return [] 227 | 228 | # Using the subject and object to query the corresponding relation 229 | def query_relation(sub, obj): 230 | query = ''' 231 | 
SELECT ?relation WHERE {<%s> ?relation <%s>} 232 | ''' % (sub, obj) 233 | json_response = sparql_query(query, query_url) 234 | 235 | try: 236 | objects = [str(item['relation']['value']) for item in json_response['results']['bindings']] 237 | return list(set(objects)) 238 | except: 239 | return [] 240 | 241 | # Check whether a node is a CVT node 242 | def check_cvt(node_id): 243 | query = ''' 244 | SELECT ?tag WHERE {<%s> ?tag} 245 | ''' % (node_id) 246 | json_response = sparql_query(query, query_url) 247 | ret = [str(item['tag']['value']) for item in json_response['results']['bindings']] 248 | 249 | if len(ret) == 1 and ret[0] == 'true': 250 | return True 251 | else: 252 | return False 253 | -------------------------------------------------------------------------------- /src/model/CRF.lua: -------------------------------------------------------------------------------- 1 | local CRF, parent = torch.class('CRF', 'nn.Module') 2 | 3 | -- initialize the module 4 | function CRF:__init(numClass, maxSeqLen, maxBatch) 5 | self.numClass = numClass 6 | self.maxSeqLen = maxSeqLen 7 | self.maxBatch = maxBatch 8 | 9 | -- pairwire parameter 10 | self.weight = torch.rand(self.numClass, self.numClass) 11 | self.gradWeight = torch.zeros(self.numClass, self.numClass) 12 | 13 | -- state memory 14 | self.alpha = torch.zeros(self.maxSeqLen, self.maxBatch, self.numClass) 15 | self.beta = torch.zeros(self.maxSeqLen, self.maxBatch, self.numClass) 16 | 17 | self.partition = torch.zeros(self.maxBatch) 18 | 19 | self.marginalU = torch.zeros(self.maxSeqLen, self.maxBatch, self.numClass) 20 | self.marginalP = torch.zeros(self.maxSeqLen - 1, self.maxBatch, self.numClass, self.numClass) 21 | 22 | self.output = torch.Tensor(maxSeqLen, self.maxBatch) 23 | self.gradInput = torch.Tensor(maxSeqLen, self.maxBatch, self.numClass) 24 | 25 | -- working memory 26 | self.tempMat = torch.zeros(self.maxBatch, self.numClass, self.numClass) 27 | self.maxVec = torch.zeros(self.maxBatch, self.numClass) 28 | 29 | self.uFactor = torch.zeros(self.maxBatch) 30 | self.pFactor = torch.zeros(self.maxBatch) 31 | self.flatLabelPair = torch.zeros(self.maxSeqLen, self.maxBatch) 32 | 33 | self.tempGradWeight = torch.zeros(self.maxSeqLen * self.maxBatch, self.numClass * self.numClass) 34 | 35 | -- helper structures 36 | self.stridePartitionVec = torch.LongStorage({0, 1, 0}) 37 | self.stridePartitionMat = torch.LongStorage({0, 1, 0, 0}) 38 | self.strideWeight = torch.LongStorage({0, 0, self.numClass, 1}) 39 | 40 | self.fullVecSize = torch.LongStorage({self.maxSeqLen, self.maxBatch, self.numClass}) 41 | self.fullMatSize = torch.LongStorage({self.maxSeqLen, self.maxBatch, self.numClass, self.numClass}) 42 | self.pairMatSize = torch.LongStorage({self.maxSeqLen - 1, self.maxBatch, self.numClass, self.numClass}) 43 | self.stepMatSize = torch.LongStorage({self.maxBatch, self.numClass, self.numClass}) 44 | 45 | -- set training flag 46 | self.train = true 47 | end 48 | 49 | function CRF:viterbi(input) 50 | local unary = input 51 | local seqLen, batchSize = unary:size(1), unary:size(2) 52 | 53 | self.fullMatSize[1], self.fullMatSize[2] = seqLen, batchSize 54 | self.fullVecSize[1], self.fullVecSize[2] = seqLen, batchSize 55 | self.stepMatSize[1] = batchSize 56 | 57 | -- resize tensor 58 | self.alpha:resize(self.fullVecSize):zero() 59 | self.beta:resize (self.fullVecSize):zero() 60 | 61 | self.tempMat:resize(self.stepMatSize) 62 | 63 | -- replicates 64 | local batchWeight = self.weight:view(1, self.numClass, self.numClass):expand(self.stepMatSize) 65 | 66 | 
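-- What the loop below does (informal sketch of max-product / Viterbi decoding): at each
-- position it combines the unary scores with the best accumulated score from the previous
-- position and, except at the final step, the pairwise transition weights; the running
-- maxima are stored in self.alpha and the arg-max back-pointers in self.beta, which the
-- second loop then follows backwards to fill self.output with the decoded label indices.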
local repUnary = unary:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 67 | local repAlpha = self.alpha:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 68 | local repBeta = self.beta:view (seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 69 | 70 | for i = 1, seqLen do 71 | self.tempMat:copy(repUnary[i]) 72 | if i ~= seqLen then 73 | self.tempMat:add(batchWeight) 74 | end 75 | if i ~= 1 then 76 | self.tempMat:add(repAlpha[i-1]) 77 | end 78 | 79 | val, idx = torch.max(self.tempMat, 2) 80 | self.alpha[i], self.beta[i] = val, idx:typeAs(self.beta[i]) 81 | end 82 | 83 | self.output:resize(seqLen, batchSize, 1):zero() 84 | 85 | self.output[seqLen] = self.beta[{seqLen, {}, 1}] 86 | for i = seqLen - 1, 1, -1 do 87 | self.output[i] = self.beta[i]:gather(2, self.output[i+1]) 88 | end 89 | 90 | self.output = self.output:view(seqLen, batchSize) 91 | 92 | return self.output 93 | end 94 | 95 | function CRF:forwardbackward(input) 96 | local unary, label = unpack(input) 97 | local seqLen, batchSize = unary:size(1), unary:size(2) 98 | 99 | self.pairMatSize[1], self.pairMatSize[2] = seqLen - 1, batchSize 100 | self.fullMatSize[1], self.fullMatSize[2] = seqLen, batchSize 101 | self.fullVecSize[1], self.fullVecSize[2] = seqLen, batchSize 102 | self.stepMatSize[1] = batchSize 103 | 104 | -- resize tensor 105 | self.alpha:resize(self.fullVecSize):zero() 106 | self.beta:resize(self.fullVecSize):zero() 107 | 108 | self.marginalU:resize(self.fullVecSize) 109 | self.marginalP:resize(self.pairMatSize) 110 | 111 | self.partition:resize(batchSize) 112 | 113 | self.tempMat:resize(self.stepMatSize) 114 | self.maxVec:resize (batchSize, self.numClass) 115 | 116 | -- replicates 117 | local fullPartitionVec = self.partition.new(self.partition:storage(), self.partition:storageOffset(), self.fullVecSize, self.stridePartitionVec) 118 | local pairPartitionMat = self.partition.new(self.partition:storage(), self.partition:storageOffset(), self.pairMatSize, self.stridePartitionMat) 119 | 120 | local pairWeight = self.weight.new(self.weight:storage(), self.weight:storageOffset(), self.pairMatSize, self.strideWeight) 121 | 122 | local batchWeight = self.weight:view(1, self.numClass, self.numClass):expand(self.stepMatSize) 123 | local transWeight = batchWeight:transpose(2,3) 124 | 125 | local repUnary = unary:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 126 | local repAlpha = self.alpha:view(seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 127 | local repBeta = self.beta:view (seqLen, batchSize, self.numClass, 1):expand(self.fullMatSize) 128 | 129 | local repMaxVec = self.maxVec:view(batchSize, 1, self.numClass):expand(self.stepMatSize) 130 | 131 | -- forward recursion [alpha] 132 | for i = 1, seqLen do 133 | self.tempMat:copy(repUnary[i]) 134 | if i ~= seqLen then 135 | self.tempMat:add(batchWeight) 136 | end 137 | if i ~= 1 then 138 | self.tempMat:add(repAlpha[i-1]) 139 | end 140 | 141 | -- log sum exp 142 | self.maxVec:max(self.tempMat, 2) 143 | self.tempMat:add(-1, repMaxVec):exp() 144 | self.alpha[i]:sum(self.tempMat, 2):log() 145 | self.alpha[i]:add(self.maxVec) 146 | end 147 | 148 | -- backward recursion [beta] 149 | for i = seqLen, 1, -1 do 150 | self.tempMat:copy(repUnary[i]) 151 | if i ~= 1 then 152 | self.tempMat:add(transWeight) 153 | end 154 | if i ~= seqLen then 155 | self.tempMat:add(repBeta[i+1]) 156 | end 157 | 158 | -- log sum exp 159 | self.maxVec:max(self.tempMat, 2) 160 | self.tempMat:add(-1, repMaxVec):exp() 161 | 
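-- (stable log-sum-exp: logsumexp(x) = max(x) + log(sum(exp(x - max(x)))); the max
--  subtracted before the exp above is added back onto self.beta[i] just below)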
self.beta[i]:sum(self.tempMat, 2):log() 162 | self.beta[i]:add(self.maxVec) 163 | end 164 | 165 | self.partition:copy(self.alpha[{seqLen, {}, 1}]) 166 | 167 | -- marginals 168 | self.marginalU:copy(unary) 169 | if seqLen >= 2 then 170 | self.marginalU[{{2, seqLen}}]:add(self.alpha[{{1, seqLen - 1}}]) 171 | self.marginalU[{{1, seqLen - 1}}]:add(self.beta [{{2, seqLen}}]) 172 | end 173 | self.marginalU:add(-1, fullPartitionVec) 174 | self.marginalU:exp() 175 | 176 | if seqLen >= 2 then 177 | self.marginalP:add(repUnary[{{1, seqLen - 1}}], repUnary[{{2, seqLen}}]:transpose(3,4)) 178 | self.marginalP:add(pairWeight) 179 | if seqLen > 2 then 180 | self.marginalP[{{2, seqLen - 1}}]:add(repAlpha[{{1, seqLen - 2}}]) 181 | self.marginalP[{{1, seqLen - 2}}]:add(repBeta [{{3, seqLen}}]:transpose(3,4)) 182 | end 183 | self.marginalP:add(-1, pairPartitionMat) 184 | self.marginalP:exp() 185 | end 186 | 187 | -- empirical probability 188 | self.output:resize(batchSize):zero() 189 | self.uFactor:resize(batchSize):zero() 190 | self.pFactor:resize(batchSize):zero() 191 | self.flatLabelPair:resize(seqLen - 1, batchSize):zero() 192 | 193 | self.uFactor:view(batchSize, 1):sum(unary:view(-1, self.numClass):gather(2, label:view(-1, 1)):view(seqLen, batchSize), 1) 194 | 195 | if seqLen >= 2 then 196 | self.flatLabelPair = (label[{{1, seqLen - 1}}] - 1) * self.numClass + label[{{2, seqLen}}] 197 | self.pFactor:view(batchSize, 1):sum(self.weight:view(1, -1):gather(2, self.flatLabelPair:view(1, -1)):view(seqLen-1, batchSize), 1) 198 | end 199 | 200 | self.output:add(self.uFactor, self.pFactor) 201 | self.output:add(-1, self.partition) 202 | self.output:exp() 203 | 204 | return self.output 205 | end 206 | 207 | function CRF:updateOutput(input) 208 | if self.train then 209 | return self:forwardbackward(input) 210 | else 211 | return self:viterbi(input) 212 | end 213 | end 214 | 215 | function CRF:backward(input) 216 | local unary, label = unpack(input) 217 | local seqLen, batchSize = unary:size(1), unary:size(2) 218 | 219 | self.gradInput:resizeAs(unary):zero() 220 | 221 | self.gradInput:view(-1, self.numClass):scatter(2, label:view(-1, 1), -1) 222 | self.gradInput:add(self.marginalU) 223 | 224 | if seqLen >= 2 then 225 | self.tempGradWeight:resize((seqLen - 1) * batchSize, self.numClass * self.numClass) 226 | 227 | self.tempGradWeight:scatter(2, self.flatLabelPair:view(-1, 1), -1) 228 | self.tempGradWeight:add(self.marginalP) 229 | 230 | self.gradWeight:view(1, self.numClass * self.numClass):sum(self.tempGradWeight, 1) 231 | end 232 | 233 | return self.gradInput 234 | end 235 | 236 | function CRF:parameters() 237 | return {self.weight}, {self.gradWeight} 238 | end 239 | -------------------------------------------------------------------------------- /src/model/BiGRU.lua: -------------------------------------------------------------------------------- 1 | local BiGRU, parent = torch.class('BiGRU', 'BiRNN') 2 | 3 | -- initialize the module 4 | function BiGRU:__init(config) 5 | parent.__init(self) 6 | 7 | -- config the model 8 | self.inputSize = config.inputSize 9 | self.hiddenSize = config.hiddenSize 10 | self.maxSeqLen = config.maxSeqLen or 200 11 | self.maxBatch = config.maxBatch or 128 12 | 13 | -- allocate weights memory 14 | self.weight = torch.Tensor(self.inputSize, self.hiddenSize*6):uniform(-1.0, 1.0) 15 | self.gradWeight = torch.Tensor(self.inputSize, self.hiddenSize*6):zero() 16 | 17 | self.bias = torch.Tensor(self.hiddenSize*6):uniform(-1.0, 1.0) 18 | self.gradBias = torch.Tensor(self.hiddenSize*6):zero() 19 
| 20 | self.recWeight_G = torch.Tensor(2, self.hiddenSize, self.hiddenSize*2):uniform(-1.0, 1.0) 21 | self.gradRecWeight_G = torch.Tensor(2, self.hiddenSize, self.hiddenSize*2):zero() 22 | 23 | self.recWeight_H = torch.Tensor(2, self.hiddenSize, self.hiddenSize):uniform(-1.0, 1.0) 24 | self.gradRecWeight_H = torch.Tensor(2, self.hiddenSize, self.hiddenSize):zero() 25 | 26 | -- allocate working memory 27 | self.gates = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*6):zero() 28 | self.resetH = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 29 | self.comple = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 30 | self.hidden = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 31 | 32 | self.gradGates = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*6):zero() 33 | self.gradInput = torch.Tensor(self.maxSeqLen, self.maxBatch, self.inputSize *2):zero() 34 | self.gradResetH = torch.Tensor(self.maxSeqLen, self.maxBatch, self.hiddenSize*2):zero() 35 | 36 | self.buffer = torch.ones(self.maxSeqLen*self.maxBatch) 37 | 38 | -- logging information 39 | if config.logger then 40 | config.logger.info(string.rep('-', 50)) 41 | config.logger.info('BiGRU Configuration:') 42 | config.logger.info(string.format(' inputSize : %5d', self.inputSize)) 43 | config.logger.info(string.format(' hiddenSize : %5d', self.hiddenSize)) 44 | config.logger.info(string.format(' maxSeqLen : %5d', self.maxSeqLen)) 45 | config.logger.info(string.format(' maxBatch : %5d', self.maxBatch)) 46 | end 47 | 48 | end 49 | 50 | function BiGRU:updateOutput(input) 51 | assert(self.inputSize==input:size(3), 'Input size not match') 52 | local seqLen, batchSize = input:size(1), input:size(2) 53 | 54 | self.gates:resize (seqLen, batchSize, self.hiddenSize*6) 55 | self.resetH:resize(seqLen, batchSize, self.hiddenSize*2) 56 | self.comple:resize(seqLen, batchSize, self.hiddenSize*2) 57 | self.hidden:resize(seqLen, batchSize, self.hiddenSize*2) 58 | 59 | self.buffer:resize(seqLen*batchSize) 60 | 61 | self.comple:fill(1) 62 | 63 | local denseInput = input:view(seqLen*batchSize, self.inputSize) 64 | local denseGates = self.gates:view(seqLen*batchSize, self.hiddenSize*6) 65 | 66 | denseGates:addr(0, 1, self.buffer, self.bias) 67 | denseGates:addmm(1, denseInput, self.weight) 68 | 69 | for i = 1, self.nStream do 70 | -- set stream: stream 1 deals with forward-GRU & stream 2 deals with backward-GRU 71 | if cutorch then cutorch.setStream(i) end 72 | 73 | -- get traverse order (depends on the stream) 74 | local begIdx, endIdx, stride = self:traverseOrder(seqLen, i) 75 | 76 | -- compute stream memory offset 77 | local left, right = (i-1)*self.hiddenSize, i*self.hiddenSize 78 | 79 | local prevHidden 80 | 81 | for seqIdx = begIdx, endIdx, stride do 82 | -- get current memory 83 | local currGates = self.gates [{seqIdx, {}, {3*left+1, 3*right}}] 84 | local currResetH = self.resetH[{seqIdx, {}, { left+1, right}}] 85 | local currComple = self.comple[{seqIdx, {}, { left+1, right}}] 86 | local currHidden = self.hidden[{seqIdx, {}, { left+1, right}}] 87 | 88 | -- decompose currGates 89 | local preGateAct = currGates[{{}, { 1, self.hiddenSize}}] 90 | local resetGate = currGates[{{}, { self.hiddenSize+1, 2*self.hiddenSize}}] 91 | local updateGate = currGates[{{}, {2*self.hiddenSize+1, 3*self.hiddenSize}}] 92 | local bothGates = currGates[{{}, { self.hiddenSize+1, 3*self.hiddenSize}}] 93 | 94 | -- recurrent connection 95 | if seqIdx ~= begIdx then 96 | bothGates:addmm(1, 
prevHidden, self.recWeight_G[i]) 97 | end 98 | 99 | -- inplace non-linearity for reset & update (both) gates 100 | -- bothGates.nn.Sigmoid_forward(bothGates, bothGates) 101 | bothGates.THNN.Sigmoid_updateOutput(bothGates:cdata(), bothGates:cdata()) 102 | 103 | -- reset prev hidden 104 | if seqIdx ~= begIdx then 105 | currResetH:cmul(resetGate, prevHidden) 106 | preGateAct:addmm(1, currResetH, self.recWeight_H[i]) 107 | end 108 | -- preGateAct.nn.Tanh_forward(preGateAct, preGateAct) 109 | preGateAct.THNN.Tanh_updateOutput(preGateAct:cdata(), preGateAct:cdata()) 110 | 111 | -- complementary gate 112 | currComple:add(-1, updateGate) 113 | 114 | -- currect hidden 115 | currHidden:cmul(preGateAct, currComple) 116 | if seqIdx ~= begIdx then 117 | currHidden:addcmul(1, prevHidden, updateGate) 118 | end 119 | 120 | -- set prev hidden 121 | prevHidden = currHidden 122 | end 123 | end 124 | 125 | if cutorch then 126 | -- set back the stream to default stream (0): 127 | cutorch.setStream(0) 128 | 129 | -- 0 is default stream, let 0 wait for the 2 streams to complete before doing anything further 130 | cutorch.streamWaitFor(0, self.streamList) 131 | end 132 | 133 | self.output = self.hidden 134 | return self.output 135 | end 136 | 137 | function BiGRU:updateGradInput(input, gradOutput) 138 | assert(self.hiddenSize*2==gradOutput:size(gradOutput:nDimension()), 'gradOutput size not match') 139 | assert(input:size(1)==gradOutput:size(1) and input:size(2)==gradOutput:size(2), 'gradOutput and input size not match') 140 | 141 | local seqLen, batchSize = input:size(1), input:size(2) 142 | 143 | self.gradInput:resize (seqLen, batchSize, self.inputSize) 144 | self.gradGates:resize (seqLen, batchSize, self.hiddenSize*6) 145 | self.gradResetH:resize(seqLen, batchSize, self.hiddenSize*2) 146 | 147 | self.gradGates[1]:fill(0) 148 | self.gradGates[seqLen]:fill(0) 149 | 150 | for i = 1, self.nStream do 151 | -- set stream: stream 1 deals with forward-GRU & stream 2 deals with backward-GRU 152 | if cutorch then cutorch.setStream(i) end 153 | 154 | -- get traverse order (depends on the stream) 155 | local begIdx, endIdx, stride = self:traverseOrder(seqLen, i) 156 | 157 | -- compute stream memory offset 158 | local left, right = (i-1)*self.hiddenSize, i*self.hiddenSize 159 | 160 | local prevHidden, prevGradOutput 161 | 162 | for seqIdx = endIdx, begIdx, -stride do 163 | -- get current memory 164 | local currGates = self.gates [{seqIdx, {}, {3*left+1, 3*right}}] 165 | local currResetH = self.resetH[{seqIdx, {}, { left+1, right}}] 166 | local currComple = self.comple[{seqIdx, {}, { left+1, right}}] 167 | local currHidden = self.hidden[{seqIdx, {}, { left+1, right}}] 168 | 169 | local currGradGates = self.gradGates [{seqIdx, {}, {3*left+1, 3*right}}] 170 | local currGradResetH = self.gradResetH[{seqIdx, {}, { left+1, right}}] 171 | local currGradOutput = gradOutput [{seqIdx, {}, { left+1, right}}] 172 | 173 | -- decompose currGates 174 | local preGateAct = currGates[{{}, { 1, self.hiddenSize}}] 175 | local resetGate = currGates[{{}, { self.hiddenSize+1, 2*self.hiddenSize}}] 176 | local updateGate = currGates[{{}, {2*self.hiddenSize+1, 3*self.hiddenSize}}] 177 | 178 | local gradPreGateAct = currGradGates[{{}, { 1, self.hiddenSize}}] 179 | local gradResetGate = currGradGates[{{}, { self.hiddenSize+1, 2*self.hiddenSize}}] 180 | local gradUpdateGate = currGradGates[{{}, {2*self.hiddenSize+1, 3*self.hiddenSize}}] 181 | local gradBothGates = currGradGates[{{}, { self.hiddenSize+1, 3*self.hiddenSize}}] 182 | 183 | -- pre-gate 
input: d_h[t] / d_title{h}[t] 184 | gradPreGateAct:cmul(currGradOutput, currComple) 185 | -- gradPreGateAct.nn.Tanh_backward(gradPreGateAct, preGateAct, gradPreGateAct) -- inplace 186 | gradPreGateAct.THNN.Tanh_updateGradInput(preGateAct:cdata(), gradPreGateAct:cdata(), gradPreGateAct:cdata(), preGateAct:cdata()) -- inplace 187 | 188 | -- related to prev hidden 189 | if seqIdx ~= begIdx then 190 | -- set prev hidden 191 | prevHidden = self.hidden[{seqIdx-stride, {}, {left+1, right}}] 192 | 193 | -- reset prev hidden: d_h[t] / d_hat{h}[t] 194 | currGradResetH:mm(gradPreGateAct, self.recWeight_H[i]:t()) 195 | 196 | -- reset gate: d_h[t] / d_r[t] 197 | gradResetGate:cmul(currGradResetH, prevHidden) 198 | -- gradResetGate.nn.Sigmoid_backward(gradResetGate, resetGate, gradResetGate) -- inplace 199 | gradResetGate.THNN.Sigmoid_updateGradInput(resetGate:cdata(), gradResetGate:cdata(), gradResetGate:cdata(), resetGate:cdata()) -- inplace 200 | 201 | -- update gate: d_h[t] / d_z[t] 202 | gradUpdateGate:cmul(currGradOutput, prevHidden) 203 | end 204 | 205 | -- update gate: d_h[t] / d_z[t] 206 | gradUpdateGate:addcmul(-1, currGradOutput, preGateAct) 207 | -- gradUpdateGate.nn.Sigmoid_backward(gradUpdateGate, updateGate, gradUpdateGate) -- inplace 208 | gradUpdateGate.THNN.Sigmoid_updateGradInput(updateGate:cdata(), gradUpdateGate:cdata(), gradUpdateGate:cdata(), updateGate:cdata()) -- inplace 209 | 210 | -- d_h[t] / d_recWeight_H 211 | self.gradRecWeight_H[i]:addmm(1, currResetH:t(), gradPreGateAct) 212 | 213 | if seqIdx ~= begIdx then 214 | -- set prev grad hidden/output 215 | prevGradOutput = gradOutput[{seqIdx-stride, {}, {left+1, right}}] 216 | 217 | -- prev hidden: d_h[t] / d_h[t-1] 218 | prevGradOutput:addmm(1, gradBothGates, self.recWeight_G[i]:t()) 219 | prevGradOutput:addcmul(1, currGradOutput, updateGate) 220 | prevGradOutput:addcmul(1, currGradResetH, resetGate) 221 | 222 | -- d_h[t] / d_recWeight_G 223 | self.gradRecWeight_G[i]:addmm(1, prevHidden:t(), gradBothGates) 224 | end 225 | end 226 | end 227 | 228 | if cutorch then 229 | -- set back the stream to default stream (0): 230 | cutorch.setStream(0) 231 | 232 | -- 0 is default stream, let 0 wait for the 2 streams to complete before doing anything further 233 | cutorch.streamWaitFor(0, self.streamList) 234 | end 235 | 236 | local denseInput = input:view(seqLen*batchSize, self.inputSize) 237 | local denseGradInput = self.gradInput:view(seqLen*batchSize, self.inputSize) 238 | local denseGradGates = self.gradGates:view(seqLen*batchSize, self.hiddenSize*6) 239 | 240 | -- d_E / d_input 241 | denseGradInput:mm(denseGradGates, self.weight:t()) 242 | 243 | -- d_E / d_W 244 | self.gradWeight:addmm(1, denseInput:t(), denseGradGates) 245 | 246 | -- d_E / d_b 247 | self.gradBias:addmv(1, denseGradGates:t(), self.buffer) 248 | 249 | return self.gradInput 250 | end 251 | 252 | function BiGRU:parameters() 253 | return {self.weight, self.recWeight_G, self.recWeight_H, self.bias}, {self.gradWeight, self.gradRecWeight_G, self.gradRecWeight_H, self.gradBias} 254 | end 255 | -------------------------------------------------------------------------------- /KnowledgeBase/type.top-500.pkl: -------------------------------------------------------------------------------- 1 | (dp1 2 | S'fb:film.film' 3 | p2 4 | I16 5 | sS'fb:music.genre' 6 | p3 7 | I395 8 | sS'fb:base.tagit.man_made_thing' 9 | p4 10 | I225 11 | sS'fb:user.doconnor.pets.topic' 12 | p5 13 | I319 14 | sS'fb:award.hall_of_fame_inductee' 15 | p6 16 | I216 17 | sS'fb:user.narphorium.people.nndb_person' 
18 | p7 19 | I75 20 | sS'fb:music.multipart_release' 21 | p8 22 | I211 23 | sS'fb:base.academia.topic' 24 | p9 25 | I380 26 | sS'fb:base.wfilmbase.topic' 27 | p10 28 | I262 29 | sS'fb:base.cars_refactor.model' 30 | p11 31 | I236 32 | sS'fb:book.magazine' 33 | p12 34 | I315 35 | sS'fb:book.book_character' 36 | p13 37 | I190 38 | sS'fb:freebase.type_profile' 39 | p14 40 | I204 41 | sS'fb:base.webvideo.topic' 42 | p15 43 | I391 44 | sS'fb:base.prison.topic' 45 | p16 46 | I415 47 | sS'fb:base.aareas.schema.earth.citytown' 48 | p17 49 | I193 50 | sS'fb:internet.website_owner' 51 | p18 52 | I306 53 | sS'fb:base.ontologies.ontology_instance' 54 | p19 55 | I117 56 | sS'fb:travel.accommodation' 57 | p20 58 | I384 59 | sS'fb:influence.influence_node' 60 | p21 61 | I44 62 | sS'fb:base.type_ontology.abstract' 63 | p22 64 | I4 65 | sS'fb:base.americancomedy.topic' 66 | p23 67 | I256 68 | sS'fb:location.hud_foreclosure_area' 69 | p24 70 | I57 71 | sS'fb:user.alexander.philosophy.philosopher' 72 | p25 73 | I357 74 | sS'fb:base.consumermedical.medical_term' 75 | p26 76 | I218 77 | sS'fb:food.food' 78 | p27 79 | I197 80 | sS'fb:fictional_universe.person_in_fiction' 81 | p28 82 | I165 83 | sS'fb:education.field_of_study' 84 | p29 85 | I271 86 | sS'fb:sports.boxer' 87 | p30 88 | I189 89 | sS'fb:base.newyorkcity.topic' 90 | p31 91 | I367 92 | sS'fb:boats.ship' 93 | p32 94 | I259 95 | sS'fb:base.adultentertainment.adult_entertainer' 96 | p33 97 | I314 98 | sS'fb:user.robert.us_congress.topic' 99 | p34 100 | I389 101 | sS'fb:base.horseracing.topic' 102 | p35 103 | I212 104 | sS'fb:base.worldwartwo.topic' 105 | p36 106 | I268 107 | sS'fb:base.blackhistorymonth.topic' 108 | p37 109 | I180 110 | sS'fb:music.release_track' 111 | p38 112 | I15 113 | sS'fb:user.alexander.misc.murdered_person' 114 | p39 115 | I255 116 | sS'fb:base.hindisoundtracks.topic' 117 | p40 118 | I455 119 | sS'fb:business.business_location' 120 | p41 121 | I270 122 | sS'fb:medicine.risk_factor' 123 | p42 124 | I477 125 | sS'fb:royalty.monarch' 126 | p43 127 | I253 128 | sS'fb:base.type_ontology.animate' 129 | p44 130 | I6 131 | sS'fb:book.periodical' 132 | p45 133 | I133 134 | sS'fb:music.producer' 135 | p46 136 | I104 137 | sS'fb:aviation.aircraft_owner' 138 | p47 139 | I361 140 | sS'fb:dining.restaurant' 141 | p48 142 | I407 143 | sS'fb:periodicals.newspaper_circulation_area' 144 | p49 145 | I140 146 | sS'fb:medicine.condition_prevention_factors' 147 | p50 148 | I456 149 | sS'fb:base.allthingsnewyork.topic' 150 | p51 151 | I115 152 | sS'fb:sports.drafted_athlete' 153 | p52 154 | I290 155 | sS'fb:base.fictionaluniverse.topic' 156 | p53 157 | I497 158 | sS'fb:food.dish' 159 | p54 160 | I355 161 | sS'fb:base.cannes.topic' 162 | p55 163 | I202 164 | sS'fb:book.literary_series' 165 | p56 166 | I481 167 | sS'fb:biology.animal_breed' 168 | p57 169 | I459 170 | sS'fb:base.zxspectrum.zx_spectrum_program' 171 | p58 172 | I448 173 | sS'fb:military.battle' 174 | p59 175 | I171 176 | sS'fb:sports.sports_official' 177 | p60 178 | I353 179 | sS'fb:film.cinematographer' 180 | p61 181 | I93 182 | sS'fb:law.inventor' 183 | p62 184 | I235 185 | sS'fb:media_common.quotation' 186 | p63 187 | I272 188 | sS'fb:music.release' 189 | p64 190 | I20 191 | sS'fb:astronomy.star' 192 | p65 193 | I258 194 | sS'fb:medicine.disease' 195 | p66 196 | I185 197 | sS'fb:film.film_screening_venue' 198 | p67 199 | I431 200 | sS'fb:broadcast.tv_station' 201 | p68 202 | I474 203 | sS'fb:media_common.adapted_work' 204 | p69 205 | I148 206 | sS'fb:music.soundtrack' 207 | p70 208 | I98 
209 | sS'fb:award.award_nominated_work' 210 | p71 211 | I45 212 | sS'fb:theater.theater' 213 | p72 214 | I486 215 | sS'fb:theater.theater_actor' 216 | p73 217 | I126 218 | sS'fb:book.written_work' 219 | p74 220 | I28 221 | sS'fb:astronomy.celestial_object' 222 | p75 223 | I58 224 | sS'fb:base.scotland.topic' 225 | p76 226 | I487 227 | sS'fb:type.content' 228 | p77 229 | I17 230 | sS'fb:base.aareas.schema.au.local_government_area' 231 | p78 232 | I493 233 | sS'fb:people.deceased_person' 234 | p79 235 | I8 236 | sS'fb:broadcast.broadcast' 237 | p80 238 | I146 239 | sS'fb:sports.sports_team' 240 | p81 241 | I167 242 | sS'fb:fictional_universe.fictional_character_creator' 243 | p82 244 | I254 245 | sS'fb:olympics.olympic_athlete' 246 | p83 247 | I96 248 | sS'fb:dining.chef' 249 | p84 250 | I452 251 | sS'fb:biology.deceased_organism' 252 | p85 253 | I398 254 | sS'fb:architecture.museum' 255 | p86 256 | I241 257 | sS'fb:base.argumentmaps.idea' 258 | p87 259 | I441 260 | sS'fb:government.government_agency' 261 | p88 262 | I308 263 | sS'fb:base.litcentral.topic' 264 | p89 265 | I416 266 | sS'fb:base.schemastaging.context_name' 267 | p90 268 | I137 269 | sS'fb:media_common.creative_work' 270 | p91 271 | I22 272 | sS'fb:visual_art.visual_artist' 273 | p92 274 | I82 275 | sS'fb:business.industry' 276 | p93 277 | I293 278 | sS'fb:base.lgbtfilms.topic' 279 | p94 280 | I265 281 | sS'fb:biology.animal' 282 | p95 283 | I374 284 | sS'fb:broadcast.artist' 285 | p96 286 | I108 287 | sS'fb:base.crime.lawyer' 288 | p97 289 | I168 290 | sS'fb:base.popstra.celebrity' 291 | p98 292 | I150 293 | sS'fb:user.maxim75.default_domain.dbpedia_import' 294 | p99 295 | I103 296 | sS'fb:base.foodrecipes.topic' 297 | p100 298 | I339 299 | sS'fb:sports.sports_award_winner' 300 | p101 301 | I401 302 | sS'fb:film.editor' 303 | p102 304 | I92 305 | sS'fb:film.film_location' 306 | p103 307 | I260 308 | sS'fb:music.lyricist' 309 | p104 310 | I87 311 | sS'fb:base.argumentmaps.innovator' 312 | p105 313 | I417 314 | sS'fb:military.military_commander' 315 | p106 316 | I440 317 | sS'fb:tv.tv_personality' 318 | p107 319 | I97 320 | sS'fb:organization.organization' 321 | p108 322 | I32 323 | sS'fb:music.songwriter' 324 | p109 325 | I242 326 | sS'fb:base.skosbase.topic' 327 | p110 328 | I162 329 | sS'fb:film.film_set_designer' 330 | p111 331 | I228 332 | sS'fb:cvg.cvg_designer' 333 | p112 334 | I465 335 | sS'fb:visual_art.art_subject' 336 | p113 337 | I307 338 | sS'fb:people.family_member' 339 | p114 340 | I145 341 | sS'fb:projects.project_participant' 342 | p115 343 | I282 344 | sS'fb:music.artist' 345 | p116 346 | I23 347 | sS'fb:medicine.symptom' 348 | p117 349 | I403 350 | sS'fb:cricket.cricket_bowler' 351 | p118 352 | I333 353 | sS'fb:music.concert_film' 354 | p119 355 | I352 356 | sS'fb:user.alexbl.honorary_title.titled_person' 357 | p120 358 | I301 359 | sS'fb:american_football.football_coach' 360 | p121 361 | I317 362 | sS'fb:geography.body_of_water' 363 | p122 364 | I107 365 | sS'fb:cvg.cvg_publisher' 366 | p123 367 | I388 368 | sS'fb:american_football.football_player' 369 | p124 370 | I76 371 | sS'fb:user.micahsaul.advertising.advertiser' 372 | p125 373 | I489 374 | sS'fb:base.aareas.schema.administrative_area' 375 | p126 376 | I78 377 | sS'fb:education.educational_institution_campus' 378 | p127 379 | I77 380 | sS'fb:base.moscratch.topic' 381 | p128 382 | I488 383 | sS'fb:base.gayporn.topic' 384 | p129 385 | I377 386 | sS'fb:base.movies1001.topic' 387 | p130 388 | I385 389 | sS'fb:tennis.tennis_tournament_champion' 390 | p131 
391 | I408 392 | sS'fb:location.australian_local_government_area' 393 | p132 394 | I490 395 | sS'fb:cricket.cricket_player' 396 | p133 397 | I179 398 | sS'fb:base.uncommon.topic' 399 | p134 400 | I291 401 | sS'fb:music.composer' 402 | p135 403 | I49 404 | sS'fb:business.consumer_product' 405 | p136 406 | I31 407 | sS'fb:religion.deity' 408 | p137 409 | I485 410 | sS'fb:music.conductor' 411 | p138 412 | I288 413 | sS'fb:tv.tv_soundtrack' 414 | p139 415 | I330 416 | sS'fb:user.alust.default_domain.processed_with_review_queue' 417 | p140 418 | I42 419 | sS'fb:base.computerscience.topic' 420 | p141 421 | I413 422 | sS'fb:base.toronto.topic' 423 | p142 424 | I313 425 | sS'fb:theater.musical_soundtrack' 426 | p143 427 | I454 428 | sS'fb:sports.pro_athlete' 429 | p144 430 | I14 431 | sS'fb:base.todolists.topic' 432 | p145 433 | I161 434 | sS'fb:geography.lake' 435 | p146 436 | I178 437 | sS'fb:cvg.cvg_developer' 438 | p147 439 | I305 440 | sS'fb:location.us_county' 441 | p148 442 | I187 443 | sS'fb:base.vermont.topic' 444 | p149 445 | I360 446 | sS'fb:base.tagit.concept' 447 | p150 448 | I109 449 | sS'fb:base.animal_synopses.animal_synopsis' 450 | p151 451 | I445 452 | sS'fb:book.journal' 453 | p152 454 | I400 455 | sS'fb:music.record_label' 456 | p153 457 | I239 458 | sS'fb:royalty.chivalric_order_member' 459 | p154 460 | I136 461 | sS'fb:cvg.musical_game_song' 462 | p155 463 | I494 464 | sS'fb:comic_books.comic_book_series' 465 | p156 466 | I473 467 | sS'fb:architecture.venue' 468 | p157 469 | I163 470 | sS'fb:base.tagasauris.organic_object' 471 | p158 472 | I213 473 | sS'fb:base.aareas.schema.us.county' 474 | p159 475 | I188 476 | sS'fb:biology.organism_classification' 477 | p160 478 | I118 479 | sS'fb:base.sundance.topic' 480 | p161 481 | I419 482 | sS'fb:cvg.video_game_soundtrack' 483 | p162 484 | I378 485 | sS'fb:event.disaster' 486 | p163 487 | I427 488 | sS'fb:book.book' 489 | p164 490 | I29 491 | sS'fb:book.publishing_company' 492 | p165 493 | I390 494 | sS'fb:theater.theatrical_lyricist' 495 | p166 496 | I480 497 | sS'fb:base.foodrecipes.recipe_ingredient' 498 | p167 499 | I372 500 | sS'fb:education.educational_institution' 501 | p168 502 | I54 503 | sS'fb:people.measured_person' 504 | p169 505 | I12 506 | sS'fb:biology.pedigreed_animal' 507 | p170 508 | I220 509 | sS'fb:military.military_unit' 510 | p171 511 | I244 512 | sS'fb:base.animemanga.topic' 513 | p172 514 | I402 515 | sS'fb:base.rosenbaum.topic' 516 | p173 517 | I396 518 | sS'fb:user.micahsaul.advertising.advertised_thing' 519 | p174 520 | I351 521 | sS'fb:base.biblioness.bibs_topic' 522 | p175 523 | I350 524 | sS'fb:time.recurring_event' 525 | p176 526 | I286 527 | sS'fb:base.reviews.review' 528 | p177 529 | I457 530 | sS'fb:tv.tv_character' 531 | p178 532 | I125 533 | sS'fb:computer.software_developer' 534 | p179 535 | I492 536 | sS'fb:organization.membership_organization' 537 | p180 538 | I491 539 | sS'fb:base.duiattorneys.topic' 540 | p181 541 | I484 542 | sS'fb:base.holocaust.topic' 543 | p182 544 | I469 545 | sS'fb:user.tsegaran.random.taxonomy_subject' 546 | p183 547 | I302 548 | sS'fb:film.film_production_designer' 549 | p184 550 | I130 551 | sS'fb:base.biblioness.bibs_location' 552 | p185 553 | I261 554 | sS'fb:base.tagit.place' 555 | p186 556 | I329 557 | sS'fb:base.activism.topic' 558 | p187 559 | I237 560 | sS'fb:award.award_winning_work' 561 | p188 562 | I55 563 | sS'fb:law.judge' 564 | p189 565 | I334 566 | sS'fb:religion.place_of_worship' 567 | p190 568 | I217 569 | sS'fb:business.defunct_company' 570 | p191 571 
| I251 572 | sS'fb:base.petbreeds.topic' 573 | p192 574 | I471 575 | sS'fb:book.book_subject' 576 | p193 577 | I80 578 | sS'fb:theater.theater_director' 579 | p194 580 | I311 581 | sS'fb:media_common.media_genre' 582 | p195 583 | I327 584 | sS'fb:basketball.basketball_player' 585 | p196 586 | I110 587 | sS'fb:medicine.drug' 588 | p197 589 | I156 590 | sS'fb:medicine.icd_9_cm_classification' 591 | p198 592 | I336 593 | sS'fb:base.ottawa.topic' 594 | p199 595 | I392 596 | sS'fb:book.translated_work' 597 | p200 598 | I304 599 | sS'fb:base.engineering.engineering_person' 600 | p201 601 | I434 602 | sS'fb:tv.tv_writer' 603 | p202 604 | I124 605 | sS'fb:music.engineer' 606 | p203 607 | I273 608 | sS'fb:geography.island' 609 | p204 610 | I246 611 | sS'fb:architecture.house' 612 | p205 613 | I447 614 | sS'fb:basketball.basketball_coach' 615 | p206 616 | I346 617 | sS'fb:medicine.manufactured_drug_form' 618 | p207 619 | I113 620 | sS'fb:people.person' 621 | p208 622 | I7 623 | sS'fb:protected_sites.listed_site' 624 | p209 625 | I90 626 | sS'fb:base.americancivilwar.military_unit' 627 | p210 628 | I383 629 | sS'fb:music.composition' 630 | p211 631 | I48 632 | sS'fb:award.award_nominee' 633 | p212 634 | I27 635 | sS'fb:base.morelaw.canadian_lawyer' 636 | p213 637 | I463 638 | sS'fb:base.schemastaging.theater_production_extra' 639 | p214 640 | I425 641 | sS'fb:business.brand' 642 | p215 643 | I269 644 | sS'fb:base.ttiff.topic' 645 | p216 646 | I364 647 | sS'fb:base.prison.prisoner' 648 | p217 649 | I472 650 | sS'fb:base.karlovyvaryinternationalfilmfestival.topic' 651 | p218 652 | I194 653 | sS'fb:base.ukparliament.topic' 654 | p219 655 | I453 656 | sS'fb:base.argumentmaps.topic' 657 | p220 658 | I300 659 | sS'fb:award.competitor' 660 | p221 661 | I279 662 | sS'fb:education.university' 663 | p222 664 | I86 665 | sS'fb:visual_art.artwork' 666 | p223 667 | I119 668 | sS'fb:base.yalebase.person' 669 | p224 670 | I284 671 | sS'fb:base.usnris.nris_listing' 672 | p225 673 | I101 674 | sS'fb:base.marchmadness.topic' 675 | p226 676 | I451 677 | sS'fb:aviation.airport' 678 | p227 679 | I160 680 | sS'fb:architecture.architectural_structure_owner' 681 | p228 682 | I442 683 | sS'fb:base.academyawards.topic' 684 | p229 685 | I338 686 | sS'fb:biology.organism' 687 | p230 688 | I195 689 | sS'fb:symbols.name_source' 690 | p231 691 | I157 692 | sS'fb:sports.sports_facility' 693 | p232 694 | I277 695 | sS'fb:transportation.bridge' 696 | p233 697 | I257 698 | sS'fb:base.rosetta.languoid' 699 | p234 700 | I149 701 | sS'fb:base.moscratch.shce021709' 702 | p235 703 | I495 704 | sS'fb:base.x2010fifaworldcupsouthafrica.topic' 705 | p236 706 | I414 707 | sS'fb:sports.sports_team_location' 708 | p237 709 | I295 710 | sS'fb:film.film_costumer_designer' 711 | p238 712 | I177 713 | sS'fb:tv.tv_program_guest' 714 | p239 715 | I198 716 | sS'fb:organization.organization_sector' 717 | p240 718 | I341 719 | sS'fb:film.film_casting_director' 720 | p241 721 | I275 722 | sS'fb:sports.golfer' 723 | p242 724 | I233 725 | sS'fb:base.skosbase.skos_concept' 726 | p243 727 | I276 728 | sS'fb:aviation.aircraft_model' 729 | p244 730 | I373 731 | sS'fb:base.setrakian.topic' 732 | p245 733 | I412 734 | sS'fb:geography.geographical_feature' 735 | p246 736 | I70 737 | sS'fb:law.invention' 738 | p247 739 | I393 740 | sS'fb:user.benvvalk.default_domain.moby_output_descriptor' 741 | p248 742 | I418 743 | sS'fb:tv.tv_actor' 744 | p249 745 | I38 746 | sS'fb:medicine.notable_person_with_medical_condition' 747 | p250 748 | I287 749 | 
sS'fb:freebase.user_profile' 750 | p251 751 | I370 752 | sS'fb:music.musical_group' 753 | p252 754 | I52 755 | sS'fb:film.person_or_entity_appearing_in_film' 756 | p253 757 | I50 758 | sS'fb:comic_books.comic_book_character' 759 | p254 760 | I181 761 | sS'fb:medicine.hospital' 762 | p255 763 | I475 764 | sS'fb:base.nobelprizes.nobel_prize_winner' 765 | p256 766 | I432 767 | sS'fb:computer.software' 768 | p257 769 | I186 770 | sS'fb:base.popstra.sww_base' 771 | p258 772 | I142 773 | sS'fb:fictional_universe.fictional_setting' 774 | p259 775 | I387 776 | sS'fb:base.kwebbase.kwtopic' 777 | p260 778 | I285 779 | sS'fb:base.culturalevent.event' 780 | p261 781 | I116 782 | sS'fb:base.type_ontology.agent' 783 | p262 784 | I2 785 | sS'fb:fictional_universe.work_of_fiction' 786 | p263 787 | I88 788 | sS'fb:book.published_work' 789 | p264 790 | I192 791 | sS'fb:military.military_person' 792 | p265 793 | I67 794 | sS'fb:freebase.equivalent_topic' 795 | p266 796 | I422 797 | sS'fb:media_common.quotation_subject' 798 | p267 799 | I375 800 | sS'fb:book.newspaper' 801 | p268 802 | I176 803 | sS'fb:base.wikipedia_infobox.video_game' 804 | p269 805 | I95 806 | sS'fb:soccer.football_team_manager' 807 | p270 808 | I274 809 | sS'fb:interests.collectable_item' 810 | p271 811 | I482 812 | sS'fb:base.thoroughbredracing.thoroughbred_racehorse' 813 | p272 814 | I232 815 | sS'fb:book.short_story' 816 | p273 817 | I312 818 | sS'fb:tv.tv_series_episode' 819 | p274 820 | I59 821 | sS'fb:geography.river' 822 | p275 823 | I129 824 | sS'fb:education.academic' 825 | p276 826 | I105 827 | sS'fb:tv.tv_program_creator' 828 | p277 829 | I144 830 | sS'fb:base.jewlib.topic' 831 | p278 832 | I323 833 | sS'fb:music.single' 834 | p279 835 | I26 836 | sS'fb:film.writer' 837 | p280 838 | I41 839 | sS'fb:user.sandos.common_sense.pet' 840 | p281 841 | I439 842 | sS'fb:education.school' 843 | p282 844 | I111 845 | sS'fb:base.crime.convicted_criminal' 846 | p283 847 | I294 848 | sS'fb:location.capital_of_administrative_division' 849 | p284 850 | I151 851 | sS'fb:music.featured_artist' 852 | p285 853 | I182 854 | sS'fb:location.statistical_region' 855 | p286 856 | I21 857 | sS'fb:media_common.cataloged_instance' 858 | p287 859 | I39 860 | sS'fb:exhibitions.exhibition_subject' 861 | p288 862 | I424 863 | sS'fb:broadcast.content' 864 | p289 865 | I234 866 | sS'fb:base.summermovies2009.topic' 867 | p290 868 | I411 869 | sS'fb:theater.theater_character' 870 | p291 871 | I359 872 | sS'fb:base.fight.sports_official' 873 | p292 874 | I344 875 | sS'fb:music.album' 876 | p293 877 | I10 878 | sS'fb:base.wfilmbase.film' 879 | p294 880 | I263 881 | sS'fb:government.governmental_jurisdiction' 882 | p295 883 | I219 884 | sS'fb:base.zxspectrum.topic' 885 | p296 886 | I325 887 | sS'fb:military.military_conflict' 888 | p297 889 | I120 890 | sS'fb:internet.website' 891 | p298 892 | I322 893 | sS'fb:film.production_company' 894 | p299 895 | I498 896 | sS'fb:award.competition' 897 | p300 898 | I326 899 | sS'fb:sports.school_sports_team' 900 | p301 901 | I460 902 | sS'fb:base.schemastaging.person_extra' 903 | p302 904 | I206 905 | sS'fb:base.sfiff.topic' 906 | p303 907 | I164 908 | sS'fb:base.schemastaging.drug_extra' 909 | p304 910 | I354 911 | sS'fb:location.hud_county_place' 912 | p305 913 | I66 914 | sS'fb:business.business_operation' 915 | p306 916 | I62 917 | sS'fb:language.human_language' 918 | p307 919 | I143 920 | sS'fb:business.issuer' 921 | p308 922 | I174 923 | sS'fb:government.political_party' 924 | p309 925 | I247 926 | 
sS'fb:architecture.structure' 927 | p310 928 | I53 929 | sS'fb:organization.organization_member' 930 | p311 931 | I196 932 | sS'fb:type.user' 933 | p312 934 | I371 935 | sS'fb:business.issue' 936 | p313 937 | I208 938 | sS'fb:base.kwebbase.kwconnection' 939 | p314 940 | I281 941 | sS'fb:film.film_character' 942 | p315 943 | I63 944 | sS'fb:type.type' 945 | p316 946 | I205 947 | sS'fb:soccer.football_player' 948 | p317 949 | I35 950 | sS'fb:architecture.architect' 951 | p318 952 | I154 953 | sS'fb:transportation.road' 954 | p319 955 | I222 956 | sS'fb:medicine.physician' 957 | p320 958 | I243 959 | sS'fb:user.sandos.common_sense.common_sense_organism' 960 | p321 961 | I438 962 | sS'fb:film.film_subject' 963 | p322 964 | I191 965 | sS'fb:sports.sports_championship_event' 966 | p323 967 | I406 968 | sS'fb:type.namespace' 969 | p324 970 | I297 971 | sS'fb:base.nobelprizes.topic' 972 | p325 973 | I342 974 | sS'fb:music.release_component' 975 | p326 976 | I127 977 | sS'fb:projects.project_focus' 978 | p327 979 | I61 980 | sS'fb:base.crime.topic' 981 | p328 982 | I139 983 | sS'fb:film.music_contributor' 984 | p329 985 | I99 986 | sS'fb:business.employer' 987 | p330 988 | I36 989 | sS'fb:music.guitarist' 990 | p331 991 | I173 992 | sS'fb:base.fblinux.topic' 993 | p332 994 | I496 995 | sS'fb:base.schemastaging.tv_actor_extra' 996 | p333 997 | I343 998 | sS'fb:tennis.tennis_player' 999 | p334 1000 | I199 1001 | sS'fb:business.product_category' 1002 | p335 1003 | I345 1004 | sS'fb:base.skosbase.vocabulary_equivalent_topic' 1005 | p336 1006 | I152 1007 | sS'fb:theater.theater_producer' 1008 | p337 1009 | I362 1010 | sS'fb:base.pornactresses.topic' 1011 | p338 1012 | I468 1013 | sS'fb:government.u_s_congressperson' 1014 | p339 1015 | I122 1016 | sS'fb:cvg.game_version' 1017 | p340 1018 | I89 1019 | sS'fb:location.dated_location' 1020 | p341 1021 | I19 1022 | sS'fb:film.actor' 1023 | p342 1024 | I11 1025 | sS'fb:fictional_universe.fictional_character' 1026 | p343 1027 | I46 1028 | sS'fb:base.americancivilwar.topic' 1029 | p344 1030 | I223 1031 | sS'fb:martial_arts.martial_artist' 1032 | p345 1033 | I159 1034 | sS'fb:wine.wine' 1035 | p346 1036 | I252 1037 | sS'fb:user.zsi_editorial.editorial.base_topic' 1038 | p347 1039 | I278 1040 | sS'fb:base.popstra.topic' 1041 | p348 1042 | I141 1043 | sS'fb:base.schemastaging.non_profit_extra' 1044 | p349 1045 | I250 1046 | sS'fb:book.book_edition' 1047 | p350 1048 | I51 1049 | sS'fb:religion.religious_leader' 1050 | p351 1051 | I356 1052 | sS'fb:base.consumermedical.disease' 1053 | p352 1054 | I221 1055 | sS'fb:symbols.namesake' 1056 | p353 1057 | I132 1058 | sS'fb:user.skud.names.topic' 1059 | p354 1060 | I324 1061 | sS'fb:user.micahsaul.advertising.ad_campaign' 1062 | p355 1063 | I470 1064 | sS'fb:location.australian_suburb' 1065 | p356 1066 | I464 1067 | sS'fb:people.ethnicity' 1068 | p357 1069 | I292 1070 | sS'fb:base.filmnoir.topic' 1071 | p358 1072 | I349 1073 | sS'fb:tv.tv_producer' 1074 | p359 1075 | I112 1076 | sS'fb:base.schemastaging.aircraft_model_extra' 1077 | p360 1078 | I436 1079 | sS'fb:base.fight.topic' 1080 | p361 1081 | I310 1082 | sS'fb:base.washingtondc.topic' 1083 | p362 1084 | I368 1085 | sS'fb:base.x2010fifaworldcupsouthafrica.world_cup_participant' 1086 | p363 1087 | I426 1088 | sS'fb:award.award_category' 1089 | p364 1090 | I331 1091 | sS'fb:base.vancouver.topic' 1092 | p365 1093 | I320 1094 | sS'fb:base.thoroughbredracing.topic' 1095 | p366 1096 | I203 1097 | sS'fb:biology.gene' 1098 | p367 1099 | I369 1100 | 
sS'fb:base.aareas.schema.england.civil_parish' 1101 | p368 1102 | I267 1103 | sS'fb:base.schemastaging.government_position_held_extra' 1104 | p369 1105 | I435 1106 | sS'fb:award.ranked_item' 1107 | p370 1108 | I85 1109 | sS'fb:base.type_ontology.physically_instantiable' 1110 | p371 1111 | I3 1112 | sS'fb:film.producer' 1113 | p372 1114 | I43 1115 | sS'fb:common.topic' 1116 | p373 1117 | I0 1118 | sS'fb:geography.mountain' 1119 | p374 1120 | I135 1121 | sS'fb:theater.theater_production' 1122 | p375 1123 | I332 1124 | sS'fb:people.cause_of_death' 1125 | p376 1126 | I443 1127 | sS'fb:base.ireland.topic' 1128 | p377 1129 | I478 1130 | sS'fb:base.horseracing.racehorse' 1131 | p378 1132 | I238 1133 | sS'fb:protected_sites.protected_site' 1134 | p379 1135 | I231 1136 | sS'fb:automotive.model' 1137 | p380 1138 | I227 1139 | sS'fb:music.recording' 1140 | p381 1141 | I13 1142 | sS'fb:film.director' 1143 | p382 1144 | I40 1145 | sS'fb:organization.organization_founder' 1146 | p383 1147 | I91 1148 | sS'fb:soccer.football_team' 1149 | p384 1150 | I335 1151 | sS'fb:base.australianpolitics.topic' 1152 | p385 1153 | I467 1154 | sS'fb:astronomy.star_system_body' 1155 | p386 1156 | I69 1157 | sS'fb:architecture.building' 1158 | p387 1159 | I73 1160 | sS'fb:astronomy.astronomical_discovery' 1161 | p388 1162 | I64 1163 | sS'fb:base.saints.topic' 1164 | p389 1165 | I429 1166 | sS'fb:opera.opera' 1167 | p390 1168 | I420 1169 | sS'fb:medicine.drug_formulation' 1170 | p391 1171 | I81 1172 | sS'fb:base.fashionmodels.fashion_model' 1173 | p392 1174 | I423 1175 | sS'fb:base.adultentertainment.topic' 1176 | p393 1177 | I266 1178 | sS'fb:travel.travel_destination' 1179 | p394 1180 | I172 1181 | sS'fb:architecture.skyscraper' 1182 | p395 1183 | I309 1184 | sS'fb:base.yalebase.topic' 1185 | p396 1186 | I264 1187 | sS'fb:book.author' 1188 | p397 1189 | I25 1190 | sS'fb:computer.computer_scientist' 1191 | p398 1192 | I499 1193 | sS'fb:biology.owned_animal' 1194 | p399 1195 | I224 1196 | sS'fb:base.americancomedy.comedian' 1197 | p400 1198 | I316 1199 | sS'fb:base.myspace.myspace_user' 1200 | p401 1201 | I106 1202 | sS'fb:film.film_story_contributor' 1203 | p402 1204 | I79 1205 | sS'fb:sports.sports_team_coach' 1206 | p403 1207 | I404 1208 | sS'fb:astronomy.asteroid' 1209 | p404 1210 | I65 1211 | sS'fb:base.argumentmaps.original_idea' 1212 | p405 1213 | I405 1214 | sS'fb:music.writer' 1215 | p406 1216 | I47 1217 | sS'fb:base.plopquiz.topic' 1218 | p407 1219 | I379 1220 | sS'fb:medicine.drug_ingredient' 1221 | p408 1222 | I229 1223 | sS'fb:media_common.adaptation' 1224 | p409 1225 | I102 1226 | sS'fb:astronomy.orbital_relationship' 1227 | p410 1228 | I60 1229 | sS'fb:user.doconnor.pets.pet' 1230 | p411 1231 | I446 1232 | sS'fb:baseball.baseball_player' 1233 | p412 1234 | I71 1235 | sS'fb:base.atheism.atheist' 1236 | p413 1237 | I394 1238 | sS'fb:book.periodical_publisher' 1239 | p414 1240 | I366 1241 | sS'fb:base.berlininternationalfilmfestival.topic' 1242 | p415 1243 | I134 1244 | sS'fb:base.ovguide.topic' 1245 | p416 1246 | I184 1247 | sS'fb:government.government_office_or_title' 1248 | p417 1249 | I483 1250 | sS'fb:base.schemastaging.athlete_extra' 1251 | p418 1252 | I94 1253 | sS'fb:medicine.medical_treatment' 1254 | p419 1255 | I215 1256 | sS'fb:base.objectionablecontent.flagged_content' 1257 | p420 1258 | I348 1259 | sS'fb:tv.tv_director' 1260 | p421 1261 | I170 1262 | sS'fb:base.schemastaging.food_extra' 1263 | p422 1264 | I458 1265 | sS'fb:base.schemastaging.topic' 1266 | p423 1267 | I131 1268 | sS'fb:tv.tv_network' 
1269 | p424 1270 | I462 1271 | sS'fb:business.consumer_company' 1272 | p425 1273 | I397 1274 | sS'fb:base.argentina.topic' 1275 | p426 1276 | I430 1277 | sS'fb:base.gayporn.gay_porn' 1278 | p427 1279 | I381 1280 | sS'fb:people.place_of_interment' 1281 | p428 1282 | I382 1283 | sS'fb:metropolitan_transit.transit_line' 1284 | p429 1285 | I358 1286 | sS'fb:travel.tourist_attraction' 1287 | p430 1288 | I114 1289 | sS'fb:base.schemastaging.contact_product' 1290 | p431 1291 | I461 1292 | sS'fb:base.performer.topic' 1293 | p432 1294 | I433 1295 | sS'fb:user.zsi_editorial.editorial.topic' 1296 | p433 1297 | I245 1298 | sS'fb:theater.play' 1299 | p434 1300 | I248 1301 | sS'fb:organization.non_profit_organization' 1302 | p435 1303 | I175 1304 | sS'fb:award.award_winner' 1305 | p436 1306 | I30 1307 | sS'fb:internet.social_network_user' 1308 | p437 1309 | I56 1310 | sS'fb:celebrities.celebrity' 1311 | p438 1312 | I226 1313 | sS'fb:user.tsegaran.random.topic' 1314 | p439 1315 | I476 1316 | sS'fb:time.event' 1317 | p440 1318 | I83 1319 | sS'fb:base.saints.saint' 1320 | p441 1321 | I328 1322 | sS'fb:astronomy.astronomer' 1323 | p442 1324 | I410 1325 | sS'fb:location.census_designated_place' 1326 | p443 1327 | I158 1328 | sS'fb:location.citytown' 1329 | p444 1330 | I24 1331 | sS'fb:base.schemastaging.organization_extra' 1332 | p445 1333 | I155 1334 | sS'fb:base.americancivilwar.regiment' 1335 | p446 1336 | I363 1337 | sS'fb:base.engineering.topic' 1338 | p447 1339 | I296 1340 | sS'fb:base.activism.activist' 1341 | p448 1342 | I210 1343 | sS'fb:music.group_member' 1344 | p449 1345 | I33 1346 | sS'fb:food.ingredient' 1347 | p450 1348 | I289 1349 | sS'fb:location.location' 1350 | p451 1351 | I9 1352 | sS'fb:government.political_district' 1353 | p452 1354 | I337 1355 | sS'fb:location.neighborhood' 1356 | p453 1357 | I166 1358 | sS'fb:base.rosetta.local_name' 1359 | p454 1360 | I450 1361 | sS'fb:base.frameline.topic' 1362 | p455 1363 | I321 1364 | sS'fb:film.film_crewmember' 1365 | p456 1366 | I121 1367 | sS'fb:location.administrative_division' 1368 | p457 1369 | I74 1370 | sS'fb:user.jg.default_domain.racehorse' 1371 | p458 1372 | I299 1373 | sS'fb:metropolitan_transit.transit_stop' 1374 | p459 1375 | I318 1376 | sS'fb:location.postal_code' 1377 | p460 1378 | I138 1379 | sS'fb:base.folklore.topic' 1380 | p461 1381 | I399 1382 | sS'fb:user.doconnor.pets.horse' 1383 | p462 1384 | I444 1385 | sS'fb:base.cinemainspector.person_sign' 1386 | p463 1387 | I428 1388 | sS'fb:common.resource' 1389 | p464 1390 | I214 1391 | sS'fb:base.type_ontology.inanimate' 1392 | p465 1393 | I5 1394 | sS'fb:theater.theatrical_composer' 1395 | p466 1396 | I466 1397 | sS'fb:base.rugby.rugby_player' 1398 | p467 1399 | I200 1400 | sS'fb:royalty.noble_person' 1401 | p468 1402 | I123 1403 | sS'fb:government.politician' 1404 | p469 1405 | I37 1406 | sS'fb:common.image' 1407 | p470 1408 | I18 1409 | sS'fb:broadcast.radio_station' 1410 | p471 1411 | I169 1412 | sS'fb:base.services.topic' 1413 | p472 1414 | I365 1415 | sS'fb:business.board_member' 1416 | p473 1417 | I128 1418 | sS'fb:base.tagit.organic_thing' 1419 | p474 1420 | I207 1421 | sS'fb:ice_hockey.hockey_player' 1422 | p475 1423 | I153 1424 | sS'fb:base.tagit.topic' 1425 | p476 1426 | I386 1427 | sS'fb:aviation.airline' 1428 | p477 1429 | I376 1430 | sS'fb:base.type_ontology.non_agent' 1431 | p478 1432 | I1 1433 | sS'fb:organization.endowed_organization' 1434 | p479 1435 | I249 1436 | sS'fb:media_common.netflix_title' 1437 | p480 1438 | I34 1439 | sS'fb:type.property' 1440 | p481 1441 
| I298 1442 | sS'fb:user.sandos.common_sense.topic' 1443 | p482 1444 | I437 1445 | sS'fb:base.disneyana.topic' 1446 | p483 1447 | I479 1448 | sS'fb:education.school_district' 1449 | p484 1450 | I449 1451 | sS'fb:base.schemastaging.sports_team_extra' 1452 | p485 1453 | I240 1454 | sS'fb:location.uk_civil_parish' 1455 | p486 1456 | I209 1457 | sS'fb:base.yemebase.topic' 1458 | p487 1459 | I409 1460 | sS'fb:user.narphorium.people.topic' 1461 | p488 1462 | I72 1463 | sS'fb:organization.organization_scope' 1464 | p489 1465 | I280 1466 | sS'fb:cvg.computer_videogame' 1467 | p490 1468 | I84 1469 | sS'fb:travel.hotel' 1470 | p491 1471 | I421 1472 | sS'fb:base.catalog.cataloged_composition' 1473 | p492 1474 | I347 1475 | sS'fb:base.usnris.topic' 1476 | p493 1477 | I100 1478 | sS'fb:common.webpage' 1479 | p494 1480 | I201 1481 | sS'fb:tv.tv_series_season' 1482 | p495 1483 | I183 1484 | sS'fb:film.film_art_director' 1485 | p496 1486 | I147 1487 | sS'fb:tv.tv_program' 1488 | p497 1489 | I68 1490 | sS'fb:chemistry.chemical_compound' 1491 | p498 1492 | I230 1493 | sS'fb:people.profession' 1494 | p499 1495 | I283 1496 | sS'fb:base.disaster2.topic' 1497 | p500 1498 | I340 1499 | sS'fb:base.fashionmodels.topic' 1500 | p501 1501 | I303 1502 | s. --------------------------------------------------------------------------------
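/KnowledgeBase/type.top-500.pkl above is a protocol-0 (plain-text) pickle of a single dict mapping what appear to be the 500 most frequent Freebase type URIs (e.g. 'fb:people.person') to integer indices in [0, 499]; these indices seem to line up with the 501-dimensional multi-label type targets built elsewhere in the repo (500 types plus one extra slot). Below is a minimal inspection sketch, not part of the repository; the script assumes the repository root as the working directory, and the printed values are taken from the dump above.

    import pickle

    # type.top-500.pkl is a protocol-0 pickle written by Python 2; it loads as a
    # plain {type_uri: index} dict in both Python 2 and Python 3 (ASCII keys only).
    with open('KnowledgeBase/type.top-500.pkl', 'rb') as f:
        type2idx = pickle.load(f)

    print(len(type2idx))                  # 500 distinct Freebase types
    print(type2idx['fb:people.person'])   # 7, per the dump above
    print(type2idx['fb:common.topic'])    # 0, per the dump above

    # Invert the mapping when an index needs to be turned back into a type URI.
    idx2type = {idx: t for t, idx in type2idx.items()}
    print(idx2type[0])                    # 'fb:common.topic'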
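For reference, the backward pass at the end of src/model/BiGRU.lua above (the gradPreGateAct / gradResetGate / gradUpdateGate updates shown before /KnowledgeBase/type.top-500.pkl) differentiates the standard GRU recurrence. The equations below are a reconstruction from those gradient terms, with symbol names chosen for exposition rather than taken from the code; the truncated comment fragment "d_title{h}[t]" at the start of the excerpt reads as "d_tilde{h}[t]", i.e. the candidate activation.

    z_t         = \sigma(W_z x_t + U_z h_{t-1} + b_z)
    r_t         = \sigma(W_r x_t + U_r h_{t-1} + b_r)
    \tilde{h}_t = \tanh(W_h x_t + U_h (r_t \odot h_{t-1}) + b_h)
    h_t         = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t

Under this formulation, \partial h_t / \partial z_t = h_{t-1} - \tilde{h}_t (the two gradUpdateGate updates), \partial h_t / \partial \tilde{h}_t = 1 - z_t (the currComple factor), and \partial h_t / \partial h_{t-1} accumulates the three terms added to prevGradOutput: the gate pre-activation path through recWeight_G, the direct z_t \odot h_{t-1} path, and the r_t \odot h_{t-1} path inside the tanh. The equations describe one direction; the two directions' gate pre-activations are packed side by side, which appears to be why the dense gate buffer is hiddenSize*6 wide.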