├── src ├── nn │ ├── repeat.py │ ├── __init__.py │ ├── alltests.py │ ├── sum2.py │ ├── lstm_profile.py │ ├── const_value.py │ ├── lstm_test_target.csv │ ├── normalize.py │ ├── elem_prod.py │ ├── active.py │ ├── sum.py │ ├── model.py │ ├── valid_tool.py │ ├── loader.py │ ├── meanpool1d.py │ ├── selector.py │ ├── dropout.py │ ├── tester.py │ ├── const_weights.py │ ├── maxpool1d.py │ ├── inner_prod.py │ ├── sequential.py │ ├── active_func.py │ ├── cos_sim.py │ ├── ordinal.py │ ├── lut.py │ ├── sum_prod.py │ ├── sequential_tests.py │ ├── conv1d.py │ ├── map.py │ ├── lstm_test.py │ ├── func.py │ ├── container.py │ ├── reshape.py │ ├── lstm.py │ ├── stage.py │ ├── recurrent_tests.py │ ├── lstm_old.py │ └── trainer.py ├── imageqa_crosstest.py ├── imageqa_layout.py ├── calculate_wups.py ├── imageqa_modelavg.py ├── imageqa_adhoc.py ├── imageqa_compare.py ├── imageqa_test.py ├── imageqa_ensemble.py └── train.py ├── results └── README.md ├── data └── README.md ├── config └── train.yml ├── .gitignore ├── LICENSE ├── models ├── img_bow.model.yml ├── vis_lstm.model.yml └── 2_vis_blstm.model.yml └── README.md /src/nn/repeat.py: -------------------------------------------------------------------------------- 1 | class Repeat(Stage): 2 | pass -------------------------------------------------------------------------------- /results/README.md: -------------------------------------------------------------------------------- 1 | # Image QA results folder 2 | 3 | This folder is for storing training results. 
4 | 5 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Image QA data folder 2 | 3 | Please download the data of this directory 4 | * http://www.cs.toronto.edu/~mren/imageqa/data/hidden_oxford_mscoco.h5 5 | * http://www.cs.toronto.edu/~mren/imageqa/data/cocoqa.zip 6 | 7 | -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # Neural network package 2 | # Package level functions 3 | from loader import * 4 | from func import * 5 | from active_func import * 6 | from tester import * 7 | 8 | # Classes 9 | from trainer import * 10 | 11 | -------------------------------------------------------------------------------- /config/train.yml: -------------------------------------------------------------------------------- 1 | numEpoch: 2000 2 | batchSize: 100 3 | shuffle: true 4 | writeRecord: true 5 | saveModel: true 6 | plotFigs: true 7 | calcError: true 8 | stopCost: 0.01 9 | progress: true 10 | patience: 10 11 | criterion: loss 12 | needValid: true 13 | sendEmail: false 14 | -------------------------------------------------------------------------------- /src/nn/alltests.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import unittest 3 | 4 | test_file_strings = glob.glob('*_test*.py') 5 | module_strings = [str[0:len(str)-3] for str in test_file_strings] 6 | suites = [unittest.defaultTestLoader.loadTestsFromName(str) for str 7 | in module_strings] 8 | testSuite = unittest.TestSuite(suites) 9 | text_runner = unittest.TextTestRunner().run(testSuite) -------------------------------------------------------------------------------- /src/nn/sum2.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 
| class Sum2(Stage): 4 | """Stage summing first half of the input with second half.""" 5 | def __init__(self, name, inputNames, outputDim, 6 | defaultValue=0.0): 7 | Stage.__init__( 8 | self, 9 | name=name, 10 | inputNames=inputNames, 11 | outputDim=outputDim, 12 | defaultValue=defaultValue) 13 | def forward(self, X): 14 | self.numComponents = X.shape[1] 15 | return np.sum(X, axis=1) 16 | def backward(self, dEdY): 17 | self.dEdW = 0.0 18 | return np.tile(dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1]), (1, self.numComponents, 1)) 19 | -------------------------------------------------------------------------------- /src/nn/lstm_profile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import lstm_old as l 5 | 6 | start = time.time() 7 | timespan = 100 8 | multiErr = len(sys.argv) > 1 and sys.argv[1] == 'm' 9 | for i in range(0, 10): 10 | lstm = l.LSTM_Old( 11 | inputDim=100, 12 | outputDim=100, 13 | initRange=.1, 14 | initSeed=3, 15 | cutOffZeroEnd=True, 16 | multiErr=multiErr, 17 | outputdEdX=True) 18 | X = np.random.rand(10, timespan, 100) 19 | Y = lstm.forward(X) 20 | dEdY = np.random.rand(10, timespan, 100) if multiErr else np.random.rand(10, 100) 21 | dEdY = lstm.backward(dEdY) 22 | print '%.4f s' % (time.time() - start) -------------------------------------------------------------------------------- /src/nn/const_value.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class ConstValue(Stage): 4 | def __init__(self, 5 | name, 6 | inputNames, 7 | outputDim, 8 | value): 9 | Stage.__init__(self, 10 | name=name, 11 | outputDim=outputDim, 12 | inputNames=inputNames, 13 | outputdEdX=False) 14 | self.dEdW = 0 15 | self.value = value 16 | 17 | def graphBackward(self): 18 | self.backward(self.dEdY) 19 | 20 | def forward(self, X): 21 | return np.zeros((X.shape[0], self.outputDim)) + self.value 22 | 23 | def 
backward(self, dEdY): 24 | return None -------------------------------------------------------------------------------- /src/nn/lstm_test_target.csv: -------------------------------------------------------------------------------- 1 | 1.000000000000000000e+00 2 | 1.000000000000000000e+00 3 | 1.000000000000000000e+00 4 | 1.000000000000000000e+00 5 | 1.000000000000000000e+00 6 | 1.000000000000000000e+00 7 | 1.000000000000000000e+00 8 | 1.000000000000000000e+00 9 | 1.000000000000000000e+00 10 | 1.000000000000000000e+00 11 | 1.000000000000000000e+00 12 | 1.000000000000000000e+00 13 | 1.000000000000000000e+00 14 | 1.000000000000000000e+00 15 | 1.000000000000000000e+00 16 | 1.000000000000000000e+00 17 | 1.000000000000000000e+00 18 | 1.000000000000000000e+00 19 | 1.000000000000000000e+00 20 | 1.000000000000000000e+00 21 | 1.000000000000000000e+00 22 | 1.000000000000000000e+00 23 | 1.000000000000000000e+00 24 | 1.000000000000000000e+00 25 | 1.000000000000000000e+00 26 | -------------------------------------------------------------------------------- /src/nn/normalize.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class Normalize(Stage): 4 | def __init__(self, 5 | outputDim, 6 | mean, 7 | std, 8 | name=None, 9 | inputNames=None, 10 | outputdEdX=True): 11 | Stage.__init__(self, 12 | name=name, 13 | inputNames=inputNames, 14 | outputDim=outputDim, 15 | outputdEdX=outputdEdX) 16 | self.mean = mean 17 | self.std = std 18 | self.X = 0 19 | self.Y = 0 20 | pass 21 | 22 | def forward(self, X): 23 | return (X - self.mean) / self.std 24 | 25 | def backward(self, dEdY): 26 | return dEdY / self.std -------------------------------------------------------------------------------- /src/nn/elem_prod.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class ElementProduct(Stage): 4 | """Stage multiplying first half of the input with second half""" 5 | def 
__init__(self, name, inputNames, outputDim, 6 | defaultValue=0.0): 7 | Stage.__init__( 8 | self, 9 | name=name, 10 | inputNames=inputNames, 11 | outputDim=outputDim, 12 | defaultValue=defaultValue) 13 | def forward(self, X): 14 | self.X = X 15 | return X[:,:X.shape[1]/2] * X[:,X.shape[1]/2:] 16 | def backward(self, dEdY): 17 | self.dEdW = 0.0 18 | return np.concatenate( 19 | (self.X[:,self.X.shape[1]/2:] * dEdY, 20 | self.X[:,:self.X.shape[1]/2] * dEdY), 21 | axis=-1) -------------------------------------------------------------------------------- /src/nn/active.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class Active(Stage): 4 | def __init__(self, 5 | activeFn, 6 | inputNames, 7 | outputDim, 8 | defaultValue=0.0, 9 | outputdEdX=True, 10 | name=None): 11 | Stage.__init__(self, 12 | name=name, 13 | inputNames=inputNames, 14 | outputDim=outputDim, 15 | defaultValue=defaultValue, 16 | outputdEdX=outputdEdX) 17 | self.activeFn = activeFn 18 | def forward(self, X): 19 | self.Y = self.activeFn.forward(X) 20 | return self.Y 21 | def backward(self, dEdY): 22 | self.dEdW = 0 23 | return self.activeFn.backward(dEdY, self.Y, 0) -------------------------------------------------------------------------------- /src/nn/sum.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class Sum(Stage): 4 | """Stage summing first half of the input with second half.""" 5 | def __init__(self, name, inputNames, numComponents, outputDim, 6 | defaultValue=0.0): 7 | Stage.__init__( 8 | self, 9 | name=name, 10 | inputNames=inputNames, 11 | outputDim=outputDim, 12 | defaultValue=defaultValue) 13 | self.numComponents = numComponents 14 | def forward(self, X): 15 | return np.sum( 16 | X.reshape(X.shape[0], 17 | self.numComponents, 18 | X.shape[1] / self.numComponents), 19 | axis=1) 20 | def backward(self, dEdY): 21 | self.dEdW = 0.0 22 | return np.tile(dEdY, 
self.numComponents) 23 | -------------------------------------------------------------------------------- /src/nn/model.py: -------------------------------------------------------------------------------- 1 | from container import * 2 | 3 | class GraphModel(Container): 4 | def __init__(self, 5 | stages, 6 | outputStageNames, 7 | costFn, 8 | inputDim=0, 9 | outputDim=0, 10 | name=None, 11 | decisionFn=None, 12 | specFilename=None): 13 | Container.__init__(self, 14 | name=name, 15 | stages=stages, 16 | inputNames=['input'], 17 | outputStageNames=outputStageNames, 18 | inputDim=inputDim, 19 | outputDim=outputDim, 20 | outputdEdX=True) 21 | self.getCost = costFn 22 | self.predict = decisionFn 23 | self.specFilename = specFilename -------------------------------------------------------------------------------- /src/nn/valid_tool.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def splitData(trainInput, trainTarget, heldOutRatio, validNumber): 4 | s = np.round(trainInput.shape[0] * heldOutRatio) 5 | start = s * validNumber 6 | validInput = trainInput[start : start + s] 7 | validTarget = trainTarget[start : start + s] 8 | if validNumber == 0: 9 | trainInput = trainInput[s:] 10 | trainTarget = trainTarget[s:] 11 | else: 12 | trainInput = np.concatenate((trainInput[0:start], trainInput[start + s:])) 13 | trainTarget = np.concatenate((trainTarget[0:start], trainTarget[start + s:])) 14 | return trainInput, trainTarget, validInput, validTarget 15 | 16 | def shuffleData(X, T, random=None): 17 | if random is None: 18 | random = np.random.RandomState() 19 | shuffle = np.arange(0, X.shape[0]) 20 | shuffle = random.permutation(shuffle) 21 | X = X[shuffle] 22 | T = T[shuffle] 23 | return X, T -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | 
__pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /src/nn/loader.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import router 3 | from model import * 4 | 5 | def load(modelSpecFilename): 6 | """ 7 | Need the following items in the model spec file: 8 | costFn 9 | decisionFn 10 | stages 11 | specs 12 | :param modelSpecFilename: 13 | :return: 14 | """ 15 | with open(modelSpecFilename) as f: 16 | modelDict = yaml.load(f) 17 | 18 | for stageDict in modelDict['specs']: 19 | router.addStage(stageDict) 20 | 21 | modelStages = [] 22 | for s in modelDict['stages']: 23 | modelStages.append(router.routeStage(s)) 24 | costFn=router.routeFn(modelDict['costFn']) 25 | 26 | if modelDict.has_key('decisionFn'): 27 | decisionFn = router.routeFn(modelDict['decisionFn']) 28 | else: 29 | decisionFn = None 30 | outputList = modelDict['outputs'].split(',') 31 | for i in range(len(outputList)): 32 | outputList[i] = outputList[i].strip() 33 | model = GraphModel( 34 | name=modelDict['name'] if modelDict.has_key('name') else None, 35 | stages=modelStages, 36 | outputStageNames=outputList, 37 | costFn=costFn, 38 | decisionFn=decisionFn, 39 | specFilename=modelSpecFilename) 40 | 41 | return model -------------------------------------------------------------------------------- /src/nn/meanpool1d.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class MeanPool1D(Stage): 4 | """ 5 | 1D mean pooling. 6 | Padding no longer make sense now. 7 | Make sure you have the right size. 
8 | """ 9 | def __init__(self, 10 | outputDim, 11 | windowSize, 12 | inputNames=None, 13 | defaultValue=0.0, 14 | outputdEdX=True, 15 | name=None): 16 | Stage.__init__(self, 17 | name=name, 18 | inputNames=inputNames, 19 | outputDim=outputDim, 20 | defaultValue=defaultValue, 21 | outputdEdX=outputdEdX) 22 | self.windowSize = windowSize 23 | self.X = 0 24 | self.Y = 0 25 | 26 | def forward(self, X): 27 | X = X.reshape(X.shape[0], self.windowSize, X.shape[1] / self.windowSize, X.shape[2]) 28 | Y = np.mean(X, axis=1) 29 | self.X = X 30 | return Y 31 | 32 | def backward(self, dEdY): 33 | dEdX = np.tile( 34 | dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1], dEdY.shape[2]), 35 | (1, self.windowSize, 1, 1)) 36 | dEdX /= float(self.windowSize) 37 | dEdX = dEdX.reshape(dEdX.shape[0], dEdX.shape[1] * dEdX.shape[2], dEdX.shape[3]) 38 | return dEdX -------------------------------------------------------------------------------- /src/nn/selector.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class Selector(Stage): 4 | def __init__(self, 5 | name, 6 | inputNames, 7 | start, 8 | end, 9 | axis=-1): 10 | Stage.__init__( 11 | self, 12 | name=name, 13 | inputNames=inputNames, 14 | outputDim=end-start) 15 | self.start = start 16 | self.end = end 17 | self.axis = axis 18 | if axis < -2 or axis > 2: 19 | raise Exception('Selector axis=%d not supported' % axis) 20 | 21 | def forward(self, X): 22 | self.X = X 23 | if self.axis == -1: 24 | self.axis = len(X.shape) - 1 25 | if self.axis == 0: 26 | return X[self.start:self.end] 27 | elif self.axis == 1: 28 | return X[:, self.start:self.end] 29 | elif self.axis == 2: 30 | return X[:, :, self.start:self.end] 31 | 32 | def backward(self, dEdY): 33 | dEdX = np.zeros(self.X.shape) 34 | if self.axis == 0: 35 | dEdX[self.start:self.end] = dEdY 36 | elif self.axis == 1: 37 | dEdX[:, self.start:self.end] = dEdY 38 | elif self.axis == 2: 39 | dEdX[:, :, self.start:self.end] = dEdY 40 | 
return dEdX -------------------------------------------------------------------------------- /src/nn/dropout.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class Dropout(Stage): 4 | def __init__(self, 5 | name, 6 | inputNames, 7 | outputDim, 8 | dropoutRate, 9 | initSeed, 10 | debug=False): 11 | Stage.__init__(self, 12 | name=name, 13 | inputNames=inputNames, 14 | outputDim=outputDim) 15 | self.dropout = True 16 | self.dropoutVec = 0 17 | self.dropoutRate = dropoutRate 18 | self.debug = debug 19 | self.random = np.random.RandomState(initSeed) 20 | self.seed = initSeed 21 | 22 | def forward(self, X): 23 | if self.dropoutRate > 0.0 and self.dropout: 24 | if self.debug: 25 | self.random = np.random.RandomState(self.seed) 26 | self.dropoutVec = (self.random.uniform(0, 1, (X.shape[-1])) > 27 | self.dropoutRate) 28 | Y = X * self.dropoutVec 29 | else: 30 | Y = X * (1 - self.dropoutRate) 31 | self.X = X 32 | return Y 33 | 34 | def backward(self, dEdY): 35 | dEdX = None 36 | if self.outputdEdX: 37 | if self.dropout: 38 | dEdX = dEdY * self.dropoutVec 39 | else: 40 | dEdX = dEdY / (1 - self.dropoutRate) 41 | return dEdX -------------------------------------------------------------------------------- /src/nn/tester.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def test(model, X, numExPerBat=100, layerNames=None): 4 | N = X.shape[0] 5 | batchStart = 0 6 | Y = None 7 | layers = {} 8 | if layerNames is not None: 9 | for layerName in layerNames: 10 | layers[layerName] = [] 11 | while batchStart < N: 12 | # Batch info 13 | batchEnd = min(N, batchStart + numExPerBat) 14 | Ytmp = model.forward(X[batchStart:batchEnd], dropout=False) 15 | if Y is None: 16 | Yshape = np.copy(Ytmp.shape) 17 | Yshape[0] = N 18 | Y = np.zeros(Yshape) 19 | if layerNames is not None: 20 | for layerName in layerNames: 21 | stage = model 22 | for stageName in 
layerName.split(':'): 23 | stage = stage.stageDict[stageName] 24 | layers[layerName].append(stage.getValue()) 25 | Y[batchStart:batchEnd] = Ytmp 26 | batchStart += numExPerBat 27 | if layerNames is not None: 28 | for layerName in layerNames: 29 | layers[layerName] = np.concatenate(layers[layerName], axis=0) 30 | return Y, layers 31 | else: 32 | return Y 33 | 34 | def calcRate(model, Y, T): 35 | Yfinal = model.predict(Y) 36 | correct = np.sum(Yfinal.reshape(Yfinal.size) == T.reshape(T.size)) 37 | total = Yfinal.size 38 | rate = correct / float(total) 39 | return rate, correct, total -------------------------------------------------------------------------------- /src/nn/const_weights.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class ConstWeights(Stage): 4 | def __init__(self, 5 | name, 6 | outputDim=0, 7 | inputDim=0, 8 | initRange=1.0, 9 | initSeed=2, 10 | needInit=True, 11 | initWeights=0, 12 | learningRate=0.0, 13 | learningRateAnnealConst=0.0, 14 | momentum=0.0, 15 | deltaMomentum=0.0, 16 | weightClip=0.0, 17 | gradientClip=0.0, 18 | weightRegConst=0.0): 19 | Stage.__init__(self, 20 | name=name, 21 | outputDim=outputDim, 22 | inputNames=['input'], 23 | learningRate=learningRate, 24 | learningRateAnnealConst=learningRateAnnealConst, 25 | momentum=momentum, 26 | deltaMomentum=deltaMomentum, 27 | weightClip=weightClip, 28 | gradientClip=gradientClip, 29 | weightRegConst=weightRegConst, 30 | outputdEdX=False) 31 | if needInit: 32 | self.random = np.random.RandomState(initSeed) 33 | self.W = self.random.uniform( 34 | -initRange/2.0, initRange/2.0, (outputDim, inputDim)) 35 | else: 36 | self.W = initWeights 37 | self.dEdW = 0 38 | 39 | def graphBackward(self): 40 | self.backward(self.dEdY) 41 | 42 | def forward(self, X): 43 | return self.W 44 | 45 | def backward(self, dEdY): 46 | self.dEdW = dEdY 47 | return None -------------------------------------------------------------------------------- 
/src/nn/maxpool1d.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class MaxPool1D(Stage): 4 | """ 5 | 1D max pooling. 6 | """ 7 | def __init__(self, 8 | outputDim, 9 | windowSize, 10 | inputNames=None, 11 | defaultValue=0.0, 12 | outputdEdX=True, 13 | name=None): 14 | Stage.__init__(self, 15 | name=name, 16 | inputNames=inputNames, 17 | outputDim=outputDim, 18 | defaultValue=defaultValue, 19 | outputdEdX=outputdEdX) 20 | self.windowSize = windowSize 21 | self.X = 0 22 | self.Y = 0 23 | 24 | def forward(self, X): 25 | mod = np.mod(X.shape[1], self.windowSize) 26 | if mod > 0: 27 | X = np.concatenate((X, np.zeros((X.shape[0], self.windowSize - mod, X.shape[2]))), axis=1) 28 | X = X.reshape(X.shape[0], self.windowSize, X.shape[1] / self.windowSize, X.shape[2]) 29 | self.argX = np.argmax(X, axis=1) 30 | Y = np.max(X, axis=1) 31 | self.X = X 32 | self.mod = mod 33 | return Y 34 | 35 | def backward(self, dEdY): 36 | """ 37 | Assuming the last dimension is the largest. 38 | """ 39 | self.dEdW = 0 40 | dEdX = np.zeros(self.X.shape) 41 | for i in range(self.X.shape[0]): 42 | for j in range(self.X.shape[2]): 43 | dEdX[i, self.argX[i, j, :], j, range(0, self.X.shape[3])] = dEdY[i, j, :] 44 | dEdX = dEdX.reshape(dEdX.shape[0], dEdX.shape[1] * dEdX.shape[2], dEdX.shape[3]) 45 | if self.mod > 0: 46 | dEdX = dEdX[:, :-(self.windowSize - self.mod), :] 47 | return dEdX -------------------------------------------------------------------------------- /src/nn/inner_prod.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class InnerProduct(Stage): 4 | """ 5 | Inner product calculates the inner product of two input vectors. 6 | Two vectors aligns on the second axis (time-axis). 
7 | """ 8 | def __init__(self, 9 | name, 10 | inputNames, 11 | outputDim, 12 | learningRate=0.0, 13 | learningRateAnnealConst=0.0, 14 | momentum=0.0, 15 | deltaMomentum=0.0, 16 | weightClip=0.0, 17 | gradientClip=0.0, 18 | weightRegConst=0.0, 19 | outputdEdX=True): 20 | Stage.__init__(self, 21 | name=name, 22 | outputDim=outputDim, 23 | inputNames=inputNames, 24 | learningRate=learningRate, 25 | learningRateAnnealConst=learningRateAnnealConst, 26 | momentum=momentum, 27 | deltaMomentum=deltaMomentum, 28 | weightClip=weightClip, 29 | gradientClip=gradientClip, 30 | weightRegConst=weightRegConst, 31 | outputdEdX=outputdEdX) 32 | self.W = 1 33 | def forward(self, X): 34 | Y = np.sum(X[:, 0, :] * X[:, 1, :], axis=-1) + self.W 35 | self.Y = Y 36 | self.X = X 37 | return Y 38 | 39 | def backward(self, dEdY): 40 | self.dEdW = np.sum(dEdY,axis=0) 41 | #print dEdY 42 | dEdX = np.zeros(self.X.shape) 43 | dEdX[:, 1, :] = dEdY.reshape(dEdY.size, 1) * self.X[:, 0, :] 44 | dEdX[:, 0, :] = dEdY.reshape(dEdY.size, 1) * self.X[:, 1, :] 45 | return dEdX -------------------------------------------------------------------------------- /src/nn/sequential.py: -------------------------------------------------------------------------------- 1 | from container import * 2 | 3 | class Sequential(Stage): 4 | def __init__(self, stages, inputNames=None, name=None, outputDim=0, outputdEdX=True): 5 | Stage.__init__(self, 6 | name=name, 7 | outputDim=outputDim, 8 | inputNames=inputNames, 9 | outputdEdX=outputdEdX) 10 | self.stages = stages 11 | 12 | def forward(self, X, dropout=True): 13 | X1 = X 14 | for stage in self.stages: 15 | if isinstance(stage, Container) or isinstance(stage, Sequential): 16 | X1 = stage.forward(X1, dropout) 17 | elif hasattr(stage, 'dropout'): 18 | stage.dropout = dropout 19 | X1 = stage.forward(X1) 20 | else: 21 | X1 = stage.forward(X1) 22 | return X1 23 | 24 | def backward(self, dEdY): 25 | for stage in reversed(self.stages): 26 | dEdY = stage.backward(dEdY) 27 | if 
dEdY is None: break 28 | return dEdY if self.outputdEdX else None 29 | 30 | def updateWeights(self): 31 | for stage in self.stages: 32 | stage.updateWeights() 33 | return 34 | 35 | def updateLearningParams(self, numEpoch): 36 | for stage in self.stages: 37 | stage.updateLearningParams(numEpoch) 38 | return 39 | 40 | def getWeights(self): 41 | weights = [] 42 | for stage in self.stages: 43 | weights.append(stage.getWeights()) 44 | return np.array(weights, dtype=object) 45 | 46 | def loadWeights(self, W): 47 | for i in range(W.shape[0]): 48 | self.stages[i].loadWeights(W[i]) -------------------------------------------------------------------------------- /src/nn/active_func.py: -------------------------------------------------------------------------------- 1 | from func import * 2 | 3 | class SoftmaxActiveFn(): 4 | def __init__(self): 5 | pass 6 | 7 | @staticmethod 8 | def forward(Z): 9 | expY = np.exp(Z) 10 | expYshape = np.copy(Z.shape) 11 | expYshape[-1] = 1 12 | Y = expY / np.sum(expY, axis=-1).reshape(expYshape).repeat(Z.shape[-1], axis=-1) 13 | return Y 14 | 15 | @staticmethod 16 | def backward(dEdY, Y, Z): 17 | timespan = Y.shape[0] 18 | U = dEdY * Y 19 | dEdZ = U - np.sum(U, axis=-1).reshape(timespan, 1) * Y 20 | return dEdZ 21 | 22 | class SigmoidActiveFn(): 23 | def __init__(self): 24 | pass 25 | 26 | @staticmethod 27 | def forward(Z): 28 | Y = sigmoidFn(Z) 29 | return Y 30 | 31 | @staticmethod 32 | def backward(dEdY, Y, Z): 33 | dEdZ = dEdY * Y * (1 - Y) 34 | return dEdZ 35 | 36 | class TanhActiveFn(): 37 | def __init__(self): 38 | pass 39 | 40 | @staticmethod 41 | def forward(Z): 42 | Y = np.tanh(Z) 43 | return Y 44 | 45 | @staticmethod 46 | def backward(dEdY, Y, Z): 47 | dEdZ = dEdY * (1 - Y * Y) 48 | return dEdZ 49 | 50 | class IdentityActiveFn(): 51 | def __init__(self): 52 | pass 53 | 54 | @staticmethod 55 | def forward(Z): 56 | return Z 57 | 58 | @staticmethod 59 | def backward(dEdY, Y, Z): 60 | return dEdY 61 | 62 | class ReluActiveFn(): 63 | def 
__init__(self): 64 | pass 65 | 66 | @staticmethod 67 | def forward(Z): 68 | return np.maximum(0, Z) 69 | 70 | @staticmethod 71 | def backward(dEdY, Y, Z): 72 | return (Y > 0).astype(int) * dEdY 73 | -------------------------------------------------------------------------------- /models/img_bow.model.yml: -------------------------------------------------------------------------------- 1 | name: 'img_bow' 2 | costFn: 'crossEntIdx' 3 | decisionFn: 'argmax' 4 | stages: 5 | - 'imgSel' 6 | - 'imgFeat' 7 | - 'txtSel' 8 | - 'txtUnfold' 9 | - 'txtDict' 10 | - 'txtFold' 11 | - 'bow' 12 | - 'softmax' 13 | outputs: 'softmax' 14 | specs: 15 | - name: 'imgSel' 16 | type: 'selector' 17 | inputs: 'input' 18 | start: 0 19 | end: 1 20 | axis: 1 21 | - name: 'txtSel' 22 | type: 'selector' 23 | inputs: 'input' 24 | start: 1 25 | end: 56 26 | axis: 1 27 | - name: 'imgFeat' 28 | type: 'lut' 29 | inputs: 'imgSel' 30 | inputDim: 123288 31 | outputDim: 4096 32 | initWeights: '../data/hidden_oxford_mscoco.h5' 33 | sparse: true 34 | format: 'h5' 35 | h5key: 'hidden7' 36 | learningRate: 0.0 37 | outputdEdX: false 38 | - name: 'txtUnfold' 39 | type: 'timeUnfold' 40 | inputs: 'txtSel' 41 | outputdEdX: false 42 | - name: 'txtDict' 43 | type: 'lut' 44 | inputs: 'txtUnfold' 45 | intConversion: true 46 | inputDim: 9738 47 | outputDim: 500 48 | initRange: 1.0 49 | initSeed: 2 50 | learningRate: 0.8 51 | momentum: 0.9 52 | gradientClip: 0.1 53 | weightClip: 1000.0 54 | - name: 'txtFold' 55 | type: 'timeFold' 56 | inputs: 'txtDict' 57 | timespan: 55 58 | - name: 'bow' 59 | type: 'sum2' 60 | inputs: 'txtFold' 61 | outputDim: 500 62 | - name: 'softmax' 63 | type: 'map' 64 | inputs: 'bow, imgFeat' 65 | activeFn: 'softmax' 66 | outputDim: 431 67 | initRange: 0.01 68 | initSeed: 7 69 | learningRate: 0.01 70 | learningRateAnnealConst: 0.0 71 | momentum: 0.9 72 | gradientClip: 0.1 73 | weightClip: 15.0 74 | weightRegConst: 0.00005 75 | 
-------------------------------------------------------------------------------- /src/nn/cos_sim.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class CosSimilarity(Stage): 4 | """ 5 | Compute the cosine similartiy of vectors with a bank of vectors 6 | """ 7 | def __init__(self, bankDim, inputNames, outputDim, name=None): 8 | Stage.__init__(self, name=name, inputNames=inputNames, outputDim=outputDim) 9 | self.bankDim = bankDim 10 | self.A = 0 11 | self.Z = 0 12 | self.Anorm = 0 13 | self.Znorm = 0 14 | self.Auni = 0 15 | self.Zuni = 0 16 | 17 | def forward(self, X): 18 | bankDim = self.bankDim 19 | A = X[:bankDim] 20 | Z = X[bankDim:] 21 | Xnorm2 = np.sum(np.power(X, 2), axis=-1) 22 | Xnorm = np.sqrt(Xnorm2) 23 | Anorm = Xnorm[:bankDim] 24 | Znorm = Xnorm[bankDim:] 25 | Zuni = Z / Znorm.reshape(Z.shape[0], 1) 26 | Auni = A / Anorm.reshape(bankDim, 1) 27 | self.Y = np.inner(Zuni, Auni) 28 | self.A = A 29 | self.Z = Z 30 | self.Anorm = Anorm 31 | self.Znorm = Znorm 32 | self.Auni = Auni 33 | self.Zuni = Zuni 34 | return self.Y 35 | 36 | def backward(self, dEdY): 37 | # For now, output gradient towards the vector bank. 
38 | self.dEdW = 0 39 | Z = self.Z 40 | A = self.A 41 | Anorm = self.Anorm 42 | Znorm = self.Znorm 43 | Auni = self.Auni 44 | Zuni = self.Zuni 45 | 46 | V = np.dot(dEdY, Auni) 47 | dEdZ = np.sum(V * Z, axis=-1).reshape(Z.shape[0], 1) * \ 48 | (-Z / (Znorm ** 3).reshape(Z.shape[0], 1)) + \ 49 | V / Znorm.reshape(Z.shape[0], 1) 50 | 51 | U = np.dot(dEdY.transpose(), Zuni) 52 | dEdA = np.sum(U * A, axis=-1).reshape(A.shape[0], 1) * \ 53 | (-A / (Anorm ** 3).reshape(A.shape[0], 1)) + \ 54 | U / Anorm.reshape(A.shape[0], 1) 55 | 56 | dEdX = np.concatenate((dEdA, dEdZ), axis=0) 57 | return dEdX -------------------------------------------------------------------------------- /src/imageqa_crosstest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import imageqa_test as it 5 | import prep 6 | import nn 7 | 8 | def reindexDataset( 9 | srcQuestions, 10 | srcAnswers, 11 | srcQuestionIdict, 12 | dstQuestionDict, 13 | srcAnsIdict, 14 | dstAnsDict): 15 | dstQuestions = np.zeros(srcQuestions.shape, dtype='int') 16 | dstAnswers = np.zeros(srcAnswers.shape, dtype='int') 17 | for n in range(srcQuestions.shape[0]): 18 | dstQuestions[n, 0, 0] = srcQuestions[n, 0, 0] 19 | for t in range(1, srcQuestions.shape[1]): 20 | word = srcQuestionIdict[srcQuestions[n, t, 0] - 1] 21 | if dstQuestionDict.has_key(word): 22 | dstQuestions[n, t, 0] = dstQuestionDict[word] 23 | else: 24 | dstQuestions[n, t, 0] = dstQuestionDict['UNK'] 25 | word = srcAnsIdict[srcAnswers[n, 0]] 26 | if dstAnsDict.has_key(word): 27 | dstAnswers[n, 0] = dstAnsDict[word] 28 | else: 29 | dstAnswers[n, 0] = dstAnsDict['UNK'] 30 | return dstQuestions, dstAnswers 31 | 32 | if __name__ == '__main__': 33 | """ 34 | Usage: python imageqa_crosstest.py 35 | -m[odel] {model id} 36 | -d[ata] {model data folder} 37 | -td[ata] {test data folder} 38 | [-reindex {whether to reindex the test data, default false}] 39 | [-r[esults] {results folder}] 40 | 
class OrdinalRegression(Stage):
    """
    Ordinal regression output layer.

    Parameterized by per-class centers mu_i and log-priors pi_i, it scores a
    scalar input x with exp(mu_i * x + pi_i - mu_i^2 / 2) and normalizes —
    algebraically the posterior of a unit-variance Gaussian mixture with
    means mu_i. Extreme centers mu_0 = -1 and mu_{n-1} = 1 can be frozen
    via fixExtreme.
    """
    def __init__(
            self,
            outputDim,
            fixExtreme=True,
            inputNames=None,
            name=None,
            outputdEdX=True,
            learningRate=0.0,
            learningRateAnnealConst=0.0,
            momentum=0.0,
            deltaMomentum=0.0,
            weightClip=0.0,
            gradientClip=0.0,
            weightRegConst=0.0):
        Stage.__init__(
            self,
            name=name,
            inputNames=inputNames,
            outputDim=outputDim,
            outputdEdX=outputdEdX,
            learningRate=learningRate,
            learningRateAnnealConst=learningRateAnnealConst,
            momentum=momentum,
            deltaMomentum=deltaMomentum,
            weightClip=weightClip,
            gradientClip=gradientClip,
            weightRegConst=weightRegConst)
        # Uniform initialization
        # mu_0 = -1
        # mu_(n-1) = 1
        # mu_i = -1 + 2 * (i / n)
        self.fixExtreme = fixExtreme
        mu = np.linspace(-1, 1, self.outputDim)
        # pi_i = 1/n (stored in log space)
        pi = np.zeros(self.outputDim) + np.log(1 / float(self.outputDim))
        # W row 0 holds the centers mu, row 1 the log-priors pi.
        self.W = np.zeros((2, self.outputDim))
        self.W[0] = mu
        self.W[1] = pi

    def forward(self, X):
        """Return normalized class probabilities for scalar inputs X."""
        mu = self.W[0].reshape(1, self.W.shape[1])
        pi = self.W[1].reshape(1, self.W.shape[1])
        self.Xshape = X.shape
        # Treat the input as one scalar per example.
        X = X.reshape(X.shape[0], 1)
        self.X = X
        Z = np.exp(mu * X + (pi - np.power(mu, 2) / 2))
        Y = Z / np.sum(Z, axis=-1).reshape(X.shape[0], 1)
        self.Z = Z
        self.Y = Y
        return Y

    def backward(self, dEdY):
        # Here we ignore the dEdY because this is always the last layer...
        # The target class is recovered from the sparsity pattern of dEdY:
        # assumes a cross-entropy-style loss where only the target column
        # carries a nonzero gradient — NOTE(review): verify against the
        # cost functions used with this stage.
        target = dEdY != 0.0
        targetInt = target.astype('int')
        targetIdx = np.nonzero(target)[1]
        mu = self.W[0]
        pi = self.W[1]
        # Gradient w.r.t. the scalar input: expected center minus target center.
        dEdX = (-mu[targetIdx] + np.dot(self.Y, mu)) / float(self.X.shape[0])
        dEdX = dEdX.reshape(self.Xshape)
        dEdMu = np.mean(
            (self.X - mu) *
            (self.Y - targetInt), axis=0)
        # Fix extreme mu's
        if self.fixExtreme:
            dEdMu[0] = 0
            dEdMu[-1] = 0
        dEdPi = np.mean(self.Y - targetInt, axis=0)
        self.dEdW = np.zeros(self.W.shape)
        self.dEdW[0] = dEdMu
        self.dEdW[1] = dEdPi
        return dEdX

    def updateLearningParams(self, numEpoch):
        """Delegate to Stage, then print the current mu / pi for monitoring."""
        Stage.updateLearningParams(self, numEpoch)
        print 'mu:',
        for i in range(self.W.shape[-1]):
            print '%.3f' % self.W[0, i],
        print 'pi:',
        for i in range(self.W.shape[-1]):
            print '%.3f' % self.W[1, i],
        print
if __name__ == '__main__':
    """
    Render a selection of examples into LaTeX.
    Usage: python imageqa_layout.py
                    -m[odel] {name1:modelId1}
                    -m[odel] {name2:modelId2}
                    -em[odel] {name3:ensembleModelId3,ensembleModelId4,...}
                    -pem[odel] {name4:ensembleModelId5,ensembleModelId6,...}
                    ...
                    -d[ata] {dataFolder}
                    -i[nput] {listFile}
                    -o[utput] {outputFolder}
                    [-k {top K answers}]
                    [-p[icture] {pictureFolder}]
                    [-f[ile] {outputFilename}]
                    [-daquar/-cocoqa]
    Input file format:
    QID1[,Comment1]
    QID2[,Comment2]
    ...
    """
    params = ir.parseComparativeParams(sys.argv)

    urlDict = ir.loadImgUrl(params['dataset'], params['dataFolder'])
    data = it.loadDataset(params['dataFolder'])

    print('Parsing input file...')
    caption, selIds, selComments = parseInputFile(params['inputFile'])

    print('Running models...')
    idx = np.array(selIds, dtype='int')
    # FIX: load the test set before slicing it. The original indexed
    # `inputTest` two lines before it was assigned and never assigned
    # `targetTest` at all; targets live alongside inputs in testData
    # (as in the other imageqa_* scripts).
    inputTest = data['testData'][0]
    targetTest = data['testData'][1]
    questionTypeArray = data['questionTypeArray']
    inputTestSel = inputTest[idx]
    targetTestSel = targetTest[idx]
    # FIX: the original call ended with `):` — a stray colon that made the
    # whole script a SyntaxError.
    modelOutputs = ie.runAllModels(
        inputTestSel,
        questionTypeArray[idx],
        params['models'],
        params['resultsFolder'],
        params['dataset'],
        params['dataFolder'])

    # Render
    print('Rendering LaTeX...')

    # Replace escape char
    data['questionIdict'] = ir.escapeLatexIdict(data['questionIdict'])
    data['ansIdict'] = ir.escapeLatexIdict(data['ansIdict'])

    # FIX: `outputFolder` was never defined; everywhere else this script
    # reads it from the parsed parameters.
    outputFolder = params['outputFolder']
    if not os.path.exists(outputFolder):
        os.makedirs(outputFolder)
    ir.renderLatex(
        inputTestSel,
        targetTestSel,
        data['questionIdict'],
        data['ansIdict'],
        urlDict,
        topK=params['topK'],
        outputFolder=outputFolder,
        pictureFolder=params['pictureFolder'],
        comments=selComments,
        caption=caption,
        modelOutputs=modelOutputs,
        modelNames=ir.getModelNames(params['models']),
        questionIds=idx,
        filename=params['outputFilename'] + '.tex')
def file2list(filepath):
    """Read *filepath* and return its lines, stripped, with empty lines dropped."""
    with open(filepath, 'r') as f:
        stripped = [line.strip() for line in f.readlines()]
    return [line for line in stripped if len(line) > 0]
41 | More specifically, it computes: 42 | max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y) 43 | where interp is a 'interpretation field' 44 | """ 45 | if debug: print 'Original', a, b 46 | if word_pair_dict.has_key(a+','+b): 47 | return word_pair_dict[a+','+b] 48 | 49 | def get_semantic_field(a): 50 | return wn.synsets(a, pos=wn.NOUN) 51 | 52 | if a == b: return 1.0 53 | 54 | interp_a = get_semantic_field(a) 55 | interp_b = get_semantic_field(b) 56 | if debug: print(interp_a) 57 | 58 | if interp_a == [] or interp_b == []: 59 | return 0.0 60 | 61 | if debug: print 'Stem', a, b 62 | global_max=0.0 63 | for x in interp_a: 64 | for y in interp_b: 65 | local_score=x.wup_similarity(y) 66 | if debug: print 'Local', local_score 67 | if local_score > global_max: 68 | global_max=local_score 69 | if debug: print 'Global', global_max 70 | 71 | # we need to use the semantic fields and therefore we downweight 72 | # unless the score is high which indicates both are synonyms 73 | if global_max < similarity_threshold: 74 | interp_weight = 0.1 75 | else: 76 | interp_weight = 1.0 77 | 78 | final_score = global_max * interp_weight 79 | word_pair_dict[a+','+b] = final_score 80 | return final_score 81 | 82 | def runAll(gt_filepath, pred_filepath, thresh): 83 | global word_pair_dict 84 | word_pair_dict = {} 85 | input_gt=file2list(gt_filepath) 86 | input_pred=file2list(pred_filepath) 87 | 88 | if thresh == -1: 89 | measure = dirac_measure 90 | else: 91 | measure = lambda x, y: wup_measure(x, y, thresh) 92 | 93 | if thresh == -1: 94 | print 'standard Accuracy is used' 95 | else: 96 | print 'soft WUPS is used' 97 | score_list = [measure(ta, pa) for (ta, pa) in zip(input_gt, input_pred)] 98 | final_score = sum(map( 99 | lambda x: float(x) / float(len(score_list)), score_list)) 100 | 101 | print 'final score:', final_score 102 | return final_score 103 | 104 | 105 | if __name__ == '__main__': 106 | if len(sys.argv) < 4: 107 | print 'Usage: true answers file, predicted answers file, 
threshold' 108 | print 'If threshold is -1, then the standard Accuracy is used' 109 | sys.exit("3 arguments must be given") 110 | gt_filepath=sys.argv[1] 111 | pred_filepath=sys.argv[2] 112 | thresh=float(sys.argv[3]) 113 | runAll(gt_filepath, pred_filepath, thresh) 114 | -------------------------------------------------------------------------------- /models/vis_lstm.model.yml: -------------------------------------------------------------------------------- 1 | name: 'vis_lstm' 2 | costFn: 'crossEntIdx' 3 | decisionFn: 'argmax' 4 | stages: 5 | - 'imgSel' 6 | - 'imgUnfold' 7 | - 'imgFeat' 8 | - 'imgFeatNorm' 9 | - 'imgMap' 10 | - 'imgFold' 11 | - 'txtSel' 12 | - 'txtUnfold' 13 | - 'txtDict' 14 | - 'txtFold' 15 | - 'concat' 16 | - 'dropout' 17 | - 'lstm' 18 | - 'softmax' 19 | outputs: 'softmax' 20 | specs: 21 | - name: 'imgSel' 22 | type: 'selector' 23 | inputs: 'input' 24 | start: 0 25 | end: 1 26 | axis: 1 27 | - name: 'txtSel' 28 | type: 'selector' 29 | inputs: 'input' 30 | start: 1 31 | end: 56 32 | axis: 1 33 | - name: 'imgUnfold' 34 | type: 'timeUnfold' 35 | inputs: 'imgSel' 36 | - name: 'imgFeat' 37 | type: 'lut' 38 | inputs: 'imgUnfold' 39 | inputDim: 123288 40 | outputDim: 4096 41 | initWeights: '../data/hidden_oxford_mscoco.h5' 42 | sparse: true 43 | format: 'h5' 44 | h5key: 'hidden7' 45 | learningRate: 0.0 46 | outputdEdX: false 47 | - name: 'imgFeatNorm' 48 | type: 'normalize' 49 | inputs: 'imgFeat' 50 | mean: '../data/hidden_oxford_mscoco.h5' 51 | meanKey: 'hidden7_mean' 52 | std: '../data/hidden_oxford_mscoco.h5' 53 | stdKey: 'hidden7_std' 54 | format: 'h5' 55 | outputDim: 4096 56 | - name: 'imgMap' 57 | type: 'map' 58 | inputs: 'imgFeatNorm' 59 | activeFn: 'identity' 60 | bias: false 61 | outputDim: 500 62 | initRange: 0.05 63 | initSeed: 1 64 | learningRate: 0.8 65 | momentum: 0.9 66 | gradientClip: 0.1 67 | weightClip: 100.0 68 | outputdEdX: false 69 | - name: 'imgFold' 70 | type: 'timeFold' 71 | inputs: 'imgMap' 72 | timespan: 1 73 | - name: 
class LUT(Stage):
    """
    Look-up table (embedding) stage.
    WARNING: this implementation of LUT is index 1-based:
    an input index i (i >= 1) selects weight row i - 1, and
    input index 0 produces an all-zero output row (padding / empty word).
    """
    def __init__(self,
            inputNames,
            inputDim,
            outputDim,
            lazyInit=True,
            initRange=1.0,
            initSeed=2,
            intConversion=False,
            needInit=True,
            initWeights=0,
            sparse=False,
            learningRate=0.0,
            learningRateAnnealConst=0.0,
            momentum=0.0,
            deltaMomentum=0.0,
            weightClip=0.0,
            gradientClip=0.0,
            weightRegConst=0.0,
            outputdEdX=False,
            name=None):
        # inputDim: vocabulary size (number of distinct non-zero indices).
        # outputDim: embedding width.
        # lazyInit: defer random weight allocation until the first forward().
        # intConversion: cast incoming indices to int in forward().
        # sparse: treat W rows as scipy-style sparse rows (.todense() on read).
        Stage.__init__(self,
            name=name,
            inputNames=inputNames,
            learningRate=learningRate,
            outputDim=outputDim,
            learningRateAnnealConst=learningRateAnnealConst,
            momentum=momentum,
            deltaMomentum=deltaMomentum,
            weightClip=weightClip,
            gradientClip=gradientClip,
            weightRegConst=weightRegConst,
            gpu=False,
            outputdEdX=outputdEdX)
        self.outputDim = outputDim
        self.inputDim = inputDim
        self.initRange = initRange
        self.random = np.random.RandomState(initSeed)
        self.needInit = needInit
        self.intConversion = intConversion

        # Input index 0 is reserved for the empty word at the end of a
        # sentence; it maps to an all-zero row rather than a weight row.
        if needInit:
            if lazyInit:
                self.W = None
            else:
                self.initWeights()
        else:
            self.W = initWeights
            if use_gpu and self.W.dtype != np.float32:
                self.W = self.W.astype('float32')
        self.X = 0
        self.Y = 0
        self.sparse = sparse
        self.dEdW = 0.0

    def initWeights(self):
        """Allocate W uniformly in [-initRange/2, initRange/2)."""
        # print self.name
        self.W = self.random.uniform(
            -self.initRange/2.0, self.initRange/2.0,
            (self.inputDim, self.outputDim))
        if use_gpu and self.W.dtype != np.float32:
            self.W = self.W.astype('float32')

    def forward(self, X):
        """Map a (flattened) array of indices to their embedding rows."""
        if self.W is None: self.initWeights()
        if self.intConversion: X = X.astype(int)
        X = X.reshape(X.size)
        self.X = X
        Y = np.zeros((X.shape[0], self.outputDim), self.W.dtype)
        for n in range(0, X.shape[0]):
            if self.sparse:
                if X[n] != 0:
                    # Sparse rows must be densified before assignment.
                    Y[n] = self.W[X[n] - 1].todense()
            else:
                if X[n] != 0:
                    Y[n] = self.W[X[n] - 1]
        return Y

    def backward(self, dEdY):
        """Scatter dEdY into the rows of dEdW selected in forward()."""
        X = self.X
        # A frozen table (learningRate == 0) skips gradient accumulation.
        if self.learningRate > 0.0:
            self.dEdW = np.zeros(self.W.shape, self.W.dtype)
            for n in range(0, X.shape[0]):
                if X[n] != 0:
                    self.dEdW[X[n] - 1] += dEdY[n]
        if self.outputdEdX:
            # Indices are not differentiable; propagate zeros for shape only.
            return np.zeros(X.shape)
        else:
            return None

    def loadWeights(self, W):
        # Frozen tables (e.g. pretrained features loaded from a file) are
        # not overwritten by saved model weights.
        if self.learningRate == 0.0:
            return
        else:
            Stage.loadWeights(self, W)

    def getWeights(self):
        # Frozen tables are excluded from model serialization.
        if self.learningRate == 0.0:
            return 0
        else:
            return self.W
class SumProduct(Stage):
    """
    Elementwise product followed by a sum along one axis.

    With two inputs:   Y = beta * sum(X0 * X1, axis=sumAxis)
    With three inputs: Y = X2 * sum(X0 * X1, axis=sumAxis)
    backward() is specialized for the attention-model shapes (see note
    there).
    """
    def __init__(self,
                 name,
                 inputNames,
                 sumAxis,
                 outputDim,
                 gpu=use_gpu,
                 beta=1.0):
        # sumAxis: axis reduced in the product-sum.
        # beta: scalar applied in the 2-input mode only.
        Stage.__init__(self,
                 name=name,
                 inputNames=inputNames,
                 gpu=gpu,
                 outputDim=outputDim)
        self.sumAxis = sumAxis
        self.beta = beta

    def getInput(self):
        # Assume that the input size is always 2
        # Rewrite get input logic into two separate arrays
        if len(self.inputs) == 2:
            return [self.inputs[0].Y, self.inputs[1].Y]
        elif len(self.inputs) == 3:
            return [self.inputs[0].Y, self.inputs[1].Y, self.inputs[2].Y]

    def sendError(self, dEdX):
        """Accumulate the per-input gradients onto the upstream stages."""
        self.inputs[0].dEdY += dEdX[0]
        self.inputs[0].receivedError = True
        self.inputs[1].dEdY += dEdX[1]
        self.inputs[1].receivedError = True
        if len(self.inputs) == 3:
            self.inputs[2].dEdY += dEdX[2]
            self.inputs[2].receivedError = True

    def forward(self, X):
        # GPU and CPU branches compute the same quantity; the GPU branch
        # round-trips through gnumpy garrays.
        if self.gpu:
            self.X = []
            self.X.append(gpu.as_garray(X[0].astype('float32')))
            self.X.append(gpu.as_garray(X[1].astype('float32')))
            if len(X) == 2:
                Y = self.beta * gpu.sum(self.X[0] * self.X[1], axis=self.sumAxis)
            elif len(X) == 3:
                self.X.append(gpu.as_garray(X[2].astype('float32')))
                self.Z = gpu.sum(self.X[0] * self.X[1], axis=self.sumAxis)
                Y = self.X[2] * self.Z
            Y = Y.as_numpy_array(dtype='float32')
        else:
            self.X = X
            if len(self.X) == 2:
                Y = self.beta * np.sum(self.X[0] * self.X[1], axis=self.sumAxis)
            elif len(self.X) == 3:
                # Cache the reduced product; reused by backward() and the
                # 3-input output.
                self.Z = np.sum(self.X[0] * self.X[1], axis=self.sumAxis)
                Y = self.X[2] * self.Z
        return Y

    def backward(self, dEdY):
        # Need to generalize, but now, let's assume it's the attention model.
        # NOTE(review): the reshape pattern assumes X0 is 3-D, X1 matches it
        # with the reduction over axis 2, and X2 (if present) is a per-example
        # scalar column — confirm against the attention model configuration.
        dEdX = []
        if self.gpu:
            if len(self.X) == 2:
                dEdY = dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1])
                dEdY = gpu.as_garray(dEdY)
                dEdX1 = self.beta * gpu.sum(dEdY * self.X[1], axis=2)
                dEdX2 = self.beta * dEdY * self.X[0]
                dEdX.append(dEdX1.as_numpy_array(dtype='float32'))
                dEdX.append(dEdX2.as_numpy_array(dtype='float32'))
            elif len(self.X) == 3:
                dEdY = gpu.as_garray(dEdY)
                dEdY2 = dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1])
                dEdY2 = gpu.as_garray(dEdY2)
                dEdX1 = self.X[2] * gpu.sum(dEdY2 * self.X[1], axis=2)
                dEdX2 = self.X[2].reshape(self.X[2].shape[0], 1, 1) * dEdY2 * self.X[0]
                dEdX3 = gpu.sum(dEdY * self.Z, axis=-1).reshape(self.X[2].shape[0], 1)
                dEdX.append(dEdX1.as_numpy_array(dtype='float32'))
                dEdX.append(dEdX2.as_numpy_array(dtype='float32'))
                dEdX.append(dEdX3.as_numpy_array(dtype='float32'))
        else:
            if len(self.X) == 2:
                dEdY = dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1])
                dEdX.append(self.beta * np.sum(dEdY * self.X[1], axis=2))
                dEdX.append(self.beta * dEdY * self.X[0])
            elif len(self.X) == 3:
                dEdY2 = dEdY.reshape(dEdY.shape[0], 1, dEdY.shape[1])
                dEdX.append(self.X[2] * np.sum(dEdY2 * self.X[1], axis=2))
                dEdX.append(self.X[2].reshape(self.X[2].shape[0], 1, 1) * dEdY2 * self.X[0])
                dEdX.append(np.sum(dEdY * self.Z, axis=-1).reshape(self.X[2].shape[0], 1))
        return dEdX
class Sequential_Tests(unittest.TestCase):
    """Sequential stacks of stages tests"""
    def setUp(self):
        # Small synthetic batch: 5 sequences of 5 integer word ids in [0, 10),
        # with binary class targets.
        random = np.random.RandomState(2)
        self.trainInput = random.uniform(0, 10, (5, 5, 1)).astype(int)
        self.trainTarget = random.uniform(0, 1, (5, 1)).astype(int)

    def test_grad(self):
        """Check analytic gradients of a full unfold-LUT-LSTM-dropout stack
        against central finite differences."""
        wordEmbed = np.random.rand(np.max(self.trainInput), 5)
        timespan = self.trainInput.shape[1]
        time_unfold = TimeUnfold()

        lut = LUT(
            inputDim=np.max(self.trainInput)+1,
            outputDim=5,
            inputNames=None,
            needInit=False,
            initWeights=wordEmbed
        )

        m = Map(
            outputDim=5,
            activeFn=IdentityActiveFn(),
            inputNames=None,
            initRange=0.1,
            initSeed=1,
        )

        time_fold = TimeFold(
            timespan=timespan
        )

        lstm = LSTM_Old(
            inputDim=5,
            outputDim=5,
            initRange=.1,
            initSeed=3,
            cutOffZeroEnd=True,
            multiErr=True
        )

        # debug=True presumably makes the dropout mask deterministic so the
        # finite-difference passes see the same network — TODO confirm in
        # dropout.py.
        dropout = Dropout(
            name='d1',
            dropoutRate=0.5,
            inputNames=None,
            outputDim=5,
            initSeed=2,
            debug=True
        )

        lstm_second = LSTM_Old(
            inputDim=5,
            outputDim=5,
            initRange=.1,
            initSeed=3,
            cutOffZeroEnd=True,
            multiErr=False
        )

        # NOTE(review): SoftmaxActiveFn is passed as a class here while
        # IdentityActiveFn() above is an instance — verify both forms are
        # accepted by Map.
        soft = Map(
            outputDim=2,
            activeFn=SoftmaxActiveFn,
            initRange=0.1,
            initSeed=5
        )

        self.model = Sequential(
            stages=[
                time_unfold,
                lut,
                m,
                time_fold,
                lstm,
                dropout,
                lstm_second,
                soft
            ])
        self.hasDropout = True
        costFn = crossEntIdx
        output = self.model.forward(self.trainInput, dropout=self.hasDropout)
        E, dEdY = costFn(output, self.trainTarget)
        dEdX = self.model.backward(dEdY)
        self.chkgrd(soft.dEdW, self.evaluateGrad(soft.getWeights(), costFn))
        #self.chkgrd(lstm_second.dEdW, self.evaluateGrad(lstm_second.getWeights(), costFn))
        #self.chkgrd(lstm.dEdW, self.evaluateGrad(lstm.getWeights(), costFn))
        self.chkgrd(m.dEdW, self.evaluateGrad(m.getWeights(), costFn))

    def chkgrd(self, dE, dETmp):
        """Assert analytic (dE) and numeric (dETmp) gradients agree
        elementwise within a loose relative tolerance."""
        #print dE/dETmp
        dE = dE.reshape(dE.size)
        dETmp = dETmp.reshape(dE.size)
        # 50% relative tolerance: the numeric gradient uses a very large
        # step (eps=1 below), so only the order of magnitude is checked.
        tolerance = 5e-1
        for i in range(dE.size):
            self.assertTrue(
                (dE[i] == 0 and dETmp[i] == 0) or
                (np.abs(dE[i] / dETmp[i] - 1) < tolerance))

    def evaluateGrad(self, W, costFn):
        """Central finite-difference gradient of the total cost w.r.t. W,
        perturbing one weight at a time (W is modified in place and restored)."""
        eps = 1
        dEdW = np.zeros(W.shape)
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                W[i,j] += eps
                output = self.model.forward(self.trainInput, dropout=self.hasDropout)
                Etmp1, d1 = costFn(output, self.trainTarget)

                W[i,j] -= 2 * eps
                output = self.model.forward(self.trainInput, dropout=self.hasDropout)
                Etmp2, d2 = costFn(output, self.trainTarget)

                dEdW[i,j] = (Etmp1 - Etmp2) / 2.0 / eps
                W[i,j] += eps
        return dEdW
class Conv1D(Stage):
    """
    1D temporal convolution.
    No padding, stride=1: input (N, T, numChannels) maps to output
    (N, T - windowSize + 1, numFilters).
    """
    def __init__(self,
                 numChannels,
                 windowSize,
                 numFilters,
                 inputNames=None,
                 initRange=1.0,
                 initSeed=2,
                 needInit=True,
                 initWeights=None,
                 learningRate=0.0,
                 learningRateAnnealConst=0.0,
                 momentum=0.0,
                 deltaMomentum=0.0,
                 weightClip=0.0,
                 gradientClip=0.0,
                 weightRegConst=0.0,
                 defaultValue=0.0,
                 outputdEdX=True,
                 gpu=use_gpu,
                 name=None):
        # numChannels: input feature dimension per timestep.
        # windowSize: temporal extent of each filter.
        # numFilters: number of output channels.
        Stage.__init__(self,
                 name=name,
                 inputNames=inputNames,
                 outputDim=numFilters,
                 defaultValue=defaultValue,
                 learningRate=learningRate,
                 learningRateAnnealConst=learningRateAnnealConst,
                 momentum=momentum,
                 deltaMomentum=deltaMomentum,
                 weightClip=weightClip,
                 gradientClip=gradientClip,
                 weightRegConst=weightRegConst,
                 gpu=gpu,
                 outputdEdX=outputdEdX)
        self.numFilters = numFilters
        self.numChannels = numChannels
        self.windowSize = windowSize
        self.random = np.random.RandomState(initSeed)
        # Filters flattened to (windowSize * numChannels, numFilters) so the
        # convolution becomes a single matrix product.
        if needInit:
            self.W = self.random.uniform(-initRange/2.0, initRange/2.0,
                (self.windowSize * self.numChannels, self.numFilters))
        else:
            self.W = initWeights
        if self.gpu:
            self.W = gnp.as_garray(self.W.astype('float32'))
        self.X = 0
        self.Y = 0

    def forward(self, X):
        """im2col-style forward: gather sliding windows, then one matmul."""
        self.X = X
        # Num of examples
        N = X.shape[0]
        # Timespan
        T = X.shape[1]
        # Windows size
        S = self.windowSize
        # Channels
        D = self.numChannels
        # Num filters
        F = self.numFilters
        # Z stacks every length-S window of X: (N, T-S+1, S, D).
        Z = np.zeros((N, T - S + 1, S, D), X.dtype)
        for i in range(T - S + 1):
            Z[:, i, :, :] = X[:, i : i + S, :]
        Z = Z.reshape(N * (T - S + 1), S * D)
        if self.gpu:
            Z = gpu.as_garray(Z.astype('float32'))
            Y = gpu.dot(Z, self.W)
            Y = gpu.as_numpy_array(Y)
        else:
            Y = np.dot(Z, self.W)

        Y = Y.reshape(N, T - S + 1, F)
        # Cached for the weight gradient in backward().
        self.Z = Z
        return Y

    def backward(self, dEdY):
        N = dEdY.shape[0]
        S = self.windowSize
        T = dEdY.shape[1] + S - 1
        F = dEdY.shape[2]
        D = self.X.shape[2]
        dEdY = dEdY.reshape(N * (T - S + 1), F)
        dEdX = np.zeros(self.X.shape, self.X.dtype)

        # Weight gradient: windows^T . output-gradient, same matmul layout
        # as forward.
        if self.gpu:
            gdEdY = gpu.as_garray(dEdY.astype('float32'))
            self.dEdW = gpu.dot(self.Z.transpose(), gdEdY)
        else:
            self.dEdW = np.dot(self.Z.transpose(), dEdY)

        if self.outputdEdX:
            if self.gpu:
                gdEdZ = gpu.dot(gdEdY, self.W.transpose())
                dEdZ = gpu.as_numpy_array(gdEdZ)
            else:
                dEdZ = np.dot(dEdY, self.W.transpose())

            # Scatter window gradients back to timesteps: input position t
            # receives one contribution from every window that covers it
            # (left edge / right edge / interior handled separately).
            # NOTE(review): the branch boundaries assume T >= 2S - 1; for
            # shorter sequences the first two conditions overlap — verify.
            dEdZ = dEdZ.reshape(N, T - S + 1, S, D)
            for t in range(0, T):
                if t <= S - 1:
                    dEdX[:, t, :] = np.sum(dEdZ[:, range(0, t + 1), range(t, -1, -1), :], axis=1)
                elif t >= T - S + 1:
                    dEdX[:, t, :] = np.sum(dEdZ[:, range(t - S + 1, T - S + 1), range(S - 1, S - (T - t) - 1, -1), :], axis=1)
                else:
                    dEdX[:, t, :] = np.sum(dEdZ[:, range(t - S + 1, t + 1), range(S - 1, -1, -1), :], axis=1)
        return dEdX
| S = self.windowSize 91 | T = dEdY.shape[1] + S - 1 92 | F = dEdY.shape[2] 93 | D = self.X.shape[2] 94 | dEdY = dEdY.reshape(N * (T - S + 1), F) 95 | dEdX = np.zeros(self.X.shape, self.X.dtype) 96 | 97 | if self.gpu: 98 | gdEdY = gpu.as_garray(dEdY.astype('float32')) 99 | self.dEdW = gpu.dot(self.Z.transpose(), gdEdY) 100 | else: 101 | self.dEdW = np.dot(self.Z.transpose(), dEdY) 102 | 103 | if self.outputdEdX: 104 | if self.gpu: 105 | gdEdZ = gpu.dot(gdEdY, self.W.transpose()) 106 | dEdZ = gpu.as_numpy_array(gdEdZ) 107 | else: 108 | dEdZ = np.dot(dEdY, self.W.transpose()) 109 | 110 | dEdZ = dEdZ.reshape(N, T - S + 1, S, D) 111 | for t in range(0, T): 112 | if t <= S - 1: 113 | dEdX[:, t, :] = np.sum(dEdZ[:, range(0, t + 1), range(t, -1, -1), :], axis=1) 114 | elif t >= T - S + 1: 115 | dEdX[:, t, :] = np.sum(dEdZ[:, range(t - S + 1, T - S + 1), range(S - 1, S - (T - t) - 1, -1), :], axis=1) 116 | else: 117 | dEdX[:, t, :] = np.sum(dEdZ[:, range(t - S + 1, t + 1), range(S - 1, -1, -1), :], axis=1) 118 | return dEdX 119 | -------------------------------------------------------------------------------- /src/imageqa_modelavg.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | import numpy as np 5 | 6 | import imageqa_test as it 7 | import nn 8 | 9 | def runAvgAll(models, data): 10 | print 'Running model %s' % modelId 11 | modelOutput = nn.test(model, data['testData'][0]) 12 | modelOutputs.append(modelOutput) 13 | finalOutput = np.zeros(modelOutputs[0].shape) 14 | for output in modelOutputs: 15 | shape0 = min(finalOutput.shape[0], output.shape[0]) 16 | shape1 = min(finalOutput.shape[1], output.shape[1]) 17 | finalOutput[:shape0, :shape1] += output[:shape0, :shape1] / float(len(modelOutputs)) 18 | return finalOutput 19 | 20 | def testAvgAll(modelOutputs, mixRatio, data, outputFolder): 21 | # finalOutput = mixRatio * modelOutputs[0] + \ 22 | # (1 - mixRatio) * modelOutputs[1] 23 | finalOutput = 
np.zeros(modelOutputs[0].shape) 24 | for output in modelOutputs: 25 | shape0 = min(finalOutput.shape[0], output.shape[0]) 26 | shape1 = min(finalOutput.shape[1], output.shape[1]) 27 | finalOutput[:shape0, :shape1] += output[:shape0, :shape1] / float(len(modelOutputs)) 28 | testAnswerFile = it.getAnswerFilename(outputFolder, resultsFolder) 29 | testTruthFile = it.getTruthFilename(outputFolder, resultsFolder) 30 | resultsRank, \ 31 | resultsCategory, \ 32 | resultsWups = it.runAllMetrics( 33 | data['testData'][0], 34 | finalOutput, 35 | data['testData'][1], 36 | data['ansIdict'], 37 | data['questionTypeArray'], 38 | testAnswerFile, 39 | testTruthFile) 40 | it.writeMetricsToFile( 41 | outputFolder, 42 | resultsRank, 43 | resultsCategory, 44 | resultsWups, 45 | resultsFolder) 46 | 47 | def testAvg(modelOutputs, mixRatio, target): 48 | finalOutput = mixRatio * modelOutputs[0] + \ 49 | (1 - mixRatio) * modelOutputs[1] 50 | rate, _, __ = it.calcPrecision(finalOutput, target) 51 | return rate 52 | 53 | def validAvg(modelOutputs, mixRatios, target): 54 | bestRate = 0.0 55 | bestMixRatio = 0.0 56 | for mixRatio in mixRatios: 57 | rate = testAvg(modelOutputs, mixRatio, target) 58 | print 'Mix ratio %.4f Rate %.4f' % (mixRatio, rate) 59 | if rate > bestRate: 60 | bestMixRatio = mixRatio 61 | bestRate = rate 62 | return bestMixRatio 63 | 64 | if __name__ == '__main__': 65 | """ 66 | Usage: python imageqa_modelavg.py 67 | -m[odel] {modelId1} 68 | -m[odel] {modelId2} 69 | -vm[odel] {validModelId1} 70 | -vm[odel] {validModelId2} 71 | -d[ata] {dataFolder} 72 | -o[utput] {outputFolder} 73 | [-r[esults] {resultsFolder}] 74 | """ 75 | resultsFolder = '../results' 76 | modelIds = [] 77 | validModelIds = [] 78 | for i, flag in enumerate(sys.argv): 79 | if flag == '-m' or flag == '-model': 80 | modelIds.append(sys.argv[i + 1]) 81 | elif flag == '-vm' or flag == '-vmodel': 82 | validModelIds.append(sys.argv[i + 1]) 83 | elif flag == '-r' or flag == '-results': 84 | resultsFolder = 
sys.argv[i + 1] 85 | elif flag == '-d' or flag == '-data': 86 | dataFolder = sys.argv[i + 1] 87 | elif flag == '-o' or flag == '-output': 88 | outputFolder = sys.argv[i + 1] 89 | data = it.loadDataset(dataFolder) 90 | 91 | models = [] 92 | validModels = [] 93 | for modelId in modelIds: 94 | print 'Loading model %s' % modelId 95 | models.append(it.loadModel(modelId, resultsFolder)) 96 | for modelId in validModelIds: 97 | print 'Loading model %s' % modelId 98 | validModels.append(it.loadModel(modelId, resultsFolder)) 99 | 100 | modelOutputs = [] 101 | validModelOutputs = [] 102 | # for modelId, model in zip(validModelIds, validModels): 103 | # print 'Running model %s' % modelId 104 | # modelOutput = nn.test(model, data['validData'][0]) 105 | # validModelOutputs.append(modelOutput) 106 | # 107 | # mixRatios = np.arange(0, 11) * 0.1 108 | # bestMixRatio = validAvg(validModelOutputs, mixRatios, data['validData'][1]) 109 | # print 'Best ratio found: %.4f' % bestMixRatio 110 | bestMixRatio = 0.5 111 | shape = None 112 | for modelId, model in zip(modelIds, models): 113 | print 'Running model %s' % modelId 114 | modelOutput = nn.test(model, data['testData'][0]) 115 | if shape is None: 116 | shape = modelOutput.shape 117 | else: 118 | modelOutput = modelOutput[:shape[0],:shape[1]] 119 | modelOutputs.append(modelOutput) 120 | 121 | testAvgAll(modelOutputs, bestMixRatio, data, outputFolder) 122 | -------------------------------------------------------------------------------- /models/2_vis_blstm.model.yml: -------------------------------------------------------------------------------- 1 | name: '2_vis_blstm' 2 | costFn: 'crossEntIdx' 3 | decisionFn: 'argmax' 4 | stages: 5 | - 'imgSel' 6 | - 'txtSel' 7 | - 'txtDict' 8 | - 'txtFold' 9 | - 'imgFeat' 10 | - 'imgFeatNorm' 11 | - 'imgMapFirst' 12 | - 'imgMapLast' 13 | - 'imgFoldFirst' 14 | - 'imgFoldLast' 15 | - 'concat' 16 | - 'concatRev' 17 | - 'concatLast' 18 | - 'concatLastRev' 19 | - 'dropoutForward' 20 | - 'dropoutBackward' 
21 | - 'lstmF' 22 | - 'lstmB' 23 | - 'answer' 24 | outputs: 'answer' 25 | specs: 26 | - name: 'imgSel' 27 | type: 'selector' 28 | inputs: 'input' 29 | start: 0 30 | end: 1 31 | axis: 1 32 | - name: 'txtSel' 33 | type: 'selector' 34 | inputs: 'input' 35 | start: 1 36 | end: 56 37 | axis: 1 38 | - name: 'txtDict' 39 | type: 'lut' 40 | intConversion: true 41 | inputs: 'txtSel' 42 | inputDim: 9738 43 | outputDim: 500 44 | initRange: 1.0 45 | initSeed: 2 46 | learningRate: 0.8 47 | momentum: 0.9 48 | gradientClip: 0.1 49 | weightClip: 2000.0 50 | outputdEdX: false 51 | - name: 'txtFold' 52 | type: 'timeFold' 53 | inputs: 'txtDict' 54 | timespan: 55 55 | - name: 'imgFeat' 56 | type: 'lut' 57 | inputs: 'imgSel' 58 | inputDim: 123288 59 | outputDim: 4096 60 | initWeights: '../data/hidden_oxford_mscoco.h5' 61 | sparse: true 62 | format: 'h5' 63 | h5key: 'hidden7' 64 | learningRate: 0.0 65 | outputdEdX: false 66 | - name: 'imgFeatNorm' 67 | type: 'normalize' 68 | inputs: 'imgFeat' 69 | mean: '../data/hidden_oxford_mscoco.h5' 70 | meanKey: 'hidden7_mean' 71 | std: '../data/hidden_oxford_mscoco.h5' 72 | stdKey: 'hidden7_std' 73 | format: 'h5' 74 | outputDim: 4096 75 | - name: 'imgMapFirst' 76 | type: 'map' 77 | inputs: 'imgFeatNorm' 78 | activeFn: 'identity' 79 | outputDim: 500 80 | bias: false 81 | initRange: 0.05 82 | initSeed: 1 83 | learningRate: 0.8 84 | momentum: 0.9 85 | gradientClip: 0.1 86 | weightClip: 100.0 87 | outputdEdX: false 88 | - name: 'imgMapLast' 89 | type: 'map' 90 | inputs: 'imgFeatNorm' 91 | activeFn: 'identity' 92 | outputDim: 500 93 | bias: false 94 | initRange: 0.05 95 | initSeed: 15 96 | learningRate: 0.8 97 | momentum: 0.9 98 | gradientClip: 0.1 99 | weightClip: 100.0 100 | outputdEdX: false 101 | - name: 'imgFoldFirst' 102 | type: 'timeFold' 103 | inputs: 'imgMapFirst' 104 | timespan: 1 105 | - name: 'imgFoldLast' 106 | type: 'timeFold' 107 | inputs: 'imgMapLast' 108 | timespan: 1 109 | - name: 'concat' 110 | type: 'concat' 111 | inputs: 
class Map(Stage):
    """
    Fully-connected (affine) layer followed by an activation function.

    Weights are lazily allocated on the first forward() call so the input
    dimension can be inferred from the data. When bias is enabled, a
    constant-one column is appended to the input and the bias lives in the
    last row of W.
    """
    def __init__(self,
                 outputDim,
                 activeFn,
                 inputNames=None,
                 initRange=1.0,
                 bias=True,
                 biasInitConst=-1.0,
                 initSeed=2,
                 needInit=True,
                 initWeights=0,
                 initType='zeroMean',
                 learningRate=0.0,
                 learningRateAnnealConst=0.0,
                 momentum=0.0,
                 deltaMomentum=0.0,
                 weightClip=0.0,
                 gradientClip=0.0,
                 weightRegConst=0.0,
                 outputdEdX=True,
                 defaultValue=0.0,
                 gpu=use_gpu,
                 name=None):
        # activeFn: activation strategy object with forward()/backward().
        # biasInitConst: bias initial value; negative means "random like
        #   the rest of the weights" (see initWeights()).
        # initType: 'zeroMean' -> U(-initRange/2, initRange/2),
        #   'positive' -> U(0, initRange).
        Stage.__init__(self,
                 name=name,
                 inputNames=inputNames,
                 outputDim=outputDim,
                 defaultValue=defaultValue,
                 learningRate=learningRate,
                 learningRateAnnealConst=learningRateAnnealConst,
                 momentum=momentum,
                 deltaMomentum=deltaMomentum,
                 weightClip=weightClip,
                 gradientClip=gradientClip,
                 weightRegConst=weightRegConst,
                 gpu=gpu,
                 outputdEdX=outputdEdX)
        self.bias = bias
        self.activeFn = activeFn
        self.inputDim = None
        self.random = np.random.RandomState(initSeed)
        if not needInit:
            if self.gpu:
                self.W = gnp.as_garray(initWeights)
            else:
                self.W = initWeights
        else:
            # Lazy initialize the weights until the first data arrives
            self.W = None
        self.initRange = initRange
        self.biasInitConst = biasInitConst
        self.initType = initType
        self.X = 0
        self.Y = 0
        pass

    def initWeights(self):
        """Allocate W of shape (inputDim [+1 bias row], outputDim)."""
        if self.initType == 'zeroMean':
            r0 = -self.initRange/2.0
            r1 = self.initRange/2.0
        elif self.initType == 'positive':
            r0 = 0.0
            r1 = self.initRange
        else:
            raise Exception('Unknown initialization type: ' + self.initType)
        if self.bias:
            if self.biasInitConst >= 0.0:
                # Random weights plus a constant bias row.
                self.W = np.concatenate((self.random.uniform(
                    r0, r1, (self.inputDim, self.outputDim)),
                    np.ones((1, self.outputDim)) * self.biasInitConst), axis=0)
            else:
                # Bias row drawn from the same distribution as the weights.
                self.W = self.random.uniform(
                    r0, r1, (self.inputDim + 1, self.outputDim))
        else:
            self.W = self.random.uniform(
                -self.initRange/2.0, self.initRange/2.0, (self.inputDim, self.outputDim))
        if self.gpu:
            self.W = gpu.as_garray(self.W.astype('float32'))

    def forward(self, X):
        """Return activeFn(X . W), appending a ones column when bias is on."""
        if self.inputDim is None: self.inputDim = X.shape[-1]
        if self.W is None: self.initWeights()
        if self.bias:
            self.X = np.concatenate((X, np.ones((X.shape[0], 1), dtype=X.dtype)), axis=-1)
        else:
            self.X = X
        if self.gpu:
            self.X = gpu.as_garray(self.X.astype('float32'))
            Z = gpu.dot(self.X, self.W)
            Z = Z.as_numpy_array(dtype='float32')
            self.Y = self.activeFn.forward(Z)
        else:
            Z = np.dot(self.X, self.W)
            self.Y = self.activeFn.forward(Z)
        return self.Y

    def backward(self, dEdY):
        """Backprop through the activation, accumulate dEdW, and return dEdX
        (with the bias row stripped) unless outputdEdX is off."""
        dEdZ = self.activeFn.backward(dEdY, self.Y, 0)
        if self.gpu:
            gdEdZ = gpu.as_garray(dEdZ.astype('float32'))
            self.dEdW = gpu.dot(self.X.transpose(), gdEdZ)
            if self.bias:
                # Drop the bias row so dEdX matches the caller's X shape.
                dEdX = gpu.dot(gdEdZ, self.W[:-1, :].transpose())
            else:
                dEdX = gpu.dot(gdEdZ, self.W.transpose())
            dEdX = gpu.as_numpy_array(dEdX)
        else:
            self.dEdW = np.dot(self.X.transpose(), dEdZ)
            if self.bias:
                dEdX = np.dot(dEdZ, self.W[:-1, :].transpose())
            else:
                dEdX = np.dot(dEdZ, self.W.transpose())
        return dEdX if self.outputdEdX else None
5 | 6 | ## Rendered results 7 | Results for each model can be viewed directly at 8 | http://www.cs.toronto.edu/~mren/imageqa/results 9 | 10 | ## Dataset 11 | COCO-QA dataset is released at 12 | http://www.cs.toronto.edu/~mren/imageqa/data/cocoqa 13 | 14 | ## Prerequisites 15 | ### Dependencies 16 | Please install the following dependencies: 17 | * python 2.7 18 | * numpy 19 | * scipy 20 | * hdf5 21 | * h5py (python package for read/write h5 files) 22 | * pyyaml (python pakcage for parse yaml format) 23 | * cuda (optional, if you want to run on GPU) 24 | * cudamat (optional, python wrapper for cuda) 25 | 26 | ### Repository structure 27 | The repository contains the following folders: 28 | * *src*: Source code folder 29 | * *data*: Empty folder, to store dataset 30 | * *results*: Empty folder, to store results 31 | * *models*: Model architecture description files 32 | * *config*: Training loop hyperparameters (batch size, etc.) 33 | 34 | ### Data files 35 | Please download the following files from my server: 36 | * Image features from VGG-19 37 | * http://www.cs.toronto.edu/~mren/imageqa/data/hidden_oxford_mscoco.h5 38 | * about 1.1G 39 | * Encoded COCO-QA dataset 40 | * http://www.cs.toronto.edu/~mren/imageqa/data/cocoqa.zip 41 | * about 5.4M 42 | 43 | After downloading the files, please place *hidden_oxford_mscoco.h5* inside 44 | *data* folder, extract *cocoqa* folder inside *data*. 45 | 46 | Now your data folder should contain the following files: 47 | * *hidden_oxford_mscoco.h5* - the last hidden layer activation from the VGG-19 48 | conv net on the entire MS-COCO dataset. It is stored as a scipy sparse row 49 | matrix format. Each row represents an image. 50 | * *cocoqa/imgid_dict.pkl* - a list telling you which row 51 | corresponding to which original MS-COCO image ID. 52 | * *cocoqa/train.npy* - training set (not including hold-out set) 53 | * *cocoqa/valid.npy* - validation set to determine early stop. 
54 | * *cocoqa/test.npy* - test set 55 | * *cocoqa/qdict.pkl* - question word dictionary 56 | * *cocoqa/ansdict.pkl* - answer class definition 57 | 58 | All numpy files above (train, valid, test) stores two objects, the input data 59 | and the target value. The input data is 3-d matrix, with first dimension to be 60 | number of example, second dimension to be time, third dimension to be feature. 61 | The first time step is the image ID, and later the word ID. The target value is 62 | the answer class ID. The IDs dictionary can be found in qdict.pkl and 63 | ansdict.pkl, which are python pickle files storing the dictionary object. All 64 | unseen words in the test set are encoded as 'UNK' and has its own ID. Note that 65 | the word ID is 1-based, 0 is reserved for empty word, which has a zero word 66 | embedding vector. 67 | 68 | ## Training 69 | 70 | After setting up the dataset, call the following command to train a model. For 71 | IMG+BOW, {model file} is *models/img_bow.model.yml*. VIS+LSTM and 2-VIS+BLSTM 72 | can also be found in the *models* folder. 73 | 74 | ``` 75 | cd src 76 | 77 | GNUMPY_USE_GPU={yes|no} python train.py \ 78 | -model ../models/{model file} \ 79 | -output ../results \ 80 | -data ../data/cocoqa \ 81 | -config ../config/train.yml \ 82 | [-board {gpu board id} (optional)] 83 | ``` 84 | 85 | While training, it will print some statuses, and here is how to decode them: 86 | * N: number of epochs 87 | * T: number of seconds elapsed 88 | * TE: training loss 89 | * TR: accuracy on training set 90 | * VE: validation loss 91 | * VR: accuracy on validation set 92 | * ST: layer name 93 | * GN: euclidean norm of the gradient of the layer 94 | * GC: gradient clip 95 | * WN: euclidean norm of the weights of the layer 96 | * WC: weight clip 97 | 98 | First round it will train using only the training set and validate on the 99 | hold-out set, to determine the number of epoch to train. 
Then it will start 100 | another job to train the training set plus the hold out set together. It will 101 | not print test set performance until everything has been finished. 102 | 103 | ## Reading trained weight matrices 104 | 105 | The weights are stored in results folder named 106 | {model}-{timestamp}/{model}-{timestamp}.w.npy 107 | 108 | If you load the weights in python, it will be a list of arrays. 109 | Non-parameterized layers have a single 0 value in the list. For IMG+BOW model, 110 | there are only 2 non-zero entries, one is the word embedding matrix, and the 111 | other is the softmax weights. The softmax weights have the last row as the 112 | bias. 113 | 114 | For LSTM weights, the weight for the entire LSTM unit is reshaped into one 115 | matrix, 116 | 117 | * W = [W_I, W_F, W_Z, W_O]^T. 118 | 119 | W_I is for the input gate, W_F is for the 120 | forget gate, W_Z is for the input transformation, and W_O is for the output 121 | gate. The weights for each W has the last row as the bias, 122 | i.e. (InDim + 1) x OutDim. 
123 | 124 | * W_I = [W_XI, W_HI, W_CI, b_I]^T 125 | * W_F = [W_XF, W_HF, W_CF, b_F]^T 126 | * W_Z = [W_XZ, W_HZ, b_Z]^T 127 | * W_O = [W_XO, W_HO, W_CO, b_O]^T 128 | -------------------------------------------------------------------------------- /src/nn/lstm_test.py: -------------------------------------------------------------------------------- 1 | from sequential import * 2 | from lstm_old import * 3 | from dropout import * 4 | from reshape import * 5 | from lut import * 6 | from lstm import * 7 | import unittest 8 | 9 | class LSTM_Recurrent_Real_Tests(unittest.TestCase): 10 | def test_all(self): 11 | trainInput = np.loadtxt('lstm_test_input.csv', delimiter=',') 12 | trainInput = trainInput.reshape(trainInput.shape[0], trainInput.shape[1], 1) 13 | trainTarget = np.loadtxt('lstm_test_target.csv', delimiter=',') 14 | trainTarget = trainTarget.reshape(trainTarget.shape[0], 1) 15 | wordEmbed = np.loadtxt('lstm_test_word.csv', delimiter=',') 16 | D = 300 17 | D2 = 10 18 | N = trainInput.shape[0] 19 | Time = trainInput.shape[1] 20 | multiOutput = False 21 | time_unfold = TimeUnfold() 22 | lut = LUT( 23 | inputDim=np.max(trainInput)+1, 24 | outputDim=D, 25 | inputNames=None, 26 | needInit=False, 27 | initWeights=wordEmbed 28 | ) 29 | 30 | time_fold = TimeFold( 31 | timespan=Time, 32 | inputNames=None 33 | ) 34 | 35 | dropout = Dropout( 36 | name='d1', 37 | dropoutRate=0.2, 38 | initSeed=2, 39 | inputNames=None, 40 | outputDim=D2 41 | ) 42 | dropout2 = Dropout( 43 | name='d2', 44 | dropoutRate=0.2, 45 | initSeed=2, 46 | inputNames=None, 47 | outputDim=D2 48 | ) 49 | lstm = LSTM( 50 | name='lstm', 51 | timespan=Time, 52 | inputDim=D, 53 | outputDim=D2, 54 | inputNames=None, 55 | multiOutput=multiOutput, 56 | cutOffZeroEnd=True, 57 | learningRate=0.8, 58 | momentum=0.9, 59 | outputdEdX=True) 60 | 61 | lstm2 = LSTM_Old( 62 | name='lstm', 63 | inputDim=D, 64 | outputDim=D2, 65 | needInit=False, 66 | initRange=0.1, 67 | initSeed=0, 68 | cutOffZeroEnd=True, 69 | 
multiErr=multiOutput, 70 | learningRate=0.8, 71 | momentum=0.9, 72 | outputdEdX=True) 73 | 74 | sig = Map( 75 | name='sig', 76 | outputDim=1, 77 | activeFn=SigmoidActiveFn(), 78 | initRange=0.1, 79 | initSeed=5, 80 | learningRate=0.01, 81 | momentum=0.9, 82 | weightClip=10.0, 83 | gradientClip=0.1, 84 | weightRegConst=0.00005 85 | ) 86 | sig2 = Map( 87 | name='sig', 88 | outputDim=1, 89 | activeFn=SigmoidActiveFn(), 90 | initRange=0.1, 91 | initSeed=5, 92 | learningRate=0.01, 93 | momentum=0.9, 94 | weightClip=10.0, 95 | gradientClip=0.1, 96 | weightRegConst=0.00005 97 | ) 98 | 99 | costFn = crossEntOne 100 | model1 = Sequential( 101 | stages=[ 102 | time_unfold, 103 | lut, 104 | time_fold, 105 | dropout, 106 | lstm, 107 | sig 108 | ] 109 | ) 110 | 111 | model2 = Sequential( 112 | stages=[ 113 | time_unfold, 114 | lut, 115 | time_fold, 116 | dropout2, 117 | lstm2, 118 | sig2 119 | ] 120 | ) 121 | 122 | input_ = trainInput[0:N, 0:Time] 123 | target_ = trainTarget[0:N] 124 | Y1 = model1.forward(input_) 125 | 126 | W = lstm.getWeights() 127 | lstm2.W = W.transpose() 128 | Y2 = model2.forward(input_) 129 | self.chkEqual(Y1, Y2) 130 | 131 | E, dEdY1 = costFn(Y1, target_) 132 | E, dEdY2 = costFn(Y2, target_) 133 | model1.backward(dEdY1) 134 | model2.backward(dEdY2) 135 | 136 | dEdW = lstm.getGradient() 137 | self.chkEqual(dEdW.transpose(), lstm2.dEdW) 138 | lstm.updateWeights() 139 | lstm2.updateWeights() 140 | W = lstm.getWeights() 141 | self.chkEqual(W.transpose(), lstm2.W) 142 | 143 | def chkEqual(self, a, b): 144 | tolerance = 1e-1 145 | a = a.reshape(a.size) 146 | b = b.reshape(b.size) 147 | for i in range(a.size): 148 | if not ((a[i] == 0 and b[i] == 0) or 149 | (np.abs(a[i]) < 1e-7 and np.abs(b[i]) < 1e-7) or 150 | (np.abs(a[i] / b[i] - 1) < tolerance)): 151 | print a[i], b[i], a[i]/b[i] 152 | self.assertTrue( 153 | (a[i] == 0 and b[i] == 0) or 154 | (np.abs(a[i]) < 1e-7 and np.abs(b[i]) < 1e-7) or 155 | (np.abs(a[i] / b[i] - 1) < tolerance)) 156 | 157 | if 
__name__ == '__main__': 158 | unittest.main() -------------------------------------------------------------------------------- /src/nn/func.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def meanSqErr(Y, T, weights=None): 4 | diff = Y - T.reshape(Y.shape) 5 | diff2 = np.sum(np.power(diff, 2), axis=-1) 6 | if weights is not None: 7 | diff2 *= weights 8 | weights = weights.reshape(weights.shape[0], 1) 9 | diff *= weights 10 | E = 0.5 * np.sum(diff2) / float(Y.shape[0]) 11 | dEdY = diff / float(Y.shape[0]) 12 | return E, dEdY 13 | 14 | def hardLimit(Y): 15 | return (Y > 0.5).astype(int) 16 | 17 | def sigmoidFn(X): 18 | return 1 / (1 + np.exp(-X)) 19 | 20 | def crossEntIdx(Y, T, weights=None): 21 | eps = 1e-8 22 | Y2 = Y.reshape(Y.size / Y.shape[-1], Y.shape[-1]) 23 | T2 = T.reshape(T.size) 24 | E = 0.0 25 | dEdY = np.zeros(Y2.shape, float) 26 | if weights is None: 27 | for n in range(0, Y2.shape[0]): 28 | E += -np.log(Y2[n, T2[n]] + eps) 29 | dEdY[n, T2[n]] = -1 / (Y2[n, T2[n]] + eps) 30 | else: 31 | for n in range(0, Y2.shape[0]): 32 | E += -np.log(Y2[n, T2[n]] + eps) * weights[n] 33 | dEdY[n, T2[n]] = -1 / (Y2[n, T2[n]] + eps) * weights[n] 34 | E /= Y2.shape[0] 35 | dEdY /= Y2.shape[0] 36 | dEdY = dEdY.reshape(Y.shape) 37 | return E, dEdY 38 | 39 | def crossEntOneIdx(Y, T, weights=None): 40 | eps = 1e-8 41 | Y2 = Y.reshape(Y.size / Y.shape[-1], Y.shape[-1]) 42 | T2 = T.reshape(T.size) 43 | E = 0.0 44 | dEdY = np.zeros(Y2.shape, float) 45 | if weights is None: 46 | for n in range(0, Y.shape[0]): 47 | E += -np.log(Y2[n, T2[n]] + eps) + np.log(1 - Y2[n, T2[n] + eps]) 48 | E += -np.sum(np.log(1 - Y2[n, :] + eps)) 49 | dEdY[n, :] = 1 / (1 - Y2[n] + eps) 50 | dEdY[n, T2[n]] = -1 / (Y2[n, T2[n]] + eps) 51 | else: 52 | for n in range(0, Y.shape[0]): 53 | E += (-np.log(Y2[n, T2[n]] + eps) + \ 54 | np.log(1 - Y2[n, T2[n] + eps])) * weights[n] 55 | E += (-np.sum(np.log(1 - Y2[n, :] + eps))) * weights[n] 56 | 
dEdY[n, :] = (1 / (1 - Y2[n] + eps)) * weights[n] 57 | dEdY[n, T2[n]] = (-1 / (Y2[n, T2[n]] + eps)) * weights[n] 58 | E /= Y2.shape[0] 59 | dEdY /= Y2.shape[0] 60 | dEdY = dEdY.reshape(Y.shape) 61 | return E, dEdY 62 | 63 | def crossEntOneAccIdx(Y, T, weights=None): 64 | eps = 1e-8 65 | Y2 = Y.reshape(Y.size / Y.shape[-1], Y.shape[-1]) 66 | T2 = T.reshape(T.size) 67 | E = 0.0 68 | dEdY = np.zeros(Y2.shape, float) 69 | if weights is None: 70 | for n in range(0, Y.shape[0]): 71 | t = T2[n] 72 | E += -np.sum(np.log(Y2[n, t + 1:] + eps)) 73 | E += -np.sum(np.log(1 - Y2[n, :t + 1] + eps)) 74 | dEdY[n, t + 1:] = -1 / (Y2[n, t + 1:] + eps) 75 | dEdY[n, :t + 1] = 1/ (1 - Y2[n, :t + 1] + eps) 76 | else: 77 | for n in range(0, Y.shape[0]): 78 | t = T2[n] 79 | E += -np.sum(np.log(Y2[n, t + 1:] + eps)) * weights[n] 80 | E += -np.sum(np.log(1 - Y2[n, :t + 1] + eps)) * weights[n] 81 | dEdY[n, t + 1:] = -1 / (Y2[n, t + 1:] + eps) * weights[n] 82 | dEdY[n, :t + 1] = 1/ (1 - Y2[n, :t + 1] + eps) * weights[n] 83 | E /= Y2.shape[0] 84 | dEdY /= Y2.shape[0] 85 | dEdY = dEdY.reshape(Y.shape) 86 | return E, dEdY 87 | 88 | def crossEntOne(Y, T, weights=None): 89 | eps = 1e-8 90 | T = T.reshape(Y.shape) 91 | cost = -T * np.log(Y + eps) - (1 - T) * np.log(1 - Y + eps) 92 | dcost = -T / (Y + eps) + (1 - T) / (1 - Y + eps) 93 | if weights is not None: 94 | cost *= weights 95 | dcost *= weights.reshape(weights.shape[0], 1) 96 | if len(Y.shape) == 0: 97 | E = cost 98 | dEdY = dcost 99 | else: 100 | E = np.sum(cost) / float(Y.size) 101 | dEdY = dcost / float(Y.size) 102 | return E, dEdY 103 | 104 | def argmax(Y): 105 | return np.argmax(Y, axis=-1) 106 | 107 | def argmaxDiff(Y): 108 | Y2 = Y.reshape(Y.size / Y.shape[-1], Y.shape[-1]) 109 | Ydiff = np.zeros(Y2.shape) 110 | for i in range(Y2.shape[1] - 1): 111 | Ydiff[:, i] = Y2[:, i + 1] - Y2[:, i] 112 | Ydiff2 = np.reshape(Ydiff, Y.shape) 113 | return np.argmax(Ydiff2, axis=-1) 114 | 115 | def meanSqErrEye(Y, T, weights=None): 116 | eye = 
def rankingLoss(Y, T, weights=None):
    """Max-margin ranking loss with margin 0.1.

    Every class whose score comes within the margin of the target class's
    score contributes a hinge penalty; the target's own (always-active)
    margin term is subtracted back out of both the loss and the gradient.

    :param Y: score matrix, one row per example.
    :param T: integer target class index per example.
    :param weights: optional per-example weights.
    :return: (E, dEdY) -- scalar mean loss and gradient with Y's shape.
    """
    margin = 0.1
    numEx = T.size
    dEdY = np.zeros(Y.shape)
    E = 0.0
    for i in range(numEx):
        target = T[i]
        # Hinge for every class: score - targetScore + margin, clipped at 0.
        hinge = Y[i] - Y[i, target] + margin
        active = (hinge > 0).astype(int)
        # Exclude the target's own (trivially active) term from the count.
        numActive = np.sum(active) - 1
        hinge = hinge * active
        dEdY[i] = active
        dEdY[i, target] = -numActive
        if weights is not None:
            hinge *= weights[i]
            dEdY[i] *= weights[i]
        # Remove the margin the target contributed to its own hinge.
        E += np.sum(hinge) - margin
    E /= float(numEx)
    dEdY /= float(numEx)
    return E, dEdY
39 | -d[ata] {dataFolder} 40 | -i[nput] {listFile} 41 | -o[utput] {outputFolder} 42 | [-k {top K answers}] 43 | [-p[icture] {pictureFolder}] 44 | [-r[esults] {resultsFolder}] 45 | [-f[ile] {outputTexFilename}] 46 | [-dataset {daquar/cocoqa}] 47 | [-format {html/latex}] 48 | Parameters: 49 | -m[odel]: Model name and model ID 50 | -d[ata]: Dataset dataFolder 51 | -i[nput]: Adhoc question list filename 52 | -o[utput]: Output folder of the rendered results 53 | -k: Render top-K answers (default 1) 54 | -p[icture]: Picture folder, only required in LaTeX mode (default "img") 55 | -r[esults]: Results folder where trained models are stored (default "../results") 56 | -f[ile]: Output filename, only required in LaTex mode 57 | -dataset: Use DAQUAR/COCO-QA dataset (default "cocoqa") 58 | -format: Set output format to HTML/LaTeX (default "html") 59 | 60 | Input question list format: 61 | QID1,Question1,GroundTruthAnswer1 62 | QID2,Question2,GroundTruthAnswer2 63 | ... 64 | """ 65 | params = ir.parseComparativeParams(sys.argv) 66 | 67 | urlDict = ir.loadImgUrl(params['dataset'], params['dataFolder']) 68 | data = it.loadDataset(params['dataFolder']) 69 | maxlen = data['testData'][0].shape[1] 70 | 71 | print('Parsing input file...') 72 | caption, qids, questions, answers = parseInputFile(params['inputFile']) 73 | idx = np.array(qids, dtype='int') 74 | #inputTestSel = data['testData'][0][idx] 75 | #targetTestSel = data['testData'][1][idx] 76 | imgids = qids 77 | #imgids = inputTestSel[:, 0, 0] 78 | inputTest = prep.combine(\ 79 | prep.lookupQID(questions, data['questionDict'], maxlen), imgids) 80 | targetTest = prep.lookupAnsID(answers, data['ansDict']) 81 | questionTypeArray = data['questionTypeArray'][idx] 82 | 83 | print('Running models...') 84 | modelOutputs = ie.runAllModels( 85 | inputTest, 86 | questionTypeArray, 87 | params['models'], 88 | params['resultsFolder'], 89 | params['dataset'], 90 | params['dataFolder']) 91 | 92 | # Render 93 | if not 
os.path.exists(params['outputFolder']): 94 | os.makedirs(params['outputFolder']) 95 | if params['format'] == 'html': 96 | print('Rendering HTML...') 97 | pages = ir.renderHtml( 98 | inputTest, 99 | targetTest, 100 | data['questionIdict'], 101 | data['ansIdict'], 102 | urlDict, 103 | topK=params['topK'], 104 | modelOutputs=modelOutputs, 105 | modelNames=ir.getModelNames(params['models']), 106 | questionIds=idx) 107 | for i, page in enumerate(pages): 108 | with open(os.path.join(params['outputFolder'], 109 | '%s-%d.html' % (params['outputFilename'], i)), 'w') as f: 110 | f.write(page) 111 | elif params['format'] == 'latex': 112 | # For LaTeX only, replace underscore in vocabulary. 113 | data['questionIdict'] = ir.escapeLatexIdict(data['questionIdict']) 114 | data['ansIdict'] = ir.escapeLatexIdict(data['ansIdict']) 115 | ir.renderLatex( 116 | inputTest, 117 | targetTest, 118 | data['questionIdict'], 119 | data['ansIdict'], 120 | urlDict, 121 | topK=params['topK'], 122 | outputFolder=params['outputFolder'], 123 | pictureFolder=params['pictureFolder'], 124 | comments=None, 125 | caption=caption, 126 | modelOutputs=modelOutputs, 127 | modelNames=ir.getModelNames(params['models']), 128 | questionIds=idx, 129 | filename=params['outputFilename']+'.tex') 130 | -------------------------------------------------------------------------------- /src/nn/container.py: -------------------------------------------------------------------------------- 1 | from active_func import * 2 | from map import * 3 | 4 | class Input(Stage): 5 | def __init__(self, name, outputDim): 6 | Stage.__init__(self, 7 | name=name, 8 | inputNames=[], 9 | outputDim=outputDim) 10 | def setValue(self, value): 11 | self.Y = value 12 | def forward(self, X): 13 | return X 14 | def backward(self, dEdY): 15 | return dEdY 16 | 17 | class Output(Stage): 18 | def __init__(self, name, inputNames, outputDim=0, defaultValue=0): 19 | Stage.__init__(self, 20 | name=name, 21 | inputNames=inputNames, 22 | 
defaultValue=defaultValue, 23 | outputDim=outputDim) 24 | def graphForward(self): 25 | self.Y = self.getInput() 26 | def graphBackward(self): 27 | self.sendError(self.dEdY) 28 | 29 | class Container(Stage): 30 | def __init__(self, 31 | stages, 32 | outputStageNames, 33 | inputDim, 34 | outputDim, 35 | inputNames, 36 | name=None, 37 | outputdEdX=True): 38 | Stage.__init__(self, 39 | name=name, 40 | inputNames=inputNames, 41 | outputDim=outputDim, 42 | outputdEdX=outputdEdX) 43 | self.stages = [] 44 | self.stageDict = {} 45 | self.inputDim = inputDim 46 | self.outputStageNames = outputStageNames 47 | 48 | inputStage = self.createInputStage() 49 | self.stages.append(inputStage) 50 | self.stageDict['input'] = inputStage 51 | 52 | for stage in stages: 53 | self.register(stage) 54 | 55 | outputStage = self.createOutputStage() 56 | self.stages.append(outputStage) 57 | self.stageDict['output'] = outputStage 58 | 59 | self.link() 60 | self.dEdW = [] 61 | for stage in self.stages: 62 | self.dEdW.append(0.0) 63 | 64 | def createInputStage(self): 65 | return Input(name='input', outputDim=self.inputDim) 66 | 67 | def createOutputStage(self): 68 | return Output(name='output', inputNames=self.outputStageNames) 69 | 70 | def register(self, stage): 71 | """ 72 | Register a substage 73 | :param stage: new recurrent substage 74 | :return: 75 | """ 76 | #print stage 77 | if not hasattr(stage, 'used'): 78 | stage.used = False 79 | self.stages.append(stage) 80 | self.stageDict[stage.name] = stage 81 | 82 | def link(self): 83 | """ 84 | Link substages with their input strings 85 | :return: 86 | """ 87 | for stage in self.stages: 88 | for stageName in stage.inputNames: 89 | stageInput = self.stageDict[stageName] 90 | stageInput.used = True 91 | stage.addInput(stageInput) 92 | 93 | def clearError(self): 94 | for stage in self.stages: 95 | stage.clearError() 96 | self.dEdY = 0.0 97 | self.receivedError = False 98 | 99 | def graphForward(self, dropout=True): 100 | self.X = self.getInput() 
101 | self.Y = self.forward(self.X, dropout=dropout) 102 | 103 | #@profile 104 | def forward(self, X, dropout=True): 105 | self.stages[0].Y = X 106 | for s in range(1, len(self.stages) - 1): 107 | if self.stages[s].used: 108 | if hasattr(self.stages[s], 'dropout'): 109 | self.stages[s].dropout = dropout 110 | self.stages[s].graphForward() 111 | elif isinstance(self.stages[s], Container): 112 | self.stages[s].graphForward(dropout=dropout) 113 | else: 114 | self.stages[s].graphForward() 115 | self.stages[-1].graphForward() 116 | Y = self.stages[-1].Y 117 | 118 | # Clear error and ready for next batch 119 | self.clearError() 120 | 121 | self.X = X 122 | return Y 123 | 124 | #@profile 125 | def backward(self, dEdY): 126 | self.stages[-1].sendError(dEdY) 127 | for s in reversed(range(1, len(self.stages) - 1)): 128 | #print 'container backward', self.stages[s].name, self.stages[s].used, self.stages[s].receivedError 129 | if self.stages[s].used and self.stages[s].receivedError: 130 | self.stages[s].graphBackward() 131 | 132 | # Collect input error 133 | if self.outputdEdX: 134 | dEdX = self.stages[0].dEdY 135 | 136 | return dEdX if self.outputdEdX else None 137 | 138 | def updateWeights(self): 139 | for s in range(1, len(self.stages)-1): 140 | # Because all stages are "shallow copied", the weights are shared. 141 | self.stages[s].updateWeights() 142 | 143 | def updateLearningParams(self, numEpoch): 144 | for s in range(1, len(self.stages)-1): 145 | # Since only the first stage updates the weights, 146 | # learning params just need to update in the first stage. 
147 | self.stages[s].updateLearningParams(numEpoch) 148 | 149 | def setGradient(self, value): 150 | if type(value) is float: 151 | for s in range(1, len(self.stages) - 1): 152 | self.stages[s].setGradient(value) 153 | elif type(value) is np.ndarray: 154 | for s in range(1, len(self.stages) - 1): 155 | self.stages[s].setGradient(value[s - 1]) 156 | else: 157 | raise Exception('Unknown type %s for setGradient' % type(value)) 158 | 159 | def getWeights(self): 160 | weights = [] 161 | for s in range(1, len(self.stages)-1): 162 | if self.stages[s].gpu: 163 | weights.append(gpu.as_numpy_array(self.stages[s].getWeights())) 164 | else: 165 | weights.append(self.stages[s].getWeights()) 166 | return np.array(weights, dtype=object) 167 | 168 | def loadWeights(self, W): 169 | for s in range(1, len(self.stages) - 1): 170 | self.stages[s].loadWeights(W[s - 1]) 171 | -------------------------------------------------------------------------------- /src/nn/reshape.py: -------------------------------------------------------------------------------- 1 | from stage import * 2 | 3 | class Reshape(Stage): 4 | def __init__(self, reshapeFn, inputNames=None, outputDim=0, name=None, outputdEdX=True): 5 | Stage.__init__(self, name=name, inputNames=inputNames, outputDim=outputDim, outputdEdX=outputdEdX) 6 | self.reshapeFn = eval('lambda x: ' + reshapeFn) 7 | self.Xshape = 0 8 | 9 | def forward(self, X): 10 | self.Xshape = X.shape 11 | return np.reshape(X, self.reshapeFn(X.shape)) 12 | 13 | def backward(self, dEdY): 14 | if self.outputdEdX: 15 | return np.reshape(dEdY, self.Xshape) 16 | 17 | class TimeUnfold(Reshape): 18 | def __init__(self, inputNames=None, name=None, outputdEdX=True): 19 | Reshape.__init__(self, 20 | name=name, 21 | inputNames=inputNames, 22 | reshapeFn='(x[0] * x[1], x[2])', 23 | outputdEdX=outputdEdX) 24 | 25 | class TimeFold(Reshape): 26 | def __init__(self, timespan, inputNames=None, name=None, outputdEdX=True): 27 | self.timespan = timespan 28 | t = str(self.timespan) 
29 | Reshape.__init__(self, 30 | name=name, 31 | inputNames=inputNames, 32 | reshapeFn='(x[0] / '+t+','+t+', x[1])', 33 | outputdEdX=outputdEdX) 34 | 35 | class TimeReverse(Stage): 36 | def __init__(self, inputNames, outputDim=0, name=None, outputdEdX=True): 37 | Stage.__init__(self, 38 | name=name, 39 | inputNames=inputNames, 40 | outputDim=outputDim, 41 | outputdEdX=outputdEdX) 42 | 43 | def forward(self, X): 44 | #print self.name, X.shape 45 | N = X.shape[0] 46 | self.Xend = np.zeros(N, dtype=int) + X.shape[1] 47 | reachedEnd = np.sum(X, axis=-1) == 0.0 48 | Y = np.zeros(X.shape) 49 | # Scan for the end of the sequence. 50 | for n in range(N): 51 | found = False 52 | for t in range(X.shape[1]): 53 | if reachedEnd[n, t]: 54 | self.Xend[n] = t 55 | if t > 0: 56 | found = True 57 | Y[n, 0:t, :] = X[n, t-1::-1, :] 58 | break 59 | if found == False: 60 | self.Xend[n] = X.shape[1] 61 | Y[n, :, :] = X[n, ::-1, :] 62 | return Y 63 | 64 | def backward(self, dEdY): 65 | if self.outputdEdX: 66 | dEdX = np.zeros(dEdY.shape) 67 | for n in range(dEdY.shape[0]): 68 | t = self.Xend[n] 69 | if t > 0: 70 | dEdX[n, 0:t, :] = dEdY[n, t-1::-1, :] 71 | return dEdX 72 | else: 73 | return None 74 | 75 | class TimeRepeat(Stage): 76 | def __init__(self, numRepeats, inputNames=None, outputDim=0, name=None, outputdEdX=True): 77 | Stage.__init__(self, name=name, inputNames=inputNames, outputDim=outputDim, outputdEdX=outputdEdX) 78 | self.numRepeats = numRepeats 79 | 80 | def forward(self, X): 81 | self.Xshape = X.shape 82 | if len(X.shape) == 2: 83 | X = X.reshape(X.shape[0], 1, X.shape[1]) 84 | return np.tile(X, (1, self.numRepeats, 1)) 85 | 86 | def backward(self, dEdY): 87 | if self.outputdEdX: 88 | dEdY = dEdY.reshape( 89 | dEdY.shape[0], self.numRepeats, dEdY.shape[1] / self.numRepeats, dEdY.shape[2]) 90 | dEdX = np.sum(dEdY, axis=1) 91 | if len(self.Xshape) == 2: 92 | dEdX = dEdX.reshape(dEdX.shape[0], dEdX.shape[-1]) 93 | return dEdX 94 | 95 | class TimeFinal(Stage): 96 | """ 97 | 
Scans and selects the last timestep. 98 | """ 99 | def __init__(self, inputNames, outputDim=0, name=None, outputdEdX=True): 100 | Stage.__init__(self, 101 | name=name, 102 | inputNames=inputNames, 103 | outputDim=outputDim, 104 | outputdEdX=outputdEdX) 105 | self.Xend = 0.0 106 | 107 | def forward(self, X): 108 | N = X.shape[0] 109 | self.X = X 110 | self.Xend = np.zeros(N, dtype=int) + X.shape[1] 111 | reachedEnd = np.sum(X, axis=-1) == 0.0 112 | Y = np.zeros((N, X.shape[-1])) 113 | # Scan for the end of the sequence. 114 | for n in range(N): 115 | for t in range(X.shape[1]): 116 | if reachedEnd[n, t]: 117 | self.Xend[n] = t 118 | break 119 | for n in range(N): 120 | if self.Xend[n] > 0: 121 | Y[n] = X[n, self.Xend[n] - 1] 122 | return Y 123 | 124 | def backward(self, dEdY): 125 | if self.outputdEdX: 126 | dEdX = np.zeros(self.X.shape) 127 | for n in range(dEdY.shape[0]): 128 | if self.Xend[n] > 0: 129 | dEdX[n, self.Xend[n] - 1, :] = dEdY[n] 130 | return dEdX 131 | else: 132 | return None 133 | 134 | class Concat(Stage): 135 | def __init__(self, inputNames, axis, name=None): 136 | Stage.__init__(self, name=name, inputNames=inputNames, outputDim=0) 137 | self.axis = axis 138 | def getInput(self): 139 | if len(self.inputs) > 1: 140 | self.splX = [] 141 | for stage in self.inputs: 142 | X = stage.Y 143 | self.splX.append(X) 144 | return np.concatenate(self.splX, axis=self.axis) 145 | else: 146 | return self.inputs[0].Y 147 | def sendError(self, dEdX): 148 | """ 149 | Iterates over input list and sends dEdX. 
150 | """ 151 | if len(self.inputs) > 1: 152 | s = 0 153 | for stage in self.inputs: 154 | s2 = s + stage.Y.shape[self.axis] 155 | if self.axis == 0: 156 | stage.dEdY += dEdX[s : s2] 157 | elif self.axis == 1: 158 | stage.dEdY += dEdX[:, s : s2] 159 | elif self.axis == 2: 160 | stage.dEdY += dEdX[:, :, s : s2] 161 | s = s2 162 | stage.receivedError = True 163 | else: 164 | self.inputs[0].dEdY += dEdX 165 | self.inputs[0].receivedError = True 166 | 167 | def forward(self, X): 168 | return X 169 | def backward(self, dEdY): 170 | return dEdY 171 | -------------------------------------------------------------------------------- /src/imageqa_compare.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import nn 4 | import numpy as np 5 | import imageqa_test as it 6 | import imageqa_render as ir 7 | import imageqa_ensemble as ie 8 | 9 | nameList = ['object', 'number', 'color', 'location'] 10 | 11 | def getCatName(i): 12 | return nameList[i] 13 | 14 | def getBinName(n): 15 | bin = [] 16 | for k in range(numModels): 17 | bin.append(str(n >> (numModels - k - 1))) 18 | n = n & (~(1 << (numModels - k - 1))) 19 | return ''.join(bin) 20 | 21 | def getName(catName, binName): 22 | return catName + '-' + binName 23 | 24 | def renderIndex(modelNames, numCategories, bins): 25 | htmlList = [] 26 | htmlList.append('
# Question categories, indexed by the category ID used in the dataset.
nameList = ['object', 'number', 'color', 'location']

def getCatName(i):
    """Return the human-readable name of question category *i*."""
    return nameList[i]

def getBinName(n):
    """Render *n* as a fixed-width binary string, MSB first, one bit per model.

    NOTE(review): relies on the module-level global ``numModels`` being
    defined before the first call -- confirm against the __main__ block.
    """
    bits = []
    for k in range(numModels):
        shift = numModels - k - 1
        bits.append(str(n >> shift))
        n = n & (~(1 << shift))
    return ''.join(bits)

def getName(catName, binName):
    """Join a category name and a correctness-bin name into one label."""
    return '-'.join([catName, binName])