├── LICENSE ├── .gitignore ├── fm.py ├── run_movielens.py ├── auto_vfm.py ├── vfm.py └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Christopher Erick Moody 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /fm.py: -------------------------------------------------------------------------------- 1 | from chainer import Chain 2 | from chainer import links as L 3 | from chainer import functions as F 4 | from chainer import reporter 5 | import numpy as np 6 | 7 | 8 | class FM(Chain): 9 | _mask = None 10 | 11 | def __init__(self, n_features=None, n_dim=8, lossfun=F.mean_squared_error, 12 | lambda0=5e-3, lambda1=5e-3, lambda2=5e-3, init_bias=0.0, 13 | intx_term=True, total_nobs=1): 14 | self.n_dim = n_dim 15 | self.n_features = n_features 16 | self.lossfun = lossfun 17 | self.lambda0 = lambda0 18 | self.lambda1 = lambda1 19 | self.lambda2 = lambda2 20 | self.intx_term = intx_term 21 | self.total_nobs = total_nobs 22 | 23 | # These are all the learned weights corresponding 24 | # to the overall bias, slope per feature, and latent 25 | # interaction vector per feature 26 | super(FM, self).__init__(bias=L.Bias(shape=(1,)), 27 | slope=L.EmbedID(n_features, 1), 28 | latent=L.EmbedID(n_features, n_dim)) 29 | 30 | # Xavier initialize weights 31 | c = np.sqrt(n_features * n_dim) 32 | self.latent.W.data[...] = np.random.randn(n_features, n_dim) / c 33 | d = np.sqrt(n_features) 34 | self.slope.W.data[...] = np.random.randn(n_features, 1) / d 35 | self.bias.b.data[...] *= 0.0 36 | self.bias.b.data[...] += init_bias 37 | 38 | def mask(self, bs, nf): 39 | if self._mask is None or self._mask.shape[0] != bs: 40 | mask = self.xp.ones((nf, nf), dtype='float32') 41 | mask -= self.xp.eye(nf, dtype='float32') 42 | masks = self.xp.tile(mask, (bs, 1, 1)) 43 | self._mask = masks 44 | return self._mask 45 | 46 | def forward(self, val, loc, y): 47 | """ Given the sparse feature vector defined by location 48 | integers for the column index and the value at that index. 49 | y ~ c + sum(w_i x_i) + sum_ij( * x_i * x_j) 50 | 51 | Parameters 52 | ---------- 53 | val : array of float 54 | Values in the feature array. Should of shape (batchsize, n_feat_max) 55 | 56 | loc : array of int 57 | Location of the non-zero columns in the sparse vector. 
Should be of 58 | shape (batchsize, n_feat_max) 59 | """ 60 | 61 | bs = val.data.shape[0] 62 | nf = val.data.shape[1] 63 | mask = self.mask(bs, nf) 64 | # Input shape is (batchsize, n_feat_max) and 65 | # v is (batchsize, n_feat_max, n_dim) 66 | vi = self.latent(loc) 67 | # Form square latent interaction matrix of shape 68 | # (batchsize, n_feat_max, n_feat_max) 69 | vij = F.batch_matmul(vi, vi, transb=True) 70 | # Form square observed feature matrix of shape 71 | # (batchsize, n_feat_max, n_feat_max) 72 | xij = F.batch_matmul(val, val, transb=True) 73 | # Slope coupled to each active feature 74 | # loc & self.slope(loc) are shape (batchsize, n_feat_max) 75 | # val is also (batchsize, n_feat_max) 76 | coef = F.reshape(self.slope(loc), val.data.shape) 77 | slop = F.sum(coef * val, axis=1) 78 | # This double sums all of the interaction terms aside 79 | # from the computational burden this shouldn't be a problem. 80 | # TODO: implement the trick in Rendle's paper 81 | # that makes this O(kN) instead of O(kN^2) 82 | intx = F.sum(vij * xij * mask, axis=(1, 2)) * 0.5 83 | # Broadcast to shape of batchsize 84 | bias = F.broadcast_to(self.bias.b, slop.data.shape) 85 | # Optionally choose to include the interaction term 86 | # without this is linear regression 87 | if self.intx_term: 88 | pred = bias + slop + intx 89 | else: 90 | pred = bias + slop 91 | # Compute MSE loss 92 | mse = F.mean_squared_error(pred, y) 93 | rmse = F.sqrt(mse) 94 | # Calculate regularization losses 95 | frac = loc.data.shape[0] * 1.0 / self.total_nobs 96 | reg0 = F.sum(self.bias.b) 97 | reg1 = F.sum(self.slope.W * self.slope.W) 98 | reg2 = F.sum(self.latent.W * self.latent.W) 99 | # Total loss is MSE plus regularization losses 100 | regt = reg0 * self.lambda0 + reg1 * self.lambda1 + reg2 * self.lambda2 101 | loss = mse + regt * frac 102 | # Log the errors 103 | logs = {'loss': loss, 'mse': mse, 'rmse': rmse, 'reg0': reg0, 104 | 'regt': regt, 'reg1': reg1, 'reg2': reg2, 105 | 'bias': F.sum(self.bias.b)} 106 | reporter.report(logs, self) 107 | return loss 108 | 109 | def __call__(self, val, loc, y, dummy=None): 110 | return self.forward(val, loc, y) 111 | -------------------------------------------------------------------------------- /run_movielens.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import argparse 3 | import numpy as np 4 | import requests 5 | from zipfile import ZipFile 6 | from sklearn.model_selection import train_test_split 7 | 8 | import chainer 9 | from chainer import training 10 | from chainer.training import extensions 11 | from chainer.datasets import TupleDataset 12 | 13 | from fm import FM 14 | from vfm import VFM 15 | from auto_vfm import AutoVFM 16 | 17 | # Hyperparameters set through CLI 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--n_dim', dest='n_dim', default=8, type=int) 20 | parser.add_argument('--batchsize', dest='batchsize', default=4096, type=int) 21 | parser.add_argument('--model_type', dest='model_type', default='FM', type=str) 22 | parser.add_argument('--device', dest='device', default=-1, type=int) 23 | parser.add_argument('--lambda0', dest='lambda0', default=1, type=float) 24 | parser.add_argument('--lambda1', dest='lambda1', default=1, type=float) 25 | parser.add_argument('--lambda2', dest='lambda2', default=1, type=float) 26 | parser.add_argument('--intx_term', dest='intx_term', default=1, type=int) 27 | parser.add_argument('--alpha', dest='alpha', default=1e-3, type=float) 28 | 
parser.add_argument('--resume', dest='resume', default=None, type=str) 29 | 30 | # Expand arguments into local variables 31 | args = vars(parser.parse_args()) 32 | print(args) 33 | n_dim = args.pop('n_dim') 34 | batchsize = args.pop('batchsize') 35 | model_type = args.pop('model_type') 36 | device = args.pop('device') 37 | lambda0 = args.pop('lambda0') 38 | lambda1 = args.pop('lambda1') 39 | lambda2 = args.pop('lambda2') 40 | intx_term = args.pop('intx_term') 41 | alpha = args.pop('alpha') 42 | resume = args.pop('resume') 43 | 44 | # Download, unzip and read in the dataset 45 | name = 'ml-1m.zip' 46 | base = 'ml-1m' 47 | if not os.path.exists(name): 48 | url = 'http://files.grouplens.org/datasets/movielens/' + name 49 | r = requests.get(url) 50 | with open(name, 'wb') as fh: 51 | fh.write(r.content) 52 | zip = ZipFile(name) 53 | zip.extractall() 54 | 55 | # First col is user, 2nd is movie id, 3rd is rating 56 | data = np.genfromtxt(base + '/ratings.dat', delimiter='::') 57 | # print("WARNING: Subsetting data") 58 | # data = data[::100, :] 59 | user = data[:, 0].astype('int32') 60 | movie = data[:, 1].astype('int32') 61 | rating = data[:, 2].astype('float32') 62 | n_features = user.max() + 1 + movie.max() + 1 63 | 64 | # Formatting dataset 65 | loc = np.zeros((len(data), 2), dtype='int32') 66 | loc[:, 0] = user 67 | loc[:, 1] = movie + user.max() 68 | val = np.ones((len(data), 2), dtype='float32') 69 | 70 | # Train test split 71 | tloc, vloc, tval, vval, ty, vy = train_test_split(loc, val, rating, 72 | random_state=42) 73 | total_nobs = len(tloc) 74 | train = TupleDataset(tloc, tval, ty) 75 | valid = TupleDataset(vloc, vval, vy) 76 | 77 | # Setup model 78 | print("Running model: " + model_type) 79 | if model_type == 'FM': 80 | model = FM(n_features, n_dim, lambda0=lambda0, lambda1=lambda1, 81 | lambda2=lambda2, init_bias=ty.mean(), intx_term=intx_term, 82 | total_nobs=total_nobs) 83 | elif model_type == 'VFM': 84 | mu = ty.mean() 85 | lv = 0.5 * np.log(ty.std()) 86 | model = VFM(n_features, n_dim, init_bias_mu=mu, init_bias_lv=lv, 87 | total_nobs=total_nobs, lambda1=lambda1, lambda2=lambda2, 88 | lambda0=lambda0) 89 | elif model_type == 'AutoVFM': 90 | mu = ty.mean() 91 | lv = 0.5 * np.log(ty.std()) 92 | model = AutoVFM(n_features, n_dim, init_bias_mu=mu, init_bias_lv=lv, 93 | total_nobs=total_nobs, lambda1=lambda1, lambda2=lambda2, 94 | lambda0=lambda0) 95 | if device >= 0: 96 | chainer.cuda.get_device(device).use() 97 | model.to_gpu(device) 98 | optimizer = chainer.optimizers.Adam(alpha) 99 | optimizer.setup(model) 100 | 101 | 102 | class TestModeEvaluator(extensions.Evaluator): 103 | def evaluate(self): 104 | model = self.get_target('main') 105 | model.train = False 106 | ret = super(TestModeEvaluator, self).evaluate() 107 | model.train = True 108 | return ret 109 | 110 | 111 | # Setup iterators 112 | train_iter = chainer.iterators.SerialIterator(train, batchsize) 113 | valid_iter = chainer.iterators.SerialIterator(valid, batchsize, 114 | repeat=False, shuffle=False) 115 | updater = training.StandardUpdater(train_iter, optimizer, device=device) 116 | trainer = training.Trainer(updater, (500, 'epoch'), out='out_' + str(device)) 117 | 118 | # Setup logging, printing & saving 119 | keys = ['loss', 'rmse', 'bias', 'kld0', 'kld1'] 120 | keys += ['kldg', 'kldi', 'hypg', 'hypi'] 121 | keys += ['hypglv', 'hypilv'] 122 | reports = ['epoch'] 123 | reports += ['main/' + key for key in keys] 124 | reports += ['validation/main/rmse'] 125 | trainer.extend(TestModeEvaluator(valid_iter, model, 
device=device)) 126 | trainer.extend(extensions.Evaluator(valid_iter, model, device=device)) 127 | trainer.extend(extensions.dump_graph('main/loss')) 128 | trainer.extend(extensions.snapshot(), trigger=(10, 'epoch')) 129 | trainer.extend(extensions.LogReport(trigger=(1, 'epoch'))) 130 | trainer.extend(extensions.PrintReport(reports)) 131 | trainer.extend(extensions.ProgressBar(update_interval=10)) 132 | 133 | # If previous model detected, resume 134 | if resume: 135 | print("Loading from {}".format(resume)) 136 | chainer.serializers.load_npz(resume, trainer) 137 | 138 | # Run the model 139 | trainer.run() 140 | -------------------------------------------------------------------------------- /auto_vfm.py: -------------------------------------------------------------------------------- 1 | from chainer import Chain 2 | from chainer import links as L 3 | from chainer import functions as F 4 | from chainer import reporter 5 | from chainer import cuda 6 | import numpy as np 7 | 8 | 9 | def dot(a, b): 10 | """ Simple dot product""" 11 | return F.sum(a * b, axis=-1) 12 | 13 | 14 | def batch_interactions(x): 15 | xp = cuda.get_array_module(x.data) 16 | batchsize = x.shape[0] 17 | shape = (batchsize, x.shape[1] ** 2) 18 | left = xp.tile(x.data, (1, x.shape[1])) 19 | right = xp.repeat(x.data, x.shape[1]).reshape(shape) 20 | return left, right 21 | 22 | 23 | def kl_div(mu1, lv1, lv2): 24 | # KL Divergence between given normal and prior at N(0, sigma_2) 25 | # Prior assumes mean at zero 26 | # lns2 - lns1 + (s2^2 + (u1 - u2)**2)/ 2s2**2 - 0.5 27 | if len(lv1.shape) == 2: 28 | lv1 = F.expand_dims(lv1, 0) 29 | mu1 = F.expand_dims(mu1, 0) 30 | lv2 = F.broadcast_to(lv2, lv1.shape) 31 | v12 = F.exp(lv1)**2.0 32 | v22 = F.exp(lv2)**2.0 33 | return lv2 - lv1 + .5 * v12 / v22 + .5 * mu1**2. / v22 - .5 34 | 35 | 36 | class AutoVFM(Chain): 37 | lv_floor = -100.0 38 | 39 | def __init__(self, n_features=None, n_dim=8, lossfun=F.mean_squared_error, 40 | lambda0=1, lambda1=1, lambda2=1, init_bias_mu=0.0, 41 | init_bias_lv=0.0, intx_term=True, total_nobs=1): 42 | self.n_dim = n_dim 43 | self.n_features = n_features 44 | self.lossfun = lossfun 45 | self.lambda0 = lambda0 46 | self.lambda1 = lambda1 47 | self.lambda2 = lambda2 48 | self.intx_term = intx_term 49 | self.total_nobs = total_nobs 50 | 51 | # In contrast to the FM model, the slopes and latent vectors 52 | # will have means (mu) and log variances (lv) for each component. 53 | ones_3d = (1, 1, 1) 54 | super(AutoVFM, self).__init__(bias_mu=L.Bias(shape=(1,)), 55 | bias_lv=L.Bias(shape=(1,)), 56 | slop_mu=L.Bias(shape=(1, 1)), 57 | slop_lv=L.Bias(shape=(1, 1)), 58 | slop_delta_mu=L.EmbedID(n_features, 1, 59 | ignore_label=-1), 60 | slop_delta_lv=L.EmbedID(n_features, 1, 61 | ignore_label=-1), 62 | feat_mu_vec=L.Bias(shape=(1, 1, n_dim)), 63 | feat_lv_vec=L.Bias(shape=(1, 1, n_dim)), 64 | hyper_feat_lv_vec=L.Bias(shape=ones_3d), 65 | feat_delta_mu=L.EmbedID(n_features, n_dim, 66 | ignore_label=-1), 67 | feat_delta_lv=L.EmbedID(n_features, n_dim, 68 | ignore_label=-1), 69 | hyper_feat_delta_lv=L.Bias(shape=ones_3d)) 70 | 71 | # Xavier initialize weights 72 | c = np.sqrt(n_features * n_dim) * 1e3 73 | d = np.sqrt(n_features) * 1e3 74 | self.feat_delta_mu.W.data[...] = np.random.randn(n_features, n_dim) / c 75 | self.feat_delta_lv.W.data[...] = np.random.randn(n_features, n_dim) / c 76 | self.slop_delta_mu.W.data[...] = np.random.randn(n_features, 1) / d 77 | self.slop_delta_lv.W.data[...] = np.random.randn(n_features, 1) / d 78 | self.bias_mu.b.data[...] 
*= 0.0 79 | self.bias_mu.b.data[...] += init_bias_mu 80 | self.bias_lv.b.data[...] *= 0.0 81 | self.bias_lv.b.data[...] += init_bias_lv 82 | 83 | def term_bias(self, bs, train=True): 84 | """ Compute overall bias and broadcast to shape of batchsize 85 | """ 86 | 87 | shape = (bs, 1,) 88 | # Bias is drawn from a Gaussian with given mu and log variance 89 | bs_mu = F.broadcast_to(self.bias_mu.b, shape) 90 | bs_lv = F.broadcast_to(self.bias_lv.b, shape) 91 | bias = F.flatten(F.gaussian(bs_mu, bs_lv)) 92 | 93 | # Add a very negative log variance so we're sampling 94 | # from a very narrow distribution about the mean. 95 | # Useful for validation dataset when we want to only guess 96 | # the mean. 97 | if not train: 98 | bs_lv += self.lv_floor 99 | 100 | # Compute prior on the bias, so compute the KL div 101 | # from the KL(N(mu_bias, var_bias) | N(0, 1)) 102 | kld = F.gaussian_kl_divergence(self.bias_mu.b, self.bias_lv.b) 103 | return bias, kld 104 | 105 | def term_slop(self, loc, val, bs, nf, train=True): 106 | """ Compute the slope for each active feature. 107 | """ 108 | shape = (bs, nf) 109 | 110 | # Reshape all of our constants 111 | pr_mu = F.broadcast_to(self.slop_mu.b, shape) 112 | pr_lv = F.broadcast_to(self.slop_lv.b, shape) 113 | # This is either zero or a very negative number 114 | # indicating to sample N(mean, logvar) or just draw 115 | # the mean preicsely 116 | if not train: 117 | pr_lv += self.lv_floor 118 | 119 | # The feature slopes are grouped together so that they 120 | # all share a common mean. Then individual features slop_delta_lv 121 | # are shrunk towards zero, which effectively sets features to fall 122 | # back on the group mean. 123 | sl_mu = F.reshape(self.slop_delta_mu(loc), shape) + pr_mu 124 | sl_lv = F.reshape(self.slop_delta_lv(loc), shape) + pr_lv 125 | coef = F.gaussian(sl_mu, sl_lv) 126 | slop = F.sum(coef * val, axis=1) 127 | 128 | # Calculate divergence between group mean and N(0, 1) 129 | kld1 = F.gaussian_kl_divergence(self.slop_mu.b, self.slop_lv.b) 130 | # Calculate divergence of individual delta means and delta vars 131 | args = (self.slop_delta_mu.W, self.slop_delta_lv.W) 132 | kld2 = F.gaussian_kl_divergence(*args) 133 | 134 | return slop, kld1 + kld2 135 | 136 | def term_feat(self, iloc, jloc, ival, jval, bs, nf, train=True): 137 | # Change all of the shapes to form interaction vectors 138 | shape = (bs, nf * 2, self.n_dim) 139 | feat_mu_vec = F.broadcast_to(self.feat_mu_vec.b, shape) 140 | feat_lv_vec = F.broadcast_to(self.feat_lv_vec.b, shape) 141 | if not train: 142 | feat_lv_vec += self.lv_floor 143 | 144 | # Construct the interaction mean and variance 145 | # iloc is (bs, nf), feat(iloc) is (bs, nf, ndim) and 146 | # dot(feat, feat) is (bs, nf) 147 | ivec = F.gaussian(feat_mu_vec + self.feat_delta_mu(iloc), 148 | feat_lv_vec + self.feat_delta_lv(iloc)) 149 | jvec = F.gaussian(feat_mu_vec + self.feat_delta_mu(jloc), 150 | feat_lv_vec + self.feat_delta_lv(jloc)) 151 | # feat is (bs, ) 152 | feat = dot(F.sum(ivec * jvec, axis=2), ival * jval) 153 | 154 | # Compute the KLD for the group mean vector and variance vector 155 | # KL(N(group mu, group lv) || N(0, hyper_lv)) 156 | # hyper_lv ~ gamma(1, 1) 157 | kldg = F.sum(kl_div(self.feat_mu_vec.b, self.feat_lv_vec.b, 158 | self.hyper_feat_lv_vec.b)) 159 | # Compute deviations from hyperprior 160 | # KL(N(delta_i, delta_i lv) || N(0, hyper_delta_lv)) 161 | # hyper_delta_lv ~ gamma(1, 1) 162 | kldi = F.sum(kl_div(self.feat_delta_mu.W, self.feat_delta_lv.W, 163 | self.hyper_feat_delta_lv.b)) 164 | # 
Hyperprior penalty for log(var) ~ Gamma(alpha=1, beta=1) 165 | # Gamma(log(var) | alpha=1, beta=1) = -log(var) 166 | # The loss function will attempt to make log(var) as negative as 167 | # possible which will in turn make the variance as small as possible 168 | # The sum just casts a 1D vector to a scalar 169 | hyperg = -F.sum(self.hyper_feat_lv_vec.b) 170 | hyperi = -F.sum(self.hyper_feat_delta_lv.b) 171 | return feat, kldg, kldi, hyperg, hyperi 172 | 173 | def forward(self, loc, val, y, train=True): 174 | """ Given the sparse feature vector defined by location 175 | integers for the column index and the value at that index. 176 | y ~ c + sum(w_i x_i) + sum_ij( * x_i * x_j) 177 | 178 | Parameters 179 | ---------- 180 | val : array of float 181 | Values in the feature array. Should of shape (batchsize, n_feat_max) 182 | 183 | loc : array of int 184 | Location of the non-zero columns in the sparse vector. Should be of 185 | shape (batchsize, n_feat_max) 186 | 187 | y : array of float 188 | Array of expected outcome. 189 | 190 | train: bool 191 | If True uses the reparameterization trick to estimate variables. 192 | If False, this sets the variance to nearly zero such that 193 | parameters are always set to the mean with no noise, which is useful 194 | at test time. 195 | 196 | """ 197 | bs = val.data.shape[0] 198 | nf = val.data.shape[1] 199 | 200 | iloc, jloc = batch_interactions(loc) 201 | ival, jval = batch_interactions(val) 202 | 203 | # Compute scalar bias term 204 | bias, kld0 = self.term_bias(bs, train=train) 205 | # Compute the feature weights 206 | slop, kld1 = self.term_slop(loc, val, bs, nf, train=train) 207 | # Compute factorized weights on interaction features 208 | feat, kldg, kldi, hypg, hypi = self.term_feat(iloc, jloc, ival, jval, 209 | bs, nf, train=train) 210 | 211 | # Optionally choose to include the interaction term 212 | # without this is linear regression 213 | pred = bias + slop 214 | if self.intx_term: 215 | pred += feat 216 | 217 | return pred, kld0, kld1, kldg, kldi, hypg, hypi 218 | 219 | def __call__(self, loc, val, y, train=True): 220 | bs = val.data.shape[0] 221 | ret = self.forward(loc, val, y, train=train) 222 | pred, kld0, kld1, kldg, kldi, hypg, hypi = ret 223 | 224 | # Compute MSE loss 225 | mse = F.mean_squared_error(pred, y) 226 | rmse = F.sqrt(mse) # Only used for reporting 227 | 228 | # Now compute the total KLD loss 229 | kldt = kld0 * self.lambda0 + kld1 * self.lambda1 230 | kldt += kldg + kldi + hypg + hypi 231 | 232 | # Total loss is MSE plus regularization losses 233 | loss = mse + kldt * (1.0 / self.total_nobs) 234 | 235 | # Log the errors 236 | logs = {'loss': loss, 'rmse': rmse, 'kld0': kld0, 'kld1': kld1, 237 | 'kldg': kldg, 'kldi': kldi, 'hypg': hypg, 'hypi': hypi, 238 | 'hypglv': F.sum(self.hyper_feat_lv_vec.b), 239 | 'hypilv': F.sum(self.hyper_feat_delta_lv.b), 240 | 'kldt': kldt, 'bias': F.sum(self.bias_mu.b)} 241 | reporter.report(logs, self) 242 | return loss 243 | -------------------------------------------------------------------------------- /vfm.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | from chainer import training 3 | from chainer.training import extensions 4 | from chainer.datasets import TupleDataset 5 | 6 | from chainer import Chain 7 | from chainer import links as L 8 | from chainer import functions as F 9 | from chainer import reporter 10 | from chainer import cuda 11 | import numpy as np 12 | 13 | 14 | def dot(a, b): 15 | """ Simple dot product""" 16 | return 
F.sum(a * b, axis=-1) 17 | 18 | 19 | def batch_interactions(x): 20 | xp = cuda.get_array_module(x.data) 21 | batchsize = x.shape[0] 22 | shape = (batchsize, x.shape[1] ** 2) 23 | left = xp.tile(x.data, (1, x.shape[1])) 24 | right = xp.repeat(x.data, x.shape[1]).reshape(shape) 25 | return left, right 26 | 27 | 28 | class VFM(Chain): 29 | lv_floor = -100.0 30 | 31 | def __init__(self, n_features=None, n_dim=8, lossfun=F.mean_squared_error, 32 | lambda0=1, lambda1=1, lambda2=1, init_bias_mu=0.0, 33 | init_bias_lv=0.0, intx_term=True, total_nobs=1): 34 | self.n_dim = n_dim 35 | self.n_features = n_features 36 | self.lossfun = lossfun 37 | self.lambda0 = lambda0 38 | self.lambda1 = lambda1 39 | self.lambda2 = lambda2 40 | self.intx_term = intx_term 41 | self.total_nobs = total_nobs 42 | 43 | # In contrast to the FM model, the slopes and latent vectors 44 | # will have means (mu) and log variances (lv) for each component. 45 | super(VFM, self).__init__(bias_mu=L.Bias(shape=(1,)), 46 | bias_lv=L.Bias(shape=(1,)), 47 | slop_mu=L.Bias(shape=(1, 1)), 48 | slop_lv=L.Bias(shape=(1, 1)), 49 | slop_delta_mu=L.EmbedID(n_features, 1, 50 | ignore_label=-1), 51 | slop_delta_lv=L.EmbedID(n_features, 1, 52 | ignore_label=-1), 53 | feat_mu_vec=L.Bias(shape=(1, 1, n_dim)), 54 | feat_lv_vec=L.Bias(shape=(1, 1, n_dim)), 55 | feat_delta_mu=L.EmbedID(n_features, n_dim, 56 | ignore_label=-1), 57 | feat_delta_lv=L.EmbedID(n_features, n_dim, 58 | ignore_label=-1)) 59 | 60 | # Xavier initialize weights 61 | c = np.sqrt(n_features * n_dim) * 1e3 62 | d = np.sqrt(n_features) * 1e3 63 | self.feat_delta_mu.W.data[...] = np.random.randn(n_features, n_dim) / c 64 | self.feat_delta_lv.W.data[...] = np.random.randn(n_features, n_dim) / c 65 | self.slop_delta_mu.W.data[...] = np.random.randn(n_features, 1) / d 66 | self.slop_delta_lv.W.data[...] = np.random.randn(n_features, 1) / d 67 | self.bias_mu.b.data[...] *= 0.0 68 | self.bias_mu.b.data[...] += init_bias_mu 69 | self.bias_lv.b.data[...] *= 0.0 70 | self.bias_lv.b.data[...] += init_bias_lv 71 | 72 | def term_bias(self, bs, train=True): 73 | """ Compute overall bias and broadcast to shape of batchsize 74 | """ 75 | 76 | shape = (bs, 1,) 77 | # Bias is drawn from a Gaussian with given mu and log variance 78 | bs_mu = F.broadcast_to(self.bias_mu.b, shape) 79 | bs_lv = F.broadcast_to(self.bias_lv.b, shape) 80 | bias = F.flatten(F.gaussian(bs_mu, bs_lv)) 81 | 82 | # Add a very negative log variance so we're sampling 83 | # from a very narrow distribution about the mean. 84 | # Useful for validation dataset when we want to only guess 85 | # the mean. 86 | if not train: 87 | bs_lv += self.lv_floor 88 | 89 | # Compute prior on the bias, so compute the KL div 90 | # from the KL(N(mu_bias, var_bias) | N(0, 1)) 91 | kld = F.gaussian_kl_divergence(self.bias_mu.b, self.bias_lv.b) 92 | return bias, kld 93 | 94 | def term_slop(self, loc, val, bs, nf, train=True): 95 | """ Compute the slope for each active feature. 96 | """ 97 | shape = (bs, nf) 98 | 99 | # Reshape all of our constants 100 | pr_mu = F.broadcast_to(self.slop_mu.b, shape) 101 | pr_lv = F.broadcast_to(self.slop_lv.b, shape) 102 | # This is either zero or a very negative number 103 | # indicating to sample N(mean, logvar) or just draw 104 | # the mean preicsely 105 | if not train: 106 | pr_lv += self.lv_floor 107 | 108 | # The feature slopes are grouped together so that they 109 | # all share a common mean. 
Then individual features slop_delta_lv 110 | # are shrunk towards zero, which effectively sets features to fall 111 | # back on the group mean. 112 | sl_mu = F.reshape(self.slop_delta_mu(loc), shape) + pr_mu 113 | sl_lv = F.reshape(self.slop_delta_lv(loc), shape) + pr_lv 114 | coef = F.gaussian(sl_mu, sl_lv) 115 | slop = F.sum(coef * val, axis=1) 116 | 117 | # Calculate divergence between group mean and N(0, 1) 118 | kld1 = F.gaussian_kl_divergence(self.slop_mu.b, self.slop_lv.b) 119 | # Calculate divergence of individual delta means and delta vars 120 | args = (self.slop_delta_mu.W, self.slop_delta_lv.W) 121 | kld2 = F.gaussian_kl_divergence(*args) 122 | 123 | return slop, kld1 + kld2 124 | 125 | def term_feat(self, iloc, jloc, ival, jval, bs, nf, train=True): 126 | # Change all of the shapes to form interaction vectors 127 | shape = (bs, nf * 2, self.n_dim) 128 | feat_mu_vec = F.broadcast_to(self.feat_mu_vec.b, shape) 129 | feat_lv_vec = F.broadcast_to(self.feat_lv_vec.b, shape) 130 | if not train: 131 | feat_lv_vec += self.lv_floor 132 | 133 | # Construct the interaction mean and variance 134 | # iloc is (bs, nf), feat(iloc) is (bs, nf, ndim) and 135 | # dot(feat, feat) is (bs, nf) 136 | ivec = F.gaussian(feat_mu_vec + self.feat_delta_mu(iloc), 137 | feat_lv_vec + self.feat_delta_lv(iloc)) 138 | jvec = F.gaussian(feat_mu_vec + self.feat_delta_mu(jloc), 139 | feat_lv_vec + self.feat_delta_lv(jloc)) 140 | # feat is (bs, ) 141 | feat = dot(F.sum(ivec * jvec, axis=2), ival * jval) 142 | 143 | # Compute the KLD for the group mean vector and variance vector 144 | kld1 = F.gaussian_kl_divergence(self.feat_mu_vec.b, self.feat_lv_vec.b) 145 | # Compute the KLD for vector deviations from the group mean and var 146 | kld2 = F.gaussian_kl_divergence(self.feat_delta_mu.W, 147 | self.feat_delta_lv.W) 148 | return feat, kld1 + kld2 149 | 150 | def forward(self, loc, val, y, train=True): 151 | """ Given the sparse feature vector defined by location 152 | integers for the column index and the value at that index. 153 | y ~ c + sum(w_i x_i) + sum_ij( * x_i * x_j) 154 | 155 | Parameters 156 | ---------- 157 | val : array of float 158 | Values in the feature array. Should of shape (batchsize, n_feat_max) 159 | 160 | loc : array of int 161 | Location of the non-zero columns in the sparse vector. Should be of 162 | shape (batchsize, n_feat_max) 163 | 164 | y : array of float 165 | Array of expected outcome. 166 | 167 | train: bool 168 | If True uses the reparameterization trick to estimate variables. 169 | If False, this sets the variance to nearly zero such that 170 | parameters are always set to the mean with no noise, which is useful 171 | at test time. 
172 | 173 | """ 174 | bs = val.data.shape[0] 175 | nf = val.data.shape[1] 176 | 177 | iloc, jloc = batch_interactions(loc) 178 | ival, jval = batch_interactions(val) 179 | 180 | # Compute scalar bias term 181 | bias, kld0 = self.term_bias(bs, train=train) 182 | # Compute the feature weights 183 | slop, kld1 = self.term_slop(loc, val, bs, nf, train=train) 184 | # Compute factorized weights on interaction features 185 | feat, kld2 = self.term_feat(iloc, jloc, ival, jval, 186 | bs, nf, train=train) 187 | 188 | # Optionally choose to include the interaction term 189 | # without this is linear regression 190 | pred = bias + slop 191 | if self.intx_term: 192 | pred += feat 193 | 194 | return pred, kld0, kld1, kld2 195 | 196 | def __call__(self, loc, val, y, train=True): 197 | bs = val.data.shape[0] 198 | pred, kld0, kld1, kld2 = self.forward(loc, val, y, train=train) 199 | 200 | # Compute MSE loss 201 | mse = F.mean_squared_error(pred, y) 202 | rmse = F.sqrt(mse) # Only used for reporting 203 | 204 | # Now compute the total KLD loss 205 | kldt = kld0 * self.lambda0 + kld1 * self.lambda1 + kld2 * self.lambda2 206 | 207 | # Total loss is MSE plus regularization losses 208 | loss = mse + kldt * (1.0 / self.total_nobs) 209 | 210 | # Log the errors 211 | logs = {'loss': loss, 'rmse': rmse, 'kld0': kld0, 'kld1': kld1, 212 | 'kld2': kld2, 'kldt': kldt, 'bias': F.sum(self.bias_mu.b)} 213 | reporter.report(logs, self) 214 | return loss 215 | 216 | 217 | class TestModeEvaluator(extensions.Evaluator): 218 | def evaluate(self): 219 | model = self.get_target('main') 220 | model.train = False 221 | ret = super(TestModeEvaluator, self).evaluate() 222 | model.train = True 223 | return ret 224 | 225 | 226 | def fit(model, train, valid, device=-1, batchsize=4096, n_epoch=500, 227 | resume=None, alpha=1e-3): 228 | if device >= 0: 229 | chainer.cuda.get_device(device).use() 230 | model.to_gpu(device) 231 | optimizer = chainer.optimizers.Adam(alpha) 232 | optimizer.setup(model) 233 | 234 | # Setup iterators 235 | train_iter = chainer.iterators.SerialIterator(train, batchsize) 236 | valid_iter = chainer.iterators.SerialIterator(valid, batchsize, 237 | repeat=False, shuffle=False) 238 | updater = training.StandardUpdater(train_iter, optimizer, device=device) 239 | trainer = training.Trainer(updater, (n_epoch, 'epoch'), 240 | out='out_' + str(device)) 241 | 242 | # Setup logging, printing & saving 243 | keys = ['loss', 'rmse', 'bias', 'kld0', 'kld1'] 244 | keys += ['kldg', 'kldi', 'hypg', 'hypi'] 245 | keys += ['hypglv', 'hypilv'] 246 | reports = ['epoch'] 247 | reports += ['main/' + key for key in keys] 248 | reports += ['validation/main/rmse'] 249 | trainer.extend(TestModeEvaluator(valid_iter, model, device=device)) 250 | trainer.extend(extensions.Evaluator(valid_iter, model, device=device)) 251 | trainer.extend(extensions.dump_graph('main/loss')) 252 | trainer.extend(extensions.snapshot(), trigger=(10, 'epoch')) 253 | trainer.extend(extensions.LogReport(trigger=(1, 'epoch'))) 254 | trainer.extend(extensions.PrintReport(reports)) 255 | trainer.extend(extensions.ProgressBar(update_interval=10)) 256 | 257 | # If previous model detected, resume 258 | if resume: 259 | print("Loading from {}".format(resume)) 260 | chainer.serializers.load_npz(resume, trainer) 261 | 262 | # Run the model 263 | trainer.run() 264 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Chainer Variational Factorization 
Machine implementation. 2 | 3 | Run `python run_movielens.py` to download, extract, and run the FM model 4 | on MovieLens 1M data. 5 | 6 | 7 | | model | batchsize | rank |intx term | lambda0 | lambda1 | lambda2 | RMSE | Notes | 8 | |-----------|-----------|------|----------|---------|---------|---------| -------| ----- | 9 | | FM | 8192 | 0 | N |0 | 1e-2 | 0 | 0.9305 | Regression with regularization | 10 | | FM | 8192 | 0 | N |0 | 0 | 0 | 0.9115 | Regression with no regularization | 11 | | FM | 8192 | 0 | N |0 | 1e-3 | 0 | 0.9112 | Regression with less regularization | 12 | | FM | 8192 | 20 | Y |0 | 0 | 1e-3 | 0.8633 | FM model w/ 20D latent vector | 13 | | FM | 8192 | 20 | Y |0 | 1e-3 | 1e-3 | 0.8618 | FM model w/ 20D latent vector and regularization | 14 | |VFM | 8192 | 20 | Y |0 | 1e-3 | 1e-3 | 0.8625 | Variational FM model with arbitrary regularization| 15 | |VFM | 8192 | 20 | Y |1 | 1 | 1 | 0.8620 | Variational FM model with default priors| 16 | |VFM | 8192 | 20 | Y |1 | 1 | 1 | 0.8585 | Variational FM model with grouping| 17 | |VFM | 8192 | 64 | Y |1 | 1 | 1 | 0.8800 | Higher rank model does worse| 18 | 19 | After Dec 19: 20 | 21 | | model | batchsize | rank |intx term | lambda0 | lambda1 | lambda2 | RMSE | Notes | 22 | |-----------|-----------|------|----------|---------|---------|---------| -------| ----- | 23 | |VFM | 4096 | 8 | Y | 0 | 0 | 0 | 0.8782 | no regularization | 24 | |VFM | 4096 | 8 | Y | 0 | 1 | 1 | 0.8775 | | 25 | |VFM | 4096 | 8 | Y | 1 | 1 | 1 | 0.8870 | with alpha=1e-2, fast but inaccurate | 26 | |VFM | 4096 | 8 | Y | 10 | 10 | 10 | 0.8628 | more regularization than default | 27 | |VFM | 4096 | 8 | Y | 1 | 1 | 1 | 0.8805 | default, initialized from 10-10-10 run for 200->500 epochs | 28 | |VFM | 4096 | 8 | Y | 10 | 1 | 1 | 0.8805 | default, initialized from 10-10-10 run for 200->500 epochs | 29 | |VFM | 4096 | 8 | Y | 1 | 10 | 1 | 0.8793 | default, initialized from 10-10-10 run for 200->500 epochs | 30 | |VFM | 4096 | 8 | Y | 1 | 1 | 10 | 0.8623 | default, initialized from 10-10-10 run for 200->500 epochs | 31 | |VFM | 4096 | 8 | Y | 10 | 10 | 10 | 0.8619 | added 300 epochs to 10-10-10 run | 32 | |VFM | 4096 | 8 | Y | 0 | 1 | 10 | 0.8629 | default, initialized from 10-10-10 run for 200->500 epochs | 33 | |VFM | 4096 | 8 | Y | 0 | 0 | 10 | 0.8793 | default, initialized from 10-10-10 run for 200->500 epochs | 34 | |VFM | 4096 | 8 | Y | 0 | 0 | 1 | 0.8815 | default, initialized from 10-10-10 run for 200->500 epochs | 35 | |VFM | 4096 | 8 | Y | 1 | 1 | 50 | 0.8561 | default, initialized from 10-10-10 run for 200->500 epochs | 36 | |VFM | 4096 | 8 | Y | 0 | 1 | 50 | 0.8561 | default, initialized from 10-10-10 run for 200->500 epochs | 37 | |VFM | 4096 | 8 | Y | 0 | 1 | 100 | 0.8672 | default, initialized from 10-10-10 run for 200->500 epochs | 38 | |VFM | 4096 | 8 | Y | 1 | 1 | 100 | 0.8673 | default, initialized from 10-10-10 run for 200->500 epochs | 39 | |VFM | 4096 | 8 | Y | 100 | 100 | 100 | 0.8708 | initialized from 10-10-10 model | 40 | 41 | Yamada [1] reports the following root mean squared errors (RMSE) on a 25% 42 | test set of the same ML-1M dataset: 43 | 44 | | Model | RMSE | 45 | |-------------------| ------| 46 | | libFM ALS | 0.981 | 47 | | libFM SGD | 0.943 | 48 | | libFM MCMC 0.05 | 0.877 | 49 | | CFM | 0.866 | 50 | | CFM (BCD) | 0.850 | 51 | | libFM MCMC 0.10 | 0.846 | 52 | 53 | [1] https://arxiv.org/pdf/1507.01073.pdf 54 | 55 | # Discussion 56 | 57 | Within the Variational FM framework we get more than a good point estimate: 58 | we can 
get an estimate of the mean and variance of a single feature. This 59 | means we can estimate the variance conditioned on a few active features (e.g. 60 | conditioned on a single user) and retrieve the most uncertain item for that 61 | user. The idea here is to switch inference from a gradient descent model (which makes 62 | point estimates) to variational stochastic gradient descent (which estimates approximate 63 | posteriors) in order to build an active learning model. 64 | 65 | ## Regression 66 | 67 | For typical linear regression with interactions we have: 68 | 69 | ![eq1](https://latex.codecogs.com/gif.latex?%5Cdpi%7B300%7D%20y%20%5Csim%20c%20+%20%5CSigma_i%20%5Cbeta_i%20x_i%20+%20%5CSigma_%7Bij%7D%20w_%7Bij%7D%20x_i%20x_j) 70 | 71 | [//]: # ( y \sim c + \Sigma_i \beta_i x_i + \Sigma_{ij} w_{ij} x_i x_j) 72 | 73 | 74 | Note that `x_i` is usually a sparse feature vector (but doesn't have to be). In the land of recommenders, we're usually interested in the coefficient `w_ij` in front of an interaction such as `x_i x_j` where `x_i` might be a dummy-encoded user id and `x_j` is an item_id. The big problem here is that `w_ij` is quadratic in the number of features (e.g. # of users + # of items), so there are lots of parameters to estimate with sparse observations. (Note: we've also left off any regularization, but might choose to L2 penalize `w_ij` or `beta_i`.) 75 | 76 | ## FMs 77 | 78 | FMs fix this by doing a low-rank approximation to `w_ij` by saying that `w_ij = v_i0 * v_j0 + ... + v_ik * v_jk` where each feature `i` has a latent rank-k vector `v_i`. Instead of computing an N x N `w_ij` matrix, we compute N x k parameters in the form of N `v_i` vectors, yielding a new objective function: 79 | 80 | ![eq2](https://latex.codecogs.com/gif.latex?%5Cdpi%7B300%7D%20y%20%5Csim%20c%20+%20%5CSigma_i%20%5Cbeta_i%20x_i%20+%20%5CSigma_%7Bij%7D%20%5Cvec%7Bv_i%7D%20%5Ccdot%20%5Cvec%7Bv_j%7D%20x_i%20x_j) 81 | 82 | [//]: # ( y \sim c + \Sigma_i \beta_i x_i + \Sigma_{ij} \vec{v_i} \cdot \vec{v_j} x_i x_j) 83 | 84 | ## VFMs 85 | 86 | In variational FMs we impose a bit more hierarchy by grouping feature vectors and swapping out L2 regularization for Gaussian priors: 87 | 88 | ![eq3](https://latex.codecogs.com/gif.latex?%5Cinline%20%5Cdpi%7B300%7D%20%5Cbeta_i%20%5Csim%20%5Cmathcal%7BN%7D%28%20%5Cmu_%5Cbeta%2C%20%5Csigma_%5Cbeta%29) 89 | 90 | 91 | ![eq3b](https://latex.codecogs.com/gif.latex?%5Cinline%20%5Cdpi%7B300%7D%20%5Cvec%7Bv_i%7D%20%5Csim%20%5Cmathcal%7BN%7D%28%20%5Cvec%7B%5Cmu_v%7D%2C%20%5Cvec%7B%5Csigma%7D_v%29%29) 92 | 93 | [//]: # (\beta_i \sim \mathcal{N}( \mu_\beta, \sigma_\beta)) 94 | [//]: # (\vec{v_i} \sim \mathcal{N}( \vec{\mu_v}, \vec{\sigma}_v))) 95 | 96 | We then group these (hyper)priors together, assuming a normal prior with unit variance on the group-level parameters. The vectors `v_i` are drawn from a multivariate prior with a diagonal covariance matrix. The assumption is that there's a group feature-vector, and individual feature vectors need evidence to deviate from that group vector. The log-normal prior on the variance isn't the disciplined choice (inverse Wishart I think?) but it is convenient and amenable to Stochastic Variational Bayes inference. 
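In code, this hierarchy boils down to adding a shared group mean and log-variance to small per-feature deltas, sampling coefficients with the reparameterization trick (`mu + exp(0.5 * log_var) * eps`), and penalizing the deltas with a KL term so they shrink toward the group. Below is a minimal NumPy sketch of that idea; the helper names and toy numbers are illustrative only (`vfm.py` does the equivalent with Chainer's `F.gaussian` and `F.gaussian_kl_divergence`):

```python
import numpy as np

def sample_coef(group_mu, group_lv, delta_mu, delta_lv, rng):
    """Reparameterized draw: coef = mu + exp(0.5 * log_var) * eps."""
    mu = group_mu + delta_mu      # per-feature mean = group mean + deviation
    lv = group_lv + delta_lv      # per-feature log-variance
    eps = rng.standard_normal(mu.shape)
    return mu + np.exp(0.5 * lv) * eps

def kl_to_std_normal(mu, lv):
    """KL( N(mu, exp(lv)) || N(0, 1) ), summed over parameters."""
    return 0.5 * np.sum(np.exp(lv) + mu ** 2 - lv - 1.0)

rng = np.random.default_rng(0)
n_features = 5
group_mu, group_lv = 0.1, -2.0                      # shared across features
delta_mu = rng.normal(scale=0.01, size=n_features)  # per-feature deviations
delta_lv = np.full(n_features, -3.0)

coef = sample_coef(group_mu, group_lv, delta_mu, delta_lv, rng)
penalty = kl_to_std_normal(delta_mu, delta_lv)      # shrinks deltas to zero
print(coef, penalty)
```

Because the KL penalty acts on the deltas, a feature with little data stays near the group mean, while a feature with strong evidence can afford the penalty and move away from it.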
97 | 98 | ![eq3](https://latex.codecogs.com/gif.latex?%5Cinline%20%5Cdpi%7B300%7D%20%5Cmu_%5Cbeta%20%5Csim%20%5Cmathcal%7BN%7D%280%2C%201%29%5C%5C%20log%5Csigma_%5Cbeta%20%5Csim%20%5Cmathcal%7BN%7D%280%2C%201%29) 99 | 100 | [//]: # (\mu_\beta \sim \mathcal{N}(0, 1)) 101 | [//]: # (log\sigma_\beta \sim \mathcal{N}(0, 1)) 102 | 103 | As you can see in the results table, shrinking to the groups improves test set validation scores (.8620 --> .8580). 104 | 105 | This forms a deep model: the hyperpriors `mu_b` and `sigma_b` pick the group mean and group variance from which individual `beta_i` and `v_i` are drawn. In variational inference, those `beta_i` and `v_i` in turn have their own means and variances, so that we're not just point estimating `beta_i` but in fact estimate `mu_beta_i` and `sigma_beta_i`. If you're curious how this mode of inference works, read [this](http://blog.shakirm.com/2015/10/machine-learning-trick-of-the-day-4-reparameterisation-tricks/) or [this for the trick in 140 characters](https://twitter.com/ryan_p_adams/status/663049108689715200) -- it's at the heart of Bayesian deep learning techniques. 106 | 107 | ## Model uncertainty 108 | 109 | With estimates of `mu_v_i = E[v_i]` and `sigma_v_i = Var[v_i]` we finally get the critical ingredient to do active learning on FMs -- an uncertainty estimate around the feature vector `v_i`. But we need the uncertainty for the whole model, which is composed of interactions on `v_i`: 110 | 111 | ![eq4](https://latex.codecogs.com/gif.latex?%5Cdpi%7B300%7D%20var%5Bv_i%20v_j%5D%20%3D%20%5Csigma_%7Bv_i%7D%20%5Csigma_%7Bv_j%7D%20+%20%5Csigma_%7Bv_i%7D%20%5Cmu_%7Bv_j%7D%20+%20%5Csigma_%7Bv_j%7D%20%5Cmu_%7Bv_i%7D) 112 | 113 | [//]: # (var[v_i v_j] = \sigma_{v_i} \sigma_{v_j} + \sigma_{v_i} \mu_{v_j} + \sigma_{v_j} \mu_{v_i}) 114 | 115 | Note that the above is just the identity for the product of two independent random variables. 
Technically `v_i` is a vector, but the components are independent so replace that above `v_i` with an arbitrary component of that vector: 116 | 117 | ![eq4b](https://latex.codecogs.com/gif.latex?%5Cdpi%7B300%7D%20var%5Bv_i%20v_j%5D%20%3D%20%5Cvec%7B%5Csigma_%7Bv_i%7D%7D%20%5Ccdot%20%5Cvec%7B%5Csigma_%7Bv_j%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_i%7D%7D%20%5Ccdot%20%5Cvec%7B%5Cmu_%7Bv_j%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_j%7D%7D%20%5Ccdot%20%5Cvec%7B%20%5Cmu_%7Bv_i%7D%7D) 118 | 119 | [//]: # (var[v_i v_j] = \vec{\sigma_{v_i}} \cdot \vec{\sigma_{v_j}} + \vec{\sigma_{v_i}} \cdot \vec{\mu_{v_j}} + \vec{\sigma_{v_j}} \cdot \vec{ \mu_{v_i}}) 120 | 121 | The variances of the `beta` components do not covary with the `v_i` components, so the full model variance is decomposes into the sum of the individual variances: 122 | 123 | ![eq5](https://latex.codecogs.com/gif.latex?%5Cdpi%7B300%7D%20var%5Bc%20+%20%5CSigma_i%5Cbeta_i%20x_i%20+%20%5CSigma_%7Bij%7D%20v_i%20v_j%20x_i%20x_j%5D%20%3D%5C%5C%20%5Csigma_c%20+%20%5CSigma_i%20var%5B%5Cbeta_i%5D%20x_i%20+%20%5CSigma_%7Bij%7D%20var%5Bv_i%20v_j%5D%20x_i%20x_j%20%3D%20%5C%5C%20%5Csigma_c%20+%5CSigma_i%20%5Csigma_%7B%5Cbeta_i%7D%20x_i%20+%20%5CSigma_%7Bij%7D%20%5B%5Cvec%7B%5Csigma_%7Bv_i%7D%7D%20%5Ccdot%20%5Cvec%7B%5Csigma_%7Bv_j%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_i%7D%7D%20%5Ccdot%20%5Cvec%7B%5Cmu_%7Bv_j%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_j%7D%7D%20%5Ccdot%20%5Cvec%7B%20%5Cmu_%7Bv_i%7D%7D%5D%20x_i%20x_j) 124 | 125 | [//]: # ( var[c + \Sigma_i\beta_i x_i + \Sigma_{ij} v_i v_j x_i x_j] =\\ \sigma_c + \Sigma_i var[\beta_i] x_i + \Sigma_{ij} var[v_i v_j] x_i x_j = \\ \sigma_c +\Sigma_i \sigma_{\beta_i} x_i + \Sigma_{ij} [\vec{\sigma_{v_i}} \cdot \vec{\sigma_{v_j}} + \vec{\sigma_{v_i}} \cdot \vec{\mu_{v_j}} + \vec{\sigma_{v_j}} \cdot \vec{ \mu_{v_i}}] x_i x_j ) 126 | 127 | We've used the fact that `beta` and `v_i` are independent to sum the variances independently. 128 | 129 | So in picking the next question we can rank by the above measure to get the highest variance question. The observation features `x_i x_j` are known for each trial (they're just usually the user ID and item ID) and the means `mu` and variances `sigma` are easily accessible model parameters. 130 | 131 | ## Example 132 | 133 | For a concrete example, we may be interested in user 19 (e.g. `x_19=1`) and want to know how uncertain we are on what item 45 might be rated (if there are 1000 users, then item 45 is feature 1000 + 45, then `x_1045=1`): 134 | 135 | 136 | ![eq6](https://latex.codecogs.com/gif.latex?%5Cdpi%7B300%7D%20%5Csigma_c%20+%20%5Csigma_%7B%5Cbeta_%7B19%7D%7D%20+%20%5Csigma_%7B%5Cbeta_%7B1045%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_%7B19%7D%7D%7D%20%5Ccdot%20%5Cvec%7B%5Csigma_%7Bv_%7B1045%7D%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_%7B19%7D%7D%7D%20%5Ccdot%20%5Cvec%7B%5Cmu_%7Bv_%7B19%7D%7D%7D%20+%20%5Cvec%7B%5Csigma_%7Bv_%7B1045%7D%7D%7D%20%5Ccdot%20%5Cvec%7B%20%5Cmu_%7Bv_%7B19%7D%7D%7D) 137 | 138 | [//]: # ( \sigma_c + \sigma_{\beta_{19}} + \sigma_{\beta_{1045}} +\vec{\sigma_{v_{19}}} \cdot \vec{\sigma_{v_{1045}}} + \vec{\sigma_{v_{19}}} \cdot \vec{\mu_{v_{19}}} + \vec{\sigma_{v_{1045}}} \cdot \vec{ \mu_{v_{19}}}) 139 | 140 | So the interpretation is that the variance is driven by an overall constant factor, the variances of the user and item summed, and then interactions terms combine pairs of feature variances, and feature weights with variances. 141 | 142 | ## Next steps 143 | 144 | We should be able to emulate active learning within this dataset. 
At training time, instead of drawing the next example randomly from your dataset, use this model to rank the available training data and re-train using only the most informative datapoint at every timestep (a rough sketch of this ranking step follows below). 145 | --------------------------------------------------------------------------------
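As a follow-up to the active-learning idea above, here is a minimal NumPy sketch of the ranking step. Everything in it is illustrative: the arrays stand in for the fitted means and variances that the VFM learns (random toy values here, not model output), the helper names are made up, and the `var_*` quantities are variances. It uses the standard identity for the variance of a product of independent variables, in which the means enter squared.

```python
import numpy as np

def interaction_variance(mu_i, var_i, mu_j, var_j):
    """Var[v_i . v_j] with independent components:
    sum_k (var_ik*var_jk + var_ik*mu_jk**2 + var_jk*mu_ik**2)."""
    return np.sum(var_i * var_j + var_i * mu_j ** 2 + var_j * mu_i ** 2)

def uncertainty(user, item, mu_v, var_v, var_beta, var_c):
    """Predictive variance for one dummy-encoded (user, item) pair."""
    return (var_c
            + var_beta[user] + var_beta[item]
            + interaction_variance(mu_v[user], var_v[user],
                                   mu_v[item], var_v[item]))

# Toy parameters standing in for a fitted model
rng = np.random.default_rng(0)
n_feat, n_dim = 2000, 8
mu_v = rng.normal(scale=0.1, size=(n_feat, n_dim))
var_v = np.exp(rng.normal(size=(n_feat, n_dim)) - 3.0)
var_beta = np.exp(rng.normal(size=n_feat) - 3.0)
var_c = 0.01

user = 19
candidate_items = np.arange(1000, 2000)  # e.g. item 45 -> feature 1045
scores = [uncertainty(user, item, mu_v, var_v, var_beta, var_c)
          for item in candidate_items]
next_item = candidate_items[int(np.argmax(scores))]
print(next_item)
```

Ranking candidates by this score and querying the highest-variance (user, item) pair is the active-learning step described in the Next steps section.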