├── python
│   ├── recommendation_systems
│   │   ├── cdl
│   │   │   ├── tryinv.py
│   │   │   ├── mult.py
│   │   │   ├── test_BCD.py
│   │   │   ├── mnist_data.py
│   │   │   ├── data.py
│   │   │   ├── cal_rec.py
│   │   │   ├── show_recommendation.py
│   │   │   ├── cal_precision.py
│   │   │   ├── mf.py
│   │   │   ├── mnist_sae.py
│   │   │   ├── BCD_one.py
│   │   │   ├── model.py
│   │   │   ├── cdl.py
│   │   │   ├── solver.py
│   │   │   ├── autoencoder.py
│   │   │   └── collaborative-dl.ipynb
│   │   └── matrix_factorization.ipynb
│   ├── basic
│   │   ├── data_iter.py
│   │   ├── optimizer.ipynb
│   │   ├── data.ipynb
│   │   ├── module.ipynb
│   │   └── ndarray.ipynb
│   └── outline.ipynb
├── .gitignore
├── README.md
└── LICENSE

/python/recommendation_systems/cdl/tryinv.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | a = np.mat(np.ones((3,3)))
3 | print a*a
4 | print np.dot(a,a)
5 | print np.multiply(a,a)
6 | print np.sum(a)
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *checkpoint*
2 | *.dat
3 | *.demo
4 | *.pyc
5 | *.numpy
6 | *.symbol
7 | *.json
8 | *.log
9 | *.csv
10 | ml-100k/
11 | *.zip
12 | *.params
13 | 
--------------------------------------------------------------------------------
/python/recommendation_systems/cdl/mult.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def read_mult(f_in='mult.dat',D=8000):
4 |     # mult.dat stores one document per line in sparse "word_id:count" pairs;
5 |     # the leading token of each line is skipped. Each row is densified to
6 |     # length D and rescaled by its maximum entry.
7 |     fp = open(f_in)
8 |     lines = fp.readlines()
9 |     X = np.zeros((len(lines),D))
10 |     for i,line in enumerate(lines):
11 |         strs = line.strip().split(' ')[1:]
12 |         for strr in strs:
13 |             segs = strr.split(':')
14 |             X[i,int(segs[0])] = float(segs[1])
15 |     arr_max = np.amax(X,axis=1)
16 |     X = (X.T/arr_max).T
17 |     return X
18 | 
--------------------------------------------------------------------------------
/python/recommendation_systems/cdl/test_BCD.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import numpy as np
3 | from BCD_one import BCD_one
4 | num_v = 100
5 | num_u = 100
6 | num_iter = 10
7 | K = 4
8 | lambda_u = 100
9 | lambda_v = 0.1
10 | a = 1
11 | b = 0.01
12 | a_m_b = a-b
13 | theta = np.mat(np.random.rand(K,num_v)).T
14 | V = np.mat(np.random.rand(K,num_v)).T
15 | U = np.mat(np.random.rand(K,num_u)).T
16 | R = np.mat(np.random.rand(num_u,num_v))
17 | R[R<0.9] = 0
18 | for i in range(num_iter):
19 |     U, V, E = BCD_one(R, U, V, theta, lambda_u, lambda_v)
20 | 
--------------------------------------------------------------------------------
/python/recommendation_systems/cdl/mnist_data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from sklearn.datasets import fetch_mldata
4 | 
5 | def get_mnist():
6 |     np.random.seed(1234) # set seed for deterministic ordering
7 |     data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
8 |     data_path = os.path.join(data_path, '../../data')
9 |     mnist = fetch_mldata('MNIST original', data_home=data_path)
10 |     p = np.random.permutation(mnist.data.shape[0])
11 |     X = mnist.data[p].astype(np.float32)*0.02
12 |     Y = mnist.target[p]
13 |     return X, Y
14 | 
--------------------------------------------------------------------------------
/python/recommendation_systems/cdl/data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mult import read_mult
3 | 
4 | def get_mult():
5 |     X = read_mult('mult.dat',8000).astype(np.float32)
6 |     return X
7 | 
8 | def get_dummy_mult():
9 |     X = np.random.rand(100,100)
10 |     X[X<0.9] = 0
11 |     return X
12 | 
13 | def read_user(f_in='cf-train-1-users.dat',num_u=5551,num_v=16980):
14 |     fp = open(f_in)
15 |     R = np.mat(np.zeros((num_u,num_v)))
16 |     for i,line in enumerate(fp):
17 |         segs = line.strip().split(' ')[1:]
18 |         for seg in segs:
19 |             R[i,int(seg)] = 1
20 |     return R
21 | 
22 | def read_dummy_user():
23 |     R = np.mat(np.random.rand(100,100))
24 |     R[R<0.9] = 0
25 |     R[R>0.8] = 1
26 |     return R
27 | 
28 | 
--------------------------------------------------------------------------------
/python/recommendation_systems/cdl/cal_rec.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from data import read_user
3 | def cal_rec(p,cut):
4 |     R_true = read_user('cf-test-1-users.dat')
5 |     dir_save = 'cdl'+str(p)
6 |     U = np.mat(np.loadtxt(dir_save+'/final-U.dat'))
7 |     V = np.mat(np.loadtxt(dir_save+'/final-V.dat'))
8 |     R = U*V.T
9 |     num_u = R.shape[0]
10 |     num_hit = 0
11 |     fp = open(dir_save+'/rec-list.dat','w')
12 |     for i in range(num_u):
13 |         if i!=0 and i%100==0:
14 |             print 'Iter '+str(i)
15 |         l_score = R[i,:].A1.tolist()
16 |         pl = sorted(enumerate(l_score),key=lambda d:d[1],reverse=True)
17 |         l_rec = list(zip(*pl)[0])[:cut]
18 |         s_rec = set(l_rec)
19 |         s_true = set(np.where(R_true[i,:]>0)[1].A1)
20 |         cnt_hit = len(s_rec.intersection(s_true))
21 |         fp.write('%d:' % cnt_hit)
22 |         fp.write(' '.join(map(str,l_rec)))
23 |         fp.write('\n')
24 |     fp.close()
25 | 
26 | cal_rec(2,8)
27 | 
28 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MXNet Notebooks
2 | 
3 | This repo contains various notebooks ranging from basic usage of MXNet to
4 | state-of-the-art deep learning applications.
5 | 
6 | ## How to use
7 | 
8 | ### Python
9 | 
10 | The python notebooks are written in [Jupyter](http://jupyter.org/).
11 | 
12 | - **View** We suggest using http://nbviewer.jupyter.org/ for online viewing. Here is the link to open the
13 |   [outline on nbviewer](http://nbviewer.jupyter.org/github/dmlc/mxnet-notebooks/blob/master/python/outline.ipynb).
14 | 
15 | - **Edit** You should be able to view and edit these notebooks if Jupyter is installed.
16 | 
17 | ## How to develop
18 | 
19 | Some general guidelines
20 | 
21 | - A notebook covers a single concept or application.
22 | - Try to be as basic as possible. Put advanced usages at the end, and allow readers to skip them.
23 | - Keep the cell outputs in the notebooks so that readers can see the results without running the code.
24 | - Organize frequently asked questions on [mxnet's issues](https://github.com/dmlc/mxnet/issues) into notebooks.
25 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/show_recommendation.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from data import read_user 3 | import numpy as np 4 | p = 2 5 | user_id = 1 6 | # read predicted results 7 | dir_save = 'cdl%d' % p 8 | csvReader = csv.reader(open('raw-data.csv','rb')) 9 | d_id_title = dict() 10 | for i,row in enumerate(csvReader): 11 | if i==0: 12 | continue 13 | d_id_title[i-1] = row[3] 14 | R_test = read_user('cf-test-1-users.dat') 15 | R_train = read_user('cf-train-1-users.dat') 16 | fp = open(dir_save+'/rec-list.dat') 17 | lines = fp.readlines() 18 | 19 | s_test = set(np.where(R_test[user_id,:]>0)[1].A1) 20 | l_train = np.where(R_train[user_id,:]>0)[1].A1.tolist() 21 | l_pred = map(int,lines[user_id].strip().split(':')[1].split(' ')) 22 | print '##### Articles in the Training Sets #####' 23 | for i in l_train: 24 | print d_id_title[i] 25 | print '\n##### Articles Recommended (Correct Ones Marked by Stars) #####' 26 | for i in l_pred: 27 | if i in s_test: 28 | print '* '+d_id_title[i] 29 | else: 30 | print d_id_title[i] 31 | fp.close() 32 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/cal_precision.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from data import read_user 3 | def cal_precision(p,cut): 4 | R_true = read_user('cf-test-1-users.dat') 5 | dir_save = 'cdl'+str(p) 6 | U = np.mat(np.loadtxt(dir_save+'/final-U.dat')) 7 | V = np.mat(np.loadtxt(dir_save+'/final-V.dat')) 8 | R = U*V.T 9 | num_u = R.shape[0] 10 | num_hit = 0 11 | fp = open(dir_save+'/rec-list.dat','w') 12 | for i in range(num_u): 13 | if i!=0 and i%100==0: 14 | print 'Iter '+str(i)+':'+str(float(num_hit)/i/cut) 15 | l_score = R[i,:].A1.tolist() 16 | pl = sorted(enumerate(l_score),key=lambda d:d[1],reverse=True) 17 | l_rec = list(zip(*pl)[0])[:cut] 18 | s_rec = set(l_rec) 19 | s_true = set(np.where(R_true[i,:]>0)[1].A1) 20 | cnt_hit = len(s_rec.intersection(s_true)) 21 | num_hit += cnt_hit 22 | fp.write('%d:' % cnt_hit) 23 | fp.write(' '.join(map(str,l_rec))) 24 | fp.write('\n') 25 | fp.close() 26 | print 'Precision: %.3f' % (float(num_hit)/num_u/cut) 27 | 28 | cal_precision(2,8) 29 | 30 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/mf.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import numpy as np 3 | num_v = 16980 4 | num_u = 5551 5 | num_iter = 10 6 | K = 10 7 | lambda_u = 100 8 | lambda_v = 0.1 9 | a = 1 10 | b = 0.01 11 | a_m_b = a-b 12 | theta = np.mat(np.random.rand(K,num_v)) 13 | V = np.mat(np.random.rand(K,num_v)) 14 | U = np.mat(np.random.rand(K,num_u)) 15 | R = np.mat(np.random.rand(num_u,num_v)) 16 | R[R<0.9992] = 0 17 | I_u = np.mat(np.eye(K)*lambda_u) 18 | I_v = np.mat(np.eye(K)*lambda_v) 19 | C = np.mat(np.ones(R.shape))*b 20 | C[np.where(R>0)] = a 21 | print 'I: %d, J: %d, K: %d' % (num_u,num_v,K) 22 | for it in range(num_iter): 23 | print 'iter %d' % it 24 | V_sq = V*V.T*b 25 | for i in range(num_u): 26 | idx_a = np.where(R[i,:]>0)[1].A1 27 | V_cut = V[:,idx_a] 28 | U[:,i] = np.linalg.pinv(V_sq+V_cut*V_cut.T*a_m_b+I_u)*(V_cut*R[i,idx_a].T) 29 | U_sq = U*U.T*b 30 | for j in range(num_v): 31 | idx_a = np.where(R[:,j]>0)[0].A1 32 | U_cut = U[:,idx_a] 33 | V[:,j] = 
np.linalg.pinv(U_sq+U_cut*U_cut.T*a_m_b+I_v)*(U_cut*R[idx_a,j]+lambda_v*theta[:,j]) 34 | if it%1==0: 35 | E = U.T*V-R 36 | E = np.sum(np.multiply(C,np.multiply(E,E))) 37 | print 'E: %.3f' % E 38 | 39 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/mnist_sae.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import mxnet as mx 3 | import numpy as np 4 | import logging 5 | import mnist_data as data 6 | from math import sqrt 7 | from autoencoder import AutoEncoderModel 8 | 9 | if __name__ == '__main__': 10 | lv = 1e-2# lv/ln in CDL 11 | # set to INFO to see less information during training 12 | logging.basicConfig(level=logging.DEBUG) 13 | #ae_model = AutoEncoderModel(mx.gpu(0), [784,500,500,2000,10], pt_dropout=0.2, 14 | # internal_act='relu', output_act='relu') 15 | ae_model = AutoEncoderModel(mx.cpu(2), [784,500,500,2000,10], pt_dropout=0.2, 16 | internal_act='relu', output_act='relu') 17 | 18 | X, _ = data.get_mnist() 19 | train_X = X[:60000] 20 | val_X = X[60000:] 21 | 22 | #ae_model.layerwise_pretrain(train_X, 256, 50000, 'sgd', l_rate=0.1, decay=0.0, 23 | # lr_scheduler=mx.misc.FactorScheduler(20000,0.1)) 24 | #V = np.zeros((train_X.shape[0],10)) 25 | V = np.random.rand(train_X.shape[0],10)/10 26 | lambda_v_rt = np.ones((train_X.shape[0],10))*sqrt(lv) 27 | ae_model.finetune(train_X, V, lambda_v_rt, 256, 28 | 20, 'sgd', l_rate=0.1, decay=0.0, 29 | lr_scheduler=mx.misc.FactorScheduler(20000,0.1)) 30 | ae_model.save('mnist_pt.arg') 31 | ae_model.load('mnist_pt.arg') 32 | print "Training error:", ae_model.eval(train_X,V,lambda_v_rt) 33 | #print "Validation error:", ae_model.eval(val_X) 34 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/BCD_one.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import numpy as np 3 | def BCD_one(R, U, V, theta, lambda_u, lambda_v, dir_save='.', 4 | get_loss=False, num_iter=1): 5 | U = U.T 6 | V = np.mat(V.T) 7 | theta = np.mat(theta.T) 8 | num_v = R.shape[1] 9 | num_u = R.shape[0] 10 | K = U.shape[0] 11 | a = 1 12 | b = 0.01 13 | a_m_b = a-b 14 | I_u = np.mat(np.eye(K)*lambda_u) 15 | I_v = np.mat(np.eye(K)*lambda_v) 16 | C = np.mat(np.ones(R.shape))*b 17 | C[np.where(R>0)] = a 18 | #print 'I: %d, J: %d, K: %d' % (num_u,num_v,K) 19 | for it in range(num_iter): 20 | U_sq = U*U.T*b 21 | for j in range(num_v): 22 | idx_a = np.where(R[:,j]>0)[0].A1 23 | U_cut = U[:,idx_a] 24 | V[:,j] = np.linalg.pinv(U_sq+U_cut*U_cut.T*a_m_b+I_v)*(U_cut*R[idx_a,j]+lambda_v*theta[:,j]) 25 | V_sq = V*V.T*b 26 | for i in range(num_u): 27 | idx_a = np.where(R[i,:]>0)[1].A1 28 | V_cut = V[:,idx_a] 29 | U[:,i] = np.linalg.pinv(V_sq+V_cut*V_cut.T*a_m_b+I_u)*(V_cut*R[i,idx_a].T) 30 | if it%10==9: 31 | E = U.T*V-R 32 | E = np.sum(np.multiply(C,np.square(E)))/2.0 33 | reg_loss_v = np.sum(np.square(theta-V))/2.0 34 | reg_loss_u = np.sum(np.square(U))/2.0 35 | E = E+lambda_v*reg_loss_v+lambda_u*reg_loss_u 36 | print 'Iter %d - E: %.3f' % (it,E) 37 | fp = open(dir_save+'/cdl.log','a') 38 | fp.write('Iter %d - E: %.3f\n' % (it,E)) 39 | fp.close() 40 | 41 | if get_loss: 42 | E = U.T*V-R 43 | E = np.sum(np.multiply(C,np.square(E)))/2.0 44 | reg_loss_v = np.sum(np.square(theta-V))/2.0 45 | reg_loss_u = np.sum(np.square(U))/2.0 46 | E = E+lambda_v*reg_loss_v+lambda_u*reg_loss_u 47 | #print 'E: %.3f' % E 48 | else: 49 | E = 0 50 | U = U.T 
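    # transpose back so U and V are returned in the same (num x K)
    # orientation in which they were passed in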
51 |     V = np.asarray(V.T)
52 |     return U, V, E
53 | 
54 | 
--------------------------------------------------------------------------------
/python/basic/data_iter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import mxnet as mx
3 | 
4 | class SimpleBatch(object):
5 |     def __init__(self, data, label, pad=None):
6 |         self.data = data
7 |         self.label = label
8 |         self.pad = pad
9 | 
10 | class SimpleIter:
11 |     def __init__(self, mu, sigma, batch_size, num_batches):
12 |         self.mu = mu
13 |         self.sigma = sigma
14 |         self.batch_size = batch_size
15 |         self.num_batches = num_batches
16 |         self.data_shape = (batch_size, mu.shape[1])
17 |         self.label_shape = (batch_size, )
18 |         self.cur_batch = 0
19 | 
20 |     def __iter__(self):
21 |         return self
22 | 
23 |     def reset(self):
24 |         self.cur_batch = 0
25 | 
26 |     def __next__(self):
27 |         return self.next()
28 | 
29 |     @property
30 |     def provide_data(self):
31 |         return [('data', self.data_shape)]
32 | 
33 |     @property
34 |     def provide_label(self):
35 |         return [('softmax_label', self.label_shape)]
36 | 
37 |     def next(self):
38 |         if self.cur_batch < self.num_batches:
39 |             self.cur_batch += 1
40 |             num_classes = self.mu.shape[0]
41 |             label = np.random.randint(0, num_classes, self.label_shape)
42 |             data = np.zeros(self.data_shape)
43 |             for i in range(num_classes):
44 |                 data[label==i,:] = np.random.normal(
45 |                     self.mu[i,:], self.sigma[i,:], (sum(label==i), self.data_shape[1]))
46 |             return SimpleBatch(data=[mx.nd.array(data)], label=[mx.nd.array(label)], pad=0)
47 |         else:
48 |             raise StopIteration
49 | 
50 | class SyntheticData:
51 |     """Generate synthetic data
52 |     """
53 |     def __init__(self, num_classes, num_features):
54 |         self.num_classes = num_classes
55 |         self.num_features = num_features
56 |         self.mu = np.random.rand(num_classes, num_features)
57 |         self.sigma = np.ones((num_classes, num_features)) * 0.1
58 | 
59 |     def get_iter(self, batch_size, num_batches=10):
60 |         return SimpleIter(self.mu, self.sigma, batch_size, num_batches)
--------------------------------------------------------------------------------
/python/recommendation_systems/cdl/model.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | import mxnet as mx
3 | import numpy as np
4 | import logging
5 | from solver import Solver, Monitor
6 | try:
7 |     import cPickle as pickle
8 | except:
9 |     import pickle
10 | 
11 | 
12 | def extract_feature(sym, args, auxs, data_iter, N, xpu=mx.cpu()):
13 |     input_buffs = [mx.nd.empty(shape, ctx=xpu) for k, shape in data_iter.provide_data]
14 |     input_names = [k for k, shape in data_iter.provide_data]
15 |     args = dict(args, **dict(zip(input_names, input_buffs)))
16 |     exe = sym.bind(xpu, args=args, aux_states=auxs)
17 |     outputs = [[] for i in exe.outputs]
18 |     output_buffs = None
19 | 
20 |     data_iter.hard_reset()
21 |     for batch in data_iter:
22 |         for data, buff in zip(batch.data, input_buffs):
23 |             data.copyto(buff)
24 |         exe.forward(is_train=False)
25 |         if output_buffs is None:
26 |             output_buffs = [mx.nd.empty(i.shape, ctx=mx.cpu()) for i in exe.outputs]
27 |         else:
28 |             for out, buff in zip(outputs, output_buffs):
29 |                 out.append(buff.asnumpy())
30 |         for out, buff in zip(exe.outputs, output_buffs):
31 |             out.copyto(buff)
32 |     for out, buff in zip(outputs, output_buffs):
33 |         out.append(buff.asnumpy())
34 |     outputs = [np.concatenate(i, axis=0)[:N] for i in outputs]
35 |     return dict(zip(sym.list_outputs(), outputs))
36 | 
37 | class MXModel(object):
38 |     def __init__(self, 
xpu=mx.cpu(), *args, **kwargs): 39 | self.xpu = xpu 40 | self.loss = None 41 | self.args = {} 42 | self.args_grad = {} 43 | self.args_mult = {} 44 | self.auxs = {} 45 | self.setup(*args, **kwargs) 46 | 47 | def save(self, fname): 48 | args_save = {key: v.asnumpy() for key, v in self.args.items()} 49 | with open(fname, 'w') as fout: 50 | pickle.dump(args_save, fout) 51 | 52 | def load(self, fname): 53 | with open(fname) as fin: 54 | args_save = pickle.load(fin) 55 | for key, v in args_save.items(): 56 | if key in self.args: 57 | self.args[key][:] = v 58 | 59 | def setup(self, *args, **kwargs): 60 | raise NotImplementedError("must override this") 61 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/cdl.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import mxnet as mx 3 | import numpy as np 4 | import logging 5 | import data 6 | from math import sqrt 7 | from autoencoder import AutoEncoderModel 8 | import os 9 | 10 | if __name__ == '__main__': 11 | lambda_u = 1 # lambda_u in CDL 12 | lambda_v = 10 # lambda_v in CDL 13 | K = 50 14 | p = 4 15 | is_dummy = False 16 | num_iter = 34000 17 | batch_size = 256 18 | 19 | np.random.seed(1234) # set seed 20 | lv = 1e-2 # lambda_v/lambda_n in CDL 21 | dir_save = 'cdl%d' % p 22 | if not os.path.isdir(dir_save): 23 | os.system('mkdir %s' % dir_save) 24 | fp = open(dir_save+'/cdl.log','w') 25 | print 'p%d: lambda_v/lambda_u/ratio/K: %f/%f/%f/%d' % (p,lambda_v,lambda_u,lv,K) 26 | fp.write('p%d: lambda_v/lambda_u/ratio/K: %f/%f/%f/%d\n' % \ 27 | (p,lambda_v,lambda_u,lv,K)) 28 | fp.close() 29 | if is_dummy: 30 | X = data.get_dummy_mult() 31 | R = data.read_dummy_user() 32 | else: 33 | X = data.get_mult() 34 | R = data.read_user() 35 | # set to INFO to see less information during training 36 | logging.basicConfig(level=logging.DEBUG) 37 | #ae_model = AutoEncoderModel(mx.gpu(0), [784,500,500,2000,10], pt_dropout=0.2, 38 | # internal_act='relu', output_act='relu') 39 | ae_model = AutoEncoderModel(mx.cpu(2), [X.shape[1],100,K], 40 | pt_dropout=0.2, internal_act='relu', output_act='relu') 41 | 42 | train_X = X 43 | 44 | #ae_model.layerwise_pretrain(train_X, 256, 50000, 'sgd', l_rate=0.1, decay=0.0, 45 | # lr_scheduler=mx.misc.FactorScheduler(20000,0.1)) 46 | #V = np.zeros((train_X.shape[0],10)) 47 | V = np.random.rand(train_X.shape[0],K)/10 48 | lambda_v_rt = np.ones((train_X.shape[0],K))*sqrt(lv) 49 | U, V, theta, BCD_loss = ae_model.finetune(train_X, R, V, lambda_v_rt, lambda_u, 50 | lambda_v, dir_save, batch_size, 51 | num_iter, 'sgd', l_rate=0.1, decay=0.0, 52 | lr_scheduler=mx.misc.FactorScheduler(20000,0.1)) 53 | #ae_model.save('cdl_pt.arg') 54 | np.savetxt(dir_save+'/final-U.dat',U,fmt='%.5f',comments='') 55 | np.savetxt(dir_save+'/final-V.dat',V,fmt='%.5f',comments='') 56 | np.savetxt(dir_save+'/final-theta.dat',theta,fmt='%.5f',comments='') 57 | 58 | #ae_model.load('cdl_pt.arg') 59 | Recon_loss = lambda_v/lv*ae_model.eval(train_X,V,lambda_v_rt) 60 | print "Training error: %.3f" % (BCD_loss+Recon_loss) 61 | fp = open(dir_save+'/cdl.log','a') 62 | fp.write("Training error: %.3f\n" % (BCD_loss+Recon_loss)) 63 | fp.close() 64 | #print "Validation error:", ae_model.eval(val_X) 65 | -------------------------------------------------------------------------------- /python/outline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"metadata": {}, 6 | "source": [ 7 | "# Python Notebooks for MXNet\n", 8 | "\n", 9 | "The outline of python notebooks.\n", 10 | "\n", 11 | "## Basic Concepts\n", 12 | "\n", 13 | "These tutorials walk through how to use the basic components of MXNet.\n", 14 | "\n", 15 | "- [NDArray](./basic/ndarray.ipynb): multi-dimensional array \n", 16 | "- [Symbol](./basic/symbol.ipynb): symbolic expression\n", 17 | "- [mxied programming](./basic/mixed.ipynb): developing training algorithms by using NDArray and Symbol together.\n", 18 | "- [Module](./basic/module.ipynb) : intermediate-level and high-level interface for neural network training and inference. \n", 19 | "- DOING. [Data Input]() : data iterators\n", 20 | "- DOING. [Key-value Store]() : data communication for multi-device and multi-machines training \n", 21 | "- TODO. Initializer : various ways to intialize the parameters\n", 22 | "- TODO. Optimizer : parameters updaters such as `sgd` updater\n", 23 | "- TODO. Metric : various metric to evaluate the progress\n", 24 | " \n", 25 | "\n", 26 | "## Neural Networks\n", 27 | "\n", 28 | "How to implement various neural networks. \n", 29 | "\n", 30 | "TODO. Convolution Neural Networks\n", 31 | "\n", 32 | "Recurrent Neural networks\n", 33 | "- [LSTM](./rnn/lstm.ipynb) build LSTM from scratch\n", 34 | "\n", 35 | "## How To\n", 36 | "\n", 37 | "- TODO Use pretrained models for prediction and feature extraction. \n", 38 | "\n", 39 | "## Applications\n", 40 | "\n", 41 | "Complete examples for various applications. \n", 42 | "\n", 43 | "### Image Classification\n", 44 | "\n", 45 | "- [Convolutional neural network for Written Digit Recognition](./cnn/mnist.ipynb) train and predict on the mnist datasets\n", 46 | "\n", 47 | "### Recommendation Systems\n", 48 | "\n", 49 | "- [Matrix Factorization](./recommendation_systems/matrix_factorization.ipynb) writing a basic matrix factorization algorithm\n", 50 | "- [Collaborative Deep Learning](./recommendation_systems/cdl/collaborative-dl.ipynb) state-of-the-art algorithm in KDD 15\n" 51 | ] 52 | } 53 | ], 54 | "metadata": { 55 | "anaconda-cloud": {}, 56 | "kernelspec": { 57 | "display_name": "Python 2", 58 | "language": "python", 59 | "name": "python2" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 2 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython2", 71 | "version": "2.7.6" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 0 76 | } 77 | -------------------------------------------------------------------------------- /python/basic/optimizer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Optimizer\n", 10 | "\n", 11 | "In gradient-base optimization algorithms, we update the parameters (or weights) using the gradients in each iteration. We call this updating function as `Optimizer`. \n", 12 | "\n", 13 | "The main method of an optimizer is `update(weight, grad)`, which updates a NDArray weight using a NDArray gradient. But given that a multi-layer neural network often has more than one weights, we assign each weight a unique integer index. Furthermore, an optimizer may need space to store auxiliary state, such as momentum, we also allow a user-defined state for updating. 
17 |     "\n",
18 |     "\n",
19 |     "## Basic Usage\n",
20 |     "\n",
21 |     "### Create and Update\n",
22 |     "MXNet has already implemented several popular optimizers in [python/mxnet/optimizer.py](https://github.com/dmlc/mxnet/blob/master/python/mxnet/optimizer.py). A convenient way to create one is by using `optimizer.create(name, args...)`. The following code creates a standard SGD updater which does\n",
23 |     "\n",
24 |     "```\n",
25 |     "weight = weight - learning_rate * grad\n",
26 |     "```"
27 |    ]
28 |   },
29 |   {
30 |    "cell_type": "code",
31 |    "execution_count": 6,
32 |    "metadata": {
33 |     "collapsed": false
34 |    },
35 |    "outputs": [],
36 |    "source": [
37 |     "import mxnet as mx\n",
38 |     "opt = mx.optimizer.create('sgd', learning_rate=.1)"
39 |    ]
40 |   },
41 |   {
42 |    "cell_type": "markdown",
43 |    "metadata": {},
44 |    "source": [
45 |     "Then we can use the `update` function."
46 |    ]
47 |   },
48 |   {
49 |    "cell_type": "code",
50 |    "execution_count": 14,
51 |    "metadata": {
52 |     "collapsed": false,
53 |     "scrolled": true
54 |    },
55 |    "outputs": [
56 |     {
57 |      "name": "stdout",
58 |      "output_type": "stream",
59 |      "text": [
60 |       "[[ 0.89999998  0.89999998  0.89999998]\n",
61 |       " [ 0.89999998  0.89999998  0.89999998]]\n"
62 |      ]
63 |     }
64 |    ],
65 |    "source": [
66 |     "grad = mx.nd.ones((2,3))\n",
67 |     "weight = mx.nd.ones((2,3))\n",
68 |     "index = 0\n",
69 |     "\n",
70 |     "opt.update(index, weight, grad, state=None)\n",
71 |     "print(weight.asnumpy())"
72 |    ]
73 |   },
74 |   {
75 |    "cell_type": "markdown",
76 |    "metadata": {},
77 |    "source": [
78 |     "When momentum is non-zero, the `sgd` optimizer needs extra state to hold the momentum buffer. "
" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 15, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "[[-0.1 -0.1 -0.1]\n", 93 | " [-0.1 -0.1 -0.1]]\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "mom_opt = mx.optimizer.create('sgd', learning_rate=.1, momentum=.01)\n", 99 | "state = mom_opt.create_state(index, weight)\n", 100 | "opt.update(index, weight, grad, state)\n", 101 | "print(state.asnumpy())" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Flexible Learning Rate\n", 109 | "\n", 110 | "- [lr scheduler](https://github.com/dmlc/mxnet/blob/master/python/mxnet/lr_scheduler.py)\n", 111 | "- layer-wise lr: set_lr_mult, set_wd_mult\n", 112 | "\n", 113 | "\n", 114 | "### More optimizers\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Customized Optimizer" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "Python 2", 137 | "language": "python", 138 | "name": "python2" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 2 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython2", 150 | "version": "2.7.6" 151 | } 152 | }, 153 | "nbformat": 4, 154 | "nbformat_minor": 1 155 | } 156 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/solver.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import mxnet as mx 3 | import numpy as np 4 | import logging 5 | import model 6 | from BCD_one import BCD_one 7 | 8 | class Monitor(object): 9 | def __init__(self, interval, level=logging.DEBUG, stat=None): 10 | self.interval = interval 11 | self.level = level 12 | if stat is None: 13 | def mean_abs(x): 14 | return np.fabs(x).mean() 15 | self.stat = mean_abs 16 | else: 17 | self.stat = stat 18 | 19 | def forward_end(self, i, internals): 20 | if i%self.interval == 0 and logging.getLogger().isEnabledFor(self.level): 21 | for key in sorted(internals.keys()): 22 | arr = internals[key] 23 | logging.log(self.level, 'Iter:%d param:%s\t\tstat(%s):%s'%(i, key, self.stat.__name__, str(self.stat(arr.asnumpy())))) 24 | 25 | def backward_end(self, i, weights, grads, metric=None): 26 | if i%self.interval == 0 and logging.getLogger().isEnabledFor(self.level): 27 | for key in sorted(grads.keys()): 28 | arr = grads[key] 29 | logging.log(self.level, 'Iter:%d param:%s\t\tstat(%s):%s\t\tgrad_stat:%s'%(i, key, self.stat.__name__, str(self.stat(weights[key].asnumpy())), str(self.stat(arr.asnumpy())))) 30 | if i%self.interval == 0 and metric is not None: 31 | logging.log(logging.INFO, 'Iter:%d metric:%f'%(i, metric.get()[1])) 32 | metric.reset() 33 | 34 | class Solver(object): 35 | def __init__(self, optimizer, **kwargs): 36 | if isinstance(optimizer, str): 37 | self.optimizer = mx.optimizer.create(optimizer, **kwargs) 38 | else: 39 | self.optimizer = optimizer 40 | self.updater = mx.optimizer.get_updater(self.optimizer) 41 | self.monitor = None 42 | self.metric = None 43 | self.iter_end_callback = 
43 |         self.iter_end_callback = None
44 |         self.iter_start_callback = None
45 | 
46 |     def set_metric(self, metric):
47 |         self.metric = metric
48 | 
49 |     def set_monitor(self, monitor):
50 |         self.monitor = monitor
51 | 
52 |     def set_iter_end_callback(self, callback):
53 |         self.iter_end_callback = callback
54 | 
55 |     def set_iter_start_callback(self, callback):
56 |         self.iter_start_callback = callback
57 | 
58 |     def solve(self, X, R, V, lambda_v_rt, lambda_u, lambda_v, dir_save, batch_size, xpu, sym, args, args_grad, auxs,
59 |               data_iter, begin_iter, end_iter, args_lrmult={}, debug = False):
60 |         # names and shapes
61 |         input_desc = data_iter.provide_data + data_iter.provide_label
62 |         input_names = [k for k, shape in input_desc]
63 |         # places to store them
64 |         input_buffs = [mx.nd.empty(shape, ctx=xpu) for k, shape in input_desc]
65 |         args = dict(args, **dict(zip(input_names, input_buffs)))
66 | 
67 |         # list all outputs (strings)
68 |         output_names = sym.list_outputs()
69 |         if debug:
70 |             sym = sym.get_internals()
71 |             blob_names = sym.list_outputs()
72 |             sym_group = []
73 |             for i in range(len(blob_names)):
74 |                 if blob_names[i] not in args:
75 |                     x = sym[i]
76 |                     if blob_names[i] not in output_names:
77 |                         x = mx.symbol.BlockGrad(x, name=blob_names[i])
78 |                     sym_group.append(x)
79 |             sym = mx.symbol.Group(sym_group)
80 |         # bind the network params to the network (symbol)
81 |         exe = sym.bind(xpu, args=args, args_grad=args_grad, aux_states=auxs)
82 | 
83 |         assert len(sym.list_arguments()) == len(exe.grad_arrays)
84 |         update_dict = {name: nd for name, nd in zip(sym.list_arguments(), exe.grad_arrays) if nd}
85 |         batch_size = input_buffs[0].shape[0]
86 |         self.optimizer.rescale_grad = 1.0/batch_size
87 |         self.optimizer.set_lr_mult(args_lrmult)
88 | 
89 |         output_dict = {}
90 |         output_buff = {}
91 |         internal_dict = dict(zip(input_names, input_buffs))
92 |         # exe.outputs is a list of all output ndarrays
93 |         for key, arr in zip(sym.list_outputs(), exe.outputs):
94 |             if key in output_names:
95 |                 output_dict[key] = arr
96 |                 output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu())
97 |             else:
98 |                 internal_dict[key] = arr
99 | 
100 |         # initialize U
101 |         U = np.mat(np.zeros((R.shape[0],V.shape[1])))
102 |         # set lambda_v_rt to 0 in the first epoch
103 |         lambda_v_rt_old = np.zeros(lambda_v_rt.shape)
104 |         lambda_v_rt_old[:] = lambda_v_rt[:]
105 |         lambda_v_rt[:,:] = 0
106 |         epoch = 0 # index epochs
107 |         data_iter = mx.io.NDArrayIter({'data': X, 'V': V, 'lambda_v_rt':
108 |                                        lambda_v_rt},
109 |                                       batch_size=batch_size, shuffle=False,
110 |                                       last_batch_handle='pad')
111 |         data_iter.reset()
112 |         for i in range(begin_iter, end_iter):
113 |             if self.iter_start_callback is not None:
114 |                 if self.iter_start_callback(i):
115 |                     return
116 |             #if i==100:
117 |             #    V = np.zeros(V.shape)
118 |             #    data_iter = mx.io.NDArrayIter({'data': X, 'V': V, 'lambda_v_rt':
119 |             #                                   lambda_v_rt},
120 |             #                                  batch_size=batch_size, shuffle=False,
121 |             #                                  last_batch_handle='pad')
122 |             #    data_iter.reset()
123 |             #    for j in range(10):
124 |             #        batch = data_iter.next()
125 |             try:
126 |                 batch = data_iter.next()
127 |             except:
128 |                 # reaching here means the end of an epoch
129 |                 epoch += 1
130 |                 theta = model.extract_feature(sym[0], args, auxs,
131 |                                               data_iter, X.shape[0], xpu).values()[0]
132 |                 # update U, V and get the BCD loss
133 |                 U, V, BCD_loss = BCD_one(R, U, V, theta,
134 |                                          lambda_u, lambda_v, dir_save, True)
135 |                 # get the reconstruction loss
136 |                 Y = model.extract_feature(sym[1], args, auxs,
137 |                                           data_iter, X.shape[0], xpu).values()[0]
138 |                 Recon_loss = 
lambda_v/np.square(lambda_v_rt_old[0,0])*np.sum(np.square(Y-X))/2.0 139 | print "Epoch %d - tr_err/bcd_err/rec_err: %.1f/%.1f/%.1f" % (epoch, 140 | BCD_loss+Recon_loss, BCD_loss, Recon_loss) 141 | fp = open(dir_save+'/cdl.log','a') 142 | fp.write("Epoch %d - tr_err/bcd_err/rec_err: %.1f/%.1f/%.1f\n" % (epoch, 143 | BCD_loss+Recon_loss, BCD_loss, Recon_loss)) 144 | fp.close() 145 | lambda_v_rt[:] = lambda_v_rt_old[:] # back to normal lambda_v_rt 146 | data_iter = mx.io.NDArrayIter({'data': X, 'V': V, 'lambda_v_rt': 147 | lambda_v_rt}, 148 | batch_size=batch_size, shuffle=False, 149 | last_batch_handle='pad') 150 | data_iter.reset() 151 | batch = data_iter.next() 152 | 153 | for data, buff in zip(batch.data+batch.label, input_buffs): 154 | # copy data from batch to input_buffs 155 | # input_buffs is used during ff and bp 156 | # buffs->args->exe 157 | data.copyto(buff) 158 | exe.forward(is_train=True) 159 | if self.monitor is not None: 160 | self.monitor.forward_end(i, internal_dict) 161 | for key in output_dict: 162 | # output_buff is used for computing metrics 163 | output_dict[key].copyto(output_buff[key]) 164 | 165 | exe.backward() 166 | for key, arr in update_dict.items(): 167 | self.updater(key, arr, args[key]) 168 | 169 | if self.metric is not None: 170 | self.metric.update([input_buffs[-1]], 171 | [output_buff[output_names[0]]]) 172 | 173 | if self.monitor is not None: 174 | self.monitor.backward_end(i, args, update_dict, self.metric) 175 | 176 | if self.iter_end_callback is not None: 177 | if self.iter_end_callback(i): 178 | return 179 | exe.outputs[0].wait_to_read() 180 | #Y = model.extract_feature(sym[0], args, auxs, 181 | # data_iter, X.shape[0], xpu).values()[0] 182 | #print Y 183 | #print Y.shape 184 | theta = model.extract_feature(sym[0], args, auxs, 185 | data_iter, X.shape[0], xpu).values()[0] 186 | U, V, BCD_loss = BCD_one(R, U, V, theta, lambda_u, lambda_v, 187 | dir_save, True, 1) 188 | fp.close() 189 | return U, V, theta, BCD_loss 190 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/autoencoder.py: -------------------------------------------------------------------------------- 1 | # pylint: skip-file 2 | import mxnet as mx 3 | from mxnet import misc 4 | import numpy as np 5 | import model 6 | import logging 7 | from solver import Solver, Monitor 8 | try: 9 | import cPickle as pickle 10 | except: 11 | import pickle 12 | 13 | class AutoEncoderModel(model.MXModel): 14 | def setup(self, dims, sparseness_penalty=None, pt_dropout=None, ft_dropout=None, input_act=None, internal_act='relu', output_act=None): 15 | self.N = len(dims) - 1 16 | self.dims = dims 17 | self.stacks = [] 18 | self.pt_dropout = pt_dropout 19 | self.ft_dropout = ft_dropout 20 | self.input_act = input_act 21 | self.internal_act = internal_act 22 | self.output_act = output_act 23 | 24 | self.data = mx.symbol.Variable('data') 25 | self.V = mx.symbol.Variable('V') 26 | self.lambda_v_rt = mx.symbol.Variable('lambda_v_rt') 27 | for i in range(self.N): 28 | if i == 0: 29 | decoder_act = input_act 30 | idropout = None 31 | else: 32 | decoder_act = internal_act 33 | idropout = pt_dropout 34 | if i == self.N-1: 35 | encoder_act = output_act 36 | odropout = None 37 | else: 38 | encoder_act = internal_act 39 | odropout = pt_dropout 40 | istack, iargs, iargs_grad, iargs_mult, iauxs = self.make_stack(i, self.data, dims[i], dims[i+1], 41 | sparseness_penalty, idropout, odropout, encoder_act, decoder_act) 42 | self.stacks.append(istack) 43 | 
self.args.update(iargs) 44 | self.args_grad.update(iargs_grad) 45 | self.args_mult.update(iargs_mult) 46 | self.auxs.update(iauxs) 47 | self.encoder, self.internals = self.make_encoder(self.data, dims, sparseness_penalty, ft_dropout, internal_act, output_act) 48 | self.decoder = self.make_decoder(self.encoder, dims, sparseness_penalty, ft_dropout, internal_act, input_act) 49 | if input_act == 'softmax': 50 | self.loss = self.decoder 51 | else: 52 | #fe_loss = mx.symbol.LinearRegressionOutput(data=1*self.encoder, 53 | # label=1*self.V) 54 | fe_loss = mx.symbol.LinearRegressionOutput(data=self.lambda_v_rt*self.encoder, 55 | label=self.lambda_v_rt*self.V) 56 | fr_loss = mx.symbol.LinearRegressionOutput(data=self.decoder, label=self.data) 57 | self.loss = mx.symbol.Group([fe_loss, fr_loss]) 58 | 59 | def make_stack(self, istack, data, num_input, num_hidden, sparseness_penalty=None, idropout=None, 60 | odropout=None, encoder_act='relu', decoder_act='relu'): 61 | x = data 62 | if idropout: 63 | x = mx.symbol.Dropout(data=x, p=idropout) 64 | x = mx.symbol.FullyConnected(name='encoder_%d'%istack, data=x, num_hidden=num_hidden) 65 | if encoder_act: 66 | x = mx.symbol.Activation(data=x, act_type=encoder_act) 67 | if encoder_act == 'sigmoid' and sparseness_penalty: 68 | x = mx.symbol.IdentityAttachKLSparseReg(data=x, name='sparse_encoder_%d' % istack, penalty=sparseness_penalty) 69 | if odropout: 70 | x = mx.symbol.Dropout(data=x, p=odropout) 71 | x = mx.symbol.FullyConnected(name='decoder_%d'%istack, data=x, num_hidden=num_input) 72 | if decoder_act == 'softmax': 73 | x = mx.symbol.Softmax(data=x, label=data, prob_label=True, act_type=decoder_act) 74 | elif decoder_act: 75 | x = mx.symbol.Activation(data=x, act_type=decoder_act) 76 | if decoder_act == 'sigmoid' and sparseness_penalty: 77 | x = mx.symbol.IdentityAttachKLSparseReg(data=x, name='sparse_decoder_%d' % istack, penalty=sparseness_penalty) 78 | x = mx.symbol.LinearRegressionOutput(data=x, label=data) 79 | else: 80 | x = mx.symbol.LinearRegressionOutput(data=x, label=data) 81 | 82 | args = {'encoder_%d_weight'%istack: mx.nd.empty((num_hidden, num_input), self.xpu), 83 | 'encoder_%d_bias'%istack: mx.nd.empty((num_hidden,), self.xpu), 84 | 'decoder_%d_weight'%istack: mx.nd.empty((num_input, num_hidden), self.xpu), 85 | 'decoder_%d_bias'%istack: mx.nd.empty((num_input,), self.xpu),} 86 | args_grad = {'encoder_%d_weight'%istack: mx.nd.empty((num_hidden, num_input), self.xpu), 87 | 'encoder_%d_bias'%istack: mx.nd.empty((num_hidden,), self.xpu), 88 | 'decoder_%d_weight'%istack: mx.nd.empty((num_input, num_hidden), self.xpu), 89 | 'decoder_%d_bias'%istack: mx.nd.empty((num_input,), self.xpu),} 90 | args_mult = {'encoder_%d_weight'%istack: 1.0, 91 | 'encoder_%d_bias'%istack: 2.0, 92 | 'decoder_%d_weight'%istack: 1.0, 93 | 'decoder_%d_bias'%istack: 2.0,} 94 | auxs = {} 95 | if encoder_act == 'sigmoid' and sparseness_penalty: 96 | auxs['sparse_encoder_%d_moving_avg' % istack] = mx.nd.ones((num_hidden), self.xpu) * 0.5 97 | if decoder_act == 'sigmoid' and sparseness_penalty: 98 | auxs['sparse_decoder_%d_moving_avg' % istack] = mx.nd.ones((num_input), self.xpu) * 0.5 99 | init = mx.initializer.Uniform(0.07) 100 | for k,v in args.items(): 101 | init(k,v) 102 | 103 | return x, args, args_grad, args_mult, auxs 104 | 105 | def make_encoder(self, data, dims, sparseness_penalty=None, dropout=None, internal_act='relu', output_act=None): 106 | x = data 107 | internals = [] 108 | N = len(dims) - 1 109 | for i in range(N): 110 | x = 
mx.symbol.FullyConnected(name='encoder_%d'%i, data=x, num_hidden=dims[i+1]) 111 | if internal_act and i < N-1: 112 | x = mx.symbol.Activation(data=x, act_type=internal_act) 113 | if internal_act=='sigmoid' and sparseness_penalty: 114 | x = mx.symbol.IdentityAttachKLSparseReg(data=x, name='sparse_encoder_%d' % i, penalty=sparseness_penalty) 115 | elif output_act and i == N-1: 116 | x = mx.symbol.Activation(data=x, act_type=output_act) 117 | if output_act=='sigmoid' and sparseness_penalty: 118 | x = mx.symbol.IdentityAttachKLSparseReg(data=x, name='sparse_encoder_%d' % i, penalty=sparseness_penalty) 119 | if dropout: 120 | x = mx.symbol.Dropout(data=x, p=dropout) 121 | internals.append(x) 122 | return x, internals 123 | 124 | def make_decoder(self, feature, dims, sparseness_penalty=None, dropout=None, internal_act='relu', input_act=None): 125 | x = feature 126 | N = len(dims) - 1 127 | for i in reversed(range(N)): 128 | x = mx.symbol.FullyConnected(name='decoder_%d'%i, data=x, num_hidden=dims[i]) 129 | if internal_act and i > 0: 130 | x = mx.symbol.Activation(data=x, act_type=internal_act) 131 | if internal_act=='sigmoid' and sparseness_penalty: 132 | x = mx.symbol.IdentityAttachKLSparseReg(data=x, name='sparse_decoder_%d' % i, penalty=sparseness_penalty) 133 | elif input_act and i == 0: 134 | x = mx.symbol.Activation(data=x, act_type=input_act) 135 | if input_act=='sigmoid' and sparseness_penalty: 136 | x = mx.symbol.IdentityAttachKLSparseReg(data=x, name='sparse_decoder_%d' % i, penalty=sparseness_penalty) 137 | if dropout and i > 0: 138 | x = mx.symbol.Dropout(data=x, p=dropout) 139 | return x 140 | 141 | def layerwise_pretrain(self, X, batch_size, n_iter, optimizer, l_rate, decay, lr_scheduler=None): 142 | def l2_norm(label, pred): 143 | return np.mean(np.square(label-pred))/2.0 144 | solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate, lr_scheduler=lr_scheduler) 145 | solver.set_metric(mx.metric.CustomMetric(l2_norm)) 146 | solver.set_monitor(Monitor(1000)) 147 | data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True, 148 | last_batch_handle='roll_over') 149 | for i in range(self.N): 150 | if i == 0: 151 | data_iter_i = data_iter 152 | else: 153 | X_i = model.extract_feature(self.internals[i-1], self.args, self.auxs, 154 | data_iter, X.shape[0], self.xpu).values()[0] 155 | data_iter_i = mx.io.NDArrayIter({'data': X_i}, batch_size=batch_size, 156 | last_batch_handle='roll_over') 157 | logging.info('Pre-training layer %d...'%i) 158 | solver.solve(self.xpu, self.stacks[i], self.args, self.args_grad, self.auxs, data_iter_i, 159 | 0, n_iter, {}, False) 160 | 161 | def finetune(self, X, R, V, lambda_v_rt, lambda_u, lambda_v, dir_save, batch_size, n_iter, optimizer, l_rate, decay, lr_scheduler=None): 162 | def l2_norm(label, pred): 163 | return np.mean(np.square(label-pred))/2.0 164 | solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate, lr_scheduler=lr_scheduler) 165 | solver.set_metric(mx.metric.CustomMetric(l2_norm)) 166 | solver.set_monitor(Monitor(1000)) 167 | data_iter = mx.io.NDArrayIter({'data': X, 'V': V, 'lambda_v_rt': 168 | lambda_v_rt}, 169 | batch_size=batch_size, shuffle=False, 170 | last_batch_handle='pad') 171 | logging.info('Fine tuning...') 172 | # self.loss is the net 173 | U, V, theta, BCD_loss = solver.solve(X, R, V, lambda_v_rt, lambda_u, 174 | lambda_v, dir_save, batch_size, self.xpu, self.loss, self.args, self.args_grad, self.auxs, data_iter, 175 | 0, n_iter, {}, False) 176 | return U, V, theta, BCD_loss 177 
| 178 | # modified by hog 179 | def eval(self, X, V, lambda_v_rt): 180 | batch_size = 100 181 | data_iter = mx.io.NDArrayIter({'data': X, 'V': V, 'lambda_v_rt': 182 | lambda_v_rt}, 183 | batch_size=batch_size, shuffle=False, 184 | last_batch_handle='pad') 185 | # modified by hog 186 | Y = model.extract_feature(self.loss[1], self.args, self.auxs, data_iter, 187 | X.shape[0], self.xpu).values()[0] 188 | return np.sum(np.square(Y-X))/2.0 189 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 
--------------------------------------------------------------------------------
/python/basic/data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Loading Data\n",
8 |     "\n",
9 |     "In this tutorial we focus on how to feed data into a training or inference program. We can manually copy data into a bound symbol as shown in the [mixed programming](./mixed.ipynb) notebook. 
Most training and inference modules in MXNet accept data iterators, which simplifies this procedure, especially when reading large datasets from filesystems. Here we discuss the API conventions and several provided iterators. \n",
10 |     "\n",
11 |     "## Basic Data Iterator\n",
12 |     "\n",
13 |     "Data iterators in MXNet are similar to iterators in Python. In Python, we can use the built-in function `iter` with an iterable object (such as a list) to return an iterator. For example, in `x = iter([1, 2, 3])` we obtain an iterator on the list `[1,2,3]`. If we repeatedly call `x.next()` (`__next__()` for Python 3), then we will get elements from the list one by one, and end with a `StopIteration` exception. \n",
14 |     "\n",
15 |     "MXNet's data iterator returns a batch of data in each `next` call. We first introduce what a data batch looks like and then how to write a basic data iterator.\n",
16 |     "\n",
17 |     "### Data Batch\n",
18 |     "\n",
19 |     "A data batch often contains *n* examples and the corresponding labels. Here *n* is often called the batch size. \n",
20 |     "\n",
21 |     "The following code defines a valid data batch that can be read by most training/inference modules. "
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 1,
27 |    "metadata": {
28 |     "collapsed": true
29 |    },
30 |    "outputs": [],
31 |    "source": [
32 |     "class SimpleBatch(object):\n",
33 |     "    def __init__(self, data, label, pad=0):\n",
34 |     "        self.data = data\n",
35 |     "        self.label = label\n",
36 |     "        self.pad = pad"
37 |    ]
38 |   },
39 |   {
40 |    "cell_type": "markdown",
41 |    "metadata": {},
42 |    "source": [
43 |     "We explain what each attribute means:\n",
44 |     "\n",
45 |     "- **`data`** is a list of NDArray, each of which has a first dimension of length $n$. For example, if an example is an image with size $224 \times 224$ and RGB channels, then the array shape should be `(n, 3, 224, 224)`. Note that the image batch format used by MXNet is \n",
46 |     "\n",
47 |     "  $$\textrm{batch_size} \times \textrm{num_channel} \times \textrm{height} \times \textrm{width}$$\n",
48 |     "  The channels are often in RGB order.\n",
49 |     "\n",
50 |     "  Each array will be copied into a free variable of the Symbol later. The mapping from arrays to free variables should be given by the `provide_data` attribute of the iterator, which will be discussed shortly. \n",
51 |     "  \n",
52 |     "- **`label`** is also a list of NDArray. Often each NDArray is a 1-dimensional array with shape `(n,)`. For classification, each class is represented by an integer starting from 0.\n",
53 |     "\n",
54 |     "- **`pad`** is an integer showing how many examples are merely used for padding, and these should be ignored in the results. A nonzero pad is often used when we reach the end of the data and the total number of examples is not divisible by the batch size. \n",
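    "\n",
    "For instance, a hypothetical batch of four RGB images of size $224 \times 224$ with integer labels could be built as follows (assuming `mxnet` is imported as `mx`):\n",
    "\n",
    "```python\n",
    "batch = SimpleBatch(data=[mx.nd.zeros((4, 3, 224, 224))],\n",
    "                    label=[mx.nd.zeros((4,))],\n",
    "                    pad=0)\n",
    "```\n",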
\n", 65 | "\n", 66 | "The following codes define a simple multilayer perceptron (MLP) and then print all free variables." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "['data', 'fc1_weight', 'fc1_bias', 'fc2_weight', 'fc2_bias', 'softmax_label']\n", 81 | "['softmax_output']\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "import mxnet as mx\n", 87 | "num_classes = 10\n", 88 | "net = mx.sym.Variable('data')\n", 89 | "net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=64)\n", 90 | "net = mx.sym.Activation(data=net, name='relu1', act_type=\"relu\")\n", 91 | "net = mx.sym.FullyConnected(data=net, name='fc2', num_hidden=num_classes)\n", 92 | "net = mx.sym.SoftmaxOutput(data=net, name='softmax')\n", 93 | "print(net.list_arguments())\n", 94 | "print(net.list_outputs())" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "As can be seen, we name a variable either by its operator's name if it is atomic (e.g. `sym.Variable`) or by the `opname_varname` convention. The `varname` often means what this variable is for:\n", 102 | "- `weight` : the weight parameters\n", 103 | "- `bias` : the bias parameters\n", 104 | "- `output` : the output\n", 105 | "- `label` : input label\n", 106 | "\n", 107 | "On the above example, now we know that there are 4 variables for parameters, and two for input data: `data` for examples and `softmax_label` for the according labels. \n", 108 | "\n", 109 | "The following example define a matrix factorization object function with rank 10 for recommendation systems. It has three input variables, `user` for user IDs, `item` for item IDs, and `score` is the rating `user` gives to `item`. " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 3, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "num_users = 1000\n", 121 | "num_items = 1000\n", 122 | "k = 10 \n", 123 | "user = mx.symbol.Variable('user')\n", 124 | "item = mx.symbol.Variable('item')\n", 125 | "score = mx.symbol.Variable('score')\n", 126 | "# user feature lookup\n", 127 | "user = mx.symbol.Embedding(data = user, input_dim = num_users, output_dim = k) \n", 128 | "# item feature lookup\n", 129 | "item = mx.symbol.Embedding(data = item, input_dim = num_items, output_dim = k)\n", 130 | "# predict by the inner product, which is elementwise product and then sum\n", 131 | "pred = user * item\n", 132 | "pred = mx.symbol.sum_axis(data = pred, axis = 1)\n", 133 | "pred = mx.symbol.Flatten(data = pred)\n", 134 | "# loss layer\n", 135 | "pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### Data Iterators\n", 143 | "\n", 144 | "Now we are ready to show how to create a valid MXNet data iterator. An iterator should \n", 145 | "1. return a data batch or raise a `StopIteration` exception if reaching the end when call `next()` in python 2 or `__next()__` in python 3\n", 146 | "2. has `reset()` method to restart reading from the beginning\n", 147 | "3. has `provide_data` and `provide_label` attributes, the former returns a list of `(str, tuple)` pairs, each pair stores an input data variable name and its shape. 
It is similar for `provide_label`, which provides information about the input labels.\n",
148 | "\n",
149 | "The following code defines a simple iterator that returns some random data each time. "
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 4,
155 | "metadata": {
156 | "collapsed": true
157 | },
158 | "outputs": [],
159 | "source": [
160 | "import numpy as np\n",
161 | "class SimpleIter:\n",
162 | " def __init__(self, data_names, data_shapes, data_gen,\n",
163 | " label_names, label_shapes, label_gen, num_batches=10):\n",
164 | " self._provide_data = zip(data_names, data_shapes)\n",
165 | " self._provide_label = zip(label_names, label_shapes)\n",
166 | " self.num_batches = num_batches\n",
167 | " self.data_gen = data_gen\n",
168 | " self.label_gen = label_gen\n",
169 | " self.cur_batch = 0\n",
170 | "\n",
171 | " def __iter__(self):\n",
172 | " return self\n",
173 | "\n",
174 | " def reset(self):\n",
175 | " self.cur_batch = 0 \n",
176 | "\n",
177 | " def __next__(self):\n",
178 | " return self.next()\n",
179 | "\n",
180 | " @property\n",
181 | " def provide_data(self):\n",
182 | " return self._provide_data\n",
183 | "\n",
184 | " @property\n",
185 | " def provide_label(self):\n",
186 | " return self._provide_label\n",
187 | "\n",
188 | " def next(self):\n",
189 | " if self.cur_batch < self.num_batches:\n",
190 | " self.cur_batch += 1\n",
191 | " data = [mx.nd.array(g(d[1])) for d,g in zip(self._provide_data, self.data_gen)]\n",
192 | " label = [mx.nd.array(g(d[1])) for d,g in zip(self._provide_label, self.label_gen)]\n",
193 | " return SimpleBatch(data, label)\n",
194 | " else:\n",
195 | " raise StopIteration"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "metadata": {
201 | "collapsed": true
202 | },
203 | "source": [
204 | "Now we can feed the data iterator into a training problem. Here we use the `Module` class; more details about this class are discussed in [module.ipynb](./module.ipynb)."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 5,
210 | "metadata": {
211 | "collapsed": false,
212 | "scrolled": true
213 | },
214 | "outputs": [
215 | {
216 | "name": "stderr",
217 | "output_type": "stream",
218 | "text": [
219 | "INFO:root:Epoch[0] Train-accuracy=0.078125\n",
220 | "INFO:root:Epoch[0] Time cost=0.307\n",
221 | "INFO:root:Epoch[1] Train-accuracy=0.103125\n",
222 | "INFO:root:Epoch[1] Time cost=0.291\n",
223 | "INFO:root:Epoch[2] Train-accuracy=0.118750\n",
224 | "INFO:root:Epoch[2] Time cost=0.419\n",
225 | "INFO:root:Epoch[3] Train-accuracy=0.096875\n",
226 | "INFO:root:Epoch[3] Time cost=0.419\n",
227 | "INFO:root:Epoch[4] Train-accuracy=0.109375\n",
228 | "INFO:root:Epoch[4] Time cost=0.415\n"
229 | ]
230 | }
231 | ],
232 | "source": [
233 | "import logging\n",
234 | "logging.basicConfig(level=logging.INFO)\n",
235 | "\n",
236 | "n = 32\n",
237 | "data = SimpleIter(['data'], [(n, 100)], \n",
238 | " [lambda s: np.random.uniform(-1, 1, s)],\n",
239 | " ['softmax_label'], [(n,)], \n",
240 | " [lambda s: np.random.randint(0, num_classes, s)])\n",
241 | "\n",
242 | "mod = mx.mod.Module(symbol=net)\n",
243 | "mod.fit(data, num_epoch=5)"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "For the Symbol `pred`, by contrast, we need to provide three inputs: two for examples and one for the label."
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 6,
256 | "metadata": {
257 | "collapsed": false,
258 | "scrolled": true
259 | },
260 | "outputs": [
261 | {
262 | "name": "stderr",
263 | "output_type": "stream",
264 | "text": [
265 | "INFO:root:Epoch[0] Train-accuracy=0.190625\n",
266 | "INFO:root:Epoch[0] Time cost=0.060\n",
267 | "INFO:root:Epoch[1] Train-accuracy=0.196875\n",
268 | "INFO:root:Epoch[1] Time cost=0.039\n",
269 | "INFO:root:Epoch[2] Train-accuracy=0.203125\n",
270 | "INFO:root:Epoch[2] Time cost=0.036\n",
271 | "INFO:root:Epoch[3] Train-accuracy=0.218750\n",
272 | "INFO:root:Epoch[3] Time cost=0.029\n",
273 | "INFO:root:Epoch[4] Train-accuracy=0.175000\n",
274 | "INFO:root:Epoch[4] Time cost=0.020\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "data = SimpleIter(['user', 'item'],\n",
280 | " [(n,), (n,)],\n",
281 | " [lambda s: np.random.randint(0, num_users, s),\n",
282 | " lambda s: np.random.randint(0, num_items, s)],\n",
283 | " ['score'], [(n,)],\n",
284 | " [lambda s: np.random.randint(0, 5, s)])\n",
285 | "\n",
286 | "mod = mx.mod.Module(symbol=pred, data_names=['user', 'item'], label_names=['score'])\n",
287 | "mod.fit(data, num_epoch=5)"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {},
293 | "source": [
294 | "## More Iterators \n",
295 | "\n",
296 | "MXNet provides multiple efficient data iterators.\n",
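"\n",
"For example, `NDArrayIter` iterates over arrays already loaded in memory, while iterators such as `CSVIter` and `ImageRecordIter` read CSV files and packed image records from disk. Below is a minimal `NDArrayIter` sketch (an illustration, not from the original notebook):\n",
"\n",
"```python\n",
"import numpy as np\n",
"import mxnet as mx\n",
"\n",
"X = np.random.uniform(-1, 1, (100, 100)).astype('float32')\n",
"y = np.random.randint(0, 10, (100,))\n",
"train_iter = mx.io.NDArrayIter(data=X, label=y, batch_size=32,\n",
"                               shuffle=True, last_batch_handle='pad')\n",
"for batch in train_iter:\n",
"    # 100 examples with batch size 32: the last batch reports pad=28\n",
"    print(batch.data[0].shape, batch.label[0].shape, batch.pad)\n",
"```\n",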
297 | "\n",
300 | "## Implementation\n",
301 | "\n",
302 | "Iterators can be implemented in either C++ or front-end languages such as Python. The C++ definition is at [include/mxnet/io.h](https://github.com/dmlc/mxnet/blob/master/include/mxnet/io.h), and all C++ implementations are located in [src/io](https://github.com/dmlc/mxnet/tree/master/src/io). These implementations rely heavily on [dmlc-core](https://github.com/dmlc/dmlc-core), which supports reading data from various data formats and filesystems. \n",
303 | "\n",
304 | "\n",
305 | "## Further Readings\n",
306 | "\n",
307 | "- [Data loading API](http://mxnet.io/packages/python/io.html)\n",
308 | "- [Design of efficient data format](http://mxnet.io/system/note_data_loading.html)"
309 | ]
310 | }
311 | ],
312 | "metadata": {
313 | "kernelspec": {
314 | "display_name": "Python 2",
315 | "language": "python",
316 | "name": "python2"
317 | },
318 | "language_info": {
319 | "codemirror_mode": {
320 | "name": "ipython",
321 | "version": 2
322 | },
323 | "file_extension": ".py",
324 | "mimetype": "text/x-python",
325 | "name": "python",
326 | "nbconvert_exporter": "python",
327 | "pygments_lexer": "ipython2",
328 | "version": "2.7.6"
329 | }
330 | },
331 | "nbformat": 4,
332 | "nbformat_minor": 1
333 | }
334 |
--------------------------------------------------------------------------------
/python/basic/module.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# Training and Inference Module\n",
10 | "\n",
11 | "In [mixed.ipynb](./mixed.ipynb) we showed how to develop a training program by using `NDArray` and `Symbol` together. To avoid writing such programs again and again, we modularized the commonly used code in the `module` (or `mod` for short) package. This package provides intermediate-level and high-level interfaces for executing predefined networks. 
\n", 12 | "\n", 13 | "\n", 14 | "## Basic Usage\n", 15 | "\n", 16 | "### Preliminary\n", 17 | "\n", 18 | "In this tutorial, we will use a simple multilayer perception for 10 classes and a synthetic dataset. " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import mxnet as mx\n", 30 | "from data_iter import SyntheticData\n", 31 | "\n", 32 | "# mlp\n", 33 | "net = mx.sym.Variable('data')\n", 34 | "net = mx.sym.FullyConnected(net, name='fc1', num_hidden=64)\n", 35 | "net = mx.sym.Activation(net, name='relu1', act_type=\"relu\")\n", 36 | "net = mx.sym.FullyConnected(net, name='fc2', num_hidden=10)\n", 37 | "net = mx.sym.SoftmaxOutput(net, name='softmax')\n", 38 | "\n", 39 | "# synthetic 10 classes dataset with 128 dimension \n", 40 | "data = SyntheticData(10, 128)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Create Module\n", 48 | "\n", 49 | "The most widely used module class is `Module`, which wraps a `Symbol` and one or more `Executor`s.\n", 50 | "\n", 51 | "We construct a module by specify\n", 52 | "\n", 53 | "- symbol : the network Symbol\n", 54 | "- context : the device (or a list of devices) for execution\n", 55 | "- data_names : the list of data variable names \n", 56 | "- label_names : the list of label variable names\n", 57 | "\n", 58 | "One can refer to [data.ipynb](./data.ipynb) for more explanations about the last two arguments. Here we have only one data named `data`, and one label, with the name `softmax_label`, which is automatically named for us following the name `softmax` we specified for the `SoftmaxOutput` operator." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "mod = mx.mod.Module(symbol=net, \n", 70 | " context=mx.cpu(),\n", 71 | " data_names=['data'], \n", 72 | " label_names=['softmax_label'])" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Train, Predict, and Evaluate\n", 80 | "\n", 81 | "Modules provide high-level APIs for training, predicting and evaluating. To fit a module, simply call the `fit` function with some `DataIters`. 
" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [ 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | "INFO:root:Epoch[0] Train-accuracy=0.131250\n", 96 | "INFO:root:Epoch[0] Time cost=0.028\n", 97 | "INFO:root:Epoch[0] Validation-accuracy=0.209375\n", 98 | "INFO:root:Epoch[1] Train-accuracy=0.153125\n", 99 | "INFO:root:Epoch[1] Time cost=0.039\n", 100 | "INFO:root:Epoch[1] Validation-accuracy=0.278125\n", 101 | "INFO:root:Epoch[2] Train-accuracy=0.203125\n", 102 | "INFO:root:Epoch[2] Time cost=0.029\n", 103 | "INFO:root:Epoch[2] Validation-accuracy=0.093750\n", 104 | "INFO:root:Epoch[3] Train-accuracy=0.137500\n", 105 | "INFO:root:Epoch[3] Time cost=0.029\n", 106 | "INFO:root:Epoch[3] Validation-accuracy=0.465625\n", 107 | "INFO:root:Epoch[4] Train-accuracy=0.400000\n", 108 | "INFO:root:Epoch[4] Time cost=0.029\n", 109 | "INFO:root:Epoch[4] Validation-accuracy=0.462500\n" 110 | ] 111 | }, 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "Finished training\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "import logging\n", 122 | "logging.basicConfig(level=logging.INFO)\n", 123 | "\n", 124 | "batch_size=32\n", 125 | "mod.fit(data.get_iter(batch_size), \n", 126 | " eval_data=data.get_iter(batch_size),\n", 127 | " optimizer='sgd',\n", 128 | " optimizer_params={'learning_rate':0.1},\n", 129 | " eval_metric='acc',\n", 130 | " num_epoch=5)\n", 131 | "print('Finished training')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "To predict with a module, simply call `predict()` with a `DataIter`. It will collect and return all the prediction results." 
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 4,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "shape of predict: (320L, 10L)\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "y = mod.predict(data.get_iter(batch_size))\n",
158 | "print('shape of predict: %s' % (y.shape,))"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "If the prediction results might be too large to fit in memory, `iter_predict` is a convenient alternative API:"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 5,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "batch 0, accuracy 0.437500\n",
180 | "batch 1, accuracy 0.531250\n",
181 | "batch 2, accuracy 0.250000\n",
182 | "batch 3, accuracy 0.500000\n",
183 | "batch 4, accuracy 0.406250\n",
184 | "batch 5, accuracy 0.531250\n",
185 | "batch 6, accuracy 0.593750\n",
186 | "batch 7, accuracy 0.468750\n",
187 | "batch 8, accuracy 0.531250\n",
188 | "batch 9, accuracy 0.312500\n"
189 | ]
190 | }
191 | ],
192 | "source": [
193 | "for preds, i_batch, batch in mod.iter_predict(data.get_iter(batch_size)):\n",
194 | " pred_label = preds[0].asnumpy().argmax(axis=1)\n",
195 | " label = batch.label[0].asnumpy().astype('int32')\n",
196 | " print('batch %d, accuracy %f' % (i_batch, float(sum(pred_label==label))/len(label)))"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "If we do not need the prediction outputs, but just need to evaluate on a test set, we can call the `score()` function with a `DataIter` and an `EvalMetric`:"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 6,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/plain": [
216 | "[('mse', 28.235007095336915), ('accuracy', 0.4625)]"
217 | ]
218 | },
219 | "execution_count": 6,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "mod.score(data.get_iter(batch_size), ['mse', 'acc'])"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "### Save and Load\n",
233 | "\n",
234 | "We can save the module parameters in each training epoch by using a checkpoint callback." 
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 7,
240 | "metadata": {
241 | "collapsed": false
242 | },
243 | "outputs": [
244 | {
245 | "name": "stderr",
246 | "output_type": "stream",
247 | "text": [
248 | "INFO:root:Epoch[0] Train-accuracy=0.081250\n",
249 | "INFO:root:Epoch[0] Time cost=0.031\n",
250 | "INFO:root:Saved checkpoint to \"mx_mlp-0001.params\"\n",
251 | "INFO:root:Epoch[1] Train-accuracy=0.084375\n",
252 | "INFO:root:Epoch[1] Time cost=0.036\n",
253 | "INFO:root:Saved checkpoint to \"mx_mlp-0002.params\"\n",
254 | "INFO:root:Epoch[2] Train-accuracy=0.075000\n",
255 | "INFO:root:Epoch[2] Time cost=0.029\n",
256 | "INFO:root:Saved checkpoint to \"mx_mlp-0003.params\"\n",
257 | "INFO:root:Epoch[3] Train-accuracy=0.106250\n",
258 | "INFO:root:Epoch[3] Time cost=0.030\n",
259 | "INFO:root:Saved checkpoint to \"mx_mlp-0004.params\"\n",
260 | "INFO:root:Epoch[4] Train-accuracy=0.090625\n",
261 | "INFO:root:Epoch[4] Time cost=0.031\n",
262 | "INFO:root:Saved checkpoint to \"mx_mlp-0005.params\"\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "# construct a callback function to save checkpoints\n",
268 | "model_prefix = 'mx_mlp'\n",
269 | "checkpoint = mx.callback.do_checkpoint(model_prefix)\n",
270 | "\n",
271 | "mod = mx.mod.Module(symbol=net)\n",
272 | "mod.fit(data.get_iter(batch_size), num_epoch=5, epoch_end_callback=checkpoint)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "To load the saved module parameters, call the `load_checkpoint` function. It loads the Symbol and the associated parameters. We can then assign the loaded parameters to the module. "
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 8,
285 | "metadata": {
286 | "collapsed": false
287 | },
288 | "outputs": [
289 | {
290 | "name": "stdout",
291 | "output_type": "stream",
292 | "text": [
293 | "True\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3)\n",
299 | "print(sym.tojson() == net.tojson())\n",
300 | "\n",
301 | "# assign the loaded parameters to the module\n",
302 | "mod.set_params(arg_params, aux_params)"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "Or if we just want to resume training from a saved checkpoint, instead of calling `set_params()`, we can directly call `fit()`, passing the loaded parameters, so that `fit()` knows to start from those parameters instead of initializing them randomly. 
We also set `begin_epoch` so that `fit()` knows we are resuming from a previously saved epoch.\n",
310 | " "
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 9,
316 | "metadata": {
317 | "collapsed": false
318 | },
319 | "outputs": [
320 | {
321 | "name": "stderr",
322 | "output_type": "stream",
323 | "text": [
324 | "INFO:root:Epoch[3] Train-accuracy=0.078125\n",
325 | "INFO:root:Epoch[3] Time cost=0.029\n",
326 | "INFO:root:Epoch[4] Train-accuracy=0.125000\n",
327 | "INFO:root:Epoch[4] Time cost=0.030\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "mod = mx.mod.Module(symbol=sym)\n",
333 | "mod.fit(data.get_iter(batch_size),\n",
334 | " num_epoch=5,\n",
335 | " arg_params=arg_params, \n",
336 | " aux_params=aux_params,\n",
337 | " begin_epoch=3)"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "## Module as a computation \"machine\"\n",
345 | "\n",
346 | "We have already seen how to use modules for basic training and inference. Now we are going to show a more flexible usage of the module.\n",
347 | "\n",
348 | "A module represents a computation component. The design purpose of a module is to abstract a computation “machine” that accepts `Symbol` programs and data, on which we can run forward and backward passes, update parameters, and so on. \n",
349 | "\n",
350 | "We aim to make the APIs easy and flexible to use, especially when we need to use the imperative API to work with multiple modules (e.g., a stochastic depth network).\n",
351 | "\n",
352 | "A module has several states:\n",
353 | "- **Initial state**. Memory is not allocated yet; the module is not ready for computation.\n",
354 | "- **Binded**. Shapes of inputs, outputs, and parameters are all known, memory is allocated, and the module is ready for computation.\n",
355 | "- **Parameter initialized**. For modules with parameters, doing computation before initializing the parameters might result in undefined outputs.\n",
356 | "- **Optimizer installed**. An optimizer can be installed to a module. After this, the parameters of the module can be updated according to the optimizer after gradients are computed (forward-backward).\n",
357 | "\n",
358 | "The following code implements a simplified `fit()`. It uses other components, including the initializer, optimizer, and metric, which are explained in other notebooks."
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 10,
364 | "metadata": {
365 | "collapsed": false
366 | },
367 | "outputs": [
368 | {
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "('accuracy', 0.3875)\n"
373 | ]
374 | }
375 | ],
376 | "source": [
377 | "# initial state\n",
378 | "mod = mx.mod.Module(symbol=net)\n",
379 | "\n",
380 | "# bind, tell the module the data and label shapes, so\n",
381 | "# that memory could be allocated on the devices for computation\n",
382 | "train_iter = data.get_iter(batch_size)\n",
383 | "mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label)\n",
384 | "\n",
385 | "# init parameters\n",
386 | "mod.init_params(initializer=mx.init.Xavier(magnitude=2.))\n",
387 | "\n",
388 | "# init optimizer\n",
389 | "mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.1), ))\n",
390 | "\n",
391 | "# use accuracy as the metric\n",
392 | "metric = mx.metric.create('acc')\n",
393 | "\n",
394 | "# train one epoch, i.e. 
make one pass over the data iterator\n",
395 | "for batch in train_iter:\n",
396 | " mod.forward(batch, is_train=True) # compute predictions\n",
397 | " mod.update_metric(metric, batch.label) # accumulate prediction accuracy\n",
398 | " mod.backward() # compute gradients\n",
399 | " mod.update() # update parameters using SGD\n",
400 | " \n",
401 | "# training accuracy\n",
402 | "print(metric.get())"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "Besides these operations, a module provides a lot of useful information.\n",
410 | "\n",
411 | "basic names:\n",
412 | "- **data_names**: list of strings indicating the names of the required data.\n",
413 | "- **output_names**: list of strings indicating the names of the outputs.\n",
414 | "\n",
415 | "state information\n",
416 | "- **binded**: bool, indicating whether the memory buffers needed for computation have been allocated.\n",
417 | "- **for_training**: whether the module is binded for training (if binded).\n",
418 | "- **params_initialized**: bool, indicating whether the parameters of this module have been initialized.\n",
419 | "- **optimizer_initialized**: bool, indicating whether an optimizer is defined and initialized.\n",
420 | "- **inputs_need_grad**: bool, indicating whether gradients with respect to the input data are needed. This might be useful when composing modules.\n",
421 | "\n",
422 | "input/output information\n",
423 | "- **data_shapes**: a list of (name, shape). In theory, since the memory is allocated, we could directly provide the data arrays. But in the case of data parallelization, the data arrays might not be of the same shape as viewed from the external world.\n",
424 | "- **label_shapes**: a list of (name, shape). This might be [] if the module does not need labels (e.g. it does not contain a loss function at the top), or if the module is not binded for training.\n",
425 | "- **output_shapes**: a list of (name, shape) for the outputs of the module.\n",
426 | "\n",
427 | "parameters (for modules with parameters)\n",
428 | "- **get_params()**: returns a tuple `(arg_params, aux_params)`, each a dictionary mapping names to NDArrays. Those NDArrays always live on the CPU, while the actual parameters used for computation might live on other devices (GPUs); this function retrieves (a copy of) the latest parameters.\n",
429 | "- **get_outputs()**: get the outputs of the previous forward operation.\n",
430 | "- **get_input_grads()**: get the gradients with respect to the inputs computed in the previous backward operation.\n"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 11,
436 | "metadata": {
437 | "collapsed": false
438 | },
439 | "outputs": [
440 | {
441 | "name": "stdout",
442 | "output_type": "stream",
443 | "text": [
444 | "([('data', (32, 128))], [('softmax_label', (32,))], [('softmax_output', (32, 10L))])\n",
445 | "({'fc2_bias': <NDArray 10 @cpu(0)>, 'fc2_weight': <NDArray 10x64 @cpu(0)>, 'fc1_bias': <NDArray 64 @cpu(0)>, 'fc1_weight': <NDArray 64x128 @cpu(0)>}, {})\n"
446 | ]
447 | }
448 | ],
449 | "source": [
450 | "print((mod.data_shapes, mod.label_shapes, mod.output_shapes))\n",
451 | "print(mod.get_params())"
452 | ]
453 | },
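{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside, the same bound module can also be run forward-only, e.g. for inference. A minimal sketch (an illustration, not part of the original notebook) using the iterator and module from above:\n",
"\n",
"```python\n",
"train_iter.reset()                  # rewind the synthetic iterator\n",
"batch = train_iter.next()\n",
"mod.forward(batch, is_train=False)  # forward pass only, no gradients\n",
"print(mod.get_outputs()[0].shape)   # (32, 10): one softmax row per example\n",
"```"
]
},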
For example\n", 461 | "\n", 462 | "- [`SequentialModule`](https://github.com/dmlc/mxnet/blob/master/python/mxnet/module/sequential_module.py) can chain multiple modules together\n", 463 | "- [`BucketingModule`](https://github.com/dmlc/mxnet/blob/master/python/mxnet/module/bucketing_module.py) is able to handle bucketing, which is useful for various length inputs and outputs\n", 464 | "- [`PythonModule`](https://github.com/dmlc/mxnet/blob/master/python/mxnet/module/python_module.py) implements many APIs as empty function to ease users to implement customized modules. \n", 465 | "\n", 466 | "See also [example/module](https://github.com/dmlc/mxnet/tree/master/example/module) for a list of code examples using the module API.\n", 467 | "\n", 468 | "## Implementation\n", 469 | "\n", 470 | "The `module` is implemented in python, located at [python/mxnet/module](https://github.com/dmlc/mxnet/tree/master/python/mxnet/module) \n", 471 | "\n", 472 | "## Futher Readings\n", 473 | "\n", 474 | "- [module API doc](http://mxnet.io/packages/python/module.html#module-interface-api)" 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "anaconda-cloud": {}, 480 | "kernelspec": { 481 | "display_name": "Python [default]", 482 | "language": "python", 483 | "name": "python2" 484 | }, 485 | "language_info": { 486 | "codemirror_mode": { 487 | "name": "ipython", 488 | "version": 2 489 | }, 490 | "file_extension": ".py", 491 | "mimetype": "text/x-python", 492 | "name": "python", 493 | "nbconvert_exporter": "python", 494 | "pygments_lexer": "ipython2", 495 | "version": "2.7.12" 496 | } 497 | }, 498 | "nbformat": 4, 499 | "nbformat_minor": 1 500 | } 501 | -------------------------------------------------------------------------------- /python/recommendation_systems/cdl/collaborative-dl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MXNet for Collaborative Deep Learning in Recommender Systems\n", 8 | "In this tutorial, we build on MXNet to implement the Collaborative Deep Learning (CDL) [1] model for recommender systems.\n", 9 | "\n", 10 | "## Brief Introduction of CDL\n", 11 | "\n", 12 | "In CDL, a probabilistic stacked denoising autoencoder (pSDAE) is connected to a matrix factorization (MF) component. Model training will alternate between pSDAE and MF. In each epoch, a pSDAE with a reconstruction target at the end and a regression target in the bottleneck will be udpated before updating the latent factors U and V in the regularized MF.\n", 13 | "\n", 14 | "Below is the graphical model for CDL. The part in the red rectangle is pSDAE and the rest is the MF component regularized by pSDAE. 
Essentially, the updates alternate between pSDAE (updating $W^+$) and the MF component (updating $u$ and $v$).\n",
15 | "### Some Notation:\n",
16 | "- $x_0$: the input vectors to pSDAE (corrupted data, obtained e.g. by randomly deleting some entries of the input)\n",
17 | "- $x_c$: the reconstruction target vectors (the uncorrupted data)\n",
18 | "- $x_{L/2}$: the output vectors of pSDAE's middle layer (bottleneck layer)\n",
19 | "- $X_0$: the matrix consisting of the vectors $x_0$\n",
20 | "- $X_c$: the matrix consisting of the vectors $x_c$\n",
21 | "- $W^+$: weights and biases of pSDAE\n",
22 | "- $v$: latent item vectors\n",
23 | "- $u$: latent user vectors\n",
24 | "- $R$: rating matrix ('1' if the article is in the user's library and '0' otherwise)\n",
25 | "- $I$: number of users\n",
26 | "- $J$: number of items\n",
27 | "- $\\lambda_u$, $\\lambda_v$, $\\lambda_w$, $\\lambda_n$: hyperparameters\n",
28 | "\n",
29 | "![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/cdl/PGM-CDL.png)\n",
30 | "\n",
31 | "Below we show a special case of CDL (from a neural network perspective), where it degenerates to simultaneously training two neural networks overlaid together with a common input layer (the corrupted input) but different output layers. This might be a lot easier to understand for people not familiar with graphical models. \n",
32 | "\n",
33 | "![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/cdl/NN-CDL.png)\n",
34 | "\n",
35 | "The objective function (which we use in this implementation) for this special case is:\n",
36 | "\\begin{align}\n",
37 | "\\mathscr{L}=&-\\frac{\\lambda_u}{2}\\sum\\limits_i \\|u_i\\|_2^2\n",
38 | "-\\frac{\\lambda_w}{2}\\sum\\limits_l(\\|W_l\\|_F^2+\\|b_l\\|_2^2)\\nonumber \\\\\n",
39 | "&-\\frac{\\lambda_v}{2}\\sum\\limits_j\\|v_j-f_e(X_{0,j*},W^+)^T\\|_2^2 \\nonumber \\\\\n",
40 | "&-\\frac{\\lambda_n}{2}\\sum\\limits_j\\|f_r(X_{0,j*},W^+)-X_{c,j*}\\|_2^2 \\nonumber \\\\\n",
41 | "&-\\sum\\limits_{i,j}\\frac{C_{ij}}{2}(R_{ij}-u_i^Tv_j)^2,\n",
42 | "\\end{align}\n",
43 | "where the encoder function $f_e(\\cdot,W^+)$ takes the corrupted content vector $X_{0,j*}$ of item $j$ as input and computes the encoding of the item, and the function $f_r(\\cdot,W^+)$ also takes $X_{0,j*}$ as input, computes the encoding and then the reconstructed content vector of item $j$. Here $\\lambda_w$, $\\lambda_n$, $\\lambda_u$, and $\\lambda_v$ are hyperparameters and $C_{ij}$ is a confidence parameter ($C_{ij} = a$ if $R_{ij}=1$ and $C_{ij}=b$ otherwise). For example, if the number of layers $L=6$, $f_e(X_{0,j*},W^+)$ is the output of the third layer while $f_r(X_{0,j*},W^+)$ is the output of the sixth layer.\n",
44 | "\n",
45 | "To learn CDL, we have to implement the block coordinate descent (BCD) update using numpy/mshadow and call this BCD procedure after each epoch of pSDAE. Besides the MF part, another difference between CDL and conventional deep learning models is that pSDAE has a fixed target at the end and a dynamic target (the latent item factors V) in the bottleneck layer. It takes some hacking to make this work.\n",
46 | "\n",
47 | "[1] H. Wang, N. Wang, and D. Yeung. Collaborative deep learning for recommender systems. In KDD, 2015.\n",
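"\n",
"To make the BCD step concrete, here is a minimal NumPy sketch of the closed-form update for a single user vector $u_i$ (an illustration with a hypothetical helper `update_user`, not the code used by this notebook); it follows from setting the gradient of the objective with respect to $u_i$ to zero:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"def update_user(R_i, V, a=1.0, b=0.01, lambda_u=1.0):\n",
"    # R_i: (J,) implicit ratings of one user; V: (J, K) latent item factors\n",
"    C_i = b + (a - b) * (R_i > 0)      # confidence: a if observed, b otherwise\n",
"    A = V.T.dot(C_i[:, None] * V) + lambda_u * np.eye(V.shape[1])\n",
"    return np.linalg.solve(A, V.T.dot(C_i * R_i))   # u_i = A^{-1} V^T C_i R_i\n",
"```\n",
"\n",
"The item update is analogous, with an extra term pulling $v_j$ towards the pSDAE encoding $f_e(X_{0,j*},W^+)$."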
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "## Implementing CDL in MXNet for Recommender Systems"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 1,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "import mxnet as mx\n",
66 | "import numpy as np\n",
67 | "import logging\n",
68 | "import data\n",
69 | "from math import sqrt\n",
70 | "from autoencoder import AutoEncoderModel\n",
71 | "import os"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "### Setting Hyperparameters\n",
79 | "- lambda_u: regularization coefficient for the user latent matrix U\n",
80 | "- lambda_v: regularization coefficient for the item latent matrix V\n",
81 | "- K: number of latent factors\n",
82 | "- is_dummy: whether to use a dummy dataset for demo\n",
83 | "- num_iter: number of iterations (minibatches) to train (an epoch on this dataset takes about 68 iterations)\n",
84 | "- batch_size: minibatch size\n",
85 | "- dir_save: directory to save training results\n",
86 | "- lv: lambda_v/lambda_n in CDL; this controls the trade-off between reconstruction error in pSDAE and recommendation accuracy during training"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 2,
92 | "metadata": {
93 | "collapsed": true
94 | },
95 | "outputs": [],
96 | "source": [
97 | "lambda_u = 1 # lambda_u in CDL\n",
98 | "lambda_v = 10 # lambda_v in CDL\n",
99 | "K = 50\n",
100 | "p = 1\n",
101 | "is_dummy = False\n",
102 | "num_iter = 100 # about 68 iterations/epoch, the recommendation results at the end need 100 epochs\n",
103 | "batch_size = 256\n",
104 | "\n",
105 | "np.random.seed(1234) # set seed\n",
106 | "lv = 1e-2 # lambda_v/lambda_n in CDL\n",
107 | "dir_save = 'cdl%d' % p"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "Create the directory and the log file."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 3,
120 | "metadata": {
121 | "collapsed": false
122 | },
123 | "outputs": [
124 | {
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "p1: lambda_v/lambda_u/ratio/K: 10.000000/1.000000/0.010000/50\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "if not os.path.isdir(dir_save):\n",
134 | " os.system('mkdir %s' % dir_save)\n",
135 | "fp = open(dir_save+'/cdl.log','w')\n",
136 | "print 'p%d: lambda_v/lambda_u/ratio/K: %f/%f/%f/%d' % (p,lambda_v,lambda_u,lv,K)\n",
137 | "fp.write('p%d: lambda_v/lambda_u/ratio/K: %f/%f/%f/%d\\n' % \\\n",
138 | " (p,lambda_v,lambda_u,lv,K))\n",
139 | "fp.close()"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### Loading Data\n",
147 | "Here we load the text information (as input to pSDAE) in the file mult.dat and the rating matrix (as input for the MF part) in the file cf-train-1-users.dat. Code for loading the data is packed in data.py.\n",
148 | "\n",
149 | "We use the CiteULike dataset here. The input text is bag-of-words vectors normalized to [0,1]. 
Some details:\n",
150 | "- task: recommend articles to users\n",
151 | "- number of users: 5551\n",
152 | "- number of items: 16980\n",
153 | "- number of ratings for training: ~169800\n",
154 | "- number of terms: 8000"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 4,
160 | "metadata": {
161 | "collapsed": false
162 | },
163 | "outputs": [],
164 | "source": [
165 | "# download data\n",
166 | "import os\n",
167 | "data_url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/cdl'\n",
168 | "for filename in ('mult.dat', 'cf-train-1-users.dat', 'cf-test-1-users.dat', 'raw-data.csv'):\n",
169 | " if not os.path.exists(filename):\n",
170 | " os.system(\"wget %s/%s\" % (data_url, filename))\n",
171 | "# read data\n",
172 | "X = data.get_mult()\n",
173 | "R = data.read_user()"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "### Network Definition\n",
181 | "Here we define the logging level and construct the network. As mentioned before, pSDAE has multiple targets, so we use `mx.symbol.Group` to group both losses. The code snippet is shown below; refer to [autoencoder.py](autoencoder.py) for more details. \n",
182 | "\n",
183 | "```python\n",
184 | "fe_loss = mx.symbol.LinearRegressionOutput(\n",
185 | " data=self.lambda_v_rt*self.encoder,\n",
186 | " label=self.lambda_v_rt*self.V)\n",
187 | "fr_loss = mx.symbol.LinearRegressionOutput(\n",
188 | " data=self.decoder, label=self.data)\n",
189 | "loss = mx.symbol.Group([fe_loss, fr_loss]) \n",
190 | "```"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 5,
196 | "metadata": {
197 | "collapsed": false
198 | },
199 | "outputs": [],
200 | "source": [
201 | "logging.basicConfig(level=logging.INFO)\n",
202 | "cdl_model = AutoEncoderModel(mx.cpu(2), [X.shape[1],100,K],\n",
203 | " pt_dropout=0.2, internal_act='relu', output_act='relu')"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "### Initializing Variables\n",
211 | "Here we initialize several variables. V is the latent item matrix and lambda_v_rt is an ndarray with entries equal to sqrt(lv). We need lambda_v_rt to hack the trade-off between the two targets in pSDAE: scaling both the input and the label of the bottleneck regression target by sqrt(lv) multiplies that squared-error term by lv."
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 6,
217 | "metadata": {
218 | "collapsed": true
219 | },
220 | "outputs": [],
221 | "source": [
222 | "train_X = X\n",
223 | "V = np.random.rand(train_X.shape[0],K)/10\n",
224 | "lambda_v_rt = np.ones((train_X.shape[0],K))*sqrt(lv)"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Training the CDL\n",
232 | "Train the whole CDL (joint training of pSDAE and the connected MF). We use SGD for pSDAE and BCD for the MF part. U is the user latent matrix, V is the item latent matrix, theta is the output of pSDAE's middle layer, and BCD_loss equals rating_loss + reg_loss_for_U + reg_loss_for_V. For demonstration we train for only 100 iterations (about 1.5 epochs) here. The recommendations shown later are results after 100 epochs.\n",
233 | "\n",
234 | "The function finetune below calls the function `solve` in solver.py, where the customized training loop resides. In this loop, we run the following code after each epoch of pSDAE to update U and V using BCD. The BCD updating procedure is wrapped up in the function BCD_one. 
Note that after each epoch, we update U and V for only one iteration.\n",
235 | "\n",
236 | "```python\n",
237 | "theta = model.extract_feature(sym[0], args, auxs,\n",
238 | " data_iter, X.shape[0], xpu).values()[0]\n",
239 | "# update U, V and get BCD loss\n",
240 | "U, V, BCD_loss = BCD_one(R, U, V, theta,\n",
241 | " lambda_u, lambda_v, dir_save, True)\n",
242 | "# get recon' loss\n",
243 | "Y = model.extract_feature(sym[1], args, auxs,\n",
244 | " data_iter, X.shape[0], xpu).values()[0]\n",
245 | "Recon_loss = lambda_v/np.square(lambda_v_rt_old[0,0])*np.sum(np.square(Y-X))/2.0\n",
246 | "lambda_v_rt[:] = lambda_v_rt_old[:] # back to normal lambda_v_rt\n",
247 | "data_iter = mx.io.NDArrayIter({'data': X, 'V': V, 'lambda_v_rt':\n",
248 | " lambda_v_rt},\n",
249 | " batch_size=batch_size, shuffle=False,\n",
250 | " last_batch_handle='pad')\n",
251 | "data_iter.reset()\n",
252 | "batch = data_iter.next()\n",
253 | "```"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 7,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [
263 | {
264 | "name": "stderr",
265 | "output_type": "stream",
266 | "text": [
267 | "INFO:root:Fine tuning...\n",
268 | "INFO:root:Iter:0 metric:0.001668\n"
269 | ]
270 | },
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "Epoch 1 - tr_err/bcd_err/rec_err: 53641376.1/27755.0/53613621.1\n"
276 | ]
277 | }
278 | ],
279 | "source": [
280 | "U, V, theta, BCD_loss = cdl_model.finetune(train_X, R, V, lambda_v_rt, lambda_u,\n",
281 | " lambda_v, dir_save, batch_size,\n",
282 | " num_iter, 'sgd', l_rate=0.1, decay=0.0,\n",
283 | " lr_scheduler=mx.misc.FactorScheduler(20000,0.1))"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "### Saving Models and Parameters\n",
291 | "Save the network (pSDAE) parameters, latent matrices, and middle-layer output."
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": [
302 | "cdl_model.save(dir_save+'/cdl_pt.arg')\n",
303 | "np.savetxt(dir_save+'/final-U.dat.demo',U,fmt='%.5f',comments='')\n",
304 | "np.savetxt(dir_save+'/final-V.dat.demo',V,fmt='%.5f',comments='')\n",
305 | "np.savetxt(dir_save+'/final-theta.dat.demo',theta,fmt='%.5f',comments='')"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "### Computing Training Error\n",
313 | "The training loss consists of the loss in pSDAE and that in MF."
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 9,
319 | "metadata": {
320 | "collapsed": false
321 | },
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "Training error: 53629559.864\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "Recon_loss = lambda_v/lv*cdl_model.eval(train_X,V,lambda_v_rt)\n",
333 | "print \"Training error: %.3f\" % (BCD_loss+Recon_loss)\n",
334 | "fp = open(dir_save+'/cdl.log','a')\n",
335 | "fp.write(\"Training error: %.3f\\n\" % (BCD_loss+Recon_loss))\n",
336 | "fp.close()"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "### Generating Recommendations\n",
344 | "Load the latent matrices (U and V), compute the predicted ratings R=UV^T, and generate a recommendation list for each user. There are 5551 users in the dataset.\n",
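"\n",
"Once `rec-list.dat` has been written by the cell below, a metric such as recall@M can be computed from the saved hit counts. A minimal sketch (`recall_at_m` is a hypothetical helper, not one of the original scripts):\n",
"\n",
"```python\n",
"import numpy as np\n",
"from data import read_user\n",
"\n",
"def recall_at_m(dir_save, f_test='cf-test-1-users.dat'):\n",
"    R_true = read_user(f_test)\n",
"    recalls = []\n",
"    for i, line in enumerate(open(dir_save + '/rec-list.dat')):\n",
"        cnt_hit = int(line.split(':')[0])     # hit count was saved as 'cnt:...'\n",
"        num_true = (R_true[i, :] > 0).sum()   # held-out positives for user i\n",
"        if num_true > 0:\n",
"            recalls.append(float(cnt_hit) / num_true)\n",
"    return np.mean(recalls)\n",
"```"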
345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 10, 350 | "metadata": { 351 | "collapsed": false, 352 | "scrolled": true 353 | }, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "User 100\n", 360 | "User 200\n", 361 | "User 300\n", 362 | "User 400\n", 363 | "User 500\n", 364 | "User 600\n", 365 | "User 700\n", 366 | "User 800\n", 367 | "User 900\n", 368 | "User 1000\n", 369 | "User 1100\n", 370 | "User 1200\n", 371 | "User 1300\n", 372 | "User 1400\n", 373 | "User 1500\n", 374 | "User 1600\n", 375 | "User 1700\n", 376 | "User 1800\n", 377 | "User 1900\n", 378 | "User 2000\n", 379 | "User 2100\n", 380 | "User 2200\n", 381 | "User 2300\n", 382 | "User 2400\n", 383 | "User 2500\n", 384 | "User 2600\n", 385 | "User 2700\n", 386 | "User 2800\n", 387 | "User 2900\n", 388 | "User 3000\n", 389 | "User 3100\n", 390 | "User 3200\n", 391 | "User 3300\n", 392 | "User 3400\n", 393 | "User 3500\n", 394 | "User 3600\n", 395 | "User 3700\n", 396 | "User 3800\n", 397 | "User 3900\n", 398 | "User 4000\n", 399 | "User 4100\n", 400 | "User 4200\n", 401 | "User 4300\n", 402 | "User 4400\n", 403 | "User 4500\n", 404 | "User 4600\n", 405 | "User 4700\n", 406 | "User 4800\n", 407 | "User 4900\n", 408 | "User 5000\n", 409 | "User 5100\n", 410 | "User 5200\n", 411 | "User 5300\n", 412 | "User 5400\n", 413 | "User 5500\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "import numpy as np\n", 419 | "from data import read_user\n", 420 | "def cal_rec(p,cut):\n", 421 | " R_true = read_user('cf-test-1-users.dat')\n", 422 | " dir_save = 'cdl'+str(p)\n", 423 | " #U = np.mat(np.loadtxt(dir_save+'/final-U.dat'))\n", 424 | " #V = np.mat(np.loadtxt(dir_save+'/final-V.dat'))\n", 425 | " R = U*V.T\n", 426 | " num_u = R.shape[0]\n", 427 | " num_hit = 0\n", 428 | " fp = open(dir_save+'/rec-list.dat','w')\n", 429 | " for i in range(num_u):\n", 430 | " if i!=0 and i%100==0:\n", 431 | " print 'User '+str(i)\n", 432 | " l_score = R[i,:].A1.tolist()\n", 433 | " pl = sorted(enumerate(l_score),key=lambda d:d[1],reverse=True)\n", 434 | " l_rec = list(zip(*pl)[0])[:cut]\n", 435 | " s_rec = set(l_rec)\n", 436 | " s_true = set(np.where(R_true[i,:]>0)[1].A1)\n", 437 | " cnt_hit = len(s_rec.intersection(s_true))\n", 438 | " fp.write('%d:' % cnt_hit)\n", 439 | " fp.write(' '.join(map(str,l_rec)))\n", 440 | " fp.write('\\n')\n", 441 | " fp.close()\n", 442 | "\n", 443 | "cal_rec(1,8)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "### Show Recommendations\n", 451 | "Load the article titles (raw-data.csv), ratings (cf-train-1-users.dat and cf-test-1-users.dat), and recommendation lists (rec-list.dat)." 
452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 11, 457 | "metadata": { 458 | "collapsed": false 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "import csv\n", 463 | "from data import read_user\n", 464 | "import numpy as np\n", 465 | "p = 1\n", 466 | "# read predicted results\n", 467 | "dir_save = 'cdl%d' % p\n", 468 | "csvReader = csv.reader(open('raw-data.csv','rb'))\n", 469 | "d_id_title = dict()\n", 470 | "for i,row in enumerate(csvReader):\n", 471 | " if i==0:\n", 472 | " continue\n", 473 | " d_id_title[i-1] = row[3]\n", 474 | "R_test = read_user('cf-test-1-users.dat')\n", 475 | "R_train = read_user('cf-train-1-users.dat')\n", 476 | "fp = open(dir_save+'/rec-list.dat')\n", 477 | "lines = fp.readlines()" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Show the titles of articles in the training set and titles of recommended articles. Correctly recommended articles are marked by asterisks." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 12, 490 | "metadata": { 491 | "collapsed": false, 492 | "scrolled": true 493 | }, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "########## User 3 ##########\n", 500 | "\n", 501 | "##### Articles in the Training Sets #####\n", 502 | "Formal Ontology and Information Systems\n", 503 | "Ontologies: a silver bullet for knowledge management and electronic commerce\n", 504 | "Business Process Execution Language for Web Services version 1.1\n", 505 | "Unraveling the Web services web: an introduction to SOAP, WSDL, and UDDI\n", 506 | "A cookbook for using the model-view controller user interface paradigm in Smalltalk-80\n", 507 | "Object-oriented application frameworks\n", 508 | "Data integration: a theoretical perspective\n", 509 | "Web services: been there, done that?\n", 510 | "Sweetening Ontologies with DOLCE\n", 511 | "Naive Geography\n", 512 | "\n", 513 | "##### Articles Recommended (Correct Ones Marked by Asterisks) #####\n", 514 | "The Wisdom of Crowds\n", 515 | "Nexus: Small Worlds and the Groundbreaking Theory of Networks\n", 516 | "VisANT: an integrative framework for networks in systems biology\n", 517 | "The Essays of Warren Buffett : Lessons for Corporate America\n", 518 | "What's new in pediatric orthopaedics.\n", 519 | "Doing with Understanding: Lessons from Research on Problem- and Project-Based Learning\n", 520 | "Predictably Irrational: The Hidden Forces That Shape Our Decisions\n", 521 | "The Predictably Irrational CD: The Hidden Forces That Shape Our Decisions\n" 522 | ] 523 | } 524 | ], 525 | "source": [ 526 | "user_id = 3\n", 527 | "s_test = set(np.where(R_test[user_id,:]>0)[1].A1)\n", 528 | "l_train = np.where(R_train[user_id,:]>0)[1].A1.tolist()\n", 529 | "l_pred = map(int,lines[user_id].strip().split(':')[1].split(' '))\n", 530 | "print '########## User '+str(user_id)+' ##########\\n'\n", 531 | "print '##### Articles in the Training Sets #####'\n", 532 | "for i in l_train:\n", 533 | " print d_id_title[i]\n", 534 | "print '\\n##### Articles Recommended (Correct Ones Marked by Asterisks) #####'\n", 535 | "for i in l_pred:\n", 536 | " if i in s_test:\n", 537 | " print '* '+d_id_title[i]\n", 538 | " else:\n", 539 | " print d_id_title[i]\n", 540 | "fp.close()" 541 | ] 542 | } 543 | ], 544 | "metadata": { 545 | "kernelspec": { 546 | "display_name": "Python 2", 547 | "language": "python", 548 | "name": "python2" 549 | }, 550 | "language_info": { 551 | 
"codemirror_mode": { 552 | "name": "ipython", 553 | "version": 2 554 | }, 555 | "file_extension": ".py", 556 | "mimetype": "text/x-python", 557 | "name": "python", 558 | "nbconvert_exporter": "python", 559 | "pygments_lexer": "ipython2", 560 | "version": "2.7.6" 561 | } 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 1 565 | } 566 | -------------------------------------------------------------------------------- /python/basic/ndarray.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# NDArray Tutorial\n", 8 | "\n", 9 | "\n", 10 | "One of the main object in MXNet is the multidimensional array provided by the package `mxnet.ndarray`, or `mxnet.nd` for short. If you familiar with the scientific computing python package [NumPy](http://www.numpy.org/), `mxnet.ndarray` is similar to `numpy.ndarray` in many aspects. \n", 11 | "\n", 12 | "## The basic\n", 13 | "\n", 14 | "A multidimensional array is a table of numbers with the same type. For example, the coordinates of a point in 3D space `[1, 2, 3]` is a 1-dimensional array with that dimension has a length of 3. The following picture shows a 2-dimensional array. The length of the first dimension is 2, and the second dimension has a length of 3\n", 15 | "```\n", 16 | "[[0, 1, 2]\n", 17 | " [3, 4, 5]]\n", 18 | "```\n", 19 | "The array class is called `NDArray`. Some important attributes of a `NDArray` object are:\n", 20 | "\n", 21 | "- **ndarray.shape** the dimensions of the array. It is a tuple of integers indicating the length of the array in each dimension. For a matrix with `n` rows and `m` columns, the `shape` will be `(n, m)`. \n", 22 | "- **ndarray.dtype** an `numpy` object describing the type of the elements.\n", 23 | "- **ndarray.size** the total number of numbers in the array, which equals to the product of the elements of `shape`\n", 24 | "- **ndarray.context** the device this array is stored. A device can be the CPU or the i-th GPU.\n", 25 | "- **ndarray.handle** the pointer to the according C++ object. Normally we won't need to use this attribute. \n", 26 | "\n", 27 | "### An example" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": { 34 | "collapsed": false, 35 | "scrolled": false 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "{'context': cpu(0),\n", 42 | " 'data type': numpy.float32,\n", 43 | " 'shape': (1L, 2L),\n", 44 | " 'size': 2,\n", 45 | " 'type': mxnet.ndarray.NDArray}" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "import mxnet as mx\n", 55 | "a = mx.nd.array([[2,3]])\n", 56 | "{'shape': a.shape, 'size':a.size, 'data type':a.dtype, 'context':a.context, 'type':type(a)}" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Array Creation \n", 64 | "An array can be created in multiple ways. 
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### Array Creation \n",
64 | "An array can be created in multiple ways. For example, we can create an array from a regular Python list or tuple by using the `array` function"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 2,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "a = mx.nd.array([1,2,3]) # create a 1-dimensional array from a python list\n",
76 | "b = mx.nd.array([[1,2,3], [2,3,4]]) # create a 2-dimensional array from a nested python list "
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "or even from a `numpy.ndarray` object"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 3,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "import numpy as np\n",
95 | "c = np.arange(15).reshape(3,5)\n",
96 | "a = mx.nd.array(c) # create a 2-dimensional array from a numpy.ndarray object"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "We can specify the element type with the option `dtype`, which accepts a numpy type. By default, `float32` is used. "
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 4,
109 | "metadata": {
110 | "collapsed": false
111 | },
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/plain": [
116 | "(numpy.float32, numpy.int32, numpy.float16)"
117 | ]
118 | },
119 | "execution_count": 4,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "a = mx.nd.array([1,2,3]) # float32 is used by default\n",
126 | "b = mx.nd.array([1,2,3], dtype=np.int32) # create an int32 array\n",
127 | "c = mx.nd.array([1.2, 2.3], dtype=np.float16) # create a 16-bit float array\n",
128 | "(a.dtype, b.dtype, c.dtype)"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "If we only know the size but not the element values, there are several functions to create arrays with initial placeholder content. "
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 5,
141 | "metadata": {
142 | "collapsed": false
143 | },
144 | "outputs": [],
145 | "source": [
146 | "a = mx.nd.zeros((2,3)) # create a 2-dimensional array full of zeros with shape (2,3) \n",
147 | "b = mx.nd.ones((2,3)) # create a same-shape array full of ones\n",
148 | "c = mx.nd.full((2,3), 7) # create a same-shape array with all elements set to 7\n",
149 | "d = mx.nd.empty((2,3)) # create a same-shape array whose initial content is arbitrary and depends on the state of the memory"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "### Printing Arrays\n",
157 | "We often first convert an `NDArray` to a `numpy.ndarray` with the function `asnumpy` for printing. Numpy uses the following layout:\n",
158 | "- the last axis is printed from left to right,\n",
159 | "- the second-to-last is printed from top to bottom,\n",
160 | "- the rest are also printed from top to bottom, with each slice separated from the next by an empty line."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 6,
166 | "metadata": {
167 | "collapsed": false
168 | },
169 | "outputs": [
170 | {
171 | "name": "stdout",
172 | "output_type": "stream",
173 | "text": [
174 | "[[ 1. 1. 1.]\n",
175 | " [ 1. 1. 1.]]\n",
176 | "[[ 0. 0. 0. ..., 0. 0. 0.]\n",
177 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
178 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
179 | " ..., \n",
180 | " [ 0. 0. 0. ..., 0. 0. 0.]\n",
181 | " [ 0. 0. 0. ..., 0. 0. 
0.]\n", 182 | " [ 0. 0. 0. ..., 0. 0. 0.]]\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "b = mx.nd.ones((2,3))\n", 188 | "print(b.asnumpy())\n", 189 | "c = mx.nd.zeros((1000,1000))\n", 190 | "print(c.asnumpy())" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "### Copies\n", 198 | "Data is *NOT* copied in normal assignment and function arguments passing." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "True\n", 213 | "140484456318800\n", 214 | "140484456318800\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "a = mx.nd.ones((2,2))\n", 220 | "b = a # copy by reference\n", 221 | "print(b is a)\n", 222 | "def f(x): # also copy by reference\n", 223 | " print(id(x))\n", 224 | "f(a)\n", 225 | "print(id(a))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "The `copy` method makes a deep copy of the array and its data" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "False\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "b = a.copy()\n", 252 | "print (b is a)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "We can also use the `copyto` method or the slice operator `[]` to avoid additional memory allocation" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 9, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "140484456317456\n", 274 | "140484456317456\n", 275 | "140484456317456\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "b = mx.nd.ones(a.shape)\n", 281 | "print(id(b))\n", 282 | "b[:] = a\n", 283 | "print(id(b))\n", 284 | "a.copyto(b)\n", 285 | "print(id(b))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "### Basic Operations\n", 293 | "Arithmetic operators on arrays apply *elementwise*. A new array is created and filled with the result." 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 10, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "[[-2. -2. -2.]\n", 308 | " [-2. -2. -2.]]\n", 309 | "[[-0.7568025 -0.7568025]\n", 310 | " [-0.7568025 -0.7568025]\n", 311 | " [-0.7568025 -0.7568025]]\n", 312 | "[[ 2. 2. 2.]\n", 313 | " [ 2. 2. 
2.]]\n" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "a = mx.nd.ones((2,3))\n", 319 | "b = mx.nd.ones((2,3))\n", 320 | "c = a + b # elementwise plus\n", 321 | "d = - c # elementwise minus\n", 322 | "print(d.asnumpy())\n", 323 | "e = mx.nd.sin(c**2).T # elementwise pow and sin, and then transpose\n", 324 | "print(e.asnumpy())\n", 325 | "f = mx.nd.maximum(a, c) # elementwise max\n", 326 | "print(f.asnumpy())" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "source": [ 335 | "Simiar to `NumPy`, `*` is used for elementwise multiply, while matrix-matrix multiplication is left for `dot`" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 11, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "[[ 1. 1.]\n", 350 | " [ 1. 1.]]\n", 351 | "[[ 2. 2.]\n", 352 | " [ 2. 2.]]\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "a = mx.nd.ones((2,2))\n", 358 | "b = a * a\n", 359 | "c = mx.nd.dot(a,a)\n", 360 | "print(b.asnumpy())\n", 361 | "print(c.asnumpy())" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "The assignment operators such as `+=` and `*=` act in place to modify an existing array rather than create a new one." 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 12, 374 | "metadata": { 375 | "collapsed": false 376 | }, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "140484456318736\n", 383 | "140484456318736\n", 384 | "[[ 2. 2.]\n", 385 | " [ 2. 2.]]\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "a = mx.nd.ones((2,2))\n", 391 | "b = mx.nd.ones(a.shape)\n", 392 | "print(id(b))\n", 393 | "b += a\n", 394 | "print(id(b))\n", 395 | "print(b.asnumpy())" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "### Indexing and Slicing\n", 403 | "The slice operator `[]` applies on axis 0. " 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 13, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "[[ 0. 1.]\n", 418 | " [ 2. 3.]\n", 419 | " [ 4. 5.]]\n", 420 | "[[ 0. 1.]\n", 421 | " [ 1. 1.]\n", 422 | " [ 4. 
5.]]\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "a = mx.nd.array(np.arange(6).reshape(3,2))\n", 428 | "print(a[:].asnumpy())\n", 429 | "a[1:2] = 1\n", 430 | "print(a.asnumpy())" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "We can also slice a particular axis with the method `slice_axis`" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 14, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "[[ 1.]\n", 452 | " [ 1.]\n", 453 | " [ 5.]]\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "d = mx.nd.slice_axis(a, axis=1, begin=1, end=2)\n", 459 | "print d.asnumpy()" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "### Shape Manipulation \n", 467 | "The shape of the array can be changed as long as the size remaining the same " 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 15, 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "outputs": [ 477 | { 478 | "name": "stdout", 479 | "output_type": "stream", 480 | "text": [ 481 | "[[ 0. 1. 2. 3. 4. 5.]\n", 482 | " [ 6. 7. 8. 9. 10. 11.]\n", 483 | " [ 12. 13. 14. 15. 16. 17.]\n", 484 | " [ 18. 19. 20. 21. 22. 23.]]\n", 485 | "[[[ 0. 1. 2. 3.]\n", 486 | " [ 4. 5. 6. 7.]\n", 487 | " [ 8. 9. 10. 11.]]\n", 488 | "\n", 489 | " [[ 12. 13. 14. 15.]\n", 490 | " [ 16. 17. 18. 19.]\n", 491 | " [ 20. 21. 22. 23.]]]\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "a = mx.nd.array(np.arange(24).reshape(4,6))\n", 497 | "print(a.asnumpy())\n", 498 | "b = a.reshape((2,3,4))\n", 499 | "print(b.asnumpy())" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "Method `concatenate` stacks multiple arrays along the first dimension. (Their shapes must be the same)." 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 16, 512 | "metadata": { 513 | "collapsed": false 514 | }, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "[[ 1. 1. 1.]\n", 521 | " [ 1. 1. 1.]\n", 522 | " [ 2. 2. 2.]\n", 523 | " [ 2. 2. 2.]]\n" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "a = mx.nd.ones((2,3))\n", 529 | "b = mx.nd.ones((2,3))*2\n", 530 | "c = mx.nd.concatenate([a,b])\n", 531 | "print(c.asnumpy())" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "### Reduce\n", 539 | "\n", 540 | "We can reduce the array to a scalar, or along a particular axis." 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 17, 546 | "metadata": { 547 | "collapsed": false 548 | }, 549 | "outputs": [ 550 | { 551 | "name": "stdout", 552 | "output_type": "stream", 553 | "text": [ 554 | "[ 6.]\n", 555 | "[ 3. 3.]\n" 556 | ] 557 | } 558 | ], 559 | "source": [ 560 | "a = mx.nd.ones((2,3))\n", 561 | "b = mx.nd.sum(a) # sum over all elements\n", 562 | "print(b.asnumpy())\n", 563 | "c = mx.nd.sum_axis(a, axis=1) # sum over axis 1\n", 564 | "print(c.asnumpy())" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "### Broadcast\n", 572 | "We can also broadcast an array by duplicating." 
573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 18, 578 | "metadata": { 579 | "collapsed": false 580 | }, 581 | "outputs": [ 582 | { 583 | "name": "stdout", 584 | "output_type": "stream", 585 | "text": [ 586 | "[[ 0. 0.]\n", 587 | " [ 1. 1.]\n", 588 | " [ 2. 2.]\n", 589 | " [ 3. 3.]\n", 590 | " [ 4. 4.]\n", 591 | " [ 5. 5.]]\n", 592 | "[[[[ 0. 1. 2.]\n", 593 | " [ 0. 1. 2.]]\n", 594 | "\n", 595 | " [[ 0. 1. 2.]\n", 596 | " [ 0. 1. 2.]]]\n", 597 | "\n", 598 | "\n", 599 | " [[[ 3. 4. 5.]\n", 600 | " [ 3. 4. 5.]]\n", 601 | "\n", 602 | " [[ 3. 4. 5.]\n", 603 | " [ 3. 4. 5.]]]]\n" 604 | ] 605 | } 606 | ], 607 | "source": [ 608 | "a = mx.nd.array(np.arange(6).reshape(6,1))\n", 609 | "b = a.broadcast_to((6,2)) # broadcast along axis 1\n", 610 | "print(b.asnumpy())\n", 611 | "c = a.reshape((2,1,1,3))\n", 612 | "d = c.broadcast_to((2,2,2,3)) # broadcast along axes 1 and 2.\n", 613 | "print(d.asnumpy())" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "Broadcast can be applied to operations such as `*` and `+`. " 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 19, 626 | "metadata": { 627 | "collapsed": false 628 | }, 629 | "outputs": [ 630 | { 631 | "name": "stdout", 632 | "output_type": "stream", 633 | "text": [ 634 | "[[ 2. 2.]\n", 635 | " [ 2. 2.]\n", 636 | " [ 2. 2.]]\n" 637 | ] 638 | } 639 | ], 640 | "source": [ 641 | "a = mx.nd.ones((3,1))\n", 642 | "b = mx.nd.ones((1,2))\n", 643 | "c = a + b\n", 644 | "print(c.asnumpy())" 645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "## Advanced Features \n", 652 | "There are some advanced features in `mxnet.ndarray` that make mxnet different from other libraries. " 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "### GPU Support\n", 660 | "\n", 661 | "By default, operators are executed on the CPU. It is easy to switch to another computation resource, such as a GPU, if one is available. The device information is stored in `ndarray.context`. When MXNet is compiled with the flag `USE_CUDA=1` and there is at least one Nvidia GPU card, we can make all computations run on GPU 0 by using the context `mx.gpu(0)`, or simply `mx.gpu()`. If there is more than one GPU, the second GPU is represented by `mx.gpu(1)`, and so on." 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 20, 667 | "metadata": { 668 | "collapsed": false 669 | }, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "('running on ', cpu(0))\n", 676 | "[[ 2. 2. 2. ..., 2. 2. 2.]\n", 677 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 678 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 679 | " ..., \n", 680 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 681 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 682 | " [ 2. 2. 2. ..., 2. 2. 2.]]\n", 683 | "('running on ', gpu(0))\n", 684 | "[[ 2. 2. 2. ..., 2. 2. 2.]\n", 685 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 686 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 687 | " ..., \n", 688 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 689 | " [ 2. 2. 2. ..., 2. 2. 2.]\n", 690 | " [ 2. 2. 2. ..., 2. 2. 
2.]]\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "def f():\n", 696 | " a = mx.nd.ones((100,100))\n", 697 | " b = mx.nd.ones((100,100))\n", 698 | " c = a + b\n", 699 | " print('running on ', c.context)\n", 700 | " print(c.asnumpy())\n", 701 | "f() # by default mx.cpu() is used\n", 702 | "with mx.Context(mx.gpu()): # change the default context to the first GPU\n", 703 | " f()\n" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "We can also explicitly specify the context when creating an array" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 21, 716 | "metadata": { 717 | "collapsed": false 718 | }, 719 | "outputs": [ 720 | { 721 | "name": "stdout", 722 | "output_type": "stream", 723 | "text": [ 724 | "gpu(0)\n" 725 | ] 726 | } 727 | ], 728 | "source": [ 729 | "a = mx.nd.ones((100, 100), mx.gpu(0))\n", 730 | "b = mx.nd.ones((100, 100), mx.gpu(0))\n", 731 | "c = a + b\n", 732 | "print(c.context)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "Currently MXNet requires two arrays to sit on the same device for computation. There are several methods for copying data between devices." 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 22, 745 | "metadata": { 746 | "collapsed": false 747 | }, 748 | "outputs": [ 749 | { 750 | "name": "stdout", 751 | "output_type": "stream", 752 | "text": [ 753 | "gpu(0)\n", 754 | "gpu(0)\n" 755 | ] 756 | } 757 | ], 758 | "source": [ 759 | "a = mx.nd.ones((100,100), mx.cpu())\n", 760 | "b = mx.nd.ones((100,100), mx.gpu())\n", 761 | "c = mx.nd.ones((100,100), mx.gpu())\n", 762 | "a.copyto(c) # copy from CPU to GPU\n", 763 | "d = b + c\n", 764 | "print(d.context)\n", 765 | "e = b.as_in_context(c.context) + c # same as above\n", 766 | "print(e.context)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "markdown", 771 | "metadata": {}, 772 | "source": [ 773 | "### Serialize From/To (Distributed) Filesystems \n", 774 | "There are two easy ways to save data to (and load it from) disk. The first uses `pickle`; `NDArray` is pickle compatible." 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 23, 780 | "metadata": { 781 | "collapsed": false 782 | }, 783 | "outputs": [ 784 | { 785 | "name": "stdout", 786 | "output_type": "stream", 787 | "text": [ 788 | "[[ 1. 1. 1.]\n", 789 | " [ 1. 1. 1.]]\n" 790 | ] 791 | } 792 | ], 793 | "source": [ 794 | "import pickle as pkl\n", 795 | "a = mx.nd.ones((2, 3))\n", 796 | "# pack and then dump into disk\n", 797 | "data = pkl.dumps(a)\n", 798 | "pkl.dump(data, open('tmp.pickle', 'wb'))\n", 799 | "# load from disk and then unpack \n", 800 | "data = pkl.load(open('tmp.pickle', 'rb'))\n", 801 | "b = pkl.loads(data)\n", 802 | "print(b.asnumpy())" 803 | ] 804 | }, 805 | { 806 | "cell_type": "markdown", 807 | "metadata": {}, 808 | "source": [ 809 | "The second way is to dump directly to disk in binary format with the methods `save` and `load`. " 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 24, 815 | "metadata": { 816 | "collapsed": false 817 | }, 818 | "outputs": [ 819 | { 820 | "name": "stdout", 821 | "output_type": "stream", 822 | "text": [ 823 | "[[ 1. 1. 1.]\n", 824 | " [ 1. 1. 1.]]\n", 825 | "[[ 2. 2. 2.]\n", 826 | " [ 2. 2. 
2.]]\n" 827 | ] 828 | } 829 | ], 830 | "source": [ 831 | "# load and save a list\n", 832 | "a = mx.nd.ones((2,3))\n", 833 | "b = mx.nd.ones((2,3))*2 \n", 834 | "mx.nd.save(\"temp.ndarray\", [a,b])\n", 835 | "c = mx.nd.load(\"temp.ndarray\")\n", 836 | "print(c[0].asnumpy())\n", 837 | "print(c[1].asnumpy())" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 25, 843 | "metadata": { 844 | "collapsed": false 845 | }, 846 | "outputs": [ 847 | { 848 | "name": "stdout", 849 | "output_type": "stream", 850 | "text": [ 851 | "[[ 1. 1. 1.]\n", 852 | " [ 1. 1. 1.]]\n", 853 | "[[ 2. 2. 2.]\n", 854 | " [ 2. 2. 2.]]\n" 855 | ] 856 | } 857 | ], 858 | "source": [ 859 | "# load and save a dict\n", 860 | "mx.nd.save(\"temp.ndarray\", {'a':a, 'b':b})\n", 861 | "c = mx.nd.load(\"temp.ndarray\")\n", 862 | "print(c['a'].asnumpy())\n", 863 | "print(c['b'].asnumpy())" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "The load/save approach is better than pickle in two respects:\n", 871 | "1. The data saved with the Python interface can be used by another language binding. For example, if we save the data in python:\n", 872 | "```python\n", 873 | "a = mx.nd.ones((2, 3))\n", 874 | "mx.nd.save(\"temp.ndarray\", [a,])\n", 875 | "```\n", 876 | "then we can load it into R:\n", 877 | "```R\n", 878 | "a <- mx.nd.load(\"temp.ndarray\")\n", 879 | "as.array(a[[1]])\n", 880 | "## [,1] [,2] [,3]\n", 881 | "## [1,] 1 1 1\n", 882 | "## [2,] 1 1 1\n", 883 | "```\n", 884 | "2. If a distributed filesystem such as Amazon S3 or Hadoop HDFS is set up, we can directly save to and load from it. \n", 885 | "```python\n", 886 | "mx.nd.save('s3://mybucket/mydata.ndarray', [a,]) # if compiled with USE_S3=1\n", 887 | "mx.nd.save('hdfs:///users/myname/mydata.bin', [a,]) # if compiled with USE_HDFS=1\n", 888 | "```\n" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "### Lazy Evaluation and Auto Parallelization\n", 896 | "\n", 897 | "MXNet uses lazy evaluation for better performance. When we run `a=b+1` in python, the python thread just pushes the operation into the backend engine and then returns. There are two benefits to this design:\n", 898 | "1. The main python thread can continue to execute other computations once the previous one is pushed. This is useful for frontend languages with heavy overhead. \n", 899 | "2. It is easier for the backend engine to explore further optimization, such as the auto parallelization that will be discussed shortly. \n", 900 | "\n", 901 | "The backend engine is able to resolve the data dependencies and schedule the computations correctly. This is transparent to frontend users. We can explicitly call the method `wait_to_read` on the result array to wait until the computation has finished. Operations that copy data from an array to other packages, such as `asnumpy`, will implicitly call `wait_to_read`. "
" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 26, 907 | "metadata": { 908 | "collapsed": false, 909 | "scrolled": true 910 | }, 911 | "outputs": [ 912 | { 913 | "name": "stdout", 914 | "output_type": "stream", 915 | "text": [ 916 | "time for all computations are pushed into the backend engine: 0.002128 sec\n", 917 | "time for all computations are finished: 0.820602 sec\n" 918 | ] 919 | } 920 | ], 921 | "source": [ 922 | "import time\n", 923 | "\n", 924 | "def do(x, n):\n", 925 | " \"\"\"push computation into the backend engine\"\"\"\n", 926 | " return [mx.nd.dot(x,x) for i in range(n)]\n", 927 | "def wait(x):\n", 928 | " \"\"\"wait until all results are available\"\"\"\n", 929 | " for y in x:\n", 930 | " y.wait_to_read()\n", 931 | " \n", 932 | "tic = time.time()\n", 933 | "a = mx.nd.ones((1000,1000))\n", 934 | "b = do(a, 50)\n", 935 | "toc = time.time() - tic\n", 936 | "print('time for all computations are pushed into the backend engine: %f sec' % (time.time() - tic))\n", 937 | "wait(b)\n", 938 | "print('time for all computations are finished: %f sec' % (time.time() - tic))" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "Besides analyzing data read and write dependencies, the backend engine is able to schedule computations with no dependency in parallel. For example, in the following codes\n", 946 | "```python\n", 947 | "a = mx.nd.ones((2,3))\n", 948 | "b = a + 1\n", 949 | "c = a + 2\n", 950 | "d = b * c\n", 951 | "```\n", 952 | "the second and third sentences can be executed in parallel. " 953 | ] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "execution_count": 27, 958 | "metadata": { 959 | "collapsed": false, 960 | "scrolled": true 961 | }, 962 | "outputs": [ 963 | { 964 | "name": "stdout", 965 | "output_type": "stream", 966 | "text": [ 967 | "time for all computations are finished: 1.437338 sec\n" 968 | ] 969 | } 970 | ], 971 | "source": [ 972 | "# run computation on CPU first, and then on GPU\n", 973 | "n = 50\n", 974 | "a = mx.nd.ones((1000,1000))\n", 975 | "b = mx.nd.ones((2000,2000), mx.gpu())\n", 976 | "tic = time.time()\n", 977 | "c = do(a, n)\n", 978 | "wait(c)\n", 979 | "d = do(b, n)\n", 980 | "wait(d)\n", 981 | "print('time for all computations are finished: %f sec' % (time.time() - tic))\n" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": 28, 987 | "metadata": { 988 | "collapsed": false 989 | }, 990 | "outputs": [ 991 | { 992 | "name": "stdout", 993 | "output_type": "stream", 994 | "text": [ 995 | "improved parallelization: 1.104402 sec\n" 996 | ] 997 | } 998 | ], 999 | "source": [ 1000 | "# the backend engine will try to parallel the CPU and GPU computation.\n", 1001 | "tic = time.time()\n", 1002 | "c = do(a, n)\n", 1003 | "d = do(b, n)\n", 1004 | "wait(c)\n", 1005 | "wait(d)\n", 1006 | "print('improved parallelization: %f sec' % (time.time() - tic))\n" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "markdown", 1011 | "metadata": {}, 1012 | "source": [ 1013 | "## Current Status\n", 1014 | "\n", 1015 | "We try our best to keep the NDArray API as the same numpy's. But it is not fully numpy compatible yet. Here we summary some major difference, which we hope to be fixed in a short time. We are also welcome to any contribution.\n", 1016 | "\n", 1017 | "- Slice and Index. 
\n", 1018 | " - NDArray can only slice one dimension at each time, namely we cannot use `x[:, 1]` to slice both dimensions.\n", 1019 | " - Only continues indexes are supported, we cannot do `x[1:2:3]`\n", 1020 | " - boolean indices are not supported, such as `x[y==1]`.\n", 1021 | "- Lack of reduce functions such as `max`, `min`...\n", 1022 | "\n", 1023 | "## Futher Readings\n", 1024 | "- [NDArray API](http://mxnet.dmlc.ml/en/latest/packages/python/ndarray.html) Documents for all NDArray methods.\n", 1025 | "- [MinPy](https://github.com/dmlc/minpy) on-going project, fully numpy compatible with GPU and auto differentiation supports " 1026 | ] 1027 | } 1028 | ], 1029 | "metadata": { 1030 | "anaconda-cloud": {}, 1031 | "kernelspec": { 1032 | "display_name": "Python [Root]", 1033 | "language": "python", 1034 | "name": "Python [Root]" 1035 | }, 1036 | "language_info": { 1037 | "codemirror_mode": { 1038 | "name": "ipython", 1039 | "version": 3 1040 | }, 1041 | "file_extension": ".py", 1042 | "mimetype": "text/x-python", 1043 | "name": "python", 1044 | "nbconvert_exporter": "python", 1045 | "pygments_lexer": "ipython3", 1046 | "version": "3.5.2" 1047 | } 1048 | }, 1049 | "nbformat": 4, 1050 | "nbformat_minor": 1 1051 | } 1052 | -------------------------------------------------------------------------------- /python/recommendation_systems/matrix_factorization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Matrix Factorization\n", 8 | "\n", 9 | "In a recommendation system, there is a group of users and a set of items. Given that each users have rated some items in the system, we would like to predict how the users would rate the items that they have not yet rated, such that we can make recommendations to the users.\n", 10 | "\n", 11 | "Matrix factorization is one of the mainly used algorithm in recommendation systems. It can be used to discover latent features underlying the interactions between two different kinds of entities. Assume we assign a $k$ dimensional vector $u_i$ to user $i$ and $k$ dimensional vector $v_j$ to item $j$, then user $i$ rates movie $j$ by $\\langle u_i, v_j\\rangle$.\n", 12 | "\n", 13 | "We can learn all $u_i$ and $v_j$ directly, which is essentially performing SVD on the user-item matrix. We can also try to learn the latent features using multi-layer neural networks. \n", 14 | "\n", 15 | "In this tutorial, we will work though the steps to implement these ideas in MXNet." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Prepare Data\n", 23 | "\n", 24 | "We use the [MovieLens](http://grouplens.org/datasets/movielens/) data here, but it can apply to other datasets as well. Each row of this dataset contains a tuple of user id, movie id, rating, and time stamp, we will only use the first three items. We first define the a batch which contains n tuples. It also provides name and shape information to MXNet about the data and label. 
" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 9, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "class Batch(object):\n", 36 | " def __init__(self, data_names, data, label_names, label):\n", 37 | " self.data = data\n", 38 | " self.label = label\n", 39 | " self.data_names = data_names\n", 40 | " self.label_names = label_names\n", 41 | " \n", 42 | " @property\n", 43 | " def provide_data(self):\n", 44 | " return [(n, x.shape) for n, x in zip(self.data_names, self.data)]\n", 45 | " \n", 46 | " @property\n", 47 | " def provide_label(self):\n", 48 | " return [(n, x.shape) for n, x in zip(self.label_names, self.label)]\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Then we define a data iterator, which returns a batch of tuples each time. " 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 10, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "import mxnet as mx\n", 67 | "import random\n", 68 | "\n", 69 | "class Batch(object):\n", 70 | " def __init__(self, data_names, data, label_names, label):\n", 71 | " self.data = data\n", 72 | " self.label = label\n", 73 | " self.data_names = data_names\n", 74 | " self.label_names = label_names\n", 75 | "\n", 76 | " @property\n", 77 | " def provide_data(self):\n", 78 | " return [(n, x.shape) for n, x in zip(self.data_names, self.data)]\n", 79 | "\n", 80 | " @property\n", 81 | " def provide_label(self):\n", 82 | " return [(n, x.shape) for n, x in zip(self.label_names, self.label)]\n", 83 | "\n", 84 | "class DataIter(mx.io.DataIter):\n", 85 | " def __init__(self, fname, batch_size):\n", 86 | " super(DataIter, self).__init__()\n", 87 | " self.batch_size = batch_size\n", 88 | " self.data = []\n", 89 | " for line in file(fname):\n", 90 | " tks = line.strip().split('\\t')\n", 91 | " if len(tks) != 4:\n", 92 | " continue\n", 93 | " self.data.append((int(tks[0]), int(tks[1]), float(tks[2])))\n", 94 | " self.provide_data = [('user', (batch_size, )), ('item', (batch_size, ))]\n", 95 | " self.provide_label = [('score', (self.batch_size, ))]\n", 96 | "\n", 97 | " def __iter__(self):\n", 98 | " for k in range(len(self.data) / self.batch_size):\n", 99 | " users = []\n", 100 | " items = []\n", 101 | " scores = []\n", 102 | " for i in range(self.batch_size):\n", 103 | " j = k * self.batch_size + i\n", 104 | " user, item, score = self.data[j]\n", 105 | " users.append(user)\n", 106 | " items.append(item)\n", 107 | " scores.append(score)\n", 108 | "\n", 109 | " data_all = [mx.nd.array(users), mx.nd.array(items)]\n", 110 | " label_all = [mx.nd.array(scores)]\n", 111 | " data_names = ['user', 'item']\n", 112 | " label_names = ['score']\n", 113 | "\n", 114 | " data_batch = Batch(data_names, data_all, label_names, label_all)\n", 115 | " yield data_batch\n", 116 | "\n", 117 | " def reset(self):\n", 118 | " random.shuffle(self.data)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Now we download the data and provide a function to obtain the data iterator:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 11, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "import os\n", 137 | "if not os.path.exists(\"ml-100k.zip\"):\n", 138 | " os.system(\"wget http://files.grouplens.org/datasets/movielens/ml-100k.zip\") \n", 139 | " os.system(\"unzip ml-100k.zip\")\n", 140 | "def 
get_data(batch_size):\n", 141 | " return (DataIter('./ml-100k/u1.base', batch_size), DataIter('./ml-100k/u1.test', batch_size))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Finally, we calculate the number of users and items for later use." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 12, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "(944, 1683)" 162 | ] 163 | }, 164 | "execution_count": 12, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "def max_id(fname):\n", 171 | " mu = 0\n", 172 | " mi = 0\n", 173 | " for line in file(fname):\n", 174 | " tks = line.strip().split('\\t')\n", 175 | " if len(tks) != 4:\n", 176 | " continue\n", 177 | " mu = max(mu, int(tks[0]))\n", 178 | " mi = max(mi, int(tks[1]))\n", 179 | " return mu + 1, mi + 1\n", 180 | "max_user, max_item = max_id('./ml-100k/u.data')\n", 181 | "(max_user, max_item)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Optimization\n", 189 | "\n", 190 | "We first implement the RMSE (root-mean-square error) measurement, which is commonly used by matrix factorization: $\\mathrm{RMSE} = \\sqrt{\\tfrac{1}{n}\\sum_{i=1}^{n} (\\mathrm{label}_i - \\mathrm{pred}_i)^2}$. " 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 13, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "import math\n", 202 | "def RMSE(label, pred):\n", 203 | " ret = 0.0\n", 204 | " n = 0.0\n", 205 | " pred = pred.flatten()\n", 206 | " for i in range(len(label)):\n", 207 | " ret += (label[i] - pred[i]) * (label[i] - pred[i])\n", 208 | " n += 1.0\n", 209 | " return math.sqrt(ret / n)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "Then we define a general training module, which is borrowed from the image classification application. " 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 14, 222 | "metadata": { 223 | "collapsed": true 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "def train(network, batch_size, num_epoch, learning_rate):\n", 228 | " model = mx.model.FeedForward(\n", 229 | " ctx = [mx.gpu(0)], # can be changed to [mx.gpu(0), mx.gpu(1)] if there are 2 GPUs\n", 230 | " symbol = network,\n", 231 | " num_epoch = num_epoch,\n", 232 | " learning_rate = learning_rate,\n", 233 | " #lr_scheduler = mx.lr_scheduler.FactorScheduler(\n", 234 | " #step = max(int(80000/batch_size*2), 1),\n", 235 | " # factor = 0.95),\n", 236 | " wd = 0.0001,\n", 237 | " #initializer = mx.init.Xavier(factor_type=\"in\", magnitude=2.34),\n", 238 | " momentum = 0.9)\n", 239 | "\n", 240 | " # batch_size is taken from the function argument\n", 241 | " train, test = get_data(batch_size)\n", 242 | "\n", 243 | " import logging\n", 244 | " head = '%(asctime)-15s %(message)s'\n", 245 | " logging.basicConfig(level=logging.DEBUG)\n", 246 | "\n", 247 | " model.fit(X = train, \n", 248 | " eval_data = test,\n", 249 | " eval_metric = RMSE,\n", 250 | " batch_end_callback=mx.callback.Speedometer(batch_size, 10000/batch_size),)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## Networks\n", 258 | "\n", 259 | "Now we try various networks. We first learn the latent vectors directly." 
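Before reading the symbol code, it may help to see what "learning the latent vectors directly" computes. Here is a toy numpy sketch of the inner-product prediction (illustrative only; the sizes and names are made up and it is not part of the notebook's pipeline):

```python
import numpy as np

k, num_users, num_items = 4, 5, 7
U = np.random.randn(num_users, k)    # user latent vectors
V = np.random.randn(num_items, k)    # item latent vectors
# the predicted rating of user i for item j is the inner product <U[i], V[j]>
scores = U.dot(V.T)                  # full (num_users, num_items) score matrix
print(np.allclose(scores[2, 3], U[2].dot(V[3])))   # True
```

The `Embedding` layers in the network below play the role of `U` and `V`, and the elementwise product followed by `sum_axis` computes exactly this inner product.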
260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 15, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [ 269 | { 270 | "name": "stderr", 271 | "output_type": "stream", 272 | "text": [ 273 | "INFO:root:Start training with [gpu(0)]\n", 274 | "INFO:root:Epoch[0] Batch [156]\tSpeed: 56152.02 samples/sec\tTrain-RMSE=3.706615\n", 275 | "INFO:root:Epoch[0] Batch [312]\tSpeed: 56093.35 samples/sec\tTrain-RMSE=3.697545\n", 276 | "INFO:root:Epoch[0] Batch [468]\tSpeed: 55999.96 samples/sec\tTrain-RMSE=3.708319\n", 277 | "INFO:root:Epoch[0] Batch [624]\tSpeed: 56241.24 samples/sec\tTrain-RMSE=3.688322\n", 278 | "INFO:root:Epoch[0] Batch [780]\tSpeed: 55798.34 samples/sec\tTrain-RMSE=3.704836\n", 279 | "INFO:root:Epoch[0] Batch [936]\tSpeed: 55368.81 samples/sec\tTrain-RMSE=3.683681\n", 280 | "INFO:root:Epoch[0] Batch [1092]\tSpeed: 55267.68 samples/sec\tTrain-RMSE=3.705261\n", 281 | "INFO:root:Epoch[0] Batch [1248]\tSpeed: 54383.48 samples/sec\tTrain-RMSE=3.694059\n", 282 | "INFO:root:Epoch[0] Resetting Data Iterator\n", 283 | "INFO:root:Epoch[0] Time cost=1.477\n", 284 | "INFO:root:Epoch[0] Validation-RMSE=3.713718\n", 285 | "INFO:root:Epoch[1] Batch [156]\tSpeed: 55056.23 samples/sec\tTrain-RMSE=3.697979\n", 286 | "INFO:root:Epoch[1] Batch [312]\tSpeed: 54194.29 samples/sec\tTrain-RMSE=3.668891\n", 287 | "INFO:root:Epoch[1] Batch [468]\tSpeed: 54458.23 samples/sec\tTrain-RMSE=3.642110\n", 288 | "INFO:root:Epoch[1] Batch [624]\tSpeed: 55428.18 samples/sec\tTrain-RMSE=3.577095\n", 289 | "INFO:root:Epoch[1] Batch [780]\tSpeed: 55699.93 samples/sec\tTrain-RMSE=3.426324\n", 290 | "INFO:root:Epoch[1] Batch [936]\tSpeed: 55505.17 samples/sec\tTrain-RMSE=3.084756\n", 291 | "INFO:root:Epoch[1] Batch [1092]\tSpeed: 55576.92 samples/sec\tTrain-RMSE=2.704116\n", 292 | "INFO:root:Epoch[1] Batch [1248]\tSpeed: 54706.81 samples/sec\tTrain-RMSE=2.379844\n", 293 | "INFO:root:Epoch[1] Resetting Data Iterator\n", 294 | "INFO:root:Epoch[1] Time cost=1.490\n", 295 | "INFO:root:Epoch[1] Validation-RMSE=2.419295\n", 296 | "INFO:root:Epoch[2] Batch [156]\tSpeed: 49208.89 samples/sec\tTrain-RMSE=2.110523\n", 297 | "INFO:root:Epoch[2] Batch [312]\tSpeed: 48535.99 samples/sec\tTrain-RMSE=1.889097\n", 298 | "INFO:root:Epoch[2] Batch [468]\tSpeed: 48694.81 samples/sec\tTrain-RMSE=1.759300\n", 299 | "INFO:root:Epoch[2] Batch [624]\tSpeed: 48668.72 samples/sec\tTrain-RMSE=1.647674\n", 300 | "INFO:root:Epoch[2] Batch [780]\tSpeed: 52903.51 samples/sec\tTrain-RMSE=1.517060\n", 301 | "INFO:root:Epoch[2] Batch [936]\tSpeed: 55862.36 samples/sec\tTrain-RMSE=1.484016\n", 302 | "INFO:root:Epoch[2] Batch [1092]\tSpeed: 56004.98 samples/sec\tTrain-RMSE=1.400864\n", 303 | "INFO:root:Epoch[2] Batch [1248]\tSpeed: 54597.33 samples/sec\tTrain-RMSE=1.342697\n", 304 | "INFO:root:Epoch[2] Resetting Data Iterator\n", 305 | "INFO:root:Epoch[2] Time cost=1.588\n", 306 | "INFO:root:Epoch[2] Validation-RMSE=1.431981\n", 307 | "INFO:root:Epoch[3] Batch [156]\tSpeed: 54941.66 samples/sec\tTrain-RMSE=1.265084\n", 308 | "INFO:root:Epoch[3] Batch [312]\tSpeed: 54567.87 samples/sec\tTrain-RMSE=1.264938\n", 309 | "INFO:root:Epoch[3] Batch [468]\tSpeed: 54376.70 samples/sec\tTrain-RMSE=1.229487\n", 310 | "INFO:root:Epoch[3] Batch [624]\tSpeed: 54321.68 samples/sec\tTrain-RMSE=1.212403\n", 311 | "INFO:root:Epoch[3] Batch [780]\tSpeed: 53476.68 samples/sec\tTrain-RMSE=1.166233\n", 312 | "INFO:root:Epoch[3] Batch [936]\tSpeed: 53950.34 samples/sec\tTrain-RMSE=1.150932\n", 313 | "INFO:root:Epoch[3] Batch 
[1092]\tSpeed: 54338.24 samples/sec\tTrain-RMSE=1.146948\n", 314 | "INFO:root:Epoch[3] Batch [1248]\tSpeed: 54033.32 samples/sec\tTrain-RMSE=1.140070\n", 315 | "INFO:root:Epoch[3] Resetting Data Iterator\n", 316 | "INFO:root:Epoch[3] Time cost=1.513\n", 317 | "INFO:root:Epoch[3] Validation-RMSE=1.202079\n", 318 | "INFO:root:Epoch[4] Batch [156]\tSpeed: 54793.07 samples/sec\tTrain-RMSE=1.096710\n", 319 | "INFO:root:Epoch[4] Batch [312]\tSpeed: 54616.48 samples/sec\tTrain-RMSE=1.097131\n", 320 | "INFO:root:Epoch[4] Batch [468]\tSpeed: 54474.31 samples/sec\tTrain-RMSE=1.098475\n", 321 | "INFO:root:Epoch[4] Batch [624]\tSpeed: 54543.99 samples/sec\tTrain-RMSE=1.067425\n", 322 | "INFO:root:Epoch[4] Batch [780]\tSpeed: 54422.99 samples/sec\tTrain-RMSE=1.077556\n", 323 | "INFO:root:Epoch[4] Batch [936]\tSpeed: 54681.38 samples/sec\tTrain-RMSE=1.063173\n", 324 | "INFO:root:Epoch[4] Batch [1092]\tSpeed: 55050.15 samples/sec\tTrain-RMSE=1.058728\n", 325 | "INFO:root:Epoch[4] Batch [1248]\tSpeed: 54792.14 samples/sec\tTrain-RMSE=1.052552\n", 326 | "INFO:root:Epoch[4] Resetting Data Iterator\n", 327 | "INFO:root:Epoch[4] Time cost=1.501\n", 328 | "INFO:root:Epoch[4] Validation-RMSE=1.115826\n", 329 | "INFO:root:Epoch[5] Batch [156]\tSpeed: 49110.90 samples/sec\tTrain-RMSE=1.022660\n", 330 | "INFO:root:Epoch[5] Batch [312]\tSpeed: 54949.52 samples/sec\tTrain-RMSE=1.034606\n", 331 | "INFO:root:Epoch[5] Batch [468]\tSpeed: 54974.05 samples/sec\tTrain-RMSE=1.034887\n", 332 | "INFO:root:Epoch[5] Batch [624]\tSpeed: 54504.02 samples/sec\tTrain-RMSE=1.028711\n", 333 | "INFO:root:Epoch[5] Batch [780]\tSpeed: 54669.96 samples/sec\tTrain-RMSE=1.025565\n", 334 | "INFO:root:Epoch[5] Batch [936]\tSpeed: 54931.43 samples/sec\tTrain-RMSE=1.025683\n", 335 | "INFO:root:Epoch[5] Batch [1092]\tSpeed: 54472.54 samples/sec\tTrain-RMSE=1.037122\n", 336 | "INFO:root:Epoch[5] Batch [1248]\tSpeed: 54714.96 samples/sec\tTrain-RMSE=1.018717\n", 337 | "INFO:root:Epoch[5] Resetting Data Iterator\n", 338 | "INFO:root:Epoch[5] Time cost=1.522\n", 339 | "INFO:root:Epoch[5] Validation-RMSE=1.071941\n", 340 | "INFO:root:Epoch[6] Batch [156]\tSpeed: 56005.20 samples/sec\tTrain-RMSE=1.003523\n", 341 | "INFO:root:Epoch[6] Batch [312]\tSpeed: 55581.20 samples/sec\tTrain-RMSE=1.000883\n", 342 | "INFO:root:Epoch[6] Batch [468]\tSpeed: 54543.64 samples/sec\tTrain-RMSE=0.991006\n", 343 | "INFO:root:Epoch[6] Batch [624]\tSpeed: 55623.35 samples/sec\tTrain-RMSE=1.006816\n", 344 | "INFO:root:Epoch[6] Batch [780]\tSpeed: 55959.70 samples/sec\tTrain-RMSE=1.010641\n", 345 | "INFO:root:Epoch[6] Batch [936]\tSpeed: 56006.85 samples/sec\tTrain-RMSE=1.012411\n", 346 | "INFO:root:Epoch[6] Batch [1092]\tSpeed: 56219.04 samples/sec\tTrain-RMSE=1.007607\n", 347 | "INFO:root:Epoch[6] Batch [1248]\tSpeed: 55977.35 samples/sec\tTrain-RMSE=1.001625\n", 348 | "INFO:root:Epoch[6] Resetting Data Iterator\n", 349 | "INFO:root:Epoch[6] Time cost=1.475\n", 350 | "INFO:root:Epoch[6] Validation-RMSE=1.049505\n", 351 | "INFO:root:Epoch[7] Batch [156]\tSpeed: 56863.96 samples/sec\tTrain-RMSE=0.987829\n", 352 | "INFO:root:Epoch[7] Batch [312]\tSpeed: 55854.91 samples/sec\tTrain-RMSE=0.984407\n", 353 | "INFO:root:Epoch[7] Batch [468]\tSpeed: 56121.09 samples/sec\tTrain-RMSE=0.993369\n", 354 | "INFO:root:Epoch[7] Batch [624]\tSpeed: 55779.69 samples/sec\tTrain-RMSE=0.999314\n", 355 | "INFO:root:Epoch[7] Batch [780]\tSpeed: 55902.70 samples/sec\tTrain-RMSE=0.982091\n", 356 | "INFO:root:Epoch[7] Batch [936]\tSpeed: 55519.08 samples/sec\tTrain-RMSE=0.988640\n", 357 | 
"INFO:root:Epoch[7] Batch [1092]\tSpeed: 55751.99 samples/sec\tTrain-RMSE=0.997003\n", 358 | "INFO:root:Epoch[7] Batch [1248]\tSpeed: 55966.88 samples/sec\tTrain-RMSE=0.986120\n", 359 | "INFO:root:Epoch[7] Resetting Data Iterator\n", 360 | "INFO:root:Epoch[7] Time cost=1.469\n", 361 | "INFO:root:Epoch[7] Validation-RMSE=1.036123\n", 362 | "INFO:root:Epoch[8] Batch [156]\tSpeed: 56123.57 samples/sec\tTrain-RMSE=0.968606\n", 363 | "INFO:root:Epoch[8] Batch [312]\tSpeed: 55761.27 samples/sec\tTrain-RMSE=0.981268\n", 364 | "INFO:root:Epoch[8] Batch [468]\tSpeed: 55953.72 samples/sec\tTrain-RMSE=0.984670\n", 365 | "INFO:root:Epoch[8] Batch [624]\tSpeed: 43851.58 samples/sec\tTrain-RMSE=0.973858\n", 366 | "INFO:root:Epoch[8] Batch [780]\tSpeed: 37403.78 samples/sec\tTrain-RMSE=0.984366\n", 367 | "INFO:root:Epoch[8] Batch [936]\tSpeed: 37331.73 samples/sec\tTrain-RMSE=0.981248\n", 368 | "INFO:root:Epoch[8] Batch [1092]\tSpeed: 37453.16 samples/sec\tTrain-RMSE=0.989204\n", 369 | "INFO:root:Epoch[8] Batch [1248]\tSpeed: 37427.62 samples/sec\tTrain-RMSE=0.983527\n", 370 | "INFO:root:Epoch[8] Resetting Data Iterator\n", 371 | "INFO:root:Epoch[8] Time cost=1.875\n", 372 | "INFO:root:Epoch[8] Validation-RMSE=1.024367\n", 373 | "INFO:root:Epoch[9] Batch [156]\tSpeed: 37729.29 samples/sec\tTrain-RMSE=0.975141\n", 374 | "INFO:root:Epoch[9] Batch [312]\tSpeed: 37596.04 samples/sec\tTrain-RMSE=0.967309\n", 375 | "INFO:root:Epoch[9] Batch [468]\tSpeed: 37445.83 samples/sec\tTrain-RMSE=0.969378\n", 376 | "INFO:root:Epoch[9] Batch [624]\tSpeed: 37450.08 samples/sec\tTrain-RMSE=0.973308\n", 377 | "INFO:root:Epoch[9] Batch [780]\tSpeed: 37638.41 samples/sec\tTrain-RMSE=0.987016\n", 378 | "INFO:root:Epoch[9] Batch [936]\tSpeed: 37573.54 samples/sec\tTrain-RMSE=0.977451\n", 379 | "INFO:root:Epoch[9] Batch [1092]\tSpeed: 37399.98 samples/sec\tTrain-RMSE=0.971797\n", 380 | "INFO:root:Epoch[9] Batch [1248]\tSpeed: 37465.93 samples/sec\tTrain-RMSE=0.970934\n", 381 | "INFO:root:Epoch[9] Resetting Data Iterator\n", 382 | "INFO:root:Epoch[9] Time cost=2.174\n", 383 | "INFO:root:Epoch[9] Validation-RMSE=1.017791\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "def plain_net(k):\n", 389 | " # input\n", 390 | " user = mx.symbol.Variable('user')\n", 391 | " item = mx.symbol.Variable('item')\n", 392 | " score = mx.symbol.Variable('score')\n", 393 | " # user feature lookup\n", 394 | " user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k) \n", 395 | " # item feature lookup\n", 396 | " item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)\n", 397 | " # predict by the inner product, which is elementwise product and then sum\n", 398 | " pred = user * item\n", 399 | " pred = mx.symbol.sum_axis(data = pred, axis = 1)\n", 400 | " pred = mx.symbol.Flatten(data = pred)\n", 401 | " # loss layer\n", 402 | " pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)\n", 403 | " return pred\n", 404 | "\n", 405 | "train(plain_net(64), batch_size=64, num_epoch=10, learning_rate=.05)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "Next we try to use 2 layers neural network to learn the latent variables, which stack a fully connected layer above the embedding layers: " 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 16, 418 | "metadata": { 419 | "collapsed": false, 420 | "scrolled": true 421 | }, 422 | "outputs": [ 423 | { 424 | "name": "stderr", 425 | "output_type": "stream", 426 | "text": [ 427 | 
"INFO:root:Start training with [gpu(0)]\n", 428 | "INFO:root:Epoch[0] Batch [156]\tSpeed: 34221.56 samples/sec\tTrain-RMSE=1.942634\n", 429 | "INFO:root:Epoch[0] Batch [312]\tSpeed: 30912.80 samples/sec\tTrain-RMSE=1.119005\n", 430 | "INFO:root:Epoch[0] Batch [468]\tSpeed: 35044.73 samples/sec\tTrain-RMSE=1.098399\n", 431 | "INFO:root:Epoch[0] Batch [624]\tSpeed: 35098.19 samples/sec\tTrain-RMSE=1.054476\n", 432 | "INFO:root:Epoch[0] Batch [780]\tSpeed: 35128.96 samples/sec\tTrain-RMSE=1.011925\n", 433 | "INFO:root:Epoch[0] Batch [936]\tSpeed: 35080.08 samples/sec\tTrain-RMSE=0.983108\n", 434 | "INFO:root:Epoch[0] Batch [1092]\tSpeed: 34945.15 samples/sec\tTrain-RMSE=0.977220\n", 435 | "INFO:root:Epoch[0] Batch [1248]\tSpeed: 33966.69 samples/sec\tTrain-RMSE=0.965951\n", 436 | "INFO:root:Epoch[0] Resetting Data Iterator\n", 437 | "INFO:root:Epoch[0] Time cost=2.376\n", 438 | "INFO:root:Epoch[0] Validation-RMSE=0.978114\n", 439 | "INFO:root:Epoch[1] Batch [156]\tSpeed: 30683.17 samples/sec\tTrain-RMSE=0.949212\n", 440 | "INFO:root:Epoch[1] Batch [312]\tSpeed: 30868.07 samples/sec\tTrain-RMSE=0.951608\n", 441 | "INFO:root:Epoch[1] Batch [468]\tSpeed: 31483.75 samples/sec\tTrain-RMSE=0.955808\n", 442 | "INFO:root:Epoch[1] Batch [624]\tSpeed: 34920.70 samples/sec\tTrain-RMSE=0.959489\n", 443 | "INFO:root:Epoch[1] Batch [780]\tSpeed: 32155.51 samples/sec\tTrain-RMSE=0.945665\n", 444 | "INFO:root:Epoch[1] Batch [936]\tSpeed: 31340.62 samples/sec\tTrain-RMSE=0.944534\n", 445 | "INFO:root:Epoch[1] Batch [1092]\tSpeed: 31162.70 samples/sec\tTrain-RMSE=0.945014\n", 446 | "INFO:root:Epoch[1] Batch [1248]\tSpeed: 31244.24 samples/sec\tTrain-RMSE=0.951440\n", 447 | "INFO:root:Epoch[1] Resetting Data Iterator\n", 448 | "INFO:root:Epoch[1] Time cost=2.563\n", 449 | "INFO:root:Epoch[1] Validation-RMSE=0.975165\n", 450 | "INFO:root:Epoch[2] Batch [156]\tSpeed: 35262.69 samples/sec\tTrain-RMSE=0.942642\n", 451 | "INFO:root:Epoch[2] Batch [312]\tSpeed: 34887.02 samples/sec\tTrain-RMSE=0.933122\n", 452 | "INFO:root:Epoch[2] Batch [468]\tSpeed: 35088.57 samples/sec\tTrain-RMSE=0.944248\n", 453 | "INFO:root:Epoch[2] Batch [624]\tSpeed: 34988.95 samples/sec\tTrain-RMSE=0.943167\n", 454 | "INFO:root:Epoch[2] Batch [780]\tSpeed: 34955.04 samples/sec\tTrain-RMSE=0.943710\n", 455 | "INFO:root:Epoch[2] Batch [936]\tSpeed: 30766.38 samples/sec\tTrain-RMSE=0.928660\n", 456 | "INFO:root:Epoch[2] Batch [1092]\tSpeed: 30604.26 samples/sec\tTrain-RMSE=0.948441\n", 457 | "INFO:root:Epoch[2] Batch [1248]\tSpeed: 30682.32 samples/sec\tTrain-RMSE=0.943952\n", 458 | "INFO:root:Epoch[2] Resetting Data Iterator\n", 459 | "INFO:root:Epoch[2] Time cost=2.442\n", 460 | "INFO:root:Epoch[2] Validation-RMSE=0.962911\n", 461 | "INFO:root:Epoch[3] Batch [156]\tSpeed: 30835.95 samples/sec\tTrain-RMSE=0.920814\n", 462 | "INFO:root:Epoch[3] Batch [312]\tSpeed: 31063.53 samples/sec\tTrain-RMSE=0.926121\n", 463 | "INFO:root:Epoch[3] Batch [468]\tSpeed: 30694.21 samples/sec\tTrain-RMSE=0.936737\n", 464 | "INFO:root:Epoch[3] Batch [624]\tSpeed: 30565.19 samples/sec\tTrain-RMSE=0.945118\n", 465 | "INFO:root:Epoch[3] Batch [780]\tSpeed: 30898.93 samples/sec\tTrain-RMSE=0.933481\n", 466 | "INFO:root:Epoch[3] Batch [936]\tSpeed: 30642.78 samples/sec\tTrain-RMSE=0.939749\n", 467 | "INFO:root:Epoch[3] Batch [1092]\tSpeed: 31263.29 samples/sec\tTrain-RMSE=0.947351\n", 468 | "INFO:root:Epoch[3] Batch [1248]\tSpeed: 31130.36 samples/sec\tTrain-RMSE=0.945224\n", 469 | "INFO:root:Epoch[3] Resetting Data Iterator\n", 470 | "INFO:root:Epoch[3] Time 
cost=2.630\n", 471 | "INFO:root:Epoch[3] Validation-RMSE=0.957273\n", 472 | "INFO:root:Epoch[4] Batch [156]\tSpeed: 31526.08 samples/sec\tTrain-RMSE=0.918031\n", 473 | "INFO:root:Epoch[4] Batch [312]\tSpeed: 30568.47 samples/sec\tTrain-RMSE=0.930736\n", 474 | "INFO:root:Epoch[4] Batch [468]\tSpeed: 30525.11 samples/sec\tTrain-RMSE=0.942186\n", 475 | "INFO:root:Epoch[4] Batch [624]\tSpeed: 31053.46 samples/sec\tTrain-RMSE=0.932028\n", 476 | "INFO:root:Epoch[4] Batch [780]\tSpeed: 31089.75 samples/sec\tTrain-RMSE=0.939669\n", 477 | "INFO:root:Epoch[4] Batch [936]\tSpeed: 30973.30 samples/sec\tTrain-RMSE=0.938523\n", 478 | "INFO:root:Epoch[4] Batch [1092]\tSpeed: 30370.52 samples/sec\tTrain-RMSE=0.933891\n", 479 | "INFO:root:Epoch[4] Batch [1248]\tSpeed: 30878.11 samples/sec\tTrain-RMSE=0.929265\n", 480 | "INFO:root:Epoch[4] Resetting Data Iterator\n", 481 | "INFO:root:Epoch[4] Time cost=2.630\n", 482 | "INFO:root:Epoch[4] Validation-RMSE=0.955076\n", 483 | "INFO:root:Epoch[5] Batch [156]\tSpeed: 30856.24 samples/sec\tTrain-RMSE=0.924776\n", 484 | "INFO:root:Epoch[5] Batch [312]\tSpeed: 31165.80 samples/sec\tTrain-RMSE=0.919315\n", 485 | "INFO:root:Epoch[5] Batch [468]\tSpeed: 31213.56 samples/sec\tTrain-RMSE=0.948400\n", 486 | "INFO:root:Epoch[5] Batch [624]\tSpeed: 31150.55 samples/sec\tTrain-RMSE=0.930602\n", 487 | "INFO:root:Epoch[5] Batch [780]\tSpeed: 31274.08 samples/sec\tTrain-RMSE=0.935643\n", 488 | "INFO:root:Epoch[5] Batch [936]\tSpeed: 31413.41 samples/sec\tTrain-RMSE=0.932529\n", 489 | "INFO:root:Epoch[5] Batch [1092]\tSpeed: 31318.90 samples/sec\tTrain-RMSE=0.938194\n", 490 | "INFO:root:Epoch[5] Batch [1248]\tSpeed: 31095.54 samples/sec\tTrain-RMSE=0.926393\n", 491 | "INFO:root:Epoch[5] Resetting Data Iterator\n", 492 | "INFO:root:Epoch[5] Time cost=2.603\n", 493 | "INFO:root:Epoch[5] Validation-RMSE=0.957072\n", 494 | "INFO:root:Epoch[6] Batch [156]\tSpeed: 30959.40 samples/sec\tTrain-RMSE=0.939036\n", 495 | "INFO:root:Epoch[6] Batch [312]\tSpeed: 30305.79 samples/sec\tTrain-RMSE=0.917075\n", 496 | "INFO:root:Epoch[6] Batch [468]\tSpeed: 31005.73 samples/sec\tTrain-RMSE=0.938251\n", 497 | "INFO:root:Epoch[6] Batch [624]\tSpeed: 30332.31 samples/sec\tTrain-RMSE=0.939276\n", 498 | "INFO:root:Epoch[6] Batch [780]\tSpeed: 31259.40 samples/sec\tTrain-RMSE=0.924594\n", 499 | "INFO:root:Epoch[6] Batch [936]\tSpeed: 31219.99 samples/sec\tTrain-RMSE=0.926362\n", 500 | "INFO:root:Epoch[6] Batch [1092]\tSpeed: 32942.02 samples/sec\tTrain-RMSE=0.924488\n", 501 | "INFO:root:Epoch[6] Batch [1248]\tSpeed: 34970.60 samples/sec\tTrain-RMSE=0.926823\n", 502 | "INFO:root:Epoch[6] Resetting Data Iterator\n", 503 | "INFO:root:Epoch[6] Time cost=2.573\n", 504 | "INFO:root:Epoch[6] Validation-RMSE=0.953832\n", 505 | "INFO:root:Epoch[7] Batch [156]\tSpeed: 34790.73 samples/sec\tTrain-RMSE=0.914066\n", 506 | "INFO:root:Epoch[7] Batch [312]\tSpeed: 34857.54 samples/sec\tTrain-RMSE=0.938872\n", 507 | "INFO:root:Epoch[7] Batch [468]\tSpeed: 34974.02 samples/sec\tTrain-RMSE=0.930738\n", 508 | "INFO:root:Epoch[7] Batch [624]\tSpeed: 34889.08 samples/sec\tTrain-RMSE=0.918107\n", 509 | "INFO:root:Epoch[7] Batch [780]\tSpeed: 34933.17 samples/sec\tTrain-RMSE=0.916927\n", 510 | "INFO:root:Epoch[7] Batch [936]\tSpeed: 34738.06 samples/sec\tTrain-RMSE=0.929646\n", 511 | "INFO:root:Epoch[7] Batch [1092]\tSpeed: 34856.32 samples/sec\tTrain-RMSE=0.929019\n", 512 | "INFO:root:Epoch[7] Batch [1248]\tSpeed: 34863.02 samples/sec\tTrain-RMSE=0.938901\n", 513 | "INFO:root:Epoch[7] Resetting Data Iterator\n", 514 | 
"INFO:root:Epoch[7] Time cost=2.333\n", 515 | "INFO:root:Epoch[7] Validation-RMSE=0.963170\n", 516 | "INFO:root:Epoch[8] Batch [156]\tSpeed: 35032.45 samples/sec\tTrain-RMSE=0.925915\n", 517 | "INFO:root:Epoch[8] Batch [312]\tSpeed: 34781.54 samples/sec\tTrain-RMSE=0.923126\n", 518 | "INFO:root:Epoch[8] Batch [468]\tSpeed: 34741.81 samples/sec\tTrain-RMSE=0.932726\n", 519 | "INFO:root:Epoch[8] Batch [624]\tSpeed: 34974.63 samples/sec\tTrain-RMSE=0.929655\n", 520 | "INFO:root:Epoch[8] Batch [780]\tSpeed: 34901.76 samples/sec\tTrain-RMSE=0.921560\n", 521 | "INFO:root:Epoch[8] Batch [936]\tSpeed: 34887.63 samples/sec\tTrain-RMSE=0.929801\n", 522 | "INFO:root:Epoch[8] Batch [1092]\tSpeed: 34707.05 samples/sec\tTrain-RMSE=0.929785\n", 523 | "INFO:root:Epoch[8] Batch [1248]\tSpeed: 34832.37 samples/sec\tTrain-RMSE=0.929096\n", 524 | "INFO:root:Epoch[8] Resetting Data Iterator\n", 525 | "INFO:root:Epoch[8] Time cost=2.336\n", 526 | "INFO:root:Epoch[8] Validation-RMSE=0.965062\n", 527 | "INFO:root:Epoch[9] Batch [156]\tSpeed: 34933.38 samples/sec\tTrain-RMSE=0.909513\n", 528 | "INFO:root:Epoch[9] Batch [312]\tSpeed: 34673.05 samples/sec\tTrain-RMSE=0.933655\n", 529 | "INFO:root:Epoch[9] Batch [468]\tSpeed: 34769.87 samples/sec\tTrain-RMSE=0.929988\n", 530 | "INFO:root:Epoch[9] Batch [624]\tSpeed: 34876.79 samples/sec\tTrain-RMSE=0.916171\n", 531 | "INFO:root:Epoch[9] Batch [780]\tSpeed: 35084.13 samples/sec\tTrain-RMSE=0.918759\n", 532 | "INFO:root:Epoch[9] Batch [936]\tSpeed: 35038.72 samples/sec\tTrain-RMSE=0.918020\n", 533 | "INFO:root:Epoch[9] Batch [1092]\tSpeed: 34762.51 samples/sec\tTrain-RMSE=0.927458\n", 534 | "INFO:root:Epoch[9] Batch [1248]\tSpeed: 34990.71 samples/sec\tTrain-RMSE=0.941439\n", 535 | "INFO:root:Epoch[9] Resetting Data Iterator\n", 536 | "INFO:root:Epoch[9] Time cost=2.333\n", 537 | "INFO:root:Epoch[9] Validation-RMSE=0.951576\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "def get_one_layer_mlp(hidden, k):\n", 543 | " # input\n", 544 | " user = mx.symbol.Variable('user')\n", 545 | " item = mx.symbol.Variable('item')\n", 546 | " score = mx.symbol.Variable('score')\n", 547 | " # user latent features\n", 548 | " user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)\n", 549 | " user = mx.symbol.FullyConnected(data = user, num_hidden = hidden)\n", 550 | " # item latent features\n", 551 | " item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)\n", 552 | " item = mx.symbol.FullyConnected(data = item, num_hidden = hidden)\n", 553 | " # predict by the inner product\n", 554 | " pred = user * item\n", 555 | " pred = mx.symbol.sum_axis(data = pred, axis = 1)\n", 556 | " pred = mx.symbol.Flatten(data = pred)\n", 557 | " # loss layer\n", 558 | " pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)\n", 559 | " return pred\n", 560 | "\n", 561 | "train(get_one_layer_mlp(64, 64), batch_size=64, num_epoch=10, learning_rate=.02)" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "Adding dropout layers to relief the over-fitting. 
" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 17, 574 | "metadata": { 575 | "collapsed": false 576 | }, 577 | "outputs": [ 578 | { 579 | "name": "stderr", 580 | "output_type": "stream", 581 | "text": [ 582 | "INFO:root:Start training with [gpu(0)]\n", 583 | "INFO:root:Epoch[0] Batch [156]\tSpeed: 29301.10 samples/sec\tTrain-RMSE=1.790682\n", 584 | "INFO:root:Epoch[0] Batch [312]\tSpeed: 29281.08 samples/sec\tTrain-RMSE=1.113848\n", 585 | "INFO:root:Epoch[0] Batch [468]\tSpeed: 29358.50 samples/sec\tTrain-RMSE=1.060094\n", 586 | "INFO:root:Epoch[0] Batch [624]\tSpeed: 29291.55 samples/sec\tTrain-RMSE=1.005032\n", 587 | "INFO:root:Epoch[0] Batch [780]\tSpeed: 29233.50 samples/sec\tTrain-RMSE=0.975966\n", 588 | "INFO:root:Epoch[0] Batch [936]\tSpeed: 29310.04 samples/sec\tTrain-RMSE=0.982252\n", 589 | "INFO:root:Epoch[0] Batch [1092]\tSpeed: 29440.38 samples/sec\tTrain-RMSE=0.972627\n", 590 | "INFO:root:Epoch[0] Batch [1248]\tSpeed: 29424.34 samples/sec\tTrain-RMSE=0.972099\n", 591 | "INFO:root:Epoch[0] Resetting Data Iterator\n", 592 | "INFO:root:Epoch[0] Time cost=2.768\n", 593 | "INFO:root:Epoch[0] Validation-RMSE=0.983640\n", 594 | "INFO:root:Epoch[1] Batch [156]\tSpeed: 22813.32 samples/sec\tTrain-RMSE=0.944569\n", 595 | "INFO:root:Epoch[1] Batch [312]\tSpeed: 22577.30 samples/sec\tTrain-RMSE=0.950992\n", 596 | "INFO:root:Epoch[1] Batch [468]\tSpeed: 22614.69 samples/sec\tTrain-RMSE=0.957585\n", 597 | "INFO:root:Epoch[1] Batch [624]\tSpeed: 22576.14 samples/sec\tTrain-RMSE=0.933718\n", 598 | "INFO:root:Epoch[1] Batch [780]\tSpeed: 22581.64 samples/sec\tTrain-RMSE=0.953794\n", 599 | "INFO:root:Epoch[1] Batch [936]\tSpeed: 22460.23 samples/sec\tTrain-RMSE=0.958251\n", 600 | "INFO:root:Epoch[1] Batch [1092]\tSpeed: 22577.51 samples/sec\tTrain-RMSE=0.947528\n", 601 | "INFO:root:Epoch[1] Batch [1248]\tSpeed: 22556.34 samples/sec\tTrain-RMSE=0.941649\n", 602 | "INFO:root:Epoch[1] Resetting Data Iterator\n", 603 | "INFO:root:Epoch[1] Time cost=3.582\n", 604 | "INFO:root:Epoch[1] Validation-RMSE=0.968206\n", 605 | "INFO:root:Epoch[2] Batch [156]\tSpeed: 22787.39 samples/sec\tTrain-RMSE=0.930318\n", 606 | "INFO:root:Epoch[2] Batch [312]\tSpeed: 22674.01 samples/sec\tTrain-RMSE=0.941576\n", 607 | "INFO:root:Epoch[2] Batch [468]\tSpeed: 22767.96 samples/sec\tTrain-RMSE=0.944394\n", 608 | "INFO:root:Epoch[2] Batch [624]\tSpeed: 22708.45 samples/sec\tTrain-RMSE=0.933607\n", 609 | "INFO:root:Epoch[2] Batch [780]\tSpeed: 22713.47 samples/sec\tTrain-RMSE=0.937826\n", 610 | "INFO:root:Epoch[2] Batch [936]\tSpeed: 22606.24 samples/sec\tTrain-RMSE=0.950298\n", 611 | "INFO:root:Epoch[2] Batch [1092]\tSpeed: 22610.39 samples/sec\tTrain-RMSE=0.937505\n", 612 | "INFO:root:Epoch[2] Batch [1248]\tSpeed: 22522.81 samples/sec\tTrain-RMSE=0.942826\n", 613 | "INFO:root:Epoch[2] Resetting Data Iterator\n", 614 | "INFO:root:Epoch[2] Time cost=3.570\n", 615 | "INFO:root:Epoch[2] Validation-RMSE=0.961135\n", 616 | "INFO:root:Epoch[3] Batch [156]\tSpeed: 22609.20 samples/sec\tTrain-RMSE=0.934108\n", 617 | "INFO:root:Epoch[3] Batch [312]\tSpeed: 22430.77 samples/sec\tTrain-RMSE=0.930039\n", 618 | "INFO:root:Epoch[3] Batch [468]\tSpeed: 22459.93 samples/sec\tTrain-RMSE=0.921241\n", 619 | "INFO:root:Epoch[3] Batch [624]\tSpeed: 22508.39 samples/sec\tTrain-RMSE=0.935312\n", 620 | "INFO:root:Epoch[3] Batch [780]\tSpeed: 29143.73 samples/sec\tTrain-RMSE=0.945226\n", 621 | "INFO:root:Epoch[3] Batch [936]\tSpeed: 29230.33 samples/sec\tTrain-RMSE=0.941457\n", 622 | "INFO:root:Epoch[3] Batch 
[1092]\tSpeed: 29360.99 samples/sec\tTrain-RMSE=0.946398\n", 623 | "INFO:root:Epoch[3] Batch [1248]\tSpeed: 29240.09 samples/sec\tTrain-RMSE=0.937095\n", 624 | "INFO:root:Epoch[3] Resetting Data Iterator\n", 625 | "INFO:root:Epoch[3] Time cost=3.188\n", 626 | "INFO:root:Epoch[3] Validation-RMSE=0.961336\n", 627 | "INFO:root:Epoch[4] Batch [156]\tSpeed: 29343.64 samples/sec\tTrain-RMSE=0.933257\n", 628 | "INFO:root:Epoch[4] Batch [312]\tSpeed: 29160.58 samples/sec\tTrain-RMSE=0.917006\n", 629 | "INFO:root:Epoch[4] Batch [468]\tSpeed: 29216.30 samples/sec\tTrain-RMSE=0.942208\n", 630 | "INFO:root:Epoch[4] Batch [624]\tSpeed: 29259.11 samples/sec\tTrain-RMSE=0.937070\n", 631 | "INFO:root:Epoch[4] Batch [780]\tSpeed: 29183.52 samples/sec\tTrain-RMSE=0.929886\n", 632 | "INFO:root:Epoch[4] Batch [936]\tSpeed: 29283.23 samples/sec\tTrain-RMSE=0.942090\n", 633 | "INFO:root:Epoch[4] Batch [1092]\tSpeed: 29315.05 samples/sec\tTrain-RMSE=0.925818\n", 634 | "INFO:root:Epoch[4] Batch [1248]\tSpeed: 29301.72 samples/sec\tTrain-RMSE=0.946179\n", 635 | "INFO:root:Epoch[4] Resetting Data Iterator\n", 636 | "INFO:root:Epoch[4] Time cost=2.772\n", 637 | "INFO:root:Epoch[4] Validation-RMSE=0.956032\n", 638 | "INFO:root:Epoch[5] Batch [156]\tSpeed: 29654.36 samples/sec\tTrain-RMSE=0.921894\n", 639 | "INFO:root:Epoch[5] Batch [312]\tSpeed: 29407.17 samples/sec\tTrain-RMSE=0.928768\n", 640 | "INFO:root:Epoch[5] Batch [468]\tSpeed: 29363.23 samples/sec\tTrain-RMSE=0.923658\n", 641 | "INFO:root:Epoch[5] Batch [624]\tSpeed: 29401.80 samples/sec\tTrain-RMSE=0.920641\n", 642 | "INFO:root:Epoch[5] Batch [780]\tSpeed: 29421.04 samples/sec\tTrain-RMSE=0.929944\n", 643 | "INFO:root:Epoch[5] Batch [936]\tSpeed: 29321.84 samples/sec\tTrain-RMSE=0.939287\n", 644 | "INFO:root:Epoch[5] Batch [1092]\tSpeed: 29435.78 samples/sec\tTrain-RMSE=0.947566\n", 645 | "INFO:root:Epoch[5] Batch [1248]\tSpeed: 29440.75 samples/sec\tTrain-RMSE=0.935860\n", 646 | "INFO:root:Epoch[5] Resetting Data Iterator\n", 647 | "INFO:root:Epoch[5] Time cost=2.758\n", 648 | "INFO:root:Epoch[5] Validation-RMSE=0.963435\n", 649 | "INFO:root:Epoch[6] Batch [156]\tSpeed: 29453.84 samples/sec\tTrain-RMSE=0.924084\n", 650 | "INFO:root:Epoch[6] Batch [312]\tSpeed: 29369.97 samples/sec\tTrain-RMSE=0.932678\n", 651 | "INFO:root:Epoch[6] Batch [468]\tSpeed: 29396.87 samples/sec\tTrain-RMSE=0.940401\n", 652 | "INFO:root:Epoch[6] Batch [624]\tSpeed: 29553.95 samples/sec\tTrain-RMSE=0.922988\n", 653 | "INFO:root:Epoch[6] Batch [780]\tSpeed: 29427.96 samples/sec\tTrain-RMSE=0.927942\n", 654 | "INFO:root:Epoch[6] Batch [936]\tSpeed: 28665.02 samples/sec\tTrain-RMSE=0.930397\n", 655 | "INFO:root:Epoch[6] Batch [1092]\tSpeed: 28085.11 samples/sec\tTrain-RMSE=0.928915\n", 656 | "INFO:root:Epoch[6] Batch [1248]\tSpeed: 29463.06 samples/sec\tTrain-RMSE=0.928845\n", 657 | "INFO:root:Epoch[6] Resetting Data Iterator\n", 658 | "INFO:root:Epoch[6] Time cost=2.783\n", 659 | "INFO:root:Epoch[6] Validation-RMSE=0.953191\n", 660 | "INFO:root:Epoch[7] Batch [156]\tSpeed: 29812.06 samples/sec\tTrain-RMSE=0.923179\n", 661 | "INFO:root:Epoch[7] Batch [312]\tSpeed: 29008.01 samples/sec\tTrain-RMSE=0.932230\n", 662 | "INFO:root:Epoch[7] Batch [468]\tSpeed: 29442.55 samples/sec\tTrain-RMSE=0.915791\n", 663 | "INFO:root:Epoch[7] Batch [624]\tSpeed: 29462.81 samples/sec\tTrain-RMSE=0.930312\n", 664 | "INFO:root:Epoch[7] Batch [780]\tSpeed: 29415.23 samples/sec\tTrain-RMSE=0.922857\n", 665 | "INFO:root:Epoch[7] Batch [936]\tSpeed: 29436.57 samples/sec\tTrain-RMSE=0.926345\n", 666 | 
"INFO:root:Epoch[7] Batch [1092]\tSpeed: 29437.35 samples/sec\tTrain-RMSE=0.922830\n", 667 | "INFO:root:Epoch[7] Batch [1248]\tSpeed: 29366.44 samples/sec\tTrain-RMSE=0.932184\n", 668 | "INFO:root:Epoch[7] Resetting Data Iterator\n", 669 | "INFO:root:Epoch[7] Time cost=2.761\n", 670 | "INFO:root:Epoch[7] Validation-RMSE=0.955406\n", 671 | "INFO:root:Epoch[8] Batch [156]\tSpeed: 29679.83 samples/sec\tTrain-RMSE=0.915235\n", 672 | "INFO:root:Epoch[8] Batch [312]\tSpeed: 29480.73 samples/sec\tTrain-RMSE=0.926871\n", 673 | "INFO:root:Epoch[8] Batch [468]\tSpeed: 29419.22 samples/sec\tTrain-RMSE=0.918528\n", 674 | "INFO:root:Epoch[8] Batch [624]\tSpeed: 29449.15 samples/sec\tTrain-RMSE=0.926639\n", 675 | "INFO:root:Epoch[8] Batch [780]\tSpeed: 29444.12 samples/sec\tTrain-RMSE=0.914554\n", 676 | "INFO:root:Epoch[8] Batch [936]\tSpeed: 29494.75 samples/sec\tTrain-RMSE=0.928927\n", 677 | "INFO:root:Epoch[8] Batch [1092]\tSpeed: 29466.19 samples/sec\tTrain-RMSE=0.929132\n", 678 | "INFO:root:Epoch[8] Batch [1248]\tSpeed: 29537.06 samples/sec\tTrain-RMSE=0.939788\n", 679 | "INFO:root:Epoch[8] Resetting Data Iterator\n", 680 | "INFO:root:Epoch[8] Time cost=2.752\n", 681 | "INFO:root:Epoch[8] Validation-RMSE=0.959778\n", 682 | "INFO:root:Epoch[9] Batch [156]\tSpeed: 29699.26 samples/sec\tTrain-RMSE=0.916807\n", 683 | "INFO:root:Epoch[9] Batch [312]\tSpeed: 29539.41 samples/sec\tTrain-RMSE=0.910764\n", 684 | "INFO:root:Epoch[9] Batch [468]\tSpeed: 29567.49 samples/sec\tTrain-RMSE=0.927473\n", 685 | "INFO:root:Epoch[9] Batch [624]\tSpeed: 29540.48 samples/sec\tTrain-RMSE=0.917609\n", 686 | "INFO:root:Epoch[9] Batch [780]\tSpeed: 29887.83 samples/sec\tTrain-RMSE=0.927466\n", 687 | "INFO:root:Epoch[9] Batch [936]\tSpeed: 29868.52 samples/sec\tTrain-RMSE=0.925121\n", 688 | "INFO:root:Epoch[9] Batch [1092]\tSpeed: 29969.65 samples/sec\tTrain-RMSE=0.929185\n", 689 | "INFO:root:Epoch[9] Batch [1248]\tSpeed: 29789.65 samples/sec\tTrain-RMSE=0.929312\n", 690 | "INFO:root:Epoch[9] Resetting Data Iterator\n", 691 | "INFO:root:Epoch[9] Time cost=2.731\n", 692 | "INFO:root:Epoch[9] Validation-RMSE=0.947868\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "def get_one_layer_dropout_mlp(hidden, k):\n", 698 | " # input\n", 699 | " user = mx.symbol.Variable('user')\n", 700 | " item = mx.symbol.Variable('item')\n", 701 | " score = mx.symbol.Variable('score')\n", 702 | " # user latent features\n", 703 | " user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)\n", 704 | " user = mx.symbol.FullyConnected(data = user, num_hidden = hidden)\n", 705 | " user = mx.symbol.Dropout(data=user, p=0.5)\n", 706 | " # item latent features\n", 707 | " item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)\n", 708 | " item = mx.symbol.FullyConnected(data = item, num_hidden = hidden)\n", 709 | " item = mx.symbol.Dropout(data=item, p=0.5) \n", 710 | " # predict by the inner product\n", 711 | " pred = user * item\n", 712 | " pred = mx.symbol.sum_axis(data = pred, axis = 1)\n", 713 | " pred = mx.symbol.Flatten(data = pred)\n", 714 | " # loss layer\n", 715 | " pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)\n", 716 | " return pred\n", 717 | "train(get_one_layer_mlp(256, 512), batch_size=64, num_epoch=10, learning_rate=.02)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "markdown", 722 | "metadata": { 723 | "collapsed": true 724 | }, 725 | "source": [ 726 | "## Acknowledgement\n", 727 | "\n", 728 | "This tutorial is based on examples from 
[xlvector/github](https://github.com/xlvector/)." 729 | ] 730 | } 731 | ], 732 | "metadata": { 733 | "kernelspec": { 734 | "display_name": "Python 2", 735 | "language": "python", 736 | "name": "python2" 737 | }, 738 | "language_info": { 739 | "codemirror_mode": { 740 | "name": "ipython", 741 | "version": 2 742 | }, 743 | "file_extension": ".py", 744 | "mimetype": "text/x-python", 745 | "name": "python", 746 | "nbconvert_exporter": "python", 747 | "pygments_lexer": "ipython2", 748 | "version": "2.7.6" 749 | } 750 | }, 751 | "nbformat": 4, 752 | "nbformat_minor": 1 753 | } 754 | --------------------------------------------------------------------------------