├── 2LayerLSTM
│   ├── lstmClassOld.py
│   └── stockTT.bin
├── DavesReservoirComputingRefs
├── GRU
│   ├── GRU.py
│   └── stockTT.bin
├── ICLR Top paper thumbnail descriptions.
├── LICENSE
├── PapersSuggested_not_read
├── README.md
├── ZeeshanZiaSlides-DeepSupervision3DObjectParsing
│   └── DLStudyGroup.pdf
├── awspot
│   ├── README.md
│   └── user_data.sh
├── gatedRNN.ipynb
└── tensorflow
    └── Gated Feedback Recurrent Neural Network - GF LSTM.ipynb
/2LayerLSTM/lstmClassOld.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | import random 5 | import matplotlib.pyplot as plt 6 | import cPickle as pickle 7 | from math import sqrt 8 | #from lstmClass import LstmLayer, recurrent_fn 9 | 10 | '''Define LSTM class for a single-layer LSTM 11 | The objective is to define it in such a way as to facilitate construction of a multi-layer 12 | LSTM. The questions are: 13 | 1. The cost function can't go inside the class because it may only be associated with the last 14 | layer in the stack. 15 | 2. Should the scan function be inside or outside of the class? 16 | 3. How about the gradient calculations? It seems like those need to be outside the class. 17 | Does that mean the gradient calculations have to be outside the class statement? 18 | 4. Perhaps the scan function and the single-layer recurrence function need to be inside 19 | the class statement, but the cost function goes outside. 20 | 5. Then the gradient calculation might only need a list of the parameters against which 21 | the cost needs to be diff'd. That would just be the list of lstm-layer objects dotted with 22 | the parameter list for each one. 23 | 6. Not clear how the gradient of the scan function may interact with Python OOP. Not sure if scan 24 | output includes enough for the gradient calc. Perhaps scan should be external to the class structure. 25 | Plan A:
26 | class RNN 27 | 28 | 29 | ''' 30 | 31 | class LstmLayer(object): 32 | 33 | def __init__(self, n_in, n_hidden, n_out, name): 34 | self.name = name 35 | rng = np.random.RandomState(1234) 36 | #cell input 37 | self.W_ug = np.asarray(rng.normal(size=(n_in, n_hidden), scale= .01, loc = 0.0), dtype = theano.config.floatX) 38 | self.W_hg = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale=.01, loc = 0.0), dtype = theano.config.floatX) 39 | self.b_g = np.zeros((n_hidden,), dtype=theano.config.floatX) 40 | #input gate equation 41 | self.W_ui = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 42 | self.W_hi = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 43 | self.b_i = np.zeros((n_hidden,), dtype=theano.config.floatX) 44 | #forget gate equations 45 | self.W_uf = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 46 | self.W_hf = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 47 | self.b_f = np.zeros((n_hidden,), dtype=theano.config.floatX) 48 | #cell output gate equations 49 | self.W_uo = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 50 | self.W_ho = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 51 | self.b_o = np.zeros((n_hidden,), dtype=theano.config.floatX) 52 | #output layer 53 | self.W_hy = np.asarray(rng.normal(size=(n_hidden, n_out), scale =.01, loc=0.0), dtype = theano.config.floatX) 54 | self.b_hy = np.zeros((n_out,), dtype=theano.config.floatX) 55 | 56 | #cell input 57 | self.W_ug = theano.shared(self.W_ug, 'W_ug' + self.name) 58 | self.W_hg = theano.shared(self.W_hg, 'W_hg' + self.name) 59 | self.b_g = theano.shared(self.b_g, 'b_g' + self.name) 60 | #input gate equation 61 | self.W_ui = theano.shared(self.W_ui, 'W_ui' + self.name) 62 | self.W_hi = theano.shared(self.W_hi, 'W_hi' + self.name) 63 | self.b_i = theano.shared(self.b_i, 'b_i' + self.name) 64 | #forget gate equations 65 | self.W_uf = theano.shared(self.W_uf, 'W_uf' + self.name) 66 | self.W_hf = theano.shared(self.W_hf, 'W_hf' + self.name) 67 | self.b_f = theano.shared(self.b_f, 'b_f' + self.name) 68 | #cell output gate equations 69 | self.W_uo = theano.shared(self.W_uo, 'W_uo' + self.name) 70 | self.W_ho = theano.shared(self.W_ho, 'W_ho' + self.name) 71 | self.b_o = theano.shared(self.b_o, 'b_o' + self.name) 72 | #output layer 73 | self.W_hy = theano.shared(self.W_hy, 'W_hy' + self.name) 74 | self.b_hy = theano.shared(self.b_hy, 'b_hy' + self.name) 75 | 76 | self.h0_tm1 = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX)) 77 | self.s0_tm1 = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX)) 78 | self.argList = [self.W_ug, self.W_hg, self.b_g, self.W_ui, self.W_hi, 79 | self.b_i, self.W_uf, self.W_hf, self.b_f, self.W_uo, self.W_ho, self.b_o, self.W_hy, self.b_hy] 80 | 81 | def recurrent_fn(u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 82 | W_uo, W_ho, b_o, W_hy, b_hy): 83 | g_t = T.tanh(T.dot(u_t, W_ug) + T.dot(h_tm1, W_hg) + b_g) 84 | i_t = T.nnet.sigmoid(T.dot(u_t, W_ui) + T.dot(h_tm1, W_hi) + b_i) 85 | f_t = T.nnet.sigmoid(T.dot(u_t, W_uf) + T.dot(h_tm1, W_hf) + b_f) 86 | o_t = T.nnet.sigmoid(T.dot(u_t, W_uo) + T.dot(h_tm1, W_ho) + b_o) 87 | s_t = g_t * i_t + s_tm1*f_t 88 | h_t = T.tanh(s_t)*o_t 89 | #h_t = self.activ(T.dot(h_tm1, W_hh) + T.dot(u_t, W_uh) + b_hh) 90 | 
return [h_t, s_t] 91 | 92 | def fcn2(u_t, h_tm1, s_tm1,h_tm12, s_tm12, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 93 | W_uo, W_ho, b_o, W_hy, b_hy, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 94 | W_uo2, W_ho2, b_o2, W_hy2, b_hy2): 95 | [h_t, s_t] = recurrent_fn(u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 96 | W_uo, W_ho, b_o, W_hy, b_hy) 97 | o1 = T.dot(h_t, W_hy) + b_hy #project layer 1's current hidden state to form layer 2's input 98 | [h_t2, s_t2] = recurrent_fn(o1, h_tm12, s_tm12, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 99 | W_uo2, W_ho2, b_o2, W_hy2, b_hy2) 100 | return [h_t, s_t, h_t2, s_t2] 101 | 102 | 103 | #use LstmLayer class to define algebra of lstm and build stack and gradient calculation 104 | 105 | #one layer lstm stack for stock price prediction 106 | # u = T.matrix() 107 | # t = T.scalar() 108 | # l1 = LstmLayer(n_in=5, n_hidden=10, n_out=1, name='l1') 109 | 110 | #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 111 | #define 112 | # [l1.h, l1.s], _ = theano.scan(recurrent_fn, sequences = u, 113 | # outputs_info = [l1.h0_tm1, l1.s0_tm1], 114 | # non_sequences = l1.argList) 115 | # y = T.dot(l1.h[-1], l1.W_hy) + l1.b_hy 116 | # cost = ((t - y)**2).mean(axis=0).sum() 117 | # grad = T.grad(cost, l1.argList) 118 | # lr = T.scalar() 119 | # update = [(a, a-lr*b) for (a,b) in zip(l1.argList, grad)] 120 | # 121 | # train_step = theano.function([u, t, lr], cost, 122 | # on_unused_input='warn', 123 | # updates=update, 124 | # allow_input_downcast=True) 125 | 126 | #two layer lstm stack for stock price prediction 127 | u = T.matrix() 128 | t = T.scalar() 129 | o1 = T.matrix() 130 | l1 = LstmLayer(n_in=5, n_hidden=10, n_out=10, name='l1') 131 | l2 = LstmLayer(n_in=10, n_hidden=10, n_out=1, name='l2') 132 | #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 133 | #define 134 | [l1.h, l1.s, l2.h, l2.s], _ = theano.scan(fcn2, sequences = u, 135 | outputs_info = [l1.h0_tm1, l1.s0_tm1, l2.h0_tm1, l2.s0_tm1], 136 | non_sequences = l1.argList + l2.argList) 137 | # non_sequences = l1.argList + l2.argList, mode='DebugMode') 138 | 139 | 140 | 141 | y = T.dot(l2.h[-1], l2.W_hy) + l2.b_hy 142 | cost = ((t - y)**2).mean(axis=0).sum() 143 | grad = T.grad(cost, l1.argList + l2.argList) 144 | lr = T.scalar() 145 | update = [(a, a-lr*b) for (a,b) in zip(l1.argList + l2.argList, grad)] 146 | 147 | train_step = theano.function([u, t, lr], cost, 148 | on_unused_input='warn', 149 | updates=update, 150 | allow_input_downcast=True) 151 | # allow_input_downcast=True, mode='DebugMode') 152 | 153 | 154 | if __name__ == '__main__': 155 | 156 | (xlist, ylist) = pickle.load(open('stockTT.bin', 'rb')) 157 | nInputs = len(xlist[0]) 158 | x = np.array(xlist, dtype = theano.config.floatX) 159 | y = np.array(ylist, dtype = theano.config.floatX) 160 | print "Std Dev of Price Change", np.std(y) 161 | nHidden = 20 162 | nOutputs = 1 163 | lr = 0.01 164 | eSmooth = 1.0 165 | nPasses = 1 166 | vals = [] 167 | errSq = [] 168 | for i in range(nPasses): 169 | for j in range(len(x)): 170 | u = np.asarray(xlist[j], dtype = theano.config.floatX).reshape((1,nInputs)) 171 | t = y[j] 172 | 173 | c = train_step(u, t, lr) 174 | if j%10==0: print "iteration {0}: {1}".format(j, np.sqrt(c)) 175 | eSmooth = 0.1*np.sqrt(c) + 0.9*eSmooth 176 | vals.append(eSmooth) 177 | errSq.append(c) 178 | print 'RMS Pred Error', sqrt(np.average(errSq[500:])) 179 | plt.plot(vals) 180 | plt.show() 181 | 182 | 183 | 184 |
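For reference, the single-layer step that recurrent_fn computes above can be written as a minimal NumPy sketch (an illustration only: forward pass, no Theano and no gradients; the toy dimensions n_in=5, n_hidden=10 are assumptions chosen to match the l1 layer above):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(u_t, h_tm1, s_tm1, p):
    #p holds the same weights as LstmLayer.argList, keyed by name
    g_t = np.tanh(np.dot(u_t, p['W_ug']) + np.dot(h_tm1, p['W_hg']) + p['b_g'])  #cell input
    i_t = sigmoid(np.dot(u_t, p['W_ui']) + np.dot(h_tm1, p['W_hi']) + p['b_i'])  #input gate
    f_t = sigmoid(np.dot(u_t, p['W_uf']) + np.dot(h_tm1, p['W_hf']) + p['b_f'])  #forget gate
    o_t = sigmoid(np.dot(u_t, p['W_uo']) + np.dot(h_tm1, p['W_ho']) + p['b_o'])  #output gate
    s_t = g_t * i_t + s_tm1 * f_t   #cell state: gated new input plus gated old state
    h_t = np.tanh(s_t) * o_t        #hidden state: squashed cell state, gated for output
    return h_t, s_t

#toy usage with the assumed dimensions
rng = np.random.RandomState(1234)
p = {w: rng.normal(scale=0.01, size=(5, 10)) for w in ('W_ug', 'W_ui', 'W_uf', 'W_uo')}
p.update({w: rng.normal(scale=0.01, size=(10, 10)) for w in ('W_hg', 'W_hi', 'W_hf', 'W_ho')})
p.update({b: np.zeros(10) for b in ('b_g', 'b_i', 'b_f', 'b_o')})
h_t, s_t = lstm_step(rng.normal(size=5), np.zeros(10), np.zeros(10), p)

The two-layer stack in fcn2 is then just two such steps per time point, with layer 1's projected output (T.dot(h_t, W_hy) + b_hy) serving as layer 2's input.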
-------------------------------------------------------------------------------- /DavesReservoirComputingRefs: -------------------------------------------------------------------------------- 1 | Here is the material for Reservoir Computing for next week - one paper 2 | (with accompanying slides), plus a second paper if we have time. 3 | 4 | === 5 | Overview: 6 | 7 | Next week's paper is Reservoir Computing by Felix Grezes. But if you are 8 | new to randomized projections, you may want to start with the blog post 9 | by Terence Tao and/or the Nuit Blanche blog post to learn about 10 | the underlying concepts - links to both blogs are below, after Grezes' 11 | paper and slides. 12 | 13 | If we have time we may also discuss "Information Processing Using a 14 | Single Dynamical Node as Complex System." 15 | 16 | -- 17 | Links: 18 | 19 | Next week's paper (and related slides): 20 | Reservoir Computing by Felix Grezes. 21 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Survey_Felix_Grezes_9_04_2014.pdf 22 | 23 | Slides by Felix Grezes: Reservoir Computing for Neural Networks 24 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Slides_Felix_Grezes_9-14-2014.pdf 25 | (more at: http://speech.cs.qc.cuny.edu/~felix/ ) 26 | 27 | -- 28 | 29 | This is a short, very useful backgrounder on randomized projections, 30 | here used for compressed sensing, in a blog post by Terence Tao 31 | https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/ 32 | 33 | and the same story told with illustrations on the Nuit Blanche blog: 34 | http://nuit-blanche.blogspot.com/2007/07/how-does-rice-one-pixel-camera-work.html 35 | 36 | (BTW http://nuit-blanche.blogspot.com is a tremendous website.) 37 | 38 | --- 39 | 40 | If we have time, we may discuss this paper: 41 | 42 | Information Processing Using a Single Dynamical Node as Complex System.
43 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3195233/pdf/ncomms1476.pdf 44 | 45 | Supplementary Information to the paper: 46 | http://www.nature.com/article-assets/npg/ncomms/journal/v2/n9/extref/ncomms1476-s1.pdf 47 | 48 | === 49 | 50 | Additional background info on, and software for, randomized projections 51 | and Reservoir Computing: 52 | 53 | Web Site on Reservoir Computing 54 | http://organic.elis.ugent.be/flavors 55 | -- 56 | Reservoir Computing Python Toolkit (Oger) (see above Reservoir 57 | Computing web site for list of other software packages) 58 | http://organic.elis.ugent.be/software/organic-environment-reservoir-computing-oger-toolbox 59 | -- 60 | A good video by Ted Dunning on randomized projections: 61 | https://vimeo.com/33417977 62 | 63 | A key paper on randomized projections, which is discussed in the above 64 | video: 65 | Finding structure with randomness: Probabilistic algorithms for 66 | constructing approximate matrix decompositions - 67 | https://arxiv.org/abs/0909.4061 68 | -- 69 | Some papers on Reservoir Computing that go into more detail: 70 | Constructing optimized binary masks for reservoir computing with delay 71 | systems 72 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3887384/pdf/srep03629.pdf 73 | -- 74 | A Comparative Study of Reservoir Computing for Temporal Signal 75 | Processing https://arxiv.org/abs/1401.2224 76 | -- 77 | Product Reservoir Computing: Time-Series Computation with Multiplicative 78 | Neurons 79 | https://arxiv.org/pdf/1502.00718.pdf 80 | -- 81 | Reservoir computing approaches to recurrent neural network training 82 | http://minds.jacobs-university.de/sites/default/files/uploads/papers/2261_LukoseviciusJaeger09.pdf 83 | -- 84 | Reservoir Computing and Self-Organized Neural Hierarchies 85 | http://minds.jacobs-university.de/sites/default/files/uploads/papers/Mantas_Lukosevicius_PhD_thesis.pdf 86 | -------------------------------------------------------------------------------- /GRU/GRU.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | import random 5 | import matplotlib.pyplot as plt 6 | import cPickle as pickle 7 | from math import sqrt 8 | #from lstmClass import LstmLayer, recurrent_fn 9 | 10 | '''Define GRU class for a single-layer GRU 11 | The objective is to define it in such a way as to facilitate construction of a multi-layer 12 | GRU. The questions are: 13 | 1. The cost function can't go inside the class because it may only be associated with the last 14 | layer in the stack. 15 | 2. Should the scan function be inside or outside of the class? 16 | 3. How about the gradient calculations? It seems like those need to be outside the class. 17 | Does that mean the gradient calculations have to be outside the class statement? 18 | 4. Perhaps the scan function and the single-layer recurrence function need to be inside 19 | the class statement, but the cost function goes outside. 20 | 5. Then the gradient calculation might only need a list of the parameters against which 21 | the cost needs to be diff'd. That would just be the list of GRU-layer objects dotted with 22 | the parameter list for each one. 23 | 6. Not clear how the gradient of the scan function may interact with Python OOP. Not sure if scan 24 | output includes enough for the gradient calc.
Perhaps scan should be external to the class structure. 25 | 26 | 27 | ''' 28 | 29 | class GRULayer(object): 30 | 31 | def __init__(self, n_in, n_hidden, n_out, name): 32 | self.name = name 33 | rng = np.random.RandomState(1234) 34 | #candidate activation - h-twiddle equation 35 | self.W_uht = np.asarray(rng.normal(size=(n_in, n_hidden), scale= .01, loc = 0.0), dtype = theano.config.floatX) 36 | self.W_hht = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale=.01, loc = 0.0), dtype = theano.config.floatX) 37 | self.b_ht = np.zeros((n_hidden,), dtype=theano.config.floatX) 38 | #update gate - z eqn 39 | self.W_uz = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 40 | self.W_hz = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 41 | self.b_z = np.zeros((n_hidden,), dtype=theano.config.floatX) 42 | #reset gate equations - r 43 | self.W_ur = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 44 | self.W_hr = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 45 | self.b_r = np.zeros((n_hidden,), dtype=theano.config.floatX) 46 | 47 | #output layer 48 | self.W_hy = np.asarray(rng.normal(size=(n_hidden, n_out), scale =.01, loc=0.0), dtype = theano.config.floatX) 49 | self.b_y = np.zeros((n_out,), dtype=theano.config.floatX) 50 | 51 | #candidate activation - h-twiddle equation 52 | self.W_uht = theano.shared(self.W_uht, 'W_uht' + self.name) 53 | self.W_hht = theano.shared(self.W_hht, 'W_hht' + self.name) 54 | self.b_ht = theano.shared(self.b_ht, 'b_ht' + self.name) 55 | #update gate - z eqn 56 | self.W_uz = theano.shared(self.W_uz, 'W_uz' + self.name) 57 | self.W_hz = theano.shared(self.W_hz, 'W_hz' + self.name) 58 | self.b_z = theano.shared(self.b_z, 'b_z' + self.name) 59 | #reset gate equations 60 | self.W_ur = theano.shared(self.W_ur, 'W_ur' + self.name) 61 | self.W_hr = theano.shared(self.W_hr, 'W_hr' + self.name) 62 | self.b_r = theano.shared(self.b_r, 'b_r' + self.name) 63 | 64 | #output layer 65 | self.W_hy = theano.shared(self.W_hy, 'W_hy' + self.name) 66 | self.b_y = theano.shared(self.b_y, 'b_y' + self.name) 67 | 68 | self.h0_tm1 = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX)) 69 | self.argList = [self.W_uht, self.W_hht, self.b_ht, self.W_uz, self.W_hz, 70 | self.b_z, self.W_ur, self.W_hr, self.b_r, self.W_hy, self.b_y] 71 | 72 | def recurrent_fn(u_t, h_tm1, W_uht, W_hht, b_ht, W_uz, W_hz, b_z, W_ur, W_hr, b_r, W_hy, b_y): 73 | z_t = T.nnet.sigmoid(T.dot(u_t, W_uz) + T.dot(h_tm1, W_hz) + b_z) 74 | r_t = T.nnet.sigmoid(T.dot(u_t, W_ur) + T.dot(h_tm1, W_hr) + b_r) 75 | ht_t = T.tanh(T.dot(u_t, W_uht) + T.dot(r_t*h_tm1, W_hht) + b_ht) 76 | 77 | h_t = (1 - z_t)*h_tm1 + z_t*ht_t 78 | return h_t 79 | 80 | # def fcn2(u_t, h_tm1, s_tm1,h_tm12, s_tm12, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 81 | # W_uo, W_ho, b_o, W_hy, b_hy, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 82 | # W_uo2, W_ho2, b_o2, W_hy2, b_hy2): 83 | # [h_t, s_t] = recurrent_fn(u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 84 | # W_uo, W_ho, b_o, W_hy, b_hy) 85 | # o1 = T.dot(h_tm1, W_hy) + b_hy 86 | # [h_t2, s_t2] = recurrent_fn(o1, h_tm12, s_tm12, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 87 | # W_uo2, W_ho2, b_o2, W_hy2, b_hy2) 88 | # return [h_t, s_t, h_t2, s_t2] 89 | 90 | 91 | #use GRULayer class to define algebra of GRU and build stack and gradient calculation
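#For reference, a minimal NumPy sketch of the single GRU step computed by
#recurrent_fn above (an illustration only, not part of the original file:
#forward pass, no gradients; relies on the numpy import at the top of this
#file, and p holds the same weights as GRULayer.argList, keyed by name)
def gru_step_np(u_t, h_tm1, p):
    np_sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    z_t = np_sigmoid(np.dot(u_t, p['W_uz']) + np.dot(h_tm1, p['W_hz']) + p['b_z'])   #update gate: how much new vs. old state
    r_t = np_sigmoid(np.dot(u_t, p['W_ur']) + np.dot(h_tm1, p['W_hr']) + p['b_r'])   #reset gate: how much history enters the candidate
    ht_t = np.tanh(np.dot(u_t, p['W_uht']) + np.dot(r_t * h_tm1, p['W_hht']) + p['b_ht'])  #candidate activation (h-twiddle)
    return (1 - z_t) * h_tm1 + z_t * ht_t    #h_t: convex blend of old state and candidate
#note the GRU carries one state vector (h) and has no output gate, versus the LSTM's two states (h, s)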
92 | 93 | #one layer gru stack for stock price prediction 94 | u = T.matrix() 95 | t = T.scalar() 96 | l1 = GRULayer(n_in=5, n_hidden=10, n_out=1, name='l1') 97 | 98 | #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 99 | #define 100 | l1.h, _ = theano.scan(recurrent_fn, sequences = u, 101 | outputs_info = [l1.h0_tm1], 102 | non_sequences = l1.argList) 103 | y = T.dot(l1.h[-1], l1.W_hy) + l1.b_y 104 | cost = ((t - y)**2).mean(axis=0).sum() 105 | grad = T.grad(cost, l1.argList) 106 | lr = T.scalar() 107 | update = [(a, a-lr*b) for (a,b) in zip(l1.argList, grad)] 108 | 109 | train_step = theano.function([u, t, lr], cost, 110 | on_unused_input='warn', 111 | updates=update, 112 | allow_input_downcast=True) 113 | 114 | # #two layer lstm stack for stock price prediction 115 | # u = T.matrix() 116 | # t = T.scalar() 117 | # o1 = T.matrix() 118 | # l1 = LstmLayer(n_in=5, n_hidden=10, n_out=10, name='l1') 119 | # l2 = LstmLayer(n_in=10, n_hidden=10, n_out=1, name='l2') 120 | # #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 121 | # #define 122 | # [l1.h, l1.s, l2.h, l2.s], _ = theano.scan(fcn2, sequences = u, 123 | # outputs_info = [l1.h0_tm1, l1.s0_tm1, l2.h0_tm1, l2.s0_tm1], 124 | # non_sequences = l1.argList + l2.argList) 125 | # # non_sequences = l1.argList + l2.argList, mode='DebugMode') 126 | 127 | 128 | 129 | # y = T.dot(l2.h[-1], l2.W_hy) + l2.b_hy 130 | # cost = ((t - y)**2).mean(axis=0).sum() 131 | # grad = T.grad(cost, l1.argList + l2.argList) 132 | # lr = T.scalar() 133 | # update = [(a, a-lr*b) for (a,b) in zip(l1.argList + l2.argList, grad)] 134 | # 135 | # train_step = theano.function([u, t, lr], cost, 136 | # on_unused_input='warn', 137 | # updates=update, 138 | # allow_input_downcast=True) 139 | # allow_input_downcast=True, mode='DebugMode') 140 | 141 | 142 | if __name__ == '__main__': 143 | 144 | (xlist, ylist) = pickle.load(open('stockTT.bin', 'rb')) 145 | nInputs = len(xlist[0]) 146 | x = np.array(xlist, dtype = theano.config.floatX) 147 | y = np.array(ylist, dtype = theano.config.floatX) 148 | print "Std Dev of Price Change", np.std(y) 149 | nHidden = 50 150 | nOutputs = 1 151 | lr = 0.01 152 | eSmooth = 1.0 153 | nPasses = 1 154 | vals = [] 155 | errSq = [] 156 | for i in range(nPasses): 157 | for j in range(len(x)): 158 | u = np.asarray(xlist[j], dtype = theano.config.floatX).reshape((1,nInputs)) 159 | t = y[j] 160 | 161 | c = train_step(u, t, lr) 162 | if j%10==0: print "iteration {0}: {1}".format(j, np.sqrt(c)) 163 | eSmooth = 0.1*np.sqrt(c) + 0.9*eSmooth 164 | vals.append(eSmooth) 165 | errSq.append(c) 166 | print 'RMS Pred Error', sqrt(np.average(errSq[500:])) 167 | plt.plot(vals) 168 | plt.show() 169 | 170 | #with nhidden = 20 (error number is the same with nhidden = 50) 171 | #std dev of price = 3.55612 172 | #std dev of error = 1.36565147217 173 | # 1 - (error variance / price variance) = 0.8525 -------------------------------------------------------------------------------- /ICLR Top paper thumbnail descriptions.: -------------------------------------------------------------------------------- 1 | https://iclr.cc/Conferences/2018/Schedule?type=Oral 2 | 3 | Zero shot visual imitation - Two-step robot learning. First cause the robot to explore without goals, then frame the objective as a sequence of views of intermediate steps, while re-labeling the exploration sequence so objectives reached during exploration are treated as targets. Leads to one-shot learning.
4 | 5 | Boosting dilated convolutional networks w mixed tensor decompositions - theoretical demonstration that layer interconnections improve expressive efficiency in dilated convolutional networks. Interesting and useful result of a rare theoretical type. Going to take some work to get through. 6 | 7 | Principled Adversarial Training - Use the Wasserstein measure to define a distributional neighborhood for generating adversarial training examples. 8 | 9 | Breaking the softmax Bottleneck - Demonstrate that the softmax is too restrictive a model and propose to overcome the restriction by using a mixture of softmaxes (MoS). Achieve consistently better performance in a variety of benchmarks. 10 | 11 | Characterizing adversarial subspaces using local intrinsic dimensionality - Characterize adversarial subspaces as space-filling in neighborhoods of legitimate examples and characterize them by local dimensionality. This characterization yields a test for adversarial examples, and they show vastly improved detection rates over other methods. 12 | 13 | Neural Sketch Learning - System for code generation that operates by breaking the problem into two parts: 1. generating sketches that describe core operation and 2. filling in the sketches with code that satisfies the details of typing etc. 14 | 15 | Learning to represent programs as graphs - Builds on Gated Graph Neural Networks (GGNN) and uses them to attack subproblems of programming: naming variables and using them correctly. 16 | 17 | Insufficiency of existing momentum schemes for stochastic optimization - Demonstrates cases where Nesterov momentum etc. don't perform well when using SGD versus GD. Develop an alternative based on Nesterov. 18 | 19 | Convergence of ADAM and Beyond - Analyze convergence issues of Adam on large parameter spaces. Determine that the moving average is not well suited and develop an alternative. Show performance improvements in synthetic cases constructed on the basis of the authors' analysis of weaknesses and on benchmarks known to cause Adam problems. 20 | 21 | Wasserstein Auto Encoders - Authors use the Wasserstein distance as the comparator function for AE and adversarial nets. Demonstrate better convergence properties than GANs while matching GANs' sample quality. 22 | https://wolfweb.unr.edu/homepage/jabuka/Classes/2006_spring/topology/Notes/04%20-%20Congergent%20sequences.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright [2016] [Mike Bowles] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | -------------------------------------------------------------------------------- /PapersSuggested_not_read: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hdDeepLearningStudy 2 | Papers, code, etc. for the deep learning study group 3 | See group discord - https://discord.gg/HuWVmMgmqS 4 | zoom link - On the meetup page 5 | meeting time - 6:30 pm California time 6 | 7 | ## Tuesday, November 21, 2023 8 | paper: MemGPT - Towards LLMs as an Operating System https://arxiv.org/pdf/2310.08560.pdf 9 | Blog w MemGPT - https://memgpt.ai/ 10 | youtube: https://www.youtube.com/watch?v=nQmZmFERmrg 11 | 12 | ## Tuesday, November 14, 2023 13 | paper: https://openreview.net/pdf?id=S1KGaTSOTS - CLUSTERFORMER: Clustering As A Universal Visual Learner. 14 | 15 | ## Tuesday, November 7, 2023 16 | paper: https://arxiv.org/pdf/2310.12962.pdf - An Emulator for Fine-Tuning Large Language Models using Small Language Models 17 | 18 | ## Tuesday, October 31, 2023 19 | paper: https://www.nature.com/articles/s42256-023-00711-8 - From attribution maps to human-understandable explanations through Concept Relevance Propagation 20 | 21 | ## Tuesday, October 24, 2023 22 | paper: https://arxiv.org/pdf/2209.12951.pdf - Liquid Structural State-Space Models 23 | 24 | ## Tuesday, October 17, 2023 25 | paper: Liquid Time-Constant Networks https://arxiv.org/abs/2006.04439 26 | youtube: https://www.youtube.com/watch?v=IlliqYiRhMU 27 | shorter video: https://www.youtube.com/watch?v=RI35E5ewBuI 28 | 29 | ## Tuesday, October 10, 2023 30 | paper - 3D Gaussian Splatting for Real-Time Radiance Field Rendering https://arxiv.org/abs/2308.04079 31 | youtube: Superb 2 minute video on paper https://www.youtube.com/watch?v=HVv_IQKlafQ 32 | youtube: Siggraph 2023 talk on paper - this is 5 minutes https://www.youtube.com/watch?v=T_kXY43VZnk&t=3s 33 | Author's blog, including links to code: https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/ 34 | 35 | ## Tuesday, October 3, 2023 36 | paper: https://arxiv.org/abs/2112.04035 - Relating transformers to models and neural representations of the hippocampal formation 37 | another paper: https://amygdala.psychdept.arizona.edu/labspace/JclubLabMeetings/JeanMarc-Build-cognitive-maps.pdf - How to build a cognitive map 38 | youtube: https://www.youtube.com/watch?v=9qOaII_PzGY&t=413s - How Your Brain Organizes Information 39 | youtube: https://www.youtube.com/watch?v=cufOEzoVMVA - Can We Build an Artificial Hippocampus?
paper: https://www.cell.com/cell/fulltext/S0092-8674(20)31388-X - The Tolman-Eichenbaum Machine: Unifying Space and Relational Memory through Generalization in the Hippocampal Formation 41 | 42 | ## Tuesday, September 26, 2023 43 | paper: https://research.nvidia.com/labs/par/Perfusion/ - Key-Locked Rank One Editing for Text-to-Image Personalization (Perfusion) 44 | 45 | ## Tuesday, September 19, 2023 46 | paper: https://arxiv.org/pdf/2210.09276.pdf - Imagic: Text-Based Real Image Editing with Diffusion Models 47 | youtube: https://www.youtube.com/watch?v=PzHMjCtuPuo 48 | blog: https://imagic-editing.github.io/ 49 | 50 | ## Tuesday, Sept 12, 2023 51 | paper: https://arxiv.org/abs/2307.02486 - LongNet: Scaling Transformers to 1,000,000,000 Tokens 52 | Blog: https://syncedreview.com/2023/07/10/microsofts-longnet-scales-transformer-to-one-billion-tokens 53 | 54 | ## Tuesday, Sept 5, 2023 55 | https://arxiv.org/pdf/2308.08708.pdf - Consciousness in Artificial Intelligence: Insights from the Science of Consciousness 56 | 57 | ## Tuesday, August 29, 2023 58 | paper: https://arxiv.org/pdf/2307.15936.pdf - A Theory for Emergence of Complex Skills in Language Models 59 | youtube: https://www.youtube.com/watch?v=0D23NeBjCeQ 60 | 61 | ## Tuesday, August 22, 2023 62 | Paper: https://arxiv.org/pdf/2206.04843.pdf - Neural Laplace: Learning diverse classes of differential equations in the Laplace domain 63 | Slides and video from ICML 2022: https://icml.cc/virtual/2022/oral/16728 64 | 65 | ## Wednesday, August 16, 2023 66 | paper: https://arxiv.org/abs/2308.03296 - Studying Large Language Model Generalization with Influence Functions 67 | blog: https://www.anthropic.com/index/influence-functions 68 | 69 | ## Wednesday, August 9, 2023 70 | paper: Music Generation https://arxiv.org/pdf/2306.05284.pdf 71 | blog: https://about.fb.com/news/2023/08/audiocraft-generative-ai-for-music-and-audio/ 72 | blog: https://ai.meta.com/blog/audiocraft-musicgen-audiogen-encodec-generative-ai-audio/ 73 | 74 | ## Wednesday, August 2, 2023 75 | paper: https://arxiv.org/abs/2205.10343 Towards Understanding Grokking: An Effective Theory of Representation Learning 76 | blog: https://ericjmichaud.com/grokking-squared/ 77 | blog: https://www.beren.io/2022-01-11-Grokking-Grokking/ 78 | blog: https://www.beren.io/2022-04-17-Understanding_Overparametrized_Generalization/ 79 | 80 | ## Wednesday, July 26, 2023 81 | paper: Mixture of experts (similar to chatGPT4): https://arxiv.org/abs/2305.14705 82 | 83 | blog: Mixture-of-Experts with Expert Choice Routing - 84 | https://ai.googleblog.com/2022/11/mixture-of-experts-with-expert-choice.html 85 | 86 | blog: Introducing Pathways: A next-generation AI architecture 87 | https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/ 88 | 89 | ## Wednesday, July 19, 2023 90 | We're going to cover Chapter 16, Deep Networks for Classification, from the following book: 91 | https://book-wright-ma.github.io/Book-WM-20210422.pdf - High dimensional Data Analysis with Low Dimensional Models 92 | blog: https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/#more-25 93 | 94 | ## Wednesday, July 12, 2023 95 | We're going to cover the 4th chapter of this book. 96 | https://book-wright-ma.github.io/Book-WM-20210422.pdf - High dimensional Data Analysis with Low Dimensional Models 97 | 98 | ## Wednesday, July 5, 2023 99 | We're going to cover the 1st chapter of this book.
100 | https://book-wright-ma.github.io/Book-WM-20210422.pdf - High dimensional Data Analysis with Low Dimensional Models 101 | Blog: https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/#more-25 102 | 103 | ## Wednesday, June 28, 2023 104 | paper: https://arxiv.org/pdf/2305.17126.pdf - Large Language Models as Tool Makers 105 | youtube: https://www.youtube.com/watch?v=qWI1AJ2nSDY 106 | youtube: https://www.youtube.com/watch?v=KXlPzMRTfMk 107 | youtube: https://www.youtube.com/watch?v=srDVNbxPgZI 108 | 109 | ## Wednesday, June 21, 2023 110 | Consciousness as a Memory System https://pubmed.ncbi.nlm.nih.gov/36178498/ 111 | 112 | ## Wednesday, June 14, 2023 113 | https://arxiv.org/abs/1804.08838 114 | Blog: https://www.uber.com/blog/intrinsic-dimension/ 115 | more good stuff on intrinsic dimension: 116 | Nature paper: https://www.nature.com/articles/s41598-017-11873-y 117 | Wikipedia: https://en.wikipedia.org/wiki/Intrinsic_dimension 118 | Application - Yann LeCun at 57:15 on whether text fully represents a world model: 119 | https://www.youtube.com/watch?v=SGzMElJ11Cc 120 | vs. differing view from Ilya Sutskever at 15:30 121 | https://www.youtube.com/watch?v=SjhIlw3Iffs 122 | Applying intrinsic dimension to scaling laws in training / loss: 123 | https://jmlr.csail.mit.edu/papers/volume23/20-1111/20-1111.pdf 124 | https://arxiv.org/abs/2102.06701 125 | 126 | ## Wednesday, June 7, 2023 127 | Paper: https://arxiv.org/pdf/2305.16291.pdf 128 | tweet: Nice overview by author https://twitter.com/DrJimFan/status/1662117784023883777 129 | Code: https://github.com/MineDojo/Voyager 130 | website: https://voyager.minedojo.org/ 131 | 132 | ## Wednesday, May 31, 2023 133 | paper: https://arxiv.org/pdf/2203.15556.pdf - Training Compute-Optimal Large Language Models 134 | blog: https://www.lesswrong.com/posts/6Fpvch8RR29qLEWNH/chinchilla-s-wild-implications 135 | blog: https://www.harmdevries.com/post/model-size-vs-compute-overhead/ 136 | news: https://www.cnbc.com/2023/05/16/googles-palm-2-uses-nearly-five-times-more-text-data-than-predecessor.html 137 | 138 | ## Wednesday, May 24, 2023 139 | paper: https://arxiv.org/abs/2212.09720 - The case for 4-bit precision: k-bit Inference Scaling Laws 140 | paper: https://arxiv.org/pdf/2210.17323.pdf - GPTQ: ACCURATE POST-TRAINING QUANTIZATION FOR GENERATIVE PRE-TRAINED TRANSFORMERS 141 | 142 | ## Wednesday, May 17, 2023 143 | paper: https://arxiv.org/pdf/2106.09685.pdf - LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS 144 | 145 | ## Wednesday, May 10, 2023 146 | paper: https://arxiv.org/pdf/2210.03629.pdf - REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS 147 | blog: https://www.pinecone.io/learn/locality-sensitive-hashing/ 148 | 149 | ## Wednesday, May 3, 2023 150 | paper: https://arxiv.org/pdf/2201.11903.pdf - Chain of thought prompting elicits reasoning in large language models.
151 | paper: https://arxiv.org/pdf/2210.03629.pdf - REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS 152 | blog: https://www.pinecone.io/learn/locality-sensitive-hashing/ 153 | 154 | ## Wednesday, Apr 26, 2023 155 | https://python.langchain.com/en/latest/modules/agents.html 156 | https://arxiv.org/pdf/2210.03629.pdf - REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS 157 | https://www.pinecone.io/learn/locality-sensitive-hashing/ 158 | 159 | ## Wednesday, Apr 19, 2023 160 | Blog: https://yoheinakajima.com/task-driven-autonomous-agent-utilizing-gpt-4-pinecone-and-langchain-for-diverse-applications/ 161 | Code: https://github.com/hwchase17/langchain 162 | 163 | ## Wednesday, Apr 12, 2023 164 | Paper: Eliciting Latent Predictions from Transformers with the Tuned Lens https://arxiv.org/abs/2303.08112 165 | 166 | ## Wednesday, Apr 5, 2023 167 | Paper: https://openreview.net/pdf?id=lMMaNf6oxKM - Recipe for a General, Powerful, Scalable Graph Transformer 168 | youtube: https://www.youtube.com/watch?v=DiLSCReBaTg 169 | 170 | ## Wednesday, Mar 29, 2023 171 | Paper: https://proceedings.neurips.cc/paper/2021/hash/f1c1592588411002af340cbaedd6fc33-Abstract.html - Do Transformers Really Perform Badly for Graph Representation? 172 | video: https://www.youtube.com/watch?v=FKuQpPIRjLk - review by authors 173 | video: https://www.youtube.com/watch?v=xQ5ltOOxoFg 174 | 175 | ## Wednesday, Mar 22, 2023 176 | Paper: https://arxiv.org/abs/2212.07359 - Post-hoc Uncertainty Learning using a Dirichlet Meta-Model 177 | youtube: https://www.youtube.com/watch?v=nE8XJ1f0zO0 178 | 179 | ## Wednesday, Mar 15, 2023 180 | Paper: https://arxiv.org/abs/2202.05262 - Locating and Editing Factual Associations in GPT 181 | blog: https://rome.baulab.info/ 182 | Yannic video: https://www.youtube.com/watch?v=_NMQyOu2HTo 183 | 184 | ## Wednesday, Mar 8, 2023 185 | Paper: Human-Timescale Adaptation in an Open-Ended Task Space: https://arxiv.org/pdf/2301.07608.pdf 186 | https://www.youtube.com/watch?v=A2hOWShiYoM 187 | https://sites.google.com/view/adaptive-agent/ 188 | 189 | ## Wednesday, Mar 1, 2023 190 | Paper: Toolformer: Language Models Can Teach Themselves to Use Tools: https://arxiv.org/abs/2302.04761 191 | 192 | ## Wednesday, Feb 22, 2023 193 | Paper: https://arxiv.org/pdf/2203.02155.pdf - Training language models to follow instructions with human feedback 194 | 195 | ## Wednesday, Feb 15, 2023 196 | Paper: https://arxiv.org/pdf/2111.15664.pdf - OCR-free Document Understanding Transformer 197 | 198 | ## Wednesday, Feb 8, 2023 199 | Paper: https://arxiv.org/abs/2205.06175 - A generalist agent - Gato 200 | YouTube: Eden Mayer https://www.youtube.com/watch?v=wSQJZHfAg18 201 | YouTube - Jay Alammar https://www.youtube.com/watch?v=kT6DYKgWNHg 202 | YouTube - Lex Fridman and Oriol Vinyals on How Gato Works https://www.youtube.com/watch?v=vwB9zO2h9j0 203 | Overview - main site on Gato at Deepmind https://www.deepmind.com/publications/a-generalist-agent 204 | blog review - https://arshren.medium.com/deep-minds-generalist-agent-gato-209969e12782 205 | 206 | ## Wednesday, Feb 1, 2023 207 | Paper: https://openreview.net/pdf?id=M95oDwJXayG - ADDRESSING PARAMETER CHOICE ISSUES IN UNSUPERVISED DOMAIN ADAPTATION BY AGGREGATION 208 | 209 | ## Wednesday, Jan 25, 2023 210 | Paper: https://arxiv.org/pdf/2301.04104v1.pdf - Mastering Diverse Domains through World Models 211 | Blog: https://danijar.com/project/dreamerv3/ 212 | YouTube: https://www.youtube.com/watch?v=vfpZu0R1s1Y 213 | 214 | ## Wednesday, Jan 18, 2023 215 | Paper:
https://arxiv.org/abs/2212.04089 - Composable NN: Editing Models With Task Arithmetic 216 | 217 | ## Wednesday, Jan 11, 2023 218 | Paper: https://arxiv.org/pdf/1707.06690.pdf - DeepPath: A Reinforcement Learning Method for Knowledge Graph Reasoning 219 | 220 | ## Wednesday, Jan 4, 2023 221 | Paper: https://arxiv.org/abs/2212.04458 - GENERAL-PURPOSE IN-CONTEXT LEARNING BY META-LEARNING TRANSFORMERS 222 | 223 | ## Wednesday, Dec 21, 2022 224 | paper: https://arxiv.org/pdf/2209.04836.pdf - GIT RE-BASIN: MERGING MODELS MODULO PERMUTATION SYMMETRIES 225 | 226 | ## Wednesday, Dec 14, 2022 227 | paper: https://arxiv.org/abs/2012.09855 - Infinite Nature: Perpetual View Generation of Natural Scenes from a Single Image 228 | blog: https://infinite-nature.github.io/ 229 | 230 | ## Wednesday, Dec 7, 2022 231 | Paper: https://arxiv.org/abs/2206.00364 - Elucidating the Design Space of Diffusion-Based Generative Models 232 | video: https://www.youtube.com/watch?v=OYiQctx7kDE 233 | 234 | ## Wednesday, Nov 30, 2022 235 | paper: https://arxiv.org/pdf/2206.10991.pdf - Graph Neural Networks as Gradient Flows: understanding graph convolutions via energy 236 | youtube (author): https://www.youtube.com/watch?v=sgTTtmwOMgE 237 | youtube: https://www.youtube.com/watch?v=hmI4C6AodEQ 238 | 239 | ## Wednesday, Nov 16, 2022 240 | paper: https://www.pnas.org/doi/full/10.1073/pnas.2016239118 241 | video: https://slideslive.com/38942412/biological-structure-and-function-emerge-from-scaling-unsupervised-learning-to-250-million-protein-sequences 242 | 243 | ## Wednesday, Nov 9, 2022 244 | paper: https://arxiv.org/pdf/2209.11178.pdf - Poisson Flow Generative Models 245 | 246 | ## Wednesday, Nov 2, 2022 247 | paper: https://arxiv.org/pdf/2209.12892.pdf - LEARNING TO LEARN WITH GENERATIVE MODELS OF NEURAL NETWORK CHECKPOINTS 248 | blog: https://www.marktechpost.com/2022/10/21/latest-machine-learning-research-at-uc-berkeley-proposes-a-way-to-design-a-learned-optimizer-using-generative-models-of-neural-network-checkpoints/ 249 | author blog: https://www.wpeebles.com/Gpt.html 250 | 251 | ## Wednesday, Oct 26, 2022 252 | paper: Cellular automata as convolutional neural networks https://arxiv.org/pdf/1809.02942.pdf 253 | survey: Collective Intelligence for Deep Learning: A Survey of Recent Developments https://arxiv.org/abs/2111.14377 254 | demo: Self-classifying MNIST Digits https://distill.pub/2020/selforg/mnist/ 255 | 256 | ## Wednesday, Oct 19, 2022 257 | paper: https://proceedings.mlr.press/v162/zhu22c/zhu22c.pdf - Neural-Symbolic Models for Logical Queries on Knowledge Graphs 258 | 259 | ## Wednesday, Oct 12, 2022 260 | paper: https://arxiv.org/pdf/2206.02768.pdf - The Neural Covariance SDE: Shaped Infinite Depth-and-Width Networks at Initialization 261 | 262 | ## Wednesday, Oct 5, 2022 263 | paper: https://papers.nips.cc/paper/2019/file/952285b9b7e7a1be5aa7849f32ffff05-Paper.pdf - Legendre Memory Units: Continuous-Time 264 | 265 | ## Wednesday, Sept 28, 2022 266 | paper: https://arxiv.org/pdf/2208.01618.pdf - An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion 267 | github.io: https://textual-inversion.github.io/ 268 | YouTube: https://www.youtube.com/watch?v=f3oXa7_SYek 269 | 270 | ## Wednesday, Sept 21, 2022 271 | paper: https://arxiv.org/pdf/2205.14415.pdf - Non-stationary Transformers: Rethinking the Stationarity in Time Series Forecasting 272 | 273 | ## Wednesday, Sept 14, 2022 274 | paper: https://arxiv.org/abs/2110.02402 - Language Modeling using LMUs: 10x Better Data Efficiency
or Improved Scaling Compared to Transformers 275 | youtube vid: https://www.youtube.com/watch?v=8t64QaTdBcU 276 | 277 | ## Wednesday, August 31, 2022 278 | Paper: HOW NEURAL NETWORKS EXTRAPOLATE: FROM FEEDFORWARD TO GRAPH NEURAL NETWORKS - https://arxiv.org/pdf/2009.11848.pdf 279 | 280 | ## Wednesday, August 24, 2022 281 | Paper: Masked Siamese Networks for Label-Efficient Learning - https://arxiv.org/abs/2204.07141 282 | 283 | ## Wednesday, August 17, 2022 284 | Paper: Principle of Maximal Coding Rate Reduction https://arxiv.org/abs/2006.08558 285 | ReduNet: https://arxiv.org/pdf/2105.10446.pdf 286 | Github: https://github.com/ryanchankh/mcr2 287 | 288 | ## Wednesday, August 10, 2022 289 | Paper: On the Principles of Parsimony and Self-Consistency for the Emergence of Intelligence https://arxiv.org/abs/2207.04630 290 | 291 | Background: https://www.youtube.com/watch?v=OIVcfZeR1CE - youtube by author 292 | Background: https://cmsa.fas.harvard.edu/wp-content/uploads/2021/04/Deep_Networks_from_First_Principles.pdf - slides by author 293 | 294 | 295 | ## Wednesday, August 3, 2022 296 | Paper: Data Distributional Properties Drive Emergent In-Context Learning in Transformers https://arxiv.org/pdf/2205.05055.pdf 297 | 298 | ## Wednesday, July 27, 2022 299 | Paper: A Mathematical Framework for Transformer Circuits https://transformer-circuits.pub/2021/framework/index.html#model-simplifications 300 | 301 | ## Wednesday, July 20, 2022 302 | Paper: A Mathematical Framework for Transformer Circuits https://transformer-circuits.pub/2021/framework/index.html#model-simplifications 303 | 304 | ## Wednesday, July 13, 2022 305 | Paper: https://arxiv.org/abs/2001.08361 - Scaling Laws for Neural Language Models 306 | Blog: https://medium.com/nlplanet/two-minutes-nlp-scaling-laws-for-neural-language-models-add6061aece7 307 | 308 | ## Wednesday, July 6, 2022 309 | Paper: https://arxiv.org/abs/2206.11795 - Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online Videos 310 | https://github.com/openai/Video-Pre-Training 311 | Yannic Review: https://www.youtube.com/watch?v=oz5yZc9ULAc 312 | 313 | ## Wednesday, June 29, 2022 314 | Paper: https://arxiv.org/pdf/2110.00966.pdf - Translating Images into Maps 315 | 316 | ## Wednesday, June 22, 2022 317 | Paper: https://arxiv.org/abs/2205.09665 - Automated Crossword Solving 318 | 319 | ## Wednesday, June 15, 2022 320 | Paper: https://arxiv.org/pdf/2205.10824.pdf - ReLU Fields: The Little Non-linearity That Could 321 | 322 | 323 | ## Wednesday, June 8, 2022 324 | Paper: https://arxiv.org/abs/2102.06810 - Understanding Self-Supervised Learning Dynamics without Contrastive Pairs 325 | 326 | ## Wednesday, June 1, 2022 327 | Paper: https://arxiv.org/pdf/2205.06175.pdf - A Generalist Agent 328 | Blog: https://www.deepmind.com/publications/a-generalist-agent 329 | 330 | ## Wednesday, May 25, 2022 331 | https://arxiv.org/pdf/2202.05780.pdf - A Modern Self-Referential Weight Matrix That Learns to Modify Itself 332 | 333 | ## Wednesday, May 18, 2022 334 | https://openreview.net/pdf?id=M752z9FKJP - LEARNING STRIDES IN CONVOLUTIONAL NEURAL NETWORKS 335 | 336 | ## Wednesday, May 11, 2022 337 | https://openreview.net/pdf?id=b-ny3x071E5 - BOOTSTRAPPED META-LEARNING 338 | 339 | ## Wednesday, May 4, 2022 340 | https://arxiv.org/abs/2202.06991 - Transformer Memory as a Differentiable Search Index 341 | https://www.youtube.com/watch?v=C7mUYocWdG0 -
Yannic author interview 342 | https://www.youtube.com/watch?v=qlB0TPBQ7YY - Yannic on Transformer paper 343 | 344 | ## Wednesday, April 27, 2022 345 | https://arxiv.org/abs/2204.06125 - Hierarchical Text-Conditional Image Generation with CLIP Latents 346 | https://openai.com/dall-e-2/ - OpenAI blog 347 | https://www.youtube.com/watch?v=j4xgkjWlfL4 - yannic video 348 | 349 | ## Wednesday, April 20, 2022 350 | https://arxiv.org/pdf/2103.00020.pdf - Learning Transferable Visual Models From Natural Language Supervision 351 | https://www.youtube.com/watch?v=1LUWWAnK_Ks 352 | https://www.youtube.com/watch?v=3X3EY2Fgp3g 353 | 354 | ## Wednesday, April 13, 2022 355 | https://arxiv.org/pdf/2110.13985.pdf - Combining Recurrent, Convolutional, and Continuous-time 356 | Models with Linear State-Space Layers 357 | 358 | ## Wednesday, April 6, 2022 359 | https://arxiv.org/pdf/2202.00666.pdf - Typical Decoding for Natural Language Generation 360 | 361 | https://youtu.be/_EDr3ryrT_Y 362 | 363 | https://www.youtube.com/watch?v=AvHLJqtmQkE 364 | 365 | ## Wednesday, March 30, 2022 366 | https://arxiv.org/pdf/2105.04906.pdf - VICREG: VARIANCE-INVARIANCE-COVARIANCE REGULARIZATION FOR SELF-SUPERVISED LEARNING 367 | https://www.youtube.com/watch?v=MzKDNmOJ67Q 368 | 369 | ## Wednesday, March 23, 2022 370 | https://openreview.net/forum?id=4orlVaC95Bo - Task-Agnostic Undesirable Feature Deactivation Using Out-of-Distribution Data 371 | 372 | ## Wednesday, March 16, 2022 373 | https://arxiv.org/abs/2203.03466 - Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer 374 | https://www.youtube.com/watch?v=MNOJQINH-qw 375 | 376 | ## Wednesday, March 9, 2022 377 | https://arxiv.org/abs/2201.12122 - Can Wikipedia Help Offline Reinforcement Learning? 378 | Yannic's talk on this, 379 | https://www.youtube.com/watch?v=XHGh19Hbx48 380 | and he also has a followon video interview with the authors 381 | https://www.youtube.com/watch?v=FNDVy_BR8aA 382 | 383 | 384 | ## Wednesday, March 2, 2022 - 385 | https://arxiv.org/pdf/2107.03342.pdf - A Survey of Uncertainty in Deep Neural Networks 386 | 387 | ## Wednesday, February 23, 2022 - 388 | https://arxiv.org/pdf/2201.08239v2.pdf - LaMDA: Language Models for Dialog Applications 389 | 390 | ## Wednesday, February 16, 2022 - 391 | https://openreview.net/pdf?id=TrjbxzRcnf- MEMORIZING TRANSFORMERS 392 | 393 | ## Wednesday, February 9, 2022 - 394 | https://arxiv.org/pdf/2106.07644.pdf - A Continuized View on Nesterov Acceleration for Stochastic Gradient Descent and Randomized Gossip 395 | 396 | ## Wednesday, February 2, 2022 - 397 | https://arxiv.org/pdf/2108.08052.pdf - Moser Flow: Divergence-based Generative Modeling on Manifolds 398 | 399 | ## Wednesday, January 26, 2022 - 400 | https://dylandoblar.github.io/noether-networks/ - Noether Networks: meta-learning useful conserved quantities 401 | 402 | https://www.youtube.com/watch?v=Xp3jR-ttMfo 403 | 404 | ## Wednesday, January 19, 2022 - 405 | https://arxiv.org/pdf/2010.15277.pdf - Class-incremental learning: survey and performance evaluation on image classification 406 | 407 | ## Wednesday, January 12, 2022 - 408 | https://arxiv.org/abs/2006.11287 - Discovering Symbolic Models from Deep Learning with Inductive Biases 409 | 410 | ## Wednesday, January 5, 2022 - 411 | https://arxiv.org/pdf/2006.09252.pdf - Improving Graph Neural Network Expressivity via Subgraph Isomorphism Counting 412 | 413 | ## Wednesday, December 29, 2021 - 414 | https://arxiv.org/pdf/2112.04426.pdf - Improving Language Models by 
Retrieving from Trillions of Tokens 415 | 416 | https://www.deepmind.com/research/publications/2021/improving-language-models-by-retrieving-from-trillions-of-tokens 417 | 418 | ## Wednesday, December 22, 2021 - 419 | https://arxiv.org/abs/2106.01798 - Implicit MLE: Backpropagating Through Discrete Exponential Family Distributions 420 | 421 | https://www.youtube.com/watch?v=W2UT8NjUqrk 422 | 423 | ## Wednesday, December 15, 2021 - 424 | https://arxiv.org/pdf/2108.01073.pdf - Image Synthesis and Editing with Stochastic Differential Equations 425 | 426 | ## Wednesday, December 1, 2021 - 427 | https://openreview.net/forum?id=HfpNVDg3ExA 428 | 429 | Probabilistic Transformer For Time Series Analysis 430 | 431 | ## Wednesday, November 17, 2021 - 432 | https://arxiv.org/pdf/2110.03922.pdf - NEURAL TANGENT KERNEL EIGENVALUES ACCURATELY PREDICT GENERALIZATION 433 | 434 | ## Wednesday, November 10, 2021 - 435 | https://arxiv.org/pdf/2104.00681.pdf - NeuralRecon: Real-Time Coherent 3D Reconstruction from Monocular Video 436 | 437 | https://github.com/zju3dv/NeuralRecon 438 | 439 | 440 | ## Wednesday, October 27, 2021 - 441 | https://arxiv.org/pdf/2110.09485.pdf - Learning in High Dimension Always Amounts to Extrapolation 442 | 443 | ## Wednesday, October 20, 2021 - 444 | https://arxiv.org/pdf/2109.02355.pdf - A Farewell to the Bias-Variance Tradeoff? An Overview of the Theory of Overparameterized Machine Learning 445 | 446 | ## Wednesday, October 13, 2021 - 447 | https://arxiv.org/pdf/2006.09011.pdf - Improved Techniques for Training Score-Based Generative Models 448 | 449 | ## Wednesday, October 6, 2021 - 450 | https://arxiv.org/abs/2006.05929 - Dataset Condensation with Gradient Matching 451 | 452 | ## Wednesday, September 29, 2021 - 453 | https://arxiv.org/abs/1811.10959 - Dataset distillation 454 | 455 | ## Wednesday, September 22, 2021 - 456 | https://arxiv.org/pdf/2003.13216.pdf - Learning to Learn Single Domain Generalization 457 | 458 | ## Wednesday, September 15, 2021 - 459 | https://arxiv.org/pdf/2108.11482.pdf - ETA Prediction with Graph Neural Networks in Google Maps 460 | 461 | ## Wednesday, September 8, 2021 - 462 | https://cascaded-diffusion.github.io/assets/cascaded_diffusion.pdf - Cascaded Diffusion Models for High Fidelity Image Generation 463 | 464 | ## Wednesday, September 1, 2021 - 465 | https://arxiv.org/pdf/2107.06277.pdf - Why Generalization in RL is Difficult: Epistemic POMDPs and Implicit Partial Observability 466 | 467 | ## Wednesday, August 25, 2021 - 468 | https://arxiv.org/abs/2108.07732 - Program Synthesis with Large Language Models 469 | 470 | ## Wednesday, August 18, 2021 - 471 | https://arxiv.org/abs/2012.13349 - Solving Mixed Integer Programs Using Neural Networks 472 | 473 | ## Wednesday, August 11, 2021 - 474 | https://www.nature.com/articles/s41586-021-03819-2 - AlphaFold 475 | 476 | ## Wednesday, August 4, 2021 - 477 | AlphaFold - blog https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology paper https://www.nature.com/articles/s41586-021-03819-2 supplemental info https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-021-03819-2/MediaObjects/41586_2021_3819_MOESM1_ESM.pdf 478 | 479 | ## Wednesday, July 21, 2021 - 480 | https://www.zdnet.com/article/googles-supermodel-deepmind-perceiver-is-a-step-on-the-road-to-an-ai-machine-that-could-process-everything/ https://arxiv.org/abs/2103.03206 481 | 482 | ## Wednesday, July 14, 2021 - 483 | https://arxiv.org/pdf/1503.03585.pdf (Deep Unsupervised Learning
using Nonequilibrium Thermodynamics) by Surya Ganguli at Stanford 484 | 485 | ## Wednesday, July 7, 2021 - 486 | https://arxiv.org/pdf/2105.05233.pdf - Diffusion Models Beat GANs on Image Synthesis 487 | 488 | ## Wednesday, June 30, 2021 - 489 | https://arxiv.org/pdf/2006.11239.pdf - Denoising Diffusion Probabilistic Models 490 | 491 | ## Wednesday, June 23, 2021 - 492 | https://arxiv.org/abs/2010.03409 - Learning mesh-based simulation with graph networks 493 | 494 | https://sites.google.com/view/learning-to-simulate 495 | 496 | https://deepmind.com/research/publications/Learning-to-Simulate-Complex-Physics-with-Graph-Networks 497 | 498 | ## Wednesday, June 16, 2021 - 499 | https://arxiv.org/pdf/2106.01345.pdf - Decision Transformer: Reinforcement Learning via Sequence Modeling 500 | 501 | https://www.youtube.com/watch?v=-buULmf7dec 502 | 503 | https://sites.google.com/berkeley.edu/decision-transformer 504 | 505 | ## Wednesday, June 9, 2021 - 506 | https://arxiv.org/pdf/2103.07945.pdf - Learning One Representation to Optimize All Rewards 507 | 508 | ## Wednesday, June 2, 2021 - 509 | https://distill.pub/2021/multimodal-neurons/ - Multimodal Neurons in Artificial Neural Networks 510 | 511 | https://openai.com/blog/clip/ - CLIP: Connecting Text and Images 512 | 513 | ## Wednesday, May 26, 2021 - 514 | https://arxiv.org/pdf/2104.14294.pdf - Emerging Properties in Self-Supervised Vision Transformers 515 | 516 | https://ai.facebook.com/blog/dino-paws-computer-vision-with-self-supervised-transformers-and-10x-more-efficient-training/ 517 | 518 | ## Wednesday, May 19, 2021 - 519 | https://arxiv.org/pdf/2104.10558.pdf - Contingencies from Observations: Tractable Contingency Planning with Learned Behavior Models 520 | 521 | ## Wednesday, May 12, 2021 - 522 | https://arxiv.org/pdf/1806.09055.pdf - DARTS: Differentiable Architecture Search (ICLR 2019) 523 | 524 | ## Wednesday, May 5, 2021 - 525 | https://arxiv.org/pdf/2104.06644.pdf - Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little 526 | 527 | ## Wednesday, April 28, 2021 - 528 | https://arxiv.org/pdf/2009.03717.pdf - Hierarchical message passing graph neural networks 529 | 530 | ## Wednesday, April 14, 2021 - 531 | https://arxiv.org/pdf/2103.03230v1.pdf - Barlow Twins: Self-Supervised Learning via Redundancy Reduction 532 | 533 | ## Wednesday, April 7, 2021 - 534 | https://arxiv.org/pdf/2103.14770.pdf - Categorical representation learning: morphism is all you need 535 | 536 | ## Wednesday, March 31, 2021 - 537 | https://arxiv.org/pdf/2102.12736v1.pdf - Time-Series Imputation with Wasserstein Interpolation for Optimal Look-Ahead-Bias and Variance Tradeoff 538 | 539 | ## Wednesday, March 24, 2021 - 540 | https://awacrl.github.io/ - Accelerating online reinforcement learning with offline datasets 541 | 542 | ## Wednesday, March 17, 2021 - 543 | https://arxiv.org/pdf/2102.12092.pdf - Zero-Shot Text-to-Image Generation 544 | 545 | https://openai.com/blog/dall-e/ 546 | 547 | ## Wednesday, March 10, 2021 - 548 | https://giotto-ai.github.io/gtda-docs/latest/notebooks/gravitational_waves_detection.html 549 | 550 | ## Wednesday, March 3, 2021 - 551 | https://arxiv.org/pdf/2102.08602.pdf - Modeling long-range interactions without attention 552 | 553 | ## Wednesday, February 24, 2021 - 554 | https://arxiv.org/pdf/2101.08692.pdf - Characterizing signal propagation to close the performance gap in unnormalized resnets 555 | 556 | ## Wednesday, February 17, 2021 - 557 | https://arxiv.org/pdf/2006.10742.pdf -
Learning Invariant Representations for Reinforcement Learning without Reconstruction 558 | 559 | ## Wednesday, February 10, 2021 - 560 | https://arxiv.org/pdf/2007.13544.pdf - Combining Deep Reinforcement Learning and Search for Imperfect-Information Games 561 | 562 | ## Wednesday, February 3, 2021 - 563 | https://arxiv.org/pdf/2010.11929.pdf - An image is worth 16x16 words: transformers for image recognition at scale 564 | 565 | ## Wednesday, January 27, 2021 - 566 | https://arxiv.org/abs/2003.02821 - What went wrong and when? Instance-wise feature importance for time-series black-box models 567 | 568 | ## Wednesday, January 20, 2021 - 569 | https://arxiv.org/pdf/1912.09363.pdf - Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting 570 | 571 | ## Wednesday, January 13, 2021 - 572 | https://arxiv.org/abs/1905.10403 - Neural Jump Stochastic Differential Equations 573 | 574 | ## Wednesday, January 6, 2021 - 575 | http://implicit-layers-tutorial.org/neural_odes/ - We're continuing this from last week. This week we'll cover Ch 3,4,5. 576 | 577 | ## Wednesday, December 30, 2020 - 578 | http://implicit-layers-tutorial.org/ - NeurIPS tutorial on deep implicit networks 579 | 580 | ## Wednesday, December 23, 2020 - 581 | https://arxiv.org/pdf/1907.03907.pdf - Latent ODEs for Irregularly-Sampled Time Series 582 | 583 | https://www.youtube.com/watch?v=tOkH339Wucs 584 | 585 | ## Wednesday, December 16, 2020 - 586 | https://papers.nips.cc/paper/2020/file/08425b881bcde94a383cd258cea331be-Paper.pdf - Ridge Rider: Finding Diverse Solutions by Following Eigenvectors of the Hessian 587 | 588 | ## Wednesday, December 9, 2020 - 589 | https://proceedings.neurips.cc/paper/2020/file/28e209b61a52482a0ae1cb9f5959c792-Paper.pdf 590 | "OOD-MAML: Meta-Learning for Few-Shot Out-of-Distribution Detection and Classification" 591 | 592 | ## Wednesday, December 2, 2020 - 593 | https://arxiv.org/pdf/2011.02421.pdf - ONE-SHOT CONDITIONAL AUDIO FILTERING OF ARBITRARY SOUNDS 594 | 595 | ## Wednesday, November 18, 2020 - 596 | https://arxiv.org/pdf/2010.14498.pdf - Implicit under-parametrization inhibits data efficient deep reinforcement learning 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | ## Mar 11 - Hacker Dojo 608 | https://arxiv.org/pdf/2002.11089.pdf - Rewriting History with Inverse RL: Hindsight Inference for Policy Improvement 609 | 610 | ## Mar 4 - Hacker Dojo 611 | https://www.osapublishing.org/DirectPDFAccess/C6D6B2C3-953C-4461-695B6E5E2F993943_415059/prj-7-8-823.pdf?da=1&id=415059&seq=0&mobile=no - Nanophotonic media for artificial neural inference 612 | 613 | 614 | ## Feb 19 - Hacker Dojo 615 | https://arxiv.org/pdf/1910.02789.pdf - Language is Power: Representing States Using Natural Language in Reinforcement Learning 616 | 617 | 618 | ## Feb 12 - Hacker Dojo 619 | https://deepmind.com/blog/article/AlphaFold-Using-AI-for-scientific-discovery - Protein folding paper.
620 | 
621 | ## Feb 5 - Hacker Dojo 
622 | https://arxiv.org/abs/2001.04451 - Reformer, the efficient transformer 
623 | https://ai.googleblog.com/2020/01/reformer-efficient-transformer.html 
624 | 
625 | ## Jan 22 - Hacker Dojo 
626 | https://arxiv.org/pdf/1906.05717.pdf - Unsupervised Monocular Depth and Ego-motion Learning with Structure and Semantics 
627 | 
628 | ## Jan 15 - Hacker Dojo 
629 | https://arxiv.org/pdf/1912.09524.pdf - Evolving ab initio trading strategies in heterogeneous environments 
630 | 
631 | ## Jan 8 - Hacker Dojo 
632 | https://arxiv.org/pdf/1911.05892.pdf - Reinforcement Learning for Market Making in a Multi-agent Dealer Market 
633 | 
634 | 
635 | ## Dec 18 - Hacker Dojo 
636 | https://www.nature.com/articles/s41586-019-1724-z.epdf?author_access_token=lZH3nqPYtWJXfDA10W0CNNRgN0jAjWel9jnR3ZoTv0PSZcPzJFGNAZhOlk4deBCKzKm70KfinloafEF1bCCXL6IIHHgKaDkaTkBcTEv7aT-wqDoG1VeO9-wO3GEoAMF9bAOt7mJ0RWQnRVMbyfgH9A%3D%3D 
637 | https://www.gwern.net/docs/rl/2019-vinyals.pdf 
638 | https://deepmind.com/blog/article/AlphaStar-Grandmaster-level-in-StarCraft-II-using-multi-agent-reinforcement-learning 
639 | 
640 | ## Nov 20 - Hacker Dojo 
641 | https://arxiv.org/pdf/1911.04252.pdf - Self-training with Noisy Student improves ImageNet classification 
642 | 
643 | ## Nov 13 - Hacker Dojo 
644 | https://arxiv.org/pdf/1910.12713.pdf - Few-shot video-to-video synthesis 
645 | 
646 | 
647 | ## Nov 6 - Hacker Dojo 
648 | https://arxiv.org/pdf/1906.11883.pdf - Unsupervised learning of Object Keypoints for Perception and Control 
649 | 
650 | ## Oct 30 - Hacker Dojo 
651 | https://arxiv.org/pdf/1710.03748.pdf - Emergent Complexity via Multi-Agent Competition 
652 | https://openai.com/blog/competitive-self-play/ 
653 | 
654 | ## Oct 23 - Hacker Dojo 
655 | https://arxiv.org/pdf/1703.04908.pdf - Emergence of Grounded Compositional Language in Multi-Agent Populations 
656 | 
657 | ## Oct 16 - Hacker Dojo 
658 | https://arxiv.org/pdf/1909.07528.pdf - Emergent tool use from multi-agent autocurricula 
659 | https://openai.com/blog/emergent-tool-use/ 
660 | 
661 | ## Oct 9 - Hacker Dojo 
662 | https://arxiv.org/pdf/1901.00949.pdf - Machine Teaching in Hierarchical Genetic Reinforcement Learning: Curriculum Design of Reward Functions for Swarm Shepherding 
663 | 
664 | ## Sept 25 - Hacker Dojo 
665 | https://arxiv.org/pdf/1812.01729.pdf - Boltzmann Generators - Sampling equilibrium states of many-body systems with deep learning 
666 | 
667 | 
668 | ## Sept 18 - Hacker Dojo 
669 | https://arxiv.org/pdf/1907.10599.pdf - A Fine-Grained Spectral Perspective on Neural Networks 
670 | 
671 | ## Sept 11 - Hacker Dojo 
672 | https://arxiv.org/pdf/1906.08237.pdf - XLNet: Generalized autoregressive pretraining for language understanding 
673 | 
674 | ## Sept 4 - Hacker Dojo 
675 | https://arxiv.org/pdf/1905.09272.pdf - Data efficient image recognition with contrastive predictive coding. 
676 | 
677 | ## August 21 - Hacker Dojo 
678 | https://arxiv.org/pdf/1904.10509.pdf - Generating long sequences with sparse transformers 
679 | 
680 | ## August 14 - Hacker Dojo 
681 | https://arxiv.org/pdf/1807.03748.pdf - Representation learning with contrastive predictive coding. 
682 | 
683 | ## July 31 - Hacker Dojo 
684 | https://arxiv.org/pdf/1906.08253.pdf - When to trust your model: model-based policy optimization 
685 | 
686 | ## July 24 - Hacker Dojo 
687 | https://arxiv.org/pdf/1901.09321.pdf - Fixup initialization - residual learning without normalization 
688 | 
689 | 
690 | ## July 17 - Hacker Dojo 
691 | http://proceedings.mlr.press/v97/mahoney19a/mahoney19a.pdf - Traditional and heavy-tailed self-regularization in neural net models 
692 | 
693 | ## July 3 - Hacker Dojo 
694 | https://arxiv.org/pdf/1804.08838.pdf - Measuring intrinsic dimension of objective landscapes 
695 | 
696 | ## June 19 - Hacker Dojo 
697 | https://arxiv.org/abs/1810.09536 - Ordered Neurons: Integrating Tree Structures into Recurrent Neural Networks 
698 | 
699 | ## June 12 - Hacker Dojo 
700 | https://arxiv.org/pdf/1812.05159.pdf - An empirical study of example forgetting during neural network training. 
701 | 
702 | ## June 5 - Hacker Dojo 
703 | https://arxiv.org/pdf/1812.00417.pdf - Snorkel Drybell - A case study in weak supervision at industrial scale 
704 | https://arxiv.org/pdf/1905.04981.pdf - Modelling instance level annotator reliability for natural language labelling 
705 | 
706 | ## May 29 - Hacker Dojo 
707 | https://arxiv.org/pdf/1901.09321.pdf - Fixup Initialization: Residual Learning without Normalization 
708 | 
709 | ## May 22 - Hacker Dojo 
710 | https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf - Language Models are Unsupervised Multitask Learners. 
711 | 
712 | ## May 15 - Hacker Dojo 
713 | https://arxiv.org/pdf/1811.00995.pdf - Invertible Residual Networks 
714 | 
715 | ## Apr 29 - Hacker Dojo 
716 | https://arxiv.org/pdf/1904.01681.pdf - Augmented Neural ODEs 
717 | 
718 | ## Apr 8 - Hacker Dojo 
719 | https://arxiv.org/pdf/1901.00596.pdf - Comprehensive Survey of Graph Neural Nets 
720 | https://github.com/rusty1s/pytorch_geometric 
721 | 
722 | ## Apr 1 - Hacker Dojo 
723 | https://arxiv.org/pdf/1901.00596.pdf - Comprehensive Survey of Graph Neural Nets 
724 | 
725 | ## Mar 25 - Hacker Dojo 
726 | https://papers.nips.cc/paper/7539-optimal-algorithms-for-non-smooth-distributed-optimization-in-networks.pdf - NIPS award winner 
727 | 
728 | ## Mar 18 - Hacker Dojo 
729 | https://papers.nips.cc/paper/8200-non-delusional-q-learning-and-value-iteration.pdf - Non-delusional Q-learning and Value Iteration 
730 | 
731 | ## Mar 11 - Hacker Dojo 
732 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 
733 | https://github.com/jadore801120/attention-is-all-you-need-pytorch - easier to read code 
734 | https://www.youtube.com/watch?v=S0KakHcj_rs 
735 | https://tdls.a-i.science/events/2018-10-22/ 
736 | https://tdls.a-i.science/events/2019-02-04/ 
737 | http://nlp.seas.harvard.edu/2018/04/03/attention.html 
738 | 
739 | 
740 | ## Mar 4 - Hacker Dojo 
741 | https://arxiv.org/pdf/1806.02643.pdf - Re-evaluating Evaluation 
742 | 
743 | ## Feb 25 - Hacker Dojo 
744 | https://arxiv.org/pdf/1812.11951.pdf - Learning to Design RNA 
745 | 
746 | ## Feb 11 - Hacker Dojo - 
747 | https://arxiv.org/pdf/1901.02860.pdf - Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context 
748 | 
749 | ## Feb 4 - Hacker Dojo 
750 | https://arxiv.org/pdf/1809.06646.pdf - Model Free Adaptive Optimal Control of Sequential Manufacturing Process Using Reinforcement Learning 
751 | 
752 | ## January 28 - Hacker Dojo 
753 | https://arxiv.org/pdf/1806.07366.pdf - Neural Ordinary Differential Equations - Best paper, NeurIPS 2018 
754 | 
755 | ## January 21 - Hacker Dojo 
756
| https://arxiv.org/pdf/1606.05312.pdf - Successor Features for Transfer in Reinforcement Learning 
757 | http://proceedings.mlr.press/v37/schaul15.pdf - Universal Value Function Approximators 
758 | http://proceedings.mlr.press/v80/barreto18a/barreto18a.pdf - Transfer in deep reinforcement learning using successor features and generalised policy improvement. 
759 | 
760 | https://www.youtube.com/watch?v=YDCPHekLUI4&t=1053s - Tom Schaul 
761 | https://www.youtube.com/watch?v=OCHwXxSW70o - Tejas Kulkarni 
762 | 
763 | 
764 | ## January 14 - Hacker Dojo 
765 | https://arxiv.org/pdf/1812.07626.pdf - Universal Successor Features Approximators 
766 | 
767 | ## January 7 - Hacker Dojo 
768 | https://arxiv.org/pdf/1810.12715.pdf - On the Effectiveness of Interval Bound Propagation for Training Verifiably Robust Models 
769 | 
770 | ## December 17 - Hacker Dojo 
771 | https://openreview.net/pdf?id=S1x4ghC9tQ - Temporal Difference Variational Autoencoder 
772 | 
773 | 
774 | ## December 10 - Hacker Dojo 
775 | https://openreview.net/pdf?id=S1JHhv6TW - Boosting Dilated Convolution with Mixed Tensor Decompositions 
776 | 
777 | ## December 3 - Hacker Dojo 
778 | https://arxiv.org/pdf/1712.01208.pdf - The case for learned index structures 
779 | 
780 | ## November 26 - Hacker Dojo 
781 | https://arxiv.org/abs/1809.07402 - Generalization properties of nn - Socher 
782 | https://einstein.ai/research/blog/identifying-generalization-properties-in-neural-networks - blog for above paper 
783 | 
784 | ## November 19 - Hacker Dojo 
785 | https://arxiv.org/pdf/1802.05983.pdf - Disentangling by Factorising 
786 | https://arxiv.org/pdf/1804.00104.pdf - Learning Disentangled Joint, Discrete and Continuous Representations 
787 | https://arxiv.org/pdf/1807.05520.pdf - Deep Clustering for Unsupervised Learning of Visual Features 
788 | https://github.com/1Konny/FactorVAE 
789 | https://github.com/paruby/FactorVAE 
790 | https://github.com/nicolasigor/FactorVAE 
791 | 
792 | ## November 12 - Hacker Dojo 
793 | https://arxiv.org/pdf/1810.12894.pdf - Exploration by Random Network Distillation - OpenAI 
794 | 
795 | ## November 5 - Hacker Dojo 
796 | https://arxiv.org/pdf/1810.04805.pdf - BERT: pre-training of deep bidirectional transformers for language understanding 
797 | 
798 | 
799 | ## October 22 - Hacker Dojo 
800 | https://arxiv.org/pdf/1801.02613.pdf - Characterizing Adversarial Examples using Local Intrinsic Dimensionality 
801 | 
802 | 
803 | ## October 15 - Hacker Dojo 
804 | https://arxiv.org/pdf/1808.06670.pdf - Learning Deep Representations by Mutual Information Estimation and Maximization - Hjelm, Bengio 
805 | 
806 | ## October 8 - Hacker Dojo 
807 | https://arxiv.org/pdf/1802.04364.pdf - Junction Tree Variational Auto-Encoder for Molecular Graph Generation 
808 | http://snap.stanford.edu/proj/embeddings-www/files/nrltutorial-part2-gnns.pdf 
809 | 
810 | ## October 1 - Hacker Dojo 
811 | https://arxiv.org/pdf/1808.06601.pdf - Video to video synthesis 
812 | https://github.com/NVIDIA/vid2vid - code 
813 | 
814 | ## September 24 - Hacker Dojo 
815 | https://arxiv.org/pdf/1807.03146.pdf - Discovery of 3d keypoints from 2d images 
816 | 
817 | ## September 17 - Hacker Dojo 
818 | https://arxiv.org/abs/1709.02371 - PWC-Net: CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume - Deqing Sun et al. (CVPR 2018) 
819 | Phil Ferrier will present the paper and run through his code for us. 
Phil's code is on his github repo: 
820 | https://github.com/philferriere/tfoptflow 
821 | 
822 | ## September 10 - Hacker Dojo 
823 | https://arxiv.org/pdf/1807.03247.pdf - Intriguing failing (and improvement) of CNNs for coordinate transforms (CoordConv). 
824 | 
825 | ## September 3 - Hacker Dojo 
826 | https://arxiv.org/pdf/1803.03324.pdf - Learning Deep Generative Models of Graphs 
827 | 
828 | ## August 27 - Hacker Dojo 
829 | https://arxiv.org/abs/1709.10082 - Optimally decentralized multi-robot collision avoidance with reinforcement learning. 
830 | 
831 | https://github.com/TensorSwarm/TensorSwarm - Andreas Pasternak code for above 
832 | 
833 | ## August 13 - Hacker Dojo 
834 | https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/learning-dexterity/learning-dexterity-paper.pdf - Robot doing single-hand manipulations. 
835 | https://www.theverge.com/2018/7/30/17621112/openai-robot-dexterity-dactyl-artificial-intelligence 
836 | 
837 | ## July 30 - Hacker Dojo - 
838 | https://arxiv.org/pdf/1711.03953.pdf - Breaking the softmax bottleneck 
839 | https://arxiv.org/pdf/1805.10829.pdf - SigSoftMax: Reanalyzing the softmax bottleneck 
840 | https://severelytheoretical.wordpress.com/2018/06/08/the-softmax-bottleneck-is-a-special-case-of-a-more-general-phenomenon/ 
841 | 
842 | ## July 23 - Hacker Dojo - 
843 | https://arxiv.org/pdf/1807.01281.pdf - Human-level performance in first-person multiplayer games with population-based reinforcement learning. 
844 | https://deepmind.com/blog/capture-the-flag/ 
845 | https://www.youtube.com/watch?v=steioHoiEms 
846 | https://arxiv.org/abs/1711.09846v2 
847 | https://arxiv.org/pdf/1611.05397.pdf 
848 | 
849 | ## July 16 - Hacker Dojo 
850 | https://arxiv.org/pdf/1803.10122.pdf - World Models - Schmidhuber paper on RL 
851 | 
852 | ## July 9 - Hacker Dojo 
853 | https://deepmind.com/research/publications/neural-scene-representation-and-rendering/ - Rendering 3d scenes 
854 | 
855 | ## July 2 - Hacker Dojo - 
856 | https://arxiv.org/pdf/1707.06347.pdf - Proximal Policy Optimization 
857 | 
858 | ## June 25 - Hacker Dojo 
859 | https://openreview.net/pdf?id=BJOFETxR- - Learning to represent programs with graphs 
860 | 
861 | ## June 18 - Hacker Dojo 
862 | https://openreview.net/pdf?id=BkisuzWRW - Zero Shot Visual Imitation - Reinforcement Learning 
863 | 
864 | 
865 | ## June 11 - Hacker Dojo 
866 | https://openreview.net/forum?id=HkL7n1-0b - Wasserstein Auto Encoders - one of ICLR top papers. 
867 | 868 | ## June 4 - Hacker Dojo 869 | https://openreview.net/pdf?id=Hy7fDog0b - Ambient GAN - Generative Models from Lossy Measurements - ICLR top paper 870 | 871 | 872 | ## May 21 - Hacker Dojo 873 | https://arstechnica.com/science/2018/05/ai-trained-to-navigate-develops-brain-like-location-tracking/ - Grid representations in rat brain 874 | https://deepmind.com/documents/200/Banino_at_al_final.pdf -- 875 | https://www.nature.com/articles/s41586-018-0102-6 -- 876 | 877 | 878 | 879 | ## May 14 - Hacker Dojo 880 | https://arxiv.org/pdf/1712.06567.pdf - Deep Neuroevolution: Genetic Algorithms are a Competitive Alternative for 881 | Training Deep Neural Networks for Reinforcement Learning 882 | https://arxiv.org/pdf/1712.06560.pdf - Improving Exploration in Evolution Strategies for Deep Reinforcement 883 | Learning via a Population of Novelty-Seeking Agents 884 | https://eng.uber.com/deep-neuroevolution/ - Uber engineering blog post 885 | 886 | ## May 7 - Hacker Dojo 887 | https://arxiv.org/pdf/1801.10130.pdf - spherical CNN 888 | 889 | ## Apr 30 - Hacker Dojo 890 | https://arxiv.org/pdf/1710.07313.pdf - Using machine learning to replicate chaotic attractors 891 | http://www.bmp.ds.mpg.de/tl_files/bmp/preprints/Zimmermann_Parlitz_preprint.pdf - paper to be published in "chaos" 892 | https://www.quantamagazine.org/machine-learnings-amazing-ability-to-predict-chaos-20180418/ - blog post 893 | 894 | 895 | ## Apr 23 - Hacker Dojo 896 | https://arxiv.org/pdf/1711.10925.pdf - Deep Image Prior 897 | https://dmitryulyanov.github.io/deep_image_prior - git hub from authors 898 | https://box.skoltech.ru/index.php/s/ib52BOoV58ztuPM 899 | http://mlexplained.com/2018/01/18/paper-dissected-deep-image-prior-explained/ 900 | http://fortune.com/2018/04/24/nvidia-artificial-intelligence-images/ - Article w video showing photo editing use 901 | 902 | ## Apr 16 - Hacker Dojo 903 | Finish Fractal AI 904 | https://arxiv.org/pdf/1711.07971.pdf - non-local filtering 905 | 906 | 907 | ## Apr 9 - Hacker Dojo 908 | http://lanl.arxiv.org/pdf/1803.05049v1 - Fractal AI 909 | 910 | ## Apr 2 - Hacker Dojo 911 | https://arxiv.org/pdf/1803.04831.pdf - IndRNN longer deeper RNN's 912 | 913 | ## Mar 26 - Hacker Dojo 914 | https://arxiv.org/pdf/1711.10433.pdf - parallel wavenet 915 | https://arxiv.org/pdf/1708.04552.pdf - regularizing convnet with cutout (desert paper) 916 | http://www.cs.toronto.edu/~jmartens/docs/Deep_HessianFree.pdf - will get short presentation on this one. 917 | 918 | ## Mar 19 - Hacker Dojo 919 | https://arxiv.org/pdf/1802.03268.pdf - Efficient Neural Architecture Search via Parameter Sharing 920 | https://github.com/carpedm20/ENAS-pytorch 921 | 922 | some related papers and reviews. 
923 | https://arxiv.org/pdf/1708.05344.pdf - One shot architecture search 
924 | https://openreview.net/forum?id=ByQZjx-0- 
925 | and 
926 | https://openreview.net/forum?id=rydeCEhs- 
927 | 
928 | 
929 | ## Mar 12 - Hacker Dojo 
930 | https://arxiv.org/abs/1703.10135 - tacotron - end-to-end speech synthesis 
931 | https://arxiv.org/pdf/1712.05884.pdf - tacotron 2 
932 | https://research.googleblog.com/2017/12/tacotron-2-generating-human-like-speech.html - 
933 | https://github.com/A-Jacobson/tacotron2 - pytorch code 
934 | http://research.baidu.com/deep-speech-3%EF%BC%9Aexploring-neural-transducers-end-end-speech-recognition/ 
935 | 
936 | ## Feb 26 - Hacker Dojo 
937 | https://arxiv.org/pdf/1705.09792.pdf - Deep Complex Networks 
938 | 
939 | 
940 | ## Feb 19 - Hacker Dojo 
941 | https://arxiv.org/pdf/1801.10308.pdf - Nested LSTMs 
942 | https://arxiv.org/pdf/1705.10142.pdf - KRU from FAIR 
943 | https://github.com/hannw/nlstm - tf code for Nested LSTM 
944 | 
945 | ## Feb 12 - Hacker Dojo 
946 | http://openaccess.thecvf.com/content_cvpr_2017/papers/Khoreva_Simple_Does_It_CVPR_2017_paper.pdf - Weakly Supervised Instance and Semantic Segmentation 
947 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/weakly-supervised-learning/simple-does-it-weakly-supervised-instance-and-semantic-segmentation/ 
948 | https://github.com/philferriere/tfwss - Phil Ferriere's code 
949 | https://drive.google.com/file/d/1wPHMA4PqygawvIxRiy-2ZMKcpUO447cz/view?usp=sharing - Mehul's notebook on segmentation 
950 | 
951 | ## Feb 5 - Hacker Dojo 
952 | https://arxiv.org/pdf/1511.06939.pdf - using RNNs for recommendation systems 
953 | https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46488.pdf - latest paper on RNNs for recommendation 
954 | 
955 | ## Jan 29 - Hacker Dojo 
956 | https://arxiv.org/pdf/1709.04511.pdf - Empirical study of multi-agent RL 
957 | https://github.com/geek-ai/1m-agents - code 
958 | 
959 | ## Jan 22 - Hacker Dojo 
960 | https://arxiv.org/pdf/1704.00028.pdf - Improvements in Wasserstein GAN training 
961 | 
962 | ## Jan 15 - Hacker Dojo 
963 | 
964 | https://arxiv.org/pdf/1710.02298.pdf - Combining improvements in deep reinforcement learning 
965 | 
966 | ## Jan 8 - Hacker Dojo 
967 | https://openreview.net/pdf?id=HJWLfGWRb - follow-on to capsule network paper 
968 | https://www.youtube.com/watch?v=pPN8d0E3900 
969 | https://www.youtube.com/watch?v=2Kawrd5szHE 
970 | https://github.com/ageron/handson-ml/blob/master/extra_capsnets.ipynb 
971 | https://github.com/naturomics/CapsNet-Tensorflow 
972 | https://medium.com/ai%C2%B3-theory-practice-business/understanding-hintons-capsule-networks-part-ii-how-capsules-work-153b6ade9f66 
973 | 
974 | ## Dec 11 - Hacker Dojo 
975 | https://arxiv.org/pdf/1710.09829.pdf - Dynamic routing between capsules - Hinton 
976 | 
977 | ## Nov 27 - Hacker Dojo 
978 | https://arxiv.org/pdf/1701.01724.pdf - DeepStack: Expert-Level Artificial Intelligence in Heads-Up No-Limit Poker 
979 | 
980 | 
981 | ## Nov 13 - Hacker Dojo 
982 | https://deepmind.com/documents/119/agz_unformatted_nature.pdf - AlphaGo Zero paper 
983 | https://webdocs.cs.ualberta.ca/~mmueller/talks/2016-LeeSedol-AlphaGo.pdf - some slides 
984 | 
985 | 
986 | ## Nov 6 - Hacker Dojo 
987 | https://arxiv.org/pdf/1703.10593.pdf - cycle consistent GANs 
988 | 
989 | ## Oct 30 - Hacker Dojo 
990 | https://arxiv.org/pdf/1503.02406.pdf - Naftali Tishby and Noga Zaslavsky. The information bottleneck principle. 
991 | 
992 | https://www.cs.huji.ac.il/labs/learning/Papers/allerton.pdf - Naftali Tishby, Fernando C. 
Pereira, and William Bialek. The information bottleneck method. 
993 | 
994 | https://www.reddit.com/r/MachineLearning/comments/75uua6/r_2_hr_talk_information_theory_of_deep_learning/ 
995 | 
996 | ## Oct 23 - Hacker Dojo 
997 | 
998 | Mask R-CNN 
999 | https://arxiv.org/abs/1703.06870 
1000 | 
1001 | 
1002 | And these are prerequisites (read at least Fast R-CNN and Faster R-CNN) 
1003 | 
1004 | R-CNN 
1005 | https://arxiv.org/abs/1311.2524 
1006 | 
1007 | Fast R-CNN 
1008 | https://arxiv.org/pdf/1504.08083.pdf 
1009 | 
1010 | Faster R-CNN 
1011 | https://arxiv.org/abs/1506.01497 
1012 | Feature Pyramid Networks - https://arxiv.org/abs/1612.03144 
1013 | 
1014 | 
1015 | ## Oct 16 - Hacker Dojo 
1016 | https://arxiv.org/pdf/1703.00810.pdf - Opening the Black Box of Deep Neural Networks via Information 
1017 | https://www.youtube.com/watch?v=ekUWO_pI2M8 
1018 | https://www.youtube.com/watch?v=bLqJHjXihK8 
1019 | 
1020 | ## Oct 9 - Hacker Dojo 
1021 | https://arxiv.org/pdf/1501.00092.pdf - super resolution first paper 
1022 | https://arxiv.org/abs/1608.00367 - super resolution second paper 
1023 | 
1024 | ## Oct 2 - Hacker Dojo 
1025 | https://arxiv.org/abs/1604.03901 - Single-Image Depth Perception in the Wild 
1026 | 
1027 | ## Sept 25 - Hacker Dojo 
1028 | https://arxiv.org/pdf/1706.08947.pdf - Exploring generalization in deep networks. 
1029 | 
1030 | ## Sept 18 - Hacker Dojo 
1031 | https://arxiv.org/pdf/1705.02550.pdf - nvidia drone nav 
1032 | https://github.com/NVIDIA-Jetson/redtail/wiki - code 
1033 | 
1034 | ## Sept 11 - Hacker Dojo 
1035 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.365.5060&rep=rep1&type=pdf - hyperneat ref 
1036 | https://arxiv.org/pdf/1609.09106.pdf - Hypernet ref 
1037 | http://blog.otoro.net/2016/09/28/hyper-networks/ - blog on hypernet 
1038 | https://www.youtube.com/watch?v=-8oyTYViuJ4 - vid on hyperNeat 
1039 | http://eplex.cs.ucf.edu/hyperNEATpage/HyperNEAT.html - blog on hyperNeat 
1040 | 
1041 | ## August 28 - Hacker Dojo 
1042 | https://arxiv.org/pdf/1708.05344.pdf - SMASH: One-Shot Model Architecture Search through HyperNetworks 
1043 | https://www.youtube.com/watch?v=79tmPL9AL48 - youtube vid on SMASH 
1044 | 
1045 | ## August 21 - Hacker Dojo 
1046 | https://arxiv.org/pdf/1706.02515.pdf - Self Normalizing Neural Networks - Hochreiter 
1047 | 
1048 | ## August 14 - Hacker Dojo 
1049 | https://arxiv.org/pdf/1606.01541.pdf - Reinforcement Learning for Dialog Generation - Jurafsky 
1050 | https://github.com/liuyuemaicha/Deep-Reinforcement-Learning-for-Dialogue-Generation-in-tensorflow - tensorflow code for same 
1051 | https://github.com/jiweil/ - some related code 
1052 | https://arxiv.org/pdf/1612.00563.pdf - self-critical training for image captioning - RL for text problems. 
1053 | 
1054 | Some papers referenced by the Jurafsky paper 
1055 | [1506.05869] A Neural Conversational Model - Vinyals and Le 
1056 | https://arxiv.org/abs/1604.04562 - Dialogue generation system - Wen 
1057 | 
1058 | 
1059 | ## Aug 7 - Hacker Dojo 
1060 | https://arxiv.org/pdf/1705.04304.pdf - A Deep Reinforced Model for Abstractive Summarization - Socher 
1061 | 
1062 | ## July 31 - Hacker Dojo 
1063 | https://arxiv.org/pdf/1706.01433.pdf - visual interaction networks - deep mind 
1064 | https://arxiv.org/pdf/1706.01427.pdf - neural model for relational reasoning - deep mind 
1065 | 
1066 | 
1067 | ## July 24 
1068 | Guest Speaker - Using FPGA to speed up CNNs. 
1069 | https://arxiv.org/pdf/1703.03130.pdf - A structured self-attentive sentence embedding - Lin and Bengio 1070 | https://github.com/dennybritz/deeplearning-papernotes/blob/master/notes/self_attention_embedding.md (review) 1071 | https://github.com/yufengm/SelfAttentive code 1072 | https://github.com/Diego999/SelfSent code 1073 | 1074 | ## July 17 - Hacker Dojo 1075 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 1076 | https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models 1077 | https://github.com/jadore801120/attention-is-all-you-need-pytorch - easier to read code 1078 | https://arxiv.org/pdf/1607.06450.pdf - layer normalization paper - hinton 1079 | https://www.youtube.com/watch?v=nR74lBO5M3s - google translate paper - youtube video 1080 | https://arxiv.org/pdf/1609.08144.pdf - google translate paper - 1081 | 1082 | ## July 10 - Hacker Dojo 1083 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 1084 | https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models 1085 | https://github.com/jadore801120/attention-is-all-you-need-pytorch - easier to read code 1086 | https://arxiv.org/pdf/1607.06450.pdf - layer normalization paper - hinton 1087 | 1088 | 1089 | #### Some added references regarding positional encodings 1090 | http://www.machinelearning.org/proceedings/icml2006/047_Connectionist_Tempor.pdf - A. Graves, S. Fernandez, F. Gomez, and J. Schmidhuber 1091 | https://www.reddit.com/r/MachineLearning/comments/6jdi87/r_question_about_positional_encodings_used_in/ 1092 | 1093 | 1094 | ## June 26 - Hacker Dojo 1095 | https://arxiv.org/pdf/1705.03122.pdf - convolutional sequence to sequence learning 1096 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 1097 | http://www.machinelearning.org/proceedings/icml2006/047_Connectionist_Tempor.pdf - A. Graves, S. Fernandez, F. Gomez, and J. Schmidhuber 1098 | 1099 | 1100 | ## June 19 - Hacker Dojo 1101 | https://arxiv.org/pdf/1701.02720.pdf - RNN for end to end voice recognition 1102 | 1103 | 1104 | ## June 12 - Hacker Dojo 1105 | New reinforcement learning results -- Too cool for school. Watch the video and you'll be hooked. 1106 | https://www.youtube.com/watch?v=2vnLBb18MuQ&feature=em-subs_digest 1107 | 1108 | http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/index.html - paper 1109 | 1110 | 1111 | ## May 22 - Hacker Dojo 1112 | https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/HintonDengYuEtAl-SPM2012.pdf - comparison of RNN and HMM for speech recognition 1113 | 1114 | ## May 15 - Hacker Dojo 1115 | https://arxiv.org/pdf/1412.6572.pdf - Explaining and Harnessing Adversarial Examples 1116 | 1117 | 1118 | ## May 1 - Hacker Dojo 1119 | https://arxiv.org/abs/1704.03453 - The Space of Transferable Adversarial Examples 1120 | 1121 | 1122 | ## Apr 24 - Hacker Dojo 1123 | https://discourse-production.oss-cn-shanghai.aliyuncs.com/original/3X/1/5/15ba4cef726cab390faa180eb30fd82b693469f9.pdf - Using TPU for data center 1124 | 1125 | 1126 | ## Apr 17 - Hacker Dojo 1127 | Reservoir Computing by Felix Grezes. 
1128 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Survey_Felix_Grezes_9_04_2014.pdf 1129 | 1130 | Slides by Felix Grezes: Reservoir Computing for Neural Networks 1131 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Slides_Felix_Grezes_9-14-2014.pdf 1132 | (more at: http://speech.cs.qc.cuny.edu/~felix/ ) 1133 | 1134 | This is a short, very useful backgrounder on randomized projections, 1135 | here used for compressed sensing, in a blog post by Terence Tao 1136 | https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/ 1137 | 1138 | and the same story told with illustrations on the Nuit Blanche blog: 1139 | http://nuit-blanche.blogspot.com/2007/07/how-does-rice-one-pixel-camera-work.html 1140 | 1141 | (BTW http://nuit-blanche.blogspot.com is a tremendous website.) 1142 | 1143 | --- 1144 | 1145 | If we have time, we may discuss this paper: 1146 | 1147 | Information Processing Using a Single Dynamical Node as Complex System. 1148 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3195233/pdf/ncomms1476.pdf 1149 | 1150 | 1151 | ## Apr 10 - Hacker Dojo 1152 | 1153 | https://arxiv.org/pdf/1603.08678.pdf - Instance-sensitive Fully Convolutional Networks 1154 | 1155 | https://arxiv.org/pdf/1611.07709.pdf - Fully Convolutional Instance-aware Semantic Segmentation 1156 | 1157 | ## Apr 3 - Hacker Dojo 1158 | https://arxiv.org/pdf/1703.03864.pdf - Sutskever paper on using evolutionary systems for optimizing RL prob 1159 | http://jmlr.csail.mit.edu/papers/volume15/wierstra14a/wierstra14a.pdf - ES paper with algo used in Sutskever paper 1160 | 1161 | 1162 | ## Mar 27 - Hacker Dojo 1163 | Aurobindo Tripathy will reprise a talk he's going to give at Embedded Summit this year. His talk will survey recent progress in object detection from RCNN to Single Shot MultiBox Detector and Yolo 9000. 1164 | 1165 | 1166 | ## Mar 20 - Hacker Dojo 1167 | https://arxiv.org/pdf/1612.05424.pdf - Unsupervised Pixel-level domain adaptation with generative adversarial networks 1168 | 1169 | ## Mar 13 - Hacker Dojo 1170 | https://arxiv.org/pdf/1701.06547.pdf - adversarial learning for neural dialog generation 1171 | 1172 | ## February 27 - Hacker Dojo 1173 | https://arxiv.org/pdf/1612.02699.pdf - Deep Supervision with Shape Concepts for Occlusion-Aware 3D Object Parsing 1174 | Zeeshan's slides are in the folder with his name on it. Along with his descriptions of his own ground-breaking work, he gives an excellent history of efforts to identify 3d objects from 2d images. 1175 | 1176 | 1177 | ## February 20 - Hacker Dojo 1178 | https://arxiv.org/pdf/1506.07285.pdf - Ask me anything - Socher 1179 | https://github.com/YerevaNN/Dynamic-memory-networks-in-Theano - Code and implementation notes. 
1180 | https://www.youtube.com/watch?v=FCtpHt6JEI8&t=27s - Socher presentation of material 
1181 | 
1182 | 
1183 | ## February 13 - Hacker Dojo 
1184 | https://arxiv.org/pdf/1701.06538v1.pdf - Outrageously large neural networks 
1185 | 
1186 | ## February 6 - Hacker Dojo 
1187 | 
1188 | https://arxiv.org/pdf/1505.00387v2.pdf - Highway networks 
1189 | https://arxiv.org/pdf/1507.06228.pdf - Also highway networks - different examples 
1190 | https://arxiv.org/pdf/1607.03474v3.pdf - Recurrent Highway Networks 
1191 | 
1192 | 
1193 | ## January 30 - Hacker Dojo 
1194 | https://arxiv.org/pdf/1603.03116v2.pdf - Low-rank pass-through RNNs - follow-on to unitary RNN 
1195 | https://github.com/Avmb/lowrank-gru - theano code 
1196 | 
1197 | ## January 23 - HackerDojo 
1198 | https://arxiv.org/abs/1612.03242 - StackGAN paper 
1199 | https://github.com/hanzhanggit/StackGAN - Code 
1200 | 
1201 | ## January 16 - Hacker Dojo 
1202 | https://arxiv.org/pdf/1511.06464v4.pdf - Unitary Evolution RNN 
1203 | https://github.com/amarshah/complex_RNN - theano code 
1204 | 
1205 | ## January 9 - Hacker Dojo 
1206 | Cheuksan Edward Wang Talk 
1207 | https://arxiv.org/pdf/1612.04642v1.pdf - rotation invariant cnn 
1208 | https://github.com/deworrall92/harmonicConvolutions - tf code for harmonic cnn 
1209 | http://visual.cs.ucl.ac.uk/pubs/harmonicNets/index.html - blog post by authors 
1210 | 
1211 | ## January 2 - Hacker Dojo 
1212 | https://arxiv.org/pdf/1602.02218v2.pdf - using typing to improve RNN behavior 
1213 | http://jmlr.org/proceedings/papers/v37/jozefowicz15.pdf - exploration of alternative LSTM architectures 
1214 | 
1215 | ## December 19 - Hacker Dojo 
1216 | https://arxiv.org/pdf/1611.01576.pdf - Socher QRNN paper 
1217 | 
1218 | ## December 12 - Hacker Dojo 
1219 | https://arxiv.org/pdf/1604.02135v2.pdf - latest segmentation from FAIR 
1220 | https://github.com/MarvinTeichmann/tensorflow-fcn - code for segmenter 
1221 | 
1222 | ## December 5 - Hacker Dojo 
1223 | https://arxiv.org/pdf/1506.06204.pdf - Object segmentation 
1224 | https://arxiv.org/pdf/1603.08695v2.pdf - refinement of above segmentation paper 
1225 | https://code.facebook.com/posts/561187904071636/segmenting-and-refining-images-with-sharpmask/ - blog post 
1226 | https://github.com/facebookresearch/deepmask - torch code for deepmask 
1227 | 
1228 | 
1229 | ## November 28 - Hacker Dojo 
1230 | https://arxiv.org/pdf/1506.01497v3.pdf - Faster R-CNN 
1231 | people.eecs.berkeley.edu/~rbg/slides/rbg-defense-slides.pdf - Girshick thesis slides 
1232 | Check edge boxes and selective search 
1233 | https://arxiv.org/pdf/1406.4729v4.pdf - key part of architecture 
1234 | https://github.com/smallcorgi/Faster-RCNN_TF - excellent code 
1235 | 
1236 | 
1237 | ## November 21 - Hacker Dojo 
1238 | https://people.eecs.berkeley.edu/~rbg/papers/r-cnn-cvpr.pdf - R-CNN - first in the series 
1239 | https://arxiv.org/pdf/1504.08083v2.pdf - Fast R-CNN 
1240 | https://arxiv.org/pdf/1506.01497v3.pdf - Faster R-CNN 
1241 | http://techtalks.tv/talks/rich-feature-hierarchies-for-accurate-object-detection-and-semantic-segmentation/60254/ - video of Girshick talk 
1242 | 
1243 | 
1244 | ## November 14 - Hacker Dojo 
1245 | https://arxiv.org/pdf/1506.02025v3.pdf - Spatial transformer networks 
1246 | https://github.com/daviddao/spatial-transformer-tensorflow - tf code for above 
1247 | 
1248 | ## October 31 - Hacker Dojo 
1249 | https://github.com/jazzsaxmafia/show_attend_and_tell.tensorflow - tf code for attention-captioning 
1250 | http://cs.stanford.edu/people/karpathy/densecap/ - karpathy captioning 
1251 | https://arxiv.org/pdf/1412.2306v2.pdf - earlier 
karpathy captioning paper 
1252 | 
1253 | 
1254 | ## October 20 - Galvanize 
1255 | https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html - Deep dive into reinforcement learning - Sutton and Barto - Chapters 1 and 2. 
1256 | 
1257 | ## Oct 17 - Hacker Dojo 
1258 | https://arxiv.org/pdf/1608.06993v1.pdf - DenseNet. New reigning champion image classifier 
1259 | https://github.com/liuzhuang13/DenseNet - lua code 
1260 | The DenseNet paper is straightforward, so we're also going to start on image captioning 
1261 | 
1262 | http://www.cs.toronto.edu/~zemel/documents/captionAttn.pdf 
1263 | http://kelvinxu.github.io/projects/capgen.html 
1264 | http://people.ee.duke.edu/~lcarin/Yunchen9.25.2015.pdf - slides for caption attention 
1265 | 
1266 | collections of captioning papers. 
1267 | https://github.com/kjw0612/awesome-deep-vision#image-captioning - images 
1268 | https://github.com/kjw0612/awesome-deep-vision#video-captioning - video 
1269 | 
1270 | ## Oct 13 - SF 
1271 | http://www.mit.edu/~dimitrib/NDP_Encycl.pdf - (early) Bertsekas paper on RL, policy and value iteration 
1272 | http://www.nervanasys.com/demystifying-deep-reinforcement-learning/?imm_mid=0e2d7e&cmp=em-data-na-na-newsltr_20160420 - blog post on RL. Nice coverage of value iteration 
1273 | 
1274 | ## Oct 10 - Hacker Dojo 
1275 | https://github.com/carpedm20/pixel-rnn-tensorflow - tensorflow code for pixel rnn (and cnn) 
1276 | 
1277 | ## Sept 19 - Hacker Dojo 
1278 | https://arxiv.org/pdf/1606.05328v2.pdf - Conditional Image Generation with PixelCNN decoders 
1279 | https://arxiv.org/pdf/1601.06759v3.pdf - Pixel RNN 
1280 | https://drive.google.com/file/d/0B3cxcnOkPx9AeWpLVXhkTDJINDQ/view - wavenet Generative Audio 
1281 | https://deepmind.com/blog/wavenet-generative-model-raw-audio/ - wavenet blog 
1282 | 
1283 | ## Sept 15 - Galvanize SF 
1284 | http://www.gitxiv.com/posts/fepYG4STYaej3KSPZ/densely-connected-convolutional-netowork-densenet 
1285 | 
1286 | 
1287 | ## Sept 12 - Hacker Dojo 
1288 | http://arxiv.org/pdf/1410.3916v11.pdf - original memory networks 
1289 | https://arxiv.org/pdf/1606.03126v1.pdf - key/value memory augmented nn 
1290 | http://www.thespermwhale.com/jaseweston/icml2016/icml2016-memnn-tutorial.pdf#page=87 - tutorial on memory networks in language understanding 
1291 | 
1292 | ## August 29 - Hacker Dojo 
1293 | https://arxiv.org/pdf/1410.5401v2.pdf - Neural Turing Machines 
1294 | https://github.com/carpedm20/NTM-tensorflow 
1295 | https://www.youtube.com/watch?v=_H0i0IhEO2g - Alex Graves presentation at Microsoft Research 
1296 | http://www.robots.ox.ac.uk/~tvg/publications/talks/NeuralTuringMachines.pdf - slides for ntm 
1297 | 
1298 | ## August 25 - Galvanize (SF) 
1299 | http://arxiv.org/pdf/1410.3916v11.pdf - original memory networks 
1300 | https://arxiv.org/pdf/1606.03126v1.pdf - key/value memory augmented nn 
1301 | http://www.thespermwhale.com/jaseweston/icml2016/icml2016-memnn-tutorial.pdf#page=87 - tutorial on memory networks in language understanding 
1302 | 
1303 | ## August 22 - Hacker Dojo 
1304 | https://arxiv.org/pdf/1605.07648v1.pdf - fractal net - alternative to resnet for ultra-deep convolution 
1305 | https://github.com/edgelord/FractalNet - tf code 
1306 | http://www.gitxiv.com/posts/ibA8QEu8bvBJSDxr9/fractalnet-ultra-deep-neural-networks-without-residuals 
1307 | 
1308 | ## August 18, 2016 - Galvanize (SF) 
1309 | https://arxiv.org/pdf/1602.01783v2.pdf - new RL architecture - deep mind 
1310 | 
1311 | Code: 
1312 | https://github.com/Zeta36/Asynchronous-Methods-for-Deep-Reinforcement-Learning - tf 
1313 | 
https://github.com/miyosuda/async_deep_reinforce - tf 1314 | https://github.com/coreylynch/async-rl - keras (tf) 1315 | https://github.com/muupan/async-rl - chainer (good discussion) 1316 | 1317 | ## August 15, 2016 - Hacker Dojo 1318 | https://arxiv.org/pdf/1607.02533v1.pdf - Hardening deep networks to adversarial examples. 1319 | 1320 | ## August 11, 2016 - Galvanize (SF) 1321 | http://www.gitxiv.com/posts/HQJ3F9YzsQZ3eJjpZ/model-free-episodic-control - deep mind gitxiv paper and code on github 1322 | https://github.com/sudeepraja/Model-Free-Episodic-Control - other code 1323 | https://github.com/ShibiHe/Model-Free-Episodic-Control 1324 | 1325 | ## August 8, 2016 - Hacker Dojo 1326 | https://arxiv.org/pdf/1406.2661.pdf - originating paper on generative adversarial net (gan) - goodfellow, bengio 1327 | http://arxiv.org/pdf/1511.06434v2.pdf - deep cnn gan - radford 1328 | https://github.com/Newmu/dcgan_code - theano code for cnn gan - radford 1329 | 1330 | ## August 4, 2016 - Galvanize (SF) 1331 | http://www.gitxiv.com/posts/HQJ3F9YzsQZ3eJjpZ/model-free-episodic-control - deep mind gitxiv paper and code on github 1332 | 1333 | ## August 1, 2016 - Hacker Dojo 1334 | Papers - 1335 | https://drive.google.com/file/d/0B8Dg3PBX90KNWG5KQXNQOFlBLU1JWWVONkN1UFpnbUR6Y0cw/view?pref=2&pli=1 - Using Stochastic RNN for temporal anomaly detection 1336 | https://home.zhaw.ch/~dueo/bbs/files/vae.pdf - cover math 1337 | https://arxiv.org/pdf/1401.4082v3.pdf - Rezende - Other Original VAE paper 1338 | 1339 | Code Review - 1340 | https://github.com/oduerr/dl_tutorial/blob/master/tensorflow/vae/vae_demo.ipynb 1341 | https://github.com/oduerr/dl_tutorial/blob/master/tensorflow/vae/vae_demo-2D.ipynb 1342 | 1343 | ## July 28, 2016 - SF 1344 | Papers: 1345 | http://arxiv.org/pdf/1410.5401v2.pdf - Neural Turing Machines - Graves et. al. 1346 | https://arxiv.org/pdf/1605.06065v1.pdf - One Shot Learning - DeepMind 1347 | 1348 | Code: 1349 | http://icml.cc/2016/reviews/839.txt 1350 | https://github.com/brendenlake/omniglot 1351 | https://github.com/tristandeleu/ntm-one-shot 1352 | https://github.com/MLWave/extremely-simple-one-shot-learning 1353 | 1354 | ## July 25, 2016 - Hacker Dojo 1355 | Papers - Using VAE for anomaly detection 1356 | https://arxiv.org/pdf/1411.7610.pdf - Stochastic Recurrent Networks 1357 | https://drive.google.com/file/d/0B8Dg3PBX90KNWG5KQXNQOFlBLU1JWWVONkN1UFpnbUR6Y0cw/view?pref=2&pli=1 - Using Stochastic RNN for temporal anomaly detection 1358 | 1359 | 1360 | ## July 21, 2016 - SF 1361 | Papers to read: 1362 | http://www.thespermwhale.com/jaseweston/ram/papers/paper_16.pdf 1363 | http://snowedin.net/tmp/Hochreiter2001.pdf - 1364 | 1365 | Comments / Code 1366 | http://icml.cc/2016/reviews/839.txt 1367 | https://github.com/brendenlake/omniglot 1368 | https://github.com/tristandeleu/ntm-one-shot 1369 | https://github.com/MLWave/extremely-simple-one-shot-learning 1370 | https://www.periscope.tv/hugo_larochelle/1ypJdnPRYEoKW 1371 | 1372 | 1373 | 1374 | ## July 18, 2016 - Hacker Dojo 1375 | Papers to read: 1376 | http://arxiv.org/pdf/1312.6114v10.pdf - variational autoencoders - U of Amsterdam - Kingma and Welling 1377 | http://arxiv.org/pdf/1310.8499v2.pdf - deep autoregressive networks - deep mind 1378 | https://arxiv.org/abs/1606.05908 - tutorial on vae 1379 | 1380 | Commentaries/Code 1381 | https://jmetzen.github.io/2015-11-27/vae.html - metzen - code and discussion 1382 | http://blog.keras.io/building-autoencoders-in-keras.html - chollet - discusses different autoencoders, gives keras code. 
1383 | 
1384 | 
1385 | 
1386 | ## June 27, July 11 2016 - Hacker Dojo 
1387 | Recurrent network for image generation - Deep Mind 
1388 | https://arxiv.org/pdf/1502.04623v2.pdf 
1389 | Background and some references cited 
1390 | http://blog.evjang.com/2016/06/understanding-and-implementing.html - blog with code for VAE 
1391 | http://arxiv.org/pdf/1312.6114v10.pdf - Variational Auto Encoder 
1392 | https://jmetzen.github.io/2015-11-27/vae.html - tf code for variational auto-encoder 
1393 | https://www.youtube.com/watch?v=P78QYjWh5sM 
1394 | 
1395 | https://arxiv.org/pdf/1401.4082.pdf - stochastic backpropagation and approx inference - deep mind 
1396 | http://www.cs.toronto.edu/~fritz/absps/colt93.html - keeping neural networks simple by minimizing the description length of the weights - Hinton 
1397 | https://github.com/vivanov879/draw - code 
1398 | 
1399 | 
1400 | ## June 20, 2016 - Peninsula 
1401 | Recurrent models of visual attention - Deep Mind 
1402 | https://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf 
1403 | 
1404 | ## June 23, 29 2016 - SF 
1405 | http://arxiv.org/pdf/1410.5401v2.pdf - Neural Turing Machines - Graves et. al. 
1406 | https://arxiv.org/pdf/1605.06065v1.pdf - One Shot Learning - DeepMind 
1407 | http://www.shortscience.org/paper?bibtexKey=journals/corr/1605.06065 - Larochelle comments on One-Shot paper 
1408 | https://github.com/shawntan/neural-turing-machines - Code 
1409 | https://www.reddit.com/r/MachineLearning/comments/2xcyrl/i_am_j%C3%BCrgen_schmidhuber_ama/cp4ecce - Schmidhuber's comments 
1410 | http://www.thespermwhale.com/jaseweston/ram/papers/paper_16.pdf 
1411 | http://snowedin.net/tmp/Hochreiter2001.pdf - 
1412 | Reviews: 
1413 | http://icml.cc/2016/reviews/839.txt 
1414 | Code 
1415 | https://github.com/brendenlake/omniglot 
1416 | https://github.com/tristandeleu/ntm-one-shot 
1417 | https://github.com/MLWave/extremely-simple-one-shot-learning 
1418 | 
1419 | ## June 13, 2016 - TBD, Peninsula 
1420 | Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning: 
1421 | http://arxiv.org/pdf/1602.07261v1.pdf 
1422 | 
1423 | ## June 9, 2016 - Galvanize 
1424 | Visualizing and Understanding RNN: 
1425 | https://arxiv.org/pdf/1506.02078v2.pdf 
1426 | 
1427 | ## June 6, 2016 - Hacker Dojo 
1428 | Google inception paper - makes heavy use of 1x1 convolution layers 
1429 | http://arxiv.org/pdf/1409.4842v1.pdf 
1430 | 
1431 | ## June 2, May 26, 2016 - Galvanize 
1432 | 
1433 | Image segmentation with deep encoder-decoder 
1434 | 
1435 | https://arxiv.org/pdf/1511.00561.pdf 
1436 | 
1437 | ## May 23, 2016 - Hacker Dojo 
1438 | 
1439 | Compressed networks, reducing flops by pruning 
1440 | 
1441 | https://arxiv.org/pdf/1510.00149.pdf 
1442 | 
1443 | http://arxiv.org/pdf/1602.07360v3.pdf 
1444 | 
1445 | ## May 16, 2016 
1446 | 
1447 | Word2Vec meets LDA: 
1448 | 
1449 | http://arxiv.org/pdf/1605.02019v1.pdf - Paper 
1450 | 
1451 | https://twitter.com/chrisemoody - Chris Moody's twitter with links to slides etc. 
1452 | 
1453 | http://qpleple.com/topic-coherence-to-evaluate-topic-models/ - writeup on topic coherence 
1454 | 
1455 | 
1456 | ## May 9, 2016 
1457 | 
1458 | https://arxiv.org/pdf/1603.05027v2.pdf - Update on microsoft resnet - identity mapping 
1459 | 
1460 | http://gitxiv.com/posts/MwSDm6A4wPG7TcuPZ/recurrent-batch-normalization - batch normalization with 
RNN 
1461 | 
1462 | 
1463 | ## May 2, 2016 
1464 | 
1465 | Go-playing deep RL agent - AlphaGo 
1466 | 
1467 | https://gogameguru.com/i/2016/03/deepmind-mastering-go.pdf 
1468 | 
1469 | https://m.youtube.com/watch?sns=em&v=pgX4JSv4J70 - video of slide presentation on paper 
1470 | 
1471 | https://en.m.wikipedia.org/wiki/List_of_Go_games#Lee.27s_Broken_Ladder_Game - Handling "ladders" in AlphaGo 
1472 | 
1473 | https://en.m.wikipedia.org/wiki/Ladder_(Go) - ladders in go 
1474 | 
1475 | _____________________________________________________________________________________________________________________ 
1476 | ## April 25, 2016 - Microsoft Resnet 
1477 | The Paper 
1478 | 
1479 | http://arxiv.org/pdf/1512.03385v1.pdf 
1480 | 
1481 | References: 
1482 | 
1483 | http://arxiv.org/pdf/1603.05027v2.pdf - Identity mapping paper 
1484 | 
1485 | Code: 
1486 | 
1487 | https://keunwoochoi.wordpress.com/2016/03/09/residual-networks-implementation-on-keras/ - keras code 
1488 | 
1489 | https://github.com/ry/tensorflow-resnet/blob/master/resnet.py - tensorflow code 
1490 | 
1491 | https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/skflow/resnet.py 
1492 | _________________________________________________________________________________________________________________ 
1493 | ## April 18, 2016 - Batch Normalization 
1494 | The Paper 
1495 | https://arxiv.org/abs/1502.03167 
1496 | http://gitxiv.com/posts/MwSDm6A4wPG7TcuPZ/recurrent-batch-normalization - Batch Normalization for RNN 
1497 | 
1498 | 
1499 | ___________________________________________________________________________________________________________ 
1500 | ## April 11, 2016 - Atari Game Playing DQN 
1501 | The Paper 
1502 | https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 
1503 | 
1504 | Related references: 
1505 | 
1506 | This adds 'soft' and 'hard' attention and the 4 frames are replaced with an LSTM layer: 
1507 | 
1508 | http://gitxiv.com/posts/NDepNSCBJtngkbAW6/deep-attention-recurrent-q-network 
1509 | 
1510 | http://home.uchicago.edu/~arij/journalclub/papers/2015_Mnih_et_al.pdf - Nature Paper 
1511 | 
1512 | http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html - videos at the bottom of the page 
1513 | 
1514 | http://llcao.net/cu-deeplearning15/presentation/DeepMindNature-preso-w-David-Silver-RL.pdf - David Silver's slides 
1515 | 
1516 | http://www.cogsci.ucsd.edu/~ajyu/Teaching/Cogs118A_wi09/Class0226/dayan_watkins.pdf 
1517 | 
1518 | http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html - David Silver 
1519 | 
1520 | Implementation Examples: 
1521 | 
1522 | http://stackoverflow.com/questions/35394446/why-doesnt-my-deep-q-network-master-a-simple-gridworld-tensorflow-how-to-ev?rq=1 
1523 | 
1524 | http://www.danielslater.net/2016/03/deep-q-learning-pong-with-tensorflow.html 
1525 | 
1526 | __________________________________________________________________________________________________________ 
1527 | ## March 3, 2016 - Gated Feedback RNN 
1528 | The Paper 
1529 | 
1530 | "Gated RNN" (http://arxiv.org/pdf/1502.02367v4.pdf) 
1531 | 
1532 | -Background Material 
1533 | 
1534 | http://arxiv.org/pdf/1506.00019v4.pdf - Lipton's excellent review of RNN 
1535 | http://www.nehalemlabs.net/prototype/blog/2013/10/10/implementing-a-recurrent-neural-network-in-python/ - Discussion of RNN and theano code for Elman network - Tiago Ramalho 
1536 | http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf - original LSTM paper - Hochreiter and Schmidhuber 
1537 | https://www.youtube.com/watch?v=izGl1YSH_JA - Hinton video on LSTM 
1538 | 
1539 | -Skylar Payne's GF RNN code 
1540 |
https://github.com/skylarbpayne/hdDeepLearningStudy/tree/master/tensorflow 
1541 | 
1542 | -Slides 
1543 | https://docs.google.com/presentation/d/1d2keyJxRlDcD1LTl_zjS3i45xDIh2-QvPWU3Te29TuM/edit?usp=sharing 
1544 | https://github.com/eadsjr/GFRNNs-nest/tree/master/diagrams/diagrams_formula 
1545 | 
1546 | ## Reviews 
1547 | http://www.computervisionblog.com/2016/06/deep-learning-trends-iclr-2016.html 
1548 | https://indico.io/blog/iclr-2016-takeaways/ 
1549 | 
-------------------------------------------------------------------------------- 
/ZeeshanZiaSlides-DeepSupervision3DObjectParsing/DLStudyGroup.pdf: 
-------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mike-bowles/hdDeepLearningStudy/7613fa865640022815c22de5854336b344bf998d/ZeeshanZiaSlides-DeepSupervision3DObjectParsing/DLStudyGroup.pdf 
-------------------------------------------------------------------------------- 
/awspot/README.md: 
-------------------------------------------------------------------------------- 
1 | # Overview 
2 | 
3 | This document describes how to use spot instances on AWS. 
4 | Spot instances deliver savings of almost 80% off the on-demand rate. 
5 | However they can be interrupted, potentially losing data. 
6 | 
7 | 
8 | 
9 | The key to utilizing spot instances is automation, especially around interruption. 
10 | A simple workaround is to use an EBS drive that automounts plus a user data script that fires off your job; this allows you to 
11 | take advantage of cheaper spot instances and train your model for weeks at a time. 
12 | 
13 | Your spot bid price determines how often your instance gets interrupted. 
14 | Set it low and it will get interrupted more often, but you have a firmer handle on price certainty. 
15 | Note: the price you pay is the lower of your bid and the current spot price. 
16 | Setting a bid at the on-demand rate would virtually guarantee never getting interrupted. 
17 | 
18 | Finally, an S3 bucket is recommended for syncing your results. 
19 | 
20 | 
21 | # Solution 
22 | 
23 | There is some wiring required, which is described here. I recommend building a stack that you can re-use 
24 | for your training jobs. 
25 | 
26 | The stack is shown below. This document describes how to build parts of this stack. 
27 | ![CUDA DOCKER AWS](https://www.lucidchart.com/publicSegments/view/b36d7113-4e9a-471c-8cf0-7facf6e17640/image.png) 
28 | 
29 | 
30 | The current version of this is bare bones. Further contributions are welcome. 
31 | 
32 | 
33 | # Prerequisite: 
34 | 
35 | 1) An AMI with Cuda8/Docker/Nvidia-Docker installed 
36 | I've made public the following AMI in the Oregon Region (us-west-2) 
37 | 
38 | ``` 
39 | ami-f266d292 
40 | ``` 
41 | 
42 | 2) Create a volume with a formatted drive (ext4/xfs) in the region and snapshot it. Record its volume id 
43 | 
44 | 
45 | 3) Docker (optional)... 
46 | 
47 | * If you want to use docker, the AMI is ready to go with Nvidia Docker 
48 | https://github.com/NVIDIA/nvidia-docker 
49 | 
50 | * I'd suggest using DockerHub to store containers (it's free unless your code is private) 
51 | Docker Hub 
52 | 
53 | * A suggested Docker container from Waleed that has tensorflow + opencv is here. Note: 
54 | start it with nvidia-docker instead of docker if you want GPU support 
55 | 
56 | 
57 | 
58 | 
59 | 4) Set up an S3 bucket. (optional) 
60 | Nothing special, just to push back models. 
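If you don't need parallel uploads, the stock AWS CLI is often enough. A minimal sketch, assuming the instance role (or configured keys) can write to the bucket; the bucket name here is hypothetical:

```
# Push checkpoints/results from the mounted EBS volume to S3 (bucket name is hypothetical).
aws s3 sync /mnt/data s3://my-training-bucket/results
```

Running this on a timer, or right after each checkpoint dump, keeps results safe across interruptions.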
61 | 
62 | For example use s3_parallel to sync your data 
63 | ``` 
64 | https://github.com/mishudark/s3-parallel-put 
65 | ``` 
66 | 
67 | 
68 | # Kick the Tires 
69 | 
70 | 0) Launch the AMI and login 
71 | 
72 | 1) Run nvidia-smi to check on running processes 
73 | 
74 | ``` 
75 | nvidia-smi 
76 | 
77 | Tue Dec 20 01:25:43 2016 
78 | +-----------------------------------------------------------------------------+ 
79 | | NVIDIA-SMI 367.57 Driver Version: 367.57 | 
80 | |-------------------------------+----------------------+----------------------+ 
81 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 
82 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 
83 | |===============================+======================+======================| 
84 | | 0 Tesla K80 Off | 0000:00:1E.0 Off | 0 | 
85 | | N/A 61C P8 30W / 149W | 0MiB / 11439MiB | 0% Default | 
86 | +-------------------------------+----------------------+----------------------+ 
87 | 
88 | ``` 
89 | 
90 | 
91 | 2) Start docker and check flags are passed through 
92 | 
93 | ``` 
94 | sudo nvidia-docker run -v /mnt/data:/mnt/data -it nvidia/cuda bash -l 
95 | nvcc -V 
96 | ``` 
97 | 
98 | 
99 | # Run a Spot Instance with a persistent request 
100 | 
101 | 0) View spot pricing and region 
102 | 
103 | 1) Generate a user data script (see the user_data.sh template) 
104 | 
105 | 2) Click on AMI -> Spot Request 
106 | 
107 | ``` 
108 | i) GPU types -> pick a GPU instance (ex. p2.xlarge) 
109 | ii) Configure Instance Details -> Spot, 
110 | BidPrice: x.yy 
111 | Persistent Request 
112 | Network: default 
113 | IAMRole: admin 
114 | Launch EBS Optimized 
115 | Under Advanced Details -> pick the user_data.sh you've custom modified 
116 | iii) Add Storage -> Accept Defaults 
117 | iv) Tags -> Your user name and task name are useful 
118 | v) Select "Review And Launch". 
119 | ``` 
120 | 
121 | 3) Login and view logs 
122 | 
123 | * the AWS startup log 
124 | /var/log/cloud-init-output.log 
125 | 
126 | * Docker running 
127 | ``` 
128 | sudo docker ps 
129 | sudo docker logs 
130 | ``` 
131 | 
132 | * The /mnt/data folder where you should be dumping results. 
133 | ``` 
134 | df -h 
135 | find /mnt/data/ 
136 | ``` 
137 | 
138 | 
139 | # FAQ 
140 | 
141 | 1) For tensorflow, or any long running job, how do I not lose my training on interruption? 
142 | 
143 | Use the saver object as described here: 
144 | https://www.tensorflow.org/how_tos/variables/ 
145 | 
146 | Set up your scripts to routinely dump checkpoints tagged with a step id (use UTC time), every 30 minutes or so. 
147 | Then on restart the latest checkpoint is picked up. 
148 | 
149 | Note, you will lose some training time, but assuming the instance stays up for 8 hours, losing at most 30 minutes is acceptable. 
150 | 
151 | For more durability, upload to S3 in case the EBS volume fails (rare, but you could lose all your data) 
-------------------------------------------------------------------------------- 
/awspot/user_data.sh: 
-------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | # This is submitted via the ec2 configure instance page, additional options. Note: the output on the instance goes to /var/log/cloud-init-output.log 
3 | # Fill in the following after you've cloned from our dummy snapshot. 
4 | # This will run as the root account 
5 | # Designed to run with p2.xlarge (Nvidia K80, 12 GB card, good for running imagenet, inception, etc.) 
6 | 
7 | 
8 | # 1. Configure all these for your specific case. Docker is optional. Only us-west-2 and us-east-1 have p2 instances. 
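# For reference, a hypothetical filled-in example (none of these values are real; substitute your own):
#   TASK_PATH=/mnt/data/scripts/train.sh
#   DOCKER_IMAGE=mydockerhubuser/tf-gpu:latest
#   VOLUME_ID=vol-0123456789abcdef0
#   REGION=us-west-2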
9 | # Your image should be based on nvidia/cuda, which is installed 
10 | TASK_PATH= 
11 | DOCKER_IMAGE= 
12 | VOLUME_ID= 
13 | AWS_ACCESS_KEY_ID= 
14 | AWS_SECRET_ACCESS_KEY= 
15 | REGION=us-west-2 
16 | 
17 | # 2. This will attach the volume and resize it 
18 | INSTANCE_ID=`ec2metadata --instance-id` 
19 | aws --region=${REGION} ec2 attach-volume --instance-id ${INSTANCE_ID} --volume-id ${VOLUME_ID} --device /dev/sdb 
20 | aws --region=${REGION} ec2 describe-volumes --volume-ids ${VOLUME_ID} 
21 | aws --region=${REGION} ec2 describe-volumes --volume-ids ${VOLUME_ID} | grep '"State": "attached"' 
22 | until aws --region=${REGION} ec2 describe-volumes --volume-ids ${VOLUME_ID} | grep '"State": "attached"'; do 
23 | sleep 1 
24 | done 
25 | sudo resize2fs /dev/xvdb 
26 | 
27 | # 3. mount the volume 
28 | mkdir -p /mnt/data && chown -R ubuntu:ubuntu /mnt/data 
29 | mount /dev/xvdb /mnt/data 
30 | mkdir -p /var/log/mylogs 
31 | 
32 | 
33 | # 4. Now run the task; it could be on the AMI, or you could download a package or git repo or anything you want 
34 | # Here we use docker. 
35 | # Pull the docker image. Mount the EBS drive into docker. Start the task on nvidia docker. 
36 | docker pull ${DOCKER_IMAGE} 
37 | nvidia-docker run -v /mnt/data:/mnt/data -i ${DOCKER_IMAGE} bash -c "${TASK_PATH}" > /var/log/mylogs/docker-fractal.log 2>&1 
38 | 
-------------------------------------------------------------------------------- 
/gatedRNN.ipynb: 
-------------------------------------------------------------------------------- 
1 | { 
2 | "cells": [ 
3 | { 
4 | "cell_type": "markdown", 
5 | "metadata": {}, 
6 | "source": [ 
7 | "# Gated Feedback RNN - Notes\n", 
8 | "\n", 
9 | "Paper's notation\n", 
10 | "input is $x_t$\n", 
11 | "\n", 
12 | "\n", 
13 | "\n", 
14 | "\n", 
15 | "Zaremba (2014) variant of LSTM\n", 
16 | "$$\\tilde{c_t} = tanh(W_c x_t + U_c h_{t-1})\\,-\\,new\\,content$$\n", 
17 | "$$c_t = f_tc_{t-1} + i_t \\tilde{c_t}\\,-\\, content\\,of\\,memory\\,cell$$\n", 
18 | "$$i_t = \\sigma(W_ix_t+U_ih_{t-1})\\,-\\,input\\,gate$$\n", 
19 | "$$f_t = \\sigma(W_fx_t+U_fh_{t-1})\\,-\\,forget\\,gate$$\n", 
20 | "$$h_t = o_ttanh(c_t)\\,-\\,hidden \\, state$$\n", 
21 | "$$o_t = \\sigma(W_ox_t + U_oh_{t-1})\\,-\\,output\\,gate$$\n", 
22 | "\n", 
23 | "Modification for Gated Feedback LSTM\n", 
24 | "$$\\tilde{c_t^j} = tanh(W_c^{j-1\\rightarrow j}h_t^{j-1} + \\sum_{i=1}^L g^{i\\rightarrow j}U_c^{i\\rightarrow j}h_{t-1}^i)$$\n", 
25 | "\n", 
26 | "\n", 
27 | "Gated Recurrent Unit, Cho (2014)\n", 
28 | "$$h_t = (1-z_t)h_{t-1}+z_t \\tilde{h_t}\\,-\\,GRU\\, state$$\n", 
29 | "$$z_t = \\sigma(W_zx_t+U_zh_{t-1})\\,-\\, update\\,gate$$\n", 
30 | "$$\\tilde{h_t} = tanh(Wx_t + r_tUh_{t-1})\\,-\\,new\\,memory\\,content$$\n", 
31 | "$$r_t = \\sigma(W_rx_t + U_rh_{t-1})\\,-\\,reset\\,gate$$\n", 
32 | "\n", 
33 | "Modification for Gated Feedback GRU\n", 
34 | "$$\\tilde{h_t^j} = tanh(W^{j-1\\rightarrow j}h_t^{j-1} + r_t^j \\odot \\sum_{i=1}^L g^{i\\rightarrow j}U^{i\\rightarrow j}h_{t-1}^i)$$\n", 
35 | "\n", 
36 | "\n", 
37 | "\n", 
38 | "\n", 
39 | "## Elman net\n", 
40 | "\n", 
41 | "\n", 
42 | "$$h_t = tanh(W_hx_t+U_hh_{t-1})\\,-\\,hidden \\, state$$\n", 
43 | "\n", 
44 | "2-Layer Elman-RNN\n", 
45 | "$$h_t^1 = tanh(W_h^1x_t+U_h^1h_{t-1}^1)\\,-\\,hidden \\, state\\,1$$\n", 
46 | "$$h_t^2 = tanh(W_h^2h_t^1+U_h^2h_{t-1}^2)\\,-\\,hidden \\, state\\,2$$\n", 
47 | "\n", 
48 | "2-Layer gated feedback RNN - GF-RNN = 2-layer Elman-RNN plus\n", 
49 | "$$g^{1\\rightarrow2} = \\sigma(W_g^{1\\rightarrow2}h_t^1 + U_g^{1\\rightarrow2}\\left[\\begin{array}{c}\n", 
50 | "h_{t-1}^1\\\\\n", 
51 | "h_{t-1}^2\\end{array}\\right])$$\n", 
52 |
"$$g^{2\\rightarrow1} = \\sigma(W_g^{2\\rightarrow1}x_t + U_g^{2\\rightarrow1}\\left[\\begin{array}{c}\n", 53 | "h_{t-1}^1\\\\\n", 54 | "h_{t-1}^2\\end{array}\\right])$$\n", 55 | "\n", 56 | "Assuming that\n", 57 | "$$h_{t-1}^*=\\left[\\begin{array}{c}\n", 58 | "h_{t-1}^1\\\\\n", 59 | "h_{t-1}^2\\end{array}\\right]$$\n", 60 | "\n", 61 | "Layer-by-layer state - \n", 62 | "$$h_t^j = tanh(W^{j-1\\rightarrow j}h_t^{j-1} + \\sum_{i=1}^L g^{i\\rightarrow j}U^{i\\rightarrow j}h_{t-1}^i)$$\n", 63 | "\n", 64 | "For 2-layer gated rf Elman - \n", 65 | "$$h_t^1 = tanh(W^{0\\rightarrow 1}x_t + g^{1\\rightarrow 1}U^{1\\rightarrow 1}h_{t-1}^1 + g^{2\\rightarrow 1}U^{2\\rightarrow 1}h_{t-1}^2)$$\n", 66 | "$$h_t^2 = tanh(W^{1\\rightarrow 2}h_t^1 + g^{1\\rightarrow 2}U^{1\\rightarrow 2}h_{t-1}^1 + g^{2\\rightarrow 2}U^{2\\rightarrow 2}h_{t-1}^2)$$\n", 67 | "\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.11" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /tensorflow/Gated Feedback Recurrent Neural Network - GF LSTM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright 2016 Google Inc. All Rights Reserved.\n", 8 | "\n", 9 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 10 | "you may not use this file except in compliance with the License.\n", 11 | "You may obtain a copy of the License at\n", 12 | "\n", 13 | " http://www.apache.org/licenses/LICENSE-2.0\n", 14 | "\n", 15 | "Unless required by applicable law or agreed to in writing, software\n", 16 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 17 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 18 | "See the License for the specific language governing permissions and\n", 19 | "limitations under the License." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Gated Feedback Recurrent Neural Network\n", 27 | "\n", 28 | "This notebook contains a TensorFlow (http://www.tensorflow.org) implementation of the Gated Feedback Recurrent Neural Network (the LSTM variant) from this paper: http://arxiv.org/pdf/1502.02367v4.pdf" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "import tensorflow as tf\n", 40 | "from tensorflow.models.rnn.ptb import reader\n", 41 | "import numpy as np\n", 42 | "\n", 43 | "train_data, valid_data, test_data, vocab = reader.ptb_raw_data('simple-examples/data/')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# Hyperparameters\n", 55 | "batch_size = 20\n", 56 | "num_steps = 20\n", 57 | "hidden_size = 200\n", 58 | "emb_size = 200 # Note: this is kind of a cheat. The layer-input weights W* are reused at every layer, so this will *not* work if emb_size != hidden_size\n", 59 | "vocab_size = 10000\n", 60 | "epochs = 2\n", 61 | "init_scale = 0.1\n", 62 | "num_hidden_layers = 1\n", 63 | "\n", 64 | "lr = tf.placeholder(tf.float32, [])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "## Build Model\n", 76 | "session = tf.Session()\n", 77 | "\n", 78 | "X = tf.placeholder(tf.int32, [batch_size, num_steps])\n", 79 | "targets = tf.placeholder(tf.int64, [batch_size, num_steps])\n", 80 | "\n", 81 | "embedding = tf.Variable(\n", 82 | "    tf.random_uniform([vocab_size, emb_size], minval=-init_scale, maxval=init_scale),\n", 83 | "    name=\"embedding\")\n", 84 | "\n", 85 | "# For input gate.\n", 86 | "Wi = [tf.Variable(\n", 87 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 88 | "    name=\"Wi_%d\" % i) for i in range(num_hidden_layers)]\n", 89 | "Ui = [tf.Variable(\n", 90 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 91 | "    name=\"Ui_%d\" % i) for i in range(num_hidden_layers)]\n", 92 | "\n", 93 | "# For forget gate.\n", 94 | "Wf = [tf.Variable(\n", 95 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 96 | "    name=\"Wf_%d\" % i) for i in range(num_hidden_layers)]\n", 97 | "Uf = [tf.Variable(\n", 98 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 99 | "    name=\"Uf_%d\" % i) for i in range(num_hidden_layers)]\n", 100 | "\n", 101 | "# For content -- Quick note: there's no transformation from content -> state. 
They are both\n", 102 | "# the same size.\n", 103 | "Wc = [tf.Variable(\n", 104 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 105 | "    name=\"Wc_%d\" % i) for i in range(num_hidden_layers)]\n", 106 | "Uc = [tf.Variable(\n", 107 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 108 | "    name=\"Uc_%d\" % i) for i in range(num_hidden_layers)]\n", 109 | "\n", 110 | "# For hidden state output gate.\n", 111 | "Wo = [tf.Variable(\n", 112 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 113 | "    name=\"Wo_%d\" % i) for i in range(num_hidden_layers)]\n", 114 | "Uo = [tf.Variable(\n", 115 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 116 | "    name=\"Uo_%d\" % i) for i in range(num_hidden_layers)]\n", 117 | "\n", 118 | "# For gated feedback gates (i.e. the contribution of the paper). Each g^{i->j} is a scalar, with parameters indexed by source layer i and shared across target layers.\n", 119 | "Wg = [tf.Variable(\n", 120 | "    tf.random_uniform([emb_size, 1], minval=-init_scale, maxval=init_scale),\n", 121 | "    name=\"Wg_%d\" % i) for i in range(num_hidden_layers)]\n", 122 | "Ug = [tf.Variable(\n", 123 | "    tf.random_uniform([hidden_size * num_hidden_layers, 1], minval=-init_scale, maxval=init_scale),\n", 124 | "    name=\"Ug_%d\" % i) for i in range(num_hidden_layers)]\n", 125 | "\n", 126 | "# For output.\n", 127 | "output_weights = tf.Variable(\n", 128 | "    tf.random_uniform([hidden_size, vocab_size], minval=-init_scale, maxval=init_scale),\n", 129 | "    name=\"output_weights\")\n", 130 | "output_bias = tf.Variable(tf.zeros([vocab_size]), name=\"output_bias\")\n", 131 | "\n", 132 | "X_in = tf.nn.embedding_lookup(embedding, X)\n", 133 | "\n", 134 | "initial_state = tf.zeros([batch_size, hidden_size])\n", 135 | "content = [initial_state] * num_hidden_layers  # one memory cell per layer\n", 136 | "state = [initial_state] * num_hidden_layers\n", 137 | "prev_concat_h = tf.zeros([batch_size, hidden_size * num_hidden_layers])\n", 138 | "loss = tf.zeros([])\n", 139 | "# prev_concat_h holds h_{t-1}^*: the previous hidden states of all layers, concatenated.\n", 140 | "for time_step in range(num_steps):\n", 141 | "    h_prev = X_in[:, time_step, :]\n", 142 | "    prev_state = list(state)  # snapshot h_{t-1}^i before any layer is updated\n", 143 | "    for layer in range(num_hidden_layers):\n", 144 | "        input_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wi[layer]) + tf.matmul(prev_state[layer], Ui[layer]))\n", 145 | "        forget_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wf[layer]) + tf.matmul(prev_state[layer], Uf[layer]))\n", 146 | "        output_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wo[layer]) + tf.matmul(prev_state[layer], Uo[layer]))\n", 147 | "        # Main contribution of paper: global reset gates g^{i->j}, recomputed for each layer j because h_prev (= h_t^{j-1}) changes.\n", 148 | "        gates = [tf.sigmoid(tf.matmul(h_prev, Wg[i]) + tf.matmul(prev_concat_h, Ug[i])) for i in range(num_hidden_layers)]\n", 149 | "        gated_prev_timestep = [gates[i] * tf.matmul(prev_state[i], Uc[i]) for i in range(num_hidden_layers)]\n", 150 | "        new_content = tf.nn.tanh(tf.matmul(h_prev, Wc[layer]) + tf.add_n(gated_prev_timestep))\n", 151 | "        \n", 152 | "        content[layer] = tf.mul(forget_gate, content[layer]) + tf.mul(input_gate, new_content)\n", 153 | "        state[layer] = tf.mul(output_gate, tf.nn.tanh(content[layer]))\n", 154 | "        h_prev = state[layer]  # this layer's output is the next layer's input\n", 155 | "    logits = tf.nn.bias_add(tf.matmul(state[num_hidden_layers-1], output_weights), output_bias)\n", 156 | "    step_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets[:, time_step])\n", 157 | "    loss += tf.reduce_sum(step_loss)\n", 158 | "    prev_concat_h = tf.concat(1, state)\n", 159 | "\n", 160 | "final_state = state\n", 161 | "cost = loss / batch_size\n", 162 | "\n", 163 | "tf.scalar_summary(\"cost\", cost)\n", 164 | "merged = tf.merge_all_summaries()\n", 165 | "writer = 
tf.train.SummaryWriter(\"summaries/gfrnn\", session.graph_def)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "Epoch 0\n", 180 | "1000 1536.86554276\n", 181 | "2000 1075.77593235\n", 182 | "3000 845.104395239\n", 183 | "4000 718.76732411\n", 184 | "5000 641.0083012\n", 185 | "6000 593.578288039\n", 186 | "7000 552.39057359\n", 187 | "8000 518.773597082\n", 188 | "9000 490.011530283\n", 189 | "10000 469.79929651\n", 190 | "11000 445.811142429\n", 191 | "12000 427.981149944\n", 192 | "13000 413.231142047\n", 193 | "14000 399.589796646\n", 194 | "15000 387.374589524\n", 195 | "16000 375.535316084\n", 196 | "17000 364.656415065\n", 197 | "18000 357.716949046\n", 198 | "19000 350.007875462\n", 199 | "20000 340.5189465\n", 200 | "21000 334.729726814\n", 201 | "22000 328.91687226\n", 202 | "23000 323.211649227\n", 203 | "24000 315.498545393\n", 204 | "25000 309.774060383\n", 205 | "26000 303.523051557\n", 206 | "27000 297.389077065\n", 207 | "28000 292.475912875\n", 208 | "29000 287.393987197\n", 209 | "30000 283.363092117\n", 210 | "31000 278.863735412\n", 211 | "32000 275.836836503\n", 212 | "33000 272.480957052\n", 213 | "34000 269.913912177\n", 214 | "35000 266.354806615\n", 215 | "36000 263.881540787\n", 216 | "37000 260.255694866\n", 217 | "38000 255.84195618\n", 218 | "39000 253.142906497\n", 219 | "40000 250.839347938\n", 220 | "41000 247.700747125\n", 221 | "42000 244.609508041\n", 222 | "43000 241.109089536\n", 223 | "44000 238.690587697\n", 224 | "45000 236.178669209\n", 225 | "46000 234.919347477\n", 226 | "Epoch 1\n", 227 | "47000 233.529050243\n", 228 | "48000 231.346767272\n", 229 | "49000 228.853817309\n", 230 | "50000 225.70174362\n", 231 | "51000 223.120431433\n", 232 | "52000 221.306655681\n", 233 | "53000 219.796232818\n", 234 | "54000 218.247005976\n", 235 | "55000 216.268173971\n", 236 | "56000 214.857019061\n", 237 | "57000 212.859998202\n", 238 | "58000 210.188553206\n", 239 | "59000 209.079580721\n", 240 | "60000 207.045767747\n", 241 | "61000 205.196654393\n", 242 | "62000 203.450599296\n", 243 | "63000 201.735644431\n", 244 | "64000 200.189708323\n", 245 | "65000 198.788259576\n", 246 | "66000 197.335555278\n", 247 | "67000 195.543393851\n", 248 | "68000 194.538999587\n", 249 | "69000 193.175939226\n", 250 | "70000 191.430713621\n", 251 | "71000 189.920810466\n", 252 | "72000 188.439249937\n", 253 | "73000 186.775430173\n", 254 | "74000 185.052030742\n", 255 | "75000 183.768886451\n", 256 | "76000 182.437888552\n", 257 | "77000 181.030238461\n", 258 | "78000 179.976427611\n", 259 | "79000 178.762464736\n", 260 | "80000 177.910351761\n", 261 | "81000 176.838355055\n", 262 | "82000 175.814496864\n", 263 | "83000 174.593081319\n", 264 | "84000 173.129086523\n", 265 | "85000 171.620987287\n", 266 | "86000 170.677194178\n", 267 | "87000 169.506480399\n", 268 | "88000 168.378289044\n", 269 | "89000 166.883673974\n", 270 | "90000 165.782215237\n", 271 | "91000 164.632383932\n", 272 | "92000 163.851626382\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "# Train Model\n", 278 | "session.run(tf.initialize_all_variables())\n", 279 | "sgd = tf.train.GradientDescentOptimizer(lr).minimize(cost)\n", 280 | "costs = 0.0\n", 281 | "iters = 0\n", 282 | "for i in range(epochs):\n", 283 | " print 'Epoch', i\n", 284 | " for step, (x, y) in enumerate(reader.ptb_iterator(train_data, batch_size, 
num_steps)):\n", 285 | "        result, step_cost, _ = session.run([merged, cost, sgd],\n", 286 | "                                            {X: x, targets: y, lr: 1.0 / (i + 1)})\n", 287 | "        costs += step_cost\n", 288 | "        iters += num_steps\n", 289 | "        if iters % 1000 == 0:\n", 290 | "            print iters, np.exp(costs / iters)\n", 291 | "            writer.add_summary(result, iters)\n", 292 | "            writer.flush()" 293 | ] 294 | } 295 | ], 296 | "metadata": { 297 | "kernelspec": { 298 | "display_name": "Python 2", 299 | "language": "python", 300 | "name": "python2" 301 | }, 302 | "language_info": { 303 | "codemirror_mode": { 304 | "name": "ipython", 305 | "version": 2 306 | }, 307 | "file_extension": ".py", 308 | "mimetype": "text/x-python", 309 | "name": "python", 310 | "nbconvert_exporter": "python", 311 | "pygments_lexer": "ipython2", 312 | "version": "2.7.10" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 0 317 | } 318 | --------------------------------------------------------------------------------
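
The notes in gatedRNN.ipynb end with an empty code cell. For concreteness, here is a minimal Python 3 / NumPy sketch of one time step of the 2-layer gated-feedback Elman net defined by the equations in that notebook. Everything in it (the name gf_elman_step, the parameter dict p with keys W01, W12, U, Wg, Ug, and the toy sizes) is invented for this illustration and is not part of the repository.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gf_elman_step(x_t, h1_prev, h2_prev, p):
    # h_{t-1}^* : the previous hidden states of both layers, concatenated.
    h_star = np.concatenate([h1_prev, h2_prev])

    # Global reset gates targeting layer 1 are driven by the input x_t
    # (the "layer below" layer 1), as in the g^{2->1} equation in the notes.
    g11 = sigmoid(p['Wg'][(1, 1)] @ x_t + p['Ug'][(1, 1)] @ h_star)
    g21 = sigmoid(p['Wg'][(2, 1)] @ x_t + p['Ug'][(2, 1)] @ h_star)

    # h_t^1 = tanh(W^{0->1} x_t + g^{1->1} U^{1->1} h_{t-1}^1 + g^{2->1} U^{2->1} h_{t-1}^2)
    h1 = np.tanh(p['W01'] @ x_t
                 + g11 * (p['U'][(1, 1)] @ h1_prev)
                 + g21 * (p['U'][(2, 1)] @ h2_prev))

    # Gates targeting layer 2 are driven by the fresh h_t^1, as in g^{1->2}.
    g12 = sigmoid(p['Wg'][(1, 2)] @ h1 + p['Ug'][(1, 2)] @ h_star)
    g22 = sigmoid(p['Wg'][(2, 2)] @ h1 + p['Ug'][(2, 2)] @ h_star)

    # h_t^2 = tanh(W^{1->2} h_t^1 + g^{1->2} U^{1->2} h_{t-1}^1 + g^{2->2} U^{2->2} h_{t-1}^2)
    h2 = np.tanh(p['W12'] @ h1
                 + g12 * (p['U'][(1, 2)] @ h1_prev)
                 + g22 * (p['U'][(2, 2)] @ h2_prev))
    return h1, h2

# Toy usage: random weights, zero initial state, a 5-step input sequence.
rng = np.random.RandomState(0)
n_in, n_h = 4, 8
p = {
    'W01': rng.normal(scale=0.1, size=(n_h, n_in)),
    'W12': rng.normal(scale=0.1, size=(n_h, n_h)),
    'U': {(i, j): rng.normal(scale=0.1, size=(n_h, n_h)) for i in (1, 2) for j in (1, 2)},
    # Gate weights are vectors, so each g^{i->j} is a scalar per example,
    # matching the [*, 1]-shaped Wg/Ug variables in the TensorFlow notebook.
    'Wg': {(1, 1): rng.normal(scale=0.1, size=n_in), (2, 1): rng.normal(scale=0.1, size=n_in),
           (1, 2): rng.normal(scale=0.1, size=n_h), (2, 2): rng.normal(scale=0.1, size=n_h)},
    'Ug': {(i, j): rng.normal(scale=0.1, size=2 * n_h) for i in (1, 2) for j in (1, 2)},
}

h1, h2 = np.zeros(n_h), np.zeros(n_h)
for x_t in rng.normal(size=(5, n_in)):
    h1, h2 = gf_elman_step(x_t, h1, h2, p)
print(h2)

Unlike the TensorFlow notebook, which shares the gate parameters across target layers, this sketch keeps a separate (Wg, Ug) pair per (source, target) combination, which is closer to the paper's formulation; with one hidden layer the two choices coincide.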