├── 2LayerLSTM
│   ├── lstmClassOld.py
│   └── stockTT.bin
├── DavesReservoirComputingRefs
├── GRU
│   ├── GRU.py
│   └── stockTT.bin
├── ICLR Top paper thumbnail descriptions.
├── LICENSE
├── PapersSuggested_not_read
├── README.md
├── ZeeshanZiaSlides-DeepSupervision3DObjectParsing
│   └── DLStudyGroup.pdf
├── awspot
│   ├── README.md
│   └── user_data.sh
├── gatedRNN.ipynb
└── tensorflow
    └── Gated Feedback Recurrent Neural Network - GF LSTM.ipynb
/2LayerLSTM/lstmClassOld.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | import random 5 | import matplotlib.pyplot as plt 6 | import cPickle as pickle 7 | from math import sqrt 8 | #from lstmClass import LstmLayer, recurrent_fn 9 | 10 | '''Define LSTM class for a single-layer LSTM 11 | The objective is to define it in such a way as to facilitate construction of a multi-layer 12 | LSTM. The questions are: 13 | 1. The cost function can't go inside the class because it may only be associated with the last 14 | layer in the stack. 15 | 2. Should the scan function be inside or outside of the class? 16 | 3. How about the gradient calculations? It seems like those need to be outside the class. 17 | Does that mean the gradient calculations have to be outside the class statement? 18 | 4. Perhaps the scan function and the single-layer recurrence function need to be inside 19 | the class statement, but the cost function goes outside. 20 | 5. Then the gradient calculation might only need a list of the parameters against which 21 | the cost needs to be diff'd. That would just be the list of lstm-layer objects dotted with 22 | the parameter list for each one. 23 | 6. Not clear how the gradient of the scan function may interact with Python OOP. Not sure if scan 24 | output includes enough for the gradient calc. Perhaps scan should be external to the class structure. 25 | Plan A:
26 | class RNN 27 | 28 | 29 | ''' 30 | 31 | class LstmLayer(object): 32 | 33 | def __init__(self, n_in, n_hidden, n_out, name): 34 | self.name = name 35 | rng = np.random.RandomState(1234) 36 | #cell input 37 | self.W_ug = np.asarray(rng.normal(size=(n_in, n_hidden), scale= .01, loc = 0.0), dtype = theano.config.floatX) 38 | self.W_hg = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale=.01, loc = 0.0), dtype = theano.config.floatX) 39 | self.b_g = np.zeros((n_hidden,), dtype=theano.config.floatX) 40 | #input gate equation 41 | self.W_ui = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 42 | self.W_hi = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 43 | self.b_i = np.zeros((n_hidden,), dtype=theano.config.floatX) 44 | #forget gate equations 45 | self.W_uf = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 46 | self.W_hf = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 47 | self.b_f = np.zeros((n_hidden,), dtype=theano.config.floatX) 48 | #cell output gate equations 49 | self.W_uo = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 50 | self.W_ho = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 51 | self.b_o = np.zeros((n_hidden,), dtype=theano.config.floatX) 52 | #output layer 53 | self.W_hy = np.asarray(rng.normal(size=(n_hidden, n_out), scale =.01, loc=0.0), dtype = theano.config.floatX) 54 | self.b_hy = np.zeros((n_out,), dtype=theano.config.floatX) 55 | 56 | #cell input 57 | self.W_ug = theano.shared(self.W_ug, 'W_ug' + self.name) 58 | self.W_hg = theano.shared(self.W_hg, 'W_hg' + self.name) 59 | self.b_g = theano.shared(self.b_g, 'b_g' + self.name) 60 | #input gate equation 61 | self.W_ui = theano.shared(self.W_ui, 'W_ui' + self.name) 62 | self.W_hi = theano.shared(self.W_hi, 'W_hi' + self.name) 63 | self.b_i = theano.shared(self.b_i, 'b_i' + self.name) 64 | #forget gate equations 65 | self.W_uf = theano.shared(self.W_uf, 'W_uf' + self.name) 66 | self.W_hf = theano.shared(self.W_hf, 'W_hf' + self.name) 67 | self.b_f = theano.shared(self.b_f, 'b_f' + self.name) 68 | #cell output gate equations 69 | self.W_uo = theano.shared(self.W_uo, 'W_uo' + self.name) 70 | self.W_ho = theano.shared(self.W_ho, 'W_ho' + self.name) 71 | self.b_o = theano.shared(self.b_o, 'b_o' + self.name) 72 | #output layer 73 | self.W_hy = theano.shared(self.W_hy, 'W_hy' + self.name) 74 | self.b_hy = theano.shared(self.b_hy, 'b_hy' + self.name) 75 | 76 | self.h0_tm1 = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX)) 77 | self.s0_tm1 = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX)) 78 | self.argList = [self.W_ug, self.W_hg, self.b_g, self.W_ui, self.W_hi, 79 | self.b_i, self.W_uf, self.W_hf, self.b_f, self.W_uo, self.W_ho, self.b_o, self.W_hy, self.b_hy] 80 | 81 | def recurrent_fn(u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 82 | W_uo, W_ho, b_o, W_hy, b_hy): 83 | g_t = T.tanh(T.dot(u_t, W_ug) + T.dot(h_tm1, W_hg) + b_g) 84 | i_t = T.nnet.sigmoid(T.dot(u_t, W_ui) + T.dot(h_tm1, W_hi) + b_i) 85 | f_t = T.nnet.sigmoid(T.dot(u_t, W_uf) + T.dot(h_tm1, W_hf) + b_f) 86 | o_t = T.nnet.sigmoid(T.dot(u_t, W_uo) + T.dot(h_tm1, W_ho) + b_o) 87 | s_t = g_t * i_t + s_tm1*f_t 88 | h_t = T.tanh(s_t)*o_t 89 | #h_t = self.activ(T.dot(h_tm1, W_hh) + T.dot(u_t, W_uh) + b_hh) 90 | 
return [h_t, s_t] 91 | 92 | def fcn2(u_t, h_tm1, s_tm1,h_tm12, s_tm12, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 93 | W_uo, W_ho, b_o, W_hy, b_hy, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 94 | W_uo2, W_ho2, b_o2, W_hy2, b_hy2): 95 | [h_t, s_t] = recurrent_fn(u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 96 | W_uo, W_ho, b_o, W_hy, b_hy) 97 | o1 = T.dot(h_t, W_hy) + b_hy #project layer 1's current hidden state to form layer 2's input 98 | [h_t2, s_t2] = recurrent_fn(o1, h_tm12, s_tm12, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 99 | W_uo2, W_ho2, b_o2, W_hy2, b_hy2) 100 | return [h_t, s_t, h_t2, s_t2] 101 | 102 | 103 | #use LstmLayer class to define algebra of lstm and build stack and gradient calculation 104 | 105 | #one layer lstm stack for stock price prediction 106 | # u = T.matrix() 107 | # t = T.scalar() 108 | # l1 = LstmLayer(n_in=5, n_hidden=10, n_out=1, name='l1') 109 | 110 | #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 111 | #define 112 | # [l1.h, l1.s], _ = theano.scan(recurrent_fn, sequences = u, 113 | # outputs_info = [l1.h0_tm1, l1.s0_tm1], 114 | # non_sequences = l1.argList) 115 | # y = T.dot(l1.h[-1], l1.W_hy) + l1.b_hy 116 | # cost = ((t - y)**2).mean(axis=0).sum() 117 | # grad = T.grad(cost, l1.argList) 118 | # lr = T.scalar() 119 | # update = [(a, a-lr*b) for (a,b) in zip(l1.argList, grad)] 120 | # 121 | # train_step = theano.function([u, t, lr], cost, 122 | # on_unused_input='warn', 123 | # updates=update, 124 | # allow_input_downcast=True) 125 | 126 | #two layer lstm stack for stock price prediction 127 | u = T.matrix() 128 | t = T.scalar() 129 | o1 = T.matrix() 130 | l1 = LstmLayer(n_in=5, n_hidden=10, n_out=10, name='l1') 131 | l2 = LstmLayer(n_in=10, n_hidden=10, n_out=1, name='l2') 132 | #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 133 | #define 134 | [l1.h, l1.s, l2.h, l2.s], _ = theano.scan(fcn2, sequences = u, 135 | outputs_info = [l1.h0_tm1, l1.s0_tm1, l2.h0_tm1, l2.s0_tm1], 136 | non_sequences = l1.argList + l2.argList) 137 | # non_sequences = l1.argList + l2.argList, mode='DebugMode') 138 | 139 | 140 | 141 | y = T.dot(l2.h[-1], l2.W_hy) + l2.b_hy 142 | cost = ((t - y)**2).mean(axis=0).sum() 143 | grad = T.grad(cost, l1.argList + l2.argList) 144 | lr = T.scalar() 145 | update = [(a, a-lr*b) for (a,b) in zip(l1.argList + l2.argList, grad)] 146 | 147 | train_step = theano.function([u, t, lr], cost, 148 | on_unused_input='warn', 149 | updates=update, 150 | allow_input_downcast=True) 151 | # allow_input_downcast=True, mode='DebugMode') 152 | 153 | 154 | if __name__ == '__main__': 155 | 156 | (xlist, ylist) = pickle.load(open('stockTT.bin', 'rb')) 157 | nInputs = len(xlist[0]) 158 | x = np.array(xlist, dtype = theano.config.floatX) 159 | y = np.array(ylist, dtype = theano.config.floatX) 160 | print "Std Dev of Price Change", np.std(y) 161 | nHidden = 20 162 | nOutputs = 1 163 | lr = 0.01 164 | eSmooth = 1.0 165 | nPasses = 1 166 | vals = [] 167 | errSq = [] 168 | for i in range(nPasses): 169 | for j in range(len(x)): 170 | u = np.asarray(xlist[j], dtype = theano.config.floatX).reshape((1,nInputs)) 171 | t = y[j] 172 | 173 | c = train_step(u, t, lr) 174 | if j%10==0: print "iteration {0}: {1}".format(j, np.sqrt(c)) 175 | eSmooth = 0.1*np.sqrt(c) + 0.9*eSmooth 176 | vals.append(eSmooth) 177 | errSq.append(c) 178 | print 'RMS Pred Error', sqrt(np.average(errSq[500:])) 179 | plt.plot(vals) 180 | plt.show() 181 | 182 | 183 | 184 |
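For reference, the single-layer step that recurrent_fn computes above can be written as a minimal NumPy sketch (an illustration only: forward pass, no Theano and no gradients; the toy dimensions n_in=5, n_hidden=10 are assumptions chosen to match the l1 layer above):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(u_t, h_tm1, s_tm1, p):
    #p holds the same weights as LstmLayer.argList, keyed by name
    g_t = np.tanh(np.dot(u_t, p['W_ug']) + np.dot(h_tm1, p['W_hg']) + p['b_g'])  #cell input
    i_t = sigmoid(np.dot(u_t, p['W_ui']) + np.dot(h_tm1, p['W_hi']) + p['b_i'])  #input gate
    f_t = sigmoid(np.dot(u_t, p['W_uf']) + np.dot(h_tm1, p['W_hf']) + p['b_f'])  #forget gate
    o_t = sigmoid(np.dot(u_t, p['W_uo']) + np.dot(h_tm1, p['W_ho']) + p['b_o'])  #output gate
    s_t = g_t * i_t + s_tm1 * f_t   #cell state: gated new input plus gated old state
    h_t = np.tanh(s_t) * o_t        #hidden state: squashed cell state, gated for output
    return h_t, s_t

#toy usage with the assumed dimensions
rng = np.random.RandomState(1234)
p = {w: rng.normal(scale=0.01, size=(5, 10)) for w in ('W_ug', 'W_ui', 'W_uf', 'W_uo')}
p.update({w: rng.normal(scale=0.01, size=(10, 10)) for w in ('W_hg', 'W_hi', 'W_hf', 'W_ho')})
p.update({b: np.zeros(10) for b in ('b_g', 'b_i', 'b_f', 'b_o')})
h_t, s_t = lstm_step(rng.normal(size=5), np.zeros(10), np.zeros(10), p)

The two-layer stack in fcn2 is then just two such steps per time point, with layer 1's projected output (T.dot(h_t, W_hy) + b_hy) serving as layer 2's input.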
-------------------------------------------------------------------------------- /DavesReservoirComputingRefs: -------------------------------------------------------------------------------- 1 | Here is the material for Reservoir Computing for next week - one paper 2 | (with accompanying slides), plus a second paper if we have time. 3 | 4 | === 5 | Overview: 6 | 7 | Next week's paper is Reservoir Computing by Felix Grezes. But if you are 8 | new to randomized projections, you may want to start with the blog post 9 | by Terence Tao and/or the Nuit Blanche blog post to learn about 10 | the underlying concepts - links to both blogs are below, after Grezes' 11 | paper and slides. 12 | 13 | If we have time we may also discuss "Information Processing Using a 14 | Single Dynamical Node as Complex System." 15 | 16 | -- 17 | Links: 18 | 19 | Next week's paper (and related slides): 20 | Reservoir Computing by Felix Grezes. 21 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Survey_Felix_Grezes_9_04_2014.pdf 22 | 23 | Slides by Felix Grezes: Reservoir Computing for Neural Networks 24 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Slides_Felix_Grezes_9-14-2014.pdf 25 | (more at: http://speech.cs.qc.cuny.edu/~felix/ ) 26 | 27 | -- 28 | 29 | This is a short, very useful backgrounder on randomized projections, 30 | here used for compressed sensing, in a blog post by Terence Tao 31 | https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/ 32 | 33 | and the same story told with illustrations on the Nuit Blanche blog: 34 | http://nuit-blanche.blogspot.com/2007/07/how-does-rice-one-pixel-camera-work.html 35 | 36 | (BTW http://nuit-blanche.blogspot.com is a tremendous website.) 37 | 38 | --- 39 | 40 | If we have time, we may discuss this paper: 41 | 42 | Information Processing Using a Single Dynamical Node as Complex System.
43 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3195233/pdf/ncomms1476.pdf 44 | 45 | Supplementary Information to the paper: 46 | http://www.nature.com/article-assets/npg/ncomms/journal/v2/n9/extref/ncomms1476-s1.pdf 47 | 48 | === 49 | 50 | Additional background info on, and software for, randomized projections 51 | and Reservoir Computing: 52 | 53 | Web Site on Reservoir Computing 54 | http://organic.elis.ugent.be/flavors 55 | -- 56 | Reservoir Computing Python Toolkit (Oger) (see above Reservoir 57 | Computing web site for list of other software packages) 58 | http://organic.elis.ugent.be/software/organic-environment-reservoir-computing-oger-toolbox 59 | -- 60 | A good video by Ted Dunning on randomized projections: 61 | https://vimeo.com/33417977 62 | 63 | A key paper on randomized projections, which is discussed in the above 64 | video: 65 | Finding structure with randomness: Probabilistic algorithms for 66 | constructing approximate matrix decompositions - 67 | https://arxiv.org/abs/0909.4061 68 | -- 69 | Some papers on Reservoir Computing that go into more detail: 70 | Constructing optimized binary masks for reservoir computing with delay 71 | systems 72 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3887384/pdf/srep03629.pdf 73 | -- 74 | A Comparative Study of Reservoir Computing for Temporal Signal 75 | Processing https://arxiv.org/abs/1401.2224 76 | -- 77 | Product Reservoir Computing: Time-Series Computation with Multiplicative 78 | Neurons 79 | https://arxiv.org/pdf/1502.00718.pdf 80 | -- 81 | Reservoir computing approaches to recurrent neural network training 82 | http://minds.jacobs-university.de/sites/default/files/uploads/papers/2261_LukoseviciusJaeger09.pdf 83 | -- 84 | Reservoir Computing and Self-Organized Neural Hierarchies 85 | http://minds.jacobs-university.de/sites/default/files/uploads/papers/Mantas_Lukosevicius_PhD_thesis.pdf 86 | -------------------------------------------------------------------------------- /GRU/GRU.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | import random 5 | import matplotlib.pyplot as plt 6 | import cPickle as pickle 7 | from math import sqrt 8 | #from lstmClass import LstmLayer, recurrent_fn 9 | 10 | '''Define GRU class for a single-layer GRU 11 | The objective is to define it in such a way as to facilitate construction of a multi-layer 12 | GRU. The questions are: 13 | 1. The cost function can't go inside the class because it may only be associated with the last 14 | layer in the stack. 15 | 2. Should the scan function be inside or outside of the class? 16 | 3. How about the gradient calculations? It seems like those need to be outside the class. 17 | Does that mean the gradient calculations have to be outside the class statement? 18 | 4. Perhaps the scan function and the single-layer recurrence function need to be inside 19 | the class statement, but the cost function goes outside. 20 | 5. Then the gradient calculation might only need a list of the parameters against which 21 | the cost needs to be diff'd. That would just be the list of GRU-layer objects dotted with 22 | the parameter list for each one. 23 | 6. Not clear how the gradient of the scan function may interact with Python OOP. Not sure if scan 24 | output includes enough for the gradient calc.
Perhaps scan should be external to the class structure. 25 | 26 | 27 | ''' 28 | 29 | class GRULayer(object): 30 | 31 | def __init__(self, n_in, n_hidden, n_out, name): 32 | self.name = name 33 | rng = np.random.RandomState(1234) 34 | #candidate activation - h-twiddle equation 35 | self.W_uht = np.asarray(rng.normal(size=(n_in, n_hidden), scale= .01, loc = 0.0), dtype = theano.config.floatX) 36 | self.W_hht = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale=.01, loc = 0.0), dtype = theano.config.floatX) 37 | self.b_ht = np.zeros((n_hidden,), dtype=theano.config.floatX) 38 | #update gate - z eqn 39 | self.W_uz = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 40 | self.W_hz = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 41 | self.b_z = np.zeros((n_hidden,), dtype=theano.config.floatX) 42 | #reset gate equations - r 43 | self.W_ur = np.asarray(rng.normal(size=(n_in, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 44 | self.W_hr = np.asarray(rng.normal(size=(n_hidden, n_hidden), scale =.01, loc=0.0), dtype = theano.config.floatX) 45 | self.b_r = np.zeros((n_hidden,), dtype=theano.config.floatX) 46 | 47 | #output layer 48 | self.W_hy = np.asarray(rng.normal(size=(n_hidden, n_out), scale =.01, loc=0.0), dtype = theano.config.floatX) 49 | self.b_y = np.zeros((n_out,), dtype=theano.config.floatX) 50 | 51 | #candidate activation - h-twiddle equation 52 | self.W_uht = theano.shared(self.W_uht, 'W_uht' + self.name) 53 | self.W_hht = theano.shared(self.W_hht, 'W_hht' + self.name) 54 | self.b_ht = theano.shared(self.b_ht, 'b_ht' + self.name) 55 | #update gate - z eqn 56 | self.W_uz = theano.shared(self.W_uz, 'W_uz' + self.name) 57 | self.W_hz = theano.shared(self.W_hz, 'W_hz' + self.name) 58 | self.b_z = theano.shared(self.b_z, 'b_z' + self.name) 59 | #reset gate equations 60 | self.W_ur = theano.shared(self.W_ur, 'W_ur' + self.name) 61 | self.W_hr = theano.shared(self.W_hr, 'W_hr' + self.name) 62 | self.b_r = theano.shared(self.b_r, 'b_r' + self.name) 63 | 64 | #output layer 65 | self.W_hy = theano.shared(self.W_hy, 'W_hy' + self.name) 66 | self.b_y = theano.shared(self.b_y, 'b_y' + self.name) 67 | 68 | self.h0_tm1 = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX)) 69 | self.argList = [self.W_uht, self.W_hht, self.b_ht, self.W_uz, self.W_hz, 70 | self.b_z, self.W_ur, self.W_hr, self.b_r, self.W_hy, self.b_y] 71 | 72 | def recurrent_fn(u_t, h_tm1, W_uht, W_hht, b_ht, W_uz, W_hz, b_z, W_ur, W_hr, b_r, W_hy, b_y): 73 | z_t = T.nnet.sigmoid(T.dot(u_t, W_uz) + T.dot(h_tm1, W_hz) + b_z) 74 | r_t = T.nnet.sigmoid(T.dot(u_t, W_ur) + T.dot(h_tm1, W_hr) + b_r) 75 | ht_t = T.tanh(T.dot(u_t, W_uht) + T.dot(r_t*h_tm1, W_hht) + b_ht) 76 | 77 | h_t = (1 - z_t)*h_tm1 + z_t*ht_t 78 | return h_t 79 | 80 | # def fcn2(u_t, h_tm1, s_tm1,h_tm12, s_tm12, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 81 | # W_uo, W_ho, b_o, W_hy, b_hy, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 82 | # W_uo2, W_ho2, b_o2, W_hy2, b_hy2): 83 | # [h_t, s_t] = recurrent_fn(u_t, h_tm1, s_tm1, W_ug, W_hg, b_g, W_ui, W_hi, b_i, W_uf, W_hf, b_f, 84 | # W_uo, W_ho, b_o, W_hy, b_hy) 85 | # o1 = T.dot(h_tm1, W_hy) + b_hy 86 | # [h_t2, s_t2] = recurrent_fn(o1, h_tm12, s_tm12, W_ug2, W_hg2, b_g2, W_ui2, W_hi2, b_i2, W_uf2, W_hf2, b_f2, 87 | # W_uo2, W_ho2, b_o2, W_hy2, b_hy2) 88 | # return [h_t, s_t, h_t2, s_t2] 89 | 90 | 91 | #use GRULayer class to define algebra of GRU and build stack and gradient calculation
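#For reference, a minimal NumPy sketch of the single GRU step computed by
#recurrent_fn above (an illustration only, not part of the original file:
#forward pass, no gradients; relies on the numpy import at the top of this
#file, and p holds the same weights as GRULayer.argList, keyed by name)
def gru_step_np(u_t, h_tm1, p):
    np_sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    z_t = np_sigmoid(np.dot(u_t, p['W_uz']) + np.dot(h_tm1, p['W_hz']) + p['b_z'])   #update gate: how much new vs. old state
    r_t = np_sigmoid(np.dot(u_t, p['W_ur']) + np.dot(h_tm1, p['W_hr']) + p['b_r'])   #reset gate: how much history enters the candidate
    ht_t = np.tanh(np.dot(u_t, p['W_uht']) + np.dot(r_t * h_tm1, p['W_hht']) + p['b_ht'])  #candidate activation (h-twiddle)
    return (1 - z_t) * h_tm1 + z_t * ht_t    #h_t: convex blend of old state and candidate
#note the GRU carries one state vector (h) and has no output gate, versus the LSTM's two states (h, s)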
92 | 93 | #one layer gru stack for stock price prediction 94 | u = T.matrix() 95 | t = T.scalar() 96 | l1 = GRULayer(n_in=5, n_hidden=10, n_out=1, name='l1') 97 | 98 | #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 99 | #define 100 | l1.h, _ = theano.scan(recurrent_fn, sequences = u, 101 | outputs_info = [l1.h0_tm1], 102 | non_sequences = l1.argList) 103 | y = T.dot(l1.h[-1], l1.W_hy) + l1.b_y 104 | cost = ((t - y)**2).mean(axis=0).sum() 105 | grad = T.grad(cost, l1.argList) 106 | lr = T.scalar() 107 | update = [(a, a-lr*b) for (a,b) in zip(l1.argList, grad)] 108 | 109 | train_step = theano.function([u, t, lr], cost, 110 | on_unused_input='warn', 111 | updates=update, 112 | allow_input_downcast=True) 113 | 114 | # #two layer lstm stack for stock price prediction 115 | # u = T.matrix() 116 | # t = T.scalar() 117 | # o1 = T.matrix() 118 | # l1 = LstmLayer(n_in=5, n_hidden=10, n_out=10, name='l1') 119 | # l2 = LstmLayer(n_in=10, n_hidden=10, n_out=1, name='l2') 120 | # #theano.printing.debugprint([h0_tm1, u, W_hh, W_uh, W_hy, b_hh, b_hy], print_type=True) 121 | # #define 122 | # [l1.h, l1.s, l2.h, l2.s], _ = theano.scan(fcn2, sequences = u, 123 | # outputs_info = [l1.h0_tm1, l1.s0_tm1, l2.h0_tm1, l2.s0_tm1], 124 | # non_sequences = l1.argList + l2.argList) 125 | # # non_sequences = l1.argList + l2.argList, mode='DebugMode') 126 | 127 | 128 | 129 | # y = T.dot(l2.h[-1], l2.W_hy) + l2.b_hy 130 | # cost = ((t - y)**2).mean(axis=0).sum() 131 | # grad = T.grad(cost, l1.argList + l2.argList) 132 | # lr = T.scalar() 133 | # update = [(a, a-lr*b) for (a,b) in zip(l1.argList + l2.argList, grad)] 134 | # 135 | # train_step = theano.function([u, t, lr], cost, 136 | # on_unused_input='warn', 137 | # updates=update, 138 | # allow_input_downcast=True) 139 | # allow_input_downcast=True, mode='DebugMode') 140 | 141 | 142 | if __name__ == '__main__': 143 | 144 | (xlist, ylist) = pickle.load(open('stockTT.bin', 'rb')) 145 | nInputs = len(xlist[0]) 146 | x = np.array(xlist, dtype = theano.config.floatX) 147 | y = np.array(ylist, dtype = theano.config.floatX) 148 | print "Std Dev of Price Change", np.std(y) 149 | nHidden = 50 150 | nOutputs = 1 151 | lr = 0.01 152 | eSmooth = 1.0 153 | nPasses = 1 154 | vals = [] 155 | errSq = [] 156 | for i in range(nPasses): 157 | for j in range(len(x)): 158 | u = np.asarray(xlist[j], dtype = theano.config.floatX).reshape((1,nInputs)) 159 | t = y[j] 160 | 161 | c = train_step(u, t, lr) 162 | if j%10==0: print "iteration {0}: {1}".format(j, np.sqrt(c)) 163 | eSmooth = 0.1*np.sqrt(c) + 0.9*eSmooth 164 | vals.append(eSmooth) 165 | errSq.append(c) 166 | print 'RMS Pred Error', sqrt(np.average(errSq[500:])) 167 | plt.plot(vals) 168 | plt.show() 169 | 170 | #with nhidden = 20 (error number is the same with nhidden = 50) 171 | #std dev of price = 3.55612 172 | #std dev of error = 1.36565147217 173 | # 1 - (error variance / price variance) = 0.8525 -------------------------------------------------------------------------------- /ICLR Top paper thumbnail descriptions.: -------------------------------------------------------------------------------- 1 | https://iclr.cc/Conferences/2018/Schedule?type=Oral 2 | 3 | Zero shot visual imitation - Two-step robot learning. First cause the robot to explore without goals, then frame the objective as a sequence of views of intermediate steps, while re-labeling the exploration sequence so objectives reached during exploration are treated as targets. Leads to one-shot learning.
4 | 5 | Boosting dilated convolutional networks w mixed tensor decompositions - theoretical demonstration that layer interconnections improve expressive efficiency in dilated convolutional networks. Interesting and useful result of a rare theoretical type. Going to take some work to get through. 6 | 7 | Principled Adversarial Training - Use the Wasserstein measure to define a distributional neighborhood for generating adversarial training examples. 8 | 9 | Breaking the softmax Bottleneck - Demonstrate that the softmax is too restrictive a model and propose to overcome the restriction by using a mixture of softmaxes (MoS). Achieve consistently better performance in a variety of benchmarks. 10 | 11 | Characterizing adversarial subspaces using local intrinsic dimensionality - Characterize adversarial subspaces as space-filling in neighborhoods of legitimate examples and characterize them by local dimensionality. This characterization yields a test for adversarial examples, and they show vastly improved detection rates over other methods. 12 | 13 | Neural Sketch Learning - System for code generation that operates by breaking the problem into two parts: 1. generating sketches that describe core operation and 2. filling in the sketches with code that satisfies the details of typing etc. 14 | 15 | Learning to represent programs as graphs - Builds on Gated Graph Neural Networks (GGNN) and uses them to attack subproblems of programming: naming variables and using them correctly. 16 | 17 | Insufficiency of existing momentum schemes for stochastic optimization - Demonstrates cases where Nesterov momentum etc. don't perform well when using SGD versus GD. Develop an alternative based on Nesterov. 18 | 19 | Convergence of ADAM and Beyond - Analyze convergence issues of Adam on large parameter spaces. Determine that the moving average is not well suited and develop an alternative. Show performance improvements in synthetic cases constructed on the basis of the authors' analysis of weaknesses and on benchmarks known to cause Adam problems. 20 | 21 | Wasserstein Auto Encoders - Authors use the Wasserstein distance as the comparator function for AE and adversarial nets. Demonstrate better convergence properties than GANs while matching GANs' sample quality. 22 | https://wolfweb.unr.edu/homepage/jabuka/Classes/2006_spring/topology/Notes/04%20-%20Congergent%20sequences.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright [2016] [Mike Bowles] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | -------------------------------------------------------------------------------- /PapersSuggested_not_read: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hdDeepLearningStudy 2 | Papers, code, etc. for the deep learning study group 3 | See group discord - https://discord.gg/HuWVmMgmqS 4 | zoom link - On the meetup page 5 | meeting time - 6:30 pm California time 6 | 7 | ## Tuesday, November 21, 2023 8 | paper: MemGPT - Towards LLMs as an Operating System https://arxiv.org/pdf/2310.08560.pdf 9 | Blog w MemGPT - https://memgpt.ai/ 10 | youtube: https://www.youtube.com/watch?v=nQmZmFERmrg 11 | 12 | ## Tuesday, November 14, 2023 13 | paper: https://openreview.net/pdf?id=S1KGaTSOTS - CLUSTERFORMER: Clustering As A Universal Visual Learner. 14 | 15 | ## Tuesday, November 7, 2023 16 | paper: https://arxiv.org/pdf/2310.12962.pdf - An Emulator for Fine-Tuning Large Language Models using Small Language Models 17 | 18 | ## Tuesday, October 31, 2023 19 | paper: https://www.nature.com/articles/s42256-023-00711-8 - From attribution maps to human-understandable explanations through Concept Relevance Propagation 20 | 21 | ## Tuesday, October 24, 2023 22 | paper: https://arxiv.org/pdf/2209.12951.pdf - Liquid Structural State-Space Models 23 | 24 | ## Tuesday, October 17, 2023 25 | paper: Liquid Time-Constant Networks https://arxiv.org/abs/2006.04439 26 | youtube: https://www.youtube.com/watch?v=IlliqYiRhMU 27 | shorter video: https://www.youtube.com/watch?v=RI35E5ewBuI 28 | 29 | ## Tuesday, October 10, 2023 30 | paper - 3D Gaussian Splatting for Real-Time Radiance Field Rendering https://arxiv.org/abs/2308.04079 31 | youtube: Superb 2 minute video on paper https://www.youtube.com/watch?v=HVv_IQKlafQ 32 | youtube: Siggraph 2023 talk on paper - this is 5 minutes https://www.youtube.com/watch?v=T_kXY43VZnk&t=3s 33 | Author's blog, including links to code: https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/ 34 | 35 | ## Tuesday, October 3, 2023 36 | paper: https://arxiv.org/abs/2112.04035 - Relating transformers to models and neural representations of the hippocampal formation 37 | another paper: https://amygdala.psychdept.arizona.edu/labspace/JclubLabMeetings/JeanMarc-Build-cognitive-maps.pdf - How to build a cognitive map 38 | youtube: https://www.youtube.com/watch?v=9qOaII_PzGY&t=413s - How Your Brain Organizes Information 39 | youtube: https://www.youtube.com/watch?v=cufOEzoVMVA - Can We Build an Artificial Hippocampus?
paper: https://www.cell.com/cell/fulltext/S0092-8674(20)31388-X - The Tolman-Eichenbaum Machine: Unifying Space and Relational Memory through Generalization in the Hippocampal Formation 41 | 42 | ## Tuesday, September 26, 2023 43 | paper: https://research.nvidia.com/labs/par/Perfusion/ - Key-Locked Rank One Editing for Text-to-Image Personalization (Perfusion) 44 | 45 | ## Tuesday, September 19, 2023 46 | paper: https://arxiv.org/pdf/2210.09276.pdf - Imagic: Text-Based Real Image Editing with Diffusion Models 47 | youtube: https://www.youtube.com/watch?v=PzHMjCtuPuo 48 | blog: https://imagic-editing.github.io/ 49 | 50 | ## Tuesday, Sept 12, 2023 51 | paper: https://arxiv.org/abs/2307.02486 - LongNet: Scaling Transformers to 1,000,000,000 Tokens 52 | Blog: https://syncedreview.com/2023/07/10/microsofts-longnet-scales-transformer-to-one-billion-tokens 53 | 54 | ## Tuesday, Sept 5, 2023 55 | https://arxiv.org/pdf/2308.08708.pdf - Consciousness in Artificial Intelligence: Insights from the Science of Consciousness 56 | 57 | ## Tuesday, August 29, 2023 58 | paper: https://arxiv.org/pdf/2307.15936.pdf - A Theory for Emergence of Complex Skills in Language Models 59 | youtube: https://www.youtube.com/watch?v=0D23NeBjCeQ 60 | 61 | ## Tuesday, August 22, 2023 62 | Paper: https://arxiv.org/pdf/2206.04843.pdf - Neural Laplace: Learning diverse classes of differential equations in the Laplace domain 63 | Slides and video from ICML 2022: https://icml.cc/virtual/2022/oral/16728 64 | 65 | ## Wednesday, August 16, 2023 66 | paper: https://arxiv.org/abs/2308.03296 - Studying Large Language Model Generalization with Influence Functions 67 | blog: https://www.anthropic.com/index/influence-functions 68 | 69 | ## Wednesday, August 9, 2023 70 | paper: Music Generation https://arxiv.org/pdf/2306.05284.pdf 71 | blog: https://about.fb.com/news/2023/08/audiocraft-generative-ai-for-music-and-audio/ 72 | blog: https://ai.meta.com/blog/audiocraft-musicgen-audiogen-encodec-generative-ai-audio/ 73 | 74 | ## Wednesday, August 2, 2023 75 | paper: https://arxiv.org/abs/2205.10343 Towards Understanding Grokking: An Effective Theory of Representation Learning 76 | blog: https://ericjmichaud.com/grokking-squared/ 77 | blog: https://www.beren.io/2022-01-11-Grokking-Grokking/ 78 | blog: https://www.beren.io/2022-04-17-Understanding_Overparametrized_Generalization/ 79 | 80 | ## Wednesday, July 26, 2023 81 | paper: Mixture of experts (similar to chatGPT4): https://arxiv.org/abs/2305.14705 82 | 83 | blog: Mixture-of-Experts with Expert Choice Routing - 84 | https://ai.googleblog.com/2022/11/mixture-of-experts-with-expert-choice.html 85 | 86 | blog: Introducing Pathways: A next-generation AI architecture 87 | https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/ 88 | 89 | ## Wednesday, July 19, 2023 90 | We're going to cover Chapter 16, Deep Networks for Classification, from the following book: 91 | https://book-wright-ma.github.io/Book-WM-20210422.pdf - High dimensional Data Analysis with Low Dimensional Models 92 | blog: https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/#more-25 93 | 94 | ## Wednesday, July 12, 2023 95 | We're going to cover the 4th chapter of this book. 96 | https://book-wright-ma.github.io/Book-WM-20210422.pdf - High dimensional Data Analysis with Low Dimensional Models 97 | 98 | ## Wednesday, July 5, 2023 99 | We're going to cover the 1st chapter of this book.
100 | https://book-wright-ma.github.io/Book-WM-20210422.pdf - High dimensional Data Analysis with Low Dimensional Models 101 | Blog: https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/#more-25 102 | 103 | ## Wednesday, June 28, 2023 104 | paper: https://arxiv.org/pdf/2305.17126.pdf - Large Language Models as Tool Makers 105 | youtube: https://www.youtube.com/watch?v=qWI1AJ2nSDY 106 | youtube: https://www.youtube.com/watch?v=KXlPzMRTfMk 107 | youtube: https://www.youtube.com/watch?v=srDVNbxPgZI 108 | 109 | ## Wednesday, June 21, 2023 110 | Consciousness as a Memory System https://pubmed.ncbi.nlm.nih.gov/36178498/ 111 | 112 | ## Wednesday, June 14, 2023 113 | https://arxiv.org/abs/1804.08838 114 | Blog: https://www.uber.com/blog/intrinsic-dimension/ 115 | more good stuff on intrinsic dimension: 116 | Nature paper: https://www.nature.com/articles/s41598-017-11873-y 117 | Wikipedia: https://en.wikipedia.org/wiki/Intrinsic_dimension 118 | Application - Yann LeCun at 57:15 on whether text fully represents a world model: 119 | https://www.youtube.com/watch?v=SGzMElJ11Cc 120 | vs. differing view from Ilya Sutskever at 15:30 121 | https://www.youtube.com/watch?v=SjhIlw3Iffs 122 | Applying intrinsic dimension to scaling laws in training / loss: 123 | https://jmlr.csail.mit.edu/papers/volume23/20-1111/20-1111.pdf 124 | https://arxiv.org/abs/2102.06701 125 | 126 | ## Wednesday, June 7, 2023 127 | Paper: https://arxiv.org/pdf/2305.16291.pdf 128 | tweet: Nice overview by author https://twitter.com/DrJimFan/status/1662117784023883777 129 | Code: https://github.com/MineDojo/Voyager 130 | website: https://voyager.minedojo.org/ 131 | 132 | ## Wednesday, May 31, 2023 133 | paper: https://arxiv.org/pdf/2203.15556.pdf - Training Compute-Optimal Large Language Models 134 | blog: https://www.lesswrong.com/posts/6Fpvch8RR29qLEWNH/chinchilla-s-wild-implications 135 | blog: https://www.harmdevries.com/post/model-size-vs-compute-overhead/ 136 | news: https://www.cnbc.com/2023/05/16/googles-palm-2-uses-nearly-five-times-more-text-data-than-predecessor.html 137 | 138 | ## Wednesday, May 24, 2023 139 | paper: https://arxiv.org/abs/2212.09720 - The case for 4-bit precision: k-bit Inference Scaling Laws 140 | paper: https://arxiv.org/pdf/2210.17323.pdf - GPTQ: ACCURATE POST-TRAINING QUANTIZATION FOR GENERATIVE PRE-TRAINED TRANSFORMERS 141 | 142 | ## Wednesday, May 17, 2023 143 | paper: https://arxiv.org/pdf/2106.09685.pdf - LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS 144 | 145 | ## Wednesday, May 10, 2023 146 | paper: https://arxiv.org/pdf/2210.03629.pdf - REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS 147 | blog: https://www.pinecone.io/learn/locality-sensitive-hashing/ 148 | 149 | ## Wednesday, May 3, 2023 150 | paper: https://arxiv.org/pdf/2201.11903.pdf - Chain of thought prompting elicits reasoning in large language models.
151 | paper: https://arxiv.org/pdf/2210.03629.pdf - REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS 152 | blog: https://www.pinecone.io/learn/locality-sensitive-hashing/ 153 | 154 | ## Wednesday, Apr 26, 2023 155 | https://python.langchain.com/en/latest/modules/agents.html 156 | https://arxiv.org/pdf/2210.03629.pdf - REACT: SYNERGIZING REASONING AND ACTING IN LANGUAGE MODELS 157 | https://www.pinecone.io/learn/locality-sensitive-hashing/ 158 | 159 | ## Wednesday, Apr 19, 2023 160 | Blog: https://yoheinakajima.com/task-driven-autonomous-agent-utilizing-gpt-4-pinecone-and-langchain-for-diverse-applications/ 161 | Code: https://github.com/hwchase17/langchain 162 | 163 | ## Wednesday, Apr 12, 2023 164 | Paper: Eliciting Latent Predictions from Transformers with the Tuned Lens https://arxiv.org/abs/2303.08112 165 | 166 | ## Wednesday, Apr 5, 2023 167 | Paper: https://openreview.net/pdf?id=lMMaNf6oxKM - Recipe for a General, Powerful, Scalable Graph Transformer 168 | youtube: https://www.youtube.com/watch?v=DiLSCReBaTg 169 | 170 | ## Wednesday, Mar 29, 2023 171 | Paper: https://proceedings.neurips.cc/paper/2021/hash/f1c1592588411002af340cbaedd6fc33-Abstract.html - Do Transformers Really Perform Badly for Graph Representation? 172 | video: https://www.youtube.com/watch?v=FKuQpPIRjLk - review by authors 173 | video: https://www.youtube.com/watch?v=xQ5ltOOxoFg 174 | 175 | ## Wednesday, Mar 22, 2023 176 | Paper: https://arxiv.org/abs/2212.07359 - Post-hoc Uncertainty Learning using a Dirichlet Meta-Model 177 | youtube: https://www.youtube.com/watch?v=nE8XJ1f0zO0 178 | 179 | ## Wednesday, Mar 15, 2023 180 | Paper: https://arxiv.org/abs/2202.05262 - Locating and Editing Factual Associations in GPT 181 | blog: https://rome.baulab.info/ 182 | Yannic video: https://www.youtube.com/watch?v=_NMQyOu2HTo 183 | 184 | ## Wednesday, Mar 8, 2023 185 | Paper: Human-Timescale Adaptation in an Open-Ended Task Space: https://arxiv.org/pdf/2301.07608.pdf 186 | https://www.youtube.com/watch?v=A2hOWShiYoM 187 | https://sites.google.com/view/adaptive-agent/ 188 | 189 | ## Wednesday, Mar 1, 2023 190 | Paper: Toolformer: Language Models Can Teach Themselves to Use Tools: https://arxiv.org/abs/2302.04761 191 | 192 | ## Wednesday, Feb 22, 2023 193 | Paper: https://arxiv.org/pdf/2203.02155.pdf - Training language models to follow instructions with human feedback 194 | 195 | ## Wednesday, Feb 15, 2023 196 | Paper: https://arxiv.org/pdf/2111.15664.pdf - OCR-free Document Understanding Transformer 197 | 198 | ## Wednesday, Feb 8, 2023 199 | Paper: https://arxiv.org/abs/2205.06175 - A generalist agent - Gato 200 | YouTube: Eden Mayer https://www.youtube.com/watch?v=wSQJZHfAg18 201 | YouTube - Jay Alammar https://www.youtube.com/watch?v=kT6DYKgWNHg 202 | YouTube - Lex Fridman and Oriol Vinyals on How Gato Works https://www.youtube.com/watch?v=vwB9zO2h9j0 203 | Overview - main site on Gato at Deepmind https://www.deepmind.com/publications/a-generalist-agent 204 | blog review - https://arshren.medium.com/deep-minds-generalist-agent-gato-209969e12782 205 | 206 | ## Wednesday, Feb 1, 2023 207 | Paper: https://openreview.net/pdf?id=M95oDwJXayG - ADDRESSING PARAMETER CHOICE ISSUES IN UNSUPERVISED DOMAIN ADAPTATION BY AGGREGATION 208 | 209 | ## Wednesday, Jan 25, 2023 210 | Paper: https://arxiv.org/pdf/2301.04104v1.pdf - Mastering Diverse Domains through World Models 211 | Blog: https://danijar.com/project/dreamerv3/ 212 | YouTube: https://www.youtube.com/watch?v=vfpZu0R1s1Y 213 | 214 | ## Wednesday, Jan 18, 2023 215 | Paper:
https://arxiv.org/abs/2212.04089 - Composable NN: Editing Models With Task Arithmetic 216 | 217 | ## Wednesday, Jan 11, 2023 218 | Paper: https://arxiv.org/pdf/1707.06690.pdf - DeepPath: A Reinforcement Learning Method for Knowledge Graph Reasoning 219 | 220 | ## Wednesday, Jan 4, 2023 221 | Paper: https://arxiv.org/abs/2212.04458 - GENERAL-PURPOSE IN-CONTEXT LEARNING BY META-LEARNING TRANSFORMERS 222 | 223 | ## Wednesday, Dec 21, 2022 224 | paper: https://arxiv.org/pdf/2209.04836.pdf - GIT RE-BASIN: MERGING MODELS MODULO PERMUTATION SYMMETRIES 225 | 226 | ## Wednesday, Dec 14, 2022 227 | paper: https://arxiv.org/abs/2012.09855 - Infinite Nature: Perpetual View Generation of Natural Scenes from a Single Image 228 | blog: https://infinite-nature.github.io/ 229 | 230 | ## Wednesday, Dec 7, 2022 231 | Paper: https://arxiv.org/abs/2206.00364 - Elucidating the Design Space of Diffusion-Based Generative Models 232 | video: https://www.youtube.com/watch?v=OYiQctx7kDE 233 | 234 | ## Wednesday, Nov 30, 2022 235 | paper: https://arxiv.org/pdf/2206.10991.pdf - Graph Neural Networks as Gradient Flows: understanding graph convolutions via energy 236 | youtube (author): https://www.youtube.com/watch?v=sgTTtmwOMgE 237 | youtube: https://www.youtube.com/watch?v=hmI4C6AodEQ 238 | 239 | ## Wednesday, Nov 16, 2022 240 | paper: https://www.pnas.org/doi/full/10.1073/pnas.2016239118 241 | video: https://slideslive.com/38942412/biological-structure-and-function-emerge-from-scaling-unsupervised-learning-to-250-million-protein-sequences 242 | 243 | ## Wednesday, Nov 9, 2022 244 | paper: https://arxiv.org/pdf/2209.11178.pdf - Poisson Flow Generative Models 245 | 246 | ## Wednesday, Nov 2, 2022 247 | paper: https://arxiv.org/pdf/2209.12892.pdf - LEARNING TO LEARN WITH GENERATIVE MODELS OF NEURAL NETWORK CHECKPOINTS 248 | blog: https://www.marktechpost.com/2022/10/21/latest-machine-learning-research-at-uc-berkeley-proposes-a-way-to-design-a-learned-optimizer-using-generative-models-of-neural-network-checkpoints/ 249 | author blog: https://www.wpeebles.com/Gpt.html 250 | 251 | ## Wednesday, Oct 26, 2022 252 | paper: Cellular automata as convolutional neural networks https://arxiv.org/pdf/1809.02942.pdf 253 | survey: Collective Intelligence for Deep Learning: A Survey of Recent Developments https://arxiv.org/abs/2111.14377 254 | demo: Self-classifying MNIST Digits https://distill.pub/2020/selforg/mnist/ 255 | 256 | ## Wednesday, Oct 19, 2022 257 | paper: https://proceedings.mlr.press/v162/zhu22c/zhu22c.pdf - Neural-Symbolic Models for Logical Queries on Knowledge Graphs 258 | 259 | ## Wednesday, Oct 12, 2022 260 | paper: https://arxiv.org/pdf/2206.02768.pdf - The Neural Covariance SDE: Shaped Infinite Depth-and-Width Networks at Initialization 261 | 262 | ## Wednesday, Oct 5, 2022 263 | paper: https://papers.nips.cc/paper/2019/file/952285b9b7e7a1be5aa7849f32ffff05-Paper.pdf - Legendre Memory Units: Continuous-Time 264 | 265 | ## Wednesday, Sept 28, 2022 266 | paper: https://arxiv.org/pdf/2208.01618.pdf - An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion 267 | github.io: https://textual-inversion.github.io/ 268 | YouTube: https://www.youtube.com/watch?v=f3oXa7_SYek 269 | 270 | ## Wednesday, Sept 21, 2022 271 | paper: https://arxiv.org/pdf/2205.14415.pdf - Non-stationary Transformers: Rethinking the Stationarity in Time Series Forecasting 272 | 273 | ## Wednesday, Sept 14, 2022 274 | paper: https://arxiv.org/abs/2110.02402 - Language Modeling using LMUs: 10x Better Data Efficiency
or Improved Scaling Compared to Transformers 275 | youtube vid: https://www.youtube.com/watch?v=8t64QaTdBcU 276 | 277 | ## Wednesday, August 31, 2022 278 | Paper: HOW NEURAL NETWORKS EXTRAPOLATE: FROM FEEDFORWARD TO GRAPH NEURAL NETWORKS - https://arxiv.org/pdf/2009.11848.pdf 279 | 280 | ## Wednesday, August 24, 2022 281 | Paper: Masked Siamese Networks for Label-Efficient Learning - https://arxiv.org/abs/2204.07141 282 | 283 | ## Wednesday, August 17, 2022 284 | Paper: Principle of Maximal Coding Rate Reduction https://arxiv.org/abs/2006.08558 285 | ReduNet: https://arxiv.org/pdf/2105.10446.pdf 286 | Github: https://github.com/ryanchankh/mcr2 287 | 288 | ## Wednesday, August 10, 2022 289 | Paper: On the Principles of Parsimony and Self-Consistency for the Emergence of Intelligence https://arxiv.org/abs/2207.04630 290 | 291 | Background: https://www.youtube.com/watch?v=OIVcfZeR1CE - youtube by author 292 | Background: https://cmsa.fas.harvard.edu/wp-content/uploads/2021/04/Deep_Networks_from_First_Principles.pdf - slides by author 293 | 294 | 295 | ## Wednesday, August 3, 2022 296 | Paper: Data Distributional Properties Drive Emergent In-Context Learning in Transformers https://arxiv.org/pdf/2205.05055.pdf 297 | 298 | ## Wednesday, July 27, 2022 299 | Paper: A Mathematical Framework for Transformer Circuits https://transformer-circuits.pub/2021/framework/index.html#model-simplifications 300 | 301 | ## Wednesday, July 20, 2022 302 | Paper: A Mathematical Framework for Transformer Circuits https://transformer-circuits.pub/2021/framework/index.html#model-simplifications 303 | 304 | ## Wednesday, July 13, 2022 305 | Paper: https://arxiv.org/abs/2001.08361 - Scaling Laws for Neural Language Models 306 | Blog: https://medium.com/nlplanet/two-minutes-nlp-scaling-laws-for-neural-language-models-add6061aece7 307 | 308 | ## Wednesday, July 6, 2022 309 | Paper: https://arxiv.org/abs/2206.11795 - Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online Videos 310 | https://github.com/openai/Video-Pre-Training 311 | Yannic Review: https://www.youtube.com/watch?v=oz5yZc9ULAc 312 | 313 | ## Wednesday, June 29, 2022 314 | Paper: https://arxiv.org/pdf/2110.00966.pdf - Translating Images into Maps 315 | 316 | ## Wednesday, June 22, 2022 317 | Paper: https://arxiv.org/abs/2205.09665 - Automated Crossword Solving 318 | 319 | ## Wednesday, June 15, 2022 320 | Paper: https://arxiv.org/pdf/2205.10824.pdf - ReLU Fields: The Little Non-linearity That Could 321 | 322 | 323 | ## Wednesday, June 8, 2022 324 | Paper: https://arxiv.org/abs/2102.06810 - Understanding Self-Supervised Learning Dynamics without Contrastive Pairs 325 | 326 | ## Wednesday, June 1, 2022 327 | Paper: https://arxiv.org/pdf/2205.06175.pdf - A Generalist Agent 328 | Blog: https://www.deepmind.com/publications/a-generalist-agent 329 | 330 | ## Wednesday, May 25, 2022 331 | https://arxiv.org/pdf/2202.05780.pdf - A Modern Self-Referential Weight Matrix That Learns to Modify Itself 332 | 333 | ## Wednesday, May 18, 2022 334 | https://openreview.net/pdf?id=M752z9FKJP - LEARNING STRIDES IN CONVOLUTIONAL NEURAL NETWORKS 335 | 336 | ## Wednesday, May 11, 2022 337 | https://openreview.net/pdf?id=b-ny3x071E5 - BOOTSTRAPPED META-LEARNING 338 | 339 | ## Wednesday, May 4, 2022 340 | https://arxiv.org/abs/2202.06991 - Transformer Memory as a Differentiable Search Index 341 | https://www.youtube.com/watch?v=C7mUYocWdG0 -
Yannic author interview 342 | https://www.youtube.com/watch?v=qlB0TPBQ7YY - Yannic on Transformer paper 343 | 344 | ## Wednesday, April 27, 2022 345 | https://arxiv.org/abs/2204.06125 - Hierarchical Text-Conditional Image Generation with CLIP Latents 346 | https://openai.com/dall-e-2/ - OpenAI blog 347 | https://www.youtube.com/watch?v=j4xgkjWlfL4 - yannic video 348 | 349 | ## Wednesday, April 20, 2022 350 | https://arxiv.org/pdf/2103.00020.pdf - Learning Transferable Visual Models From Natural Language Supervision 351 | https://www.youtube.com/watch?v=1LUWWAnK_Ks 352 | https://www.youtube.com/watch?v=3X3EY2Fgp3g 353 | 354 | ## Wednesday, April 13, 2022 355 | https://arxiv.org/pdf/2110.13985.pdf - Combining Recurrent, Convolutional, and Continuous-time 356 | Models with Linear State-Space Layers 357 | 358 | ## Wednesday, April 6, 2022 359 | https://arxiv.org/pdf/2202.00666.pdf - Typical Decoding for Natural Language Generation 360 | 361 | https://youtu.be/_EDr3ryrT_Y 362 | 363 | https://www.youtube.com/watch?v=AvHLJqtmQkE 364 | 365 | ## Wednesday, March 30, 2022 366 | https://arxiv.org/pdf/2105.04906.pdf - VICREG: VARIANCE-INVARIANCE-COVARIANCE REGULARIZATION FOR SELF-SUPERVISED LEARNING 367 | https://www.youtube.com/watch?v=MzKDNmOJ67Q 368 | 369 | ## Wednesday, March 23, 2022 370 | https://openreview.net/forum?id=4orlVaC95Bo - Task-Agnostic Undesirable Feature Deactivation Using Out-of-Distribution Data 371 | 372 | ## Wednesday, March 16, 2022 373 | https://arxiv.org/abs/2203.03466 - Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer 374 | https://www.youtube.com/watch?v=MNOJQINH-qw 375 | 376 | ## Wednesday, March 9, 2022 377 | https://arxiv.org/abs/2201.12122 - Can Wikipedia Help Offline Reinforcement Learning? 378 | Yannic's talk on this, 379 | https://www.youtube.com/watch?v=XHGh19Hbx48 380 | and he also has a followon video interview with the authors 381 | https://www.youtube.com/watch?v=FNDVy_BR8aA 382 | 383 | 384 | ## Wednesday, March 2, 2022 - 385 | https://arxiv.org/pdf/2107.03342.pdf - A Survey of Uncertainty in Deep Neural Networks 386 | 387 | ## Wednesday, February 23, 2022 - 388 | https://arxiv.org/pdf/2201.08239v2.pdf - LaMDA: Language Models for Dialog Applications 389 | 390 | ## Wednesday, February 16, 2022 - 391 | https://openreview.net/pdf?id=TrjbxzRcnf- MEMORIZING TRANSFORMERS 392 | 393 | ## Wednesday, February 9, 2022 - 394 | https://arxiv.org/pdf/2106.07644.pdf - A Continuized View on Nesterov Acceleration for Stochastic Gradient Descent and Randomized Gossip 395 | 396 | ## Wednesday, February 2, 2022 - 397 | https://arxiv.org/pdf/2108.08052.pdf - Moser Flow: Divergence-based Generative Modeling on Manifolds 398 | 399 | ## Wednesday, January 26, 2022 - 400 | https://dylandoblar.github.io/noether-networks/ - Noether Networks: meta-learning useful conserved quantities 401 | 402 | https://www.youtube.com/watch?v=Xp3jR-ttMfo 403 | 404 | ## Wednesday, January 19, 2022 - 405 | https://arxiv.org/pdf/2010.15277.pdf - Class-incremental learning: survey and performance evaluation on image classification 406 | 407 | ## Wednesday, January 12, 2022 - 408 | https://arxiv.org/abs/2006.11287 - Discovering Symbolic Models from Deep Learning with Inductive Biases 409 | 410 | ## Wednesday, January 5, 2022 - 411 | https://arxiv.org/pdf/2006.09252.pdf - Improving Graph Neural Network Expressivity via Subgraph Isomorphism Counting 412 | 413 | ## Wednesday, December 29, 2021 - 414 | https://arxiv.org/pdf/2112.04426.pdf - Improving Language Models by 
Retrieving from Trillions of Tokens 415 | 416 | https://www.deepmind.com/research/publications/2021/improving-language-models-by-retrieving-from-trillions-of-tokens 417 | 418 | ## Wednesday, December 22, 2021 - 419 | https://arxiv.org/abs/2106.01798 - Implicit MLE: Backpropagating Through Discrete Exponential Family Distributions 420 | 421 | https://www.youtube.com/watch?v=W2UT8NjUqrk 422 | 423 | ## Wednesday, December 15, 2021 - 424 | https://arxiv.org/pdf/2108.01073.pdf - Image Synthesis and Editing with Stochastic Differential Equations 425 | 426 | ## Wednesday, December 1, 2021 - 427 | https://openreview.net/forum?id=HfpNVDg3ExA 428 | 429 | Probabilistic Transformer For Time Series Analysis 430 | 431 | ## Wednesday, November 17, 2021 - 432 | https://arxiv.org/pdf/2110.03922.pdf - NEURAL TANGENT KERNEL EIGENVALUES ACCURATELY PREDICT GENERALIZATION 433 | 434 | ## Wednesday, November 10, 2021 - 435 | https://arxiv.org/pdf/2104.00681.pdf - NeuralRecon: Real-Time Coherent 3D Reconstruction from Monocular Video 436 | 437 | https://github.com/zju3dv/NeuralRecon 438 | 439 | 440 | ## Wednesday, October 27, 2021 - 441 | https://arxiv.org/pdf/2110.09485.pdf - Learning in High Dimension Always Amounts to Extrapolation 442 | 443 | ## Wednesday, October 20, 2021 - 444 | https://arxiv.org/pdf/2109.02355.pdf - A Farewell to the Bias-Variance Tradeoff? An Overview of the Theory of Overparameterized Machine Learning 445 | 446 | ## Wednesday, October 13, 2021 - 447 | https://arxiv.org/pdf/2006.09011.pdf - Improved Techniques for Training Score-Based Generative Models 448 | 449 | ## Wednesday, October 6, 2021 - 450 | https://arxiv.org/abs/2006.05929 - Dataset Condensation with Gradient Matching 451 | 452 | ## Wednesday, September 29, 2021 - 453 | https://arxiv.org/abs/1811.10959 - Dataset distillation 454 | 455 | ## Wednesday, September 22, 2021 - 456 | https://arxiv.org/pdf/2003.13216.pdf - Learning to Learn Single Domain Generalization 457 | 458 | ## Wednesday, September 15, 2021 - 459 | https://arxiv.org/pdf/2108.11482.pdf - ETA Prediction with Graph Neural Networks in Google Maps 460 | 461 | ## Wednesday, September 8, 2021 - 462 | https://cascaded-diffusion.github.io/assets/cascaded_diffusion.pdf - Cascaded Diffusion Models for High Fidelity Image Generation 463 | 464 | ## Wednesday, September 1, 2021 - 465 | https://arxiv.org/pdf/2107.06277.pdf - Why Generalization in RL is Difficult: Epistemic POMDPs and Implicit Partial Observability 466 | 467 | ## Wednesday, August 25, 2021 - 468 | https://arxiv.org/abs/2108.07732 - Program Synthesis with Large Language Models 469 | 470 | ## Wednesday, August 18, 2021 - 471 | https://arxiv.org/abs/2012.13349 - Solving Mixed Integer Programs Using Neural Networks 472 | 473 | ## Wednesday, August 11, 2021 - 474 | https://www.nature.com/articles/s41586-021-03819-2 - AlphaFold 475 | 476 | ## Wednesday, August 4, 2021 - 477 | AlphaFold - blog https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology paper https://www.nature.com/articles/s41586-021-03819-2 supplemental info https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-021-03819-2/MediaObjects/41586_2021_3819_MOESM1_ESM.pdf 478 | 479 | ## Wednesday, July 21, 2021 - 480 | https://www.zdnet.com/article/googles-supermodel-deepmind-perceiver-is-a-step-on-the-road-to-an-ai-machine-that-could-process-everything/ https://arxiv.org/abs/2103.03206 481 | 482 | ## Wednesday, July 14, 2021 - 483 | https://arxiv.org/pdf/1503.03585.pdf (Deep Unsupervised Learning
using Nonequilibrium Thermodynamics) by Surya Ganguli at Stanford 484 | 485 | ## Wednesday, July 7, 2021 - 486 | https://arxiv.org/pdf/2105.05233.pdf - Diffusion Models Beat GANs on Image Synthesis 487 | 488 | ## Wednesday, June 30, 2021 - 489 | https://arxiv.org/pdf/2006.11239.pdf - Denoising Diffusion Probabilistic Models 490 | 491 | ## Wednesday, June 23, 2021 - 492 | https://arxiv.org/abs/2010.03409 - Learning mesh-based simulation with graph networks 493 | 494 | https://sites.google.com/view/learning-to-simulate 495 | 496 | https://deepmind.com/research/publications/Learning-to-Simulate-Complex-Physics-with-Graph-Networks 497 | 498 | ## Wednesday, June 16, 2021 - 499 | https://arxiv.org/pdf/2106.01345.pdf - Decision Transformer: Reinforcement Learning via Sequence Modeling 500 | 501 | https://www.youtube.com/watch?v=-buULmf7dec 502 | 503 | https://sites.google.com/berkeley.edu/decision-transformer 504 | 505 | ## Wednesday, June 9, 2021 - 506 | https://arxiv.org/pdf/2103.07945.pdf - Learning One Representation to Optimize All Rewards 507 | 508 | ## Wednesday, June 2, 2021 - 509 | https://distill.pub/2021/multimodal-neurons/ - Multimodal Neurons in Artificial Neural Networks 510 | 511 | https://openai.com/blog/clip/ - CLIP: Connecting Text and Images 512 | 513 | ## Wednesday, May 26, 2021 - 514 | https://arxiv.org/pdf/2104.14294.pdf - Emerging Properties in Self-Supervised Vision Transformers 515 | 516 | https://ai.facebook.com/blog/dino-paws-computer-vision-with-self-supervised-transformers-and-10x-more-efficient-training/ 517 | 518 | ## Wednesday, May 19, 2021 - 519 | https://arxiv.org/pdf/2104.10558.pdf - Contingencies from Observations: Tractable Contingency Planning with Learned Behavior Models 520 | 521 | ## Wednesday, May 12, 2021 - 522 | https://arxiv.org/pdf/1806.09055.pdf - DARTS: Differentiable Architecture Search (ICLR 2019) 523 | 524 | ## Wednesday, May 5, 2021 - 525 | https://arxiv.org/pdf/2104.06644.pdf - Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little 526 | 527 | ## Wednesday, April 28, 2021 - 528 | https://arxiv.org/pdf/2009.03717.pdf - Hierarchical message passing graph neural networks 529 | 530 | ## Wednesday, April 14, 2021 - 531 | https://arxiv.org/pdf/2103.03230v1.pdf - Barlow Twins: Self-Supervised Learning via Redundancy Reduction 532 | 533 | ## Wednesday, April 7, 2021 - 534 | https://arxiv.org/pdf/2103.14770.pdf - Categorical representation learning: morphism is all you need 535 | 536 | ## Wednesday, March 31, 2021 - 537 | https://arxiv.org/pdf/2102.12736v1.pdf - Time-Series Imputation with Wasserstein Interpolation for Optimal Look-Ahead-Bias and Variance Tradeoff 538 | 539 | ## Wednesday, March 24, 2021 - 540 | https://awacrl.github.io/ - Accelerating online reinforcement learning with offline datasets 541 | 542 | ## Wednesday, March 17, 2021 - 543 | https://arxiv.org/pdf/2102.12092.pdf - Zero-Shot Text-to-Image Generation 544 | 545 | https://openai.com/blog/dall-e/ 546 | 547 | ## Wednesday, March 10, 2021 - 548 | https://giotto-ai.github.io/gtda-docs/latest/notebooks/gravitational_waves_detection.html 549 | 550 | ## Wednesday, March 3, 2021 - 551 | https://arxiv.org/pdf/2102.08602.pdf - Modeling long-range interactions without attention 552 | 553 | ## Wednesday, February 24, 2021 - 554 | https://arxiv.org/pdf/2101.08692.pdf - Characterizing signal propagation to close the performance gap in unnormalized resnets 555 | 556 | ## Wednesday, February 17, 2021 - 557 | https://arxiv.org/pdf/2006.10742.pdf -
Learning Invariant Representations for Reinforcement Learning without Reconstruction 558 | 559 | ## Wednesday, February 10, 2021 - 560 | https://arxiv.org/pdf/2007.13544.pdf - Combining Deep Reinforcement Learning and Search for Imperfect-Information Games 561 | 562 | ## Wednesday, February 3, 2021 - 563 | https://arxiv.org/pdf/2010.11929.pdf - An image is worth 16x16 words: transformers for image recognition at scale 564 | 565 | ## Wednesday, January 27, 2021 - 566 | https://arxiv.org/abs/2003.02821 - What went wrong and when? Instance-wise feature importance for time-series black-box models 567 | 568 | ## Wednesday, January 20, 2021 - 569 | https://arxiv.org/pdf/1912.09363.pdf - Temporal Fusion Transformers for Interpretable Multi-horizon Time Series Forecasting 570 | 571 | ## Wednesday, January 13, 2021 - 572 | https://arxiv.org/abs/1905.10403 - Neural Jump Stochastic Differential Equations 573 | 574 | ## Wednesday, January 6, 2021 - 575 | http://implicit-layers-tutorial.org/neural_odes/ - We're continuing this from last week. This week we'll cover Ch 3,4,5. 576 | 577 | ## Wednesday, December 30, 2020 - 578 | http://implicit-layers-tutorial.org/ - NeurIPS tutorial on deep implicit networks 579 | 580 | ## Wednesday, December 23, 2020 - 581 | https://arxiv.org/pdf/1907.03907.pdf - Latent ODEs for Irregularly-Sampled Time Series 582 | 583 | https://www.youtube.com/watch?v=tOkH339Wucs 584 | 585 | ## Wednesday, December 16, 2020 - 586 | https://papers.nips.cc/paper/2020/file/08425b881bcde94a383cd258cea331be-Paper.pdf - Ridge Rider: Finding Diverse Solutions by Following Eigenvectors of the Hessian 587 | 588 | ## Wednesday, December 9, 2020 - 589 | https://proceedings.neurips.cc/paper/2020/file/28e209b61a52482a0ae1cb9f5959c792-Paper.pdf 590 | "OOD-MAML: Meta-Learning for Few-Shot Out-of-Distribution Detection and Classification" 591 | 592 | ## Wednesday, December 2, 2020 - 593 | https://arxiv.org/pdf/2011.02421.pdf - ONE-SHOT CONDITIONAL AUDIO FILTERING OF ARBITRARY SOUNDS 594 | 595 | ## Wednesday, November 18, 2020 - 596 | https://arxiv.org/pdf/2010.14498.pdf - Implicit under-parametrization inhibits data efficient deep reinforcement learning 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | ## Mar 11 - Hacker Dojo 608 | https://arxiv.org/pdf/2002.11089.pdf - Rewriting History with Inverse RL: Hindsight Inference for Policy Improvement 609 | 610 | ## Mar 4 - Hacker Dojo 611 | https://www.osapublishing.org/DirectPDFAccess/C6D6B2C3-953C-4461-695B6E5E2F993943_415059/prj-7-8-823.pdf?da=1&id=415059&seq=0&mobile=no - Nanophotonic media for artificial neural inference 612 | 613 | 614 | ## Feb 19 - Hacker Dojo 615 | https://arxiv.org/pdf/1910.02789.pdf - Language is Power: Representing States Using Natural Language in Reinforcement Learning 616 | 617 | 618 | ## Feb 12 - Hacker Dojo 619 | https://deepmind.com/blog/article/AlphaFold-Using-AI-for-scientific-discovery - Protein folding paper.
620 | 
621 | ## Feb 5 - Hacker Dojo 
622 | https://arxiv.org/abs/2001.04451 - Reformer, the efficient transformer 
623 | https://ai.googleblog.com/2020/01/reformer-efficient-transformer.html 
624 | 
625 | ## Jan 22 - Hacker Dojo 
626 | https://arxiv.org/pdf/1906.05717.pdf - Unsupervised Monocular Depth and Ego-motion Learning with Structure and Semantics 
627 | 
628 | ## Jan 15 - Hacker Dojo 
629 | https://arxiv.org/pdf/1912.09524.pdf - Evolving ab initio trading strategies in heterogeneous environments 
630 | 
631 | ## Jan 8 - Hacker Dojo 
632 | https://arxiv.org/pdf/1911.05892.pdf - Reinforcement Learning for Market Making in a Multi-agent Dealer Market 
633 | 
634 | 
635 | ## Dec 18 - Hacker Dojo 
636 | https://www.nature.com/articles/s41586-019-1724-z.epdf?author_access_token=lZH3nqPYtWJXfDA10W0CNNRgN0jAjWel9jnR3ZoTv0PSZcPzJFGNAZhOlk4deBCKzKm70KfinloafEF1bCCXL6IIHHgKaDkaTkBcTEv7aT-wqDoG1VeO9-wO3GEoAMF9bAOt7mJ0RWQnRVMbyfgH9A%3D%3D 
637 | https://www.gwern.net/docs/rl/2019-vinyals.pdf 
638 | https://deepmind.com/blog/article/AlphaStar-Grandmaster-level-in-StarCraft-II-using-multi-agent-reinforcement-learning 
639 | 
640 | ## Nov 20 - Hacker Dojo 
641 | https://arxiv.org/pdf/1911.04252.pdf - Self-training with Noisy Student improves ImageNet classification 
642 | 
643 | ## Nov 13 - Hacker Dojo 
644 | https://arxiv.org/pdf/1910.12713.pdf - Few-shot video-to-video synthesis 
645 | 
646 | 
647 | ## Nov 6 - Hacker Dojo 
648 | https://arxiv.org/pdf/1906.11883.pdf - Unsupervised learning of Object Keypoints for Perception and Control 
649 | 
650 | ## Oct 30 - Hacker Dojo 
651 | https://arxiv.org/pdf/1710.03748.pdf - Emergent Complexity via Multi-Agent Competition 
652 | https://openai.com/blog/competitive-self-play/ 
653 | 
654 | ## Oct 23 - Hacker Dojo 
655 | https://arxiv.org/pdf/1703.04908.pdf - Emergence of Grounded Compositional Language in Multi-Agent Populations 
656 | 
657 | ## Oct 16 - Hacker Dojo 
658 | https://arxiv.org/pdf/1909.07528.pdf - Emergent tool use from multi-agent autocurricula 
659 | https://openai.com/blog/emergent-tool-use/ 
660 | 
661 | ## Oct 9 - Hacker Dojo 
662 | https://arxiv.org/pdf/1901.00949.pdf - Machine Teaching in Hierarchical Genetic Reinforcement Learning: Curriculum Design of Reward Functions for Swarm Shepherding 
663 | 
664 | ## Sept 25 - Hacker Dojo 
665 | https://arxiv.org/pdf/1812.01729.pdf - Boltzmann Generators - Sampling equilibrium states of many-body systems with deep learning 
666 | 
667 | 
668 | ## Sept 18 - Hacker Dojo 
669 | https://arxiv.org/pdf/1907.10599.pdf - A Fine-Grained Spectral Perspective on Neural Networks 
670 | 
671 | ## Sept 11 - Hacker Dojo 
672 | https://arxiv.org/pdf/1906.08237.pdf - XLNet: Generalized autoregressive pretraining for language understanding 
673 | 
674 | ## Sept 4 - Hacker Dojo 
675 | https://arxiv.org/pdf/1905.09272.pdf - Data efficient image recognition with contrastive predictive coding. 
676 | 
677 | ## August 21 - Hacker Dojo 
678 | https://arxiv.org/pdf/1904.10509.pdf - Generating long sequences with sparse transformers 
679 | 
680 | ## August 14 - Hacker Dojo 
681 | https://arxiv.org/pdf/1807.03748.pdf - Representation learning with contrastive predictive coding. 
682 | 
683 | ## July 31 - Hacker Dojo 
684 | https://arxiv.org/pdf/1906.08253.pdf - When to trust your model: model-based policy optimization 
685 | 
686 | ## July 24 - Hacker Dojo 
687 | https://arxiv.org/pdf/1901.09321.pdf - Fixup initialization - residual learning without normalization 
688 | 
689 | 
690 | ## July 17 - Hacker Dojo 
691 | http://proceedings.mlr.press/v97/mahoney19a/mahoney19a.pdf - Traditional and heavy-tailed self-regularization in neural net models 
692 | 
693 | ## July 3 - Hacker Dojo 
694 | https://arxiv.org/pdf/1804.08838.pdf - Measuring intrinsic dimension of objective landscapes 
695 | 
696 | ## June 19 - Hacker Dojo 
697 | https://arxiv.org/abs/1810.09536 - Ordered Neurons: Integrating Tree Structures into Recurrent Neural Networks 
698 | 
699 | ## June 12 - Hacker Dojo 
700 | https://arxiv.org/pdf/1812.05159.pdf - An empirical study of example forgetting during neural network training. 
701 | 
702 | ## June 5 - Hacker Dojo 
703 | https://arxiv.org/pdf/1812.00417.pdf - Snorkel Drybell - A case study in weak supervision at industrial scale 
704 | https://arxiv.org/pdf/1905.04981.pdf - Modelling instance level annotator reliability for natural language labelling 
705 | 
706 | ## May 29 - Hacker Dojo 
707 | https://arxiv.org/pdf/1901.09321.pdf - Fixup Initialization: Residual Learning without Normalization 
708 | 
709 | ## May 22 - Hacker Dojo 
710 | https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf - Language Models are Unsupervised Multitask Learners. 
711 | 
712 | ## May 15 - Hacker Dojo 
713 | https://arxiv.org/pdf/1811.00995.pdf - Invertible Residual Networks 
714 | 
715 | ## Apr 29 - Hacker Dojo 
716 | https://arxiv.org/pdf/1904.01681.pdf - Augmented Neural ODEs 
717 | 
718 | ## Apr 8 - Hacker Dojo 
719 | https://arxiv.org/pdf/1901.00596.pdf - Comprehensive Survey of Graph Neural Nets 
720 | https://github.com/rusty1s/pytorch_geometric 
721 | 
722 | ## Apr 1 - Hacker Dojo 
723 | https://arxiv.org/pdf/1901.00596.pdf - Comprehensive Survey of Graph Neural Nets 
724 | 
725 | ## Mar 25 - Hacker Dojo 
726 | https://papers.nips.cc/paper/7539-optimal-algorithms-for-non-smooth-distributed-optimization-in-networks.pdf - NIPS award winner 
727 | 
728 | ## Mar 18 - Hacker Dojo 
729 | https://papers.nips.cc/paper/8200-non-delusional-q-learning-and-value-iteration.pdf - Non-delusional Q-learning and Value Iteration 
730 | 
731 | ## Mar 11 - Hacker Dojo 
732 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 
733 | https://github.com/jadore801120/attention-is-all-you-need-pytorch - easier to read code 
734 | https://www.youtube.com/watch?v=S0KakHcj_rs 
735 | https://tdls.a-i.science/events/2018-10-22/ 
736 | https://tdls.a-i.science/events/2019-02-04/ 
737 | http://nlp.seas.harvard.edu/2018/04/03/attention.html 
738 | 
739 | 
740 | ## Mar 4 - Hacker Dojo 
741 | https://arxiv.org/pdf/1806.02643.pdf - Re-evaluating Evaluation 
742 | 
743 | ## Feb 25 - Hacker Dojo 
744 | https://arxiv.org/pdf/1812.11951.pdf - Learning to Design RNA 
745 | 
746 | ## Feb 11 - Hacker Dojo - 
747 | https://arxiv.org/pdf/1901.02860.pdf - Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context 
748 | 
749 | ## Feb 4 - Hacker Dojo 
750 | https://arxiv.org/pdf/1809.06646.pdf - Model Free Adaptive Optimal Control of Sequential Manufacturing Process Using Reinforcement Learning 
751 | 
752 | ## January 28 - Hacker Dojo 
753 | https://arxiv.org/pdf/1806.07366.pdf - Neural Ordinary Differential Equations - Best paper, NeurIPS 2018 
754 | 
755 | ## January 21 - Hacker Dojo 
756
| https://arxiv.org/pdf/1606.05312.pdf - Successor Features for Transfer in Reinforcement Learning 
757 | http://proceedings.mlr.press/v37/schaul15.pdf - Universal Value Function Approximators 
758 | http://proceedings.mlr.press/v80/barreto18a/barreto18a.pdf - Transfer in deep reinforcement learning using successor features and generalised policy improvement. 
759 | 
760 | https://www.youtube.com/watch?v=YDCPHekLUI4&t=1053s - Tom Schaul 
761 | https://www.youtube.com/watch?v=OCHwXxSW70o - Tejas Kulkarni 
762 | 
763 | 
764 | ## January 14 - Hacker Dojo 
765 | https://arxiv.org/pdf/1812.07626.pdf - Universal Successor Features Approximators 
766 | 
767 | ## January 7 - Hacker Dojo 
768 | https://arxiv.org/pdf/1810.12715.pdf - On the Effectiveness of Interval Bound Propagation for Training Verifiably Robust Models 
769 | 
770 | ## December 17 - Hacker Dojo 
771 | https://openreview.net/pdf?id=S1x4ghC9tQ - Temporal Difference Variational Autoencoder 
772 | 
773 | 
774 | ## December 10 - Hacker Dojo 
775 | https://openreview.net/pdf?id=S1JHhv6TW - Boosting Dilated Convolution with Mixed Tensor Decompositions 
776 | 
777 | ## December 3 - Hacker Dojo 
778 | https://arxiv.org/pdf/1712.01208.pdf - The case for learned index structures 
779 | 
780 | ## November 26 - Hacker Dojo 
781 | https://arxiv.org/abs/1809.07402 - Generalization properties of nn - Socher 
782 | https://einstein.ai/research/blog/identifying-generalization-properties-in-neural-networks - blog for above paper 
783 | 
784 | ## November 19 - Hacker Dojo 
785 | https://arxiv.org/pdf/1802.05983.pdf - Disentangling by Factorising 
786 | https://arxiv.org/pdf/1804.00104.pdf - Learning Disentangled Joint, Discrete and Continuous Representations 
787 | https://arxiv.org/pdf/1807.05520.pdf - Deep Clustering for Unsupervised Learning of Visual Features 
788 | https://github.com/1Konny/FactorVAE 
789 | https://github.com/paruby/FactorVAE 
790 | https://github.com/nicolasigor/FactorVAE 
791 | 
792 | ## November 12 - Hacker Dojo 
793 | https://arxiv.org/pdf/1810.12894.pdf - Exploration by Random Network Distillation - OpenAI 
794 | 
795 | ## November 5 - Hacker Dojo 
796 | https://arxiv.org/pdf/1810.04805.pdf - BERT: pre-training of deep bidirectional transformers for language understanding 
797 | 
798 | 
799 | ## October 22 - Hacker Dojo 
800 | https://arxiv.org/pdf/1801.02613.pdf - Characterizing Adversarial Examples using Local Intrinsic Dimensionality 
801 | 
802 | 
803 | ## October 15 - Hacker Dojo 
804 | https://arxiv.org/pdf/1808.06670.pdf - Learning Deep Representations by Mutual Information Estimation and Maximization - Hjelm, Bengio 
805 | 
806 | ## October 8 - Hacker Dojo 
807 | https://arxiv.org/pdf/1802.04364.pdf - Junction Tree Variational Auto-Encoder for Molecular Graph Generation 
808 | http://snap.stanford.edu/proj/embeddings-www/files/nrltutorial-part2-gnns.pdf 
809 | 
810 | ## October 1 - Hacker Dojo 
811 | https://arxiv.org/pdf/1808.06601.pdf - Video to video synthesis 
812 | https://github.com/NVIDIA/vid2vid - code 
813 | 
814 | ## September 24 - Hacker Dojo 
815 | https://arxiv.org/pdf/1807.03146.pdf - Discovery of 3d keypoints from 2d images 
816 | 
817 | ## September 17 - Hacker Dojo 
818 | https://arxiv.org/abs/1709.02371 - PWC-Net: CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume - Deqing Sun et al. (CVPR 2018) 
819 | Phil Ferrier will present the paper and run through his code for us. 
Phil's code is on his github repo: 
820 | https://github.com/philferriere/tfoptflow 
821 | 
822 | ## September 10 - Hacker Dojo 
823 | https://arxiv.org/pdf/1807.03247.pdf - Intriguing failing (and improvement) of CNNs for coordinate transforms (CoordConv). 
824 | 
825 | ## September 3 - Hacker Dojo 
826 | https://arxiv.org/pdf/1803.03324.pdf - Learning Deep Generative Models of Graphs 
827 | 
828 | ## August 27 - Hacker Dojo 
829 | https://arxiv.org/abs/1709.10082 - Optimally decentralized multi-robot collision avoidance with reinforcement learning. 
830 | 
831 | https://github.com/TensorSwarm/TensorSwarm - Andreas Pasternak code for above 
832 | 
833 | ## August 13 - Hacker Dojo 
834 | https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/learning-dexterity/learning-dexterity-paper.pdf - Robot doing single-hand manipulations. 
835 | https://www.theverge.com/2018/7/30/17621112/openai-robot-dexterity-dactyl-artificial-intelligence 
836 | 
837 | ## July 30 - Hacker Dojo - 
838 | https://arxiv.org/pdf/1711.03953.pdf - Breaking the softmax bottleneck 
839 | https://arxiv.org/pdf/1805.10829.pdf - SigSoftMax: Reanalyzing the softmax bottleneck 
840 | https://severelytheoretical.wordpress.com/2018/06/08/the-softmax-bottleneck-is-a-special-case-of-a-more-general-phenomenon/ 
841 | 
842 | ## July 23 - Hacker Dojo - 
843 | https://arxiv.org/pdf/1807.01281.pdf - Human-level performance in first-person multiplayer games with population-based reinforcement learning. 
844 | https://deepmind.com/blog/capture-the-flag/ 
845 | https://www.youtube.com/watch?v=steioHoiEms 
846 | https://arxiv.org/abs/1711.09846v2 
847 | https://arxiv.org/pdf/1611.05397.pdf 
848 | 
849 | ## July 16 - Hacker Dojo 
850 | https://arxiv.org/pdf/1803.10122.pdf - World Models - Schmidhuber paper on RL 
851 | 
852 | ## July 9 - Hacker Dojo 
853 | https://deepmind.com/research/publications/neural-scene-representation-and-rendering/ - Rendering 3d scenes 
854 | 
855 | ## July 2 - Hacker Dojo - 
856 | https://arxiv.org/pdf/1707.06347.pdf - Proximal Policy Optimization 
857 | 
858 | ## June 25 - Hacker Dojo 
859 | https://openreview.net/pdf?id=BJOFETxR- - Learning to represent programs with graphs 
860 | 
861 | ## June 18 - Hacker Dojo 
862 | https://openreview.net/pdf?id=BkisuzWRW - Zero Shot Visual Imitation - Reinforcement Learning 
863 | 
864 | 
865 | ## June 11 - Hacker Dojo 
866 | https://openreview.net/forum?id=HkL7n1-0b - Wasserstein Auto Encoders - one of ICLR top papers. 
867 | 868 | ## June 4 - Hacker Dojo 869 | https://openreview.net/pdf?id=Hy7fDog0b - Ambient GAN - Generative Models from Lossy Measurements - ICLR top paper 870 | 871 | 872 | ## May 21 - Hacker Dojo 873 | https://arstechnica.com/science/2018/05/ai-trained-to-navigate-develops-brain-like-location-tracking/ - Grid representations in rat brain 874 | https://deepmind.com/documents/200/Banino_at_al_final.pdf -- 875 | https://www.nature.com/articles/s41586-018-0102-6 -- 876 | 877 | 878 | 879 | ## May 14 - Hacker Dojo 880 | https://arxiv.org/pdf/1712.06567.pdf - Deep Neuroevolution: Genetic Algorithms are a Competitive Alternative for 881 | Training Deep Neural Networks for Reinforcement Learning 882 | https://arxiv.org/pdf/1712.06560.pdf - Improving Exploration in Evolution Strategies for Deep Reinforcement 883 | Learning via a Population of Novelty-Seeking Agents 884 | https://eng.uber.com/deep-neuroevolution/ - Uber engineering blog post 885 | 886 | ## May 7 - Hacker Dojo 887 | https://arxiv.org/pdf/1801.10130.pdf - spherical CNN 888 | 889 | ## Apr 30 - Hacker Dojo 890 | https://arxiv.org/pdf/1710.07313.pdf - Using machine learning to replicate chaotic attractors 891 | http://www.bmp.ds.mpg.de/tl_files/bmp/preprints/Zimmermann_Parlitz_preprint.pdf - paper to be published in "chaos" 892 | https://www.quantamagazine.org/machine-learnings-amazing-ability-to-predict-chaos-20180418/ - blog post 893 | 894 | 895 | ## Apr 23 - Hacker Dojo 896 | https://arxiv.org/pdf/1711.10925.pdf - Deep Image Prior 897 | https://dmitryulyanov.github.io/deep_image_prior - git hub from authors 898 | https://box.skoltech.ru/index.php/s/ib52BOoV58ztuPM 899 | http://mlexplained.com/2018/01/18/paper-dissected-deep-image-prior-explained/ 900 | http://fortune.com/2018/04/24/nvidia-artificial-intelligence-images/ - Article w video showing photo editing use 901 | 902 | ## Apr 16 - Hacker Dojo 903 | Finish Fractal AI 904 | https://arxiv.org/pdf/1711.07971.pdf - non-local filtering 905 | 906 | 907 | ## Apr 9 - Hacker Dojo 908 | http://lanl.arxiv.org/pdf/1803.05049v1 - Fractal AI 909 | 910 | ## Apr 2 - Hacker Dojo 911 | https://arxiv.org/pdf/1803.04831.pdf - IndRNN longer deeper RNN's 912 | 913 | ## Mar 26 - Hacker Dojo 914 | https://arxiv.org/pdf/1711.10433.pdf - parallel wavenet 915 | https://arxiv.org/pdf/1708.04552.pdf - regularizing convnet with cutout (desert paper) 916 | http://www.cs.toronto.edu/~jmartens/docs/Deep_HessianFree.pdf - will get short presentation on this one. 917 | 918 | ## Mar 19 - Hacker Dojo 919 | https://arxiv.org/pdf/1802.03268.pdf - Efficient Neural Architecture Search via Parameter Sharing 920 | https://github.com/carpedm20/ENAS-pytorch 921 | 922 | some related papers and reviews. 
923 | https://arxiv.org/pdf/1708.05344.pdf - One shot architecture search 
924 | https://openreview.net/forum?id=ByQZjx-0- 
925 | and 
926 | https://openreview.net/forum?id=rydeCEhs- 
927 | 
928 | 
929 | ## Mar 12 - Hacker Dojo 
930 | https://arxiv.org/abs/1703.10135 - tacotron - end-to-end speech synthesis 
931 | https://arxiv.org/pdf/1712.05884.pdf - tacotron 2 
932 | https://research.googleblog.com/2017/12/tacotron-2-generating-human-like-speech.html - 
933 | https://github.com/A-Jacobson/tacotron2 - pytorch code 
934 | http://research.baidu.com/deep-speech-3%EF%BC%9Aexploring-neural-transducers-end-end-speech-recognition/ 
935 | 
936 | ## Feb 26 - Hacker Dojo 
937 | https://arxiv.org/pdf/1705.09792.pdf - Deep Complex Networks 
938 | 
939 | 
940 | ## Feb 19 - Hacker Dojo 
941 | https://arxiv.org/pdf/1801.10308.pdf - Nested LSTMs 
942 | https://arxiv.org/pdf/1705.10142.pdf - KRU from FAIR 
943 | https://github.com/hannw/nlstm - tf code for Nested LSTM 
944 | 
945 | ## Feb 12 - Hacker Dojo 
946 | http://openaccess.thecvf.com/content_cvpr_2017/papers/Khoreva_Simple_Does_It_CVPR_2017_paper.pdf - Weakly Supervised Instance and Semantic Segmentation 
947 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/weakly-supervised-learning/simple-does-it-weakly-supervised-instance-and-semantic-segmentation/ 
948 | https://github.com/philferriere/tfwss - Phil Ferriere's code 
949 | https://drive.google.com/file/d/1wPHMA4PqygawvIxRiy-2ZMKcpUO447cz/view?usp=sharing - Mehul's notebook on segmentation 
950 | 
951 | ## Feb 5 - Hacker Dojo 
952 | https://arxiv.org/pdf/1511.06939.pdf - using RNNs for recommendation systems 
953 | https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46488.pdf - latest paper on RNNs for recommendation 
954 | 
955 | ## Jan 29 - Hacker Dojo 
956 | https://arxiv.org/pdf/1709.04511.pdf - Empirical study of multi-agent RL 
957 | https://github.com/geek-ai/1m-agents - code 
958 | 
959 | ## Jan 22 - Hacker Dojo 
960 | https://arxiv.org/pdf/1704.00028.pdf - Improvements in Wasserstein GAN training 
961 | 
962 | ## Jan 15 - Hacker Dojo 
963 | 
964 | https://arxiv.org/pdf/1710.02298.pdf - Combining improvements in deep reinforcement learning 
965 | 
966 | ## Jan 8 - Hacker Dojo 
967 | https://openreview.net/pdf?id=HJWLfGWRb - follow-on to capsule network paper 
968 | https://www.youtube.com/watch?v=pPN8d0E3900 
969 | https://www.youtube.com/watch?v=2Kawrd5szHE 
970 | https://github.com/ageron/handson-ml/blob/master/extra_capsnets.ipynb 
971 | https://github.com/naturomics/CapsNet-Tensorflow 
972 | https://medium.com/ai%C2%B3-theory-practice-business/understanding-hintons-capsule-networks-part-ii-how-capsules-work-153b6ade9f66 
973 | 
974 | ## Dec 11 - Hacker Dojo 
975 | https://arxiv.org/pdf/1710.09829.pdf - Dynamic routing between capsules - Hinton 
976 | 
977 | ## Nov 27 - Hacker Dojo 
978 | https://arxiv.org/pdf/1701.01724.pdf - DeepStack: Expert-Level Artificial Intelligence in Heads-Up No-Limit Poker 
979 | 
980 | 
981 | ## Nov 13 - Hacker Dojo 
982 | https://deepmind.com/documents/119/agz_unformatted_nature.pdf - AlphaGo Zero paper 
983 | https://webdocs.cs.ualberta.ca/~mmueller/talks/2016-LeeSedol-AlphaGo.pdf - some slides 
984 | 
985 | 
986 | ## Nov 6 - Hacker Dojo 
987 | https://arxiv.org/pdf/1703.10593.pdf - cycle consistent GANs 
988 | 
989 | ## Oct 30 - Hacker Dojo 
990 | https://arxiv.org/pdf/1503.02406.pdf - Naftali Tishby and Noga Zaslavsky. The information bottleneck principle. 
991 | 
992 | https://www.cs.huji.ac.il/labs/learning/Papers/allerton.pdf - Naftali Tishby, Fernando C. 
Pereira, and William Bialek. The information bottleneck method. 
993 | 
994 | https://www.reddit.com/r/MachineLearning/comments/75uua6/r_2_hr_talk_information_theory_of_deep_learning/ 
995 | 
996 | ## Oct 23 - Hacker Dojo 
997 | 
998 | Mask R-CNN 
999 | https://arxiv.org/abs/1703.06870 
1000 | 
1001 | 
1002 | And these are prerequisites (read at least Fast R-CNN and Faster R-CNN) 
1003 | 
1004 | R-CNN 
1005 | https://arxiv.org/abs/1311.2524 
1006 | 
1007 | Fast R-CNN 
1008 | https://arxiv.org/pdf/1504.08083.pdf 
1009 | 
1010 | Faster R-CNN 
1011 | https://arxiv.org/abs/1506.01497 
1012 | Feature Pyramid Networks - https://arxiv.org/abs/1612.03144 
1013 | 
1014 | 
1015 | ## Oct 16 - Hacker Dojo 
1016 | https://arxiv.org/pdf/1703.00810.pdf - Opening the Black Box of Deep Neural Networks via Information 
1017 | https://www.youtube.com/watch?v=ekUWO_pI2M8 
1018 | https://www.youtube.com/watch?v=bLqJHjXihK8 
1019 | 
1020 | ## Oct 9 - Hacker Dojo 
1021 | https://arxiv.org/pdf/1501.00092.pdf - super resolution first paper 
1022 | https://arxiv.org/abs/1608.00367 - super resolution second paper 
1023 | 
1024 | ## Oct 2 - Hacker Dojo 
1025 | https://arxiv.org/abs/1604.03901 - Single-Image Depth Perception in the Wild 
1026 | 
1027 | ## Sept 25 - Hacker Dojo 
1028 | https://arxiv.org/pdf/1706.08947.pdf - Exploring generalization in deep networks. 
1029 | 
1030 | ## Sept 18 - Hacker Dojo 
1031 | https://arxiv.org/pdf/1705.02550.pdf - nvidia drone nav 
1032 | https://github.com/NVIDIA-Jetson/redtail/wiki - code 
1033 | 
1034 | ## Sept 11 - Hacker Dojo 
1035 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.365.5060&rep=rep1&type=pdf - hyperneat ref 
1036 | https://arxiv.org/pdf/1609.09106.pdf - Hypernet ref 
1037 | http://blog.otoro.net/2016/09/28/hyper-networks/ - blog on hypernet 
1038 | https://www.youtube.com/watch?v=-8oyTYViuJ4 - vid on hyperNeat 
1039 | http://eplex.cs.ucf.edu/hyperNEATpage/HyperNEAT.html - blog on hyperNeat 
1040 | 
1041 | ## August 28 - Hacker Dojo 
1042 | https://arxiv.org/pdf/1708.05344.pdf - SMASH: One-Shot Model Architecture Search through HyperNetworks 
1043 | https://www.youtube.com/watch?v=79tmPL9AL48 - youtube vid on SMASH 
1044 | 
1045 | ## August 21 - Hacker Dojo 
1046 | https://arxiv.org/pdf/1706.02515.pdf - Self Normalizing Neural Networks - Hochreiter 
1047 | 
1048 | ## August 14 - Hacker Dojo 
1049 | https://arxiv.org/pdf/1606.01541.pdf - Reinforcement Learning for Dialog Generation - Jurafsky 
1050 | https://github.com/liuyuemaicha/Deep-Reinforcement-Learning-for-Dialogue-Generation-in-tensorflow - tensorflow code for same 
1051 | https://github.com/jiweil/ - some related code 
1052 | https://arxiv.org/pdf/1612.00563.pdf - self-critical training for image captioning - RL for text problems. 
1053 | 
1054 | Some papers referenced by the Jurafsky paper 
1055 | [1506.05869] A Neural Conversational Model - Vinyals and Le 
1056 | https://arxiv.org/abs/1604.04562 - Dialogue generation system - Wen 
1057 | 
1058 | 
1059 | ## Aug 7 - Hacker Dojo 
1060 | https://arxiv.org/pdf/1705.04304.pdf - A Deep Reinforced Model for Abstractive Summarization - Socher 
1061 | 
1062 | ## July 31 - Hacker Dojo 
1063 | https://arxiv.org/pdf/1706.01433.pdf - visual interaction networks - deep mind 
1064 | https://arxiv.org/pdf/1706.01427.pdf - neural model for relational reasoning - deep mind 
1065 | 
1066 | 
1067 | ## July 24 
1068 | Guest Speaker - Using FPGA to speed up CNNs. 
1069 | https://arxiv.org/pdf/1703.03130.pdf - A structured self-attentive sentence embedding - Lin and Bengio 1070 | https://github.com/dennybritz/deeplearning-papernotes/blob/master/notes/self_attention_embedding.md (review) 1071 | https://github.com/yufengm/SelfAttentive code 1072 | https://github.com/Diego999/SelfSent code 1073 | 1074 | ## July 17 - Hacker Dojo 1075 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 1076 | https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models 1077 | https://github.com/jadore801120/attention-is-all-you-need-pytorch - easier to read code 1078 | https://arxiv.org/pdf/1607.06450.pdf - layer normalization paper - hinton 1079 | https://www.youtube.com/watch?v=nR74lBO5M3s - google translate paper - youtube video 1080 | https://arxiv.org/pdf/1609.08144.pdf - google translate paper - 1081 | 1082 | ## July 10 - Hacker Dojo 1083 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 1084 | https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models 1085 | https://github.com/jadore801120/attention-is-all-you-need-pytorch - easier to read code 1086 | https://arxiv.org/pdf/1607.06450.pdf - layer normalization paper - hinton 1087 | 1088 | 1089 | #### Some added references regarding positional encodings 1090 | http://www.machinelearning.org/proceedings/icml2006/047_Connectionist_Tempor.pdf - A. Graves, S. Fernandez, F. Gomez, and J. Schmidhuber 1091 | https://www.reddit.com/r/MachineLearning/comments/6jdi87/r_question_about_positional_encodings_used_in/ 1092 | 1093 | 1094 | ## June 26 - Hacker Dojo 1095 | https://arxiv.org/pdf/1705.03122.pdf - convolutional sequence to sequence learning 1096 | https://arxiv.org/pdf/1706.03762.pdf - attention is all you need - Vaswani 1097 | http://www.machinelearning.org/proceedings/icml2006/047_Connectionist_Tempor.pdf - A. Graves, S. Fernandez, F. Gomez, and J. Schmidhuber 1098 | 1099 | 1100 | ## June 19 - Hacker Dojo 1101 | https://arxiv.org/pdf/1701.02720.pdf - RNN for end to end voice recognition 1102 | 1103 | 1104 | ## June 12 - Hacker Dojo 1105 | New reinforcement learning results -- Too cool for school. Watch the video and you'll be hooked. 1106 | https://www.youtube.com/watch?v=2vnLBb18MuQ&feature=em-subs_digest 1107 | 1108 | http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/index.html - paper 1109 | 1110 | 1111 | ## May 22 - Hacker Dojo 1112 | https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/HintonDengYuEtAl-SPM2012.pdf - comparison of RNN and HMM for speech recognition 1113 | 1114 | ## May 15 - Hacker Dojo 1115 | https://arxiv.org/pdf/1412.6572.pdf - Explaining and Harnessing Adversarial Examples 1116 | 1117 | 1118 | ## May 1 - Hacker Dojo 1119 | https://arxiv.org/abs/1704.03453 - The Space of Transferable Adversarial Examples 1120 | 1121 | 1122 | ## Apr 24 - Hacker Dojo 1123 | https://discourse-production.oss-cn-shanghai.aliyuncs.com/original/3X/1/5/15ba4cef726cab390faa180eb30fd82b693469f9.pdf - Using TPU for data center 1124 | 1125 | 1126 | ## Apr 17 - Hacker Dojo 1127 | Reservoir Computing by Felix Grezes. 
1128 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Survey_Felix_Grezes_9_04_2014.pdf 1129 | 1130 | Slides by Felix Grezes: Reservoir Computing for Neural Networks 1131 | http://www.gc.cuny.edu/CUNY_GC/media/Computer-Science/Student%20Presentations/Felix%20Grezes/Second_Exam_Slides_Felix_Grezes_9-14-2014.pdf 1132 | (more at: http://speech.cs.qc.cuny.edu/~felix/ ) 1133 | 1134 | This is a short, very useful backgrounder on randomized projections, 1135 | here used for compressed sensing, in a blog post by Terence Tao 1136 | https://terrytao.wordpress.com/2007/04/13/compressed-sensing-and-single-pixel-cameras/ 1137 | 1138 | and the same story told with illustrations on the Nuit Blanche blog: 1139 | http://nuit-blanche.blogspot.com/2007/07/how-does-rice-one-pixel-camera-work.html 1140 | 1141 | (BTW http://nuit-blanche.blogspot.com is a tremendous website.) 1142 | 1143 | --- 1144 | 1145 | If we have time, we may discuss this paper: 1146 | 1147 | Information Processing Using a Single Dynamical Node as Complex System. 1148 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3195233/pdf/ncomms1476.pdf 1149 | 1150 | 1151 | ## Apr 10 - Hacker Dojo 1152 | 1153 | https://arxiv.org/pdf/1603.08678.pdf - Instance-sensitive Fully Convolutional Networks 1154 | 1155 | https://arxiv.org/pdf/1611.07709.pdf - Fully Convolutional Instance-aware Semantic Segmentation 1156 | 1157 | ## Apr 3 - Hacker Dojo 1158 | https://arxiv.org/pdf/1703.03864.pdf - Sutskever paper on using evolutionary systems for optimizing RL prob 1159 | http://jmlr.csail.mit.edu/papers/volume15/wierstra14a/wierstra14a.pdf - ES paper with algo used in Sutskever paper 1160 | 1161 | 1162 | ## Mar 27 - Hacker Dojo 1163 | Aurobindo Tripathy will reprise a talk he's going to give at Embedded Summit this year. His talk will survey recent progress in object detection from RCNN to Single Shot MultiBox Detector and Yolo 9000. 1164 | 1165 | 1166 | ## Mar 20 - Hacker Dojo 1167 | https://arxiv.org/pdf/1612.05424.pdf - Unsupervised Pixel-level domain adaptation with generative adversarial networks 1168 | 1169 | ## Mar 13 - Hacker Dojo 1170 | https://arxiv.org/pdf/1701.06547.pdf - adversarial learning for neural dialog generation 1171 | 1172 | ## February 27 - Hacker Dojo 1173 | https://arxiv.org/pdf/1612.02699.pdf - Deep Supervision with Shape Concepts for Occlusion-Aware 3D Object Parsing 1174 | Zeeshan's slides are in the folder with his name on it. Along with his descriptions of his own ground-breaking work, he gives an excellent history of efforts to identify 3d objects from 2d images. 1175 | 1176 | 1177 | ## February 20 - Hacker Dojo 1178 | https://arxiv.org/pdf/1506.07285.pdf - Ask me anything - Socher 1179 | https://github.com/YerevaNN/Dynamic-memory-networks-in-Theano - Code and implementation notes. 
1180 | https://www.youtube.com/watch?v=FCtpHt6JEI8&t=27s - Socher presentation of material 
1181 | 
1182 | 
1183 | ## February 13 - Hacker Dojo 
1184 | https://arxiv.org/pdf/1701.06538v1.pdf - Outrageously large neural networks 
1185 | 
1186 | ## February 6 - Hacker Dojo 
1187 | 
1188 | https://arxiv.org/pdf/1505.00387v2.pdf - Highway networks 
1189 | https://arxiv.org/pdf/1507.06228.pdf - Also highway networks - different examples 
1190 | https://arxiv.org/pdf/1607.03474v3.pdf - Recurrent Highway Networks 
1191 | 
1192 | 
1193 | ## January 30 - Hacker Dojo 
1194 | https://arxiv.org/pdf/1603.03116v2.pdf - Low-rank pass-through RNNs - follow-on to unitary RNN 
1195 | https://github.com/Avmb/lowrank-gru - theano code 
1196 | 
1197 | ## January 23 - HackerDojo 
1198 | https://arxiv.org/abs/1612.03242 - StackGAN paper 
1199 | https://github.com/hanzhanggit/StackGAN - Code 
1200 | 
1201 | ## January 16 - Hacker Dojo 
1202 | https://arxiv.org/pdf/1511.06464v4.pdf - Unitary Evolution RNN 
1203 | https://github.com/amarshah/complex_RNN - theano code 
1204 | 
1205 | ## January 9 - Hacker Dojo 
1206 | Cheuksan Edward Wang Talk 
1207 | https://arxiv.org/pdf/1612.04642v1.pdf - rotation invariant cnn 
1208 | https://github.com/deworrall92/harmonicConvolutions - tf code for harmonic cnn 
1209 | http://visual.cs.ucl.ac.uk/pubs/harmonicNets/index.html - blog post by authors 
1210 | 
1211 | ## January 2 - Hacker Dojo 
1212 | https://arxiv.org/pdf/1602.02218v2.pdf - using typing to improve RNN behavior 
1213 | http://jmlr.org/proceedings/papers/v37/jozefowicz15.pdf - exploration of alternative LSTM architectures 
1214 | 
1215 | ## December 19 - Hacker Dojo 
1216 | https://arxiv.org/pdf/1611.01576.pdf - Socher QRNN paper 
1217 | 
1218 | ## December 12 - Hacker Dojo 
1219 | https://arxiv.org/pdf/1604.02135v2.pdf - latest segmentation from FAIR 
1220 | https://github.com/MarvinTeichmann/tensorflow-fcn - code for segmenter 
1221 | 
1222 | ## December 5 - Hacker Dojo 
1223 | https://arxiv.org/pdf/1506.06204.pdf - Object segmentation 
1224 | https://arxiv.org/pdf/1603.08695v2.pdf - refinement of above segmentation paper 
1225 | https://code.facebook.com/posts/561187904071636/segmenting-and-refining-images-with-sharpmask/ - blog post 
1226 | https://github.com/facebookresearch/deepmask - torch code for deepmask 
1227 | 
1228 | 
1229 | ## November 28 - Hacker Dojo 
1230 | https://arxiv.org/pdf/1506.01497v3.pdf - Faster R-CNN 
1231 | people.eecs.berkeley.edu/~rbg/slides/rbg-defense-slides.pdf - Girshick thesis slides 
1232 | Check edge boxes and selective search 
1233 | https://arxiv.org/pdf/1406.4729v4.pdf - key part of architecture 
1234 | https://github.com/smallcorgi/Faster-RCNN_TF - excellent code 
1235 | 
1236 | 
1237 | ## November 21 - Hacker Dojo 
1238 | https://people.eecs.berkeley.edu/~rbg/papers/r-cnn-cvpr.pdf - R-CNN - first in the series 
1239 | https://arxiv.org/pdf/1504.08083v2.pdf - Fast R-CNN 
1240 | https://arxiv.org/pdf/1506.01497v3.pdf - Faster R-CNN 
1241 | http://techtalks.tv/talks/rich-feature-hierarchies-for-accurate-object-detection-and-semantic-segmentation/60254/ - video of Girshick talk 
1242 | 
1243 | 
1244 | ## November 14 - Hacker Dojo 
1245 | https://arxiv.org/pdf/1506.02025v3.pdf - Spatial transformer networks 
1246 | https://github.com/daviddao/spatial-transformer-tensorflow - tf code for above 
1247 | 
1248 | ## October 31 - Hacker Dojo 
1249 | https://github.com/jazzsaxmafia/show_attend_and_tell.tensorflow - tf code for attention-captioning 
1250 | http://cs.stanford.edu/people/karpathy/densecap/ - karpathy captioning 
1251 | https://arxiv.org/pdf/1412.2306v2.pdf - earlier 
karpathy captioning paper 
1252 | 
1253 | 
1254 | ## October 20 - Galvanize 
1255 | https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html - Deep dive into reinforcement learning - Sutton and Barto - Chapters 1 and 2. 
1256 | 
1257 | ## Oct 17 - Hacker Dojo 
1258 | https://arxiv.org/pdf/1608.06993v1.pdf - DenseNet. New reigning champion image classifier 
1259 | https://github.com/liuzhuang13/DenseNet - lua code 
1260 | The DenseNet paper is straightforward, so we're also going to start on image captioning 
1261 | 
1262 | http://www.cs.toronto.edu/~zemel/documents/captionAttn.pdf 
1263 | http://kelvinxu.github.io/projects/capgen.html 
1264 | http://people.ee.duke.edu/~lcarin/Yunchen9.25.2015.pdf - slides for caption attention 
1265 | 
1266 | collections of captioning papers. 
1267 | https://github.com/kjw0612/awesome-deep-vision#image-captioning - images 
1268 | https://github.com/kjw0612/awesome-deep-vision#video-captioning - video 
1269 | 
1270 | ## Oct 13 - SF 
1271 | http://www.mit.edu/~dimitrib/NDP_Encycl.pdf - (early) Bertsekas paper on RL, policy and value iteration 
1272 | http://www.nervanasys.com/demystifying-deep-reinforcement-learning/?imm_mid=0e2d7e&cmp=em-data-na-na-newsltr_20160420 - blog post on RL. Nice coverage of value iteration 
1273 | 
1274 | ## Oct 10 - Hacker Dojo 
1275 | https://github.com/carpedm20/pixel-rnn-tensorflow - tensorflow code for pixel rnn (and cnn) 
1276 | 
1277 | ## Sept 19 - Hacker Dojo 
1278 | https://arxiv.org/pdf/1606.05328v2.pdf - Conditional Image Generation with PixelCNN decoders 
1279 | https://arxiv.org/pdf/1601.06759v3.pdf - Pixel RNN 
1280 | https://drive.google.com/file/d/0B3cxcnOkPx9AeWpLVXhkTDJINDQ/view - wavenet Generative Audio 
1281 | https://deepmind.com/blog/wavenet-generative-model-raw-audio/ - wavenet blog 
1282 | 
1283 | ## Sept 15 - Galvanize SF 
1284 | http://www.gitxiv.com/posts/fepYG4STYaej3KSPZ/densely-connected-convolutional-netowork-densenet 
1285 | 
1286 | 
1287 | ## Sept 12 - Hacker Dojo 
1288 | http://arxiv.org/pdf/1410.3916v11.pdf - original memory networks 
1289 | https://arxiv.org/pdf/1606.03126v1.pdf - key/value memory augmented nn 
1290 | http://www.thespermwhale.com/jaseweston/icml2016/icml2016-memnn-tutorial.pdf#page=87 - tutorial on memory networks in language understanding 
1291 | 
1292 | ## August 29 - Hacker Dojo 
1293 | https://arxiv.org/pdf/1410.5401v2.pdf - Neural Turing Machines 
1294 | https://github.com/carpedm20/NTM-tensorflow 
1295 | https://www.youtube.com/watch?v=_H0i0IhEO2g - Alex Graves presentation at Microsoft Research 
1296 | http://www.robots.ox.ac.uk/~tvg/publications/talks/NeuralTuringMachines.pdf - slides for ntm 
1297 | 
1298 | ## August 25 - Galvanize (SF) 
1299 | http://arxiv.org/pdf/1410.3916v11.pdf - original memory networks 
1300 | https://arxiv.org/pdf/1606.03126v1.pdf - key/value memory augmented nn 
1301 | http://www.thespermwhale.com/jaseweston/icml2016/icml2016-memnn-tutorial.pdf#page=87 - tutorial on memory networks in language understanding 
1302 | 
1303 | ## August 22 - Hacker Dojo 
1304 | https://arxiv.org/pdf/1605.07648v1.pdf - fractal net - alternative to resnet for ultra-deep convolution 
1305 | https://github.com/edgelord/FractalNet - tf code 
1306 | http://www.gitxiv.com/posts/ibA8QEu8bvBJSDxr9/fractalnet-ultra-deep-neural-networks-without-residuals 
1307 | 
1308 | ## August 18, 2016 - Galvanize (SF) 
1309 | https://arxiv.org/pdf/1602.01783v2.pdf - new RL architecture - deep mind 
1310 | 
1311 | Code: 
1312 | https://github.com/Zeta36/Asynchronous-Methods-for-Deep-Reinforcement-Learning - tf 
1313 | 
https://github.com/miyosuda/async_deep_reinforce - tf 1314 | https://github.com/coreylynch/async-rl - keras (tf) 1315 | https://github.com/muupan/async-rl - chainer (good discussion) 1316 | 1317 | ## August 15, 2016 - Hacker Dojo 1318 | https://arxiv.org/pdf/1607.02533v1.pdf - Hardening deep networks to adversarial examples. 1319 | 1320 | ## August 11, 2016 - Galvanize (SF) 1321 | http://www.gitxiv.com/posts/HQJ3F9YzsQZ3eJjpZ/model-free-episodic-control - deep mind gitxiv paper and code on github 1322 | https://github.com/sudeepraja/Model-Free-Episodic-Control - other code 1323 | https://github.com/ShibiHe/Model-Free-Episodic-Control 1324 | 1325 | ## August 8, 2016 - Hacker Dojo 1326 | https://arxiv.org/pdf/1406.2661.pdf - originating paper on generative adversarial net (gan) - goodfellow, bengio 1327 | http://arxiv.org/pdf/1511.06434v2.pdf - deep cnn gan - radford 1328 | https://github.com/Newmu/dcgan_code - theano code for cnn gan - radford 1329 | 1330 | ## August 4, 2016 - Galvanize (SF) 1331 | http://www.gitxiv.com/posts/HQJ3F9YzsQZ3eJjpZ/model-free-episodic-control - deep mind gitxiv paper and code on github 1332 | 1333 | ## August 1, 2016 - Hacker Dojo 1334 | Papers - 1335 | https://drive.google.com/file/d/0B8Dg3PBX90KNWG5KQXNQOFlBLU1JWWVONkN1UFpnbUR6Y0cw/view?pref=2&pli=1 - Using Stochastic RNN for temporal anomaly detection 1336 | https://home.zhaw.ch/~dueo/bbs/files/vae.pdf - cover math 1337 | https://arxiv.org/pdf/1401.4082v3.pdf - Rezende - Other Original VAE paper 1338 | 1339 | Code Review - 1340 | https://github.com/oduerr/dl_tutorial/blob/master/tensorflow/vae/vae_demo.ipynb 1341 | https://github.com/oduerr/dl_tutorial/blob/master/tensorflow/vae/vae_demo-2D.ipynb 1342 | 1343 | ## July 28, 2016 - SF 1344 | Papers: 1345 | http://arxiv.org/pdf/1410.5401v2.pdf - Neural Turing Machines - Graves et. al. 1346 | https://arxiv.org/pdf/1605.06065v1.pdf - One Shot Learning - DeepMind 1347 | 1348 | Code: 1349 | http://icml.cc/2016/reviews/839.txt 1350 | https://github.com/brendenlake/omniglot 1351 | https://github.com/tristandeleu/ntm-one-shot 1352 | https://github.com/MLWave/extremely-simple-one-shot-learning 1353 | 1354 | ## July 25, 2016 - Hacker Dojo 1355 | Papers - Using VAE for anomaly detection 1356 | https://arxiv.org/pdf/1411.7610.pdf - Stochastic Recurrent Networks 1357 | https://drive.google.com/file/d/0B8Dg3PBX90KNWG5KQXNQOFlBLU1JWWVONkN1UFpnbUR6Y0cw/view?pref=2&pli=1 - Using Stochastic RNN for temporal anomaly detection 1358 | 1359 | 1360 | ## July 21, 2016 - SF 1361 | Papers to read: 1362 | http://www.thespermwhale.com/jaseweston/ram/papers/paper_16.pdf 1363 | http://snowedin.net/tmp/Hochreiter2001.pdf - 1364 | 1365 | Comments / Code 1366 | http://icml.cc/2016/reviews/839.txt 1367 | https://github.com/brendenlake/omniglot 1368 | https://github.com/tristandeleu/ntm-one-shot 1369 | https://github.com/MLWave/extremely-simple-one-shot-learning 1370 | https://www.periscope.tv/hugo_larochelle/1ypJdnPRYEoKW 1371 | 1372 | 1373 | 1374 | ## July 18, 2016 - Hacker Dojo 1375 | Papers to read: 1376 | http://arxiv.org/pdf/1312.6114v10.pdf - variational autoencoders - U of Amsterdam - Kingma and Welling 1377 | http://arxiv.org/pdf/1310.8499v2.pdf - deep autoregressive networks - deep mind 1378 | https://arxiv.org/abs/1606.05908 - tutorial on vae 1379 | 1380 | Commentaries/Code 1381 | https://jmetzen.github.io/2015-11-27/vae.html - metzen - code and discussion 1382 | http://blog.keras.io/building-autoencoders-in-keras.html - chollet - discusses different autoencoders, gives keras code. 
1383 | 
1384 | 
1385 | 
1386 | ## June 27, July 11 2016 - Hacker Dojo 
1387 | Recurrent network for image generation - Deep Mind 
1388 | https://arxiv.org/pdf/1502.04623v2.pdf 
1389 | Background and some references cited 
1390 | http://blog.evjang.com/2016/06/understanding-and-implementing.html - blog with code for VAE 
1391 | http://arxiv.org/pdf/1312.6114v10.pdf - Variational Auto Encoder 
1392 | https://jmetzen.github.io/2015-11-27/vae.html - tf code for variational auto-encoder 
1393 | https://www.youtube.com/watch?v=P78QYjWh5sM 
1394 | 
1395 | https://arxiv.org/pdf/1401.4082.pdf - stochastic backpropagation and approx inference - deep mind 
1396 | http://www.cs.toronto.edu/~fritz/absps/colt93.html - keeping neural networks simple by minimizing the description length of the weights - Hinton 
1397 | https://github.com/vivanov879/draw - code 
1398 | 
1399 | 
1400 | ## June 20, 2016 - Peninsula 
1401 | Recurrent models of visual attention - Deep Mind 
1402 | https://papers.nips.cc/paper/5542-recurrent-models-of-visual-attention.pdf 
1403 | 
1404 | ## June 23, 29 2016 - SF 
1405 | http://arxiv.org/pdf/1410.5401v2.pdf - Neural Turing Machines - Graves et. al. 
1406 | https://arxiv.org/pdf/1605.06065v1.pdf - One Shot Learning - DeepMind 
1407 | http://www.shortscience.org/paper?bibtexKey=journals/corr/1605.06065 - Larochelle comments on One-Shot paper 
1408 | https://github.com/shawntan/neural-turing-machines - Code 
1409 | https://www.reddit.com/r/MachineLearning/comments/2xcyrl/i_am_j%C3%BCrgen_schmidhuber_ama/cp4ecce - Schmidhuber's comments 
1410 | http://www.thespermwhale.com/jaseweston/ram/papers/paper_16.pdf 
1411 | http://snowedin.net/tmp/Hochreiter2001.pdf - 
1412 | Reviews: 
1413 | http://icml.cc/2016/reviews/839.txt 
1414 | Code 
1415 | https://github.com/brendenlake/omniglot 
1416 | https://github.com/tristandeleu/ntm-one-shot 
1417 | https://github.com/MLWave/extremely-simple-one-shot-learning 
1418 | 
1419 | ## June 13, 2016 - TBD, Peninsula 
1420 | Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning: 
1421 | http://arxiv.org/pdf/1602.07261v1.pdf 
1422 | 
1423 | ## June 9, 2016 - Galvanize 
1424 | Visualizing and Understanding RNN: 
1425 | https://arxiv.org/pdf/1506.02078v2.pdf 
1426 | 
1427 | ## June 6, 2016 - Hacker Dojo 
1428 | Google inception paper - makes heavy use of 1x1 convolution layers 
1429 | http://arxiv.org/pdf/1409.4842v1.pdf 
1430 | 
1431 | ## June 2, May 26, 2016 - Galvanize 
1432 | 
1433 | Image segmentation with deep encoder-decoder 
1434 | 
1435 | https://arxiv.org/pdf/1511.00561.pdf 
1436 | 
1437 | ## May 23, 2016 - Hacker Dojo 
1438 | 
1439 | Compressed networks, reducing flops by pruning 
1440 | 
1441 | https://arxiv.org/pdf/1510.00149.pdf 
1442 | 
1443 | http://arxiv.org/pdf/1602.07360v3.pdf 
1444 | 
1445 | ## May 16, 2016 
1446 | 
1447 | Word2Vec meets LDA: 
1448 | 
1449 | http://arxiv.org/pdf/1605.02019v1.pdf - Paper 
1450 | 
1451 | https://twitter.com/chrisemoody - Chris Moody's twitter with links to slides etc. 
1452 | 
1453 | http://qpleple.com/topic-coherence-to-evaluate-topic-models/ - writeup on topic coherence 
1454 | 
1455 | 
1456 | ## May 9, 2016 
1457 | 
1458 | https://arxiv.org/pdf/1603.05027v2.pdf - Update on microsoft resnet - identity mapping 
1459 | 
1460 | http://gitxiv.com/posts/MwSDm6A4wPG7TcuPZ/recurrent-batch-normalization - batch normalization with 
RNN 
1461 | 
1462 | 
1463 | ## May 2, 2016 
1464 | 
1465 | Go-playing deep RL agent - AlphaGo 
1466 | 
1467 | https://gogameguru.com/i/2016/03/deepmind-mastering-go.pdf 
1468 | 
1469 | https://m.youtube.com/watch?sns=em&v=pgX4JSv4J70 - video of slide presentation on paper 
1470 | 
1471 | https://en.m.wikipedia.org/wiki/List_of_Go_games#Lee.27s_Broken_Ladder_Game - Handling "ladders" in AlphaGo 
1472 | 
1473 | https://en.m.wikipedia.org/wiki/Ladder_(Go) - ladders in go 
1474 | 
1475 | _____________________________________________________________________________________________________________________ 
1476 | ## April 25, 2016 - Microsoft Resnet 
1477 | The Paper 
1478 | 
1479 | http://arxiv.org/pdf/1512.03385v1.pdf 
1480 | 
1481 | References: 
1482 | 
1483 | http://arxiv.org/pdf/1603.05027v2.pdf - Identity mapping paper 
1484 | 
1485 | Code: 
1486 | 
1487 | https://keunwoochoi.wordpress.com/2016/03/09/residual-networks-implementation-on-keras/ - keras code 
1488 | 
1489 | https://github.com/ry/tensorflow-resnet/blob/master/resnet.py - tensorflow code 
1490 | 
1491 | https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/skflow/resnet.py 
1492 | _________________________________________________________________________________________________________________ 
1493 | ## April 18, 2016 - Batch Normalization 
1494 | The Paper 
1495 | https://arxiv.org/abs/1502.03167 
1496 | http://gitxiv.com/posts/MwSDm6A4wPG7TcuPZ/recurrent-batch-normalization - Batch Normalization for RNN 
1497 | 
1498 | 
1499 | ___________________________________________________________________________________________________________ 
1500 | ## April 11, 2016 - Atari Game Playing DQN 
1501 | The Paper 
1502 | https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 
1503 | 
1504 | Related references: 
1505 | 
1506 | This adds 'soft' and 'hard' attention and the 4 frames are replaced with an LSTM layer: 
1507 | 
1508 | http://gitxiv.com/posts/NDepNSCBJtngkbAW6/deep-attention-recurrent-q-network 
1509 | 
1510 | http://home.uchicago.edu/~arij/journalclub/papers/2015_Mnih_et_al.pdf - Nature Paper 
1511 | 
1512 | http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html - videos at the bottom of the page 
1513 | 
1514 | http://llcao.net/cu-deeplearning15/presentation/DeepMindNature-preso-w-David-Silver-RL.pdf - David Silver's slides 
1515 | 
1516 | http://www.cogsci.ucsd.edu/~ajyu/Teaching/Cogs118A_wi09/Class0226/dayan_watkins.pdf 
1517 | 
1518 | http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html - David Silver 
1519 | 
1520 | Implementation Examples: 
1521 | 
1522 | http://stackoverflow.com/questions/35394446/why-doesnt-my-deep-q-network-master-a-simple-gridworld-tensorflow-how-to-ev?rq=1 
1523 | 
1524 | http://www.danielslater.net/2016/03/deep-q-learning-pong-with-tensorflow.html 
1525 | 
1526 | __________________________________________________________________________________________________________ 
1527 | ## March 3, 2016 - Gated Feedback RNN 
1528 | The Paper 
1529 | 
1530 | "Gated RNN" (http://arxiv.org/pdf/1502.02367v4.pdf) 
1531 | 
1532 | -Background Material 
1533 | 
1534 | http://arxiv.org/pdf/1506.00019v4.pdf - Lipton's excellent review of RNN 
1535 | http://www.nehalemlabs.net/prototype/blog/2013/10/10/implementing-a-recurrent-neural-network-in-python/ - Discussion of RNN and theano code for Elman network - Tiago Ramalho 
1536 | http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf - original LSTM paper - Hochreiter and Schmidhuber 
1537 | https://www.youtube.com/watch?v=izGl1YSH_JA - Hinton video on LSTM 
1538 | 
1539 | -Skylar Payne's GF RNN code 
1540 |
https://github.com/skylarbpayne/hdDeepLearningStudy/tree/master/tensorflow 
1541 | 
1542 | -Slides 
1543 | https://docs.google.com/presentation/d/1d2keyJxRlDcD1LTl_zjS3i45xDIh2-QvPWU3Te29TuM/edit?usp=sharing 
1544 | https://github.com/eadsjr/GFRNNs-nest/tree/master/diagrams/diagrams_formula 
1545 | 
1546 | ## Reviews 
1547 | http://www.computervisionblog.com/2016/06/deep-learning-trends-iclr-2016.html 
1548 | https://indico.io/blog/iclr-2016-takeaways/ 
1549 | 
-------------------------------------------------------------------------------- 
/ZeeshanZiaSlides-DeepSupervision3DObjectParsing/DLStudyGroup.pdf: 
-------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mike-bowles/hdDeepLearningStudy/7613fa865640022815c22de5854336b344bf998d/ZeeshanZiaSlides-DeepSupervision3DObjectParsing/DLStudyGroup.pdf 
-------------------------------------------------------------------------------- 
/awspot/README.md: 
-------------------------------------------------------------------------------- 
1 | # Overview 
2 | 
3 | This document describes how to use spot instances on AWS. 
4 | Spot instances deliver savings of almost 80% off the on-demand rate. 
5 | However they can be interrupted, potentially losing data. 
6 | 
7 | 
8 | 
9 | The key to utilizing spot instances is automation, especially around interruption. 
10 | A simple workaround is to use an EBS drive that automounts plus a user data script that fires off your job; this allows you to 
11 | take advantage of cheaper spot instances and train your model for weeks at a time. 
12 | 
13 | Your spot bid price determines how often your instance gets interrupted. 
14 | Set it low and it will get interrupted more often, but you have a firmer handle on price certainty. 
15 | Note: the price you pay is the lower of your bid and the current spot price. 
16 | Setting a bid at the on-demand rate would virtually guarantee never getting interrupted. 
17 | 
18 | Finally, an S3 bucket is recommended for syncing your results. 
19 | 
20 | 
21 | # Solution 
22 | 
23 | There is some wiring required, which is described here. I recommend building a stack that you can re-use 
24 | for your training jobs. 
25 | 
26 | The stack is shown below. This document describes how to build parts of this stack. 
27 | ![CUDA DOCKER AWS](https://www.lucidchart.com/publicSegments/view/b36d7113-4e9a-471c-8cf0-7facf6e17640/image.png) 
28 | 
29 | 
30 | The current version of this is bare bones. Further contributions are welcome. 
31 | 
32 | 
33 | # Prerequisite: 
34 | 
35 | 1) An AMI with Cuda8/Docker/Nvidia-Docker installed 
36 | I've made public the following AMI in the Oregon Region (us-west-2) 
37 | 
38 | ``` 
39 | ami-f266d292 
40 | ``` 
41 | 
42 | 2) Create a volume with a formatted drive (ext4/xfs) in the region and snapshot it. Record its volume id 
43 | 
44 | 
45 | 3) Docker (optional)... 
46 | 
47 | * If you want to use docker, the AMI is ready to go with Nvidia Docker 
48 | https://github.com/NVIDIA/nvidia-docker 
49 | 
50 | * I'd suggest using DockerHub to store containers (it's free unless your code is private) 
51 | Docker Hub 
52 | 
53 | * A suggested Docker container from Waleed that has tensorflow + opencv is here. Note: 
54 | start it with nvidia-docker instead of docker if you want GPU support 
55 | 
56 | 
57 | 
58 | 
59 | 4) Set up an S3 bucket. (optional) 
60 | Nothing special, just to push back models. 
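If you don't need parallel uploads, the stock AWS CLI is often enough. A minimal sketch, assuming the instance role (or configured keys) can write to the bucket; the bucket name here is hypothetical:

```
# Push checkpoints/results from the mounted EBS volume to S3 (bucket name is hypothetical).
aws s3 sync /mnt/data s3://my-training-bucket/results
```

Running this on a timer, or right after each checkpoint dump, keeps results safe across interruptions.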
61 | 
62 | For example use s3_parallel to sync your data 
63 | ``` 
64 | https://github.com/mishudark/s3-parallel-put 
65 | ``` 
66 | 
67 | 
68 | # Kick the Tires 
69 | 
70 | 0) Launch the AMI and login 
71 | 
72 | 1) Run nvidia-smi to check on running processes 
73 | 
74 | ``` 
75 | nvidia-smi 
76 | 
77 | Tue Dec 20 01:25:43 2016 
78 | +-----------------------------------------------------------------------------+ 
79 | | NVIDIA-SMI 367.57 Driver Version: 367.57 | 
80 | |-------------------------------+----------------------+----------------------+ 
81 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 
82 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 
83 | |===============================+======================+======================| 
84 | | 0 Tesla K80 Off | 0000:00:1E.0 Off | 0 | 
85 | | N/A 61C P8 30W / 149W | 0MiB / 11439MiB | 0% Default | 
86 | +-------------------------------+----------------------+----------------------+ 
87 | 
88 | ``` 
89 | 
90 | 
91 | 2) Start docker and check flags are passed through 
92 | 
93 | ``` 
94 | sudo nvidia-docker run -v /mnt/data:/mnt/data -it nvidia/cuda bash -l 
95 | nvcc -V 
96 | ``` 
97 | 
98 | 
99 | # Run a Spot Instance with a persistent request 
100 | 
101 | 0) View spot pricing and region 
102 | 
103 | 1) Generate a user data script (see the user_data.sh template) 
104 | 
105 | 2) Click on AMI -> Spot Request 
106 | 
107 | ``` 
108 | i) GPU types -> pick a GPU instance (ex. p2.xlarge) 
109 | ii) Configure Instance Details -> Spot, 
110 | BidPrice: x.yy 
111 | Persistent Request 
112 | Network: default 
113 | IAMRole: admin 
114 | Launch EBS Optimized 
115 | Under Advanced Details -> pick the user_data.sh you've custom modified 
116 | iii) Add Storage -> Accept Defaults 
117 | iv) Tags -> Your user name and task name are useful 
118 | v) Select "Review And Launch". 
119 | ``` 
120 | 
121 | 3) Login and view logs 
122 | 
123 | * the AWS startup log 
124 | /var/log/cloud-init-output.log 
125 | 
126 | * Docker running 
127 | ``` 
128 | sudo docker ps 
129 | sudo docker logs 
130 | ``` 
131 | 
132 | * The /mnt/data folder where you should be dumping results. 
133 | ``` 
134 | df -h 
135 | find /mnt/data/ 
136 | ``` 
137 | 
138 | 
139 | # FAQ 
140 | 
141 | 1) For tensorflow, or any long running job, how do I not lose my training on interruption? 
142 | 
143 | Use the saver object as described here: 
144 | https://www.tensorflow.org/how_tos/variables/ 
145 | 
146 | Set up your scripts to routinely dump checkpoints tagged with a step id (use UTC time), every 30 minutes or so. 
147 | Then on restart the latest checkpoint is picked up. 
148 | 
149 | Note, you will lose some training time, but assuming the instance stays up for 8 hours, losing at most 30 minutes is acceptable. 
150 | 
151 | For more durability, upload to S3 in case the EBS volume fails (rare, but you could lose all your data) 
-------------------------------------------------------------------------------- 
/awspot/user_data.sh: 
-------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | # This is submitted via the ec2 configure instance page, additional options. Note: the output on the instance goes to /var/log/cloud-init-output.log 
3 | # Fill in the following after you've cloned from our dummy snapshot. 
4 | # This will run as the root account 
5 | # Designed to run with p2.xlarge (Nvidia K80, 12 GB card, good for running imagenet, inception, etc.) 
6 | 
7 | 
8 | # 1. Configure all these for your specific case. Docker is optional. Only us-west-2 and us-east-1 have p2 instances. 
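# For reference, a hypothetical filled-in example (none of these values are real; substitute your own):
#   TASK_PATH=/mnt/data/scripts/train.sh
#   DOCKER_IMAGE=mydockerhubuser/tf-gpu:latest
#   VOLUME_ID=vol-0123456789abcdef0
#   REGION=us-west-2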
9 | # Your image should be based on nvidia/cuda, which is installed 
10 | TASK_PATH= 
11 | DOCKER_IMAGE= 
12 | VOLUME_ID= 
13 | AWS_ACCESS_KEY_ID= 
14 | AWS_SECRET_ACCESS_KEY= 
15 | REGION=us-west-2 
16 | 
17 | # 2. This will attach the volume and resize it 
18 | INSTANCE_ID=`ec2metadata --instance-id` 
19 | aws --region=${REGION} ec2 attach-volume --instance-id ${INSTANCE_ID} --volume-id ${VOLUME_ID} --device /dev/sdb 
20 | aws --region=${REGION} ec2 describe-volumes --volume-ids ${VOLUME_ID} 
21 | aws --region=${REGION} ec2 describe-volumes --volume-ids ${VOLUME_ID} | grep '"State": "attached"' 
22 | until aws --region=${REGION} ec2 describe-volumes --volume-ids ${VOLUME_ID} | grep '"State": "attached"'; do 
23 | sleep 1 
24 | done 
25 | sudo resize2fs /dev/xvdb 
26 | 
27 | # 3. mount the volume 
28 | mkdir -p /mnt/data && chown -R ubuntu:ubuntu /mnt/data 
29 | mount /dev/xvdb /mnt/data 
30 | mkdir -p /var/log/mylogs 
31 | 
32 | 
33 | # 4. Now run the task; it could be on the AMI, or you could download a package or git repo or anything you want 
34 | # Here we use docker. 
35 | # Pull the docker image. Mount the EBS drive into docker. Start the task on nvidia docker. 
36 | docker pull ${DOCKER_IMAGE} 
37 | nvidia-docker run -v /mnt/data:/mnt/data -i ${DOCKER_IMAGE} bash -c "${TASK_PATH}" > /var/log/mylogs/docker-fractal.log 2>&1 
38 | 
-------------------------------------------------------------------------------- 
/gatedRNN.ipynb: 
-------------------------------------------------------------------------------- 
1 | { 
2 | "cells": [ 
3 | { 
4 | "cell_type": "markdown", 
5 | "metadata": {}, 
6 | "source": [ 
7 | "# Gated Feedback RNN - Notes\n", 
8 | "\n", 
9 | "Paper's notation\n", 
10 | "input is $x_t$\n", 
11 | "\n", 
12 | "\n", 
13 | "\n", 
14 | "\n", 
15 | "Zaremba (2014) variant of LSTM\n", 
16 | "$$\\tilde{c_t} = tanh(W_c x_t + U_c h_{t-1})\\,-\\,new\\,content$$\n", 
17 | "$$c_t = f_tc_{t-1} + i_t \\tilde{c_t}\\,-\\, content\\,of\\,memory\\,cell$$\n", 
18 | "$$i_t = \\sigma(W_ix_t+U_ih_{t-1})\\,-\\,input\\,gate$$\n", 
19 | "$$f_t = \\sigma(W_fx_t+U_fh_{t-1})\\,-\\,forget\\,gate$$\n", 
20 | "$$h_t = o_ttanh(c_t)\\,-\\,hidden \\, state$$\n", 
21 | "$$o_t = \\sigma(W_ox_t + U_oh_{t-1})\\,-\\,output\\,gate$$\n", 
22 | "\n", 
23 | "Modification for Gated Feedback LSTM\n", 
24 | "$$\\tilde{c_t^j} = tanh(W_c^{j-1\\rightarrow j}h_t^{j-1} + \\sum_{i=1}^L g^{i\\rightarrow j}U_c^{i\\rightarrow j}h_{t-1}^i)$$\n", 
25 | "\n", 
26 | "\n", 
27 | "Gated Recurrent Unit, Cho (2014)\n", 
28 | "$$h_t = (1-z_t)h_{t-1}+z_t \\tilde{h_t}\\,-\\,GRU\\, state$$\n", 
29 | "$$z_t = \\sigma(W_zx_t+U_zh_{t-1})\\,-\\, update\\,gate$$\n", 
30 | "$$\\tilde{h_t} = tanh(Wx_t + r_tUh_{t-1})\\,-\\,new\\,memory\\,content$$\n", 
31 | "$$r_t = \\sigma(W_rx_t + U_rh_{t-1})\\,-\\,reset\\,gate$$\n", 
32 | "\n", 
33 | "Modification for Gated Feedback GRU\n", 
34 | "$$\\tilde{h_t^j} = tanh(W^{j-1\\rightarrow j}h_t^{j-1} + r_t^j \\odot \\sum_{i=1}^L g^{i\\rightarrow j}U^{i\\rightarrow j}h_{t-1}^i)$$\n", 
35 | "\n", 
36 | "\n", 
37 | "\n", 
38 | "\n", 
39 | "## Elman net\n", 
40 | "\n", 
41 | "\n", 
42 | "$$h_t = tanh(W_hx_t+U_hh_{t-1})\\,-\\,hidden \\, state$$\n", 
43 | "\n", 
44 | "2-Layer Elman-RNN\n", 
45 | "$$h_t^1 = tanh(W_h^1x_t+U_h^1h_{t-1}^1)\\,-\\,hidden \\, state\\,1$$\n", 
46 | "$$h_t^2 = tanh(W_h^2h_t^1+U_h^2h_{t-1}^2)\\,-\\,hidden \\, state\\,2$$\n", 
47 | "\n", 
48 | "2-Layer gated feedback RNN - GF-RNN = 2-layer Elman-RNN plus\n", 
49 | "$$g^{1\\rightarrow2} = \\sigma(W_g^{1\\rightarrow2}h_t^1 + U_g^{1\\rightarrow2}\\left[\\begin{array}{c}\n", 
50 | "h_{t-1}^1\\\\\n", 
51 | "h_{t-1}^2\\end{array}\\right])$$\n", 
52 |
"$$g^{2\\rightarrow1} = \\sigma(W_g^{2\\rightarrow1}x_t + U_g^{2\\rightarrow1}\\left[\\begin{array}{c}\n", 53 | "h_{t-1}^1\\\\\n", 54 | "h_{t-1}^2\\end{array}\\right])$$\n", 55 | "\n", 56 | "Assuming that\n", 57 | "$$h_{t-1}^*=\\left[\\begin{array}{c}\n", 58 | "h_{t-1}^1\\\\\n", 59 | "h_{t-1}^2\\end{array}\\right]$$\n", 60 | "\n", 61 | "Layer-by-layer state - \n", 62 | "$$h_t^j = tanh(W^{j-1\\rightarrow j}h_t^{j-1} + \\sum_{i=1}^L g^{i\\rightarrow j}U^{i\\rightarrow j}h_{t-1}^i)$$\n", 63 | "\n", 64 | "For 2-layer gated rf Elman - \n", 65 | "$$h_t^1 = tanh(W^{0\\rightarrow 1}x_t + g^{1\\rightarrow 1}U^{1\\rightarrow 1}h_{t-1}^1 + g^{2\\rightarrow 1}U^{2\\rightarrow 1}h_{t-1}^2)$$\n", 66 | "$$h_t^2 = tanh(W^{1\\rightarrow 2}h_t^1 + g^{1\\rightarrow 2}U^{1\\rightarrow 2}h_{t-1}^1 + g^{2\\rightarrow 2}U^{2\\rightarrow 2}h_{t-1}^2)$$\n", 67 | "\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.11" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /tensorflow/Gated Feedback Recurrent Neural Network - GF LSTM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright 2016 Google Inc. All Rights Reserved.\n", 8 | "\n", 9 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n", 10 | "you may not use this file except in compliance with the License.\n", 11 | "You may obtain a copy of the License at\n", 12 | "\n", 13 | " http://www.apache.org/licenses/LICENSE-2.0\n", 14 | "\n", 15 | "Unless required by applicable law or agreed to in writing, software\n", 16 | "distributed under the License is distributed on an \"AS IS\" BASIS,\n", 17 | "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 18 | "See the License for the specific language governing permissions and\n", 19 | "limitations under the License." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Gated Feedback Recurrent Neural Network\n", 27 | "\n", 28 | "This notebook contains a TensorFlow (http://www.tensorflow.org) implementation of the Gated Feedback Recurrent Neural Network (the LSTM variant) from this paper: http://arxiv.org/pdf/1502.02367v4.pdf" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "import tensorflow as tf\n", 40 | "from tensorflow.models.rnn.ptb import reader\n", 41 | "import numpy as np\n", 42 | "\n", 43 | "train_data, valid_data, test_data, vocab = reader.ptb_raw_data('simple-examples/data/')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# Hyperparameters\n", 55 | "batch_size = 20\n", 56 | "num_steps = 20\n", 57 | "hidden_size = 200\n", 58 | "emb_size = 200 # Note: this is kind of a cheat. The layer-input weights W* are reused at every layer, so this will *not* work if emb_size != hidden_size\n", 59 | "vocab_size = 10000\n", 60 | "epochs = 2\n", 61 | "init_scale = 0.1\n", 62 | "num_hidden_layers = 1\n", 63 | "\n", 64 | "lr = tf.placeholder(tf.float32, [])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "## Build Model\n", 76 | "session = tf.Session()\n", 77 | "\n", 78 | "X = tf.placeholder(tf.int32, [batch_size, num_steps])\n", 79 | "targets = tf.placeholder(tf.int64, [batch_size, num_steps])\n", 80 | "\n", 81 | "embedding = tf.Variable(\n", 82 | "    tf.random_uniform([vocab_size, emb_size], minval=-init_scale, maxval=init_scale),\n", 83 | "    name=\"embedding\")\n", 84 | "\n", 85 | "# For input gate.\n", 86 | "Wi = [tf.Variable(\n", 87 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 88 | "    name=\"Wi_%d\" % i) for i in range(num_hidden_layers)]\n", 89 | "Ui = [tf.Variable(\n", 90 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 91 | "    name=\"Ui_%d\" % i) for i in range(num_hidden_layers)]\n", 92 | "\n", 93 | "# For forget gate.\n", 94 | "Wf = [tf.Variable(\n", 95 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 96 | "    name=\"Wf_%d\" % i) for i in range(num_hidden_layers)]\n", 97 | "Uf = [tf.Variable(\n", 98 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 99 | "    name=\"Uf_%d\" % i) for i in range(num_hidden_layers)]\n", 100 | "\n", 101 | "# For content -- Quick note: there's no transformation from content -> state. 
They are both\n", 102 | "# the same size.\n", 103 | "Wc = [tf.Variable(\n", 104 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 105 | "    name=\"Wc_%d\" % i) for i in range(num_hidden_layers)]\n", 106 | "Uc = [tf.Variable(\n", 107 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 108 | "    name=\"Uc_%d\" % i) for i in range(num_hidden_layers)]\n", 109 | "\n", 110 | "# For hidden state output gate.\n", 111 | "Wo = [tf.Variable(\n", 112 | "    tf.random_uniform([emb_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 113 | "    name=\"Wo_%d\" % i) for i in range(num_hidden_layers)]\n", 114 | "Uo = [tf.Variable(\n", 115 | "    tf.random_uniform([hidden_size, hidden_size], minval=-init_scale, maxval=init_scale),\n", 116 | "    name=\"Uo_%d\" % i) for i in range(num_hidden_layers)]\n", 117 | "\n", 118 | "# For gated feedback gates (i.e. the contribution of the paper). Each g^{i->j} is a scalar, with parameters indexed by source layer i and shared across target layers.\n", 119 | "Wg = [tf.Variable(\n", 120 | "    tf.random_uniform([emb_size, 1], minval=-init_scale, maxval=init_scale),\n", 121 | "    name=\"Wg_%d\" % i) for i in range(num_hidden_layers)]\n", 122 | "Ug = [tf.Variable(\n", 123 | "    tf.random_uniform([hidden_size * num_hidden_layers, 1], minval=-init_scale, maxval=init_scale),\n", 124 | "    name=\"Ug_%d\" % i) for i in range(num_hidden_layers)]\n", 125 | "\n", 126 | "# For output.\n", 127 | "output_weights = tf.Variable(\n", 128 | "    tf.random_uniform([hidden_size, vocab_size], minval=-init_scale, maxval=init_scale),\n", 129 | "    name=\"output_weights\")\n", 130 | "output_bias = tf.Variable(tf.zeros([vocab_size]), name=\"output_bias\")\n", 131 | "\n", 132 | "X_in = tf.nn.embedding_lookup(embedding, X)\n", 133 | "\n", 134 | "initial_state = tf.zeros([batch_size, hidden_size])\n", 135 | "content = [initial_state] * num_hidden_layers  # one memory cell per layer\n", 136 | "state = [initial_state] * num_hidden_layers\n", 137 | "prev_concat_h = tf.zeros([batch_size, hidden_size * num_hidden_layers])\n", 138 | "loss = tf.zeros([])\n", 139 | "# prev_concat_h holds h_{t-1}^*: the previous hidden states of all layers, concatenated.\n", 140 | "for time_step in range(num_steps):\n", 141 | "    h_prev = X_in[:, time_step, :]\n", 142 | "    prev_state = list(state)  # snapshot h_{t-1}^i before any layer is updated\n", 143 | "    for layer in range(num_hidden_layers):\n", 144 | "        input_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wi[layer]) + tf.matmul(prev_state[layer], Ui[layer]))\n", 145 | "        forget_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wf[layer]) + tf.matmul(prev_state[layer], Uf[layer]))\n", 146 | "        output_gate = tf.nn.sigmoid(tf.matmul(h_prev, Wo[layer]) + tf.matmul(prev_state[layer], Uo[layer]))\n", 147 | "        # Main contribution of paper: global reset gates g^{i->j}, recomputed for each layer j because h_prev (= h_t^{j-1}) changes.\n", 148 | "        gates = [tf.sigmoid(tf.matmul(h_prev, Wg[i]) + tf.matmul(prev_concat_h, Ug[i])) for i in range(num_hidden_layers)]\n", 149 | "        gated_prev_timestep = [gates[i] * tf.matmul(prev_state[i], Uc[i]) for i in range(num_hidden_layers)]\n", 150 | "        new_content = tf.nn.tanh(tf.matmul(h_prev, Wc[layer]) + tf.add_n(gated_prev_timestep))\n", 151 | "        \n", 152 | "        content[layer] = tf.mul(forget_gate, content[layer]) + tf.mul(input_gate, new_content)\n", 153 | "        state[layer] = tf.mul(output_gate, tf.nn.tanh(content[layer]))\n", 154 | "        h_prev = state[layer]  # this layer's output is the next layer's input\n", 155 | "    logits = tf.nn.bias_add(tf.matmul(state[num_hidden_layers-1], output_weights), output_bias)\n", 156 | "    step_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets[:, time_step])\n", 157 | "    loss += tf.reduce_sum(step_loss)\n", 158 | "    prev_concat_h = tf.concat(1, state)\n", 159 | "\n", 160 | "final_state = state\n", 161 | "cost = loss / batch_size\n", 162 | "\n", 163 | "tf.scalar_summary(\"cost\", cost)\n", 164 | "merged = tf.merge_all_summaries()\n", 165 | "writer = 
tf.train.SummaryWriter(\"summaries/gfrnn\", session.graph_def)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "Epoch 0\n", 180 | "1000 1536.86554276\n", 181 | "2000 1075.77593235\n", 182 | "3000 845.104395239\n", 183 | "4000 718.76732411\n", 184 | "5000 641.0083012\n", 185 | "6000 593.578288039\n", 186 | "7000 552.39057359\n", 187 | "8000 518.773597082\n", 188 | "9000 490.011530283\n", 189 | "10000 469.79929651\n", 190 | "11000 445.811142429\n", 191 | "12000 427.981149944\n", 192 | "13000 413.231142047\n", 193 | "14000 399.589796646\n", 194 | "15000 387.374589524\n", 195 | "16000 375.535316084\n", 196 | "17000 364.656415065\n", 197 | "18000 357.716949046\n", 198 | "19000 350.007875462\n", 199 | "20000 340.5189465\n", 200 | "21000 334.729726814\n", 201 | "22000 328.91687226\n", 202 | "23000 323.211649227\n", 203 | "24000 315.498545393\n", 204 | "25000 309.774060383\n", 205 | "26000 303.523051557\n", 206 | "27000 297.389077065\n", 207 | "28000 292.475912875\n", 208 | "29000 287.393987197\n", 209 | "30000 283.363092117\n", 210 | "31000 278.863735412\n", 211 | "32000 275.836836503\n", 212 | "33000 272.480957052\n", 213 | "34000 269.913912177\n", 214 | "35000 266.354806615\n", 215 | "36000 263.881540787\n", 216 | "37000 260.255694866\n", 217 | "38000 255.84195618\n", 218 | "39000 253.142906497\n", 219 | "40000 250.839347938\n", 220 | "41000 247.700747125\n", 221 | "42000 244.609508041\n", 222 | "43000 241.109089536\n", 223 | "44000 238.690587697\n", 224 | "45000 236.178669209\n", 225 | "46000 234.919347477\n", 226 | "Epoch 1\n", 227 | "47000 233.529050243\n", 228 | "48000 231.346767272\n", 229 | "49000 228.853817309\n", 230 | "50000 225.70174362\n", 231 | "51000 223.120431433\n", 232 | "52000 221.306655681\n", 233 | "53000 219.796232818\n", 234 | "54000 218.247005976\n", 235 | "55000 216.268173971\n", 236 | "56000 214.857019061\n", 237 | "57000 212.859998202\n", 238 | "58000 210.188553206\n", 239 | "59000 209.079580721\n", 240 | "60000 207.045767747\n", 241 | "61000 205.196654393\n", 242 | "62000 203.450599296\n", 243 | "63000 201.735644431\n", 244 | "64000 200.189708323\n", 245 | "65000 198.788259576\n", 246 | "66000 197.335555278\n", 247 | "67000 195.543393851\n", 248 | "68000 194.538999587\n", 249 | "69000 193.175939226\n", 250 | "70000 191.430713621\n", 251 | "71000 189.920810466\n", 252 | "72000 188.439249937\n", 253 | "73000 186.775430173\n", 254 | "74000 185.052030742\n", 255 | "75000 183.768886451\n", 256 | "76000 182.437888552\n", 257 | "77000 181.030238461\n", 258 | "78000 179.976427611\n", 259 | "79000 178.762464736\n", 260 | "80000 177.910351761\n", 261 | "81000 176.838355055\n", 262 | "82000 175.814496864\n", 263 | "83000 174.593081319\n", 264 | "84000 173.129086523\n", 265 | "85000 171.620987287\n", 266 | "86000 170.677194178\n", 267 | "87000 169.506480399\n", 268 | "88000 168.378289044\n", 269 | "89000 166.883673974\n", 270 | "90000 165.782215237\n", 271 | "91000 164.632383932\n", 272 | "92000 163.851626382\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "# Train Model\n", 278 | "session.run(tf.initialize_all_variables())\n", 279 | "sgd = tf.train.GradientDescentOptimizer(lr).minimize(cost)\n", 280 | "costs = 0.0\n", 281 | "iters = 0\n", 282 | "for i in range(epochs):\n", 283 | " print 'Epoch', i\n", 284 | " for step, (x, y) in enumerate(reader.ptb_iterator(train_data, batch_size, 
num_steps)):\n", 285 | "        result, step_cost, _ = session.run([merged, cost, sgd],\n", 286 | "                                            {X: x, targets: y, lr: 1.0 / (i + 1)})\n", 287 | "        costs += step_cost\n", 288 | "        iters += num_steps\n", 289 | "        if iters % 1000 == 0:\n", 290 | "            print iters, np.exp(costs / iters)\n", 291 | "            writer.add_summary(result, iters)\n", 292 | "            writer.flush()" 293 | ] 294 | } 295 | ], 296 | "metadata": { 297 | "kernelspec": { 298 | "display_name": "Python 2", 299 | "language": "python", 300 | "name": "python2" 301 | }, 302 | "language_info": { 303 | "codemirror_mode": { 304 | "name": "ipython", 305 | "version": 2 306 | }, 307 | "file_extension": ".py", 308 | "mimetype": "text/x-python", 309 | "name": "python", 310 | "nbconvert_exporter": "python", 311 | "pygments_lexer": "ipython2", 312 | "version": "2.7.10" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 0 317 | } 318 | --------------------------------------------------------------------------------
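
The notes in gatedRNN.ipynb end with an empty code cell. For concreteness, here is a minimal Python 3 / NumPy sketch of one time step of the 2-layer gated-feedback Elman net defined by the equations in that notebook. Everything in it (the name gf_elman_step, the parameter dict p with keys W01, W12, U, Wg, Ug, and the toy sizes) is invented for this illustration and is not part of the repository.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gf_elman_step(x_t, h1_prev, h2_prev, p):
    # h_{t-1}^* : the previous hidden states of both layers, concatenated.
    h_star = np.concatenate([h1_prev, h2_prev])

    # Global reset gates targeting layer 1 are driven by the input x_t
    # (the "layer below" layer 1), as in the g^{2->1} equation in the notes.
    g11 = sigmoid(p['Wg'][(1, 1)] @ x_t + p['Ug'][(1, 1)] @ h_star)
    g21 = sigmoid(p['Wg'][(2, 1)] @ x_t + p['Ug'][(2, 1)] @ h_star)

    # h_t^1 = tanh(W^{0->1} x_t + g^{1->1} U^{1->1} h_{t-1}^1 + g^{2->1} U^{2->1} h_{t-1}^2)
    h1 = np.tanh(p['W01'] @ x_t
                 + g11 * (p['U'][(1, 1)] @ h1_prev)
                 + g21 * (p['U'][(2, 1)] @ h2_prev))

    # Gates targeting layer 2 are driven by the fresh h_t^1, as in g^{1->2}.
    g12 = sigmoid(p['Wg'][(1, 2)] @ h1 + p['Ug'][(1, 2)] @ h_star)
    g22 = sigmoid(p['Wg'][(2, 2)] @ h1 + p['Ug'][(2, 2)] @ h_star)

    # h_t^2 = tanh(W^{1->2} h_t^1 + g^{1->2} U^{1->2} h_{t-1}^1 + g^{2->2} U^{2->2} h_{t-1}^2)
    h2 = np.tanh(p['W12'] @ h1
                 + g12 * (p['U'][(1, 2)] @ h1_prev)
                 + g22 * (p['U'][(2, 2)] @ h2_prev))
    return h1, h2

# Toy usage: random weights, zero initial state, a 5-step input sequence.
rng = np.random.RandomState(0)
n_in, n_h = 4, 8
p = {
    'W01': rng.normal(scale=0.1, size=(n_h, n_in)),
    'W12': rng.normal(scale=0.1, size=(n_h, n_h)),
    'U': {(i, j): rng.normal(scale=0.1, size=(n_h, n_h)) for i in (1, 2) for j in (1, 2)},
    # Gate weights are vectors, so each g^{i->j} is a scalar per example,
    # matching the [*, 1]-shaped Wg/Ug variables in the TensorFlow notebook.
    'Wg': {(1, 1): rng.normal(scale=0.1, size=n_in), (2, 1): rng.normal(scale=0.1, size=n_in),
           (1, 2): rng.normal(scale=0.1, size=n_h), (2, 2): rng.normal(scale=0.1, size=n_h)},
    'Ug': {(i, j): rng.normal(scale=0.1, size=2 * n_h) for i in (1, 2) for j in (1, 2)},
}

h1, h2 = np.zeros(n_h), np.zeros(n_h)
for x_t in rng.normal(size=(5, n_in)):
    h1, h2 = gf_elman_step(x_t, h1, h2, p)
print(h2)

Unlike the TensorFlow notebook, which shares the gate parameters across target layers, this sketch keeps a separate (Wg, Ug) pair per (source, target) combination, which is closer to the paper's formulation; with one hidden layer the two choices coincide.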