├── .gitignore ├── README.md ├── fig1.png ├── figE.png ├── figEHE.png ├── figFS.png ├── figH.png ├── gentext.py ├── inputeasy.txt ├── inputhard.txt ├── makefigE.py ├── makefigEHE.py ├── makefigFS.py ├── makefigH.py ├── makefigmulti.py ├── min-char-rnn-param.py ├── paper ├── IEEEtran.cls ├── bare_jrnl.tex ├── biblio.bib ├── figE.png ├── figEHE.png ├── figFS.png ├── figH.png ├── nips_2016.aux ├── nips_2016.dvi ├── nips_2016.log ├── nips_2016.out ├── nips_2016.pdf ├── nips_2016.sty ├── nips_2016.tex ├── paper-blx.bib ├── paper.aux ├── paper.bbl ├── paper.blg ├── paper.log ├── paper.out ├── paper.pdf ├── paper.run.xml ├── paper.tex ├── paper.tex.nips └── smallbiblio.bib ├── rnn.py ├── rnn.py.prev ├── rnnAltern.py └── runexp.py /.gitignore: -------------------------------------------------------------------------------- 1 | trial*/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the source code for the arXiv preprint ["Neural networks with differentiable structure"](https://arxiv.org/abs/1606.06216). 2 | 3 | This code implements recurrent neural networks with differentiable structure: the number of neurons in the network undergoes gradient descent, just like the weights of the 4 | network. The network adjusts its number of neurons to the complexity of the task at hand. 5 | 6 | This code is based on Andrej Karpathy's [`min-char-rnn.py`](https://gist.github.com/karpathy/d4dee566867f8291f086) program. 7 | 8 | `rnn.py` is the main program. You can run it "as is" (`python rnn.py`) to train 9 | the model on the "hard" problem for 100,000 cycles. It will generate an output 10 | file called `output.txt`, updated every 1000 cycles, which logs the current 11 | cycle number, position in the input file, loss, number of neurons, and total absolute sum of multipliers (see the code for details). 12 | 13 | The other 14 | Python files in the repository generate input data, produce figures, or submit jobs to a cluster.
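
For a quick look at a single run, the log can be plotted directly. The sketch below is not part of the repository; it simply reads `output.txt` using the column order described above (cycle, position in input file, loss, number of neurons, sum of multipliers), which is also the layout the `makefig*.py` scripts rely on (loss in column 2, neuron count in column 3). It assumes `numpy` and `matplotlib` are installed and that `output.txt` is in the current directory.

```python
# Minimal sketch: plot loss and neuron count from a single run's output.txt.
# Column layout (per the README and the makefig*.py scripts):
#   0: cycle, 1: position in input file, 2: loss, 3: number of neurons, 4: sum of multipliers
import numpy as np
import matplotlib.pyplot as plt

t = np.loadtxt("output.txt")              # one row per 1000 cycles
plt.plot(t[:, 0], t[:, 2], label="Loss")
plt.plot(t[:, 0], t[:, 3], label="# Neurons")
plt.xlabel("Cycle")
plt.legend()
plt.tight_layout()
plt.show()
```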
15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/fig1.png -------------------------------------------------------------------------------- /figE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/figE.png -------------------------------------------------------------------------------- /figEHE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/figEHE.png -------------------------------------------------------------------------------- /figFS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/figFS.png -------------------------------------------------------------------------------- /figH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/figH.png -------------------------------------------------------------------------------- /gentext.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sys 3 | n = 0 4 | s1 = ["a", "b", "b", "a", "a", "b"] 5 | with open('inputhard.txt', 'w') as f: 6 | while True: 7 | s2 = s1[::-1] 8 | pos = random.randint(0, len(s2)-1) 9 | if s2[pos] == "a": 10 | s2[pos] = "b" 11 | else: 12 | s2[pos] = "a" 13 | s1 = s2 14 | n += len(s2)+2 15 | #sys.stdout.write("("+"".join(s2)+")") 16 | f.write("("+"".join(s2)+")") 17 | if n > 1200000: 18 | break 19 | with open('inputeasy.txt', 'w') as f: 20 | for n in range (200000): 21 | f.write("(ab") 22 | while (random.random() < .6): 23 | f.write("ab") 24 | f.write(")") 25 | 26 | #for n in range (200000): 27 | # if random.random() < .5: 28 | # sys.stdout.write("aa") 29 | # else: 30 | # sys.stdout.write("bb") 31 | -------------------------------------------------------------------------------- /makefigE.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | font = {#'family' : 'normal', 6 | # 'weight' : 'bold', 7 | 'size' : 9} 8 | plt.rc('font', **font) 9 | 10 | plt.ion() 11 | np.set_printoptions(precision=3, suppress=True) 12 | 13 | #dirz = glob.glob('trial-max10*') 14 | #dirz = glob.glob('trial-new-hardeasy*') 15 | #dirz = glob.glob('trial-easyhardeasy*') 16 | #dirz = glob.glob('trial-fixedsize*') 17 | dirz = glob.glob('trial-ref*-EASY-*') 18 | dirz.sort() 19 | NBPLOTS = len(dirz) 20 | SS = np.ceil(np.sqrt(NBPLOTS)) 21 | 22 | plt.figure(1, figsize=(3, 2), dpi=100, facecolor='w', edgecolor='k') 23 | 24 | nplot = 1 25 | thards= [] 26 | teasys=[] 27 | colorz=['b', 'b', 'b', 'r', 'g'] 28 | labelz = ['', '', 'Loss', '# Neurons'] 29 | for (num, droot) in enumerate(dirz): 30 | t = [] 31 | for v in range(20): 32 | dfull = droot + "/v" + str(v) 33 | #t.append(np.loadtxt(dfull+"/test.txt")[:200,:]) 34 | t.append(np.loadtxt(dfull+"/output.txt")) 35 | t = np.dstack(t) 36 | tmean = np.mean(t, axis=2) 37 | tstd = np.std(t, 
axis=2) 38 | tmedian = np.median(t, axis=2) 39 | tq25 = np.percentile(t, 25, axis=2) 40 | tq75 = np.percentile(t, 75, axis=2) 41 | 42 | ax = plt.subplot(SS, SS, nplot) 43 | ax.set_title('Easy problem') 44 | for vari in [3, 2]: # range(2, tmean.shape[1]): 45 | plt.fill_between(range(tmean.shape[0]), tq25[:, vari], tq75[:, vari], linewidth=0.0, alpha=0.3, facecolor=colorz[vari]) 46 | plt.plot(tmedian[:, vari], color=colorz[vari], label=labelz[vari], linewidth=2) 47 | plt.axis([0, tmean.shape[0], 0, 50]) 48 | 49 | print num, tmean[90, :], tmean[190, :], tmean[-1, :], droot 50 | thards.append(tmean[90,:]) 51 | teasys.append(tmean[-1,:]) 52 | 53 | nplot += 1 54 | 55 | plt.xlabel('Iterations (x1000)') 56 | #plt.ylabel('Loss', color='b') 57 | plt.legend(fontsize=8) 58 | plt.tight_layout() 59 | 60 | print "Data read." 61 | 62 | plt.show() 63 | 64 | plt.savefig('figE.png', bbox_inches='tight') 65 | -------------------------------------------------------------------------------- /makefigEHE.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | font = {#'family' : 'normal', 6 | # 'weight' : 'bold', 7 | 'size' : 9} 8 | plt.rc('font', **font) 9 | 10 | plt.ion() 11 | np.set_printoptions(precision=3, suppress=True) 12 | 13 | #dirz = glob.glob('trial-max10*') 14 | #dirz = glob.glob('trial-new-hardeasy*') 15 | #dirz = glob.glob('trial-easyhardeasy*') 16 | #dirz = glob.glob('trial-fixedsize*') 17 | dirz = glob.glob('trial-ref*EASYHARDEASY*') 18 | dirz.sort() 19 | NBPLOTS = len(dirz) 20 | SS = np.ceil(np.sqrt(NBPLOTS)) 21 | 22 | plt.figure(3, figsize=(4, 2.5), dpi=100, facecolor='w', edgecolor='k') 23 | 24 | nplot = 1 25 | thards= [] 26 | teasys=[] 27 | colorz=['b', 'b', 'b', 'r', 'g'] 28 | labelz = ['', '', 'Loss', '# Neurons'] 29 | for (num, droot) in enumerate(dirz): 30 | t = [] 31 | for v in range(20): 32 | dfull = droot + "/v" + str(v) 33 | t.append(np.loadtxt(dfull+"/output.txt")) 34 | t = np.dstack(t) 35 | tmean = np.mean(t, axis=2) 36 | tstd = np.std(t, axis=2) 37 | tmedian = np.median(t, axis=2) 38 | tq25 = np.percentile(t, 25, axis=2) 39 | tq75 = np.percentile(t, 75, axis=2) 40 | 41 | ax = plt.subplot(SS, SS, nplot) 42 | ax.set_title('Easy-Hard-Easy transition') 43 | for vari in [3, 2]: # range(2, tmean.shape[1]): 44 | plt.fill_between(range(tmean.shape[0]), tq25[:, vari], tq75[:, vari], linewidth=0.0, alpha=0.3, facecolor=colorz[vari]) 45 | plt.plot(tmedian[:, vari], color=colorz[vari], label=labelz[vari], linewidth=2) 46 | plt.axis([0, tmean.shape[0], 0, 50]) 47 | 48 | print num, tmean[90, :], tmean[190, :], tmean[-1, :], droot 49 | thards.append(tmean[90,:]) 50 | teasys.append(tmean[-1,:]) 51 | 52 | nplot += 1 53 | 54 | print "Data read." 
55 | 56 | plt.axvline(100, linestyle='--', c='k') 57 | plt.axvline(200, linestyle='--', c='k') 58 | plt.text(40, 30, 'Easy') 59 | plt.text(140, 30, 'Hard') 60 | plt.text(240, 30, 'Easy') 61 | 62 | plt.xlabel('Iterations (x1000)') 63 | #plt.ylabel('Loss', color='b') 64 | plt.legend(fontsize=8) 65 | plt.tight_layout() 66 | 67 | 68 | plt.show() 69 | 70 | plt.savefig('figEHE.png', bbox_inches='tight') 71 | -------------------------------------------------------------------------------- /makefigFS.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | font = {#'family' : 'normal', 6 | # 'weight' : 'bold', 7 | 'size' : 9} 8 | plt.rc('font', **font) 9 | 10 | plt.ion() 11 | np.set_printoptions(precision=3, suppress=True) 12 | 13 | #dirz = glob.glob('trial-max10*') 14 | #dirz = glob.glob('trial-new-hardeasy*') 15 | #dirz = glob.glob('trial-easyhardeasy*') 16 | dirz = glob.glob('trial-fixedsize*') 17 | dirz2 = glob.glob('trial-ref-*-HARD-*') 18 | 19 | dirz = dirz + dirz2 20 | dirz.sort() 21 | NBPLOTS = len(dirz) 22 | SS = np.ceil(np.sqrt(NBPLOTS)) 23 | linez=[] 24 | 25 | plt.figure(1, figsize=(4, 2.5), dpi=100, facecolor='w', edgecolor='k') 26 | 27 | nplot = 1 28 | thards= [] 29 | teasys=[] 30 | colorz=['b', 'r', 'g', 'm', 'c', 'orange'] 31 | labelz=['10 neurons', '100 neurons', '27 neurons', '30 neurons', '50 neurons', 'Variable Size'] 32 | for (num, droot) in enumerate(dirz): 33 | t = [] 34 | for v in range(20): 35 | dfull = droot + "/v" + str(v) 36 | t.append(np.loadtxt(dfull+"/output.txt")[:200, :]) 37 | t = np.dstack(t) 38 | tmean = np.mean(t, axis=2) 39 | tstd = np.std(t, axis=2) 40 | tmedian = np.median(t, axis=2) 41 | tq25 = np.percentile(t, 25, axis=2) 42 | tq75 = np.percentile(t, 75, axis=2) 43 | 44 | for vari in [2]: # range(2, tmean.shape[1]): 45 | #plt.fill_between(range(tmean.shape[0]), tq25[:, vari], tq75[:, vari], linewidth=0.0, alpha=0.3, facecolor=colorz[vari]) 46 | if num == len(dirz)-1: # The last curve is that of the variable-size runs 47 | linez.append(plt.plot(tmedian[:, vari], color='k', linewidth=2, label=labelz[num])) 48 | else: 49 | linez.append(plt.plot(tmedian[:, vari], color=colorz[num], label=labelz[num])) 50 | plt.axis([0, tmean.shape[0], 0, 50]) 51 | 52 | print num, tmean[90, :], tmean[190, :], tmean[-1, :], droot 53 | thards.append(tmean[90,:]) 54 | teasys.append(tmean[-1,:]) 55 | 56 | nplot += 1 57 | 58 | plt.xlabel('Iterations (x1000)') 59 | plt.ylabel('Loss') 60 | plt.legend(fontsize=8) 61 | plt.tight_layout() 62 | 63 | print "Data read." 
64 | 65 | plt.show() 66 | 67 | plt.savefig('figFS.png', bbox_inches='tight') 68 | -------------------------------------------------------------------------------- /makefigH.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | font = {#'family' : 'normal', 6 | # 'weight' : 'bold', 7 | 'size' : 10} 8 | plt.rc('font', **font) 9 | 10 | plt.ion() 11 | np.set_printoptions(precision=3, suppress=True) 12 | 13 | #dirz = glob.glob('trial-max10*') 14 | #dirz = glob.glob('trial-new-hardeasy*') 15 | #dirz = glob.glob('trial-easyhardeasy*') 16 | #dirz = glob.glob('trial-fixedsize*') 17 | dirz = glob.glob('trial-ref*HARD-*') 18 | dirz.sort() 19 | NBPLOTS = len(dirz) 20 | SS = np.ceil(np.sqrt(NBPLOTS)) 21 | 22 | plt.figure(2, figsize=(3, 2), dpi=100, facecolor='w', edgecolor='k') 23 | 24 | nplot = 1 25 | thards= [] 26 | teasys=[] 27 | colorz=['b', 'b', 'b', 'r', 'g'] 28 | labelz = ['', '', 'Loss', '# Neurons'] 29 | for (num, droot) in enumerate(dirz): 30 | t = [] 31 | for v in range(20): 32 | dfull = droot + "/v" + str(v) 33 | #t.append(np.loadtxt(dfull+"/output.txt")[:200,:]) 34 | t.append(np.loadtxt(dfull+"/output.txt")) 35 | t = np.dstack(t) 36 | tmean = np.mean(t, axis=2) 37 | tstd = np.std(t, axis=2) 38 | tmedian = np.median(t, axis=2) 39 | tq25 = np.percentile(t, 25, axis=2) 40 | tq75 = np.percentile(t, 75, axis=2) 41 | 42 | ax = plt.subplot(SS, SS, nplot) 43 | ax.set_title('Hard problem') 44 | for vari in [3, 2]: # range(2, tmean.shape[1]): 45 | plt.fill_between(range(tmean.shape[0]), tq25[:, vari], tq75[:, vari], linewidth=0.0, alpha=0.3, facecolor=colorz[vari]) 46 | plt.plot(tmedian[:, vari], color=colorz[vari], label=labelz[vari], linewidth=2) 47 | plt.axis([0, tmean.shape[0], 0, 50]) 48 | 49 | print num, tmean[90, :], tmean[190, :], tmean[-1, :], droot 50 | thards.append(tmean[90,:]) 51 | teasys.append(tmean[-1,:]) 52 | 53 | nplot += 1 54 | 55 | plt.xlabel('Iterations (x1000)') 56 | #plt.ylabel('Loss', color='b') 57 | #plt.legend(fontsize=8) 58 | plt.tight_layout() 59 | 60 | print "Data read." 
61 | 62 | plt.show() 63 | 64 | plt.savefig('figH.png', bbox_inches='tight') 65 | -------------------------------------------------------------------------------- /makefigmulti.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | font = {#'family' : 'normal', 6 | # 'weight' : 'bold', 7 | 'size' : 10} 8 | plt.rc('font', **font) 9 | 10 | plt.ion() 11 | np.set_printoptions(precision=3, suppress=True) 12 | 13 | #dirz = glob.glob('trial-max10*') 14 | #dirz = glob.glob('trial-new-hardeasy*') 15 | #dirz = glob.glob('trial-EHE*MULTIPGRAD-1*') 16 | dirz = glob.glob('trial-EHE*900000*') 17 | #dirz = glob.glob('trial-fixedsize*') 18 | #dirz = glob.glob('trial-ref*EASYHARDEASY*') 19 | dirz.sort() 20 | NBPLOTS = len(dirz) 21 | SS = np.ceil(np.sqrt(NBPLOTS)) 22 | 23 | plt.figure(1, figsize=(4, 3), dpi=100, facecolor='w', edgecolor='k') 24 | 25 | nplot = 1 26 | perfs = [] 27 | nbneurs = [] 28 | dirs = [] 29 | colorz=['b', 'b', 'b', 'r', 'g'] 30 | for (num, droot) in enumerate(dirz): 31 | t = [] 32 | for v in range(10): 33 | dfull = droot + "/v" + str(v) 34 | t.append(np.loadtxt(dfull+"/output.txt")) 35 | t = np.dstack(t) 36 | tmean = np.mean(t, axis=2) 37 | tstd = np.std(t, axis=2) 38 | tmedian = np.median(t, axis=2) 39 | tq25 = np.percentile(t, 25, axis=2) 40 | tq75 = np.percentile(t, 75, axis=2) 41 | 42 | ax = plt.subplot(SS, SS, nplot) 43 | ax.set_title(num) 44 | for vari in [3, 2]: # range(2, tmean.shape[1]): 45 | plt.fill_between(range(tmean.shape[0]), tq25[:, vari], tq75[:, vari], linewidth=0.0, alpha=0.3, facecolor=colorz[vari]) 46 | plt.plot(tmedian[:, vari], color=colorz[vari]) 47 | plt.axis([0, tmean.shape[0], 0, 50]) 48 | 49 | p1 = int(tmean.shape[0] / 3) 50 | p2 = 2*int(tmean.shape[0] / 3) 51 | p3 = -1 52 | 53 | print num, tmean[p1, :], tmean[p2, :], tmean[p3, :], droot 54 | perfs.append([tmean[p1,2], tmean[p2, 2], tmean[p3, 2]]) 55 | nbneurs.append([tmean[p1,3], tmean[p2, 3], tmean[p3, 3]]) 56 | dirs.append(droot) 57 | 58 | nplot += 1 59 | 60 | print "Data read." 61 | 62 | perfs = np.array(perfs) 63 | p = perfs[:,1] 64 | nbneurs = np.array(nbneurs) 65 | dneur = nbneurs[:, 1] - nbneurs[:,2] 66 | ord = np.argsort(p) 67 | data = np.vstack((ord, dneur[ord], p[ord])).T 68 | 69 | 70 | plt.show() 71 | 72 | #plt.savefig('fig1.png', bbox_inches='tight') 73 | -------------------------------------------------------------------------------- /min-char-rnn-param.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy) 3 | Modified to take parameters from the command line. 4 | BSD License 5 | """ 6 | import numpy as np 7 | import sys 8 | 9 | g = { 10 | 'NBSTEPS' : 300000, 11 | 'HIDDENSIZE' : 100, 12 | 'RNGSEED' : 0 13 | } 14 | 15 | argpairs = [sys.argv[i:i+2] for i in range(1, len(sys.argv), 2)] 16 | 17 | for argpair in argpairs: 18 | if not (argpair[0] in g): 19 | sys.exit("Error, tried to pass value of non-existent parameter "+argpair[0]) 20 | g[argpair[0]] = int(argpair[1]) 21 | print g 22 | 23 | # data I/O 24 | myf = open("output.txt", "w") 25 | myf.close() 26 | data = open('../../inputhard.txt', 'r').read() # should be simple plain text file 27 | chars = list(set(data)) 28 | data_size, vocab_size = len(data), len(chars) 29 | print 'data has %d characters, %d unique.' 
% (data_size, vocab_size) 30 | char_to_ix = { ch:i for i,ch in enumerate(chars) } 31 | ix_to_char = { i:ch for i,ch in enumerate(chars) } 32 | 33 | # hyperparameters 34 | hidden_size = g['HIDDENSIZE'] # size of hidden layer of neurons 35 | seq_length = 40 # number of steps to unroll the RNN for 36 | learning_rate = 1e-1 37 | 38 | # model parameters 39 | Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden 40 | Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden 41 | Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output 42 | bh = np.zeros((hidden_size, 1)) # hidden bias 43 | by = np.zeros((vocab_size, 1)) # output bias 44 | 45 | def lossFun(inputs, targets, hprev): 46 | """ 47 | inputs,targets are both list of integers. 48 | hprev is Hx1 array of initial hidden state 49 | returns the loss, gradients on model parameters, and last hidden state 50 | """ 51 | xs, hs, ys, ps = {}, {}, {}, {} 52 | hs[-1] = np.copy(hprev) 53 | loss = 0 54 | # forward pass 55 | for t in xrange(len(inputs)): 56 | xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation 57 | xs[t][inputs[t]] = 1 58 | hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state 59 | ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars 60 | ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars 61 | loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss) 62 | # backward pass: compute gradients going backwards 63 | dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) 64 | dbh, dby = np.zeros_like(bh), np.zeros_like(by) 65 | dhnext = np.zeros_like(hs[0]) 66 | for t in reversed(xrange(len(inputs))): 67 | dy = np.copy(ps[t]) 68 | dy[targets[t]] -= 1 # backprop into y. 
see http://cs231n.github.io/neural-networks-case-study/#grad if confused here 69 | dWhy += np.dot(dy, hs[t].T) 70 | dby += dy 71 | dh = np.dot(Why.T, dy) + dhnext # backprop into h 72 | dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity 73 | dbh += dhraw 74 | dWxh += np.dot(dhraw, xs[t].T) 75 | dWhh += np.dot(dhraw, hs[t-1].T) 76 | dhnext = np.dot(Whh.T, dhraw) 77 | for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 78 | np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 79 | return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] 80 | 81 | def sample(h, seed_ix, n): 82 | """ 83 | sample a sequence of integers from the model 84 | h is memory state, seed_ix is seed letter for first time step 85 | """ 86 | x = np.zeros((vocab_size, 1)) 87 | x[seed_ix] = 1 88 | ixes = [] 89 | for t in xrange(n): 90 | h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) 91 | y = np.dot(Why, h) + by 92 | p = np.exp(y) / np.sum(np.exp(y)) 93 | ix = np.random.choice(range(vocab_size), p=p.ravel()) 94 | x = np.zeros((vocab_size, 1)) 95 | x[ix] = 1 96 | ixes.append(ix) 97 | return ixes 98 | 99 | n, p = 0, 0 100 | mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) 101 | mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad 102 | smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 103 | while True: 104 | # prepare inputs (we're sweeping from left to right in steps seq_length long) 105 | if p+seq_length+1 >= len(data) or n == 0: 106 | hprev = np.zeros((hidden_size,1)) # reset RNN memory 107 | p = 0 # go from start of data 108 | inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]] 109 | targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] 110 | 111 | # sample from the model now and then 112 | if n % 100 == 0: 113 | sample_ix = sample(hprev, inputs[0], 200) 114 | txt = ''.join(ix_to_char[ix] for ix in sample_ix) 115 | print '----\n %s \n----' % (txt, ) 116 | 117 | # forward seq_length characters through the net and fetch gradient 118 | loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev) 119 | smooth_loss = smooth_loss * 0.999 + loss * 0.001 120 | if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress 121 | 122 | if n % 1000 == 0: 123 | with open("output.txt", "a") as myf: 124 | msg = "%d %d %f %d " % (n, p, smooth_loss, hidden_size) # print progress 125 | myf.write(msg+"\n") 126 | 127 | # perform parameter update with Adagrad 128 | for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 129 | [dWxh, dWhh, dWhy, dbh, dby], 130 | [mWxh, mWhh, mWhy, mbh, mby]): 131 | mem += dparam * dparam 132 | param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update 133 | 134 | p += seq_length # move data pointer 135 | n += 1 # iteration counter 136 | if n > g['NBSTEPS']: 137 | sys.exit(0) 138 | 139 | -------------------------------------------------------------------------------- /paper/bare_jrnl.tex: -------------------------------------------------------------------------------- 1 | 2 | %% bare_jrnl.tex 3 | %% V1.4b 4 | %% 2015/08/26 5 | %% by Michael Shell 6 | %% see http://www.michaelshell.org/ 7 | %% for current contact information. 8 | %% 9 | %% This is a skeleton file demonstrating the use of IEEEtran.cls 10 | %% (requires IEEEtran.cls version 1.8b or later) with an IEEE 11 | %% journal paper. 
12 | %% 13 | %% Support sites: 14 | %% http://www.michaelshell.org/tex/ieeetran/ 15 | %% http://www.ctan.org/pkg/ieeetran 16 | %% and 17 | %% http://www.ieee.org/ 18 | 19 | %%************************************************************************* 20 | %% Legal Notice: 21 | %% This code is offered as-is without any warranty either expressed or 22 | %% implied; without even the implied warranty of MERCHANTABILITY or 23 | %% FITNESS FOR A PARTICULAR PURPOSE! 24 | %% User assumes all risk. 25 | %% In no event shall the IEEE or any contributor to this code be liable for 26 | %% any damages or losses, including, but not limited to, incidental, 27 | %% consequential, or any other damages, resulting from the use or misuse 28 | %% of any information contained here. 29 | %% 30 | %% All comments are the opinions of their respective authors and are not 31 | %% necessarily endorsed by the IEEE. 32 | %% 33 | %% This work is distributed under the LaTeX Project Public License (LPPL) 34 | %% ( http://www.latex-project.org/ ) version 1.3, and may be freely used, 35 | %% distributed and modified. A copy of the LPPL, version 1.3, is included 36 | %% in the base LaTeX documentation of all distributions of LaTeX released 37 | %% 2003/12/01 or later. 38 | %% Retain all contribution notices and credits. 39 | %% ** Modified files should be clearly indicated as such, including ** 40 | %% ** renaming them and changing author support contact information. ** 41 | %%************************************************************************* 42 | 43 | 44 | % *** Authors should verify (and, if needed, correct) their LaTeX system *** 45 | % *** with the testflow diagnostic prior to trusting their LaTeX platform *** 46 | % *** with production work. The IEEE's font choices and paper sizes can *** 47 | % *** trigger bugs that do not appear when using other class files. *** *** 48 | % The testflow support page is at: 49 | % http://www.michaelshell.org/tex/testflow/ 50 | 51 | 52 | 53 | \documentclass[journal]{IEEEtran} 54 | % 55 | % If IEEEtran.cls has not been installed into the LaTeX system files, 56 | % manually specify the path to it like: 57 | % \documentclass[journal]{../sty/IEEEtran} 58 | 59 | 60 | 61 | 62 | 63 | % Some very useful LaTeX packages include: 64 | % (uncomment the ones you want to load) 65 | 66 | 67 | % *** MISC UTILITY PACKAGES *** 68 | % 69 | %\usepackage{ifpdf} 70 | % Heiko Oberdiek's ifpdf.sty is very useful if you need conditional 71 | % compilation based on whether the output is pdf or dvi. 72 | % usage: 73 | % \ifpdf 74 | % % pdf code 75 | % \else 76 | % % dvi code 77 | % \fi 78 | % The latest version of ifpdf.sty can be obtained from: 79 | % http://www.ctan.org/pkg/ifpdf 80 | % Also, note that IEEEtran.cls V1.7 and later provides a builtin 81 | % \ifCLASSINFOpdf conditional that works the same way. 82 | % When switching from latex to pdflatex and vice-versa, the compiler may 83 | % have to be run twice to clear warning/error messages. 84 | 85 | 86 | 87 | 88 | 89 | 90 | % *** CITATION PACKAGES *** 91 | % 92 | %\usepackage{cite} 93 | % cite.sty was written by Donald Arseneau 94 | % V1.6 and later of IEEEtran pre-defines the format of the cite.sty package 95 | % \cite{} output to follow that of the IEEE. Loading the cite package will 96 | % result in citation numbers being automatically sorted and properly 97 | % "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using 98 | % cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. 
cite.sty's 99 | % \cite will automatically add leading space, if needed. Use cite.sty's 100 | % noadjust option (cite.sty V3.8 and later) if you want to turn this off 101 | % such as if a citation ever needs to be enclosed in parenthesis. 102 | % cite.sty is already installed on most LaTeX systems. Be sure and use 103 | % version 5.0 (2009-03-20) and later if using hyperref.sty. 104 | % The latest version can be obtained at: 105 | % http://www.ctan.org/pkg/cite 106 | % The documentation is contained in the cite.sty file itself. 107 | 108 | 109 | 110 | 111 | 112 | 113 | % *** GRAPHICS RELATED PACKAGES *** 114 | % 115 | \ifCLASSINFOpdf 116 | % \usepackage[pdftex]{graphicx} 117 | % declare the path(s) where your graphic files are 118 | % \graphicspath{{../pdf/}{../jpeg/}} 119 | % and their extensions so you won't have to specify these with 120 | % every instance of \includegraphics 121 | % \DeclareGraphicsExtensions{.pdf,.jpeg,.png} 122 | \else 123 | % or other class option (dvipsone, dvipdf, if not using dvips). graphicx 124 | % will default to the driver specified in the system graphics.cfg if no 125 | % driver is specified. 126 | % \usepackage[dvips]{graphicx} 127 | % declare the path(s) where your graphic files are 128 | % \graphicspath{{../eps/}} 129 | % and their extensions so you won't have to specify these with 130 | % every instance of \includegraphics 131 | % \DeclareGraphicsExtensions{.eps} 132 | \fi 133 | % graphicx was written by David Carlisle and Sebastian Rahtz. It is 134 | % required if you want graphics, photos, etc. graphicx.sty is already 135 | % installed on most LaTeX systems. The latest version and documentation 136 | % can be obtained at: 137 | % http://www.ctan.org/pkg/graphicx 138 | % Another good source of documentation is "Using Imported Graphics in 139 | % LaTeX2e" by Keith Reckdahl which can be found at: 140 | % http://www.ctan.org/pkg/epslatex 141 | % 142 | % latex, and pdflatex in dvi mode, support graphics in encapsulated 143 | % postscript (.eps) format. pdflatex in pdf mode supports graphics 144 | % in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure 145 | % that all non-photo figures use a vector format (.eps, .pdf, .mps) and 146 | % not a bitmapped formats (.jpeg, .png). The IEEE frowns on bitmapped formats 147 | % which can result in "jaggedy"/blurry rendering of lines and letters as 148 | % well as large increases in file sizes. 149 | % 150 | % You can find documentation about the pdfTeX application at: 151 | % http://www.tug.org/applications/pdftex 152 | 153 | 154 | 155 | 156 | 157 | % *** MATH PACKAGES *** 158 | % 159 | %\usepackage{amsmath} 160 | % A popular package from the American Mathematical Society that provides 161 | % many useful and powerful commands for dealing with mathematics. 162 | % 163 | % Note that the amsmath package sets \interdisplaylinepenalty to 10000 164 | % thus preventing page breaks from occurring within multiline equations. Use: 165 | %\interdisplaylinepenalty=2500 166 | % after loading amsmath to restore such page breaks as IEEEtran.cls normally 167 | % does. amsmath.sty is already installed on most LaTeX systems. The latest 168 | % version and documentation can be obtained at: 169 | % http://www.ctan.org/pkg/amsmath 170 | 171 | 172 | 173 | 174 | 175 | % *** SPECIALIZED LIST PACKAGES *** 176 | % 177 | %\usepackage{algorithmic} 178 | % algorithmic.sty was written by Peter Williams and Rogerio Brito. 179 | % This package provides an algorithmic environment fo describing algorithms. 
180 | % You can use the algorithmic environment in-text or within a figure 181 | % environment to provide for a floating algorithm. Do NOT use the algorithm 182 | % floating environment provided by algorithm.sty (by the same authors) or 183 | % algorithm2e.sty (by Christophe Fiorio) as the IEEE does not use dedicated 184 | % algorithm float types and packages that provide these will not provide 185 | % correct IEEE style captions. The latest version and documentation of 186 | % algorithmic.sty can be obtained at: 187 | % http://www.ctan.org/pkg/algorithms 188 | % Also of interest may be the (relatively newer and more customizable) 189 | % algorithmicx.sty package by Szasz Janos: 190 | % http://www.ctan.org/pkg/algorithmicx 191 | 192 | 193 | 194 | 195 | % *** ALIGNMENT PACKAGES *** 196 | % 197 | %\usepackage{array} 198 | % Frank Mittelbach's and David Carlisle's array.sty patches and improves 199 | % the standard LaTeX2e array and tabular environments to provide better 200 | % appearance and additional user controls. As the default LaTeX2e table 201 | % generation code is lacking to the point of almost being broken with 202 | % respect to the quality of the end results, all users are strongly 203 | % advised to use an enhanced (at the very least that provided by array.sty) 204 | % set of table tools. array.sty is already installed on most systems. The 205 | % latest version and documentation can be obtained at: 206 | % http://www.ctan.org/pkg/array 207 | 208 | 209 | % IEEEtran contains the IEEEeqnarray family of commands that can be used to 210 | % generate multiline equations as well as matrices, tables, etc., of high 211 | % quality. 212 | 213 | 214 | 215 | 216 | % *** SUBFIGURE PACKAGES *** 217 | %\ifCLASSOPTIONcompsoc 218 | % \usepackage[caption=false,font=normalsize,labelfont=sf,textfont=sf]{subfig} 219 | %\else 220 | % \usepackage[caption=false,font=footnotesize]{subfig} 221 | %\fi 222 | % subfig.sty, written by Steven Douglas Cochran, is the modern replacement 223 | % for subfigure.sty, the latter of which is no longer maintained and is 224 | % incompatible with some LaTeX packages including fixltx2e. However, 225 | % subfig.sty requires and automatically loads Axel Sommerfeldt's caption.sty 226 | % which will override IEEEtran.cls' handling of captions and this will result 227 | % in non-IEEE style figure/table captions. To prevent this problem, be sure 228 | % and invoke subfig.sty's "caption=false" package option (available since 229 | % subfig.sty version 1.3, 2005/06/28) as this is will preserve IEEEtran.cls 230 | % handling of captions. 231 | % Note that the Computer Society format requires a larger sans serif font 232 | % than the serif footnote size font used in traditional IEEE formatting 233 | % and thus the need to invoke different subfig.sty package options depending 234 | % on whether compsoc mode has been enabled. 235 | % 236 | % The latest version and documentation of subfig.sty can be obtained at: 237 | % http://www.ctan.org/pkg/subfig 238 | 239 | 240 | 241 | 242 | % *** FLOAT PACKAGES *** 243 | % 244 | %\usepackage{fixltx2e} 245 | % fixltx2e, the successor to the earlier fix2col.sty, was written by 246 | % Frank Mittelbach and David Carlisle. This package corrects a few problems 247 | % in the LaTeX2e kernel, the most notable of which is that in current 248 | % LaTeX2e releases, the ordering of single and double column floats is not 249 | % guaranteed to be preserved. 
Thus, an unpatched LaTeX2e can allow a 250 | % single column figure to be placed prior to an earlier double column 251 | % figure. 252 | % Be aware that LaTeX2e kernels dated 2015 and later have fixltx2e.sty's 253 | % corrections already built into the system in which case a warning will 254 | % be issued if an attempt is made to load fixltx2e.sty as it is no longer 255 | % needed. 256 | % The latest version and documentation can be found at: 257 | % http://www.ctan.org/pkg/fixltx2e 258 | 259 | 260 | %\usepackage{stfloats} 261 | % stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e 262 | % the ability to do double column floats at the bottom of the page as well 263 | % as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in 264 | % LaTeX2e). It also provides a command: 265 | %\fnbelowfloat 266 | % to enable the placement of footnotes below bottom floats (the standard 267 | % LaTeX2e kernel puts them above bottom floats). This is an invasive package 268 | % which rewrites many portions of the LaTeX2e float routines. It may not work 269 | % with other packages that modify the LaTeX2e float routines. The latest 270 | % version and documentation can be obtained at: 271 | % http://www.ctan.org/pkg/stfloats 272 | % Do not use the stfloats baselinefloat ability as the IEEE does not allow 273 | % \baselineskip to stretch. Authors submitting work to the IEEE should note 274 | % that the IEEE rarely uses double column equations and that authors should try 275 | % to avoid such use. Do not be tempted to use the cuted.sty or midfloat.sty 276 | % packages (also by Sigitas Tolusis) as the IEEE does not format its papers in 277 | % such ways. 278 | % Do not attempt to use stfloats with fixltx2e as they are incompatible. 279 | % Instead, use Morten Hogholm'a dblfloatfix which combines the features 280 | % of both fixltx2e and stfloats: 281 | % 282 | % \usepackage{dblfloatfix} 283 | % The latest version can be found at: 284 | % http://www.ctan.org/pkg/dblfloatfix 285 | 286 | 287 | 288 | 289 | %\ifCLASSOPTIONcaptionsoff 290 | % \usepackage[nomarkers]{endfloat} 291 | % \let\MYoriglatexcaption\caption 292 | % \renewcommand{\caption}[2][\relax]{\MYoriglatexcaption[#2]{#2}} 293 | %\fi 294 | % endfloat.sty was written by James Darrell McCauley, Jeff Goldberg and 295 | % Axel Sommerfeldt. This package may be useful when used in conjunction with 296 | % IEEEtran.cls' captionsoff option. Some IEEE journals/societies require that 297 | % submissions have lists of figures/tables at the end of the paper and that 298 | % figures/tables without any captions are placed on a page by themselves at 299 | % the end of the document. If needed, the draftcls IEEEtran class option or 300 | % \CLASSINPUTbaselinestretch interface can be used to increase the line 301 | % spacing as well. Be sure and use the nomarkers option of endfloat to 302 | % prevent endfloat from "marking" where the figures would have been placed 303 | % in the text. The two hack lines of code above are a slight modification of 304 | % that suggested by in the endfloat docs (section 8.4.1) to ensure that 305 | % the full captions always appear in the list of figures/tables - even if 306 | % the user used the short optional argument of \caption[]{}. 307 | % IEEE papers do not typically make use of \caption[]'s optional argument, 308 | % so this should not be an issue. 
A similar trick can be used to disable 309 | % captions of packages such as subfig.sty that lack options to turn off 310 | % the subcaptions: 311 | % For subfig.sty: 312 | % \let\MYorigsubfloat\subfloat 313 | % \renewcommand{\subfloat}[2][\relax]{\MYorigsubfloat[]{#2}} 314 | % However, the above trick will not work if both optional arguments of 315 | % the \subfloat command are used. Furthermore, there needs to be a 316 | % description of each subfigure *somewhere* and endfloat does not add 317 | % subfigure captions to its list of figures. Thus, the best approach is to 318 | % avoid the use of subfigure captions (many IEEE journals avoid them anyway) 319 | % and instead reference/explain all the subfigures within the main caption. 320 | % The latest version of endfloat.sty and its documentation can obtained at: 321 | % http://www.ctan.org/pkg/endfloat 322 | % 323 | % The IEEEtran \ifCLASSOPTIONcaptionsoff conditional can also be used 324 | % later in the document, say, to conditionally put the References on a 325 | % page by themselves. 326 | 327 | 328 | 329 | 330 | % *** PDF, URL AND HYPERLINK PACKAGES *** 331 | % 332 | %\usepackage{url} 333 | % url.sty was written by Donald Arseneau. It provides better support for 334 | % handling and breaking URLs. url.sty is already installed on most LaTeX 335 | % systems. The latest version and documentation can be obtained at: 336 | % http://www.ctan.org/pkg/url 337 | % Basically, \url{my_url_here}. 338 | 339 | 340 | 341 | 342 | % *** Do not adjust lengths that control margins, column widths, etc. *** 343 | % *** Do not use packages that alter fonts (such as pslatex). *** 344 | % There should be no need to do such things with IEEEtran.cls V1.6 and later. 345 | % (Unless specifically asked to do so by the journal or conference you plan 346 | % to submit to, of course. ) 347 | 348 | 349 | % correct bad hyphenation here 350 | \hyphenation{op-tical net-works semi-conduc-tor} 351 | 352 | 353 | \begin{document} 354 | % 355 | % paper title 356 | % Titles are generally capitalized except for words such as a, an, and, as, 357 | % at, but, by, for, in, nor, of, on, or, the, to and up, which are usually 358 | % not capitalized unless they are the first or last word of the title. 359 | % Linebreaks \\ can be used within to get better formatting as desired. 360 | % Do not put math or special symbols in the title. 361 | \title{Bare Demo of IEEEtran.cls\\ for IEEE Journals} 362 | % 363 | % 364 | % author names and IEEE memberships 365 | % note positions of commas and nonbreaking spaces ( ~ ) LaTeX will not break 366 | % a structure at a ~ so this keeps an author's name from being broken across 367 | % two lines. 368 | % use \thanks{} to gain access to the first footnote area 369 | % a separate \thanks must be used for each paragraph as LaTeX2e's \thanks 370 | % was not built to handle multiple paragraphs 371 | % 372 | 373 | \author{Michael~Shell,~\IEEEmembership{Member,~IEEE,} 374 | John~Doe,~\IEEEmembership{Fellow,~OSA,} 375 | and~Jane~Doe,~\IEEEmembership{Life~Fellow,~IEEE}% <-this % stops a space 376 | \thanks{M. Shell was with the Department 377 | of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, 378 | GA, 30332 USA e-mail: (see http://www.michaelshell.org/contact.html).}% <-this % stops a space 379 | \thanks{J. Doe and J. 
Doe are with Anonymous University.}% <-this % stops a space 380 | \thanks{Manuscript received April 19, 2005; revised August 26, 2015.}} 381 | 382 | % note the % following the last \IEEEmembership and also \thanks - 383 | % these prevent an unwanted space from occurring between the last author name 384 | % and the end of the author line. i.e., if you had this: 385 | % 386 | % \author{....lastname \thanks{...} \thanks{...} } 387 | % ^------------^------------^----Do not want these spaces! 388 | % 389 | % a space would be appended to the last name and could cause every name on that 390 | % line to be shifted left slightly. This is one of those "LaTeX things". For 391 | % instance, "\textbf{A} \textbf{B}" will typeset as "A B" not "AB". To get 392 | % "AB" then you have to do: "\textbf{A}\textbf{B}" 393 | % \thanks is no different in this regard, so shield the last } of each \thanks 394 | % that ends a line with a % and do not let a space in before the next \thanks. 395 | % Spaces after \IEEEmembership other than the last one are OK (and needed) as 396 | % you are supposed to have spaces between the names. For what it is worth, 397 | % this is a minor point as most people would not even notice if the said evil 398 | % space somehow managed to creep in. 399 | 400 | 401 | 402 | % The paper headers 403 | \markboth{Journal of \LaTeX\ Class Files,~Vol.~14, No.~8, August~2015}% 404 | {Shell \MakeLowercase{\textit{et al.}}: Bare Demo of IEEEtran.cls for IEEE Journals} 405 | % The only time the second header will appear is for the odd numbered pages 406 | % after the title page when using the twoside option. 407 | % 408 | % *** Note that you probably will NOT want to include the author's *** 409 | % *** name in the headers of peer review papers. *** 410 | % You can use \ifCLASSOPTIONpeerreview for conditional compilation here if 411 | % you desire. 412 | 413 | 414 | 415 | 416 | % If you want to put a publisher's ID mark on the page you can do it like 417 | % this: 418 | %\IEEEpubid{0000--0000/00\$00.00~\copyright~2015 IEEE} 419 | % Remember, if you use this you must call \IEEEpubidadjcol in the second 420 | % column for its text to clear the IEEEpubid mark. 421 | 422 | 423 | 424 | % use for special paper notices 425 | %\IEEEspecialpapernotice{(Invited Paper)} 426 | 427 | 428 | 429 | 430 | % make the title area 431 | \maketitle 432 | 433 | % As a general rule, do not put math, special symbols or citations 434 | % in the abstract or keywords. 435 | \begin{abstract} 436 | The abstract goes here. 437 | \end{abstract} 438 | 439 | % Note that keywords are not normally used for peerreview papers. 440 | \begin{IEEEkeywords} 441 | IEEE, IEEEtran, journal, \LaTeX, paper, template. 442 | \end{IEEEkeywords} 443 | 444 | 445 | 446 | 447 | 448 | 449 | % For peer review papers, you can put extra information on the cover 450 | % page as needed: 451 | % \ifCLASSOPTIONpeerreview 452 | % \begin{center} \bfseries EDICS Category: 3-BBND \end{center} 453 | % \fi 454 | % 455 | % For peerreview papers, this IEEEtran command inserts a page break and 456 | % creates the second title. It will be ignored for other modes. 457 | \IEEEpeerreviewmaketitle 458 | 459 | 460 | 461 | \section{Introduction} 462 | % The very first letter is a 2 line initial drop letter followed 463 | % by the rest of the first word in caps. 464 | % 465 | % form to use if the first word consists of a single letter: 466 | % \IEEEPARstart{A}{demo} file is .... 
467 | % 468 | % form to use if you need the single drop letter followed by 469 | % normal text (unknown if ever used by the IEEE): 470 | % \IEEEPARstart{A}{}demo file is .... 471 | % 472 | % Some journals put the first two words in caps: 473 | % \IEEEPARstart{T}{his demo} file is .... 474 | % 475 | % Here we have the typical use of a "T" for an initial drop letter 476 | % and "HIS" in caps to complete the first word. 477 | \IEEEPARstart{T}{his} demo file is intended to serve as a ``starter file'' 478 | for IEEE journal papers produced under \LaTeX\ using 479 | IEEEtran.cls version 1.8b and later. 480 | % You must have at least 2 lines in the paragraph with the drop letter 481 | % (should never be an issue) 482 | I wish you the best of success. 483 | 484 | \hfill mds 485 | 486 | \hfill August 26, 2015 487 | 488 | \subsection{Subsection Heading Here} 489 | Subsection text here. 490 | 491 | % needed in second column of first page if using \IEEEpubid 492 | %\IEEEpubidadjcol 493 | 494 | \subsubsection{Subsubsection Heading Here} 495 | Subsubsection text here. 496 | 497 | 498 | % An example of a floating figure using the graphicx package. 499 | % Note that \label must occur AFTER (or within) \caption. 500 | % For figures, \caption should occur after the \includegraphics. 501 | % Note that IEEEtran v1.7 and later has special internal code that 502 | % is designed to preserve the operation of \label within \caption 503 | % even when the captionsoff option is in effect. However, because 504 | % of issues like this, it may be the safest practice to put all your 505 | % \label just after \caption rather than within \caption{}. 506 | % 507 | % Reminder: the "draftcls" or "draftclsnofoot", not "draft", class 508 | % option should be used if it is desired that the figures are to be 509 | % displayed while in draft mode. 510 | % 511 | %\begin{figure}[!t] 512 | %\centering 513 | %\includegraphics[width=2.5in]{myfigure} 514 | % where an .eps filename suffix will be assumed under latex, 515 | % and a .pdf suffix will be assumed for pdflatex; or what has been declared 516 | % via \DeclareGraphicsExtensions. 517 | %\caption{Simulation results for the network.} 518 | %\label{fig_sim} 519 | %\end{figure} 520 | 521 | % Note that the IEEE typically puts floats only at the top, even when this 522 | % results in a large percentage of a column being occupied by floats. 523 | 524 | 525 | % An example of a double column floating figure using two subfigures. 526 | % (The subfig.sty package must be loaded for this to work.) 527 | % The subfigure \label commands are set within each subfloat command, 528 | % and the \label for the overall figure must come after \caption. 529 | % \hfil is used as a separator to get equal spacing. 530 | % Watch out that the combined width of all the subfigures on a 531 | % line do not exceed the text width or a line break will occur. 532 | % 533 | %\begin{figure*}[!t] 534 | %\centering 535 | %\subfloat[Case I]{\includegraphics[width=2.5in]{box}% 536 | %\label{fig_first_case}} 537 | %\hfil 538 | %\subfloat[Case II]{\includegraphics[width=2.5in]{box}% 539 | %\label{fig_second_case}} 540 | %\caption{Simulation results for the network.} 541 | %\label{fig_sim} 542 | %\end{figure*} 543 | % 544 | % Note that often IEEE papers with subfigures do not employ subfigure 545 | % captions (using the optional argument to \subfloat[]), but instead will 546 | % reference/describe all of them (a), (b), etc., within the main caption. 
547 | % Be aware that for subfig.sty to generate the (a), (b), etc., subfigure 548 | % labels, the optional argument to \subfloat must be present. If a 549 | % subcaption is not desired, just leave its contents blank, 550 | % e.g., \subfloat[]. 551 | 552 | 553 | % An example of a floating table. Note that, for IEEE style tables, the 554 | % \caption command should come BEFORE the table and, given that table 555 | % captions serve much like titles, are usually capitalized except for words 556 | % such as a, an, and, as, at, but, by, for, in, nor, of, on, or, the, to 557 | % and up, which are usually not capitalized unless they are the first or 558 | % last word of the caption. Table text will default to \footnotesize as 559 | % the IEEE normally uses this smaller font for tables. 560 | % The \label must come after \caption as always. 561 | % 562 | %\begin{table}[!t] 563 | %% increase table row spacing, adjust to taste 564 | %\renewcommand{\arraystretch}{1.3} 565 | % if using array.sty, it might be a good idea to tweak the value of 566 | % \extrarowheight as needed to properly center the text within the cells 567 | %\caption{An Example of a Table} 568 | %\label{table_example} 569 | %\centering 570 | %% Some packages, such as MDW tools, offer better commands for making tables 571 | %% than the plain LaTeX2e tabular which is used here. 572 | %\begin{tabular}{|c||c|} 573 | %\hline 574 | %One & Two\\ 575 | %\hline 576 | %Three & Four\\ 577 | %\hline 578 | %\end{tabular} 579 | %\end{table} 580 | 581 | 582 | % Note that the IEEE does not put floats in the very first column 583 | % - or typically anywhere on the first page for that matter. Also, 584 | % in-text middle ("here") positioning is typically not used, but it 585 | % is allowed and encouraged for Computer Society conferences (but 586 | % not Computer Society journals). Most IEEE journals/conferences use 587 | % top floats exclusively. 588 | % Note that, LaTeX2e, unlike IEEE journals/conferences, places 589 | % footnotes above bottom floats. This can be corrected via the 590 | % \fnbelowfloat command of the stfloats package. 591 | 592 | 593 | 594 | 595 | \section{Conclusion} 596 | The conclusion goes here. 597 | 598 | 599 | 600 | 601 | 602 | % if have a single appendix: 603 | %\appendix[Proof of the Zonklar Equations] 604 | % or 605 | %\appendix % for no appendix heading 606 | % do not use \section anymore after \appendix, only \section* 607 | % is possibly needed 608 | 609 | % use appendices with more than one appendix 610 | % then use \section to start each appendix 611 | % you must declare a \section before using any 612 | % \subsection or using \label (\appendices by itself 613 | % starts a section numbered zero.) 614 | % 615 | 616 | 617 | \appendices 618 | \section{Proof of the First Zonklar Equation} 619 | Appendix one text goes here. 620 | 621 | % you can choose not to have a title for an appendix 622 | % if you want by leaving the argument blank 623 | \section{} 624 | Appendix two text goes here. 625 | 626 | 627 | % use section* for acknowledgment 628 | \section*{Acknowledgment} 629 | 630 | 631 | The authors would like to thank... 632 | 633 | 634 | % Can use something like this to put references on a page 635 | % by themselves when using endfloat and the captionsoff option. 
636 | \ifCLASSOPTIONcaptionsoff 637 | \newpage 638 | \fi 639 | 640 | 641 | 642 | % trigger a \newpage just before the given reference 643 | % number - used to balance the columns on the last page 644 | % adjust value as needed - may need to be readjusted if 645 | % the document is modified later 646 | %\IEEEtriggeratref{8} 647 | % The "triggered" command can be changed if desired: 648 | %\IEEEtriggercmd{\enlargethispage{-5in}} 649 | 650 | % references section 651 | 652 | % can use a bibliography generated by BibTeX as a .bbl file 653 | % BibTeX documentation can be easily obtained at: 654 | % http://mirror.ctan.org/biblio/bibtex/contrib/doc/ 655 | % The IEEEtran BibTeX style support page is at: 656 | % http://www.michaelshell.org/tex/ieeetran/bibtex/ 657 | %\bibliographystyle{IEEEtran} 658 | % argument is your BibTeX string definitions and bibliography database(s) 659 | %\bibliography{IEEEabrv,../bib/paper} 660 | % 661 | % manually copy in the resultant .bbl file 662 | % set second argument of \begin to the number of references 663 | % (used to reserve space for the reference number labels box) 664 | \begin{thebibliography}{1} 665 | 666 | \bibitem{IEEEhowto:kopka} 667 | H.~Kopka and P.~W. Daly, \emph{A Guide to \LaTeX}, 3rd~ed.\hskip 1em plus 668 | 0.5em minus 0.4em\relax Harlow, England: Addison-Wesley, 1999. 669 | 670 | \end{thebibliography} 671 | 672 | % biography section 673 | % 674 | % If you have an EPS/PDF photo (graphicx package needed) extra braces are 675 | % needed around the contents of the optional argument to biography to prevent 676 | % the LaTeX parser from getting confused when it sees the complicated 677 | % \includegraphics command within an optional argument. (You could create 678 | % your own custom macro containing the \includegraphics command to make things 679 | % simpler here.) 680 | %\begin{IEEEbiography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{mshell}}]{Michael Shell} 681 | % or if you just want to reserve a space for a photo: 682 | 683 | \begin{IEEEbiography}{Michael Shell} 684 | Biography text here. 685 | \end{IEEEbiography} 686 | 687 | % if you will not have a photo at all: 688 | \begin{IEEEbiographynophoto}{John Doe} 689 | Biography text here. 690 | \end{IEEEbiographynophoto} 691 | 692 | % insert where needed to balance the two columns on the last page with 693 | % biographies 694 | %\newpage 695 | 696 | \begin{IEEEbiographynophoto}{Jane Doe} 697 | Biography text here. 698 | \end{IEEEbiographynophoto} 699 | 700 | % You can push biographies down or up by placing 701 | % a \vfill before or after them. The appropriate 702 | % use of \vfill depends on what kind of text is 703 | % on the last page and whether or not the columns 704 | % are being equalized. 705 | 706 | %\vfill 707 | 708 | % Can be used to pull up biographies so that the bottom of the last one 709 | % is flush with the other column. 
710 | %\enlargethispage{-5in} 711 | 712 | 713 | 714 | % that's all folks 715 | \end{document} 716 | 717 | 718 | -------------------------------------------------------------------------------- /paper/figE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/figE.png -------------------------------------------------------------------------------- /paper/figEHE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/figEHE.png -------------------------------------------------------------------------------- /paper/figFS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/figFS.png -------------------------------------------------------------------------------- /paper/figH.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/figH.png -------------------------------------------------------------------------------- /paper/nips_2016.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \providecommand\hyper@newdestlabel[2]{} 3 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} 4 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined 5 | \global\let\oldcontentsline\contentsline 6 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} 7 | \global\let\oldnewlabel\newlabel 8 | \gdef\newlabel#1#2{\newlabelxx{#1}#2} 9 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} 10 | \AtEndDocument{\ifx\hyper@anchor\@undefined 11 | \let\contentsline\oldcontentsline 12 | \let\newlabel\oldnewlabel 13 | \fi} 14 | \fi} 15 | \global\let\hyper@last\relax 16 | \gdef\HyperFirstAtBeginDocument#1{#1} 17 | \providecommand\HyField@AuxAddToFields[1]{} 18 | \providecommand\HyField@AuxAddToCoFields[2]{} 19 | \@writefile{toc}{\contentsline {section}{\numberline {1}Submission of papers to NIPS 2016}{1}{section.1}} 20 | \@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Style}{1}{subsection.1.1}} 21 | \@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Retrieval of style files}{1}{subsection.1.2}} 22 | \@writefile{toc}{\contentsline {section}{\numberline {2}General formatting instructions}{2}{section.2}} 23 | \newlabel{gen_inst}{{2}{2}{General formatting instructions}{section.2}{}} 24 | \@writefile{toc}{\contentsline {section}{\numberline {3}Headings: first level}{2}{section.3}} 25 | \newlabel{headings}{{3}{2}{Headings: first level}{section.3}{}} 26 | \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Headings: second level}{2}{subsection.3.1}} 27 | \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}Headings: third level}{2}{subsubsection.3.1.1}} 28 | \@writefile{toc}{\contentsline {paragraph}{Paragraphs}{2}{section*.1}} 29 | \@writefile{toc}{\contentsline {section}{\numberline {4}Citations, figures, tables, references}{2}{section.4}} 30 | \newlabel{others}{{4}{2}{Citations, figures, tables, references}{section.4}{}} 31 | \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Citations within the text}{2}{subsection.4.1}} 32 | \@writefile{toc}{\contentsline {subsection}{\numberline 
{4.2}Footnotes}{3}{subsection.4.2}} 33 | \@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Figures}{3}{subsection.4.3}} 34 | \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Sample figure caption.}}{3}{figure.1}} 35 | \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Sample table title}}{4}{table.1}} 36 | \newlabel{sample-table}{{1}{4}{Sample table title}{table.1}{}} 37 | \@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Tables}{4}{subsection.4.4}} 38 | \@writefile{toc}{\contentsline {section}{\numberline {5}Final instructions}{4}{section.5}} 39 | \@writefile{toc}{\contentsline {section}{\numberline {6}Preparing PDF files}{4}{section.6}} 40 | \@writefile{toc}{\contentsline {subsection}{\numberline {6.1}Margins in \LaTeX {}}{5}{subsection.6.1}} 41 | -------------------------------------------------------------------------------- /paper/nips_2016.dvi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/nips_2016.dvi -------------------------------------------------------------------------------- /paper/nips_2016.log: -------------------------------------------------------------------------------- 1 | This is pdfTeX, Version 3.14159265-2.6-1.40.16 (TeX Live 2015) (preloaded format=pdflatex 2015.8.19) 16 JUN 2016 19:07 2 | entering extended mode 3 | restricted \write18 enabled. 4 | %&-line parsing enabled. 5 | **nips_2016.tex 6 | (./nips_2016.tex 7 | LaTeX2e <2015/01/01> patch level 2 8 | Babel <3.9m> and hyphenation patterns for 79 languages loaded. 9 | (/opt/texlive/2015/texmf-dist/tex/latex/base/article.cls 10 | Document Class: article 2014/09/29 v1.4h Standard LaTeX document class 11 | (/opt/texlive/2015/texmf-dist/tex/latex/base/size10.clo 12 | File: size10.clo 2014/09/29 v1.4h Standard LaTeX file (size option) 13 | ) 14 | \c@part=\count79 15 | \c@section=\count80 16 | \c@subsection=\count81 17 | \c@subsubsection=\count82 18 | \c@paragraph=\count83 19 | \c@subparagraph=\count84 20 | \c@figure=\count85 21 | \c@table=\count86 22 | \abovecaptionskip=\skip41 23 | \belowcaptionskip=\skip42 24 | \bibindent=\dimen102 25 | ) (./nips_2016.sty 26 | Package: nips_2016 2016/03/07 NIPS 2016 submission/camera-ready style file 27 | 28 | (/opt/texlive/2015/texmf-dist/tex/latex/natbib/natbib.sty 29 | Package: natbib 2010/09/13 8.31b (PWD, AO) 30 | \bibhang=\skip43 31 | \bibsep=\skip44 32 | LaTeX Info: Redefining \cite on input line 694. 33 | \c@NAT@ctr=\count87 34 | ) 35 | (/opt/texlive/2015/texmf-dist/tex/latex/geometry/geometry.sty 36 | Package: geometry 2010/09/12 v5.6 Page Geometry 37 | 38 | (/opt/texlive/2015/texmf-dist/tex/latex/graphics/keyval.sty 39 | Package: keyval 2014/10/28 v1.15 key=value parser (DPC) 40 | \KV@toks@=\toks14 41 | ) 42 | (/opt/texlive/2015/texmf-dist/tex/generic/oberdiek/ifpdf.sty 43 | Package: ifpdf 2011/01/30 v2.3 Provides the ifpdf switch (HO) 44 | Package ifpdf Info: pdfTeX in PDF mode is detected. 45 | ) 46 | (/opt/texlive/2015/texmf-dist/tex/generic/oberdiek/ifvtex.sty 47 | Package: ifvtex 2010/03/01 v1.5 Detect VTeX and its facilities (HO) 48 | Package ifvtex Info: VTeX not detected. 
49 | ) 50 | (/opt/texlive/2015/texmf-dist/tex/generic/ifxetex/ifxetex.sty 51 | Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional 52 | ) 53 | \Gm@cnth=\count88 54 | \Gm@cntv=\count89 55 | \c@Gm@tempcnt=\count90 56 | \Gm@bindingoffset=\dimen103 57 | \Gm@wd@mp=\dimen104 58 | \Gm@odd@mp=\dimen105 59 | \Gm@even@mp=\dimen106 60 | \Gm@layoutwidth=\dimen107 61 | \Gm@layoutheight=\dimen108 62 | \Gm@layouthoffset=\dimen109 63 | \Gm@layoutvoffset=\dimen110 64 | \Gm@dimlist=\toks15 65 | ) 66 | \@nipsabovecaptionskip=\skip45 67 | \@nipsbelowcaptionskip=\skip46 68 | ) 69 | (/opt/texlive/2015/texmf-dist/tex/latex/base/inputenc.sty 70 | Package: inputenc 2015/03/17 v1.2c Input encoding file 71 | \inpenc@prehook=\toks16 72 | \inpenc@posthook=\toks17 73 | 74 | (/opt/texlive/2015/texmf-dist/tex/latex/base/utf8.def 75 | File: utf8.def 2015/06/27 v1.1n UTF-8 support for inputenc 76 | Now handling font encoding OML ... 77 | ... no UTF-8 mapping file for font encoding OML 78 | Now handling font encoding T1 ... 79 | ... processing UTF-8 mapping file for font encoding T1 80 | 81 | (/opt/texlive/2015/texmf-dist/tex/latex/base/t1enc.dfu 82 | File: t1enc.dfu 2015/06/27 v1.1n UTF-8 support for inputenc 83 | defining Unicode char U+00A1 (decimal 161) 84 | defining Unicode char U+00A3 (decimal 163) 85 | defining Unicode char U+00AB (decimal 171) 86 | defining Unicode char U+00BB (decimal 187) 87 | defining Unicode char U+00BF (decimal 191) 88 | defining Unicode char U+00C0 (decimal 192) 89 | defining Unicode char U+00C1 (decimal 193) 90 | defining Unicode char U+00C2 (decimal 194) 91 | defining Unicode char U+00C3 (decimal 195) 92 | defining Unicode char U+00C4 (decimal 196) 93 | defining Unicode char U+00C5 (decimal 197) 94 | defining Unicode char U+00C6 (decimal 198) 95 | defining Unicode char U+00C7 (decimal 199) 96 | defining Unicode char U+00C8 (decimal 200) 97 | defining Unicode char U+00C9 (decimal 201) 98 | defining Unicode char U+00CA (decimal 202) 99 | defining Unicode char U+00CB (decimal 203) 100 | defining Unicode char U+00CC (decimal 204) 101 | defining Unicode char U+00CD (decimal 205) 102 | defining Unicode char U+00CE (decimal 206) 103 | defining Unicode char U+00CF (decimal 207) 104 | defining Unicode char U+00D0 (decimal 208) 105 | defining Unicode char U+00D1 (decimal 209) 106 | defining Unicode char U+00D2 (decimal 210) 107 | defining Unicode char U+00D3 (decimal 211) 108 | defining Unicode char U+00D4 (decimal 212) 109 | defining Unicode char U+00D5 (decimal 213) 110 | defining Unicode char U+00D6 (decimal 214) 111 | defining Unicode char U+00D8 (decimal 216) 112 | defining Unicode char U+00D9 (decimal 217) 113 | defining Unicode char U+00DA (decimal 218) 114 | defining Unicode char U+00DB (decimal 219) 115 | defining Unicode char U+00DC (decimal 220) 116 | defining Unicode char U+00DD (decimal 221) 117 | defining Unicode char U+00DE (decimal 222) 118 | defining Unicode char U+00DF (decimal 223) 119 | defining Unicode char U+00E0 (decimal 224) 120 | defining Unicode char U+00E1 (decimal 225) 121 | defining Unicode char U+00E2 (decimal 226) 122 | defining Unicode char U+00E3 (decimal 227) 123 | defining Unicode char U+00E4 (decimal 228) 124 | defining Unicode char U+00E5 (decimal 229) 125 | defining Unicode char U+00E6 (decimal 230) 126 | defining Unicode char U+00E7 (decimal 231) 127 | defining Unicode char U+00E8 (decimal 232) 128 | defining Unicode char U+00E9 (decimal 233) 129 | defining Unicode char U+00EA (decimal 234) 130 | defining Unicode char U+00EB (decimal 235) 131 | defining 
Unicode char U+00EC (decimal 236) 132 | defining Unicode char U+00ED (decimal 237) 133 | defining Unicode char U+00EE (decimal 238) 134 | defining Unicode char U+00EF (decimal 239) 135 | defining Unicode char U+00F0 (decimal 240) 136 | defining Unicode char U+00F1 (decimal 241) 137 | defining Unicode char U+00F2 (decimal 242) 138 | defining Unicode char U+00F3 (decimal 243) 139 | defining Unicode char U+00F4 (decimal 244) 140 | defining Unicode char U+00F5 (decimal 245) 141 | defining Unicode char U+00F6 (decimal 246) 142 | defining Unicode char U+00F8 (decimal 248) 143 | defining Unicode char U+00F9 (decimal 249) 144 | defining Unicode char U+00FA (decimal 250) 145 | defining Unicode char U+00FB (decimal 251) 146 | defining Unicode char U+00FC (decimal 252) 147 | defining Unicode char U+00FD (decimal 253) 148 | defining Unicode char U+00FE (decimal 254) 149 | defining Unicode char U+00FF (decimal 255) 150 | defining Unicode char U+0102 (decimal 258) 151 | defining Unicode char U+0103 (decimal 259) 152 | defining Unicode char U+0104 (decimal 260) 153 | defining Unicode char U+0105 (decimal 261) 154 | defining Unicode char U+0106 (decimal 262) 155 | defining Unicode char U+0107 (decimal 263) 156 | defining Unicode char U+010C (decimal 268) 157 | defining Unicode char U+010D (decimal 269) 158 | defining Unicode char U+010E (decimal 270) 159 | defining Unicode char U+010F (decimal 271) 160 | defining Unicode char U+0110 (decimal 272) 161 | defining Unicode char U+0111 (decimal 273) 162 | defining Unicode char U+0118 (decimal 280) 163 | defining Unicode char U+0119 (decimal 281) 164 | defining Unicode char U+011A (decimal 282) 165 | defining Unicode char U+011B (decimal 283) 166 | defining Unicode char U+011E (decimal 286) 167 | defining Unicode char U+011F (decimal 287) 168 | defining Unicode char U+0130 (decimal 304) 169 | defining Unicode char U+0131 (decimal 305) 170 | defining Unicode char U+0132 (decimal 306) 171 | defining Unicode char U+0133 (decimal 307) 172 | defining Unicode char U+0139 (decimal 313) 173 | defining Unicode char U+013A (decimal 314) 174 | defining Unicode char U+013D (decimal 317) 175 | defining Unicode char U+013E (decimal 318) 176 | defining Unicode char U+0141 (decimal 321) 177 | defining Unicode char U+0142 (decimal 322) 178 | defining Unicode char U+0143 (decimal 323) 179 | defining Unicode char U+0144 (decimal 324) 180 | defining Unicode char U+0147 (decimal 327) 181 | defining Unicode char U+0148 (decimal 328) 182 | defining Unicode char U+014A (decimal 330) 183 | defining Unicode char U+014B (decimal 331) 184 | defining Unicode char U+0150 (decimal 336) 185 | defining Unicode char U+0151 (decimal 337) 186 | defining Unicode char U+0152 (decimal 338) 187 | defining Unicode char U+0153 (decimal 339) 188 | defining Unicode char U+0154 (decimal 340) 189 | defining Unicode char U+0155 (decimal 341) 190 | defining Unicode char U+0158 (decimal 344) 191 | defining Unicode char U+0159 (decimal 345) 192 | defining Unicode char U+015A (decimal 346) 193 | defining Unicode char U+015B (decimal 347) 194 | defining Unicode char U+015E (decimal 350) 195 | defining Unicode char U+015F (decimal 351) 196 | defining Unicode char U+0160 (decimal 352) 197 | defining Unicode char U+0161 (decimal 353) 198 | defining Unicode char U+0162 (decimal 354) 199 | defining Unicode char U+0163 (decimal 355) 200 | defining Unicode char U+0164 (decimal 356) 201 | defining Unicode char U+0165 (decimal 357) 202 | defining Unicode char U+016E (decimal 366) 203 | defining Unicode char U+016F 
(decimal 367) 204 | defining Unicode char U+0170 (decimal 368) 205 | defining Unicode char U+0171 (decimal 369) 206 | defining Unicode char U+0178 (decimal 376) 207 | defining Unicode char U+0179 (decimal 377) 208 | defining Unicode char U+017A (decimal 378) 209 | defining Unicode char U+017B (decimal 379) 210 | defining Unicode char U+017C (decimal 380) 211 | defining Unicode char U+017D (decimal 381) 212 | defining Unicode char U+017E (decimal 382) 213 | defining Unicode char U+200C (decimal 8204) 214 | defining Unicode char U+2013 (decimal 8211) 215 | defining Unicode char U+2014 (decimal 8212) 216 | defining Unicode char U+2018 (decimal 8216) 217 | defining Unicode char U+2019 (decimal 8217) 218 | defining Unicode char U+201A (decimal 8218) 219 | defining Unicode char U+201C (decimal 8220) 220 | defining Unicode char U+201D (decimal 8221) 221 | defining Unicode char U+201E (decimal 8222) 222 | defining Unicode char U+2030 (decimal 8240) 223 | defining Unicode char U+2031 (decimal 8241) 224 | defining Unicode char U+2039 (decimal 8249) 225 | defining Unicode char U+203A (decimal 8250) 226 | defining Unicode char U+2423 (decimal 9251) 227 | ) 228 | Now handling font encoding OT1 ... 229 | ... processing UTF-8 mapping file for font encoding OT1 230 | 231 | (/opt/texlive/2015/texmf-dist/tex/latex/base/ot1enc.dfu 232 | File: ot1enc.dfu 2015/06/27 v1.1n UTF-8 support for inputenc 233 | defining Unicode char U+00A1 (decimal 161) 234 | defining Unicode char U+00A3 (decimal 163) 235 | defining Unicode char U+00B8 (decimal 184) 236 | defining Unicode char U+00BF (decimal 191) 237 | defining Unicode char U+00C5 (decimal 197) 238 | defining Unicode char U+00C6 (decimal 198) 239 | defining Unicode char U+00D8 (decimal 216) 240 | defining Unicode char U+00DF (decimal 223) 241 | defining Unicode char U+00E6 (decimal 230) 242 | defining Unicode char U+00EC (decimal 236) 243 | defining Unicode char U+00ED (decimal 237) 244 | defining Unicode char U+00EE (decimal 238) 245 | defining Unicode char U+00EF (decimal 239) 246 | defining Unicode char U+00F8 (decimal 248) 247 | defining Unicode char U+0131 (decimal 305) 248 | defining Unicode char U+0141 (decimal 321) 249 | defining Unicode char U+0142 (decimal 322) 250 | defining Unicode char U+0152 (decimal 338) 251 | defining Unicode char U+0153 (decimal 339) 252 | defining Unicode char U+2013 (decimal 8211) 253 | defining Unicode char U+2014 (decimal 8212) 254 | defining Unicode char U+2018 (decimal 8216) 255 | defining Unicode char U+2019 (decimal 8217) 256 | defining Unicode char U+201C (decimal 8220) 257 | defining Unicode char U+201D (decimal 8221) 258 | ) 259 | Now handling font encoding OMS ... 260 | ... processing UTF-8 mapping file for font encoding OMS 261 | 262 | (/opt/texlive/2015/texmf-dist/tex/latex/base/omsenc.dfu 263 | File: omsenc.dfu 2015/06/27 v1.1n UTF-8 support for inputenc 264 | defining Unicode char U+00A7 (decimal 167) 265 | defining Unicode char U+00B6 (decimal 182) 266 | defining Unicode char U+00B7 (decimal 183) 267 | defining Unicode char U+2020 (decimal 8224) 268 | defining Unicode char U+2021 (decimal 8225) 269 | defining Unicode char U+2022 (decimal 8226) 270 | ) 271 | Now handling font encoding OMX ... 272 | ... no UTF-8 mapping file for font encoding OMX 273 | Now handling font encoding U ... 274 | ... 
no UTF-8 mapping file for font encoding U 275 | defining Unicode char U+00A9 (decimal 169) 276 | defining Unicode char U+00AA (decimal 170) 277 | defining Unicode char U+00AE (decimal 174) 278 | defining Unicode char U+00BA (decimal 186) 279 | defining Unicode char U+02C6 (decimal 710) 280 | defining Unicode char U+02DC (decimal 732) 281 | defining Unicode char U+200C (decimal 8204) 282 | defining Unicode char U+2026 (decimal 8230) 283 | defining Unicode char U+2122 (decimal 8482) 284 | defining Unicode char U+2423 (decimal 9251) 285 | )) 286 | (/opt/texlive/2015/texmf-dist/tex/latex/base/fontenc.sty 287 | Package: fontenc 2005/09/27 v1.99g Standard LaTeX package 288 | 289 | (/opt/texlive/2015/texmf-dist/tex/latex/base/t1enc.def 290 | File: t1enc.def 2005/09/27 v1.99g Standard LaTeX file 291 | LaTeX Font Info: Redeclaring font encoding T1 on input line 48. 292 | )) 293 | (/opt/texlive/2015/texmf-dist/tex/latex/hyperref/hyperref.sty 294 | Package: hyperref 2012/11/06 v6.83m Hypertext links for LaTeX 295 | 296 | (/opt/texlive/2015/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.sty 297 | Package: hobsub-hyperref 2012/05/28 v1.13 Bundle oberdiek, subset hyperref (HO) 298 | 299 | 300 | (/opt/texlive/2015/texmf-dist/tex/generic/oberdiek/hobsub-generic.sty 301 | Package: hobsub-generic 2012/05/28 v1.13 Bundle oberdiek, subset generic (HO) 302 | Package: hobsub 2012/05/28 v1.13 Construct package bundles (HO) 303 | Package: infwarerr 2010/04/08 v1.3 Providing info/warning/error messages (HO) 304 | Package: ltxcmds 2011/11/09 v1.22 LaTeX kernel commands for general use (HO) 305 | Package: ifluatex 2010/03/01 v1.3 Provides the ifluatex switch (HO) 306 | Package ifluatex Info: LuaTeX not detected. 307 | Package hobsub Info: Skipping package `ifvtex' (already loaded). 308 | Package: intcalc 2007/09/27 v1.1 Expandable calculations with integers (HO) 309 | Package hobsub Info: Skipping package `ifpdf' (already loaded). 310 | Package: etexcmds 2011/02/16 v1.5 Avoid name clashes with e-TeX commands (HO) 311 | Package etexcmds Info: Could not find \expanded. 312 | (etexcmds) That can mean that you are not using pdfTeX 1.50 or 313 | (etexcmds) that some package has redefined \expanded. 314 | (etexcmds) In the latter case, load this package earlier. 315 | Package: kvsetkeys 2012/04/25 v1.16 Key value parser (HO) 316 | Package: kvdefinekeys 2011/04/07 v1.3 Define keys (HO) 317 | Package: pdftexcmds 2011/11/29 v0.20 Utility functions of pdfTeX for LuaTeX (HO 318 | ) 319 | Package pdftexcmds Info: LuaTeX not detected. 320 | Package pdftexcmds Info: \pdf@primitive is available. 321 | Package pdftexcmds Info: \pdf@ifprimitive is available. 322 | Package pdftexcmds Info: \pdfdraftmode found. 323 | Package: pdfescape 2011/11/25 v1.13 Implements pdfTeX's escape features (HO) 324 | Package: bigintcalc 2012/04/08 v1.3 Expandable calculations on big integers (HO 325 | ) 326 | Package: bitset 2011/01/30 v1.1 Handle bit-vector datatype (HO) 327 | Package: uniquecounter 2011/01/30 v1.2 Provide unlimited unique counter (HO) 328 | ) 329 | Package hobsub Info: Skipping package `hobsub' (already loaded). 330 | Package: letltxmacro 2010/09/02 v1.4 Let assignment for LaTeX macros (HO) 331 | Package: hopatch 2012/05/28 v1.2 Wrapper for package hooks (HO) 332 | Package: xcolor-patch 2011/01/30 xcolor patch 333 | Package: atveryend 2011/06/30 v1.8 Hooks at the very end of document (HO) 334 | Package atveryend Info: \enddocument detected (standard20110627). 
335 | Package: atbegshi 2011/10/05 v1.16 At begin shipout hook (HO) 336 | Package: refcount 2011/10/16 v3.4 Data extraction from label references (HO) 337 | Package: hycolor 2011/01/30 v1.7 Color options for hyperref/bookmark (HO) 338 | ) 339 | (/opt/texlive/2015/texmf-dist/tex/latex/oberdiek/auxhook.sty 340 | Package: auxhook 2011/03/04 v1.3 Hooks for auxiliary files (HO) 341 | ) 342 | (/opt/texlive/2015/texmf-dist/tex/latex/oberdiek/kvoptions.sty 343 | Package: kvoptions 2011/06/30 v3.11 Key value format for package options (HO) 344 | ) 345 | \@linkdim=\dimen111 346 | \Hy@linkcounter=\count91 347 | \Hy@pagecounter=\count92 348 | 349 | (/opt/texlive/2015/texmf-dist/tex/latex/hyperref/pd1enc.def 350 | File: pd1enc.def 2012/11/06 v6.83m Hyperref: PDFDocEncoding definition (HO) 351 | Now handling font encoding PD1 ... 352 | ... no UTF-8 mapping file for font encoding PD1 353 | ) 354 | \Hy@SavedSpaceFactor=\count93 355 | 356 | (/opt/texlive/2015/texmf-dist/tex/latex/latexconfig/hyperref.cfg 357 | File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive 358 | ) 359 | Package hyperref Info: Hyper figures OFF on input line 4443. 360 | Package hyperref Info: Link nesting OFF on input line 4448. 361 | Package hyperref Info: Hyper index ON on input line 4451. 362 | Package hyperref Info: Plain pages OFF on input line 4458. 363 | Package hyperref Info: Backreferencing OFF on input line 4463. 364 | Package hyperref Info: Implicit mode ON; LaTeX internals redefined. 365 | Package hyperref Info: Bookmarks ON on input line 4688. 366 | \c@Hy@tempcnt=\count94 367 | 368 | (/opt/texlive/2015/texmf-dist/tex/latex/url/url.sty 369 | \Urlmuskip=\muskip10 370 | Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. 371 | ) 372 | LaTeX Info: Redefining \url on input line 5041. 373 | \XeTeXLinkMargin=\dimen112 374 | \Fld@menulength=\count95 375 | \Field@Width=\dimen113 376 | \Fld@charsize=\dimen114 377 | Package hyperref Info: Hyper figures OFF on input line 6295. 378 | Package hyperref Info: Link nesting OFF on input line 6300. 379 | Package hyperref Info: Hyper index ON on input line 6303. 380 | Package hyperref Info: backreferencing OFF on input line 6310. 381 | Package hyperref Info: Link coloring OFF on input line 6315. 382 | Package hyperref Info: Link coloring with OCG OFF on input line 6320. 383 | Package hyperref Info: PDF/A mode OFF on input line 6325. 384 | LaTeX Info: Redefining \ref on input line 6365. 385 | LaTeX Info: Redefining \pageref on input line 6369. 386 | \Hy@abspage=\count96 387 | \c@Item=\count97 388 | \c@Hfootnote=\count98 389 | ) 390 | 391 | Package hyperref Message: Driver (autodetected): hpdftex. 392 | 393 | (/opt/texlive/2015/texmf-dist/tex/latex/hyperref/hpdftex.def 394 | File: hpdftex.def 2012/11/06 v6.83m Hyperref driver for pdfTeX 395 | \Fld@listcount=\count99 396 | \c@bookmark@seq@number=\count100 397 | 398 | (/opt/texlive/2015/texmf-dist/tex/latex/oberdiek/rerunfilecheck.sty 399 | Package: rerunfilecheck 2011/04/15 v1.7 Rerun checks for auxiliary files (HO) 400 | Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 401 | 82. 
402 | ) 403 | \Hy@SectionHShift=\skip47 404 | ) 405 | (/opt/texlive/2015/texmf-dist/tex/latex/booktabs/booktabs.sty 406 | Package: booktabs 2005/04/14 v1.61803 publication quality tables 407 | \heavyrulewidth=\dimen115 408 | \lightrulewidth=\dimen116 409 | \cmidrulewidth=\dimen117 410 | \belowrulesep=\dimen118 411 | \belowbottomsep=\dimen119 412 | \aboverulesep=\dimen120 413 | \abovetopsep=\dimen121 414 | \cmidrulesep=\dimen122 415 | \cmidrulekern=\dimen123 416 | \defaultaddspace=\dimen124 417 | \@cmidla=\count101 418 | \@cmidlb=\count102 419 | \@aboverulesep=\dimen125 420 | \@belowrulesep=\dimen126 421 | \@thisruleclass=\count103 422 | \@lastruleclass=\count104 423 | \@thisrulewidth=\dimen127 424 | ) 425 | (/opt/texlive/2015/texmf-dist/tex/latex/amsfonts/amsfonts.sty 426 | Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support 427 | \@emptytoks=\toks18 428 | \symAMSa=\mathgroup4 429 | \symAMSb=\mathgroup5 430 | LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' 431 | (Font) U/euf/m/n --> U/euf/b/n on input line 106. 432 | ) 433 | (/opt/texlive/2015/texmf-dist/tex/latex/units/nicefrac.sty 434 | Package: nicefrac 1998/08/04 v0.9b Nice fractions 435 | \L@UnitsRaiseDisplaystyle=\skip48 436 | \L@UnitsRaiseTextstyle=\skip49 437 | \L@UnitsRaiseScriptstyle=\skip50 438 | 439 | (/opt/texlive/2015/texmf-dist/tex/latex/base/ifthen.sty 440 | Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC) 441 | )) 442 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/microtype.sty 443 | Package: microtype 2013/05/23 v2.5a Micro-typographical refinements (RS) 444 | \MT@toks=\toks19 445 | \MT@count=\count105 446 | LaTeX Info: Redefining \textls on input line 766. 447 | \MT@outer@kern=\dimen128 448 | LaTeX Info: Redefining \textmicrotypecontext on input line 1285. 449 | \MT@listname@count=\count106 450 | 451 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/microtype-pdftex.def 452 | File: microtype-pdftex.def 2013/05/23 v2.5a Definitions specific to pdftex (RS) 453 | 454 | LaTeX Info: Redefining \lsstyle on input line 915. 455 | LaTeX Info: Redefining \lslig on input line 915. 456 | \MT@outer@space=\skip51 457 | ) 458 | Package microtype Info: Loading configuration file microtype.cfg. 459 | 460 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/microtype.cfg 461 | File: microtype.cfg 2013/05/23 v2.5a microtype main configuration file (RS) 462 | )) 463 | (./nips_2016.aux) 464 | \openout1 = `nips_2016.aux'. 465 | 466 | LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 67. 467 | LaTeX Font Info: ... okay on input line 67. 468 | LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 67. 469 | LaTeX Font Info: ... okay on input line 67. 470 | LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 67. 471 | LaTeX Font Info: ... okay on input line 67. 472 | LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 67. 473 | LaTeX Font Info: ... okay on input line 67. 474 | LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 67. 475 | LaTeX Font Info: ... okay on input line 67. 476 | LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 67. 477 | LaTeX Font Info: ... okay on input line 67. 478 | LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 67. 479 | LaTeX Font Info: ... okay on input line 67. 480 | LaTeX Font Info: Try loading font information for T1+ptm on input line 67. 481 | (/opt/texlive/2015/texmf-dist/tex/latex/psnfss/t1ptm.fd 482 | File: t1ptm.fd 2001/06/04 font definitions for T1/ptm. 
483 | ) 484 | *geometry* driver: auto-detecting 485 | *geometry* detected driver: pdftex 486 | *geometry* verbose mode - [ preamble ] result: 487 | * driver: pdftex 488 | * paper: letterpaper 489 | * layout: 490 | * layoutoffset:(h,v)=(0.0pt,0.0pt) 491 | * modes: 492 | * h-part:(L,W,R)=(108.405pt, 397.48499pt, 108.40501pt) 493 | * v-part:(T,H,B)=(72.26999pt, 650.43pt, 72.27pt) 494 | * \paperwidth=614.295pt 495 | * \paperheight=794.96999pt 496 | * \textwidth=397.48499pt 497 | * \textheight=650.43pt 498 | * \oddsidemargin=36.13501pt 499 | * \evensidemargin=36.13501pt 500 | * \topmargin=-37.0pt 501 | * \headheight=12.0pt 502 | * \headsep=25.0pt 503 | * \topskip=10.0pt 504 | * \footskip=30.0pt 505 | * \marginparwidth=65.0pt 506 | * \marginparsep=11.0pt 507 | * \columnsep=10.0pt 508 | * \skip\footins=9.0pt plus 4.0pt minus 2.0pt 509 | * \hoffset=0.0pt 510 | * \voffset=0.0pt 511 | * \mag=1000 512 | * \@twocolumnfalse 513 | * \@twosidefalse 514 | * \@mparswitchfalse 515 | * \@reversemarginfalse 516 | * (1in=72.27pt=25.4mm, 1cm=28.453pt) 517 | 518 | \AtBeginShipoutBox=\box26 519 | Package hyperref Info: Link coloring OFF on input line 67. 520 | (/opt/texlive/2015/texmf-dist/tex/latex/hyperref/nameref.sty 521 | Package: nameref 2012/10/27 v2.43 Cross-referencing by name of section 522 | 523 | (/opt/texlive/2015/texmf-dist/tex/generic/oberdiek/gettitlestring.sty 524 | Package: gettitlestring 2010/12/03 v1.4 Cleanup title references (HO) 525 | ) 526 | \c@section@level=\count107 527 | ) 528 | LaTeX Info: Redefining \ref on input line 67. 529 | LaTeX Info: Redefining \pageref on input line 67. 530 | LaTeX Info: Redefining \nameref on input line 67. 531 | 532 | (./nips_2016.out) (./nips_2016.out) 533 | \@outlinefile=\write3 534 | \openout3 = `nips_2016.out'. 535 | 536 | LaTeX Info: Redefining \microtypecontext on input line 67. 537 | Package microtype Info: Generating PDF output. 538 | Package microtype Info: Character protrusion enabled (level 2). 539 | Package microtype Info: Using default protrusion set `alltext'. 540 | Package microtype Info: Automatic font expansion enabled (level 2), 541 | (microtype) stretch: 20, shrink: 20, step: 1, non-selected. 542 | Package microtype Info: Using default expansion set `basictext'. 543 | Package microtype Info: No adjustment of tracking. 544 | Package microtype Info: No adjustment of interword spacing. 545 | Package microtype Info: No adjustment of character kerning. 546 | 547 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/mt-ptm.cfg 548 | File: mt-ptm.cfg 2006/04/20 v1.7 microtype config. file: Times (RS) 549 | ) 550 | LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <17.28> not available 551 | (Font) Font shape `T1/ptm/b/n' tried instead on input line 71. 552 | 553 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/mt-cmr.cfg 554 | File: mt-cmr.cfg 2013/05/19 v2.2 microtype config. file: Computer Modern Roman 555 | (RS) 556 | ) 557 | LaTeX Font Info: Try loading font information for U+msa on input line 71. 558 | 559 | (/opt/texlive/2015/texmf-dist/tex/latex/amsfonts/umsa.fd 560 | File: umsa.fd 2013/01/14 v3.01 AMS symbols A 561 | ) 562 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/mt-msa.cfg 563 | File: mt-msa.cfg 2006/02/04 v1.1 microtype config. file: AMS symbols (a) (RS) 564 | ) 565 | LaTeX Font Info: Try loading font information for U+msb on input line 71. 
566 | 567 | (/opt/texlive/2015/texmf-dist/tex/latex/amsfonts/umsb.fd 568 | File: umsb.fd 2013/01/14 v3.01 AMS symbols B 569 | ) 570 | (/opt/texlive/2015/texmf-dist/tex/latex/microtype/mt-msb.cfg 571 | File: mt-msb.cfg 2005/06/01 v1.0 microtype config. file: AMS symbols (b) (RS) 572 | ) 573 | LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <10> not available 574 | (Font) Font shape `T1/ptm/b/n' tried instead on input line 71. 575 | LaTeX Font Info: Try loading font information for T1+cmtt on input line 71. 576 | 577 | (/opt/texlive/2015/texmf-dist/tex/latex/base/t1cmtt.fd 578 | File: t1cmtt.fd 2014/09/29 v2.5h Standard LaTeX font definitions 579 | ) 580 | Package microtype Info: Loading generic settings for font family 581 | (microtype) `cmtt' (encoding: T1). 582 | (microtype) For optimal results, create family-specific settings. 583 | (microtype) See the microtype manual for details. 584 | LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <12> not available 585 | (Font) Font shape `T1/ptm/b/n' tried instead on input line 72. 586 | LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <7> not available 587 | (Font) Font shape `T1/ptm/b/n' tried instead on input line 124. 588 | [1 589 | 590 | {/opt/texlive/2015/texmf-var/fonts/map/pdftex/updmap/pdftex.map}] [2] [3] 591 | LaTeX Font Info: Try loading font information for OMS+ptm on input line 320. 592 | 593 | 594 | (/opt/texlive/2015/texmf-dist/tex/latex/psnfss/omsptm.fd 595 | File: omsptm.fd 596 | ) 597 | LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10> not available 598 | (Font) Font shape `OMS/cmsy/m/n' tried instead on input line 320. 599 | [4] 600 | LaTeX Font Info: Font shape `T1/ptm/bx/it' in size <10> not available 601 | (Font) Font shape `T1/ptm/b/it' tried instead on input line 387. 602 | LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <9> not available 603 | (Font) Font shape `T1/ptm/b/n' tried instead on input line 404. 604 | Package atveryend Info: Empty hook `BeforeClearDocument' on input line 406. 605 | [5] 606 | Package atveryend Info: Empty hook `AfterLastShipout' on input line 406. 607 | 608 | (./nips_2016.aux) 609 | Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 406. 610 | Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 406. 611 | Package rerunfilecheck Info: File `nips_2016.out' has not changed. 612 | (rerunfilecheck) Checksum: 351AF28D8766A694F09B69678630ECDB;962. 613 | Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 406. 614 | ) 615 | Here is how much of TeX's memory you used: 616 | 7400 strings out of 493091 617 | 111987 string characters out of 6137640 618 | 228400 words of memory out of 5000000 619 | 10648 multiletter control sequences out of 15000+600000 620 | 51815 words of font info for 137 fonts, out of 8000000 for 9000 621 | 1141 hyphenation exceptions out of 8191 622 | 31i,10n,35p,210b,376s stack positions out of 5000i,500n,10000p,200000b,80000s 623 | {/opt/texlive/2015/texmf-dist/fonts/enc/dvips/base/8r.enc}{/opt/texlive/2015/ 624 | texmf-dist/fonts/enc/dvips/cm-super/cm-super-t1.enc} 636 | Output written on nips_2016.pdf (5 pages, 152670 bytes). 637 | PDF statistics: 638 | 169 PDF objects out of 1000 (max. 8388607) 639 | 148 compressed objects within 2 object streams 640 | 28 named destinations out of 1000 (max. 500000) 641 | 26233 words of extra memory for PDF output out of 29859 (max. 
10000000) 642 | 643 | -------------------------------------------------------------------------------- /paper/nips_2016.out: -------------------------------------------------------------------------------- 1 | \BOOKMARK [1][-]{section.1}{Submission of papers to NIPS 2016}{}% 1 2 | \BOOKMARK [2][-]{subsection.1.1}{Style}{section.1}% 2 3 | \BOOKMARK [2][-]{subsection.1.2}{Retrieval of style files}{section.1}% 3 4 | \BOOKMARK [1][-]{section.2}{General formatting instructions}{}% 4 5 | \BOOKMARK [1][-]{section.3}{Headings: first level}{}% 5 6 | \BOOKMARK [2][-]{subsection.3.1}{Headings: second level}{section.3}% 6 7 | \BOOKMARK [3][-]{subsubsection.3.1.1}{Headings: third level}{subsection.3.1}% 7 8 | \BOOKMARK [1][-]{section.4}{Citations, figures, tables, references}{}% 8 9 | \BOOKMARK [2][-]{subsection.4.1}{Citations within the text}{section.4}% 9 10 | \BOOKMARK [2][-]{subsection.4.2}{Footnotes}{section.4}% 10 11 | \BOOKMARK [2][-]{subsection.4.3}{Figures}{section.4}% 11 12 | \BOOKMARK [2][-]{subsection.4.4}{Tables}{section.4}% 12 13 | \BOOKMARK [1][-]{section.5}{Final instructions}{}% 13 14 | \BOOKMARK [1][-]{section.6}{Preparing PDF files}{}% 14 15 | \BOOKMARK [2][-]{subsection.6.1}{Margins in LaTeX}{section.6}% 15 16 | -------------------------------------------------------------------------------- /paper/nips_2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/nips_2016.pdf -------------------------------------------------------------------------------- /paper/nips_2016.sty: -------------------------------------------------------------------------------- 1 | % partial rewrite of the LaTeX2e package for submissions to the 2 | % Conference on Neural Information Processing Systems (NIPS): 3 | % 4 | % - uses more LaTeX conventions 5 | % - line numbers at submission time replaced with aligned numbers from 6 | % lineno package 7 | % - \nipsfinalcopy replaced with [final] package option 8 | % - automatically loads times package for authors 9 | % - loads natbib automatically; this can be suppressed with the 10 | % [nonatbib] package option 11 | % - adds foot line to first page identifying the conference 12 | % 13 | % Roman Garnett (garnett@wustl.edu) and the many authors of 14 | % nips15submit_e.sty, including MK and drstrip@sandia 15 | % 16 | % last revision: March 2016 17 | 18 | \NeedsTeXFormat{LaTeX2e} 19 | \ProvidesPackage{nips_2016}[2016/03/07 NIPS 2016 submission/camera-ready style file] 20 | 21 | % declare final option, which creates camera-ready copy 22 | \newif\if@nipsfinal\@nipsfinalfalse 23 | \DeclareOption{final}{ 24 | \@nipsfinaltrue 25 | } 26 | 27 | % declare nonatbib option, which does not load natbib in case of 28 | % package clash (users can pass options to natbib via 29 | % \PassOptionsToPackage) 30 | \newif\if@natbib\@natbibtrue 31 | \DeclareOption{nonatbib}{ 32 | \@natbibfalse 33 | } 34 | 35 | \ProcessOptions\relax 36 | 37 | % fonts 38 | \renewcommand{\rmdefault}{ptm} 39 | \renewcommand{\sfdefault}{phv} 40 | 41 | % change this every year for notice string at bottom 42 | \newcommand{\@nipsordinal}{30th} 43 | \newcommand{\@nipsyear}{2016} 44 | \newcommand{\@nipslocation}{Barcelona, Spain} 45 | 46 | % handle tweaks for camera-ready copy vs. 
submission copy 47 | \if@nipsfinal 48 | \newcommand{\@noticestring}{% 49 | \@nipsordinal\/ Conference on Neural Information Processing Systems 50 | (NIPS \@nipsyear), \@nipslocation.% 51 | } 52 | \else 53 | \newcommand{\@noticestring}{% 54 | Submitted to \@nipsordinal\/ Conference on Neural Information 55 | Processing Systems (NIPS \@nipsyear). Do not distribute.% 56 | } 57 | 58 | % line numbers for submission 59 | \RequirePackage{lineno} 60 | \linenumbers 61 | 62 | % fix incompatibilities between lineno and amsmath, if required, by 63 | % transparently wrapping linenomath environments around amsmath 64 | % environments 65 | \AtBeginDocument{% 66 | \@ifpackageloaded{amsmath}{% 67 | \newcommand*\patchAmsMathEnvironmentForLineno[1]{% 68 | \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname 69 | \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname 70 | \renewenvironment{#1}% 71 | {\linenomath\csname old#1\endcsname}% 72 | {\csname oldend#1\endcsname\endlinenomath}% 73 | }% 74 | \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{% 75 | \patchAmsMathEnvironmentForLineno{#1}% 76 | \patchAmsMathEnvironmentForLineno{#1*}% 77 | }% 78 | \patchBothAmsMathEnvironmentsForLineno{equation}% 79 | \patchBothAmsMathEnvironmentsForLineno{align}% 80 | \patchBothAmsMathEnvironmentsForLineno{flalign}% 81 | \patchBothAmsMathEnvironmentsForLineno{alignat}% 82 | \patchBothAmsMathEnvironmentsForLineno{gather}% 83 | \patchBothAmsMathEnvironmentsForLineno{multline}% 84 | }{} 85 | } 86 | \fi 87 | 88 | % load natbib unless told otherwise 89 | \if@natbib 90 | \RequirePackage{natbib} 91 | \fi 92 | 93 | % set page geometry 94 | \usepackage[ 95 | letterpaper, 96 | textheight=9in, 97 | textwidth=5.5in, 98 | top=1in 99 | ]{geometry} 100 | 101 | \widowpenalty=10000 102 | \clubpenalty=10000 103 | \flushbottom 104 | \sloppy 105 | 106 | % font sizes with reduced leading 107 | \renewcommand{\normalsize}{% 108 | \@setfontsize\normalsize\@xpt\@xipt 109 | \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ 110 | \abovedisplayshortskip \z@ \@plus 3\p@ 111 | \belowdisplayskip \abovedisplayskip 112 | \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ 113 | } 114 | \normalsize 115 | \renewcommand{\small}{% 116 | \@setfontsize\small\@ixpt\@xpt 117 | \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ 118 | \abovedisplayshortskip \z@ \@plus 2\p@ 119 | \belowdisplayskip \abovedisplayskip 120 | \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ 121 | } 122 | \renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} 123 | \renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} 124 | \renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} 125 | \renewcommand{\large}{\@setfontsize\large\@xiipt{14}} 126 | \renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} 127 | \renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} 128 | \renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} 129 | \renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} 130 | 131 | % sections with less space 132 | \providecommand{\section}{} 133 | \renewcommand{\section}{% 134 | \@startsection{section}{1}{\z@}% 135 | {-2.0ex \@plus -0.5ex \@minus -0.2ex}% 136 | { 1.5ex \@plus 0.3ex \@minus 0.2ex}% 137 | {\large\bf\raggedright}% 138 | } 139 | \providecommand{\subsection}{} 140 | \renewcommand{\subsection}{% 141 | \@startsection{subsection}{2}{\z@}% 142 | {-1.8ex \@plus -0.5ex \@minus -0.2ex}% 143 | { 0.8ex \@plus 0.2ex}% 144 | {\normalsize\bf\raggedright}% 145 | } 146 | \providecommand{\subsubsection}{} 147 | 
\renewcommand{\subsubsection}{% 148 | \@startsection{subsubsection}{3}{\z@}% 149 | {-1.5ex \@plus -0.5ex \@minus -0.2ex}% 150 | { 0.5ex \@plus 0.2ex}% 151 | {\normalsize\bf\raggedright}% 152 | } 153 | \providecommand{\paragraph}{} 154 | \renewcommand{\paragraph}{% 155 | \@startsection{paragraph}{4}{\z@}% 156 | {1.5ex \@plus 0.5ex \@minus 0.2ex}% 157 | {-1em}% 158 | {\normalsize\bf}% 159 | } 160 | \providecommand{\subparagraph}{} 161 | \renewcommand{\subparagraph}{% 162 | \@startsection{subparagraph}{5}{\z@}% 163 | {1.5ex \@plus 0.5ex \@minus 0.2ex}% 164 | {-1em}% 165 | {\normalsize\bf}% 166 | } 167 | \providecommand{\subsubsubsection}{} 168 | \renewcommand{\subsubsubsection}{% 169 | \vskip5pt{\noindent\normalsize\rm\raggedright}% 170 | } 171 | 172 | % float placement 173 | \renewcommand{\topfraction }{0.85} 174 | \renewcommand{\bottomfraction }{0.4} 175 | \renewcommand{\textfraction }{0.1} 176 | \renewcommand{\floatpagefraction}{0.7} 177 | 178 | \newlength{\@nipsabovecaptionskip}\setlength{\@nipsabovecaptionskip}{7\p@} 179 | \newlength{\@nipsbelowcaptionskip}\setlength{\@nipsbelowcaptionskip}{\z@} 180 | 181 | \setlength{\abovecaptionskip}{\@nipsabovecaptionskip} 182 | \setlength{\belowcaptionskip}{\@nipsbelowcaptionskip} 183 | 184 | % swap above/belowcaptionskip lengths for tables 185 | \renewenvironment{table} 186 | {\setlength{\abovecaptionskip}{\@nipsbelowcaptionskip}% 187 | \setlength{\belowcaptionskip}{\@nipsabovecaptionskip}% 188 | \@float{table}} 189 | {\end@float} 190 | 191 | % footnote formatting 192 | \setlength{\footnotesep }{6.65\p@} 193 | \setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} 194 | \renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} 195 | \setcounter{footnote}{0} 196 | 197 | % paragraph formatting 198 | \setlength{\parindent}{\z@} 199 | \setlength{\parskip }{5.5\p@} 200 | 201 | % list formatting 202 | \setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} 203 | \setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} 204 | \setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} 205 | \setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} 206 | \setlength{\leftmargin }{3pc} 207 | \setlength{\leftmargini }{\leftmargin} 208 | \setlength{\leftmarginii }{2em} 209 | \setlength{\leftmarginiii}{1.5em} 210 | \setlength{\leftmarginiv }{1.0em} 211 | \setlength{\leftmarginv }{0.5em} 212 | \def\@listi {\leftmargin\leftmargini} 213 | \def\@listii {\leftmargin\leftmarginii 214 | \labelwidth\leftmarginii 215 | \advance\labelwidth-\labelsep 216 | \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ 217 | \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ 218 | \itemsep \parsep} 219 | \def\@listiii{\leftmargin\leftmarginiii 220 | \labelwidth\leftmarginiii 221 | \advance\labelwidth-\labelsep 222 | \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ 223 | \parsep \z@ 224 | \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ 225 | \itemsep \topsep} 226 | \def\@listiv {\leftmargin\leftmarginiv 227 | \labelwidth\leftmarginiv 228 | \advance\labelwidth-\labelsep} 229 | \def\@listv {\leftmargin\leftmarginv 230 | \labelwidth\leftmarginv 231 | \advance\labelwidth-\labelsep} 232 | \def\@listvi {\leftmargin\leftmarginvi 233 | \labelwidth\leftmarginvi 234 | \advance\labelwidth-\labelsep} 235 | 236 | % create title 237 | \providecommand{\maketitle}{} 238 | \renewcommand{\maketitle}{% 239 | \par 240 | \begingroup 241 | \renewcommand{\thefootnote}{\fnsymbol{footnote}} 242 | % for perfect author name centering 243 | \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} 244 | % The footnote-mark was 
overlapping the footnote-text, 245 | % added the following to fix this problem (MK) 246 | \long\def\@makefntext##1{% 247 | \parindent 1em\noindent 248 | \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 249 | } 250 | \thispagestyle{empty} 251 | \@maketitle 252 | \@thanks 253 | % \@notice 254 | \endgroup 255 | \let\maketitle\relax 256 | \let\thanks\relax 257 | } 258 | 259 | % rules for title box at top of first page 260 | \newcommand{\@toptitlebar}{ 261 | \hrule height 4\p@ 262 | \vskip 0.25in 263 | \vskip -\parskip% 264 | } 265 | \newcommand{\@bottomtitlebar}{ 266 | \vskip 0.29in 267 | \vskip -\parskip 268 | \hrule height 1\p@ 269 | \vskip 0.09in% 270 | } 271 | 272 | % create title (includes both anonymized and non-anonymized versions) 273 | \providecommand{\@maketitle}{} 274 | \renewcommand{\@maketitle}{% 275 | \vbox{% 276 | \hsize\textwidth 277 | \linewidth\hsize 278 | \vskip 0.1in 279 | \@toptitlebar 280 | \centering 281 | {\LARGE\bf \@title\par} 282 | \@bottomtitlebar 283 | \if@nipsfinal 284 | \def\And{% 285 | \end{tabular}\hfil\linebreak[0]\hfil% 286 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% 287 | } 288 | \def\AND{% 289 | \end{tabular}\hfil\linebreak[4]\hfil% 290 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% 291 | } 292 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% 293 | \else 294 | \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@} 295 | Anonymous Author(s) \\ 296 | Affiliation \\ 297 | Address \\ 298 | \texttt{email} \\ 299 | \end{tabular}% 300 | \fi 301 | \vskip 0.3in \@minus 0.1in 302 | } 303 | } 304 | 305 | % add conference notice to bottom of first page 306 | \newcommand{\ftype@noticebox}{8} 307 | \newcommand{\@notice}{% 308 | % give a bit of extra room back to authors on first page 309 | \enlargethispage{2\baselineskip}% 310 | \@float{noticebox}[b]% 311 | \footnotesize\@noticestring% 312 | \end@float% 313 | } 314 | 315 | % abstract styling 316 | \renewenvironment{abstract}% 317 | {% 318 | \vskip 0.075in% 319 | \centerline% 320 | {\large\bf Abstract}% 321 | \vspace{0.5ex}% 322 | \begin{quote}% 323 | } 324 | { 325 | \par% 326 | \end{quote}% 327 | \vskip 1ex% 328 | } 329 | 330 | \endinput 331 | -------------------------------------------------------------------------------- /paper/nips_2016.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | % if you need to pass options to natbib, use, e.g.: 4 | % \PassOptionsToPackage{numbers, compress}{natbib} 5 | % before loading nips_2016 6 | % 7 | % to avoid loading the natbib package, add option nonatbib: 8 | % \usepackage[nonatbib]{nips_2016} 9 | 10 | \usepackage[final]{nips_2016} 11 | 12 | % to compile a camera-ready version, add the [final] option, e.g.: 13 | % \usepackage[final]{nips_2016} 14 | 15 | \usepackage[utf8]{inputenc} % allow utf-8 input 16 | \usepackage[T1]{fontenc} % use 8-bit T1 fonts 17 | \usepackage{hyperref} % hyperlinks 18 | \usepackage{url} % simple URL typesetting 19 | \usepackage{booktabs} % professional-quality tables 20 | \usepackage{amsfonts} % blackboard math symbols 21 | \usepackage{nicefrac} % compact symbols for 1/2, etc. 22 | \usepackage{microtype} % microtypography 23 | 24 | \title{Formatting instructions for NIPS 2016} 25 | 26 | % The \author macro works with any number of authors. There are two 27 | % commands used to separate the names and addresses of multiple 28 | % authors: \And and \AND. 29 | % 30 | % Using \And between authors leaves it to LaTeX to determine where to 31 | % break the lines. 
Using \AND forces a line break at that point. So, 32 | % if LaTeX puts 3 of 4 authors names on the first line, and the last 33 | % on the second line, try using \AND instead of \And before the third 34 | % author name. 35 | 36 | \author{ 37 | David S.~Hippocampus\thanks{Use footnote for providing further 38 | information about author (webpage, alternative 39 | address)---\emph{not} for acknowledging funding agencies.} \\ 40 | Department of Computer Science\\ 41 | Cranberry-Lemon University\\ 42 | Pittsburgh, PA 15213 \\ 43 | \texttt{hippo@cs.cranberry-lemon.edu} \\ 44 | %% examples of more authors 45 | %% \And 46 | %% Coauthor \\ 47 | %% Affiliation \\ 48 | %% Address \\ 49 | %% \texttt{email} \\ 50 | %% \AND 51 | %% Coauthor \\ 52 | %% Affiliation \\ 53 | %% Address \\ 54 | %% \texttt{email} \\ 55 | %% \And 56 | %% Coauthor \\ 57 | %% Affiliation \\ 58 | %% Address \\ 59 | %% \texttt{email} \\ 60 | %% \And 61 | %% Coauthor \\ 62 | %% Affiliation \\ 63 | %% Address \\ 64 | %% \texttt{email} \\ 65 | } 66 | 67 | \begin{document} 68 | % \nipsfinalcopy is no longer used 69 | 70 | \maketitle 71 | 72 | \begin{abstract} 73 | The abstract paragraph should be indented \nicefrac{1}{2}~inch 74 | (3~picas) on both the left- and right-hand margins. Use 10~point 75 | type, with a vertical spacing (leading) of 11~points. The word 76 | \textbf{Abstract} must be centered, bold, and in point size 12. Two 77 | line spaces precede the abstract. The abstract must be limited to 78 | one paragraph. 79 | \end{abstract} 80 | 81 | \section{Submission of papers to NIPS 2016} 82 | 83 | \textbf{There is a new style file for papers submitted in 2016!} 84 | 85 | NIPS requires electronic submissions. The electronic submission site 86 | is 87 | \begin{center} 88 | \url{https://cmt.research.microsoft.com/NIPS2016/} 89 | \end{center} 90 | 91 | Please read carefully the instructions below and follow them 92 | faithfully. 93 | 94 | \subsection{Style} 95 | 96 | Papers to be submitted to NIPS 2016 must be prepared according to the 97 | instructions presented here. Papers may only be up to eight pages 98 | long, including figures. Since 2009 an additional ninth page 99 | \emph{containing only acknowledgments and/or cited references} is 100 | allowed. Papers that exceed nine pages will not be reviewed, or in any 101 | other way considered for presentation at the conference. 102 | 103 | The margins in 2016 are the same as since 2007, which allow for 104 | $\sim$$15\%$ more words in the paper compared to earlier years. 105 | 106 | Authors are required to use the NIPS \LaTeX{} style files obtainable 107 | at the NIPS website as indicated below. Please make sure you use the 108 | current files and not previous versions. Tweaking the style files may 109 | be grounds for rejection. 110 | 111 | \subsection{Retrieval of style files} 112 | 113 | The style files for NIPS and other conference information are 114 | available on the World Wide Web at 115 | \begin{center} 116 | \url{http://www.nips.cc/} 117 | \end{center} 118 | The file \verb+nips_2016.pdf+ contains these instructions and 119 | illustrates the various formatting requirements your NIPS paper must 120 | satisfy. 121 | 122 | The only supported style file for NIPS 2016 is \verb+nips_2016.sty+, 123 | rewritten for \LaTeXe{}. 
\textbf{Previous style files for \LaTeX{} 124 | 2.09, Microsoft Word, and RTF are no longer supported!} 125 | 126 | The new \LaTeX{} style file contains two optional arguments: 127 | \verb+final+, which creates a camera-ready copy, and \verb+nonatbib+, 128 | which will not load the \verb+natbib+ package for you in case of 129 | package clash. 130 | 131 | At submission time, please omit the \verb+final+ option. This will 132 | anonymize your submission and add line numbers to aid review. Please 133 | do \emph{not} refer to these line numbers in your paper as they will 134 | be removed during generation of camera-ready copies. 135 | 136 | The file \verb+nips_2016.tex+ may be used as a ``shell'' for writing 137 | your paper. All you have to do is replace the author, title, abstract, 138 | and text of the paper with your own. 139 | 140 | The formatting instructions contained in these style files are 141 | summarized in Sections \ref{gen_inst}, \ref{headings}, and 142 | \ref{others} below. 143 | 144 | \section{General formatting instructions} 145 | \label{gen_inst} 146 | 147 | The text must be confined within a rectangle 5.5~inches (33~picas) 148 | wide and 9~inches (54~picas) long. The left margin is 1.5~inch 149 | (9~picas). Use 10~point type with a vertical spacing (leading) of 150 | 11~points. Times New Roman is the preferred typeface throughout, and 151 | will be selected for you by default. Paragraphs are separated by 152 | \nicefrac{1}{2}~line space (5.5 points), with no indentation. 153 | 154 | The paper title should be 17~point, initial caps/lower case, bold, 155 | centered between two horizontal rules. The top rule should be 4~points 156 | thick and the bottom rule should be 1~point thick. Allow 157 | \nicefrac{1}{4}~inch space above and below the title to rules. All 158 | pages should start at 1~inch (6~picas) from the top of the page. 159 | 160 | For the final version, authors' names are set in boldface, and each 161 | name is centered above the corresponding address. The lead author's 162 | name is to be listed first (left-most), and the co-authors' names (if 163 | different address) are set to follow. If there is only one co-author, 164 | list both author and co-author side by side. 165 | 166 | Please pay special attention to the instructions in Section \ref{others} 167 | regarding figures, tables, acknowledgments, and references. 168 | 169 | \section{Headings: first level} 170 | \label{headings} 171 | 172 | All headings should be lower case (except for first word and proper 173 | nouns), flush left, and bold. 174 | 175 | First-level headings should be in 12-point type. 176 | 177 | \subsection{Headings: second level} 178 | 179 | Second-level headings should be in 10-point type. 180 | 181 | \subsubsection{Headings: third level} 182 | 183 | Third-level headings should be in 10-point type. 184 | 185 | \paragraph{Paragraphs} 186 | 187 | There is also a \verb+\paragraph+ command available, which sets the 188 | heading in bold, flush left, and inline with the text, with the 189 | heading followed by 1\,em of space. 190 | 191 | \section{Citations, figures, tables, references} 192 | \label{others} 193 | 194 | These instructions apply to everyone. 195 | 196 | \subsection{Citations within the text} 197 | 198 | The \verb+natbib+ package will be loaded for you by default. 199 | Citations may be author/year or numeric, as long as you maintain 200 | internal consistency. As to the format of the references themselves, 201 | any style is acceptable as long as it is used consistently. 
202 | 203 | The documentation for \verb+natbib+ may be found at 204 | \begin{center} 205 | \url{http://mirrors.ctan.org/macros/latex/contrib/natbib/natnotes.pdf} 206 | \end{center} 207 | Of note is the command \verb+\citet+, which produces citations 208 | appropriate for use in inline text. For example, 209 | \begin{verbatim} 210 | \citet{hasselmo} investigated\dots 211 | \end{verbatim} 212 | produces 213 | \begin{quote} 214 | Hasselmo, et al.\ (1995) investigated\dots 215 | \end{quote} 216 | 217 | If you wish to load the \verb+natbib+ package with options, you may 218 | add the following before loading the \verb+nips_2016+ package: 219 | \begin{verbatim} 220 | \PassOptionsToPackage{options}{natbib} 221 | \end{verbatim} 222 | 223 | If \verb+natbib+ clashes with another package you load, you can add 224 | the optional argument \verb+nonatbib+ when loading the style file: 225 | \begin{verbatim} 226 | \usepackage[nonatbib]{nips_2016} 227 | \end{verbatim} 228 | 229 | As submission is double blind, refer to your own published work in the 230 | third person. That is, use ``In the previous work of Jones et 231 | al.\ [4],'' not ``In our previous work [4].'' If you cite your other 232 | papers that are not widely available (e.g., a journal paper under 233 | review), use anonymous author names in the citation, e.g., an author 234 | of the form ``A.\ Anonymous.'' 235 | 236 | \subsection{Footnotes} 237 | 238 | Footnotes should be used sparingly. If you do require a footnote, 239 | indicate footnotes with a number\footnote{Sample of the first 240 | footnote.} in the text. Place the footnotes at the bottom of the 241 | page on which they appear. Precede the footnote with a horizontal 242 | rule of 2~inches (12~picas). 243 | 244 | Note that footnotes are properly typeset \emph{after} punctuation 245 | marks.\footnote{As in this example.} 246 | 247 | \subsection{Figures} 248 | 249 | All artwork must be neat, clean, and legible. Lines should be dark 250 | enough for purposes of reproduction. The figure number and caption 251 | always appear after the figure. Place one line space before the figure 252 | caption and one line space after the figure. The figure caption should 253 | be lower case (except for first word and proper nouns); figures are 254 | numbered consecutively. 255 | 256 | You may use color figures. However, it is best for the figure 257 | captions and the paper body to be legible if the paper is printed in 258 | either black/white or in color. 259 | \begin{figure}[h] 260 | \centering 261 | \fbox{\rule[-.5cm]{0cm}{4cm} \rule[-.5cm]{4cm}{0cm}} 262 | \caption{Sample figure caption.} 263 | \end{figure} 264 | 265 | \subsection{Tables} 266 | 267 | All tables must be centered, neat, clean and legible. The table 268 | number and title always appear before the table. See 269 | Table~\ref{sample-table}. 270 | 271 | Place one line space before the table title, one line space after the 272 | table title, and one line space after the table. The table title must 273 | be lower case (except for first word and proper nouns); tables are 274 | numbered consecutively. 275 | 276 | Note that publication-quality tables \emph{do not contain vertical 277 | rules.} We strongly suggest the use of the \verb+booktabs+ package, 278 | which allows for typesetting high-quality, professional tables: 279 | \begin{center} 280 | \url{https://www.ctan.org/pkg/booktabs} 281 | \end{center} 282 | This package was used to typeset Table~\ref{sample-table}. 
283 | 284 | \begin{table}[t] 285 | \caption{Sample table title} 286 | \label{sample-table} 287 | \centering 288 | \begin{tabular}{lll} 289 | \toprule 290 | \multicolumn{2}{c}{Part} \\ 291 | \cmidrule{1-2} 292 | Name & Description & Size ($\mu$m) \\ 293 | \midrule 294 | Dendrite & Input terminal & $\sim$100 \\ 295 | Axon & Output terminal & $\sim$10 \\ 296 | Soma & Cell body & up to $10^6$ \\ 297 | \bottomrule 298 | \end{tabular} 299 | \end{table} 300 | 301 | \section{Final instructions} 302 | 303 | Do not change any aspects of the formatting parameters in the style 304 | files. In particular, do not modify the width or length of the 305 | rectangle the text should fit into, and do not change font sizes 306 | (except perhaps in the \textbf{References} section; see below). Please 307 | note that pages should be numbered. 308 | 309 | \section{Preparing PDF files} 310 | 311 | Please prepare submission files with paper size ``US Letter,'' and 312 | not, for example, ``A4.'' 313 | 314 | Fonts were the main cause of problems in the past years. Your PDF file 315 | must only contain Type 1 or Embedded TrueType fonts. Here are a few 316 | instructions to achieve this. 317 | 318 | \begin{itemize} 319 | 320 | \item You should directly generate PDF files using \verb+pdflatex+. 321 | 322 | \item You can check which fonts a PDF file uses. In Acrobat Reader, 323 | select the menu Files$>$Document Properties$>$Fonts and select Show 324 | All Fonts. You can also use the program \verb+pdffonts+ which comes 325 | with \verb+xpdf+ and is available out-of-the-box on most Linux 326 | machines. 327 | 328 | \item The IEEE has recommendations for generating PDF files whose 329 | fonts are also acceptable for NIPS. Please see 330 | \url{http://www.emfield.org/icuwb2010/downloads/IEEE-PDF-SpecV32.pdf} 331 | 332 | \item \verb+xfig+ "patterned" shapes are implemented with bitmap 333 | fonts. Use "solid" shapes instead. 334 | 335 | \item The \verb+\bbold+ package almost always uses bitmap fonts. You 336 | should use the equivalent AMS Fonts: 337 | \begin{verbatim} 338 | \usepackage{amsfonts} 339 | \end{verbatim} 340 | followed by, e.g., \verb+\mathbb{R}+, \verb+\mathbb{N}+, or 341 | \verb+\mathbb{C}+ for $\mathbb{R}$, $\mathbb{N}$ or $\mathbb{C}$. You 342 | can also use the following workaround for reals, naturals, and complex numbers: 343 | \begin{verbatim} 344 | \newcommand{\RR}{I\!\!R} %real numbers 345 | \newcommand{\Nat}{I\!\!N} %natural numbers 346 | \newcommand{\CC}{I\!\!\!\!C} %complex numbers 347 | \end{verbatim} 348 | Note that \verb+amsfonts+ is automatically loaded by the 349 | \verb+amssymb+ package. 350 | 351 | \end{itemize} 352 | 353 | If your file contains Type 3 fonts or non-embedded TrueType fonts, we 354 | will ask you to fix it. 355 | 356 | \subsection{Margins in \LaTeX{}} 357 | 358 | Most of the margin problems come from figures positioned by hand using 359 | \verb+\special+ or other commands. We suggest using the command 360 | \verb+\includegraphics+ from the \verb+graphicx+ package. Always 361 | specify the figure width as a multiple of the line width as in the 362 | example below: 363 | \begin{verbatim} 364 | \usepackage[pdftex]{graphicx} ... 365 | \includegraphics[width=0.8\linewidth]{myfile.pdf} 366 | \end{verbatim} 367 | See Section 4.4 in the graphics bundle documentation 368 | (\url{http://mirrors.ctan.org/macros/latex/required/graphics/grfguide.pdf}) 369 | 370 | A number of width problems arise when \LaTeX{} cannot properly 371 | hyphenate a line.
Please give LaTeX hyphenation hints using the 372 | \verb+\-+ command when necessary. 373 | 374 | \subsubsection*{Acknowledgments} 375 | 376 | Use unnumbered third level headings for the acknowledgments. All 377 | acknowledgments go at the end of the paper. Do not include 378 | acknowledgments in the anonymized submission, only in the final paper. 379 | 380 | \section*{References} 381 | 382 | References follow the acknowledgments. Use unnumbered first-level 383 | heading for the references. Any choice of citation style is acceptable 384 | as long as you are consistent. It is permissible to reduce the font 385 | size to \verb+small+ (9 point) when listing the references. {\bf 386 | Remember that you can use a ninth page as long as it contains 387 | \emph{only} cited references.} 388 | \medskip 389 | 390 | \small 391 | 392 | [1] Alexander, J.A.\ \& Mozer, M.C.\ (1995) Template-based algorithms 393 | for connectionist rule extraction. In G.\ Tesauro, D.S.\ Touretzky and 394 | T.K.\ Leen (eds.), {\it Advances in Neural Information Processing 395 | Systems 7}, pp.\ 609--616. Cambridge, MA: MIT Press. 396 | 397 | [2] Bower, J.M.\ \& Beeman, D.\ (1995) {\it The Book of GENESIS: 398 | Exploring Realistic Neural Models with the GEneral NEural SImulation 399 | System.} New York: TELOS/Springer--Verlag. 400 | 401 | [3] Hasselmo, M.E., Schnell, E.\ \& Barkai, E.\ (1995) Dynamics of 402 | learning and recall at excitatory recurrent synapses and cholinergic 403 | modulation in rat hippocampal region CA3. {\it Journal of 404 | Neuroscience} {\bf 15}(7):5249-5262. 405 | 406 | \end{document} 407 | -------------------------------------------------------------------------------- /paper/paper-blx.bib: -------------------------------------------------------------------------------- 1 | @Comment{$ biblatex control file $} 2 | @Comment{$ biblatex version 2.5 $} 3 | Do not modify this file! 4 | 5 | This is an auxiliary file used by the 'biblatex' package. 6 | This file may safely be deleted. It will be recreated as 7 | required. 
8 | 9 | @Control{biblatex-control, 10 | options = {2.5:0:0:1:0:0:1:1:0:0:0:0:1:1:3:1:79:+}, 11 | } 12 | -------------------------------------------------------------------------------- /paper/paper.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \providecommand\hyper@newdestlabel[2]{} 3 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} 4 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined 5 | \global\let\oldcontentsline\contentsline 6 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} 7 | \global\let\oldnewlabel\newlabel 8 | \gdef\newlabel#1#2{\newlabelxx{#1}#2} 9 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} 10 | \AtEndDocument{\ifx\hyper@anchor\@undefined 11 | \let\contentsline\oldcontentsline 12 | \let\newlabel\oldnewlabel 13 | \fi} 14 | \fi} 15 | \global\let\hyper@last\relax 16 | \gdef\HyperFirstAtBeginDocument#1{#1} 17 | \providecommand\HyField@AuxAddToFields[1]{} 18 | \providecommand\HyField@AuxAddToCoFields[2]{} 19 | \bibstyle{biblatex} 20 | \bibdata{paper-blx,smallbiblio} 21 | \citation{biblatex-control} 22 | \citation{Stanley2002-ug} 23 | \citation{Yamins2014-us} 24 | \citation{Stanley2002-ug} 25 | \citation{Olshausen1996-vz} 26 | \@writefile{toc}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 27 | \@writefile{lof}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 28 | \@writefile{lot}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 29 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {I}Introduction}{1}{section.1}} 30 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {II}Method}{1}{section.2}} 31 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {II-A}}Description of the algorithm}{1}{subsection.2.1}} 32 | \citation{Stanley2002-ug} 33 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {II-B}}Implementation details}{2}{subsection.2.2}} 34 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {III}Experiments}{2}{section.3}} 35 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {III-A}}Tasks}{2}{subsection.3.1}} 36 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {IV}Results}{2}{section.4}} 37 | \newlabel{fig:easyandhard}{{IV}{2}{Results}{section.4}{}} 38 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Model performance on an easy task (left panel) and a hard task (right panel). Both performance (cross-entropy loss between predicted and actual character) and number of neurons are shown as a function of time. Dark curves and shaded areas indicate median and inter-quartile range over 20 runs, respectively. The model settles on larger network size for the more complex problem. 
}}{2}{figure.1}} 39 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {IV-A}}Performance and network size in hard and easy tasks}{2}{subsection.4.1}} 40 | \citation{Stanley2002-ug} 41 | \citation{He2015-gk} 42 | \newlabel{fig:fixedsize}{{\unhbox \voidb@x \hbox {IV-B}}{3}{Dynamical adjustment of network size in response to changing conditions}{subsection.4.2}{}} 43 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Comparison of performance for variable and fixed size, for the ``hard'' problem. The thick black line shows variable-size network performance and is identical to the blue curve in Fig. \ref {fig:easyandhard}, right panel. Thin colored curves indicate performance of fixed-size networks of various sizes. Curves show medians over 20 runs; inter-quartile ranges (not shown for clarity) are comparable to those seen in Fig. \ref {fig:easyandhard}. Variable-size networks outperform fixed-size networks for the problem described here. }}{3}{figure.2}} 44 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {IV-B}}Dynamical adjustment of network size in response to changing conditions}{3}{subsection.4.2}} 45 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {V}Conclusions and future work}{3}{section.5}} 46 | \newlabel{fig:easyhardeasy}{{\unhbox \voidb@x \hbox {IV-B}}{3}{Dynamical adjustment of network size in response to changing conditions}{figure.2}{}} 47 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Dynamic adjustment of network size in response to abrupt complexification and simplification of an ongoing task.}}{3}{figure.3}} 48 | -------------------------------------------------------------------------------- /paper/paper.bbl: -------------------------------------------------------------------------------- 1 | % $ biblatex auxiliary file $ 2 | % $ biblatex version 2.5 $ 3 | % Do not modify the above lines! 4 | % 5 | % This is an auxiliary file used by the 'biblatex' package. 6 | % This file may safely be deleted. It will be recreated as 7 | % required. 8 | % 9 | \begingroup 10 | \makeatletter 11 | \@ifundefined{ver@biblatex.sty} 12 | {\@latex@error 13 | {Missing 'biblatex' package} 14 | {The bibliography requires the 'biblatex' package.} 15 | \aftergroup\endinput} 16 | {} 17 | \endgroup 18 | 19 | \entry{He2015-gk}{article}{} 20 | \name{author}{4}{}{% 21 | {{}% 22 | {He}{H.}% 23 | {Kaiming}{K.}% 24 | {}{}% 25 | {}{}}% 26 | {{}% 27 | {Zhang}{Z.}% 28 | {Xiangyu}{X.}% 29 | {}{}% 30 | {}{}}% 31 | {{}% 32 | {Ren}{R.}% 33 | {Shaoqing}{S.}% 34 | {}{}% 35 | {}{}}% 36 | {{}% 37 | {Sun}{S.}% 38 | {Jian}{J.}% 39 | {}{}% 40 | {}{}}% 41 | } 42 | \strng{namehash}{HK+1} 43 | \strng{fullhash}{HKZXRSSJ1} 44 | \field{sortinit}{H} 45 | \field{abstract}{% 46 | Deeper neural networks are more difficult to train. We present a residual 47 | learning framework to ease the training of networks that are substantially 48 | deeper than those used previously. We explicitly reformulate the layers as 49 | learning residual functions with reference to the layer inputs, instead of 50 | learning unreferenced functions. 
We provide comprehensive empirical evidence 51 | showing that these residual networks are easier to optimize, and can gain 52 | accuracy from considerably increased depth. On the ImageNet dataset we 53 | evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG 54 | nets but still having lower complexity. An ensemble of these residual nets 55 | achieves 3.57\% error on the ImageNet test set. This result won the 1st place 56 | on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 57 | with 100 and 1000 layers. The depth of representations is of central 58 | importance for many visual recognition tasks. Solely due to our extremely 59 | deep representations, we obtain a 28\% relative improvement on the COCO 60 | object detection dataset. Deep residual nets are foundations of our 61 | submissions to ILSVRC \& COCO 2015 competitions, where we also won the 1st 62 | places on the tasks of ImageNet detection, ImageNet localization, COCO 63 | detection, and COCO segmentation.% 64 | } 65 | \verb{eprint} 66 | \verb 1512.03385 67 | \endverb 68 | \field{title}{Deep Residual Learning for Image Recognition} 69 | \field{eprinttype}{arXiv} 70 | \field{eprintclass}{cs.CV} 71 | \field{year}{2015} 72 | \warn{\item Invalid format of field 'month'} 73 | \endentry 74 | 75 | \entry{Olshausen1996-vz}{article}{} 76 | \name{author}{2}{}{% 77 | {{}% 78 | {Olshausen}{O.}% 79 | {Bruno~A}{B.~A.}% 80 | {}{}% 81 | {}{}}% 82 | {{}% 83 | {Field}{F.}% 84 | {David~J}{D.~J.}% 85 | {}{}% 86 | {}{}}% 87 | } 88 | \list{language}{1}{% 89 | {en}% 90 | } 91 | \strng{namehash}{OBAFDJ1} 92 | \strng{fullhash}{OBAFDJ1} 93 | \field{sortinit}{O} 94 | \field{number}{6583} 95 | \field{pages}{607\bibrangedash 609} 96 | \field{title}{Emergence of simple-cell receptive field properties by learning 97 | a sparse code for natural images} 98 | \field{volume}{381} 99 | \field{journaltitle}{Nature} 100 | \field{year}{1996} 101 | \warn{\item Invalid format of field 'month'} 102 | \endentry 103 | 104 | \entry{Stanley2002-ug}{article}{} 105 | \name{author}{2}{}{% 106 | {{}% 107 | {Stanley}{S.}% 108 | {Kenneth~O}{K.~O.}% 109 | {}{}% 110 | {}{}}% 111 | {{}% 112 | {Miikkulainen}{M.}% 113 | {Risto}{R.}% 114 | {}{}% 115 | {}{}}% 116 | } 117 | \list{language}{1}{% 118 | {en}% 119 | } 120 | \strng{namehash}{SKOMR1} 121 | \strng{fullhash}{SKOMR1} 122 | \field{sortinit}{S} 123 | \field{abstract}{% 124 | An important question in neuroevolution is how to gain an advantage from 125 | evolving neural network topologies along with weights. We present a method, 126 | NeuroEvolution of Augmenting Topologies (NEAT), which outperforms the best 127 | fixed-topology method on a challenging benchmark reinforcement learning task. 128 | We claim that the increased efficiency is due to (1) employing a principled 129 | method of crossover of different topologies, (2) protecting structural 130 | innovation using speciation, and (3) incrementally growing from minimal 131 | structure. We test this claim through a series of ablation studies that 132 | demonstrate that each component is necessary to the system as a whole and to 133 | each other. What results is significantly faster learning. 
NEAT is also an 134 | important contribution to GAs because it shows how it is possible for 135 | evolution to both optimize and complexify solutions simultaneously, offering 136 | the possibility of evolving increasingly complex solutions over generations, 137 | and strengthening the analogy with biological evolution.% 138 | } 139 | \field{number}{2} 140 | \field{pages}{99\bibrangedash 127} 141 | \field{title}{Evolving neural networks through augmenting topologies} 142 | \field{volume}{10} 143 | \field{journaltitle}{Evol. Comput.} 144 | \field{year}{2002} 145 | \endentry 146 | 147 | \entry{Yamins2014-us}{article}{} 148 | \name{author}{6}{}{% 149 | {{}% 150 | {Yamins}{Y.}% 151 | {Daniel L~K}{D.~L.~K.}% 152 | {}{}% 153 | {}{}}% 154 | {{}% 155 | {Hong}{H.}% 156 | {Ha}{H.}% 157 | {}{}% 158 | {}{}}% 159 | {{}% 160 | {Cadieu}{C.}% 161 | {Charles~F}{C.~F.}% 162 | {}{}% 163 | {}{}}% 164 | {{}% 165 | {Solomon}{S.}% 166 | {Ethan~A}{E.~A.}% 167 | {}{}% 168 | {}{}}% 169 | {{}% 170 | {Seibert}{S.}% 171 | {Darren}{D.}% 172 | {}{}% 173 | {}{}}% 174 | {{}% 175 | {DiCarlo}{D.}% 176 | {James~J}{J.~J.}% 177 | {}{}% 178 | {}{}}% 179 | } 180 | \list{language}{1}{% 181 | {en}% 182 | } 183 | \strng{namehash}{YDLK+1} 184 | \strng{fullhash}{YDLKHHCCFSEASDDJJ1} 185 | \field{sortinit}{Y} 186 | \field{abstract}{% 187 | The ventral visual stream underlies key human visual object recognition 188 | abilities. However, neural encoding in the higher areas of the ventral stream 189 | remains poorly understood. Here, we describe a modeling approach that yields 190 | a quantitatively accurate model of inferior temporal (IT) cortex, the highest 191 | ventral cortical area. Using high-throughput computational techniques, we 192 | discovered that, within a class of biologically plausible hierarchical neural 193 | network models, there is a strong correlation between a model’s 194 | categorization performance and its ability to predict individual IT neural 195 | unit response data. To pursue this idea, we then identified a high-performing 196 | neural network that matches human performance on a range of recognition 197 | tasks. Critically, even though we did not constrain this model to match 198 | neural data, its top output layer turns out to be highly predictive of IT 199 | spiking responses to complex naturalistic images at both the single site and 200 | population levels. Moreover, the model’s intermediate layers are highly 201 | predictive of neural responses in the V4 cortex, a midlevel visual area that 202 | provides the dominant cortical input to IT. These results show that 203 | performance optimization---applied in a biologically appropriate model 204 | class---can be used to build quantitative predictive models of neural 205 | processing.% 206 | } 207 | \field{number}{23} 208 | \field{pages}{8619\bibrangedash 8624} 209 | \field{title}{Performance-optimized hierarchical models predict neural 210 | responses in higher visual cortex} 211 | \field{volume}{111} 212 | \field{journaltitle}{Proc. Natl. Acad. Sci. U. S. 
A.} 213 | \field{year}{2014} 214 | \warn{\item Invalid format of field 'month'} 215 | \endentry 216 | 217 | \lossort 218 | \endlossort 219 | 220 | \endinput 221 | -------------------------------------------------------------------------------- /paper/paper.blg: -------------------------------------------------------------------------------- 1 | This is BibTeX, Version 0.99d (TeX Live 2015) 2 | Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 3 | The top-level auxiliary file: paper.aux 4 | The style file: biblatex.bst 5 | Reallocated singl_function (elt_size=4) to 100 items from 50. 6 | Reallocated singl_function (elt_size=4) to 100 items from 50. 7 | Reallocated singl_function (elt_size=4) to 100 items from 50. 8 | Reallocated singl_function (elt_size=4) to 100 items from 50. 9 | Reallocated singl_function (elt_size=4) to 100 items from 50. 10 | Reallocated wiz_functions (elt_size=4) to 6000 items from 3000. 11 | Reallocated singl_function (elt_size=4) to 100 items from 50. 12 | Reallocated singl_function (elt_size=4) to 100 items from 50. 13 | Reallocated singl_function (elt_size=4) to 100 items from 50. 14 | Reallocated singl_function (elt_size=4) to 100 items from 50. 15 | Reallocated singl_function (elt_size=4) to 100 items from 50. 16 | Database file #1: paper-blx.bib 17 | Database file #2: smallbiblio.bib 18 | Biblatex version: 3.0 19 | Reallocated singl_function (elt_size=4) to 100 items from 50. 20 | Reallocated wiz_functions (elt_size=4) to 9000 items from 6000. 21 | You've used 5 entries, 22 | 6047 wiz_defined-function locations, 23 | 1169 strings with 12524 characters, 24 | and the built_in function-call counts, 14691 in all, are: 25 | = -- 504 26 | > -- 526 27 | < -- 118 28 | + -- 175 29 | - -- 200 30 | * -- 1253 31 | := -- 1086 32 | add.period$ -- 0 33 | call.type$ -- 5 34 | change.case$ -- 48 35 | chr.to.int$ -- 43 36 | cite$ -- 8 37 | duplicate$ -- 1588 38 | empty$ -- 1515 39 | format.name$ -- 333 40 | if$ -- 3230 41 | int.to.chr$ -- 0 42 | int.to.str$ -- 11 43 | missing$ -- 0 44 | newline$ -- 170 45 | num.names$ -- 179 46 | pop$ -- 1337 47 | preamble$ -- 1 48 | purify$ -- 64 49 | quote$ -- 0 50 | skip$ -- 805 51 | stack$ -- 0 52 | substring$ -- 310 53 | swap$ -- 580 54 | text.length$ -- 128 55 | text.prefix$ -- 4 56 | top$ -- 1 57 | type$ -- 178 58 | warning$ -- 0 59 | while$ -- 127 60 | width$ -- 0 61 | write$ -- 164 62 | -------------------------------------------------------------------------------- /paper/paper.out: -------------------------------------------------------------------------------- 1 | \BOOKMARK [1][-]{section.1}{Introduction}{}% 1 2 | \BOOKMARK [1][-]{section.2}{Method}{}% 2 3 | \BOOKMARK [2][-]{subsection.2.1}{Description of the algorithm}{section.2}% 3 4 | \BOOKMARK [2][-]{subsection.2.2}{Implementation details}{section.2}% 4 5 | \BOOKMARK [1][-]{section.3}{Experiments}{}% 5 6 | \BOOKMARK [2][-]{subsection.3.1}{Tasks}{section.3}% 6 7 | \BOOKMARK [1][-]{section.4}{Results}{}% 7 8 | \BOOKMARK [2][-]{subsection.4.1}{Performance and network size in hard and easy tasks}{section.4}% 8 9 | \BOOKMARK [2][-]{subsection.4.2}{Dynamical adjustment of network size in response to changing conditions}{section.4}% 9 10 | \BOOKMARK [1][-]{section.5}{Conclusions and future work}{}% 10 11 | -------------------------------------------------------------------------------- /paper/paper.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThomasMiconi/DiffRNN/c81eb14ab9d51191b4d7891e0472036077f49bcc/paper/paper.pdf -------------------------------------------------------------------------------- /paper/paper.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 23 | 28 | 33 | 36 | 39 | 42 | ]> 43 | 44 | 45 | latex 46 | 47 | paper.aux 48 | paper-blx.bib 49 | 50 | 51 | paper.bbl 52 | 53 | 54 | blx-compat.def 55 | biblatex.def 56 | numeric.bbx 57 | standard.bbx 58 | numeric.cbx 59 | biblatex.cfg 60 | english.lbx 61 | 62 | 63 | 64 | bibtex 65 | 66 | bibtex 67 | 68 | paper 69 | 70 | 71 | paper.aux 72 | 73 | 74 | paper.bbl 75 | 76 | 77 | paper.bbl 78 | 79 | 80 | paper.aux 81 | paper-blx.bib 82 | 83 | 84 | smallbiblio.bib 85 | 86 | 87 | biblatex.bst 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /paper/paper.tex: -------------------------------------------------------------------------------- 1 | %\documentclass{article} 2 | \documentclass[journal]{IEEEtran} 3 | 4 | %\usepackage[final,nonatbib]{nips_2016} 5 | 6 | 7 | \usepackage[utf8]{inputenc} % allow utf-8 input 8 | \usepackage[T1]{fontenc} % use 8-bit T1 fonts 9 | \usepackage{hyperref} % hyperlinks 10 | \usepackage{url} % simple URL typesetting 11 | \usepackage{booktabs} % professional-quality tables 12 | \usepackage{amsfonts} % blackboard math symbols 13 | \usepackage{nicefrac} % compact symbols for 1/2, etc. 14 | \usepackage{microtype} % microtypography 15 | \usepackage{graphicx} 16 | 17 | \usepackage[backend=bibtex]{biblatex} 18 | \bibliography{smallbiblio} 19 | \AtEveryBibitem{% 20 | \clearlist{language}% 21 | } 22 | 23 | 24 | \title{Neural networks with differentiable structure} 25 | 26 | \author{ 27 | Thomas Miconi\\%\thanks{Use footnote for providing further 28 | % information about author (webpage, alternative 29 | % address)---\emph{not} for acknowledging funding agencies.} \\ 30 | The Neurosciences Institute\\ 31 | La Jolla, CA, USA \\ 32 | \texttt{miconi@nsi.edu} \\ 33 | } 34 | 35 | \begin{document} 36 | 37 | \maketitle 38 | 39 | \begin{abstract} 40 | 41 | While gradient descent has proven highly successful in learning connection 42 | weights for neural networks, the actual structure of these networks is usually determined by hand, or by 43 | other optimization algorithms. Here we describe a simple method to make 44 | network structure differentiable, and therefore accessible to gradient descent. 45 | We test this method on recurrent neural networks applied to simple 46 | sequence prediction problems. Starting with initial networks containing only 47 | one node, the method automatically grows networks that successfully solve the 48 | tasks. The number of nodes in the final network correlates with task 49 | difficulty. The method can dynamically increase network size in response to an 50 | abrupt complexification in the task. 51 | Variable-size networks grown with the method outperform fixed-size 52 | networks of higher, lower or identical size, hinting at a possible advantage of growing networks. We conclude by discussing how this 53 | method could be applied to more complex networks, such as feedforward layered 54 | networks, or multiple-area networks of arbitrary shape. 
55 | 56 | \end{abstract} 57 | 58 | \section{Introduction} 59 | 60 | Neural networks are usually optimized by applying some form of gradient descent to 61 | the numerical parameters of a fixed connectivity graph. This method can 62 | successfully train very large networks for complex tasks. However, the actual 63 | structure of the network itself (number of neurons, connectivity graph, etc.) is usually not modified by the gradient 64 | descent algorithm. Most often, network structure is designed by hand, in a delicate process of parameter tuning. 65 | When network structure is optimized, it is generally with a different 66 | algorithm, including evolutionary techniques such as NEAT \cite{Stanley2002-ug} or heuristic-based methods such as HyperOpt \cite{Yamins2014-us}. 67 | 68 | Manual design of network structure is time-consuming and subject to arbitrary 69 | choices that may or may not reflect the demands of the task at hand. 70 | Furthermore, letting the size of the network grow autonomously may actually 71 | improve learning performance, as posited in the NEAT framework 72 | \cite{Stanley2002-ug}. It would therefore be desirable to extend the process 73 | of gradient descent to network structure itself. This requires making network 74 | structure differentiable, at least to a usable approximation. Here we describe 75 | a simple method for performing gradient descent over network structure, and 76 | show that this method can adaptively design recurrent networks of a few dozen 77 | units for simple sequence prediction tasks. 78 | 79 | 80 | \section{Method} 81 | 82 | \subsection{Description of the algorithm} 83 | 84 | Here we describe our method, in the context of recurrent networks with 85 | all-to-all potential connectivity (in the conclusion, we suggest how the method 86 | could be extended to more complex architectures, including layered feedforward 87 | networks). In this situation, structure is determined by the number of nodes in 88 | the network $N$, which automatically determines the connectivity graph as a 89 | simple square matrix of size $N \times N$. Our goal is to make the number of nodes 90 | differentiable and amenable to gradient descent and backpropagation. 91 | 92 | The first step in our method is to impose a penalty on the L1-norm (sum of absolute values) of \textit{outgoing} weights from each neuron. This includes both lateral and feedforward weights. 93 | As is well-known, minimizing the L1-norm 94 | tends to concentrate the remaining total weight among the fewest possible 95 | elements, in comparison to Euclidean L2-norm minimization. As a result, 96 | backpropagation will tend to minimize the number of neurons with non-zero total output, and 97 | thus of ``active'' neurons: each neuron must ``earn its keep'', by contributing 98 | to overall network performance, to counter-balance the effect of L1-norm 99 | minimization, or else face effective ``soft'' deletion by having its outgoing weights 100 | fall to zero.\footnote{Importantly, note that L1 regularization on outgoing weights is quite different from directly imposing an 101 | L1 regularization on neuron activities themselves. L1 regularization of 102 | neuron activities ensures that few neurons will be active \textit{at any 103 | given time}, but does not ensure that any neuron will become fully silent 104 | over extended time.
Instead, L1 regularization of neuron activities may 105 | encourage neurons to distribute and decorrelate their activations over 106 | time so that each neuron responds to a small proportion of inputs; this is 107 | precisely the (intended) effect of L1-regularization in \textit{sparse 108 | coding} schemes \cite{Olshausen1996-vz}. By contrast, penalizing outgoing weights can truly 109 | turn neurons ``on'' or ``off'' in a time-independent fashion: a neuron with 110 | zero output weights is guaranteed to be silent for any input. } 111 | 112 | This method creates a ``soft'' 113 | structural variability, whereby gradient descent tries to solve the task at 114 | hand under the constraint of minimizing the number of neurons with non-zero 115 | outgoing weights. We want to turn these ``soft'' structure changes into hard 116 | structural changes in the actual number of neurons and size of the weight 117 | matrix. To this end, we first specify a \textit{deletion threshold} $T_D$, such 118 | that any neuron for which the L1-norm of outgoing weights falls below this threshold is marked for 119 | potential deletion. Then, we simply specify that at any given time, the network 120 | must only contain a fixed, small number $k$ of neurons below the deletion 121 | threshold. If the number of sub-threshold neurons exceeds $k$, then ``excess'' 122 | sub-threshold neurons are actually deleted from the network. Conversely, if 123 | backpropagation finds it necessary to inflate neuron output weights to the extent that 124 | fewer than $k$ neurons have sub-threshold output weight norm, then we add a new neuron 125 | to the simulation, with initially random connectivity and outgoing weights 126 | initially chosen to have L1-norm exactly equal to the deletion threshold. Note that, because the threshold 127 | value is low, new neurons initially have a very small effect on overall network 128 | behavior. 129 | 130 | This mechanism allows backpropagation to adjust network size to problem 131 | demands. If more neurons are needed to solve the problem at hand, 132 | backpropagation will simply expand the outgoing weights of currently sub-threshold 133 | neurons, so as to allow them to have an impact on output computation, while adjusting their connectivity. By 134 | contrast, if new neurons fail to contribute to network performance, 135 | L1-minimization will reduce their outgoing weights and eventually drive them below 136 | deletion threshold. The sub-threshold neurons thus act as a computational 137 | reserve, ready to be mobilized if the problem at hand demands it. 138 | 139 | Finally, as a stabilization measure, we make 140 | addition and deletion probabilistic, so that whenever a neuron is to be added 141 | or deleted, the event only occurs with a certain fixed probability $P_{add}$ or 142 | $P_{del}$. As a result, the network will occasionally possess more or fewer than 143 | $k$ subthreshold neurons. All networks in our experiment start with only one 144 | node, following the philosophy of ``augmenting topologies'' expounded in NEAT 145 | \cite{Stanley2002-ug}. 146 | 147 | \subsection{Implementation details} 148 | 149 | Our implementation is based on Andrej Karpathy's \verb+min-char-rnn.py+ and 150 | inherits most of its parameters. The networks are trained for 100000 cycles, 151 | where each cycle consists of reading a sequence of 40 characters while trying 152 | to predict the next character, followed by a parameter update based on 153 | backpropagation through time.
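For concreteness, the structural update described above can be sketched in a few lines of NumPy. This is an illustrative sketch only, using the notation of this section and the parameter values given below; it is not the repository's actual update code.
\begin{verbatim}
import numpy as np

def structural_update(Wxh, Whh, Why, T_D=0.05, k=1,
                      P_add=0.01, P_del=0.05):
    # L1-norm of each hidden neuron's outgoing weights:
    # column j of Whh (recurrent) and of Why (to the output layer)
    out_norms = np.abs(Whh).sum(axis=0) + np.abs(Why).sum(axis=0)
    sub = np.where(out_norms < T_D)[0]   # marked for potential deletion
    if len(sub) > k and np.random.rand() < P_del:
        # Delete one "excess" sub-threshold neuron
        j = np.random.choice(sub)
        Wxh = np.delete(Wxh, j, axis=0)
        Whh = np.delete(np.delete(Whh, j, axis=0), j, axis=1)
        Why = np.delete(Why, j, axis=1)
    elif len(sub) < k and np.random.rand() < P_add:
        # Add one neuron with random connectivity, then rescale its
        # outgoing weights so that their L1-norm equals T_D
        n = Whh.shape[0]
        Wxh = np.vstack([Wxh, 0.01 * np.random.randn(1, Wxh.shape[1])])
        Whh = np.hstack([np.vstack([Whh, 0.01 * np.random.randn(1, n)]),
                         0.01 * np.random.randn(n + 1, 1)])
        Why = np.hstack([Why, 0.01 * np.random.randn(Why.shape[0], 1)])
        norm = np.abs(Whh[:, -1]).sum() + np.abs(Why[:, -1]).sum()
        Whh[:, -1] *= T_D / norm
        Why[:, -1] *= T_D / norm
    return Wxh, Whh, Why
\end{verbatim}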
Network output is provided by a single output 154 | layer with 4 nodes (one per possible character), each of which reports the 155 | predicted probability that the corresponding character is next in the sequence. The output layer is fully connected to the variable-size recurrent layer. 156 | Loss is defined as cross-entropy between the predicted distribution and the 157 | actual (one-hot) outcome. Any addition or deletion also occurs at the same time as parameter 158 | update (that is, at the end of each successive 40-char sequence). 159 | 160 | %% NOTE: This does not apply in the current version of the code 161 | %All multipliers are bounded from below by a low, but not trivial 162 | %value $M_{min}$. If a parameter update drives the value of a multiplier below $M_{min}$, it is automatically set to $M_{min}$. The intended effect is that every neuron (even those with multipliers 163 | %below deletion threshold) should still have a small, but not negligible effect 164 | %on network output, so that a reasonable gradient of error over any neuron's 165 | %parameters can always be computed. This allows all neurons to always be ``ready to help'' if needed. An additional side-effect is to make all multipliers strictly positive, although this is not a critical component of our method. 166 | 167 | 168 | There are thus 5 additional parameters in our method: $k$, $T_D$, $P_{add}$, 169 | $P_{del}$, and $A_{L1reg}$ (the strength of the L1-norm penalty 170 | over the weights). In all simulations shown here, those were set to $k=1$, 171 | $T_D=0.05$, $P_{add}=0.01$, $P_{del}=0.05$, and 172 | $A_{L1reg}=10^{-4}$. 173 | 174 | 175 | All code is available on GitHub at \url{https://github.com/ThomasMiconi/DiffRNN}. 176 | 177 | 178 | \section{Experiments} 179 | 180 | \subsection{Tasks} 181 | 182 | To test the plausibility of our method, we choose two simple sequence prediction 183 | problems. In each problem, the task of the network is to predict the next 184 | character in an ongoing sequence of characters. Both problems use the same 185 | alphabet, consisting of characters $a$, $b$, $($ and $)$. 186 | 187 | The first problem (``easy problem'') is composed of groups of one or more $ab$ 188 | digraphs, enclosed in matching parentheses. After every $ab$ digraph, there is 189 | a constant probability of adding an additional $ab$ digraph (p=0.75), or of closing the 190 | group with a closing parenthesis instead (p=0.25). Thus the number of digraphs in each 191 | group follows an exponential distribution. A typical sequence looks like this: 192 | 193 | \begin{center} 194 | $(abab)(ab)(ab)(ababab)(abababababab)(abab)(abababab)\ldots$ 195 | \end{center} 196 | 197 | Note that the problem is highly constrained: the only choice occurs after a 198 | $b$, when the network must decide whether to insert a $)$ or an $a$, which has 199 | a well-defined probability. Every other choice is unambiguously specified by the problem. 200 | 201 | 202 | The second problem (``hard problem'') is composed of groups of six letters 203 | enclosed in matching parentheses. The rule is that each new group must be the 204 | reverse of the previous group, with one randomly chosen letter changed.
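As an illustration, such sequences could be produced by a generator along the following lines (a schematic sketch, not the code used to produce the repository's actual input files):
\begin{verbatim}
import random

def hard_sequence(n_groups, group_len=6):
    # Each group is the reverse of the previous one,
    # with a single randomly chosen letter flipped.
    group = [random.choice("ab") for _ in range(group_len)]
    out = []
    for _ in range(n_groups):
        out.append("(" + "".join(group) + ")")
        group = group[::-1]
        i = random.randrange(group_len)
        group[i] = "a" if group[i] == "b" else "b"
    return "".join(out)
\end{verbatim}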
A 205 | typical sequence looks like this: 206 | 207 | \begin{center} 208 | $(aabbab)(babaaa)(aaabbb)(bbaaaa)(abaabb)(baaaba) \ldots$ 209 | \end{center} 210 | 211 | To reach optimal performance on this task, the network must maintain a memory 212 | of the previous sequence of six characters, and then reverse it, in addition to opening and closing parentheses. This is a 213 | more difficult problem than the previous one, and thus we expect that 214 | optimal networks for either task would look quite different from each other. 215 | 216 | \section{Results} 217 | 218 | \begin{figure}[ht] 219 | \label{fig:easyandhard} 220 | \centering 221 | \includegraphics[scale=0.9]{figE.png} 222 | \includegraphics[scale=0.9]{figH.png} 223 | \caption{Model performance on an easy task (left panel) and a hard task 224 | (right panel). Both performance (cross-entropy loss between predicted and 225 | actual character) and number of neurons are shown as a function of time. Dark curves and shaded areas indicate median and inter-quartile range over 20 runs, respectively. The 226 | model settles on larger network size for the more complex problem. } 227 | \end{figure} 228 | 229 | 230 | \subsection{Performance and network size in hard and easy tasks} 231 | 232 | 233 | Results are shown in Figure \ref{fig:easyandhard}. We show both median 234 | performance (cross-entropy loss) and median number of neurons as a function of 235 | time, over 20 runs. As expected, the hard problem leads to somewhat higher loss 236 | than the easy problem. Importantly, the hard problem elicits larger networks 237 | than the easy problem (37 neurons vs. 14 neurons after 100000 learning cycles). 238 | Thus, the algorithm appropriately allocated more neurons to solve a more difficult 239 | task. 240 | 241 | An important question is whether the use of variable-size networks has an 242 | impact on performance. We compared the performance of our algorithm against 243 | fixed-size networks with various numbers of neurons, ranging from 10 to 100, 244 | including one with the same network size as was eventually preferred by our 245 | algorithm (i.e. 37 neurons). Results are shown in figure \ref{fig:fixedsize}, 246 | again showing the median loss among 20 runs as a function of time. 247 | Intriguingly, the variable-size network actually outperforms fixed-size 248 | networks of any size. This result may reflect the advantages of ``augmenting 249 | topologies'' (starting with a minimal network and only adding complexity as 250 | needed), as expounded in NEAT \cite{Stanley2002-ug}, at least for the simple 251 | problems tackled here. 252 | 253 | 254 | 255 | \subsection{Dynamical adjustment of network size in response to changing conditions} 256 | 257 | What happens if task difficulty suddenly changes? We tested our network by 258 | switching from the ``easy'' to the ``hard'' sequence after 33000 259 | cycles, and then back again to the ``easy'' sequence after 66000 cycles. 260 | Results are shown in Figure \ref{fig:easyhardeasy}. Interestingly, the network 261 | successfully handles the abrupt complexification of the problem by allocating 262 | more neurons. Following a large increase, the network then sheds off excess 263 | neurons, without damaging performance. This process continues when the problem 264 | switches back to the ``easy'' sequence (note that performance quickly returns 265 | to optimal levels). Thus, the network successfully adapts its size to the complexity of the problem at hand. 
266 | 267 | 268 | \begin{figure}[t] 269 | \label{fig:fixedsize} 270 | \centering 271 | \includegraphics[scale=0.9]{figFS.png} 272 | \caption{Comparison of performance for variable and fixed size, for the 273 | ``hard'' problem. The thick black line shows variable-size network performance 274 | and is identical to the blue curve in Fig. \ref{fig:easyandhard}, right panel. 275 | Thin colored curves indicate performance of fixed-size networks of various 276 | sizes. Curves show medians over 20 277 | runs; inter-quartile ranges (not shown for clarity) are comparable to those 278 | seen in Fig. \ref{fig:easyandhard}. Variable-size networks outperform fixed-size 279 | networks for the problem described here. } 280 | \end{figure} 281 | 282 | \begin{figure}[b] 283 | \label{fig:easyhardeasy} 284 | \centering 285 | \includegraphics[scale=0.9]{figEHE.png} 286 | \caption{Dynamic adjustment of network size in response to abrupt complexification and simplification of an ongoing task.} 287 | \end{figure} 288 | 289 | 290 | 291 | \section{Conclusions and future work} 292 | 293 | We have described a method through which the size of a recurrent network can be 294 | modified by gradient descent. The method described here can successfully build 295 | networks of appropriate size to handle simple problems. This simple method 296 | immediately suggests several alternatives and possible extensions. 297 | 298 | For example, deletion of neurons could be biased by neuron ``age'' (i.e. how 299 | long the neuron has been present), rather than being random. Deleted neurons 300 | could be partially preserved, so that newly added neurons could actually 301 | inherit connectivity of previously deleted ones, rather than being randomly 302 | initialized. Such adaptations were not necessary for the problems considered 303 | here, but might be considered in future applications to more challenging tasks. 304 | 305 | The method described here extends naturally to layered feedforward networks. 306 | Within each layer, the method can be applied essentially unchanged to adjust 307 | layer size. The number of layers can also be made differentiable, by adding and deleting 308 | \textit{residual} layers \cite{He2015-gk} with initially low 309 | pre-additive output weights. These residual layers, which would initially have minimal impact 310 | on the network's output, would play the same role as sub-threshold neurons in 311 | the method described above. 312 | Similarly, by considering each layer as a higher-order 313 | ``node'', subject to a global outgoing norm penalty, the method described above could in 314 | principle be extended to arbitrary networks composed of multiple areas, with 315 | arbitrary connectivity between areas. Further work is needed to assess the 316 | practicality of these and other possible extensions. 317 | 318 | 319 | \small 320 | 321 | \printbibliography 322 | 323 | \end{document} 324 | -------------------------------------------------------------------------------- /paper/paper.tex.nips: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \usepackage[final,nonatbib]{nips_2016} 4 | 5 | 6 | \usepackage[utf8]{inputenc} % allow utf-8 input 7 | \usepackage[T1]{fontenc} % use 8-bit T1 fonts 8 | \usepackage{hyperref} % hyperlinks 9 | \usepackage{url} % simple URL typesetting 10 | \usepackage{booktabs} % professional-quality tables 11 | \usepackage{amsfonts} % blackboard math symbols 12 | \usepackage{nicefrac} % compact symbols for 1/2, etc. 
13 | \usepackage{microtype} % microtypography 14 | \usepackage{graphicx} 15 | 16 | \usepackage[backend=bibtex]{biblatex} 17 | \bibliography{smallbiblio} 18 | \AtEveryBibitem{% 19 | \clearlist{language}% 20 | } 21 | 22 | 23 | \title{Neural networks with differentiable structure} 24 | 25 | \author{ 26 | Thomas Miconi\\%\thanks{Use footnote for providing further 27 | % information about author (webpage, alternative 28 | % address)---\emph{not} for acknowledging funding agencies.} \\ 29 | The Neurosciences Institute\\ 30 | La Jolla, CA, USA \\ 31 | \texttt{miconi@nsi.edu} \\ 32 | } 33 | 34 | \begin{document} 35 | 36 | \maketitle 37 | 38 | \begin{abstract} 39 | 40 | While gradient descent has proven highly successful in learning connection 41 | weights for neural networks, the actual structure of these networks is usually determined by hand, or by 42 | other optimization algorithms. Here we describe a simple method to make 43 | network structure differentiable, and therefore accessible to gradient descent. 44 | We test this method on recurrent neural networks applied to simple 45 | sequence prediction problems. Starting with initial networks containing only 46 | one node, the method automatically grows networks that successfully solve the 47 | tasks. The number of nodes in the final network correlates with task 48 | difficulty. The method can dynamically increase network size in response to an 49 | abrupt complexification in the task. 50 | Variable-size networks grown with the method outperform fixed-size 51 | networks of higher, lower or identical size, hinting at a possible advantage of growing networks. We conclude by discussing how this 52 | method could be applied to more complex networks, such as feedforward layered 53 | networks, or multiple-area networks of arbitrary shape. 54 | 55 | \end{abstract} 56 | 57 | \section{Introduction} 58 | 59 | Neural networks are usually optimized by applying some form gradient descent to 60 | the numerical parameters of a fixed connectivity graph. This method can 61 | successfully train very large networks for complex tasks. However, the actual 62 | structure of the network itself (number of neurons, connectivity graph, etc.) is usually not modified by the gradient 63 | descent algorithm. Most often, network structure is designed by hand, in a delicate process of parameter tuning. 64 | When network structure is optimized, it is generally with a different 65 | algorithm, including evolutionary techniques such as NEAT \cite{Stanley2002-ug} or heuristic-based methods such as HyperOpt \cite{Yamins2014-us}. 66 | 67 | Manual design of network structure is time-consuming and subject to arbitrary 68 | choices that may or may not reflect the demands of the task at hand. 69 | Furthermore, letting the size of the network grow autonomously may actually 70 | improve learning performance, as posited in the NEAT framework 71 | \cite{Stanley2002-ug}. It would therefore be desirable to extend the process 72 | of gradient descent to network structure itself. This requires making network 73 | structure differentiable, at least to a usable approximation. Here we describe 74 | a simple method for performing gradient descent over network structure, and 75 | show that this method can adaptively design recurrent networks of a few dozen 76 | units for simple sequence prediction tasks. 
77 | 78 | 79 | \section{Method} 80 | 81 | \subsection{Description of the algorithm} 82 | 83 | Here we describe our method, in the context of recurrent networks with 84 | all-to-all potential connectivity (in the conclusion, we suggest how the method 85 | could be extended to more complex architectures, including layered feedforward 86 | networks). In this situation, structure is determined by the number of nodes in 87 | the network $N$, which automatically determines the connectivity graph as a 88 | simple square matrix of size $N*N$. Our goal is to make the number of nodes 89 | differentiable and amenable to gradient descent and backpropagation. 90 | 91 | The first step in our method is to impose a penalty on the L1-norm (sum of absolute values) of \textit{outgoing} weights from each neuron. This includes both lateral and feedforward weights. 92 | As is well-known, minimizing the L1-norm 93 | tends to concentrate the remaining total weight among the fewest possible 94 | elements, in comparison to Euclidean L2-norm minimization. As a result, 95 | backpropagation will tend to minimize the number of neurons with non-zero total output, and 96 | thus of ``active'' neurons: each neuron must ``earn its keep'', by contributing 97 | to overall network performance, to counter-balance the effect of L1-norm 98 | minimization, or else face effective ``soft'' deletion by having its outgoing weights 99 | fall to zero.\footnote{Importantly, note that L1 regularization on outgoing weights is quite different from directly imposing an 100 | L1 regularization on neuron activities themselves. L1 regularization of 101 | neuron activities ensures that few neurons will be active \textit{at any 102 | given time}, but does not ensure that any neuron will become fully silent 103 | over extended time. Instead, L1 regularization of neuron activities may 104 | encourage neurons to distribute and decorrelate their activations other 105 | time so that each neuron responds to a small proportion of inputs; this is 106 | precisely the (intended) effect of L1-regularization in \textit{sparse 107 | coding} schemes \cite{Olshausen1996-vz}. By contrast, penalizing outgoing weights can truly 108 | turn neurons ``on'' or ``off'' in a time-independent fashion: a neuron with 109 | zero output weights is guaranteed to be silent for any input. } 110 | 111 | This method creates a ``soft'' 112 | structural variability, whereby gradient descent tries to solve the task at 113 | hand under the constraint of minimizing the number of neurons with non-zero 114 | outgoing weights. We want to turn these ``soft'' structure changes into hard 115 | structural changes in the actual number of neurons and size of the weight 116 | matrix. To this end, we first specify a \textit{deletion threshold} $T_D$, such 117 | that any neuron for which the L1-norm of outgoing weights fall below this threshold is marked for 118 | potential deletion. Then, we simply specify that at any given time, the network 119 | must only contain a fixed, small number $k$ of neurons below the deletion 120 | threshold. If the number of sub-threshold neurons exceeds $k$, then ``excess'' 121 | sub-threshold neurons are actually deleted from the network. 
Conversely, if 122 | backpropagation finds it necessary to inflate neuron output weights to the extent that 123 | fewer than $k$ neurons have sub-threshold output weight norm, then we add a new neuron 124 | to the simulation, with initially random connectivity and outgoing weights 125 | initially chosen to have L1-norm exactly equal to the deletion threshold. Note that, because the threshold 126 | value is low, new neurons initially have a very small effect on overall network 127 | behavior. 128 | 129 | This mechanism allows backpropagation to adjust network size to problem 130 | demands. If more neurons are needed to solve the problem at hand, 131 | backpropagation will simply expand the outgoing weights of currently sub-threshold 132 | neurons, so as to allow them to have an impact on output computation, while adjusting their connectivity. By 133 | contrast, if new neurons fail to contribute to network performance, 134 | L1-minimization will reduce their outgoing weights and eventually drive them below 135 | deletion threshold. The sub-threshold neurons thus act as a computational 136 | reserve, ready to be mobilized if the problem at hand demands it. 137 | 138 | Finally, as a stabilization measure, we make 139 | addition and deletion probabilistic, so that whenever a neuron is to be added 140 | or deleted, the event only occur with a certain fixed probability $P_{add}$ or 141 | $P_{del}$. As a result, the network will occasionally possess more or less than 142 | $k$ subthreshold neurons. All networks in our experiment start with only one 143 | node, following the philosophy of ``augmenting topologies'' expounded in NEAT 144 | \cite{Stanley2002-ug}. 145 | 146 | \subsection{Implementation details} 147 | 148 | Our implementation is based on Andrej Karpathy's \verb+min-char-rnn.py+ and 149 | inherits most of its parameters. The networks are trained for 100000 cycles, 150 | where each cycle consists of reading a sequence of 40 characters while trying 151 | to predict the next character, followed by a parameter update based on 152 | backpropagation through time. Network output is provided by a single output 153 | layer with 4 nodes (one per possible character), each of which reports the 154 | predicted probability that the corresponding character is next in the sequence. The output layer is fully connected with the variable-size recurrent layer. 155 | Loss is defined as cross-entropy between the predicted distribution and the 156 | actual (one-hot) outcome. Any addition or deletion also occurs at the same time as parameter 157 | update (that is, at the end of each successive 40-char sequence). 158 | 159 | %% NOTE: This does not apply in the current version of the code 160 | %All multipliers are bounded from below by a low, but not trivial 161 | %value $M_{min}$. If a parameter update drives the value of a multiplier below $M_{min}$, it is automatically set to $M_{min}$. The intended effect is that every neuron (even those with multipliers 162 | %below deletion threshold) should still have a small, but not negligible effect 163 | %on network output, so that a reasonable gradient of error over any neuron's 164 | %parameters can always be computed. This allows all neurons to always be ``ready to help'' if needed. An additional side-effect is to make all multipliers strictly positive, although this is not a critical component of our method. 
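To make the training objective concrete, the per-character loss can be sketched as follows (illustrative NumPy with hypothetical variable names; the loss reported in the figures is the cross-entropy term, while the L1 penalty on outgoing weights is added to the quantity being minimized):
\begin{verbatim}
import numpy as np

def char_loss(y, target_ix, Whh, Why, A_L1reg=1e-4):
    # y: raw output-layer activations (one per possible character)
    p = np.exp(y - np.max(y))
    p = p / np.sum(p)                      # predicted probabilities
    cross_entropy = -np.log(p[target_ix])  # one-hot target
    l1 = A_L1reg * (np.abs(Whh).sum() + np.abs(Why).sum())
    return cross_entropy + l1, cross_entropy
\end{verbatim}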
165 | 166 | 167 | There are thus 5 additional parameters in our method: $k$, $T_D$, $P_{add}$, 168 | $P_{del}$, and $A_{L1reg}$ (the strength of the L1 regularization 169 | over the multipliers). In all simulations shown here, those were set to $k=1$, 170 | $T_D=0.05$, $P_{add}=0.01$, $P_{del}=0.05$, and 171 | $A_{L1reg}=10^{-4}$. 172 | 173 | 174 | All code is available on GitHub at \url{https://github.com/ThomasMiconi/DiffRNN}. 175 | 176 | 177 | \section{Experiments} 178 | 179 | \subsection{Tasks} 180 | 181 | To test the plausibility of our method, we choose two simple sequence prediction 182 | problems. In each problem, the task of the network is to predict the next 183 | character in an ongoing sequence of characters. Both problems use the same 184 | alphabet, consisting of characters $a$, $b$, $($ and $)$. 185 | 186 | The first problem (``easy problem'') is composed of groups of one or more $ab$ 187 | digraphs, enclosed in matching parentheses. After every $ab$ digraph, there is 188 | a constant probability of adding an additional $ab$ digraph (p=0.75), or to close the 189 | group with a closing parenthesis instead (p=0.25). Thus the number of digraphs in each 190 | group follows an exponential distribution. A typical sequence looks like this: 191 | 192 | \begin{center} 193 | $(abab)(ab)(ab)(ababab)(abababababab)(abab)(abababab)\ldots$ 194 | \end{center} 195 | 196 | Note that the problem is highly constrained: the only choice occurs after a 197 | $b$, when the network must decide whether to insert a $)$ or an $a$, which has 198 | a well-defined probability. Every other choice is unambiguously specified by the problem. 199 | 200 | 201 | The second problem (''hard problem'') is composed of groups of six letters 202 | enclosed in matching parentheses. The rule is that each new group must be the 203 | reverse of the previous group, with one randomly chosen letter changed. A 204 | typical sequence looks like this: 205 | 206 | \begin{center} 207 | $(aabbab)(babaaa)(aaabbb)(bbaaaa)(abaabb)(baaaba) \ldots$ 208 | \end{center} 209 | 210 | To reach optimal performance on this task, the network must maintain a memory 211 | of the previous sequence of six characters, and then reverse it, in addition to opening and closing parentheses. This is a 212 | more difficult problem than the previous one, and thus we expect that 213 | optimal networks for either task would look quite different from each other. 214 | 215 | \section{Results} 216 | 217 | \begin{figure}[ht] 218 | \label{fig:easyandhard} 219 | \centering 220 | \includegraphics[scale=0.9]{figE.png} 221 | \includegraphics[scale=0.9]{figH.png} 222 | \caption{Model performance on an easy task (left panel) and a hard task 223 | (right panel). Both performance (cross-entropy loss between predicted and 224 | actual character) and number of neurons are shown as a function of time. Dark curves and shaded areas indicate median and inter-quartile range over 20 runs, respectively. The 225 | model settles on larger network size for the more complex problem. } 226 | \end{figure} 227 | 228 | 229 | \subsection{Performance and network size in hard and easy tasks} 230 | 231 | 232 | Results are shown in Figure \ref{fig:easyandhard}. We show both median 233 | performance (cross-entropy loss) and median number of neurons as a function of 234 | time, over 20 runs. As expected, the hard problem leads to somewhat higher loss 235 | than the easy problem. Importantly, the hard problem elicits larger networks 236 | than the easy problem (37 neurons vs. 
14 neurons after 100000 learning cycles). 237 | Thus, the algorithm appropriately allocated more neurons to solve a more difficult 238 | task. 239 | 240 | An important question is whether the use of variable-size networks has an 241 | impact on performance. We compared the performance of our algorithm against 242 | fixed-size networks with various numbers of neurons, ranging from 10 to 100, 243 | including one with the same network size as was eventually preferred by our 244 | algorithm (i.e. 37 neurons). Results are shown in figure \ref{fig:fixedsize}, 245 | again showing the median loss among 20 runs as a function of time. 246 | Intriguingly, the variable-size network actually outperforms fixed-size 247 | networks of any size. This result may reflect the advantages of ``augmenting 248 | topologies'' (starting with a minimal network and only adding complexity as 249 | needed), as expounded in NEAT \cite{Stanley2002-ug}, at least for the simple 250 | problems tackled here. 251 | 252 | 253 | 254 | \subsection{Dynamical adjustment of network size in response to changing conditions} 255 | 256 | What happens if task difficulty suddenly changes? We tested our network by 257 | switching from the ``easy'' to the ``hard'' sequence after 33000 258 | cycles, and then back again to the ``easy'' sequence after 66000 cycles. 259 | Results are shown in Figure \ref{fig:easyhardeasy}. Interestingly, the network 260 | successfully handles the abrupt complexification of the problem by allocating 261 | more neurons. Following a large increase, the network then sheds off excess 262 | neurons, without damaging performance. This process continues when the problem 263 | switches back to the ``easy'' sequence (note that performance quickly returns 264 | to optimal levels). Thus, the network successfully adapts its size to the complexity of the problem at hand. 265 | 266 | 267 | \begin{figure}[t] 268 | \label{fig:fixedsize} 269 | \centering 270 | \includegraphics[scale=0.9]{figFS.png} 271 | \caption{Comparison of performance for variable and fixed size, for the 272 | ``hard'' problem. The thick black line shows variable-size network performance 273 | and is identical to the blue curve in Fig. \ref{fig:easyandhard}, right panel. 274 | Thin colored curves indicate performance of fixed-size networks of various 275 | sizes. Curves show medians over 20 276 | runs; inter-quartile ranges (not shown for clarity) are comparable to those 277 | seen in Fig. \ref{fig:easyandhard}. Variable-size networks outperform fixed-size 278 | networks for the problem described here. } 279 | \end{figure} 280 | 281 | \begin{figure}[b] 282 | \label{fig:easyhardeasy} 283 | \centering 284 | \includegraphics[scale=0.9]{figEHE.png} 285 | \caption{Dynamic adjustment of network size in response to abrupt complexification and simplification of an ongoing task.} 286 | \end{figure} 287 | 288 | 289 | 290 | \section{Conclusions and future work} 291 | 292 | We have described a method through which the size of a recurrent network can be 293 | modified by gradient descent. The method described here can successfully build 294 | networks of appropriate size to handle simple problems. This simple method 295 | immediately suggests several alternatives and possible extensions. 296 | 297 | For example, deletion of neurons could be biased by neuron ``age'' (i.e. how 298 | long the neuron has been present), rather than being random. 
Deleted neurons 299 | could be partially preserved, so that newly added neurons could actually 300 | inherit connectivity of previously deleted ones, rather than being randomly 301 | initialized. Such adaptations were not necessary for the problems considered 302 | here, but might be considered in future applications to more challenging tasks. 303 | 304 | The method described here extends naturally to layered feedforward networks. 305 | Within each layer, the method can be applied essentially unchanged to adjust 306 | layer size. The number of layers can also be made differentiable, by adding and deleting 307 | \textit{residual} layers \cite{He2015-gk} with initially low 308 | pre-additive output weights. These residual layers, which would initially have minimal impact 309 | on the network's output, would play the same role as sub-threshold neurons in 310 | the method described above. 311 | Similarly, by considering each layer as a higher-order 312 | ``node'', subject to a global outgoing norm penalty, the method described above could in 313 | principle be extended to arbitrary networks composed of multiple areas, with 314 | arbitrary connectivity between areas. Further work is needed to assess the 315 | practicality of these and other possible extensions. 316 | 317 | 318 | \small 319 | 320 | \printbibliography 321 | 322 | \end{document} 323 | -------------------------------------------------------------------------------- /paper/smallbiblio.bib: -------------------------------------------------------------------------------- 1 | 2 | @INPROCEEDINGS{Bergstra2013-lr, 3 | title = "Hyperopt: A Python Library for Optimizing the Hyperparameters of Machine Learning Algorithms", 4 | booktitle = "Proceedings of the 12th Python in Science Conference", 5 | author = "Bergstra, James and Yamins, Dan and Cox, David D", 6 | editor = "der Walt, St\'{e}fan van and Millman, Jarrod and Huff, Katy", 7 | pages = "13--20", 8 | year = 2013 9 | } 10 | 11 | @ARTICLE{Stanley2002-ug, 12 | title = "Evolving neural networks through augmenting topologies", 13 | author = "Stanley, Kenneth O and Miikkulainen, Risto", 14 | affiliation = "Department of Computer Sciences, The University of Texas at Austin, Austin, TX 78712, USA. kstanley@cs.utexas.edu", 15 | abstract = "An important question in neuroevolution is how to gain an advantage from evolving neural network topologies along with weights. We present a method, NeuroEvolution of Augmenting Topologies (NEAT), which outperforms the best fixed-topology method on a challenging benchmark reinforcement learning task. We claim that the increased efficiency is due to (1) employing a principled method of crossover of different topologies, (2) protecting structural innovation using speciation, and (3) incrementally growing from minimal structure. We test this claim through a series of ablation studies that demonstrate that each component is necessary to the system as a whole and to each other. What results is significantly faster learning. NEAT is also an important contribution to GAs because it shows how it is possible for evolution to both optimize and complexify solutions simultaneously, offering the possibility of evolving increasingly complex solutions over generations, and strengthening the analogy with biological evolution.", 16 | journal = "Evol. 
Comput.", 17 | volume = 10, 18 | number = 2, 19 | pages = "99--127", 20 | year = 2002, 21 | language = "en" 22 | } 23 | 24 | @ARTICLE{He2015-gk, 25 | title = "Deep Residual Learning for Image Recognition", 26 | author = "He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian", 27 | abstract = "Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57\% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28\% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC \& COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.", 28 | month = "10~" # dec, 29 | year = 2015, 30 | archivePrefix = "arXiv", 31 | primaryClass = "cs.CV", 32 | eprint = "1512.03385" 33 | } 34 | 35 | % The entry below contains non-ASCII chars that could not be converted 36 | % to a LaTeX equivalent. 37 | @ARTICLE{Yamins2014-us, 38 | title = "Performance-optimized hierarchical models predict neural responses in higher visual cortex", 39 | author = "Yamins, Daniel L K and Hong, Ha and Cadieu, Charles F and Solomon, Ethan A and Seibert, Darren and DiCarlo, James J", 40 | abstract = "The ventral visual stream underlies key human visual object recognition abilities. However, neural encoding in the higher areas of the ventral stream remains poorly understood. Here, we describe a modeling approach that yields a quantitatively accurate model of inferior temporal (IT) cortex, the highest ventral cortical area. Using high-throughput computational techniques, we discovered that, within a class of biologically plausible hierarchical neural network models, there is a strong correlation between a model’s categorization performance and its ability to predict individual IT neural unit response data. To pursue this idea, we then identified a high-performing neural network that matches human performance on a range of recognition tasks. Critically, even though we did not constrain this model to match neural data, its top output layer turns out to be highly predictive of IT spiking responses to complex naturalistic images at both the single site and population levels. Moreover, the model’s intermediate layers are highly predictive of neural responses in the V4 cortex, a midlevel visual area that provides the dominant cortical input to IT. These results show that performance optimization---applied in a biologically appropriate model class---can be used to build quantitative predictive models of neural processing.", 41 | journal = "Proc. Natl. Acad. Sci. U. S. 
A.", 42 | volume = 111, 43 | number = 23, 44 | pages = "8619--8624", 45 | month = "10~" # jun, 46 | year = 2014, 47 | language = "en" 48 | } 49 | 50 | @ARTICLE{Olshausen1996-vz, 51 | title = "Emergence of simple-cell receptive field properties by learning a sparse code for natural images", 52 | author = "Olshausen, Bruno A and Field, David J", 53 | journal = "Nature", 54 | volume = 381, 55 | number = 6583, 56 | pages = "607--609", 57 | month = "13~" # jun, 58 | year = 1996, 59 | language = "en" 60 | } 61 | -------------------------------------------------------------------------------- /rnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Differentiable-structure RNN, by Thomas Miconi. 3 | 4 | Mostly based on minimal character-level Vanilla RNN model by Andrej Karpathy 5 | (@karpathy): https://gist.github.com/karpathy/d4dee566867f8291f086 6 | 7 | BSD License 8 | 9 | """ 10 | import numpy as np 11 | import math 12 | import sys 13 | 14 | # Global meta-parameters, modifiable by command line 15 | g = { 16 | 'ADDDEL': 1, 17 | 'ETA' : .01, 18 | 'NBNEUR': 40, # Number of neurons for fixed-size experiments (ignored if adddel is 1) 19 | 'MAXDW': .01, 20 | 'DIR' : '.', # The directory of input text files 21 | 'NBSTEPS' : 100000, 22 | 'COEFFWPEN' : 1e-4, 23 | 'EXPTYPE' : 'HARD', 24 | 'DELETIONTHRESHOLD': .05, 25 | 'MINMULTIP': .025, # Must be lower than DELETIONTHRESHOLD ! NOTE: Has no effect in the current version of the code. 26 | 'NBMARGIN' : 1, 27 | 'PROBADEL': .05, 28 | 'PROBAADD': .01, 29 | 'RNGSEED' : 0 30 | } 31 | 32 | # Command line parameters parsing 33 | 34 | argpairs = [sys.argv[i:i+2] for i in range(1, len(sys.argv), 2)] 35 | for argpair in argpairs: 36 | if not (argpair[0] in g): 37 | raise Exception("Error, tried to pass value of non-existent parameter "+argpair[0]) 38 | if argpair[0] == 'EXPTYPE' or argpair[0] == 'DIR': 39 | g[argpair[0]] = argpair[1] 40 | else: 41 | g[argpair[0]] = float(argpair[1]) 42 | 43 | if (g['EXPTYPE'] not in ['HARD', 'EASY', 'HARDEASY', 'EASYHARDEASY']): 44 | raise Exception('Wrong EXPTYPE value') 45 | g['NBMARGIN'] = int(g['NBMARGIN']) 46 | g['RNGSEED'] = int(g['RNGSEED']) 47 | print g 48 | 49 | np.random.seed(g['RNGSEED']) 50 | 51 | 52 | # data I/O 53 | myf = open("output.txt", "w") 54 | myf.close() 55 | if (g['EXPTYPE'] == 'EASY') | (g['EXPTYPE'] == 'EASYHARDEASY'): 56 | data = open(g['DIR'] + '/inputeasy.txt', 'r').read() # should be simple plain text file 57 | else: 58 | data = open(g['DIR'] + '/inputhard.txt', 'r').read() # should be simple plain text file 59 | chars = list(set(data)) 60 | data_size, vocab_size = len(data), len(chars) 61 | print 'data has', data_size, 'characters,', vocab_size, 'unique.'# % (data_size, vocab_size) 62 | char_to_ix = { ch:i for i,ch in enumerate(chars) } 63 | ix_to_char = { i:ch for i,ch in enumerate(chars) } 64 | 65 | # hyperparameters 66 | MAX_HIDDEN_SIZE = 100 # Maximum size of hidden layer of neurons (same as fixed size in original min-char-rnn.py) 67 | if g['ADDDEL']: 68 | hidden_size = 1 # size of hidden layer of neurons - start from 1 node. 69 | else: 70 | hidden_size = g['NBNEUR'] # fixed size 71 | seq_length = 40 # number of steps to unroll the RNN for 72 | learning_rate = g['ETA'] 73 | 74 | # network parameters 75 | Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden 76 | Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden 77 | Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden (after multiplier) to output. 
See below 78 | bh = np.zeros((hidden_size, 1)) # hidden bias 79 | by = np.zeros((vocab_size, 1)) # output bias 80 | normz = np.zeros_like(bh) 81 | 82 | ages = np.zeros(hidden_size) # Ages of all neurons. Not used at present. 83 | 84 | def lossFun(inputs, targets, hprev): 85 | """ 86 | inputs,targets are both list of integers. 87 | hprev is Hx1 array of initial hidden state 88 | returns the loss, gradients on model parameters, and last hidden state 89 | """ 90 | xs, hs, ys, ps = {}, {}, {}, {} 91 | hs[-1] = np.copy(hprev) 92 | loss = 0 93 | # forward pass 94 | for t in xrange(len(inputs)): 95 | xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation 96 | xs[t][inputs[t]] = 1 97 | 98 | hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state 99 | ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars 100 | ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars 101 | loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss) 102 | 103 | # backward pass: compute gradients going backwards 104 | dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) 105 | dbh, dby = np.zeros_like(bh), np.zeros_like(by) 106 | dhnext = np.zeros_like(hs[0]) 107 | for t in reversed(xrange(len(inputs))): 108 | dy = np.copy(ps[t]) 109 | dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here 110 | dWhy += np.dot(dy, hs[t].T) 111 | dby += dy 112 | dh = np.dot(Why.T, dy) + dhnext 113 | dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity 114 | dbh += dhraw 115 | dWxh += np.dot(dhraw, xs[t].T) 116 | dWhh += np.dot(dhraw, hs[t-1].T) 117 | dhnext = np.dot(Whh.T, dhraw) 118 | #for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 119 | # np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients - clipping is actually done just before the update, after learning_rate has been applied - see below 120 | return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] 121 | 122 | def sample(h, seed_ix, n): 123 | """ 124 | sample a sequence of integers from the model 125 | h is memory state, seed_ix is seed letter for first time step 126 | """ 127 | x = np.zeros((vocab_size, 1)) 128 | x[seed_ix] = 1 129 | ixes = [] 130 | for t in xrange(n): 131 | h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) 132 | y = np.dot(Why, h) + by 133 | p = np.exp(y) / np.sum(np.exp(y)) 134 | ix = np.random.choice(range(vocab_size), p=p.ravel()) 135 | x = np.zeros((vocab_size, 1)) 136 | x[ix] = 1 137 | ixes.append(ix) 138 | return ixes 139 | 140 | n, p = 0, 0 141 | mWxh, mWhh, mWhy = .01 * np.ones_like(Wxh), .01 * np.zeros_like(Whh), .01 * np.zeros_like(Why) 142 | mbh, mby = .01 * np.ones_like(bh), .01 * np.zeros_like(by) # memory variables for RMSProp 143 | smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 144 | 145 | 146 | while True: 147 | # prepare inputs (we're sweeping from left to right in steps seq_length long) 148 | if p+seq_length+1 >= len(data) or n == 0: 149 | hprev = np.zeros((hidden_size,1)) # reset RNN memory 150 | p = 0 # go from start of data 151 | inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]] 152 | targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] 153 | 154 | # sample from the model now and then 155 | if n % 100 == 0: 156 | sample_ix = sample(hprev, inputs[0], 200) 157 | txt = ''.join(ix_to_char[ix] for ix in sample_ix) 158 | print '----\n %s \n----' % (txt, ) 159 | 160 | # forward seq_length characters 
through the net and fetch gradient 161 | loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev) 162 | smooth_loss = smooth_loss * 0.99 + loss * 0.01 163 | if n % 100 == 0: 164 | print 'iter %d, position in data %d, loss: %f , nb hidden neurons %d, sum-abs norms: %f' % (n, p, smooth_loss, hidden_size, sum(abs(normz))), # print progress 165 | print normz.T 166 | if n % 1000 == 0: 167 | with open("output.txt", "a") as myf: 168 | msg = "%d %d %f %d %f" % (n, p, smooth_loss, hidden_size, sum(abs(normz))) # print progress 169 | myf.write(msg+"\n") 170 | 171 | 172 | # perform parameter update with Adagrad 173 | 174 | for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 175 | [dWxh, dWhh, dWhy, dbh, dby], 176 | [mWxh, mWhh, mWhy, mbh, mby]): 177 | # mem += dparam * dparam # Adagrad 178 | mem += .01 * (dparam * dparam - mem) # RMSProp 179 | RMSdelta = -learning_rate * dparam / np.sqrt(mem + 1e-8) # RMSProp update 180 | np.clip(RMSdelta, -g['MAXDW'], g['MAXDW'], out = RMSdelta) # Clipping the weight modifications 181 | param += RMSdelta 182 | 183 | # Note that 1-norm penalty on weights is applied even for fized-size! If you want to have no penalty, set COEFFWPEN to 0 (but this will decrease performance). 184 | Why -= g['COEFFWPEN'] * np.sign(Why) 185 | Whh -= g['COEFFWPEN'] * np.sign(Whh) 186 | 187 | # Computing the L1-norm of outgoing weights for each neuron. 188 | # The norm of lateral weights is scaled by the number of neurons and multiplied by 4, so it should remain roughly similar to the norm of feedforward weights as the network changes size (there are 4 output neurons) 189 | normz = .5 * (np.sum(np.abs(Why), axis = 0) + 4.0 * np.sum(np.abs(Whh), axis = 0) / hidden_size) 190 | 191 | 192 | if g['ADDDEL']: 193 | 194 | 195 | # Neuron addition / deletion 196 | # Deletable neurons are those whose outgoing weights fall below a certain threshold in L1-norm. 197 | # We want to delete excess below-threshold neurons, keeping only NBMARGIN below-threshold neuron at any time; or add one new neuron if no below-threshold neuron remains. (Both with a certain probability) 198 | 199 | ages += 1 200 | 201 | #normz[normz < g['MINMULTIP']] = g['MINMULTIP'] # outgoing weight norms are clipped from below 202 | 203 | 204 | # Which neurons are above threshold ('selected' for preservation) ? 205 | sel = abs(normz) > g['DELETIONTHRESHOLD']#[0] # | (ages < 500) 206 | 207 | if sum(sel) < hidden_size - g['NBMARGIN'] : 208 | 209 | # Preserve 1-PROBADEL% of the below-threshold neurons, in addition to NBMARGIN below-threshold neurons (NBMARGIN is usually set to 1). 210 | # (Perhaps select the most recent neurons for deletion? Future work.) 
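# The bookkeeping below shuffles the below-threshold neurons, force-preserves
# NBMARGIN of them, then preserves each remaining one with probability
# 1 - PROBADEL; every weight matrix, bias, hidden state and RMSProp memory
# variable is then sliced with the same boolean mask 'sel'.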
211 | deletable = np.where(sel == False)[0] 212 | np.random.shuffle(deletable) 213 | for xx in range(g['NBMARGIN']): 214 | sel[deletable[xx]] = True 215 | deletable = deletable[g['NBMARGIN']:] 216 | for x in deletable: 217 | if np.random.rand() > g['PROBADEL']: # Note that this is a test for preservation rather than deletion, hence the > 218 | sel[x] = True 219 | 220 | 221 | # Delete all other deletable neurons 222 | hidden_size = sum(sel) 223 | Whh = Whh[sel,:][:, sel] 224 | Wxh = Wxh[sel, :] 225 | normz = normz[sel] 226 | Why = Why[:, sel] 227 | bh = bh[sel] 228 | hprev = hprev[sel] 229 | ages = ages[sel] 230 | 231 | mWxh = mWxh[sel, :] 232 | mWhh = mWhh[sel,:][:, sel] 233 | mWhy = mWhy[:, sel] 234 | mbh = mbh[sel] 235 | 236 | 237 | # Addition of new neurons, if appropriate: 238 | if hidden_size < MAX_HIDDEN_SIZE -1: 239 | if ( (sum((abs(normz) > g['DELETIONTHRESHOLD'])) > hidden_size - g['NBMARGIN']) & (np.random.rand() < g['PROBAADD'])) \ 240 | | (np.random.rand() < 1e-4): 241 | 242 | Whh = np.append(Whh, np.random.randn(1, hidden_size)*0.01, axis=0) 243 | Wxh = np.append(Wxh, np.random.randn(1, vocab_size)*0.01, axis=0) 244 | 245 | # The (absolute values of) outgoing weights of the added neuron must sum to g['DELETIONTHRESHOLD'] 246 | newWhy = np.random.randn(vocab_size,1) 247 | newWhy = .5 * g['DELETIONTHRESHOLD'] * newWhy / (1e-8 + np.sum(abs(newWhy))) 248 | Why = np.append(Why, newWhy, axis=1) 249 | 250 | newWhh = np.random.randn(hidden_size+1, 1) 251 | newWhh = .5 * hidden_size * g['DELETIONTHRESHOLD'] * newWhh / (1e-8 + 4.0 * np.sum(abs(newWhh))) 252 | #newWhh *= .01 253 | Whh = np.append(Whh, newWhh, axis=1) 254 | 255 | bh = np.append(bh, np.zeros((1,1)), axis=0) 256 | hprev = np.append(hprev, np.zeros((1,1)), axis=0) 257 | #normz = np.append(normz, g['DELETIONTHRESHOLD'] ) 258 | ages = np.append(ages, 0) 259 | 260 | mWhh = np.append(mWhh, .01 * np.ones((1, hidden_size)), axis=0) 261 | mWhh = np.append(mWhh, .01 * np.ones((hidden_size+1, 1)), axis=1) 262 | mWxh = np.append(mWxh, .01 * np.ones((1, vocab_size)), axis=0) 263 | mWhy = np.append(mWhy, .01 * np.ones((vocab_size,1)), axis=1) 264 | mbh = np.append(mbh, .01 * np.ones((1,1)), axis=0) 265 | 266 | hidden_size += 1 267 | print "Adding Neuron" 268 | 269 | 270 | 271 | p += seq_length # move data pointer 272 | n += 1 # iteration counter 273 | if (n == int(g['NBSTEPS'] / 3)) & (g['EXPTYPE'] == 'EASYHARDEASY'): 274 | data = open(g['DIR'] + '/inputhard.txt', 'r').read() # should be simple plain text file 275 | p = 0 276 | if (n == int(g['NBSTEPS'] / 2)) & (g['EXPTYPE'] == 'HARDEASY'): 277 | data = open(g['DIR'] + '/inputeasy.txt', 'r').read() # should be simple plain text file 278 | p = 0 279 | if (n == int(2 * g['NBSTEPS'] / 3)) & (g['EXPTYPE'] == 'EASYHARDEASY'): 280 | data = open(g['DIR'] + '/inputeasy.txt', 'r').read() # should be simple plain text file 281 | p = 0 282 | if n > g['NBSTEPS']: 283 | print "Done!" 284 | sys.exit(0) 285 | 286 | -------------------------------------------------------------------------------- /rnn.py.prev: -------------------------------------------------------------------------------- 1 | """ 2 | Differentiable-structure RNN, by Thomas Miconi. 
3 | 4 | Largely based on minimal character-level Vanilla RNN model by Andrej Karpathy (@karpathy): https://gist.github.com/karpathy/d4dee566867f8291f086 5 | 6 | BSD License 7 | 8 | """ 9 | import numpy as np 10 | import math 11 | import sys 12 | 13 | # Global meta-parameters, modifiable by command line 14 | g = { 15 | 'NBSTEPS' : 300000, 16 | 'COEFFMULTIPNORM' : 3e-5, 17 | 'EXPTYPE' : 'HARD', 18 | 'DELETIONTHRESHOLD': .05, 19 | 'MINMULTIP': .025, # Must be lower than DELETIONTHRESHOLD ! 20 | 'NBMARGIN' : 1, 21 | 'PROBADEL': .25, 22 | 'PROBAADD': .05, 23 | 'RNGSEED' : 0 24 | } 25 | 26 | # Command line parameters parsing 27 | 28 | argpairs = [sys.argv[i:i+2] for i in range(1, len(sys.argv), 2)] 29 | for argpair in argpairs: 30 | if not (argpair[0] in g): 31 | sys.exit("Error, tried to pass value of non-existent parameter "+argpair[0]) 32 | if argpair[0] == 'EXPTYPE': 33 | g['EXPTYPE'] = argpair[1] 34 | else: 35 | g[argpair[0]] = float(argpair[1]) 36 | 37 | if (g['EXPTYPE'] not in ['HARD', 'EASY', 'HARDEASY', 'EASYHARDEASY']): 38 | sys.exit('Wrong EXPTYPE value') 39 | g['NBMARGIN'] = int(g['NBMARGIN']) 40 | g['RNGSEED'] = int(g['RNGSEED']) 41 | print g 42 | 43 | np.random.seed(g['RNGSEED']) 44 | 45 | 46 | # data I/O 47 | # NOTE: the input files are specified two directories up because I generally use the program with a different working directory. Modify as needed. 48 | myf = open("test.txt", "w") 49 | myf.close() 50 | if (g['EXPTYPE'] == 'EASY') | (g['EXPTYPE'] == 'EASYHARDEASY'): 51 | data = open('./inputeasy.txt', 'r').read() # should be simple plain text file 52 | else: 53 | data = open('./inputhard.txt', 'r').read() # should be simple plain text file 54 | chars = list(set(data)) 55 | data_size, vocab_size = len(data), len(chars) 56 | print 'data has', data_size, 'characters,', vocab_size, 'unique.'# % (data_size, vocab_size) 57 | char_to_ix = { ch:i for i,ch in enumerate(chars) } 58 | ix_to_char = { i:ch for i,ch in enumerate(chars) } 59 | 60 | # hyperparameters 61 | MAX_HIDDEN_SIZE = 100 # Maximum size of hidden layer of neurons (same as fixed size in original min-char-rnn.py) 62 | hidden_size = 1 # size of hidden layer of neurons - start from 1 node. 63 | seq_length = 40 # number of steps to unroll the RNN for 64 | learning_rate = 1e-1 65 | 66 | # network parameters 67 | Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden 68 | Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden 69 | multips = .001 * np.ones((hidden_size, 1)); # multipliers 70 | multips[0,0] = 1.0 # Start with a multiplier of 1 on the single starting node. 71 | Wiy = np.random.randn(vocab_size, hidden_size)*0.01 # hidden (after multiplier) to output. See below 72 | bh = np.zeros((hidden_size, 1)) # hidden bias 73 | by = np.zeros((vocab_size, 1)) # output bias 74 | 75 | ages = np.zeros(hidden_size) # Ages of all neurons. Not used at present. 76 | 77 | def lossFun(inputs, targets, hprev): 78 | """ 79 | inputs,targets are both list of integers. 
80 | hprev is Hx1 array of initial hidden state 81 | returns the loss, gradients on model parameters, and last hidden state 82 | """ 83 | xs, hs, intoys, ys, ps = {}, {}, {}, {}, {} 84 | hs[-1] = np.copy(hprev) 85 | loss = 0 86 | # forward pass 87 | for t in xrange(len(inputs)): 88 | xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation 89 | xs[t][inputs[t]] = 1 90 | 91 | hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state 92 | intoys[t] = multips * hs[t] # "intoy" is the output of the hidden layer after the multipliers, which is to be fed "into" y (through the Wiy weight matrix) 93 | ys[t] = np.dot(Wiy, intoys[t]) + by # unnormalized log probabilities for next chars 94 | ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars 95 | loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss) 96 | 97 | # backward pass: compute gradients going backwards 98 | dWxh, dWhh, dmultips, dWiy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(multips), np.zeros_like(Wiy) 99 | dbh, dby = np.zeros_like(bh), np.zeros_like(by) 100 | dhnext = np.zeros_like(hs[0]) 101 | for t in reversed(xrange(len(inputs))): 102 | dy = np.copy(ps[t]) 103 | dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here 104 | dWiy += np.dot(dy, intoys[t].T) 105 | dby += dy 106 | dintoy = np.dot(Wiy.T, dy) # dE/dIntoY, as a function of dE/dy 107 | 108 | # Gradient to be applied to the multipliers 109 | dmultips += (1.0 * dintoy * multips # This part descends the error gradient 110 | + g['COEFFMULTIPNORM'] * np.sign(multips)) # L1-norm regularization. The derivative of abs(x) is sign(x). Thus, descending the gradient of abs(x) over x is simply subtracting a constant multiple of sign(x). 111 | # + .001 * multips) # This would add an L2-regularization term, which we don't use here. 
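# Backprop into the hidden state: the gradient arriving from the output side
# (dintoy) is scaled elementwise by the multipliers, then the recurrent
# gradient from the next time step (dhnext) is added.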
112 | 113 | dh = dintoy * multips + dhnext 114 | dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity 115 | dbh += dhraw 116 | dWxh += np.dot(dhraw, xs[t].T) 117 | dWhh += np.dot(dhraw, hs[t-1].T) 118 | dhnext = np.dot(Whh.T, dhraw) 119 | for dparam in [dWxh, dWhh, dmultips, dWiy, dbh, dby]: 120 | np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 121 | return loss, dWxh, dWhh, dmultips, dWiy, dbh, dby, hs[len(inputs)-1] 122 | 123 | def sample(h, seed_ix, n): 124 | """ 125 | sample a sequence of integers from the model 126 | h is memory state, seed_ix is seed letter for first time step 127 | """ 128 | x = np.zeros((vocab_size, 1)) 129 | x[seed_ix] = 1 130 | ixes = [] 131 | for t in xrange(n): 132 | h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) 133 | y = np.dot(Wiy, multips * h) + by 134 | p = np.exp(y) / np.sum(np.exp(y)) 135 | ix = np.random.choice(range(vocab_size), p=p.ravel()) 136 | x = np.zeros((vocab_size, 1)) 137 | x[ix] = 1 138 | ixes.append(ix) 139 | return ixes 140 | 141 | n, p = 0, 0 142 | mWxh, mWhh, mmultips, mWiy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(multips), np.zeros_like(Wiy) 143 | mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad 144 | smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 145 | 146 | 147 | while True: 148 | # prepare inputs (we're sweeping from left to right in steps seq_length long) 149 | if p+seq_length+1 >= len(data) or n == 0: 150 | hprev = np.zeros((hidden_size,1)) # reset RNN memory 151 | p = 0 # go from start of data 152 | inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]] 153 | targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] 154 | 155 | # sample from the model now and then 156 | if n % 100 == 0: 157 | sample_ix = sample(hprev, inputs[0], 200) 158 | txt = ''.join(ix_to_char[ix] for ix in sample_ix) 159 | print '----\n %s \n----' % (txt, ) 160 | 161 | # forward seq_length characters through the net and fetch gradient 162 | loss, dWxh, dWhh, dmultips, dWiy, dbh, dby, hprev = lossFun(inputs, targets, hprev) 163 | smooth_loss = smooth_loss * 0.999 + loss * 0.001 164 | if n % 100 == 0: 165 | print 'iter %d, position in data %d, loss: %f , nb hidden neurons %d, sum-abs multips: %f' % (n, p, smooth_loss, hidden_size, sum(abs(multips))), # print progress 166 | print multips.T 167 | if n % 1000 == 0: 168 | with open("output.txt", "a") as myf: 169 | msg = "%d %d %f %d %f" % (n, p, smooth_loss, hidden_size, sum(abs(multips))) # print progress 170 | myf.write(msg+"\n") 171 | 172 | 173 | # perform parameter update with Adagrad 174 | for param, dparam, mem in zip([Wxh, Whh, multips, Wiy, bh, by], 175 | [dWxh, dWhh, dmultips, dWiy, dbh, dby], 176 | [mWxh, mWhh, mmultips, mWiy, mbh, mby]): 177 | mem += dparam * dparam 178 | param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update 179 | 180 | 181 | # Neuron addition / deletion 182 | # Deletable neurons are those whose multipliers fall below threshold. 183 | # We want to delete excess below-threshold neurons, keeping only NBMARGIN below-threshold neuron at any time; or add one new neuron if no below-threshold neuron remains. 184 | 185 | ages += 1 186 | 187 | multips[multips < g['MINMULTIP']] = g['MINMULTIP'] # multipliers are clipped from below 188 | 189 | 190 | # Which neurons are above threshold ('selected' for preservation) ? 
191 | sel = (abs(multips) > g['DELETIONTHRESHOLD'])[:,0] # | (ages < 500) 192 | 193 | if sum(sel) < hidden_size - g['NBMARGIN'] : 194 | 195 | # Preserve 1-PROBADEL% of the below-threshold neurons, in addition to NBMARGIN below-threshold neurons (NBMARGIN is usually set to 1). 196 | # (Perhaps select the most recent neurons for deletion? Future work.) 197 | deletable = np.where(sel == False)[0] 198 | np.random.shuffle(deletable) 199 | for xx in range(g['NBMARGIN']): 200 | sel[deletable[xx]] = True 201 | deletable = deletable[g['NBMARGIN']:] 202 | for x in deletable: 203 | if np.random.rand() > g['PROBADEL']: # Note that this is a test for preservation rather than deletion, hence the > 204 | sel[x] = True 205 | 206 | 207 | # Delete all other deletable neurons 208 | hidden_size = sum(sel) 209 | Whh = Whh[sel,:][:, sel] 210 | Wxh = Wxh[sel, :] 211 | multips = multips[sel] 212 | Wiy = Wiy[:, sel] 213 | bh = bh[sel] 214 | hprev = hprev[sel] 215 | ages = ages[sel] 216 | 217 | mWxh = mWxh[sel, :] 218 | mWhh = mWhh[sel,:][:, sel] 219 | mmultips = mmultips[sel] 220 | mWiy = mWiy[:, sel] 221 | mbh = mbh[sel] 222 | 223 | if hidden_size < MAX_HIDDEN_SIZE -1: 224 | if ( (sum((abs(multips) > g['DELETIONTHRESHOLD'])[:,0]) > hidden_size - g['NBMARGIN']) & (np.random.rand() < g['PROBAADD'])) \ 225 | | (np.random.rand() < 1e-4): 226 | # Add a new neuron 227 | Whh = np.append(Whh, np.random.randn(1, hidden_size)*0.01, axis=0) 228 | Whh = np.append(Whh, np.random.randn(hidden_size+1, 1)*0.01, axis=1) 229 | Wxh = np.append(Wxh, np.random.randn(1, vocab_size)*0.01, axis=0) 230 | Wiy = np.append(Wiy, np.random.randn(vocab_size,1)*0.01, axis=1) 231 | bh = np.append(bh, np.zeros((1,1)), axis=0) 232 | hprev = np.append(hprev, np.zeros((1,1)), axis=0) 233 | multips = np.append(multips, g['DELETIONTHRESHOLD'] * np.ones((1,1)), axis=0) # Initial multiplier for new neurons is set to deletion threshold 234 | ages = np.append(ages, 0) 235 | 236 | mWhh = np.append(mWhh, np.zeros((1, hidden_size)), axis=0) 237 | mWhh = np.append(mWhh, np.zeros((hidden_size+1, 1)), axis=1) 238 | mWxh = np.append(mWxh, np.zeros((1, vocab_size)), axis=0) 239 | mWiy = np.append(mWiy, np.zeros((vocab_size,1)), axis=1) 240 | mbh = np.append(mbh, np.zeros((1,1)), axis=0) 241 | mmultips = np.append(mmultips, np.zeros((1,1)), axis=0) 242 | 243 | hidden_size += 1 244 | print "Adding Neuron" 245 | 246 | 247 | 248 | p += seq_length # move data pointer 249 | n += 1 # iteration counter 250 | if (n == 100000) & (g['EXPTYPE'] == 'EASYHARDEASY'): 251 | data = open('./inputhard.txt', 'r').read() # should be simple plain text file 252 | p = 0 253 | if (n == 100000) & (g['EXPTYPE'] == 'HARDEASY'): 254 | data = open('./inputeasy.txt', 'r').read() # should be simple plain text file 255 | p = 0 256 | if (n == 200000) & (g['EXPTYPE'] == 'EASYHARDEASY'): 257 | data = open('./inputeasy.txt', 'r').read() # should be simple plain text file 258 | p = 0 259 | if n > g['NBSTEPS']: 260 | sys.exit(0) 261 | 262 | -------------------------------------------------------------------------------- /rnnAltern.py: -------------------------------------------------------------------------------- 1 | """ 2 | Differentiable-structure RNN, by Thomas Miconi. 3 | 4 | This is an alternative version in which the multipliers apply directly to the 5 | output of each hidden neuron, and thus also affect the recurrent connections. 6 | It also works, but produces noticeably lower performance. 
7 | 8 | Largely based on minimal character-level Vanilla RNN model by Andrej Karpathy (@karpathy): https://gist.github.com/karpathy/d4dee566867f8291f086 9 | 10 | REMINDER: if you modify something in the forward pass, remember to modify it also in the sampling function! 11 | 12 | BSD License 13 | 14 | """ 15 | import numpy as np 16 | import math 17 | import sys 18 | 19 | # Global meta-parameters, modifiable by command line 20 | g = { 21 | 'DIR': '../..', 22 | 'NBSTEPS' : 300000, 23 | 'COEFFMULTIPGRAD' : 1.0, 24 | 'COEFFMULTIPNORM' : 3e-5, 25 | 'EXPTYPE' : 'EASYHARDEASY', 26 | 'DELETIONTHRESHOLD': .05, 27 | 'MINMULTIP':.025, # Must be lower than DELETIONTHRESHOLD ! 28 | 'NBMARGIN' : 1, 29 | 'PROBADEL': .25, 30 | 'PROBAADD': .05, 31 | 'RNGSEED' : 0 32 | } 33 | 34 | # Command line parameters parsing 35 | 36 | argpairs = [sys.argv[i:i+2] for i in range(1, len(sys.argv), 2)] 37 | for argpair in argpairs: 38 | if not (argpair[0] in g): 39 | sys.exit("Error, tried to pass value of non-existent parameter "+argpair[0]) 40 | if (argpair[0] == 'EXPTYPE') or (argpair[0] == 'DIR'): 41 | g[argpair[0]] = argpair[1] 42 | else: 43 | g[argpair[0]] = float(argpair[1]) 44 | 45 | if (g['EXPTYPE'] not in ['HARD', 'EASY', 'HARDEASY', 'EASYHARDEASY']): 46 | sys.exit('Wrong EXPTYPE value') 47 | g['NBMARGIN'] = int(g['NBMARGIN']) 48 | g['RNGSEED'] = int(g['RNGSEED']) 49 | print g 50 | 51 | np.random.seed(g['RNGSEED']) 52 | 53 | 54 | # data I/O 55 | # NOTE: the input files are specified two directories up because I generally use the program with a different working directory. Modify as needed. 56 | myf = open("test.txt", "w") 57 | myf.close() 58 | if (g['EXPTYPE'] == 'EASY') | (g['EXPTYPE'] == 'EASYHARDEASY'): 59 | data = open(g['DIR']+'/inputeasy.txt', 'r').read() # should be simple plain text file 60 | else: 61 | #data = open('./inputhard.txt', 'r').read() # should be simple plain text file 62 | data = open(g['DIR']+'/inputhard.txt', 'r').read() # should be simple plain text file 63 | chars = list(set(data)) 64 | data_size, vocab_size = len(data), len(chars) 65 | print 'data has', data_size, 'characters,', vocab_size, 'unique.'# % (data_size, vocab_size) 66 | char_to_ix = { ch:i for i,ch in enumerate(chars) } 67 | ix_to_char = { i:ch for i,ch in enumerate(chars) } 68 | 69 | # hyperparameters 70 | MAX_HIDDEN_SIZE = 100 # Maximum size of hidden layer of neurons (same as fixed size in original min-char-rnn.py) 71 | hidden_size = 1 # 1 # size of hidden layer of neurons - start from 1 node. 72 | seq_length = 40 # number of steps to unroll the RNN for 73 | learning_rate = 1e-1 74 | 75 | # network parameters 76 | Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden 77 | Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden 78 | multips = .001 * np.ones((hidden_size, 1)); # multipliers 79 | multips.fill(1.0) # Start with a multiplier of 1 on the single starting node. 80 | 81 | Wiy = np.random.randn(vocab_size, hidden_size)*0.01 # hidden (after multiplier) to output. See below 82 | bh = np.zeros((hidden_size, 1)) # hidden bias 83 | by = np.zeros((vocab_size, 1)) # output bias 84 | 85 | ages = np.zeros(hidden_size) # Ages of all neurons. Not used at present. 86 | 87 | def lossFun(inputs, targets, postmultipprev): 88 | """ 89 | inputs,targets are both list of integers. 
90 | hprev is Hx1 array of initial hidden state 91 | returns the loss, gradients on model parameters, and last hidden state 92 | """ 93 | xs, hs, postmultips, ys, ps = {}, {}, {}, {}, {} 94 | postmultips[-1] = np.copy(postmultipprev) 95 | loss = 0 96 | # forward pass 97 | for t in xrange(len(inputs)): 98 | xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation 99 | xs[t][inputs[t]] = 1 100 | 101 | hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, postmultips[t-1]) + bh) # hidden state 102 | postmultips[t] = multips * hs[t] # "postmultip" is the output of the hidden layer after the multipliers, which is to be fed "into" y (through the Wiy weight matrix) 103 | ys[t] = np.dot(Wiy, postmultips[t]) + by # unnormalized log probabilities for next chars 104 | ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars 105 | loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss) 106 | 107 | # backward pass: compute gradients going backwards 108 | dWxh, dWhh, dmultips, dWiy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(multips), np.zeros_like(Wiy) 109 | dbh, dby = np.zeros_like(bh), np.zeros_like(by) 110 | dpostmultipnext = np.zeros_like(postmultips[0]) 111 | for t in reversed(xrange(len(inputs))): 112 | dy = np.copy(ps[t]) 113 | dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here 114 | dWiy += np.dot(dy, postmultips[t].T) 115 | dby += dy 116 | dpostmultip = np.dot(Wiy.T, dy) + dpostmultipnext # dE/dIntoY, as a function of dE/dy 117 | 118 | # Gradient to be applied to the multipliers 119 | dmultips += (g['COEFFMULTIPGRAD'] * dpostmultip * hs[t] # This part descends the error gradient 120 | + g['COEFFMULTIPNORM'] * np.sign(multips)) # L1-norm regularization. The derivative of abs(x) is sign(x). 121 | # + .001 * multips) # This would add an L2-regularization term, which we don't use here. 
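# Note: unlike rnn.py.prev, dpostmultip already includes the recurrent
# gradient (dpostmultipnext), because here the multipliers also gate the
# recurrent connections; only the elementwise scaling by multips remains.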
122 | 123 | dh = dpostmultip * multips 124 | dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity 125 | dbh += dhraw 126 | dWxh += np.dot(dhraw, xs[t].T) 127 | dWhh += np.dot(dhraw, postmultips[t-1].T) 128 | dpostmultipnext = np.dot(Whh.T, dhraw) 129 | for dparam in [dWxh, dWhh, dmultips, dWiy, dbh, dby]: 130 | np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients 131 | return loss, dWxh, dWhh, dmultips, dWiy, dbh, dby, postmultips[len(inputs)-1] 132 | 133 | def sample(h, seed_ix, n): 134 | """ 135 | sample a sequence of integers from the model 136 | h is memory state, seed_ix is seed letter for first time step 137 | """ 138 | x = np.zeros((vocab_size, 1)) 139 | x[seed_ix] = 1 140 | ixes = [] 141 | for t in xrange(n): 142 | h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, multips*h) + bh) 143 | y = np.dot(Wiy, multips * h) + by 144 | p = np.exp(y) / np.sum(np.exp(y)) 145 | ix = np.random.choice(range(vocab_size), p=p.ravel()) 146 | x = np.zeros((vocab_size, 1)) 147 | x[ix] = 1 148 | ixes.append(ix) 149 | return ixes 150 | 151 | n, p = 0, 0 152 | mWxh, mWhh, mmultips, mWiy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(multips), np.zeros_like(Wiy) 153 | mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad 154 | smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0 155 | 156 | 157 | while True: 158 | # prepare inputs (we're sweeping from left to right in steps seq_length long) 159 | if p+seq_length+1 >= len(data) or n == 0: 160 | postmultipprev = np.zeros((hidden_size,1)) # reset RNN memory 161 | p = 0 # go from start of data 162 | inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]] 163 | targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] 164 | 165 | # sample from the model now and then 166 | if n % 100 == 0: 167 | sample_ix = sample(postmultipprev, inputs[0], 200) 168 | txt = ''.join(ix_to_char[ix] for ix in sample_ix) 169 | print '----\n %s \n----' % (txt, ) 170 | 171 | # forward seq_length characters through the net and fetch gradient 172 | loss, dWxh, dWhh, dmultips, dWiy, dbh, dby, postmultipprev = lossFun(inputs, targets, postmultipprev) 173 | smooth_loss = smooth_loss * 0.999 + loss * 0.001 174 | if n % 100 == 0: 175 | print 'iter %d, position in data %d, loss: %f , nb hidden neurons %d, sum-abs multips: %f' % (n, p, smooth_loss, hidden_size, sum(abs(multips))), # print progress 176 | print multips.T 177 | if n % 1000 == 0: 178 | with open("output.txt", "a") as myf: 179 | msg = "%d %d %f %d %f" % (n, p, smooth_loss, hidden_size, sum(abs(multips))) # print progress 180 | myf.write(msg+"\n") 181 | 182 | 183 | # perform parameter update with Adagrad 184 | for param, dparam, mem in zip([Wxh, Whh, multips, Wiy, bh, by], 185 | [dWxh, dWhh, dmultips, dWiy, dbh, dby], 186 | [mWxh, mWhh, mmultips, mWiy, mbh, mby]): 187 | mem += dparam * dparam 188 | param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update 189 | 190 | 191 | # Neuron addition / deletion 192 | # Deletable neurons are those whose multipliers fall below threshold. 193 | # We want to delete excess below-threshold neurons, keeping only NBMARGIN below-threshold neuron at any time; or add one new neuron if no below-threshold neuron remains. 194 | 195 | ages += 1 196 | 197 | multips[multips < g['MINMULTIP']] = g['MINMULTIP'] # multipliers are clipped from below 198 | 199 | #""" 200 | 201 | # Addition and deletion of neurons 202 | # Which neurons are above threshold ('selected' for preservation) ? 
203 | sel = (abs(multips) > g['DELETIONTHRESHOLD'])[:,0] # | (ages < 500) 204 | 205 | if sum(sel) < hidden_size - g['NBMARGIN'] : 206 | 207 | # Preserve 1-PROBADEL% of the below-threshold neurons, in addition to NBMARGIN below-threshold neurons (NBMARGIN is usually set to 1). 208 | # (Perhaps select the most recent neurons for deletion? Future work.) 209 | deletable = np.where(sel == False)[0] 210 | np.random.shuffle(deletable) 211 | for xx in range(g['NBMARGIN']): 212 | sel[deletable[xx]] = True 213 | deletable = deletable[g['NBMARGIN']:] 214 | for x in deletable: 215 | if np.random.rand() > g['PROBADEL']: # Note that this is a test for preservation rather than deletion, hence the > 216 | sel[x] = True 217 | 218 | 219 | # Delete all other deletable neurons 220 | hidden_size = sum(sel) 221 | Whh = Whh[sel,:][:, sel] 222 | Wxh = Wxh[sel, :] 223 | multips = multips[sel] 224 | Wiy = Wiy[:, sel] 225 | bh = bh[sel] 226 | postmultipprev = postmultipprev[sel] 227 | ages = ages[sel] 228 | 229 | mWxh = mWxh[sel, :] 230 | mWhh = mWhh[sel,:][:, sel] 231 | mmultips = mmultips[sel] 232 | mWiy = mWiy[:, sel] 233 | mbh = mbh[sel] 234 | 235 | if hidden_size < MAX_HIDDEN_SIZE -1: 236 | if ( (sum((abs(multips) > g['DELETIONTHRESHOLD'])[:,0]) > hidden_size - g['NBMARGIN']) & (np.random.rand() < g['PROBAADD'])) \ 237 | | (np.random.rand() < 1e-4): 238 | # Add a new neuron 239 | Whh = np.append(Whh, np.random.randn(1, hidden_size)*0.01, axis=0) 240 | Whh = np.append(Whh, np.random.randn(hidden_size+1, 1)*0.01, axis=1) 241 | Wxh = np.append(Wxh, np.random.randn(1, vocab_size)*0.01, axis=0) 242 | Wiy = np.append(Wiy, np.random.randn(vocab_size,1)*0.01, axis=1) 243 | bh = np.append(bh, np.zeros((1,1)), axis=0) 244 | postmultipprev = np.append(postmultipprev, np.zeros((1,1)), axis=0) 245 | multips = np.append(multips, g['DELETIONTHRESHOLD'] * np.ones((1,1)), axis=0) # Initial multiplier for new neurons is set to deletion threshold 246 | ages = np.append(ages, 0) 247 | 248 | mWhh = np.append(mWhh, np.zeros((1, hidden_size)), axis=0) 249 | mWhh = np.append(mWhh, np.zeros((hidden_size+1, 1)), axis=1) 250 | mWxh = np.append(mWxh, np.zeros((1, vocab_size)), axis=0) 251 | mWiy = np.append(mWiy, np.zeros((vocab_size,1)), axis=1) 252 | mbh = np.append(mbh, np.zeros((1,1)), axis=0) 253 | mmultips = np.append(mmultips, np.zeros((1,1)), axis=0) 254 | 255 | hidden_size += 1 256 | print "Adding Neuron" 257 | 258 | #""" 259 | 260 | p += seq_length # move data pointer 261 | n += 1 # iteration counter 262 | if (n == int(g['NBSTEPS'] / 3)) & (g['EXPTYPE'] == 'EASYHARDEASY'): 263 | data = open(g['DIR']+'/inputhard.txt', 'r').read() # should be simple plain text file 264 | p = 0 265 | if (n == int(g['NBSTEPS'] / 3)) & (g['EXPTYPE'] == 'HARDEASY'): 266 | data = open(g['DIR']+'/inputeasy.txt', 'r').read() # should be simple plain text file 267 | p = 0 268 | if (n == 2 * int(g['NBSTEPS'] / 3)) & (g['EXPTYPE'] == 'EASYHARDEASY'): 269 | data = open(g['DIR']+'/inputeasy.txt', 'r').read() # should be simple plain text file 270 | p = 0 271 | if n > g['NBSTEPS']: 272 | sys.exit(0) 273 | 274 | -------------------------------------------------------------------------------- /runexp.py: -------------------------------------------------------------------------------- 1 | # Submit jobs to the cluster. 2 | 3 | # /opt/python-2.7.10/bin/python 4 | 5 | 6 | import sys 7 | import os 8 | import shutil 9 | 10 | """ 11 | g = { 12 | 'COEFFMULTIPNORM' : 3e-5, 13 | 'DELETIONTHRESHOLD': .01, 14 | 'MINMULTIP': .01*.25, # Must be lower than DELETIONTHRESHOLD ! 
15 | 'NBMARGIN' : 1, 16 | 'PROBADEL': .003, 17 | 'PROBAADD': .1, 18 | 'RNGSEED' : 0 19 | } 20 | """ 21 | allopts = [ 22 | 23 | #"HIDDENSIZE 10 NBSTEPS 300000", 24 | #"HIDDENSIZE 30 NBSTEPS 300000", 25 | #"HIDDENSIZE 22 NBSTEPS 300000", 26 | #"HIDDENSIZE 50 NBSTEPS 300000", 27 | ##"HIDDENSIZE 70 NBSTEPS 300000", 28 | #"HIDDENSIZE 100 NBSTEPS 300000", 29 | 30 | 31 | "EXPTYPE EASY COEFFMULTIPNORM 3e-5 NBMARGIN 1 DELETIONTHRESHOLD .05 MINMULTIP .025 PROBADEL .25 PROBAADD .05 NBSTEPS 300000", 32 | #"EXPTYPE HARD COEFFMULTIPNORM 3e-5 NBMARGIN 1 DELETIONTHRESHOLD .05 MINMULTIP .025 PROBADEL .25 PROBAADD .05 NBSTEPS 300000", 33 | #"EXPTYPE EASYHARDEASY COEFFMULTIPNORM 3e-5 NBMARGIN 1 DELETIONTHRESHOLD .05 MINMULTIP .025 PROBADEL .25 PROBAADD .05 NBSTEPS 300000", 34 | 35 | 36 | ] 37 | 38 | 39 | for optionz in allopts: 40 | 41 | #dirname = "trial-ref-" + optionz.replace(' ', '-') 42 | #dirname = "trial-fixedsize-CMN-" + optionz.replace(' ', '-') 43 | dirname = "trial-ref-CMN-" + optionz.replace(' ', '-') 44 | 45 | if os.path.exists(dirname): 46 | shutil.rmtree(dirname) 47 | os.mkdir(dirname) 48 | os.chdir(dirname) 49 | print os.getcwd() 50 | 51 | for v in range(20): 52 | os.mkdir("v"+str(v)) 53 | os.chdir("v"+str(v)) 54 | CMD = "bsub -q short -W 4:00 -eo e.txt -g /rnn /opt/python-2.7.10/bin/python ../../rnn.py " + optionz + " RNGSEED " + str(v) 55 | #CMD = "bsub -q short -W 4:00 -eo e.txt -oo o.txt -g /rnn /opt/python-2.7.10/bin/python ../../rnn.py " + optionz + " RNGSEED " + str(v) 56 | #CMD = "bsub -q short -W 6:00 -eo e.txt -oo o.txt -g /rnn /opt/python-2.7.10/bin/python ../../min-char-rnn-param.py " + optionz + " RNGSEED " + str(v) # For fixed-size 57 | #print CMD 58 | retval = os.system(CMD) 59 | print retval 60 | os.chdir('..') 61 | 62 | os.chdir('..') 63 | 64 | 65 | #print dirname 66 | #for RNGSEED in range(2): 67 | #st = "python rnn.py COEFFMULTIPNORM " + str(CMN) + " DELETIONTHRESHOLD " + str(DT) + " MINMULTIP " \ 68 | #+ str(MMmultiplierofDT*DT) + " PROBADEL " + str(PD) + " PROBAADD " + str(PAmultiplierofPD * PD) \ 69 | #+ " RNGSEED " + str(RNGSEED) + " NUMBERMARGIN " + str(NM) 70 | 71 | 72 | 73 | 74 | --------------------------------------------------------------------------------