├── README.md
├── cartpole.py
├── runs
│   └── README.md
└── task_transfer
    ├── gen_model.py
    ├── network.params
    └── try_transfer.py

/README.md:
--------------------------------------------------------------------------------
# GenerativeControl

Code for a generative controller for the OpenAI Gym cartpole task.

The code in the root directory of this repository is set up to train sets of
controllers on the OpenAI Gym CartPole-v1 task in order to collect statistics
about their learning and performance. It generates a set of 10 runs, 125
episodes in length, starting from a random seed given by the first command-line
argument.

The task_transfer/ subdirectory contains a version of the code that
trains a single model and saves the weights, along with a second
program which loads the model and evaluates its performance for
different reward functions.
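As a rough usage sketch (assuming numpy, gym, matplotlib, Theano, Lasagne, PIL
and scipy are installed): `python cartpole.py 0` should start ten runs seeded
from 0, appending each episode's length to a per-run file under runs/. In
task_transfer/, `python gen_model.py 0` trains a single model and writes
basereward.txt and network.params, and `python try_transfer.py 0.5 200` (the
two numbers are example values for the amplitude and period of the moving
target) reloads the weights, writes a sample trajectory under traj/ (which must
already exist) and appends the average episode length to performance.txt.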
--------------------------------------------------------------------------------
/cartpole.py:
--------------------------------------------------------------------------------
import numpy as np
import gym
import matplotlib.pyplot

from math import *

import theano
import theano.tensor as T

import lasagne

from PIL import Image
import scipy.misc
import sys

np.random.seed(int(sys.argv[1])+1238)

env = gym.make('CartPole-v1')

# Fixed point network

# Relative weighting of the four observation dimensions in the planning cost
weights = np.array([1.2, 1.8, 3.0, 0.8])

# Latent code size, prediction horizon (in steps), and hidden layer width
LATENT = 2
FUTURE = 16
HIDDEN = 256

for trial in range(int(sys.argv[1]),int(sys.argv[1])+10):
    context_var = T.matrix()
    latent = T.matrix()
    d_input = T.matrix()
    target = T.vector()
    targs = T.vector()

    context_input = lasagne.layers.InputLayer((None,4), input_var = context_var)
    latent_input = lasagne.layers.InputLayer((None,LATENT), input_var = latent)
    state_input = lasagne.layers.InputLayer((None,FUTURE*5), input_var = d_input)

    plist = []
    # Encoder: compress a FUTURE-step window of (observation, action) values into a LATENT-dim code
    dense1 = lasagne.layers.DenseLayer(state_input, num_units = HIDDEN)
    plist.append(dense1.W)
    plist.append(dense1.b)
    dense2 = lasagne.layers.DenseLayer(dense1, num_units = HIDDEN)
    plist.append(dense2.W)
    plist.append(dense2.b)
    dense3 = lasagne.layers.DenseLayer(dense2, num_units = HIDDEN)
    plist.append(dense3.W)
    plist.append(dense3.b)
    enc = lasagne.layers.DenseLayer(dense3, num_units = LATENT, nonlinearity = lasagne.nonlinearities.tanh)
    plist.append(enc.W)
    plist.append(enc.b)
    enc_noise = lasagne.layers.GaussianNoiseLayer(enc, sigma=0.2)
    # Decoder: reconstruct the window from the current observation (context) and the noisy code
    stack2 = lasagne.layers.ConcatLayer([context_input, enc_noise])
    ddense1 = lasagne.layers.DenseLayer(stack2, num_units = HIDDEN)
    plist.append(ddense1.W)
    plist.append(ddense1.b)
    ddense2 = lasagne.layers.DenseLayer(ddense1, num_units = HIDDEN)
    plist.append(ddense2.W)
    plist.append(ddense2.b)
    ddense3 = lasagne.layers.DenseLayer(ddense2, num_units = HIDDEN)
    plist.append(ddense3.W)
    plist.append(ddense3.b)
    out = lasagne.layers.DenseLayer(ddense3, num_units = 5*FUTURE, nonlinearity = None)
    plist.append(out.W)
    plist.append(out.b)

    # The same encoder/decoder, with shared weights, re-applied to its own reconstruction
    def addBlock(ctx_in, state_in, params):
        dense1 = lasagne.layers.DenseLayer(state_in, num_units = HIDDEN, W=params[0], b=params[1])
        dense2 = lasagne.layers.DenseLayer(dense1, num_units = HIDDEN, W=params[2], b=params[3])
        dense3 = lasagne.layers.DenseLayer(dense2, num_units = HIDDEN, W=params[4], b=params[5])
        enc = lasagne.layers.DenseLayer(dense3, num_units = LATENT, nonlinearity = lasagne.nonlinearities.tanh, W=params[6], b=params[7])
        enc_noise = lasagne.layers.GaussianNoiseLayer(enc, sigma=0.2)
        stack2 = lasagne.layers.ConcatLayer([ctx_in, enc_noise])
        ddense1 = lasagne.layers.DenseLayer(stack2, num_units = HIDDEN, W=params[8], b=params[9])
        ddense2 = lasagne.layers.DenseLayer(ddense1, num_units = HIDDEN, W=params[10], b=params[11])
        ddense3 = lasagne.layers.DenseLayer(ddense2, num_units = HIDDEN, W=params[12], b=params[13])
        # Note: this output layer reads from ddense2; ddense3 above is unused here, unlike in the first block
        out = lasagne.layers.DenseLayer(ddense2, num_units = 5*FUTURE, nonlinearity = None, W=params[14], b=params[15])

        return enc, out

    enc2, out2 = addBlock(context_input, out, plist)
    enc3, out3 = addBlock(context_input, out2, plist)
    enc4, out4 = addBlock(context_input, out3, plist)
    enc5, out5 = addBlock(context_input, out4, plist)
    enc6, out6 = addBlock(context_input, out5, plist)
    enc7, out7 = addBlock(context_input, out6, plist)

    params = lasagne.layers.get_all_params(out7,trainable=True)

    outs = lasagne.layers.get_output([out,out2,out3,out4,out5,out6,out7])
    encs = lasagne.layers.get_output([enc,enc2,enc3,enc4,enc5,enc6,enc7])

    loss = 0
    for i in range(len(outs)):
        loss = loss + T.mean((outs[i] - d_input)**2)

    reg = lasagne.regularization.regularize_network_params(out7, lasagne.regularization.l2)*5e-4
    lr = theano.shared(np.array([1e-4],dtype=np.float32))

    updates = lasagne.updates.adam(loss+reg, params, learning_rate = lr[0], beta1=0.5)

    train = theano.function([context_var, d_input], loss, updates=updates, allow_input_downcast=True)
    encode = theano.function([d_input], encs[0], allow_input_downcast=True)
    # Swap the encoder's noisy code for a free latent input so the decoder can be driven directly
    stack2.input_layers[1] = latent_input
    gen_out = lasagne.layers.get_output(out)

    # Planning cost: weighted distance between the last predicted observation and the target
    reward = T.mean(weights[0]*abs(gen_out[:,0+5*(FUTURE-1)]-targs[0]))+T.mean(T.sum(weights[1:]*(gen_out[:,1+5*(FUTURE-1):5*(FUTURE-1)+4]-targs[1:])**2,axis=1),axis=0)

    sample = theano.function([context_var, latent], gen_out, allow_input_downcast = True)
    latent_grad = theano.function([context_var, latent, targs], [theano.grad(reward, latent), reward], allow_input_downcast = True)

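    # Rough sketch of how the pieces above fit together (as read from the code):
    # the encoder compresses a FUTURE-step window of (observation, action) values
    # into a 2-D latent code, and the decoder reconstructs that window from the
    # code plus the current 4-D observation (the "context").  The same block is
    # applied seven times in sequence and every reconstruction is penalized
    # against the data, which pushes the decoder's outputs toward fixed points of
    # the encode/decode map (hence the "Fixed point network" comment above).
    # After the training functions are compiled, the noisy encoder output feeding
    # stack2 is swapped for a free latent input, turning the decoder into a
    # generator: (context, latent) -> predicted trajectory.  getPolicy below then
    # plans by running 100 steps of normalized gradient descent on that latent
    # (with a small decay term) to minimize the weighted distance between the
    # last predicted state and the target, and returns the decoded trajectory.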
    def getPolicy(obs, targ, platent):
        latent = platent.copy()
        obs2 = obs
        for i in range(100):
            grad,rw = latent_grad(obs2, latent, targ)
            grad = -grad/np.sqrt(np.sum(grad**2,axis=1)+1e-16)
            latent += 0.05*grad - 0.001*latent
        return sample(obs2, latent)[0], latent

    def trainNet():
        BS = 1000
        contexts = []
        policies = []

        meanlen = np.mean(np.array([x.shape[0] for x in data]))

        for i in range(BS):
            j = np.random.randint(len(data))
            if data[j].shape[0]>FUTURE+1:
                k = np.random.randint(data[j].shape[0]-FUTURE-1)
                contexts.append(data[j][k,0:4])
                policies.append(data[j][k+1:k+1+FUTURE,:].reshape((FUTURE*5)))

        policies = np.array(policies)
        contexts = np.array(contexts)

        d_err = train(contexts, policies)

        return d_err

    data = []
    preds = []
    rewards = []
    dlatents = []
    discerr = []

    # 25 cycles of: collect 5 episodes with the current model, then train on 400 sampled batches
    for cycle in range(25):
        rate = 1e-4
        lr.set_value(np.array([rate],dtype=np.float32))

        for sub in range(5):
            obs = env.reset()
            obs[0] *= 10
            obs[2] *= 10
            latent = np.random.randn(1,LATENT)
            targ = np.zeros(4)

            policy,latent = getPolicy(np.array(obs).reshape((1,4)),targ,latent)
            done = False

            run_obs = []
            run_act = []
            run_preds = []
            run_latents = []
            step = 0
            j = 0

            while (not done) and (step<500):
                # Sample an action from the predicted action channel, mapped from [-1,1] to a probability
                act = (np.random.rand()<(0.5*(policy[4+j*5]+1)))*1
                run_preds.append(policy[5*j:5*j+5])
                obs, reward, done, info = env.step(act)
                obs[0] *= 10
                obs[2] *= 10
                run_act.append(2*act-1)
                run_obs.append(obs)
                err = np.mean( (obs-policy[j*5:j*5+4])**2 )

                j += 1

                # Replan every two steps, or immediately if the observation diverges from the prediction
                if j>1 or err>0.05:
                    policy,latent = getPolicy(np.array(obs).reshape((1,4)),targ,latent)
                    j = 0

                run_latents.append(latent[0])
                #env.render()
                step += 1

            run_act = np.array(run_act)
            run_obs = np.array(run_obs)
            dlatents.append(np.array(run_latents))
            data.append(np.hstack([run_obs, run_act.reshape((run_act.shape[0],1))]))
            preds.append(np.array(run_preds))
            rewards.append(run_obs.shape[0])

            f = open("runs/%.6d.txt" % trial,"a")
            f.write("%d\n" % run_obs.shape[0])
            f.close()

        de = 0

        for epoch in range(400):
            de = trainNet()

--------------------------------------------------------------------------------
/runs/README.md:
--------------------------------------------------------------------------------
Placeholder for the directory where the results of training runs are stored.

--------------------------------------------------------------------------------
/task_transfer/gen_model.py:
--------------------------------------------------------------------------------
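# Essentially the same model and training loop as ../cartpole.py, but run as a
# single trial: episode lengths are appended to basereward.txt and the trained
# weights are pickled to network.params for try_transfer.py to reload.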
import numpy as np
import gym
import matplotlib.pyplot

from math import *

import theano
import theano.tensor as T

import lasagne

from PIL import Image
import scipy.misc
import sys
import pickle

np.random.seed(int(sys.argv[1])+1238)

env = gym.make('CartPole-v1')

weights = np.array([1.2, 1.8, 3.0, 0.8])

LATENT = 2
FUTURE = 16
HIDDEN = 256

for trial in range(1):
    context_var = T.matrix()
    latent = T.matrix()
    d_input = T.matrix()
    target = T.vector()
    targs = T.vector()

    context_input = lasagne.layers.InputLayer((None,4), input_var = context_var)
    latent_input = lasagne.layers.InputLayer((None,LATENT), input_var = latent)
    state_input = lasagne.layers.InputLayer((None,FUTURE*5), input_var = d_input)

    plist = []
    #stack1 = lasagne.layers.ConcatLayer([state_input, context_input])
    dense1 = lasagne.layers.DenseLayer(state_input, num_units = HIDDEN)
    plist.append(dense1.W)
    plist.append(dense1.b)
    dense2 = lasagne.layers.DenseLayer(dense1, num_units = HIDDEN)
    plist.append(dense2.W)
    plist.append(dense2.b)
    dense3 = lasagne.layers.DenseLayer(dense2, num_units = HIDDEN)
    plist.append(dense3.W)
    plist.append(dense3.b)
    enc = lasagne.layers.DenseLayer(dense3, num_units = LATENT, nonlinearity = lasagne.nonlinearities.tanh)
    plist.append(enc.W)
    plist.append(enc.b)
    enc_noise = lasagne.layers.GaussianNoiseLayer(enc, sigma=0.2)
    stack2 = lasagne.layers.ConcatLayer([context_input, enc_noise])
    ddense1 = lasagne.layers.DenseLayer(stack2, num_units = HIDDEN)
    plist.append(ddense1.W)
    plist.append(ddense1.b)
    ddense2 = lasagne.layers.DenseLayer(ddense1, num_units = HIDDEN)
    plist.append(ddense2.W)
    plist.append(ddense2.b)
    ddense3 = lasagne.layers.DenseLayer(ddense2, num_units = HIDDEN)
    plist.append(ddense3.W)
    plist.append(ddense3.b)
    out = lasagne.layers.DenseLayer(ddense3, num_units = 5*FUTURE, nonlinearity = None)
    plist.append(out.W)
    plist.append(out.b)

    def addBlock(ctx_in, state_in, params):
        dense1 = lasagne.layers.DenseLayer(state_in, num_units = HIDDEN, W=params[0], b=params[1])
        dense2 = lasagne.layers.DenseLayer(dense1, num_units = HIDDEN, W=params[2], b=params[3])
        dense3 = lasagne.layers.DenseLayer(dense2, num_units = HIDDEN, W=params[4], b=params[5])
        enc = lasagne.layers.DenseLayer(dense3, num_units = LATENT, nonlinearity = lasagne.nonlinearities.tanh, W=params[6], b=params[7])
        enc_noise = lasagne.layers.GaussianNoiseLayer(enc, sigma=0.2)
        stack2 = lasagne.layers.ConcatLayer([ctx_in, enc_noise])
        ddense1 = lasagne.layers.DenseLayer(stack2, num_units = HIDDEN, W=params[8], b=params[9])
        ddense2 = lasagne.layers.DenseLayer(ddense1, num_units = HIDDEN, W=params[10], b=params[11])
        ddense3 = lasagne.layers.DenseLayer(ddense2, num_units = HIDDEN, W=params[12], b=params[13])
        out = lasagne.layers.DenseLayer(ddense2, num_units = 5*FUTURE, nonlinearity = None, W=params[14], b=params[15])

        return enc, out

    enc2, out2 = addBlock(context_input, out, plist)
    enc3, out3 = addBlock(context_input, out2, plist)
    enc4, out4 = addBlock(context_input, out3, plist)
    enc5, out5 = addBlock(context_input, out4, plist)
    enc6, out6 = addBlock(context_input, out5, plist)
    enc7, out7 = addBlock(context_input, out6, plist)

    params = lasagne.layers.get_all_params(out7,trainable=True)

    outs = lasagne.layers.get_output([out,out2,out3,out4,out5,out6,out7])
    encs = lasagne.layers.get_output([enc,enc2,enc3,enc4,enc5,enc6,enc7])

    loss = 0
    for i in range(len(outs)):
        loss = loss + T.mean((outs[i] - d_input)**2)

    reg = lasagne.regularization.regularize_network_params(out7, lasagne.regularization.l2)*5e-4
    lr = theano.shared(np.array([5e-4],dtype=np.float32))

    updates = lasagne.updates.adam(loss+reg, params, learning_rate = lr[0], beta1=0.5)

    train = theano.function([context_var, d_input], loss, updates=updates, allow_input_downcast=True)
    encode = theano.function([d_input], encs[0], allow_input_downcast=True)
    stack2.input_layers[1] = latent_input
    gen_out = lasagne.layers.get_output(out)

    reward = T.mean(weights[0]*abs(gen_out[:,0+5*(FUTURE-1)]-targs[0]))+T.mean(T.sum(weights[1:]*(gen_out[:,1+5*(FUTURE-1):5*(FUTURE-1)+4]-targs[1:])**2,axis=1),axis=0)

    sample = theano.function([context_var, latent], gen_out, allow_input_downcast = True)
    latent_grad = theano.function([context_var, latent, targs], [theano.grad(reward, latent), reward], allow_input_downcast = True)
    def getPolicy(obs, targ, platent):
        latent = platent.copy()
        obs2 = obs
        for i in range(100):
            grad,rw = latent_grad(obs2, latent, targ)
            grad = -grad/np.sqrt(np.sum(grad**2,axis=1)+1e-16)
            latent += 0.05*grad - 0.001*latent
        return sample(obs2, latent)[0], latent

    def trainNet():
        BS = 1000
        contexts = []
        policies = []

        meanlen = np.mean(np.array([x.shape[0] for x in data]))

        for i in range(BS):
            j = np.random.randint(len(data))
            if data[j].shape[0]>FUTURE+1:
                k = np.random.randint(data[j].shape[0]-FUTURE-1)
                contexts.append(data[j][k,0:4])
                policies.append(data[j][k+1:k+1+FUTURE,:].reshape((FUTURE*5)))

        policies = np.array(policies)
        contexts = np.array(contexts)

        d_err = train(contexts, policies)

        return d_err

    data = []
    preds = []
    rewards = []
    dlatents = []
    discerr = []

    for cycle in range(25):
        rate = 1e-4
        lr.set_value(np.array([rate],dtype=np.float32))

        for sub in range(5):
            obs = env.reset()
            obs[0] *= 10
            obs[2] *= 10
            latent = np.random.randn(1,LATENT)
            targ = np.zeros(4)

            policy,latent = getPolicy(np.array(obs).reshape((1,4)),targ,latent)
            done = False

            run_obs = []
            run_act = []
            run_preds = []
            run_latents = []
            step = 0
            j = 0

            while (not done) and (step<500):
                act = (np.random.rand()<(0.5*(policy[4+j*5]+1)))*1
                run_preds.append(policy[5*j:5*j+5])
                obs, reward, done, info = env.step(act)
                obs[0] *= 10
                obs[2] *= 10
                run_act.append(2*act-1)
                run_obs.append(obs)
                err = np.mean( (obs-policy[j*5:j*5+4])**2 )

                j += 1

                if j>1 or err>0.05:
                    policy,latent = getPolicy(np.array(obs).reshape((1,4)),targ,latent)
                    j = 0

                run_latents.append(latent[0])
                #env.render()
                step += 1

            run_act = np.array(run_act)
            run_obs = np.array(run_obs)
            dlatents.append(np.array(run_latents))
            data.append(np.hstack([run_obs, run_act.reshape((run_act.shape[0],1))]))
            preds.append(np.array(run_preds))
            rewards.append(run_obs.shape[0])

            f = open("basereward.txt", "a")
            f.write("%d\n" % run_obs.shape[0])
            f.close()

        de = 0

        for epoch in range(400):
            de = trainNet()

    pickle.dump(lasagne.layers.get_all_param_values(out7),open("network.params","wb"))

--------------------------------------------------------------------------------
/task_transfer/try_transfer.py:
--------------------------------------------------------------------------------
import numpy as np
import gym
import matplotlib.pyplot

from math import *

import theano
import theano.tensor as T

import lasagne

from PIL import Image
import scipy.misc
import sys
import pickle

np.random.seed(5342)

env = gym.make('CartPole-v1')

weights = np.array([1.2, 1.8, 3.0, 0.8])

LATENT = 2
FUTURE = 16
HIDDEN = 256

context_var = T.matrix()
latent = T.matrix()
d_input = T.matrix()
target = T.vector()
targs = T.vector()

context_input = lasagne.layers.InputLayer((None,4), input_var = context_var)
latent_input = lasagne.layers.InputLayer((None,LATENT), input_var = latent)
state_input = lasagne.layers.InputLayer((None,FUTURE*5), input_var = d_input)

plist = []
dense1 = lasagne.layers.DenseLayer(state_input, num_units = HIDDEN)
plist.append(dense1.W)
plist.append(dense1.b)
dense2 = lasagne.layers.DenseLayer(dense1, num_units = HIDDEN)
plist.append(dense2.W)
plist.append(dense2.b)
dense3 = lasagne.layers.DenseLayer(dense2, num_units = HIDDEN)
plist.append(dense3.W)
plist.append(dense3.b)
enc = lasagne.layers.DenseLayer(dense3, num_units = LATENT, nonlinearity = lasagne.nonlinearities.tanh)
plist.append(enc.W)
plist.append(enc.b)
enc_noise = lasagne.layers.GaussianNoiseLayer(enc, sigma=0.2)
stack2 = lasagne.layers.ConcatLayer([context_input, enc_noise])
ddense1 = lasagne.layers.DenseLayer(stack2, num_units = HIDDEN)
plist.append(ddense1.W)
plist.append(ddense1.b)
ddense2 = lasagne.layers.DenseLayer(ddense1, num_units = HIDDEN)
plist.append(ddense2.W)
plist.append(ddense2.b)
ddense3 = lasagne.layers.DenseLayer(ddense2, num_units = HIDDEN)
plist.append(ddense3.W)
plist.append(ddense3.b)
out = lasagne.layers.DenseLayer(ddense3, num_units = 5*FUTURE, nonlinearity = None)
plist.append(out.W)
plist.append(out.b)

def addBlock(ctx_in, state_in, params):
    dense1 = lasagne.layers.DenseLayer(state_in, num_units = HIDDEN, W=params[0], b=params[1])
    dense2 = lasagne.layers.DenseLayer(dense1, num_units = HIDDEN, W=params[2], b=params[3])
    dense3 = lasagne.layers.DenseLayer(dense2, num_units = HIDDEN, W=params[4], b=params[5])
    enc = lasagne.layers.DenseLayer(dense3, num_units = LATENT, nonlinearity = lasagne.nonlinearities.tanh, W=params[6], b=params[7])
    enc_noise = lasagne.layers.GaussianNoiseLayer(enc, sigma=0.2)
    stack2 = lasagne.layers.ConcatLayer([ctx_in, enc_noise])
    ddense1 = lasagne.layers.DenseLayer(stack2, num_units = HIDDEN, W=params[8], b=params[9])
    ddense2 = lasagne.layers.DenseLayer(ddense1, num_units = HIDDEN, W=params[10], b=params[11])
    ddense3 = lasagne.layers.DenseLayer(ddense2, num_units = HIDDEN, W=params[12], b=params[13])
    out = lasagne.layers.DenseLayer(ddense2, num_units = 5*FUTURE, nonlinearity = None, W=params[14], b=params[15])

    return enc, out

enc2, out2 = addBlock(context_input, out, plist)
enc3, out3 = addBlock(context_input, out2, plist)
enc4, out4 = addBlock(context_input, out3, plist)
enc5, out5 = addBlock(context_input, out4, plist)
enc6, out6 = addBlock(context_input, out5, plist)
enc7, out7 = addBlock(context_input, out6, plist)

params = lasagne.layers.get_all_params(out7,trainable=True)

outs = lasagne.layers.get_output([out,out2,out3,out4,out5,out6,out7])
encs = lasagne.layers.get_output([enc,enc2,enc3,enc4,enc5,enc6,enc7])

loss = 0
for i in range(len(outs)):
    loss = loss + T.mean((outs[i] - d_input)**2)

reg = lasagne.regularization.regularize_network_params(out7, lasagne.regularization.l2)*5e-4
lr = theano.shared(np.array([5e-4],dtype=np.float32))

updates = lasagne.updates.adam(loss+reg, params, learning_rate = lr[0], beta1=0.5)

train = theano.function([context_var, d_input], loss, updates=updates, allow_input_downcast=True)
encode = theano.function([d_input], encs[0], allow_input_downcast=True)
stack2.input_layers[1] = latent_input
gen_out = lasagne.layers.get_output(out)

reward = T.mean(weights[0]*abs(gen_out[:,0+5*(FUTURE-1)]-targs[0]))+T.mean(T.sum(weights[1:]*(gen_out[:,1+5*(FUTURE-1):5*(FUTURE-1)+4]-targs[1:])**2,axis=1),axis=0)

sample = theano.function([context_var, latent], gen_out, allow_input_downcast = True)
latent_grad = theano.function([context_var, latent, targs], [theano.grad(reward, latent), reward], allow_input_downcast = True)

lasagne.layers.set_all_param_values(out7,pickle.load(open("network.params","rb")))

def getPolicy(obs, targ, platent):
    latent = platent.copy()
    obs2 = obs
    for i in range(100):
        grad,rw = latent_grad(obs2, latent, targ)
        grad = -grad/np.sqrt(np.sum(grad**2,axis=1)+1e-16)
        latent += 0.05*grad - 0.001*latent
    return sample(obs2, latent)[0], latent

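# attemptTask evaluates the reloaded model on a task it was never trained on:
# the target cart position follows a sine wave whose amplitude and period come
# from the command line, while the planner itself is unchanged.  The episode
# ends when the pole falls or after 5000 steps, and the score is the number of
# steps survived.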
def attemptTask(amp, period):
    obs = env.reset()
    obs[0] *= 10
    obs[2] *= 10
    latent = np.random.randn(1,LATENT)
    targ = np.zeros(4)

    policy,latent = getPolicy(np.array(obs).reshape((1,4)),targ,latent)
    done = False

    xtarg = []
    xpos = []

    step = 0
    j = 0

    while (not done) and (step<5000):
        targ[0] = 10*amp*sin(2*3.1415*step/period)
        act = (np.random.rand()<(0.5*(policy[4+j*5]+1)))*1
        obs, reward, done, info = env.step(act)
        obs[0] *= 10
        obs[2] *= 10
        err = np.mean( (obs-policy[j*5:j*5+4])**2 )
        xtarg.append(targ[0]/10.0)
        xpos.append(obs[0]/10.0)

        j += 1

        if j>1 or err>0.05:
            policy,latent = getPolicy(np.array(obs).reshape((1,4)),targ,latent)
            j = 0

        step += 1

    return step, np.hstack([np.array(xpos).reshape((len(xpos),1)), np.array(xtarg).reshape((len(xpos),1))])

amp = float(sys.argv[1])
period = float(sys.argv[2])

r,x = attemptTask(amp,period)

# Save one example trajectory (assumes the traj/ directory already exists)
np.savetxt("traj/%.3g_%.3g.txt" % (amp,period), x)

rewards = []
for i in range(20):
    r,x = attemptTask(amp,period)
    rewards.append(r)

f = open("performance.txt","a")
f.write("%.6g %.6g %.6g\n" % (amp, period, np.mean(rewards)))
f.close()

--------------------------------------------------------------------------------