├── .gitignore
├── final_report.pdf
├── README.md
├── env_test.py
└── actor_critic_agent.py


/.gitignore:
--------------------------------------------------------------------------------
__pycache__
__pycache__/*
**ubyte


--------------------------------------------------------------------------------
/final_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bchidamb/RL-Image-Classification/HEAD/final_report.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CS 165 Project
### Reinforcement Learning for Image Classification
Bhairav Chidambaram, Rohan Choudhury, Connor Soohoo (advised by Hoang Le)

## Results
See final_report.pdf for a summary of the experiment, plots, and discussion of the results.

## References
https://arxiv.org/pdf/1811.06032.pdf


--------------------------------------------------------------------------------
/env_test.py:
--------------------------------------------------------------------------------
from mnist_env import MNISTEnv
import numpy as np

# Run this file to test whether the environment is working.

env = MNISTEnv(type='train', seed=None)

obs = env.reset()
done = False

while not done:

    env.render()

    # sample a random action and decode it into (movement direction, digit guess)
    action = env.action_space.sample()
    direction, Y_pred = action % 4, action // 4
    print("Agent moved %s" % (['North', 'South', 'East', 'West'][direction]))
    print("Agent guessed %d" % Y_pred)

    _, reward, done, _ = env.step(action)
    print("Received reward %.1f on step %d" % (reward, env.steps))


--------------------------------------------------------------------------------
/actor_critic_agent.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import random
import time
import copy
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class MNISTNet(nn.Module):
    '''
    A CNN with ReLU activations and a three-headed output: two heads for the
    actor (digit guess and movement direction) and one for the critic.

    y1 - joint action distribution over (digit, direction) pairs
    y2 - critic's estimate of the state value

    Input shape:  (batch_size, 1, 28, 28)
    Output shape: (batch_size, 40), (batch_size, 1)
    '''

    def __init__(self):

        super(MNISTNet, self).__init__()

        same_padding = (5 - 1) // 2

        self.conv1 = nn.Conv2d(1, 10, 5, padding=same_padding)
        self.conv2 = nn.Conv2d(10, 10, 5, padding=same_padding)
        self.lin1 = nn.Linear(10 * 7 * 7, 50)

        self.out_dir = nn.Linear(50, 4)
        self.out_digit = nn.Linear(50, 10)
        self.out_critic = nn.Linear(50, 1)

    def forward(self, x):

        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = x.view(-1, 10 * 7 * 7)
        x = self.lin1(x)

        pi1 = self.out_digit(x)
        pi1 = F.softmax(pi1, dim=-1)

        pi2 = self.out_dir(x)
        pi2 = F.softmax(pi2, dim=-1)

        # batch outer product of the two actor heads
        # https://discuss.pytorch.org/t/batch-outer-product/4025
        y1 = torch.bmm(pi1.unsqueeze(2), pi2.unsqueeze(1))
        y1 = y1.view(-1, 40)

        y2 = self.out_critic(x)

        return y1, y2
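
# Note on the action encoding: y1 is the (batch) outer product of the 10-way
# digit distribution and the 4-way movement distribution, flattened row-major,
# so action index a = digit * 4 + direction, i.e. digit = a // 4 and
# direction = a % 4 (matching the decoding used in env_test.py and in act()).
#
# Illustrative shape check (not executed anywhere in this file):
#
#   net = MNISTNet()
#   y1, y2 = net(torch.zeros(2, 1, 28, 28))
#   # y1.shape == (2, 40) and each row sums to 1; y2.shape == (2, 1)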


def torch_to_numpy(tensor):
    return tensor.detach().numpy()


def numpy_to_torch(array):
    return torch.tensor(array).float()


class ActorCriticNNAgent:
    '''
    Neural-net agent that trains using the actor-critic algorithm. The critic
    is a value function that returns the expected discounted reward given the
    state as input. We use the advantage defined as

    A = r + g * V(s') - V(s)

    Notation:
    A  - advantage
    V  - value function
    r  - current reward
    g  - discount factor
    s  - current state
    s' - next state
    '''

    def __init__(self, new_network, params=None, obs_to_input=lambda x: x,
                 lr=1e-3, df=0.5, alpha=0.5):

        # model and parameters
        if params is not None:
            self.model = new_network(params)
        else:
            self.model = new_network()
        if isinstance(self.model, torch.nn.Module):
            self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.df = df        # discount factor
        self.alpha = alpha  # multiply critic updates by this factor

        # initialize replay history
        self.replay = []

        # function that converts an observation into a network input
        self.obs_to_input = obs_to_input

        # if trainable is changed to False, the model won't be updated
        self.trainable = True

    def act(self, o, env=None, display=False):

        # feed the observation to the net to get an action distribution
        # and a value estimate
        x = self.obs_to_input(o)
        x = numpy_to_torch([x])
        y1, y2 = self.model(x)

        pi = torch_to_numpy(y1).flatten()
        v = torch_to_numpy(y2).squeeze()

        # sample an action from the 40-way joint distribution
        a = np.random.choice(np.arange(40), p=pi)

        if display:
            direction, digit = a % 4, a // 4
            pi_dir = pi.reshape((10, 4)).sum(axis=0)    # marginal over directions
            pi_digit = pi.reshape((10, 4)).sum(axis=1)  # marginal over digits

            print("")
            print("Sampled action (direction, digit):", (direction, digit))
            print("Value estimate:", v)
            print("Marginal distributions (digit, direction):", pi_digit, pi_dir, sep='\n')

        # update the current episode in replay with the observation and chosen action
        if self.trainable:
            self.replay[-1]['observations'].append(o)
            self.replay[-1]['actions'].append(a)

        return np.array(a)

    def new_episode(self):
        # start a new episode in replay
        self.replay.append({'observations': [], 'actions': [], 'rewards': []})

    def store_reward(self, r):
        # insert 0s for actions that received no reward; end with reward r
        episode = self.replay[-1]
        T_no_reward = len(episode['actions']) - len(episode['rewards']) - 1
        episode['rewards'] += [0.0] * T_no_reward + [r]

    def _calculate_discounted_rewards(self):
        # calculate and store discounted rewards per episode

        for episode in self.replay:

            R = episode['rewards']
            R_disc = []
            R_sum = 0
            for r in R[::-1]:
                R_sum = r + self.df * R_sum
                R_disc.insert(0, R_sum)

            episode['rewards_disc'] = R_disc
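
    # Worked example of the recursion above (illustrative): with df = 0.5 and
    # per-step rewards [0, 0, 1], the backward pass computes
    # R_disc = [0 + 0.5 * (0 + 0.5 * 1), 0 + 0.5 * 1, 1] = [0.25, 0.5, 1.0].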

    def update(self):

        assert self.trainable

        episode_losses = torch.tensor(0.0)
        N = len(self.replay)
        self._calculate_discounted_rewards()

        for episode in self.replay:

            O = episode['observations']
            A = episode['actions']
            R = numpy_to_torch(episode['rewards'])
            R_disc = numpy_to_torch(episode['rewards_disc'])
            T = len(R_disc)

            # forward pass; Y1 is pi(a | s), Y2 is V(s)
            X = numpy_to_torch([self.obs_to_input(o) for o in O])
            Y1, Y2 = self.model(X)
            pi = Y1
            Vs_curr = Y2.view(-1)

            # log probabilities of the selected actions
            log_prob = torch.log(pi[np.arange(T), A])

            # advantage of the selected actions over the expected return of the
            # state; the value of the terminal state is taken to be 0
            Vs_next = torch.cat((Vs_curr[1:], torch.tensor([0.])))
            adv = R + self.df * Vs_next - Vs_curr

            # detach so the actor loss does not backpropagate through the critic
            adv = adv.detach()

            # actor loss is -1 * advantage-weighted sum of log likelihoods;
            # critic loss is the squared error between values and discounted rewards
            actor_loss = -torch.dot(log_prob, adv)
            critic_loss = torch.sum((R_disc - Vs_curr) ** 2)
            episode_losses += actor_loss + critic_loss * self.alpha

        # backward pass
        self.optimizer.zero_grad()
        loss = episode_losses / N
        loss.backward()
        self.optimizer.step()

        # reset the replay history
        self.replay = []

    def copy(self):

        # create a copy of this agent with frozen weights; the dummy
        # constructor is discarded when the model is deep-copied below
        agent = ActorCriticNNAgent(lambda x: 0, 0, self.obs_to_input)
        agent.model = copy.deepcopy(self.model)
        agent.trainable = False
        for param in agent.model.parameters():
            param.requires_grad = False

        return agent

--------------------------------------------------------------------------------
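
The files above include the environment smoke test (env_test.py) and the agent
(actor_critic_agent.py), but no training driver appears in the listing. The
sketch below shows one way the pieces could be wired together; it is not part of
the repository. The obs_to_input reshape assumes MNISTEnv observations can be
viewed as a single-channel 28x28 array (the layout MNISTNet expects), and the
loop sizes are placeholder values.

# train_sketch.py (illustrative only; not part of the repository listing above)
import numpy as np
from mnist_env import MNISTEnv
from actor_critic_agent import ActorCriticNNAgent, MNISTNet


def obs_to_input(obs):
    # Assumption: the observation can be reshaped to the (1, 28, 28)
    # single-channel layout expected by MNISTNet's first convolution.
    return np.asarray(obs, dtype=np.float32).reshape(1, 28, 28)


env = MNISTEnv(type='train', seed=None)
agent = ActorCriticNNAgent(MNISTNet, obs_to_input=obs_to_input, lr=1e-3, df=0.5)

n_updates = 100           # placeholder values
episodes_per_update = 10

for _ in range(n_updates):

    # collect a batch of episodes, then take one actor-critic update
    for _ in range(episodes_per_update):
        agent.new_episode()
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            agent.store_reward(reward)

    agent.update()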