├── .gitignore
├── final_report.pdf
├── README.md
├── env_test.py
└── actor_critic_agent.py


/.gitignore:
--------------------------------------------------------------------------------
__pycache__
__pycache__/*
**ubyte


--------------------------------------------------------------------------------
/final_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bchidamb/RL-Image-Classification/HEAD/final_report.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CS 165 Project
### Reinforcement Learning for Image Classification
Bhairav Chidambaram, Rohan Choudhury, Connor Soohoo (advised by Hoang Le)

## Results
See final_report.pdf for a summary of the experiment, plots, and discussion of the results.

## References
https://arxiv.org/pdf/1811.06032.pdf


--------------------------------------------------------------------------------
/env_test.py:
--------------------------------------------------------------------------------
from mnist_env import MNISTEnv
import numpy as np

# Run this file to test whether the environment is working.

env = MNISTEnv(type='train', seed=None)

obs = env.reset()
done = False

while not done:

    env.render()

    # sample a random action and decode it into (movement direction, digit guess)
    action = env.action_space.sample()
    direction, Y_pred = action % 4, action // 4
    print("Agent moved %s" % (['North', 'South', 'East', 'West'][direction]))
    print("Agent guessed %d" % Y_pred)

    _, reward, done, _ = env.step(action)
    print("Received reward %.1f on step %d" % (reward, env.steps))


--------------------------------------------------------------------------------
/actor_critic_agent.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import random
import time
import copy
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class MNISTNet(nn.Module):
    '''
    A CNN with ReLU activations and a three-headed output: two heads for the
    actor (digit guess and movement direction) and one for the critic.

    y1 - joint action distribution over (digit, direction) pairs
    y2 - critic's estimate of the state value

    Input shape:  (batch_size, 1, 28, 28)
    Output shape: (batch_size, 40), (batch_size, 1)
    '''

    def __init__(self):

        super(MNISTNet, self).__init__()

        same_padding = (5 - 1) // 2

        self.conv1 = nn.Conv2d(1, 10, 5, padding=same_padding)
        self.conv2 = nn.Conv2d(10, 10, 5, padding=same_padding)
        self.lin1 = nn.Linear(10 * 7 * 7, 50)

        self.out_dir = nn.Linear(50, 4)
        self.out_digit = nn.Linear(50, 10)
        self.out_critic = nn.Linear(50, 1)

    def forward(self, x):

        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = x.view(-1, 10 * 7 * 7)
        x = self.lin1(x)

        pi1 = self.out_digit(x)
        pi1 = F.softmax(pi1, dim=-1)

        pi2 = self.out_dir(x)
        pi2 = F.softmax(pi2, dim=-1)

        # batch outer product of the two actor heads
        # https://discuss.pytorch.org/t/batch-outer-product/4025
        y1 = torch.bmm(pi1.unsqueeze(2), pi2.unsqueeze(1))
        y1 = y1.view(-1, 40)

        y2 = self.out_critic(x)

        return y1, y2
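
# Note on the action encoding: y1 is the (batch) outer product of the 10-way
# digit distribution and the 4-way movement distribution, flattened row-major,
# so action index a = digit * 4 + direction, i.e. digit = a // 4 and
# direction = a % 4 (matching the decoding used in env_test.py and in act()).
#
# Illustrative shape check (not executed anywhere in this file):
#
#   net = MNISTNet()
#   y1, y2 = net(torch.zeros(2, 1, 28, 28))
#   # y1.shape == (2, 40) and each row sums to 1; y2.shape == (2, 1)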


def torch_to_numpy(tensor):
    return tensor.detach().numpy()


def numpy_to_torch(array):
    return torch.tensor(array).float()


class ActorCriticNNAgent:
    '''
    Neural-net agent that trains using the actor-critic algorithm. The critic
    is a value function that returns the expected discounted reward given the
    state as input. We use the advantage defined as

    A = r + g * V(s') - V(s)

    Notation:
    A  - advantage
    V  - value function
    r  - current reward
    g  - discount factor
    s  - current state
    s' - next state
    '''

    def __init__(self, new_network, params=None, obs_to_input=lambda x: x,
                 lr=1e-3, df=0.5, alpha=0.5):

        # model and parameters
        if params is not None:
            self.model = new_network(params)
        else:
            self.model = new_network()
        if isinstance(self.model, torch.nn.Module):
            self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.df = df        # discount factor
        self.alpha = alpha  # multiply critic updates by this factor

        # initialize replay history
        self.replay = []

        # function that converts an observation into a network input
        self.obs_to_input = obs_to_input

        # if trainable is changed to False, the model won't be updated
        self.trainable = True

    def act(self, o, env=None, display=False):

        # feed the observation to the net to get an action distribution
        # and a value estimate
        x = self.obs_to_input(o)
        x = numpy_to_torch([x])
        y1, y2 = self.model(x)

        pi = torch_to_numpy(y1).flatten()
        v = torch_to_numpy(y2).squeeze()

        # sample an action from the 40-way joint distribution
        a = np.random.choice(np.arange(40), p=pi)

        if display:
            direction, digit = a % 4, a // 4
            pi_dir = pi.reshape((10, 4)).sum(axis=0)    # marginal over directions
            pi_digit = pi.reshape((10, 4)).sum(axis=1)  # marginal over digits

            print("")
            print("Sampled action (direction, digit):", (direction, digit))
            print("Value estimate:", v)
            print("Marginal distributions (digit, direction):", pi_digit, pi_dir, sep='\n')

        # update the current episode in replay with the observation and chosen action
        if self.trainable:
            self.replay[-1]['observations'].append(o)
            self.replay[-1]['actions'].append(a)

        return np.array(a)

    def new_episode(self):
        # start a new episode in replay
        self.replay.append({'observations': [], 'actions': [], 'rewards': []})

    def store_reward(self, r):
        # insert 0s for actions that received no reward; end with reward r
        episode = self.replay[-1]
        T_no_reward = len(episode['actions']) - len(episode['rewards']) - 1
        episode['rewards'] += [0.0] * T_no_reward + [r]

    def _calculate_discounted_rewards(self):
        # calculate and store discounted rewards per episode

        for episode in self.replay:

            R = episode['rewards']
            R_disc = []
            R_sum = 0
            for r in R[::-1]:
                R_sum = r + self.df * R_sum
                R_disc.insert(0, R_sum)

            episode['rewards_disc'] = R_disc
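
    # Worked example of the recursion above (illustrative): with df = 0.5 and
    # per-step rewards [0, 0, 1], the backward pass computes
    # R_disc = [0 + 0.5 * (0 + 0.5 * 1), 0 + 0.5 * 1, 1] = [0.25, 0.5, 1.0].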

    def update(self):

        assert self.trainable

        episode_losses = torch.tensor(0.0)
        N = len(self.replay)
        self._calculate_discounted_rewards()

        for episode in self.replay:

            O = episode['observations']
            A = episode['actions']
            R = numpy_to_torch(episode['rewards'])
            R_disc = numpy_to_torch(episode['rewards_disc'])
            T = len(R_disc)

            # forward pass; Y1 is pi(a | s), Y2 is V(s)
            X = numpy_to_torch([self.obs_to_input(o) for o in O])
            Y1, Y2 = self.model(X)
            pi = Y1
            Vs_curr = Y2.view(-1)

            # log probabilities of the selected actions
            log_prob = torch.log(pi[np.arange(T), A])

            # advantage of the selected actions over the expected return of the
            # state; the value of the terminal state is taken to be 0
            Vs_next = torch.cat((Vs_curr[1:], torch.tensor([0.])))
            adv = R + self.df * Vs_next - Vs_curr

            # detach so the actor loss does not backpropagate through the critic
            adv = adv.detach()

            # actor loss is -1 * advantage-weighted sum of log likelihoods;
            # critic loss is the squared error between values and discounted rewards
            actor_loss = -torch.dot(log_prob, adv)
            critic_loss = torch.sum((R_disc - Vs_curr) ** 2)
            episode_losses += actor_loss + critic_loss * self.alpha

        # backward pass
        self.optimizer.zero_grad()
        loss = episode_losses / N
        loss.backward()
        self.optimizer.step()

        # reset the replay history
        self.replay = []

    def copy(self):

        # create a copy of this agent with frozen weights; the dummy
        # constructor is discarded when the model is deep-copied below
        agent = ActorCriticNNAgent(lambda x: 0, 0, self.obs_to_input)
        agent.model = copy.deepcopy(self.model)
        agent.trainable = False
        for param in agent.model.parameters():
            param.requires_grad = False

        return agent

--------------------------------------------------------------------------------
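
The files above include the environment smoke test (env_test.py) and the agent
(actor_critic_agent.py), but no training driver appears in the listing. The
sketch below shows one way the pieces could be wired together; it is not part of
the repository. The obs_to_input reshape assumes MNISTEnv observations can be
viewed as a single-channel 28x28 array (the layout MNISTNet expects), and the
loop sizes are placeholder values.

# train_sketch.py (illustrative only; not part of the repository listing above)
import numpy as np
from mnist_env import MNISTEnv
from actor_critic_agent import ActorCriticNNAgent, MNISTNet


def obs_to_input(obs):
    # Assumption: the observation can be reshaped to the (1, 28, 28)
    # single-channel layout expected by MNISTNet's first convolution.
    return np.asarray(obs, dtype=np.float32).reshape(1, 28, 28)


env = MNISTEnv(type='train', seed=None)
agent = ActorCriticNNAgent(MNISTNet, obs_to_input=obs_to_input, lr=1e-3, df=0.5)

n_updates = 100           # placeholder values
episodes_per_update = 10

for _ in range(n_updates):

    # collect a batch of episodes, then take one actor-critic update
    for _ in range(episodes_per_update):
        agent.new_episode()
        obs = env.reset()
        done = False
        while not done:
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            agent.store_reward(reward)

    agent.update()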