├── 01-blog_code ├── Gridworld │ └── gridworld.py ├── Gridworld2 │ └── gridworld2.py ├── Tic-Tac-Toe │ └── example.py ├── core │ └── core.py ├── dqn │ ├── approxagent.py │ └── approximator.py ├── puckworld │ └── puckworld.py └── sarsa │ ├── sarsa(lambda).py │ └── sarsa.py └── README.md /01-blog_code/Gridworld/gridworld.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Implementation of small grid world example illustrated by David Silver 3 | in his Reinforcement Learning Lecture3 - Planning by Dynamic Programming. 4 | 5 | The value function converges to: 6 | 0.00 -14.00 -20.00 -22.00 7 | -14.00 -18.00 -20.00 -20.00 8 | -20.00 -20.00 -18.00 -14.00 9 | -22.00 -20.00 -14.00 0.00 10 | At Iterate No.153 11 | ''' 12 | # id of the states, 0 and 15 are terminal states 13 | states = [i for i in range(16)] 14 | # 0* 1 2 3 15 | # 4 5 6 7 16 | # 8 9 10 11 17 | # 12 13 14 15* 18 | 19 | # initial values of states 20 | values = [0 for _ in range(16)] 21 | 22 | # Action 23 | actions = ["n", "e", "s", "w"] 24 | 25 | # 行为对应的状态改变量 26 | # use a dictionary for convenient computation of next state id. 27 | ds_actions = {"n": -4, "e": 1, "s": 4, "w": -1} 28 | 29 | # discount factor 30 | gamma = 1.00 31 | 32 | 33 | # 根据当前状态和采取的行为计算下一个状态id以及得到的即时奖励 34 | def nextState(s, a): 35 | next_state = s 36 | if (s % 4 == 0 and a == "w") or (s < 4 and a == "n") \ 37 | or ((s+1) % 4 == 0 and a == "e") or (s > 11 and a == "s"): 38 | pass 39 | else: 40 | ds = ds_actions[a] 41 | next_state = s + ds 42 | return next_state 43 | 44 | 45 | # reward of a state 46 | def rewardOf(s): 47 | return 0 if s in [0,15] else -1 48 | 49 | 50 | # check if a state is terminate state 51 | def isTerminateState(s): 52 | return s in [0, 15] 53 | 54 | 55 | # get successor states of a given state s 56 | def getSuccessors(s): 57 | successors = [] 58 | if isTerminateState(s): 59 | return successors 60 | for a in actions: 61 | next_state = nextState(s, a) 62 | # if s != next_state: 63 | successors.append(next_state) 64 | return successors 65 | 66 | 67 | # update the value of state s 68 | def updateValue(s): 69 | sucessors = getSuccessors(s) 70 | newValue = 0 # values[s] 71 | num = 4 # len(successors) 72 | reward = rewardOf(s) 73 | for next_state in sucessors: 74 | newValue += 1.00/num * (reward + gamma * values[next_state]) 75 | return newValue 76 | 77 | 78 | # perform one-step iteration 79 | def performOneIteration(): 80 | newValues = [0 for _ in range(16)] 81 | for s in states: 82 | newValues[s] = updateValue(s) 83 | global values 84 | values = newValues 85 | printValue(values) 86 | 87 | 88 | # show some array info of the small grid world 89 | def printValue(v): 90 | for i in range(16): 91 | print('{0:>6.2f}'.format(v[i]), end=" ") 92 | if (i+1) % 4 == 0: 93 | print("") 94 | print() 95 | 96 | # test function 97 | def test(): 98 | printValue(states) 99 | printValue(values) 100 | for s in states: 101 | reward = rewardOf(s) 102 | for a in actions: 103 | next_state = nextState(s, a) 104 | print("({0}, {1}) -> {2}, with reward {3}".format(s, a,next_state, reward)) 105 | 106 | for i in range(200): 107 | performOneIteration() 108 | printValue(values) 109 | 110 | def main(): 111 | max_iterate_times = 160 112 | cur_iterate_times = 0 113 | while cur_iterate_times <= max_iterate_times: 114 | print("Iterate No.{0}".format(cur_iterate_times)) 115 | performOneIteration() 116 | cur_iterate_times += 1 117 | printValue(values) 118 | 119 | if __name__ == '__main__': 120 | main() 121 | 
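(Reading note, not part of the original script.) gridworld.py above performs synchronous iterative policy evaluation of the uniform random policy on the 4x4 grid: each sweep, `updateValue(s)` applies the Bellman expectation backup with deterministic transitions given by `nextState(s, a)`,

$$v_{k+1}(s) \;=\; \sum_{a \in \{n,e,s,w\}} \tfrac{1}{4}\,\Bigl(\mathcal{R}(s) + \gamma\, v_k\bigl(\text{nextState}(s,a)\bigr)\Bigr), \qquad \gamma = 1,$$

for every non-terminal state s, while the two terminal corners are held at 0. This is why the printed values settle to the table quoted in the module docstring.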
-------------------------------------------------------------------------------- /01-blog_code/Gridworld2/gridworld2.py: -------------------------------------------------------------------------------- 1 | """ 2 | General GridWorld Environment 3 | """ 4 | import math 5 | import gym 6 | from gym import spaces 7 | from gym.utils import seeding 8 | import numpy as np 9 | 10 | 11 | class Grid(object): 12 | def __init__(self, x:int = None, 13 | y:int = None, 14 | type:int = 0, 15 | reward:float = 0.0, 16 | value:float = 0.0): # value属性备用 17 | self.x = x # 坐标x 18 | self.y = y 19 | self.type = type # 类别值(0:空;1:障碍或边界) 20 | self.reward = reward # 该格子的即时奖励 21 | self.value = value # 该格子的价值,暂没用上 22 | self.name = None # 该格子的名称 23 | self._update_name() 24 | 25 | def _update_name(self): 26 | self.name = "X{0}-Y{1}".format(self.x, self.y) 27 | 28 | def __str__(self): 29 | return "name:{4}, x:{0}, y:{1}, type:{2}, value:{3}".format(self.x, 30 | self.y, 31 | self.type, 32 | self.value, 33 | self.name 34 | ) 35 | 36 | 37 | class GridMatrix(object): 38 | '''格子矩阵,通过不同的设置,模拟不同的格子世界环境 39 | ''' 40 | def __init__(self, n_width:int, # 水平方向格子数 41 | n_height:int, # 竖直方向格子数 42 | default_type:int = 0, # 默认类型 43 | default_reward:float = 0.0, # 默认即时奖励值 44 | default_value:float = 0.0 # 默认价值(这个有点多余) 45 | ): 46 | self.grids = None 47 | self.n_height = n_height 48 | self.n_width = n_width 49 | self.len = n_width * n_height 50 | self.default_reward = default_reward 51 | self.default_value = default_value 52 | self.default_type = default_type 53 | self.reset() 54 | 55 | def reset(self): 56 | self.grids = [] 57 | for x in range(self.n_height): 58 | for y in range(self.n_width): 59 | self.grids.append(Grid(x, 60 | y, 61 | self.default_type, 62 | self.default_reward, 63 | self.default_value)) 64 | 65 | def get_grid(self, x, y=None): 66 | '''获取一个格子信息 67 | args:坐标信息,由x,y表示或仅有一个类型为tuple的x表示 68 | return:grid object 69 | ''' 70 | xx, yy = None, None 71 | if isinstance(x, int): 72 | xx, yy = x, y 73 | elif isinstance(x, tuple): 74 | xx, yy = x[0], x[1] 75 | assert(xx >= 0 and yy >= 0 and xx < self.n_width and yy < self.n_height), "任意坐标值应在合理区间" 76 | index = yy * self.n_width + xx 77 | return self.grids[index] 78 | 79 | def set_reward(self, x, y, reward): 80 | grid = self.get_grid(x, y) 81 | if grid is not None: 82 | grid.reward = reward 83 | else: 84 | raise("grid doesn't exist") 85 | 86 | def set_value(self, x, y, value): 87 | grid = self.get_grid(x, y) 88 | if grid is not None: 89 | grid.value = value 90 | else: 91 | raise("grid doesn't exist") 92 | 93 | def set_type(self, x, y, type): 94 | grid = self.get_grid(x, y) 95 | if grid is not None: 96 | grid.type = type 97 | else: 98 | raise("grid doesn't exist") 99 | 100 | def get_reward(self, x, y): 101 | grid = self.get_grid(x, y) 102 | if grid is None: 103 | return None 104 | return grid.reward 105 | 106 | def get_value(self, x, y): 107 | grid = self.get_grid(x, y) 108 | if grid is None: 109 | return None 110 | return grid.value 111 | 112 | def get_type(self, x, y): 113 | grid = self.get_grid(x, y) 114 | if grid is None: 115 | return None 116 | return grid.type 117 | 118 | 119 | class GridWorldEnv(gym.Env): 120 | '''格子世界环境,可以模拟各种不同的格子世界 121 | ''' 122 | metadata = { 123 | 'render.modes': ['human', 'rgb_array'], 124 | 'video.frames_per_second': 30 125 | } 126 | 127 | def __init__(self, n_width: int=10, 128 | n_height: int = 7, 129 | u_size=40, 130 | default_reward: float = 0, 131 | default_type=0, 132 | windy=False): 133 | self.u_size = u_size # 当前格子绘制尺寸 134 | self.n_width = n_width # 
格子世界宽度(以格子数计) 135 | self.n_height = n_height # 高度 136 | self.width = u_size * n_width # 场景宽度 screen width 137 | self.height = u_size * n_height # 场景长度 138 | self.default_reward = default_reward 139 | self.default_type = default_type 140 | self._adjust_size() 141 | 142 | self.grids = GridMatrix(n_width=self.n_width, 143 | n_height=self.n_height, 144 | default_reward=self.default_reward, 145 | default_type=self.default_type, 146 | default_value=0.0) 147 | self.reward = 0 # for rendering 148 | self.action = None # for rendering 149 | self.windy = windy # 是否是有风格子世界 150 | 151 | # 0,1,2,3 represent left, right, up, down 152 | self.action_space = spaces.Discrete(4) 153 | # 观察空间由low和high决定 154 | self.observation_space = spaces.Discrete(self.n_height * self.n_width) 155 | # 坐标原点为左下角,这个pyglet是一致的 156 | # 通过设置起始点、终止点以及特殊奖励和类型的格子可以构建各种不同类型的格子世界环境 157 | # 比如:随机行走、汽车租赁、悬崖行走等David Silver公开课中的示例 158 | self.ends = [(7, 3)] # 终止格子坐标,可以有多个 159 | self.start = (0, 3) # 起始格子坐标,只有一个 160 | self.types = [] # 特殊种类的格子在此设置。[(3,2,1)]表示(3,2)处值为1 161 | self.rewards = [] # 特殊奖励的格子在此设置,终止格子奖励0 162 | self.refresh_setting() 163 | self.viewer = None # 图形接口对象 164 | self.seed() # 产生一个随机子 165 | self.reset() 166 | 167 | def _adjust_size(self): 168 | '''调整场景尺寸适合最大宽度、高度不超过800 169 | ''' 170 | pass 171 | 172 | def seed(self, seed=None): 173 | # 产生一个随机化时需要的种子,同时返回一个np_random对象,支持后续的随机化生成操作 174 | self.np_random, seed = seeding.np_random(seed) 175 | return [seed] 176 | 177 | def step(self, action): 178 | assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action)) 179 | self.action = action # action for rendering 180 | old_x, old_y = self._state_to_xy(self.state) 181 | new_x, new_y = old_x, old_y 182 | 183 | # wind effect 184 | # 有风效果,其数字表示个体离开(而不是进入)该格子时朝向别的方向会被吹偏离的格子数 185 | if self.windy: 186 | if new_x in [3, 4, 5, 8]: 187 | new_y += 1 188 | elif new_x in [6, 7]: 189 | new_y += 2 190 | 191 | if action == 0: new_x -= 1 # left 192 | elif action == 1: new_x += 1 # right 193 | elif action == 2: new_y += 1 # up 194 | elif action == 3: new_y -= 1 # down 195 | elif action == 4: new_x, new_y = new_x - 1, new_y - 1 196 | elif action == 5: new_x, new_y = new_x + 1, new_y - 1 197 | elif action == 6: new_x, new_y = new_x + 1, new_y - 1 198 | elif action == 7: new_x, new_y = new_x + 1, new_y + 1 199 | # boundary effect 200 | if new_x < 0: new_x = 0 201 | if new_x >= self.n_width: new_x = self.n_width - 1 202 | if new_y < 0: new_y = 0 203 | if new_y >= self.n_height: new_y = self.n_height - 1 204 | 205 | # wall effect: 206 | # 类型为1的格子为障碍格子,不可进入 207 | if self.grids.get_type(new_x, new_y) == 1: 208 | new_x, new_y = old_x, old_y 209 | 210 | self.reward = self.grids.get_reward(new_x, new_y) 211 | done = self._is_end_state(new_x, new_y) 212 | self.state = self._xy_to_state(new_x, new_y) 213 | # 提供格子世界所有的信息在info内 214 | info = {"x": new_x, "y": new_y, "grids": self.grids} 215 | return self.state, self.reward, done, info 216 | 217 | # 将状态变为横纵坐标 218 | def _state_to_xy(self, s): 219 | x = s % self.n_width 220 | y = int((s - x) / self.n_width) 221 | return x, y 222 | 223 | def _xy_to_state(self, x, y=None): 224 | if isinstance(x, int): 225 | assert (isinstance(y, int)), "incomplete Position info" 226 | return x + self.n_width * y 227 | elif isinstance(x, tuple): 228 | return x[0] + self.n_width * x[1] 229 | return -1 # 未知状态 230 | 231 | def refresh_setting(self): 232 | '''用户在使用该类创建格子世界后可能会修改格子世界某些格子类型或奖励值 233 | 的设置,修改设置后通过调用该方法使得设置生效。 234 | ''' 235 | for x, y, r in self.rewards: 236 | self.grids.set_reward(x, y, r) 237 | for x, y, t 
in self.types: 238 | self.grids.set_type(x, y, t) 239 | 240 | def reset(self): 241 | self.state = self._xy_to_state(self.start) 242 | return self.state 243 | 244 | # 判断是否是终止状态 245 | def _is_end_state(self, x, y=None): 246 | if y is not None: 247 | xx, yy = x, y 248 | elif isinstance(x, int): 249 | xx, yy = self._state_to_xy(x) 250 | else: 251 | assert (isinstance(x, tuple)), "坐标数据不完整" 252 | xx, yy = x[0], x[1] 253 | for end in self.ends: 254 | if xx == end[0] and yy == end[1]: 255 | return True 256 | return False 257 | 258 | # 图形化界面 259 | def render(self, mode='human', close=False): 260 | if close: 261 | if self.viewer is not None: 262 | self.viewer.close() 263 | self.viewer = None 264 | return 265 | zero = (0, 0) 266 | u_size = self.u_size 267 | m = 2 # 格子之间的间隙尺寸 268 | 269 | # 如果还没有设定屏幕对象,则初始化整个屏幕具备的元素。 270 | if self.viewer is None: 271 | from gym.envs.classic_control import rendering 272 | self.viewer = rendering.Viewer(self.width, self.height) 273 | 274 | # 在Viewer里绘制一个几何图像的步骤如下: 275 | # 1. 建立该对象需要的数据本身 276 | # 2. 使用rendering提供的方法返回一个geom对象 277 | # 3. 对geom对象进行一些对象颜色、线宽、线型、变换属性的设置(有些对象提供一些个 278 | # 性化的方法来设置属性,具体请参考继承自这些Geom的对象),这其中有一个重要的 279 | # 属性就是变换属性, 280 | # 该属性负责对对象在屏幕中的位置、渲染、缩放进行渲染。如果某对象 281 | # 在呈现时可能发生上述变化,则应建立关于该对象的变换属性。该属性是一个 282 | # Transform对象,而一个Transform对象,包括translate、rotate和scale 283 | # 三个属性,每个属性都由以np.array对象描述的矩阵决定。 284 | # 4. 将新建立的geom对象添加至viewer的绘制对象列表里,如果在屏幕上只出现一次, 285 | # 将其加入到add_onegeom()列表中,如果需要多次渲染,则将其加入add_geom() 286 | # 5. 在渲染整个viewer之前,对有需要的geom的参数进行修改,修改主要基于该对象 287 | # 的Transform对象 288 | # 6. 调用Viewer的render()方法进行绘制 289 | ''' 绘制水平竖直格子线,由于设置了格子之间的间隙,可不用此段代码 290 | for i in range(self.n_width+1): 291 | line = rendering.Line(start = (i*u_size, 0), 292 | end =(i*u_size, u_size*self.n_height)) 293 | line.set_color(0.5,0,0) 294 | self.viewer.add_geom(line) 295 | for i in range(self.n_height): 296 | line = rendering.Line(start = (0, i*u_size), 297 | end = (u_size*self.n_width, i*u_size)) 298 | line.set_color(0,0,1) 299 | self.viewer.add_geom(line) 300 | ''' 301 | 302 | # 绘制格子 303 | for x in range(self.n_width): 304 | for y in range(self.n_height): 305 | v = [(x * u_size + m, y * u_size + m), 306 | ((x + 1) * u_size - m, y * u_size + m), 307 | ((x + 1) * u_size - m, (y + 1) * u_size - m), 308 | (x * u_size + m, (y + 1) * u_size - m)] 309 | 310 | rect = rendering.FilledPolygon(v) 311 | r = self.grids.get_reward(x, y) / 10 312 | if r < 0: 313 | rect.set_color(0.9 - r, 0.9 + r, 0.9 + r) 314 | elif r > 0: 315 | rect.set_color(0.3, 0.5 + r, 0.3) 316 | else: 317 | rect.set_color(0.9, 0.9, 0.9) 318 | self.viewer.add_geom(rect) 319 | # 绘制边框 320 | v_outline = [(x * u_size + m, y * u_size + m), 321 | ((x + 1) * u_size - m, y * u_size + m), 322 | ((x + 1) * u_size - m, (y + 1) * u_size - m), 323 | (x * u_size + m, (y + 1) * u_size - m)] 324 | outline = rendering.make_polygon(v_outline, False) 325 | outline.set_linewidth(3) 326 | 327 | if self._is_end_state(x, y): 328 | # 给终点方格添加金黄色边框 329 | outline.set_color(0.9, 0.9, 0) 330 | self.viewer.add_geom(outline) 331 | if self.start[0] == x and self.start[1] == y: 332 | outline.set_color(0.5, 0.5, 0.8) 333 | self.viewer.add_geom(outline) 334 | if self.grids.get_type(x, y) == 1: # 障碍格子用深灰色表示 335 | rect.set_color(0.3, 0.3, 0.3) 336 | else: 337 | pass 338 | # 绘制个体 339 | self.agent = rendering.make_circle(u_size / 4, 30, True) 340 | self.agent.set_color(1.0, 1.0, 0.0) 341 | self.viewer.add_geom(self.agent) 342 | self.agent_trans = rendering.Transform() 343 | self.agent.add_attr(self.agent_trans) 344 | 345 | # 更新个体位置 346 | x, y = 
self._state_to_xy(self.state) 347 | self.agent_trans.set_translation((x + 0.5) * u_size, (y + 0.5) * u_size) 348 | 349 | return self.viewer.render(return_rgb_array= mode == 'rgb_array') 350 | 351 | def LargeGridWorld(): 352 | '''10*10的一个格子世界环境,设置参照: 353 | http://cs.stanford.edu/people/karpathy/reinforcejs/gridworld_td.html 354 | ''' 355 | env = GridWorldEnv(n_width=10, 356 | n_height=10, 357 | u_size=40, 358 | default_reward=0, 359 | default_type=0, 360 | windy=False) 361 | env.start = (0, 9) 362 | env.ends = [(5, 4)] 363 | env.types = [(4, 2, 1), (4, 3, 1), (4, 4, 1), (4, 5, 1), (4, 6, 1), (4, 7, 1), 364 | (1, 7, 1), (2, 7, 1), (3, 7, 1), (4, 7, 1), (6, 7, 1), (7, 7, 1), 365 | (8, 7, 1)] 366 | env.rewards = [(3, 2, -1), (3, 6, -1), (5, 2, -1), (6, 2, -1), (8, 3, -1), 367 | (8, 4, -1), (5, 4, 1), (6, 4, -1), (5, 5, -1), (6, 5, -1)] 368 | env.refresh_setting() 369 | return env 370 | 371 | def SimpleGridWorld(): 372 | '''无风10*7的格子,设置参照: David Silver强化学习公开课视频 第3讲 373 | ''' 374 | env = GridWorldEnv(n_width=10, 375 | n_height=7, 376 | u_size=60, 377 | default_reward=-1, 378 | default_type=0, 379 | windy=False) 380 | env.start = (0, 3) 381 | env.ends = [(7, 3)] 382 | env.rewards = [(7, 3, 1)] 383 | env.refresh_setting() 384 | return env 385 | 386 | def WindyGridWorld(): 387 | '''有风10*7的格子,设置参照: David Silver强化学习公开课视频 第5讲 388 | ''' 389 | env = GridWorldEnv(n_width=10, 390 | n_height = 7, 391 | u_size = 60, 392 | default_reward = -1, 393 | default_type = 0, 394 | windy=True) 395 | env.start = (0,3) 396 | env.ends = [(7,3)] 397 | env.rewards = [(7,3,1)] 398 | 399 | env.refresh_setting() 400 | return env 401 | 402 | def RandomWalk(): 403 | '''随机行走示例环境 404 | ''' 405 | env = GridWorldEnv(n_width=7, 406 | n_height = 1, 407 | u_size = 80, 408 | default_reward = 0, 409 | default_type = 0, 410 | windy=False) 411 | env.action_space = spaces.Discrete(2) # left or right 412 | env.start = (3,0) 413 | env.ends = [(6,0),(0,0)] 414 | env.rewards = [(6,0,1)] 415 | env.refresh_setting() 416 | return env 417 | 418 | def CliffWalk(): 419 | '''悬崖行走格子世界环境 420 | ''' 421 | env = GridWorldEnv(n_width=12, 422 | n_height = 4, 423 | u_size = 60, 424 | default_reward = -1, 425 | default_type = 0, 426 | windy=False) 427 | env.action_space = spaces.Discrete(4) # left or right 428 | env.start = (0,0) 429 | env.ends = [(11,0)] 430 | # env.rewards=[] 431 | # env.types = [(5,1,1),(5,2,1)] 432 | for i in range(10): 433 | env.rewards.append((i+1,0,-100)) 434 | env.ends.append((i+1,0)) 435 | env.refresh_setting() 436 | return env 437 | 438 | def SkullAndTreasure(): 439 | '''骷髅与钱币示例,解释随机策略的有效性 David Silver 强化学习公开课第六讲 策略梯度 440 | ''' 441 | env = GridWorldEnv(n_width=5, 442 | n_height=2, 443 | u_size=60, 444 | default_reward=-1, 445 | default_type=0, 446 | windy=False) 447 | env.action_space = spaces.Discrete(4) # left or right 448 | env.start = (0, 1) 449 | env.ends = [(2, 0)] 450 | env.rewards = [(0, 0, -100), (2, 0, 100), (4, 0, -100)] 451 | env.types = [(1, 0, 1), (3, 0, 1)] 452 | env.refresh_setting() 453 | return env 454 | 455 | if __name__ == "__main__": 456 | env = GridWorldEnv() 457 | print("hello") 458 | env.reset() 459 | nfs = env.observation_space 460 | nfa = env.action_space 461 | print("nfs:%s; nfa:%s" % (nfs, nfa)) 462 | print(env.observation_space) 463 | print(env.action_space) 464 | print(env.state) 465 | env.render() 466 | # x = input("press any key to exit") 467 | for _ in range(20000): 468 | env.render() 469 | a = env.action_space.sample() 470 | state, reward, isdone, info = env.step(a) 471 | print("{0}, {1}, {2}, 
{3}".format(a, reward, isdone, info)) 472 | 473 | print("env closed") 474 | -------------------------------------------------------------------------------- /01-blog_code/Tic-Tac-Toe/example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | 4 | BOARD_ROWS = 3 5 | BOARD_COLS = 3 6 | BOARD_SIZE = BOARD_ROWS * BOARD_COLS 7 | 8 | class State: 9 | def __init__(self): 10 | # the board is represented by an n * n array, 11 | # 1 represents a chessman of the player who moves first, 12 | # -1 represents a chessman of another player 13 | # 0 represents an empty position 14 | self.data = np.zeros((BOARD_ROWS, BOARD_COLS)) 15 | self.winner = None 16 | self.hash_val = None 17 | self.end = None 18 | 19 | # compute the hash value for one state, it's unique 20 | def hash(self): 21 | if self.hash_val is None: 22 | self.hash_val = 0 23 | for i in self.data.reshape(BOARD_ROWS * BOARD_COLS): 24 | if i == -1: 25 | i = 2 26 | self.hash_val = self.hash_val * 3 + i 27 | return int(self.hash_val) 28 | 29 | # check whether a player has won the game, or it's a tie 30 | def is_end(self): 31 | if self.end is not None: 32 | return self.end 33 | results = [] 34 | # check row 35 | for i in range(0, BOARD_ROWS): 36 | results.append(np.sum(self.data[i, :])) 37 | # check columns 38 | for i in range(0, BOARD_COLS): 39 | results.append(np.sum(self.data[:, i])) 40 | 41 | # check diagonals 42 | results.append(0) 43 | for i in range(0, BOARD_ROWS): 44 | results[-1] += self.data[i, i] 45 | results.append(0) 46 | for i in range(0, BOARD_ROWS): 47 | results[-1] += self.data[i, BOARD_ROWS - 1 - i] 48 | 49 | for result in results: 50 | if result == 3: 51 | self.winner = 1 52 | self.end = True 53 | return self.end 54 | if result == -3: 55 | self.winner = -1 56 | self.end = True 57 | return self.end 58 | 59 | # whether it's a tie 60 | sum = np.sum(np.abs(self.data)) 61 | if sum == BOARD_ROWS * BOARD_COLS: 62 | self.winner = 0 63 | self.end = True 64 | return self.end 65 | 66 | # game is still going on 67 | self.end = False 68 | return self.end 69 | 70 | # @symbol: 1 or -1 71 | # put chessman symbol in position (i, j) 72 | def next_state(self, i, j, symbol): 73 | new_state = State() 74 | new_state.data = np.copy(self.data) 75 | new_state.data[i, j] = symbol 76 | return new_state 77 | 78 | # print the board 79 | def print(self): 80 | for i in range(0, BOARD_ROWS): 81 | print('-------------') 82 | out = '| ' 83 | for j in range(0, BOARD_COLS): 84 | if self.data[i, j] == 1: 85 | token = '*' 86 | if self.data[i, j] == 0: 87 | token = '0' 88 | if self.data[i, j] == -1: 89 | token = 'x' 90 | out += token + ' | ' 91 | print(out) 92 | print('-------------') 93 | 94 | def get_all_states_impl(current_state, current_symbol, all_states): 95 | for i in range(0, BOARD_ROWS): 96 | for j in range(0, BOARD_COLS): 97 | if current_state.data[i][j] == 0: 98 | newState = current_state.next_state(i, j, current_symbol) 99 | newHash = newState.hash() 100 | if newHash not in all_states.keys(): 101 | isEnd = newState.is_end() 102 | all_states[newHash] = (newState, isEnd) 103 | if not isEnd: 104 | get_all_states_impl(newState, -current_symbol, all_states) 105 | 106 | def get_all_states(): 107 | current_symbol = 1 108 | current_state = State() 109 | all_states = dict() 110 | all_states[current_state.hash()] = (current_state, current_state.is_end()) 111 | get_all_states_impl(current_state, current_symbol, all_states) 112 | return all_states 113 | 114 | # all possible board configurations 
115 | all_states = get_all_states() 116 | 117 | class Judger: 118 | # @player1: the player who will move first, its chessman will be 1 119 | # @player2: another player with a chessman -1 120 | # @feedback: if True, both players will receive rewards when game is end 121 | def __init__(self, player1, player2): 122 | self.p1 = player1 123 | self.p2 = player2 124 | self.current_player = None 125 | self.p1_symbol = 1 126 | self.p2_symbol = -1 127 | self.p1.set_symbol(self.p1_symbol) 128 | self.p2.set_symbol(self.p2_symbol) 129 | self.current_state = State() 130 | 131 | def reset(self): 132 | self.p1.reset() 133 | self.p2.reset() 134 | 135 | def alternate(self): 136 | while True: 137 | yield self.p1 138 | yield self.p2 139 | 140 | # @print: if True, print each board during the game 141 | def play(self, print=False): 142 | alternator = self.alternate() 143 | self.reset() 144 | current_state = State() 145 | self.p1.set_state(current_state) 146 | self.p2.set_state(current_state) 147 | while True: 148 | player = next(alternator) 149 | if print: 150 | current_state.print() 151 | [i, j, symbol] = player.act() 152 | next_state_hash = current_state.next_state(i, j, symbol).hash() 153 | current_state, is_end = all_states[next_state_hash] 154 | self.p1.set_state(current_state) 155 | self.p2.set_state(current_state) 156 | if is_end: 157 | if print: 158 | current_state.print() 159 | return current_state.winner 160 | 161 | # AI player 162 | class Player: 163 | # @step_size: the step size to update estimations 164 | # @epsilon: the probability to explore 165 | def __init__(self, step_size=0.1, epsilon=0.1): 166 | self.estimations = dict() 167 | self.step_size = step_size 168 | self.epsilon = epsilon 169 | self.states = [] 170 | self.greedy = [] 171 | 172 | def reset(self): 173 | self.states = [] 174 | self.greedy = [] 175 | 176 | def set_state(self, state): 177 | self.states.append(state) 178 | self.greedy.append(True) 179 | 180 | def set_symbol(self, symbol): 181 | self.symbol = symbol 182 | for hash_val in all_states.keys(): 183 | (state, is_end) = all_states[hash_val] 184 | if is_end: 185 | if state.winner == self.symbol: 186 | self.estimations[hash_val] = 1.0 187 | elif state.winner == 0: 188 | # we need to distinguish between a tie and a lose 189 | self.estimations[hash_val] = 0.5 190 | else: 191 | self.estimations[hash_val] = 0 192 | else: 193 | self.estimations[hash_val] = 0.5 194 | 195 | # update value estimation 196 | def backup(self): 197 | # for debug 198 | # print('player trajectory') 199 | # for state in self.states: 200 | # state.print() 201 | 202 | self.states = [state.hash() for state in self.states] 203 | 204 | for i in reversed(range(len(self.states) - 1)): 205 | state = self.states[i] 206 | td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state]) 207 | self.estimations[state] += self.step_size * td_error 208 | 209 | # choose an action based on the state 210 | def act(self): 211 | state = self.states[-1] 212 | next_states = [] 213 | next_positions = [] 214 | for i in range(BOARD_ROWS): 215 | for j in range(BOARD_COLS): 216 | if state.data[i, j] == 0: 217 | next_positions.append([i, j]) 218 | next_states.append(state.next_state(i, j, self.symbol).hash()) 219 | 220 | if np.random.rand() < self.epsilon: 221 | action = next_positions[np.random.randint(len(next_positions))] 222 | action.append(self.symbol) 223 | self.greedy[-1] = False 224 | return action 225 | 226 | values = [] 227 | for hash, pos in zip(next_states, next_positions): 228 | 
values.append((self.estimations[hash], pos)) 229 | np.random.shuffle(values) 230 | values.sort(key=lambda x: x[0], reverse=True) 231 | action = values[0][1] 232 | action.append(self.symbol) 233 | return action 234 | 235 | def save_policy(self): 236 | with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f: 237 | pickle.dump(self.estimations, f) 238 | 239 | def load_policy(self): 240 | with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f: 241 | self.estimations = pickle.load(f) 242 | 243 | # human interface 244 | # input a number to put a chessman 245 | # | q | w | e | 246 | # | a | s | d | 247 | # | z | x | c | 248 | class HumanPlayer: 249 | def __init__(self, **kwargs): 250 | self.symbol = None 251 | self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c'] 252 | self.state = None 253 | return 254 | 255 | def reset(self): 256 | return 257 | 258 | def set_state(self, state): 259 | self.state = state 260 | 261 | def set_symbol(self, symbol): 262 | self.symbol = symbol 263 | return 264 | 265 | def backup(self, _): 266 | return 267 | 268 | def act(self): 269 | self.state.print() 270 | key = input("Input your position:") 271 | data = self.keys.index(key) 272 | i = data // int(BOARD_COLS) 273 | j = data % BOARD_COLS 274 | return (i, j, self.symbol) 275 | 276 | def train(epochs): 277 | player1 = Player(epsilon=0.01) 278 | player2 = Player(epsilon=0.01) 279 | judger = Judger(player1, player2) 280 | player1_win = 0.0 281 | player2_win = 0.0 282 | for i in range(1, epochs + 1): 283 | winner = judger.play(print=False) 284 | if winner == 1: 285 | player1_win += 1 286 | if winner == -1: 287 | player2_win += 1 288 | print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i)) 289 | player1.backup() 290 | player2.backup() 291 | judger.reset() 292 | player1.save_policy() 293 | player2.save_policy() 294 | 295 | def compete(turns): 296 | player1 = Player(epsilon=0) 297 | player2 = Player(epsilon=0) 298 | judger = Judger(player1, player2) 299 | player1.load_policy() 300 | player2.load_policy() 301 | player1_win = 0.0 302 | player2_win = 0.0 303 | for i in range(0, turns): 304 | winner = judger.play() 305 | if winner == 1: 306 | player1_win += 1 307 | if winner == -1: 308 | player2_win += 1 309 | judger.reset() 310 | print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns)) 311 | 312 | # The game is a zero sum game. If both players are playing with an optimal strategy, every game will end in a tie. 313 | # So we test whether the AI can guarantee at least a tie if it goes second. 
314 | def play(): 315 | while True: 316 | player1 = HumanPlayer() 317 | player2 = Player(epsilon=0) 318 | judger = Judger(player1, player2) 319 | player2.load_policy() 320 | winner = judger.play() 321 | if winner == player2.symbol: 322 | print("You lose!") 323 | elif winner == player1.symbol: 324 | print("You win!") 325 | else: 326 | print("It is a tie!") 327 | 328 | if __name__ == '__main__': 329 | train(int(1e5)) 330 | compete(int(1e3)) 331 | play() 332 | -------------------------------------------------------------------------------- /01-blog_code/core/core.py: -------------------------------------------------------------------------------- 1 | #!/home/python3.5 2 | # -*- coding: utf-8 -*- 3 | # core file of reinforcment learning 4 | 5 | from random import random, choice 6 | import gym 7 | from gym import Env 8 | import numpy as np 9 | from collections import namedtuple 10 | from typing import List 11 | import random 12 | 13 | 14 | class State(object): 15 | def __init__(self, name): 16 | self.name = name 17 | 18 | 19 | class Transition(object): 20 | def __init__(self, s0, a0, reward: float, is_done: bool, s1): 21 | self.data = [s0, a0, reward, is_done, s1] 22 | 23 | # 如果一个类想被用于for ... in循环,类似list或tuple那样,就必须实现一个__iter__()方法, 24 | # 该方法返回一个迭代对象 25 | def __iter__(self): 26 | return iter(self.data) 27 | 28 | def __str__(self): 29 | return "s:{0:<3} a:{1:<3} r:{2:<4} is_end:{3:<5} s1:{4:<3}". \ 30 | format(self.data[0], 31 | self.data[1], 32 | self.data[2], 33 | self.data[3], 34 | self.data[4]) 35 | 36 | @property 37 | def s0(self): return self.data[0] 38 | 39 | @property 40 | def a0(self): return self.data[1] 41 | 42 | @property 43 | def reward(self): return self.data[2] 44 | 45 | @property 46 | def is_done(self): return self.data[3] 47 | 48 | @property 49 | def s1(self): return self.data[4] 50 | 51 | 52 | class Episode(object): 53 | def __init__(self, e_id: int = 0) -> None: 54 | self.total_reward = 0 # 总的获得的奖励 55 | self.trans_list = [] # 状态转移列表 56 | self.name = str(e_id) # 可以给Episode起个名字 57 | 58 | def push(self, trans: Transition) -> float: 59 | self.trans_list.append(trans) 60 | self.total_reward += trans.reward 61 | return self.total_reward 62 | 63 | @property 64 | def len(self): 65 | return len(self.trans_list) 66 | 67 | def __str__(self): 68 | return "episode {0:<4} {1:>4} steps,total reward:{2:<8.2f}". \ 69 | format(self.name, self.len, self.total_reward) 70 | 71 | def print_detail(self): 72 | print("detail of ({0}):".format(self)) 73 | for i, trans in enumerate(self.trans_list): 74 | print("step{0:<4} ".format(i), end=" ") 75 | print(trans) 76 | 77 | def pop(self) -> Transition: 78 | '''normally this method shouldn't be invoked. 79 | ''' 80 | if self.len > 1: 81 | trans = self.trans_list.pop() 82 | self.total_reward -= trans.reward 83 | return trans 84 | else: 85 | return None 86 | 87 | def is_complete(self) -> bool: 88 | '''check if an episode is an complete episode 89 | ''' 90 | if self.len == 0: 91 | return False 92 | return self.trans_list[self.len - 1].is_done 93 | 94 | def sample(self, batch_size=1): 95 | '''随机产生一个trans 96 | ''' 97 | return random.sample(self.trans_list, k=batch_size) 98 | 99 | def __len__(self) -> int: 100 | return self.len 101 | 102 | 103 | class Experience(object): 104 | '''this class is used to record the whole experience of an agent organized 105 | by an episode list. agent can randomly sample transitions or episodes from 106 | its experience. 
107 | ''' 108 | 109 | def __init__(self, capacity: int = 20000): 110 | self.capacity = capacity # 容量:指的是trans总数量 111 | self.episodes = [] # episode列表 112 | self.next_id = 0 # 下一个episode的Id 113 | self.total_trans = 0 114 | 115 | def __str__(self): 116 | return "exp info:{0:5} episodes, memory usage {1}/{2}". \ 117 | format(self.len, self.total_trans, self.capacity) 118 | 119 | @property 120 | def len(self): 121 | return len(self.episodes) 122 | 123 | def __len__(self): 124 | return self.len 125 | 126 | def __remove(self, index=0): 127 | '''扔掉一个Episode,默认第一个。 128 | remove an episode, defautly the first one. 129 | args: 130 | the index of the episode to remove 131 | return: 132 | if exists return the episode else return None 133 | ''' 134 | if index > self.len - 1: 135 | raise (Exception("invalid index")) 136 | if self.len > 0: 137 | episode = self.episodes[index] 138 | self.episodes.remove(episode) 139 | self.total_trans -= episode.len 140 | return episode 141 | else: 142 | return None 143 | 144 | def __remove_first(self): 145 | self.__remove(index=0) 146 | 147 | def push(self, trans): 148 | '''压入一个状态转换 149 | ''' 150 | if self.capacity <= 0: 151 | return 152 | while self.total_trans >= self.capacity: # 可能会有空episode吗? 153 | episode = self.__remove_first() 154 | cur_episode = None 155 | if self.len == 0 or self.episodes[self.len - 1].is_complete(): 156 | cur_episode = Episode(self.next_id) 157 | self.next_id += 1 158 | self.episodes.append(cur_episode) 159 | else: 160 | cur_episode = self.episodes[self.len - 1] 161 | self.total_trans += 1 162 | return cur_episode.push(trans) # return total reward of an episode 163 | 164 | def sample(self, batch_size): # sample transition 165 | '''randomly sample some transitions from agent's experience.abs 166 | 随机获取一定数量的状态转化对象Transition 167 | args: 168 | number of transitions need to be sampled 169 | return: 170 | list of Transition. 
171 | ''' 172 | sample_trans = [] 173 | for _ in range(batch_size): 174 | index = int(random.random() * self.len) 175 | sample_trans += self.episodes[index].sample() 176 | return sample_trans 177 | 178 | def sample_episode(self, episode_num=1): # sample episode 179 | '''随机获取一定数量完整的Episode 180 | ''' 181 | return random.sample(self.episodes, k=episode_num) 182 | 183 | @property 184 | def last(self): 185 | if self.len > 0: 186 | return self.episodes[self.len - 1] 187 | return None 188 | 189 | 190 | class Agent(object): 191 | '''Base Class of Agent 192 | ''' 193 | def __init__(self, env: Env = None, 194 | trans_capacity = 0): 195 | # 保存一些Agent可以观测到的环境信息以及已经学到的经验 196 | self.env = env 197 | self.obs_space = env.observation_space if env is not None else None 198 | self.action_space = env.action_space if env is not None else None 199 | self.experience = Experience(capacity=trans_capacity) 200 | # 有一个变量记录agent当前的state相对来说还是比较方便的。要注意对该变量的维护、更新 201 | self.state = None # current observation of an agent 202 | 203 | def performPolicy(self, policy_fun, s): 204 | if policy_fun is None: 205 | return self.action_space.sample() 206 | return policy_fun(s) 207 | 208 | def act(self, a0): 209 | s0 = self.state 210 | s1, r1, is_done, info = self.env.step(a0) 211 | # TODO add extra code here 212 | trans = Transition(s0, a0, r1, is_done, s1) 213 | total_reward = self.experience.push(trans) 214 | self.state = s1 215 | return s1, r1, is_done, info, total_reward 216 | 217 | def learning(self): 218 | '''need to be implemented by all subclasses 219 | ''' 220 | raise NotImplementedError 221 | 222 | def sample(self, batch_size=64): 223 | '''随机取样 224 | ''' 225 | return self.experience.sample(batch_size) 226 | 227 | @property 228 | def total_trans(self): 229 | '''得到Experience里记录的总的状态转换数量 230 | ''' 231 | return self.experience.total_trans 232 | -------------------------------------------------------------------------------- /01-blog_code/dqn/approxagent.py: -------------------------------------------------------------------------------- 1 | # /usr/local/bin/python3.7 2 | # -*- coding:utf-8 -*- 3 | 4 | from random import random, choice 5 | from gym import Env, spaces 6 | import sys 7 | import gym 8 | import numpy as np 9 | from approximator import Approximator 10 | import torch 11 | 12 | sys.path.append('../Gridworld2') 13 | from gridworld2 import * 14 | 15 | sys.path.append('../core') 16 | from core import Transition, Experience, Agent 17 | 18 | 19 | class ApproxQAgent(Agent): 20 | '''使用近似的价值函数实现的Q学习的个体 21 | ''' 22 | 23 | def __init__(self, env: Env = None, 24 | trans_capacity=20000, 25 | hidden_dim: int = 16): 26 | if env is None: 27 | raise Exception("agent should have an environment") 28 | super(ApproxQAgent, self).__init__(env, trans_capacity) 29 | self.input_dim, self.output_dim = 1, 1 30 | 31 | # 适应不同的状态和行为空间类型 32 | if isinstance(env.observation_space, spaces.Discrete): 33 | self.input_dim = 1 34 | elif isinstance(env.observation_space, spaces.Box): 35 | self.input_dim = env.observation_space.shape[0] 36 | 37 | if isinstance(env.action_space, spaces.Discrete): 38 | self.output_dim = env.action_space.n 39 | elif isinstance(env.action_space, spaces.Box): 40 | self.output_dim = env.action_space.shape[0] 41 | # print("{},{}".format(self.input_dim, self.output_dim)) 42 | 43 | # 隐藏层神经元数目 44 | self.hidden_dim = hidden_dim 45 | # 关键在下面两句,声明了两个近似价值函数 46 | # 变量Q是一个计算价值,产生loss的近似函数(网络), 47 | # 该网络参数在一定时间段内不更新参数 48 | self.Q = Approximator(dim_input=self.input_dim, 49 | dim_output=self.output_dim, 50 | dim_hidden=self.hidden_dim) 
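        # (Reading note, added for clarity; not in the original file.) self.Q above acts as
        # the target network of DQN: it is held fixed while a sampled batch is fitted and is
        # only refreshed by _update_Q_net(). self.PQ below is the online network that is
        # trained in _learning_from_memory() and used by _curPolicy() to pick actions.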
51 | # 变量PQ是一个生成策略的近似函数,该函数(网络)的参数频繁更新 52 | # 更新参数的网络 53 | self.PQ = self.Q.clone() 54 | return 55 | 56 | def _learning_from_memory(self, gamma, batch_size, learning_rate, epochs): 57 | # 随机获取记忆里的Transmition 58 | trans_pieces = self.sample(batch_size) 59 | states_0 = np.vstack([x.s0 for x in trans_pieces]) 60 | actions_0 = np.array([x.a0 for x in trans_pieces]) 61 | reward_1 = np.array([x.reward for x in trans_pieces]) 62 | is_done = np.array([x.is_done for x in trans_pieces]) 63 | states_1 = np.vstack([x.s1 for x in trans_pieces]) 64 | 65 | X_batch = states_0 66 | # 调用的时approximator的__call__方法 67 | y_batch = self.Q(states_0) 68 | 69 | # 使用了Batch,代码是矩阵运算 70 | # np.max => axis=1时取出最大的一列;axis=0时取出最大的一行 71 | # ~ True = -2; ~ False = -1 72 | Q_target = reward_1 + gamma * np.max(self.Q(states_1), axis=1) * (~ is_done) 73 | y_batch[np.arange(len(X_batch)), actions_0] = Q_target 74 | # loss is a torch Variable with size of 1 75 | loss = self.PQ.fit(x=X_batch, 76 | y=y_batch, 77 | learning_rate=learning_rate, 78 | epochs=epochs) 79 | mean_loss = loss.sum().item() / batch_size 80 | self._update_Q_net() 81 | return mean_loss 82 | 83 | def learning(self, gamma=0.99, 84 | learning_rate=1e-5, 85 | max_episodes=1000, 86 | batch_size=64, 87 | min_epsilon=0.2, 88 | epsilon_factor=0.1, 89 | epochs=1): 90 | '''learning的主要工作是构建经历,当构建的经历足够时,同时启动基于经历的学习 91 | ''' 92 | total_steps, step_in_episode, num_episode = 0, 0, 0 93 | target_episode = max_episodes * epsilon_factor 94 | while num_episode < max_episodes: 95 | epsilon = self._decayed_epsilon(cur_episode=num_episode, 96 | min_epsilon=min_epsilon, 97 | max_epsilon=1, 98 | target_episode=target_episode) 99 | self.state = self.env.reset() 100 | self.env.render() 101 | step_in_episode = 0 102 | loss, mean_loss = 0.00, 0.00 103 | is_done = False 104 | while not is_done: 105 | s0 = self.state 106 | a0 = self.performPolicy(s0, epsilon) 107 | # act方法封装了将Transition记录至Experience中的过程 108 | s1, r1, is_done, info, total_reward = self.act(a0) 109 | # self.env.render() 110 | step_in_episode += 1 111 | # 当经历里有足够大小的Transition时,开始启用基于经历的学习 112 | if self.total_trans > batch_size: 113 | loss += self._learning_from_memory(gamma, 114 | batch_size, 115 | learning_rate, 116 | epochs) 117 | mean_loss = loss / step_in_episode 118 | print("{0} epsilon:{1:3.2f}, loss:{2:.3f}". 
119 | format(self.experience.last, epsilon, mean_loss)) 120 | # print(self.experience) 121 | total_steps += step_in_episode 122 | num_episode += 1 123 | return 124 | 125 | def _decayed_epsilon(self, cur_episode: int, 126 | min_epsilon: float, 127 | max_epsilon: float, 128 | target_episode: int) -> float: 129 | '''获得一个在一定范围内的epsilon 130 | ''' 131 | slope = (min_epsilon - max_epsilon) / (target_episode) 132 | intercept = max_epsilon 133 | return max(min_epsilon, slope * cur_episode + intercept) 134 | 135 | def _curPolicy(self, s, epsilon=None): 136 | '''依据更新策略的价值函数(网络)产生一个行为 137 | ''' 138 | Q_s = self.PQ(s) 139 | rand_value = random() 140 | if epsilon is not None and rand_value < epsilon: 141 | return self.env.action_space.sample() 142 | else: 143 | return int(np.argmax(Q_s)) 144 | 145 | def performPolicy(self, s, epsilon=None): 146 | return self._curPolicy(s, epsilon) 147 | 148 | def _update_Q_net(self): 149 | '''将更新策略的Q网络(连带其参数)复制给输出目标Q值的网络 150 | ''' 151 | self.Q = self.PQ.clone() 152 | 153 | 154 | def testApproxQAgent(): 155 | env = gym.make("MountainCar-v0") 156 | # env = gym.make("PuckWorld-v0") 157 | # env = SimpleGridWorld() 158 | 159 | # 保存训练的视频 160 | # directory = "/home/reinforce/monitor" 161 | # env = gym.wrappers.Monitor(env, directory, force=True) 162 | 163 | agent = ApproxQAgent(env, 164 | trans_capacity=10000, # 记忆容量(按状态转换数计) 165 | hidden_dim=16) # 隐藏神经元数量 166 | env.reset() 167 | print("Learning...") 168 | agent.learning(gamma=0.99, # 衰减引子 169 | learning_rate=1e-3, # 学习率 170 | batch_size=64, # 集中学习的规模 171 | max_episodes=2000, # 最大训练Episode数量 172 | min_epsilon=0.01, # 最小Epsilon 173 | epsilon_factor=0.3, # 开始使用最小Epsilon时Episode的序号占最大 174 | # Episodes序号之比,该比值越小,表示使用 175 | # min_epsilon的episode越多 176 | epochs=2 # 每个batch_size训练的次数 177 | ) 178 | 179 | 180 | if __name__ == "__main__": 181 | testApproxQAgent() 182 | -------------------------------------------------------------------------------- /01-blog_code/dqn/approximator.py: -------------------------------------------------------------------------------- 1 | # /usr/local/bin/python3.7 2 | # -*- coding:utf-8 -*- 3 | # function approximators of reinforcment learning 4 | 5 | import numpy as np 6 | import torch 7 | from torch.autograd import Variable 8 | import copy 9 | 10 | 11 | class Approximator(torch.nn.Module): 12 | '''base class of different function approximator subclasses 13 | ''' 14 | 15 | def __init__(self, dim_input=1, dim_output=1, dim_hidden=16): 16 | super(Approximator, self).__init__() 17 | self.dim_input = dim_input 18 | self.dim_output = dim_output 19 | self.dim_hidden = dim_hidden 20 | 21 | # function Linear:__init(inputSize, outputSize) 22 | # hidden layer 23 | self.linear1 = torch.nn.Linear(self.dim_input, self.dim_hidden) 24 | self.linear2 = torch.nn.Linear(self.dim_hidden, self.dim_output) 25 | pass 26 | 27 | def predict(self, x): 28 | # 实现ReLU:->max(0, x) 29 | # torch.clamp(input,min,max,out=None)-> Tensor 30 | # 将input中的元素限制在[min,max]范围内并返回一个Tensor 31 | h_relu = self.linear1(x).clamp(min=0) 32 | y_pred = self.linear2(h_relu) 33 | return y_pred 34 | 35 | def fit(self, x, 36 | y, 37 | criterion=None, 38 | optimizer=None, 39 | epochs=1, 40 | learning_rate=1e-4): 41 | if criterion is None: 42 | # MSELoss(reduce=False, size_average=False) 43 | # 如果 reduce = False,那么 size_average 参数失效,直接返回向量形式的 loss; 44 | # 如果 reduce = True,那么 loss 返回的是标量 45 | # 如果 size_average = True,返回 loss.mean(); 46 | # 如果 size_average = False,返回 loss.sum(); 47 | criterion = torch.nn.MSELoss(size_average=False) 48 | if optimizer is None: 49 | # 
Adam(Adaptive Moment Estimation)本质上是带有动量项的RMSprop 50 | optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 51 | if epochs < 1: 52 | epochs = 1 53 | 54 | x = self._prepare_data(x) 55 | y = self._prepare_data(y, False) 56 | 57 | for t in range(epochs): 58 | y_pred = self.predict(x) 59 | loss = criterion(y_pred, y) 60 | # 把梯度置零,也就是把loss关于weight的导数变成0 61 | optimizer.zero_grad() 62 | loss.backward() 63 | optimizer.step() 64 | 65 | return loss 66 | 67 | def _prepare_data(self, x, requires_grad=True): 68 | '''将numpy格式的数据转化为Torch的Variable 69 | ''' 70 | if isinstance(x, np.ndarray): 71 | x = Variable(torch.from_numpy(x), requires_grad=requires_grad) 72 | if isinstance(x, int): 73 | x = Variable(torch.Tensor([[x]]), requires_grad=requires_grad) 74 | # 从from_numpy()转换过来的数据是DoubleTensor形式 75 | x = x.float() 76 | if x.data.dim() == 1: 77 | # 增加一个纬度 78 | x = x.unsqueeze(0) 79 | return x 80 | 81 | def __call__(self, x): 82 | '''根据输入返回输出,类似于 predict 函数 83 | ''' 84 | x = self._prepare_data(x) 85 | pred = self.predict(x) 86 | return pred.data.numpy() 87 | 88 | def clone(self): 89 | '''返回当前模型的深度拷贝对象 90 | ''' 91 | return copy.deepcopy(self) 92 | 93 | 94 | def test(): 95 | N, D_in, H, D_out = 64, 100, 50, 1 96 | # torch.rand(*sizes, out=None) → Tensor 97 | # 返回一个张量,包含了从区间[0, 1)的均匀分布中抽取的一组随机数。张量的形状由参数sizes定义。 98 | # torch.randn(*sizes, out=None) → Tensor,服从标准正态分布 99 | x = Variable(torch.randn(N, D_in)) 100 | y = Variable(torch.randn(N, D_out), requires_grad=False) 101 | 102 | model = Approximator(D_in, D_out, H) 103 | 104 | model.fit(x, y, epochs=1000) 105 | print(x[2]) 106 | y_pred = model.predict(x[2]) 107 | print(y[2]) 108 | print(y_pred) 109 | new_model = model.clone() 110 | new_pred = new_model.predict(x[2]) 111 | print(new_pred) 112 | print(model is new_model) 113 | 114 | 115 | if __name__ == "__main__": 116 | test() 117 | -------------------------------------------------------------------------------- /01-blog_code/puckworld/puckworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | PuckWorld Environment for OpenAI gym 3 | The data used in this model comes from: 4 | http://cs.stanford.edu/people/karpathy/reinforcejs/puckworld.html 5 | """ 6 | 7 | import math 8 | import gym 9 | from gym import spaces 10 | from gym.utils import seeding 11 | import numpy as np 12 | 13 | RAD2DEG = 57.29577951308232 # 弧度与角度换算关系1弧度=57.29..角度 14 | 15 | 16 | class PuckWorldEnv(gym.Env): 17 | metadata = { 18 | 'render.modes': ['human', 'rgb_array'], 19 | 'video.frames_per_second': 30 20 | } 21 | 22 | def __init__(self): 23 | self.width = 600 # 场景宽度 screen width 24 | self.height = 600 # 场景长度 25 | self.l_unit = 1.0 # 场景长度单位 pysical world width 26 | self.v_unit = 1.0 # 速度单位 velocity 27 | self.max_speed = 0.025 # max agent velocity along a axis 28 | 29 | self.re_pos_interval = 30 # 目标重置距离时间 30 | self.accel = 0.002 # agent 加速度 31 | self.rad = 0.05 # agent 半径,目标半径 32 | self.target_rad = 0.01 # target radius. 
33 | self.goal_dis = self.rad # 目标接近距离 expected goal distance 34 | self.t = 0 # puck world clock 35 | self.update_time = 100 # time for target randomize its position 36 | # 作为观察空间每一个特征值的下限 37 | self.low = np.array([0, # agent position x 38 | 0, 39 | -self.max_speed, # agent velocity 40 | -self.max_speed, 41 | 0, # target position x 42 | 0, 43 | ]) 44 | self.high = np.array([self.l_unit, 45 | self.l_unit, 46 | self.max_speed, 47 | self.max_speed, 48 | self.l_unit, 49 | self.l_unit, 50 | ]) 51 | self.reward = 0 # for rendering 52 | self.action = None # for rendering 53 | self.viewer = None 54 | # 0,1,2,3,4 represent left, right, up, down, -, five moves. 55 | self.action_space = spaces.Discrete(5) 56 | # 观察空间由low和high决定 57 | self.observation_space = spaces.Box(self.low, self.high) 58 | 59 | self._seed() # 产生一个随机数种子 60 | self.reset() 61 | 62 | def _seed(self, seed=None): 63 | # 产生一个随机化时需要的种子,同时返回一个np_random对象,支持后续的随机化生成操作 64 | self.np_random, seed = seeding.np_random(seed) 65 | return [seed] 66 | 67 | def _step(self, action): 68 | assert self.action_space.contains(action), \ 69 | "%r (%s) invalid" % (action, type(action)) 70 | 71 | self.action = action # action for rendering 72 | ppx, ppy, pvx, pvy, tx, ty = self.state # 获取agent位置,速度,目标位置 73 | ppx, ppy = ppx + pvx, ppy + pvy # update agent position 74 | pvx, pvy = pvx * 0.95, pvy * 0.95 # natural velocity loss 75 | 76 | if action == 0: pvx -= self.accel # left 77 | if action == 1: pvx += self.accel # right 78 | if action == 2: pvy += self.accel # up 79 | if action == 3: pvy -= self.accel # down 80 | if action == 4: pass # no move 81 | 82 | if ppx < self.rad: # encounter left bound 83 | pvx *= -0.5 84 | ppx = self.rad 85 | if ppx > 1 - self.rad: # right bound 86 | pvx *= -0.5 87 | ppx = 1 - self.rad 88 | if ppy < self.rad: # bottom bound 89 | pvy *= -0.5 90 | ppy = self.rad 91 | if ppy > 1 - self.rad: # right bound 92 | pvy *= -0.5 93 | ppy = 1 - self.rad 94 | 95 | self.t += 1 96 | if self.t % self.update_time == 0: # update target position 97 | tx = self._random_pos() # randomly 98 | ty = self._random_pos() 99 | 100 | dx, dy = ppx - tx, ppy - ty # calculate distance from 101 | dis = self._compute_dis(dx, dy) # agent to target 102 | 103 | self.reward = self.goal_dis - dis # give an reward 104 | 105 | done = bool(dis <= self.goal_dis) 106 | 107 | self.state = (ppx, ppy, pvx, pvy, tx, ty) 108 | return np.array(self.state), self.reward, done, {} 109 | 110 | def _random_pos(self): 111 | return self.np_random.uniform(low=0, high=self.l_unit) 112 | 113 | def _compute_dis(self, dx, dy): 114 | return math.sqrt(math.pow(dx, 2) + math.pow(dy, 2)) 115 | 116 | def _reset(self): 117 | self.state = np.array([self._random_pos(), 118 | self._random_pos(), 119 | 0, 120 | 0, 121 | self._random_pos(), 122 | self._random_pos() 123 | ]) 124 | return self.state # np.array(self.state) 125 | 126 | def _render(self, mode='human', close=False): 127 | if close: 128 | if self.viewer is not None: 129 | self.viewer.close() 130 | self.viewer = None 131 | return 132 | 133 | scale = self.width / self.l_unit # 计算两者映射关系 134 | rad = self.rad * scale # 随后都是用世界尺寸来描述 135 | t_rad = self.target_rad * scale # target radius 136 | 137 | # 如果还没有设定屏幕对象,则初始化整个屏幕具备的元素。 138 | if self.viewer is None: 139 | from gym.envs.classic_control import rendering 140 | self.viewer = rendering.Viewer(self.width, self.height) 141 | 142 | # 在Viewer里绘制一个几何图像的步骤如下: 143 | # 1. 建立该对象需要的数据本身 144 | # 2. 使用rendering提供的方法返回一个geom对象 145 | # 3. 
对geom对象进行一些对象颜色、线宽、线型、变换属性的设置(有些对象提供一些个 146 | # 性化的方法 147 | # 来设置属性,具体请参考继承自这些Geom的对象),这其中有一个重要的属性就是 148 | # 变换属性,该属性负责对对象在屏幕中的位置、渲染、缩放进行渲染。如果某对象 149 | # 在呈现时可能发生上述变化,则应建立关于该对象的变换属性。该属性是一个 150 | # Transform对象,而一个Transform对象,包括translate、rotate和scale 151 | # 三个属性,每个属性都由以np.array对象描述的矩阵决定。 152 | # 4. 将新建立的geom对象添加至viewer的绘制对象列表里,如果在屏幕上只出现一次, 153 | # 将其加入到add_onegeom()列表中,如果需要多次渲染,则将其加入add_geom() 154 | # 5. 在渲染整个viewer之前,对有需要的geom的参数进行修改,修改主要基于该对象 155 | # 的Transform对象 156 | # 6. 调用Viewer的render()方法进行绘制 157 | 158 | target = rendering.make_circle(t_rad, 30, True) 159 | target.set_color(0.1, 0.9, 0.1) 160 | self.viewer.add_geom(target) 161 | target_circle = rendering.make_circle(t_rad, 30, False) 162 | target_circle.set_color(0, 0, 0) 163 | self.viewer.add_geom(target_circle) 164 | self.target_trans = rendering.Transform() 165 | target.add_attr(self.target_trans) 166 | target_circle.add_attr(self.target_trans) 167 | 168 | self.agent = rendering.make_circle(rad, 30, True) 169 | self.agent.set_color(0, 1, 0) 170 | self.viewer.add_geom(self.agent) 171 | self.agent_trans = rendering.Transform() 172 | self.agent.add_attr(self.agent_trans) 173 | 174 | agent_circle = rendering.make_circle(rad, 30, False) 175 | agent_circle.set_color(0, 0, 0) 176 | agent_circle.add_attr(self.agent_trans) 177 | self.viewer.add_geom(agent_circle) 178 | 179 | # start_p = (0, 0) 180 | # end_p = (0.7 * rad, 0) 181 | # self.line = rendering.Line(start_p, end_p) 182 | # self.line.linewidth = rad / 10 183 | self.line_trans = rendering.Transform() 184 | # self.line.add_attr(self.line_trans) 185 | # self.viewer.add_geom(self.line) 186 | self.arrow = rendering.FilledPolygon([ 187 | (0.7 * rad, 0.15 * rad), 188 | (rad, 0), 189 | (0.7 * rad, -0.15 * rad) 190 | ]) 191 | self.arrow.set_color(0, 0, 0) 192 | self.arrow.add_attr(self.line_trans) 193 | self.viewer.add_geom(self.arrow) 194 | 195 | # 如果已经为屏幕准备好了要绘制的对象 196 | # 本例中唯一要做的就是改变小车的位置和旋转 197 | ppx, ppy, _, _, tx, ty = self.state 198 | self.target_trans.set_translation(tx * scale, ty * scale) 199 | self.agent_trans.set_translation(ppx * scale, ppy * scale) 200 | # 按距离给Agent着色 201 | vv, ms = self.reward + 0.3, 1 202 | r, g, b, = 0, 1, 0 203 | if vv >= 0: 204 | r, g, b = 1 - ms * vv, 1, 1 - ms * vv 205 | else: 206 | r, g, b = 1, 1 + ms * vv, 1 + ms * vv 207 | self.agent.set_color(r, g, b) 208 | 209 | a = self.action 210 | if a in [0, 1, 2, 3]: 211 | # 根据action绘制箭头 212 | degree = 0 213 | if a == 0: 214 | degree = 180 215 | elif a == 1: 216 | degree = 0 217 | elif a == 2: 218 | degree = 90 219 | else: 220 | degree = 270 221 | self.line_trans.set_translation(ppx * scale, ppy * scale) 222 | self.line_trans.set_rotation(degree / RAD2DEG) 223 | # self.line.set_color(0,0,0) 224 | self.arrow.set_color(0, 0, 0) 225 | else: 226 | # self.line.set_color(r,g,b) 227 | self.arrow.set_color(r, g, b) 228 | return self.viewer.render(return_rgb_array=mode == 'rgb_array') 229 | 230 | 231 | if __name__ == "__main__": 232 | env = PuckWorldEnv() 233 | print("hello") 234 | env.reset() 235 | nfs = env.observation_space.shape[0] 236 | nfa = env.action_space 237 | print("nfs:%s; nfa:d" % (nfs)) 238 | print(env.observation_space) 239 | print(env.action_space) 240 | 241 | # for _ in range(10000): 242 | # env.render() 243 | # env.step(env.action_space.sample()) 244 | 245 | print("env closed") 246 | -------------------------------------------------------------------------------- /01-blog_code/sarsa/sarsa(lambda).py: -------------------------------------------------------------------------------- 1 | # -*- coding: 
utf-8 -*- 2 | # An agent powered by Sarsa(lambda) for discrete ovservation 3 | # and action spaces 4 | 5 | from random import random 6 | from gym import Env 7 | import gym 8 | import sys 9 | 10 | sys.path.append('../Gridworld2') 11 | from gridworld2 import * # 可以导入各种格子世界环境 12 | 13 | 14 | class SarsaLambdaAgent(object): 15 | def __init__(self, env: Env): 16 | self.env = env 17 | self.Q = {} # {s0:[,,,,,,],s1:[]} 数组内元素个数为行为空间大小 18 | self.E = {} # Eligibility Trace 19 | self.state = None 20 | self._init_agent() 21 | return 22 | 23 | def _init_agent(self): 24 | self.state = self.env.reset() 25 | s_name = self._name_state(self.state) 26 | self._assert_state_in_QE(s_name, randomized=False) 27 | 28 | # using simple decaying epsilon greedy exploration 29 | def _curPolicy(self, s, num_episode, use_epsilon): 30 | epsilon = 1.00 / (num_episode + 1) # 衰减的epsilon-greedy 31 | Q_s = self.Q[s] 32 | rand_value = random() 33 | if use_epsilon and rand_value < epsilon: 34 | return self.env.action_space.sample() 35 | else: 36 | return int(max(Q_s, key=Q_s.get)) 37 | 38 | # Agent依据当前策略和状态生成下一步与环境交互所要执行的动作 39 | # 该方法并不执行生成的行为 40 | def performPolicy(self, s, num_episode, use_epsilon=True): 41 | return self._curPolicy(s, num_episode, use_epsilon) 42 | 43 | def act(self, a): # Agent执行动作a 44 | return self.env.step(a) 45 | 46 | def learning(self, lambda_, gamma, alpha, max_episode_num): 47 | total_time = 0 48 | num_episode = 1 49 | while num_episode <= max_episode_num: 50 | self._resetEValue() 51 | s0 = self._name_state(self.env.reset()) 52 | a0 = self.performPolicy(s0, num_episode) 53 | self.env.render() 54 | 55 | time_in_episode = 0 56 | is_done = False 57 | while not is_done: 58 | s1, r1, is_done, info = self.act(a0) 59 | self.env.render() 60 | s1 = self._name_state(s1) 61 | self._assert_state_in_QE(s1, randomized=True) 62 | 63 | a1 = self.performPolicy(s1, num_episode) 64 | 65 | q = self._get_(self.Q, s0, a0) 66 | q_prime = self._get_(self.Q, s1, a1) 67 | delta = r1 + gamma * q_prime - q 68 | 69 | e = self._get_(self.E, s0, a0) 70 | e = e + 1 71 | self._set_(self.E, s0, a0, e) # set E before update E 72 | 73 | state_action_list = list(zip(self.E.keys(), self.E.values())) 74 | for s, a_es in state_action_list: 75 | for a in range(self.env.action_space.n): 76 | e_value = a_es[a] 77 | old_q = self._get_(self.Q, s, a) 78 | new_q = old_q + alpha * delta * e_value 79 | new_e = gamma * lambda_ * e_value 80 | self._set_(self.Q, s, a, new_q) 81 | self._set_(self.E, s, a, new_e) 82 | 83 | if num_episode == max_episode_num: 84 | print("t:{0:>2}: s:{1}, a:{2:10}, s1:{3}". 
85 | format(time_in_episode, s0, a0, s1)) 86 | 87 | s0, a0 = s1, a1 88 | time_in_episode += 1 89 | 90 | print("Episode {0} takes {1} steps.".format( 91 | num_episode, time_in_episode)) 92 | total_time += time_in_episode 93 | num_episode += 1 94 | return 95 | 96 | def _is_state_in_Q(self, s): 97 | return self.Q.get(s) is not None 98 | 99 | def _init_state_value(self, s_name, randomized=True): 100 | if not self._is_state_in_Q(s_name): 101 | self.Q[s_name], self.E[s_name] = {}, {} 102 | for action in range(self.env.action_space.n): 103 | default_v = random() / 10 if randomized is True else 0.0 104 | self.Q[s_name][action] = default_v 105 | self.E[s_name][action] = 0.0 106 | 107 | def _assert_state_in_QE(self, s, randomized=True): 108 | if not self._is_state_in_Q(s): 109 | self._init_state_value(s, randomized) 110 | 111 | def _name_state(self, state): 112 | '''给个体的一个观测(状态)生成一个不重复的字符串作为Q、E字典里的键 113 | ''' 114 | return str(state) 115 | 116 | def _get_(self, QorE, s, a): 117 | self._assert_state_in_QE(s, randomized=True) 118 | return QorE[s][a] 119 | 120 | def _set_(self, QorE, s, a, value): 121 | self._assert_state_in_QE(s, randomized=True) 122 | QorE[s][a] = value 123 | 124 | def _resetEValue(self): 125 | for value_dic in self.E.values(): 126 | for action in range(self.env.action_space.n): 127 | value_dic[action] = 0.00 128 | 129 | 130 | def main(): 131 | env = WindyGridWorld() 132 | # directory = "" 133 | # env = gym.wrappers.Monitor(env, directory, force=True) 134 | agent = SarsaLambdaAgent(env) 135 | print("Learning...") 136 | agent.learning(lambda_=0.01, 137 | gamma=0.9, 138 | alpha=0.1, 139 | max_episode_num=1000) 140 | 141 | 142 | if __name__ == "__main__": 143 | main() 144 | -------------------------------------------------------------------------------- /01-blog_code/sarsa/sarsa.py: -------------------------------------------------------------------------------- 1 | from random import random # 随机策略时用到 2 | from gym import Env 3 | import gym 4 | import sys 5 | 6 | sys.path.append('../Gridworld2') 7 | from gridworld2 import * # 可以导入各种格子世界环境 8 | 9 | ''' 10 | SARSA(0)算法简单实现 11 | ''' 12 | class Agent(object): 13 | def __init__(self, env: Env): 14 | self.env = env # 个体持有环境的引用 15 | self.Q = {} # 个体维护一张行为价值表Q 16 | self._initAgent() 17 | self.state = None # 个体当前的观察,最好写成obs 18 | 19 | def performPolicy(self, s, episode_num, use_epsilon): # 执行一个策略 20 | epsilon = 1.00 / (episode_num + 1) 21 | Q_s = self.Q[s] 22 | str_act = "unknown" 23 | rand_value = random() 24 | action = None 25 | if use_epsilon and rand_value < epsilon: 26 | action = self.env.action_space.sample() 27 | else: 28 | str_act = max(Q_s, key=Q_s.get) 29 | action = int(str_act) 30 | return action 31 | 32 | def act(self, a): # 执行一个行为 33 | return self.env.step(a) 34 | 35 | def learning(self): # 学习过程 36 | pass 37 | 38 | def _get_state_name(self, state): 39 | return str(state) 40 | 41 | def _is_state_in_Q(self, s): # 判断s的Q值是否存在 42 | return self.Q.get(s) is not None 43 | 44 | def _init_state_value(self, s_name, randomized=True): # 初始化某状态的Q值 45 | if not self._is_state_in_Q(s_name): 46 | self.Q[s_name] = {} 47 | for action in range(self.env.action_space.n): # 针对其所有可能行为 48 | default_v = random() / 10 if randomized is True else 0.0 49 | self.Q[s_name][action] = default_v 50 | 51 | def _assert_state_in_Q(self, s, randomized=True): # 确保某状态Q值存在 52 | # 找不到状态s的Q值 53 | if not self._is_state_in_Q(s): 54 | self._init_state_value(s, randomized) 55 | 56 | def _initAgent(self): 57 | self.state = self.env.reset() 58 | s_name = self._get_state_name(self.state) 
59 | self._assert_state_in_Q(s_name, randomized=False) 60 | 61 | def _get_Q(self, s, a): # 获取Q(s,a) 62 | self._assert_state_in_Q(s, randomized=True) 63 | return self.Q[s][a] 64 | 65 | def _set_Q(self, s, a, value): # 设置Q(s,a) 66 | self._assert_state_in_Q(s, randomized=True) 67 | self.Q[s][a] = value 68 | 69 | def learning(self, gamma, alpha, max_episode_num): # sarsa learning 70 | # self.Position_t_name, self.reward_t1 = self.observe(env) 71 | total_time, time_in_episode, num_episode = 0, 0, 0 72 | while num_episode < max_episode_num: # 设置终止条件 73 | self.state = self.env.reset() # 环境初始化 74 | s0 = self._get_state_name(self.state) # 获取个体对于观测的命名 75 | self.env.render() # 显示UI界面 76 | a0 = self.performPolicy(s0, num_episode, use_epsilon=True) 77 | 78 | time_in_episode = 0 79 | is_done = False 80 | while not is_done: # 针对一个Episode内部 81 | s1, r1, is_done, info = self.act(a0) # 执行行为 82 | self.env.render() 83 | s1 = self._get_state_name(s1) 84 | self._assert_state_in_Q(s1, randomized=True) 85 | # 获得A’,在下行代码中添加参数use_epsilon = False即变成Q学习算法 86 | a1 = self.performPolicy(s1, num_episode, use_epsilon=True) 87 | old_q = self._get_Q(s0, a0) 88 | q_prime = self._get_Q(s1, a1) 89 | td_target = r1 + gamma * q_prime 90 | # alpha = alpha / num_episode 91 | new_q = old_q + alpha * (td_target - old_q) 92 | self._set_Q(s0, a0, new_q) 93 | 94 | if num_episode == max_episode_num: # 终端显示最后Episode的信息 95 | print("t:{0:>2}: s:{1}, a:{2:2}, s1:{3}". \ 96 | format(time_in_episode, s0, a0, s1)) 97 | 98 | s0, a0 = s1, a1 99 | time_in_episode += 1 100 | 101 | print("Episode {0} takes {1} steps.".format( 102 | num_episode, time_in_episode)) # 显示每一个Episode花费了多少步 103 | total_time += time_in_episode 104 | num_episode += 1 105 | return 106 | 107 | 108 | def main(): 109 | env = SimpleGridWorld() 110 | agent = Agent(env) 111 | print("Learning...") 112 | agent.learning(gamma=0.9, 113 | alpha=0.1, 114 | max_episode_num=800) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 强化学习的博客及配套代码 2 | 记录自己强化学习由浅入深的学习过程,目前主要参考的资料是David Silver的公开课,下面提到的代码有部分源于网络。 3 | 4 | ## [目录](#目录) 5 | - [强化学习博客与代码](#强化学习博客与代码) 6 | 7 | ## 强化学习博客与代码: 8 | |**博客** | **代码** | 9 | | --------------------------------------------------------------------------------------------- |:-------------:| 10 | | [强化学习-术语和数学符号](https://blog.csdn.net/u011254180/article/details/84031546) | 无 | 11 | | [强化学习(一)简介](https://blog.csdn.net/u011254180/article/details/83349455) | 无 | 12 | | [强化学习(二)马尔科夫决策过程](https://blog.csdn.net/u011254180/article/details/83387344) | 无 | 13 | | [强化学习(三)动态规划寻找最优策略](https://blog.csdn.net/u011254180/article/details/83573220) | 无 | 14 | | [强化学习(四)不基于模型的预测](https://blog.csdn.net/u011254180/article/details/83994391) | 无 | 15 | | [强化学习(五)不基于模型的控制](https://blog.csdn.net/u011254180/article/details/84253095) | 无 | 16 | | [强化学习实践(一)Tic-Tac-Toe游戏](https://blog.csdn.net/u011254180/article/details/86479795) | [代码](/01-blog_code/Tic-Tac-Toe/example.py) | 17 | | [强化学习实践(二)迭代法评估4\*4方格世界下的随机策略](https://blog.csdn.net/u011254180/article/details/88133551) | [代码](/01-blog_code/Gridworld/gridworld.py) | 18 | | [强化学习实践(三)理解gym的建模思想](https://blog.csdn.net/u011254180/article/details/88211536) | 无 | 19 | | [强化学习实践(四)编写通用的格子世界环境类](https://blog.csdn.net/u011254180/article/details/88220484) | [代码](/01-blog_code/Gridworld2/gridworld2.py) | 20 | | 
[强化学习实践(五)Agent类和SARSA算法实现](https://blog.csdn.net/u011254180/article/details/88430601) | [代码](/01-blog_code/sarsa/sarsa.py) | 21 | | [强化学习实践(六)SARSA(λ)算法实现](https://blog.csdn.net/u011254180/article/details/88673519) | [代码](/01-blog_code/sarsa/sarsa(lambda).py) | 22 | | [强化学习(六)价值函数的近似表示](https://blog.csdn.net/u011254180/article/details/89238765) | 无 | 23 | | [强化学习实践(七)给Agent添加记忆功能](https://blog.csdn.net/u011254180/article/details/89326920) | [代码](/01-blog_code/core/core.py) | 24 | | [强化学习(七)策略梯度](https://blog.csdn.net/u011254180/article/details/89431822) | 无 | 25 | | [强化学习(八)整合学习与规划](https://blog.csdn.net/u011254180/article/details/89556617) | 无 | 26 | | [强化学习(九)探索与利用](https://blog.csdn.net/u011254180/article/details/90063387) | 无 | 27 | | [强化学习实践(八)DQN的实现](https://blog.csdn.net/u011254180/article/details/90240163) | [代码](/01-blog_code/dqn/approxagent.py) | 28 | --------------------------------------------------------------------------------
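A minimal usage sketch (not part of the repository) for trying one of the environments listed above, assuming gym is installed and the script is launched from the repository root:

```python
# Hypothetical usage sketch: random rollout in the SimpleGridWorld environment
# defined in 01-blog_code/Gridworld2/gridworld2.py (assumes gym is installed and
# the script is run from the repository root).
import sys
sys.path.append('01-blog_code/Gridworld2')

from gridworld2 import SimpleGridWorld

env = SimpleGridWorld()          # 10 x 7 grid, start (0, 3), goal (7, 3)
state = env.reset()
for _ in range(100):
    action = env.action_space.sample()                  # random policy, API exercise only
    state, reward, done, info = env.step(action)
    if done:                                            # reached the goal cell
        state = env.reset()
print("finished random rollout, last state:", state)
```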