├── README.md
└── Q-Learning
    ├── __pycache__
    │   ├── Env.cpython-37.pyc
    │   └── QL.cpython-37.pyc
    ├── Test.py
    ├── QL.py
    └── Env.py


/README.md:
--------------------------------------------------------------------------------
# Reinforcement-Learning
Let’s study Reinforcement Learning together
--------------------------------------------------------------------------------


/Q-Learning/__pycache__/Env.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bubbliiiing/Reinforcement-Learning/HEAD/Q-Learning/__pycache__/Env.cpython-37.pyc
--------------------------------------------------------------------------------


/Q-Learning/__pycache__/QL.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bubbliiiing/Reinforcement-Learning/HEAD/Q-Learning/__pycache__/QL.cpython-37.pyc
--------------------------------------------------------------------------------


/Q-Learning/Test.py:
--------------------------------------------------------------------------------
from Env import Env
from QL import QL
import numpy as np
import time

LONG = 6          # total length of the map is 6
MAZE_PLACE = 6    # the treasure sits at position 6
TIMES = 15        # run 15 episodes

people = QL(['left', 'right'])    # create the Q-Learning agent with the actions left and right
site = Env(LONG, MAZE_PLACE)      # create the test environment
for episode in range(TIMES):
    state = site.get_observation()    # observe the initial state
    site.draw()                       # draw the map
    time.sleep(0.3)                   # pause
    while(1):
        done = site.get_terminal()    # check whether the terminal state has been reached
        if done:                      # if it has, report the episode and reset the environment
            interaction = '\nEpisode %s, total steps used: %s.' % (episode + 1, site.count)
            print(interaction)
            site.retry()
            time.sleep(2)
            break
        action = people.choose_action(state)                        # choose the next action
        state_after, score, pre_done = site.get_target(action)      # what the environment would actually look like after that action
        people.learn(state, action, score, state_after, pre_done)   # update the Q-table for the current state from the predicted score and the actual outcome of the next state
        site.update_place(action)     # update the position
        state = state_after           # move on to the next state
        site.draw()                   # redraw the map
        time.sleep(0.3)


print(people.q_table)
--------------------------------------------------------------------------------


/Q-Learning/QL.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

class QL:
    def __init__(self, actions, learning_rate=0.05, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions      # available actions, passed in as a list
        self.lr = learning_rate     # learning rate used when updating the Q-table
        self.gamma = reward_decay   # discount factor: how much the next state's value feeds back into the current one
        self.epsilon = e_greedy     # act greedily with probability e_greedy, explore randomly with probability 1 - e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)   # build the Q-table, one column per action

    def choose_action(self, observation):
        self.check_observation(observation)              # if this state has never been visited, add it to the Q-table
        action_list = self.q_table.loc[observation, :]   # Q-values of every action in the current state

        if np.random.uniform() < self.epsilon:           # with probability epsilon act greedily
            action = np.random.choice(action_list[action_list == np.max(action_list)].index)   # pick one of the highest-valued actions in this state
        else:
            action = np.random.choice(self.actions)      # otherwise explore with a random action
        return action                                    # the action to take

    def learn(self, observation_now, action, score, observation_after, done):
        self.check_observation(observation_after)               # make sure the next state already has a row in the Q-table
        q_predict = self.q_table.loc[observation_now, action]   # predicted value of the action just taken in the current state
        if done:
            q_target = score   # if the episode is over, the target is just the actual reward (1 in this example)
        else:
            q_target = score + self.gamma * self.q_table.loc[observation_after, :].max()   # otherwise bootstrap: reward plus the discounted best value of the next state
        # update the Q-table entry for the current state and action from the prediction and the observed outcome
        self.q_table.loc[observation_now, action] += self.lr * (q_target - q_predict)

    def check_observation(self, observation):
        if observation not in self.q_table.index:   # if this state is not in the Q-table yet
            self.q_table = self.q_table.append(     # append a new all-zero row for it via a Series
                pd.Series(
                    [0] * len(self.actions),
                    index=self.actions,
                    name=observation,)
            )
--------------------------------------------------------------------------------


/Q-Learning/Env.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import time

class Env:
    def __init__(self, column, maze_column):
        self.column = column                 # length of the map
        self.maze_column = maze_column - 1   # index of the cell holding the treasure
        self.x = 0                           # initialise the current position
        self.map = np.arange(column)         # give every cell a label
        self.count = 0                       # counts how many steps have been taken


    def draw(self):
        a = []
        for j in range(self.column):         # redraw the picture
            if j == self.x:
                a.append('o')
            elif j == self.maze_column:
                a.append('m')
            else:
                a.append('_')
        interaction = ''.join(a)
        print('\r{}'.format(interaction), end='')


    def get_observation(self):
        return self.map[self.x]              # return the current position


    def get_terminal(self):
        if self.x == self.maze_column:       # if the treasure has been reached, the episode is done
            done = True
        else:
            done = False
        return done


    def update_place(self, action):
        self.count += 1                      # every update counts as one step taken
        if action == 'right':
            if self.x < self.column - 1:
                self.x += 1
        elif action == 'left':
            if self.x > 0:
                self.x -= 1

    def get_target(self, action):
        if action == 'right':                # what the environment would actually look like after this action
            if self.x + 1 == self.maze_column:
                score = 1
                pre_done = True
            else:
                score = 0
                pre_done = False
            return self.map[self.x + 1], score, pre_done
        elif action == 'left':
            if self.x - 1 == self.maze_column:
                score = 1
                pre_done = True
            else:
                score = 0
                pre_done = False
            return self.map[max(self.x - 1, 0)], score, pre_done   # clamp at the left edge so the reported state matches update_place



    def retry(self):                         # reset the environment
        self.x = 0
        self.count = 0
--------------------------------------------------------------------------------
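

As a quick reference for the update these files implement: QL.learn() applies the standard tabular Q-learning rule, Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a)), with the bootstrap term dropped on the terminal step. Below is a minimal standalone sketch of that rule using plain dicts instead of a pandas DataFrame; the helper name q_update and the single illustrative step are assumptions for illustration only and are not part of the repository.

# Minimal sketch of the tabular Q-learning update performed by QL.learn(),
# using plain dicts; the constants mirror the defaults in QL.__init__.
alpha, gamma = 0.05, 0.9
q_table = {s: {'left': 0.0, 'right': 0.0} for s in range(6)}

def q_update(s, a, reward, s_next, done):
    q_predict = q_table[s][a]
    # terminal step: the target is just the reward;
    # otherwise bootstrap from the best action value of the next state
    q_target = reward if done else reward + gamma * max(q_table[s_next].values())
    q_table[s][a] += alpha * (q_target - q_predict)

q_update(4, 'right', 1, 5, True)   # stepping right from cell 4 reaches the treasure
print(q_table[4]['right'])         # 0.05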