├── main.pdf
├── reinforcement learning.md
├── README.md
└── multi-armed bandit.py

/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Mario0716/RL-Note/HEAD/main.pdf
--------------------------------------------------------------------------------
/reinforcement learning.md:
--------------------------------------------------------------------------------
If everything has to come with a motivation, then the motivation for writing these notes in this quiet corner is simple enough.

## Motivation
For Relaxing In My Terrible Life...

## NFL Theorem (No Free Lunch)

Statement: $\mathbb{E}_{ote}(\zeta_a \mid X, f) = \sum_{h} \sum_{x \in \mathcal{X} - X} P(x)\, I(h(x) \neq f(x))\, P(h \mid X, \zeta_a)$

Expected error of a hypothesis $h(x)$ on samples outside the training set $X$:
$\sum_{x\in\mathcal{X} - X} P(x) I(h(x) \neq f(x))$

Probability that algorithm $\zeta_a$ produces hypothesis $h$ given training data $X$:
$P(h \mid X, \zeta_a)$

Summing over all possible binary target functions $f$ (assumed uniformly distributed) gives $\sum_{f} \mathbb{E}_{ote}(\zeta_a \mid X, f) = 2^{|\mathcal{X}|-1} \sum_{x\in\mathcal{X} - X} P(x)$, which does not depend on $\zeta_a$: averaged over all problems, every learning algorithm has the same off-training-set error.

## Apprenticeship learning via inverse reinforcement learning

Computing the expected return:

1. Sum all observed returns and divide by their count (sample average).
2. Incremental update: $\bar{R}_n = \bar{R}_{n-1} + \frac{1}{n}\left(R_n - \bar{R}_{n-1}\right)$, which gives the same estimate without storing every return (see the sketch below).
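A quick way to see that the two estimates agree is the minimal sketch below. It is only an illustration: `returns`, `batch_mean`, and `running_mean` are made-up names over synthetic data, not anything from this repository.

```python
import numpy as np

# Synthetic returns, just for illustration.
returns = np.random.default_rng(0).normal(loc=1.0, scale=0.5, size=1000)

# 1. Sample average: sum every return and divide by the count.
batch_mean = returns.sum() / len(returns)

# 2. Incremental update: keep only the current estimate and a counter,
#    applying R̄_n = R̄_{n-1} + (R_n - R̄_{n-1}) / n one sample at a time.
running_mean = 0.0
for n, r in enumerate(returns, start=1):
    running_mean += (r - running_mean) / n

print(batch_mean, running_mean)  # identical up to floating-point error
```

The incremental form is the one used by the reward estimates in `multi-armed bandit.py`, since it needs no history of past rewards.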
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RL-Note
RL-Note mainly contains notes on RL theory, algorithms, and code.⛽️

I learn reinforcement learning theory from [shuhuai008](https://space.bilibili.com/97068901) on Bilibili.

| Chapter | Status | Time |
| :----- | :--: | -------: |
|Markov Decision Process|✅|2025.01.24|
|Dynamic Programming|✅|2025.01.25|
|Monte Carlo Method|⌛️| |
|Gaussian Process|✅|2025.03.10|
|Gaussian Mixture Model|✅|2025.04.23|

Besides, I also learn RL from [Prof. Weinan Zhang, SJTU](https://space.bilibili.com/3546754433681656/lists/4126508?type=season) on Bilibili.

| Chapter | Status |
| :----- | :--: |
|Imitation Learning I| ✅ |
|Imitation Learning II|✅|
|Offline Reinforcement Learning|✅|
|Multi-Agent Reinforcement Learning I|✅|
|Multi-Agent Reinforcement Learning II|✅|
|Diffusion Models for Reinforcement Learning|✅|
|Multi-Agent Reinforcement Learning III|✅|

## note 📒
Notes on my own understanding of RL theory and ideas.🧠 Besides, some paper reading is necessary.📖

## code🧑‍💻
Reproduce the code and add detailed explanations of the RL algorithms.✍️
--------------------------------------------------------------------------------
/multi-armed bandit.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

class BernoulliBandit:
    """
    Bernoulli multi-armed bandit
    """
    def __init__(self, K):
        self.probs = np.random.uniform(size=K)  # K random numbers in [0, 1], the winning probability of each arm
        self.best_idx = np.argmax(self.probs)  # arm with the highest winning probability
        self.best_prob = self.probs[self.best_idx]  # the highest winning probability
        self.K = K

    def step(self, k):
        """
        After the player pulls arm k, return a reward according to that arm's winning probability:
        1 -> win
        0 -> no win
        """
        if np.random.rand() < self.probs[k]:
            return 1
        else:
            return 0


# Fix the random seed so the experiment is reproducible:
# with the same seed, every run produces the same sequence of random numbers.
np.random.seed(1)
K = 10
bandit_10_arm = BernoulliBandit(K)

print("Generated a random %d-armed Bernoulli bandit" % K)
print("The arm with the highest winning probability is arm %d, with probability %.4f" % (bandit_10_arm.best_idx, bandit_10_arm.best_prob))


"""
Strategies for the multi-armed bandit
"""
class Solver:
    """
    base framework for multi-armed bandit strategies
    """
    def __init__(self, bandit):
        self.bandit = bandit
        self.counts = np.zeros(self.bandit.K)  # number of pulls of each arm
        self.regret = 0  # cumulative regret so far
        self.actions = []  # list of chosen actions
        self.regrets = []  # list of cumulative regrets

    def update_regret(self, k):
        """
        calculate and record the cumulative regret; k is the index of the chosen arm
        """
        self.regret += self.bandit.best_prob - self.bandit.probs[k]
        self.regrets.append(self.regret)

    def run_one_step(self):
        """
        return the index of the chosen arm
        """
        raise NotImplementedError

    def run(self, num_steps):
        """
        num_steps: total number of steps to run
        """
        for _ in range(num_steps):
            k = self.run_one_step()
            self.counts[k] += 1
            self.actions.append(k)
            self.update_regret(k)


"""
epsilon-greedy algorithm
"""
class EpsilonGreedy(Solver):
    """
    subclass of Solver :)
    """
    def __init__(self, bandit, epsilon=0.01, init_prob=1.0):
        super(EpsilonGreedy, self).__init__(bandit)
        self.epsilon = epsilon
        # initialize reward estimates
        self.estimates = np.array([init_prob] * self.bandit.K)

    def run_one_step(self):
        if np.random.random() < self.epsilon:
            k = np.random.randint(0, self.bandit.K)  # explore: choose an arm at random
        else:
            k = np.argmax(self.estimates)  # exploit: choose the arm with the best reward estimate

        r = self.bandit.step(k)  # reward of this action
        # incremental mean update of the reward estimate for arm k
        self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])
        return k
# plot results
def plot_results(solvers, solver_names):
    """
    plot how the cumulative regret changes over time
    solvers: list of bandit strategies
    solver_names: list of strategy names
    """
    for idx, solver in enumerate(solvers):
        time_list = range(len(solver.regrets))
        plt.plot(time_list, solver.regrets, label=solver_names[idx])

    plt.xlabel('Time steps')
    plt.ylabel('Cumulative regrets')
    plt.title('%d-armed bandit' % solvers[0].bandit.K)
    plt.legend()
    plt.show()

np.random.seed(1)
epsilon_greedy_solver = EpsilonGreedy(bandit_10_arm, epsilon=0.01)
epsilon_greedy_solver.run(5000)
print('epsilon-greedy algorithm cumulative regret: ', epsilon_greedy_solver.regret)
plot_results([epsilon_greedy_solver], ["EpsilonGreedy"])


np.random.seed(0)
epsilons = [1e-4, 0.01, 0.1, 0.25, 0.5]
# sweep over several epsilon values
epsilon_greedy_solver_list = [
    EpsilonGreedy(bandit_10_arm, epsilon=e) for e in epsilons
]
epsilon_greedy_solver_names = ["epsilon={}".format(e) for e in epsilons]
for solver in epsilon_greedy_solver_list:
    solver.run(5000)

plot_results(epsilon_greedy_solver_list, epsilon_greedy_solver_names)

# epsilon-decaying greedy algorithm
class DecayingEpsilonGreedy(Solver):
    """
    epsilon decays over time
    """
    def __init__(self, bandit, init_prob=1.0):
        super(DecayingEpsilonGreedy, self).__init__(bandit)
        self.estimates = np.array([init_prob] * self.bandit.K)
        self.total_count = 0

    def run_one_step(self):
        self.total_count += 1
        if np.random.random() < 1 / self.total_count:
            k = np.random.randint(0, self.bandit.K)
        else:
            k = np.argmax(self.estimates)

        r = self.bandit.step(k)
        self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])

        return k

np.random.seed(1)
decaying_epsilon_greedy_solver = DecayingEpsilonGreedy(bandit_10_arm)
decaying_epsilon_greedy_solver.run(5000)
print('epsilon-decaying greedy algorithm cumulative regret: ', decaying_epsilon_greedy_solver.regret)
plot_results([decaying_epsilon_greedy_solver], ['DecayingEpsilonGreedy'])


"""
upper confidence bound
"""
class UCB(Solver):
    def __init__(self, bandit, coef, init_prob=1.0):
        super(UCB, self).__init__(bandit)
        self.total_count = 0
        self.estimates = np.array([init_prob] * self.bandit.K)
        self.coef = coef

    def run_one_step(self):
        self.total_count += 1
        ucb = self.estimates + self.coef * np.sqrt(
            np.log(self.total_count) / (2 * (self.counts + 1)))  # upper confidence bound of each arm
        k = np.argmax(ucb)
        r = self.bandit.step(k)
        # incremental mean update of the reward estimate for arm k
        self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])
        return k
np.random.seed(1)
coef = 1
UCB_solver = UCB(bandit_10_arm, coef)
UCB_solver.run(5000)
print('upper confidence bound cumulative regret: ', UCB_solver.regret)
plot_results([UCB_solver], ['UCB'])

class ThompsonSampling(Solver):
    def __init__(self, bandit):
        super(ThompsonSampling, self).__init__(bandit)
        self._a = np.ones(self.bandit.K)  # per-arm count of pulls with reward = 1
        self._b = np.ones(self.bandit.K)  # per-arm count of pulls with reward = 0

    def run_one_step(self):
        samples = np.random.beta(self._a, self._b)  # sample from each arm's Beta distribution
        k = np.argmax(samples)  # pick the arm with the largest sampled value
        r = self.bandit.step(k)

        self._a[k] += r  # update the first parameter of the Beta distribution
        self._b[k] += (1 - r)  # update the second parameter of the Beta distribution
        return k

np.random.seed(1)
thompson_sampling_solver = ThompsonSampling(bandit_10_arm)
thompson_sampling_solver.run(5000)
print('thompson sampling algorithm cumulative regret:', thompson_sampling_solver.regret)
plot_results([thompson_sampling_solver], ['ThompsonSampling'])
--------------------------------------------------------------------------------