├── .gitignore ├── LICENSE.md ├── README.md ├── ch01 ├── avg.py ├── bandit.py ├── bandit_avg.py └── non_stationary.py ├── ch04 ├── dp.py ├── dp_inplace.py ├── gridworld_play.py ├── policy_eval.py ├── policy_iter.py └── value_iter.py ├── ch05 ├── dice.py ├── importance_sampling.py ├── mc_control.py ├── mc_control_offpolicy.py └── mc_eval.py ├── ch06 ├── q_learning.py ├── q_learning_simple.py ├── sarsa.py ├── sarsa_off_policy.py └── td_eval.py ├── ch07 ├── dezero1.py ├── dezero2.py ├── dezero3.py ├── dezero4.py └── q_learning_nn.py ├── ch08 ├── dqn.py ├── gym_play.py └── replay_buffer.py ├── ch09 ├── actor_critic.py ├── reinforce.py └── simple_pg.py ├── common ├── gridworld.py ├── gridworld_render.py └── utils.py ├── cover.jpeg ├── equations_and_figures_4.zip ├── notebooks ├── 01_bandit.ipynb ├── 04_dynamic_programming.ipynb ├── 05_montecarlo.ipynb ├── 06_temporal_difference.ipynb ├── 07_neural_networks.ipynb ├── 08_dqn.ipynb └── 09_policy_gradient.ipynb ├── pytorch ├── actor_critic.py ├── dqn.py ├── reinforce.py └── simple_pg.py └── series overview.png /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | 4 | build/ 5 | dist/ 6 | dezero.egg-info/ 7 | tmp/ 8 | 9 | *.dot 10 | *.json 11 | src/.idea/* 12 | .idea/* 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | pip-wheel-metadata/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .nox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 
104 | #Pipfile.lock 105 | 106 | # celery beat schedule file 107 | celerybeat-schedule 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Koki Saitoh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 『밑바닥부터 시작하는 딥러닝 ❹』
: 이번엔 강화 학습이다! 2 | 3 | 4 | 5 | **강화 학습 핵심 이론부터 문제 풀이, 심층 강화 학습까지 한 권에!** 6 | 7 | 이 책의 특징은 제목 그대로 ‘밑바닥부터 만들어가는 것’입니다. 속을 알 수 없는 외부 라이브러리에 의존하지 않고 강화 학습 알고리즘을 처음부터 구현하면서 배웁니다. 그림으로 원리를 이해하고 수학으로 강화 학습 문제를 풀어본 다음, 코드로 구현해 배운 내용을 되짚어봅니다. 코드는 최대한 간결하면서도 강화 학습에서 중요한 아이디어가 명확하게 드러나도록 짰습니다. 단계적으로 수준을 높이면서 다양한 문제에 접할 수 있도록 구성하였으니 강화 학습의 어려움과 재미를 모두 느낄 수 있을 것입니다. 8 | 9 | 10 | [미리보기](https://preview2.hanbit.co.kr/books/yyxd/#p=1) | [알려진 오류(정오표)](https://docs.google.com/document/d/1fsPVXyPF0gpmN57VV6k0uxMfWXUbiQCwno8vCTYpMc8/edit) | [본문 그림과 수식 이미지 모음](https://github.com/WegraLee/deep-learning-from-scratch-4/blob/master/equations_and_figures_4.zip?raw=true) 11 | 12 | --- 13 | 14 | ## 파일 구성 15 | 16 | |폴더 이름 |설명                         | 17 | |:-- |:-- | 18 | |ch01       |1장에서 사용하는 소스 코드 | 19 | |... |... | 20 | |ch09 |9장에서 사용하는 소스 코드 | 21 | |common     |공통으로 사용하는 소스 코드  | 22 | |notebooks  |주피터 노트북 형태의 소스 코드 | 23 | |pytorch    |파이토치용으로 포팅된 소스 코드  | 24 | 25 | ## 주피터 노트북 26 | 이 책의 코드는 주피터 노트북으로도 제공됩니다. 다음 표의 링크를 클릭하면 구글과 캐글 같은 클라우드 서비스에서 노트북을 실행할 수 있습니다. 
27 | 28 | | 장 | Colab | 캐글 | Studio Lab | 29 | | :--- | :--- | :--- | :--- | 30 | | 1장 밴디트 문제| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/01_bandit.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/01_bandit.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/01_bandit.ipynb) | 31 | | 4장 동적 프로그래밍 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/04_dynamic_programming.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/04_dynamic_programming.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/04_dynamic_programming.ipynb) | 32 | | 5장 몬테카를로법 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/05_montecarlo.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/05_montecarlo.ipynb) | [![Open In SageMaker Studio 
Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/05_montecarlo.ipynb) | 33 | | 6장 TD법 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/06_temporal_difference.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/06_temporal_difference.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/06_temporal_difference.ipynb) | 34 | | 7장 신경망과 Q 러닝 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/07_neural_networks.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/07_neural_networks.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/07_neural_networks.ipynb) | 35 | | 8장 DQN | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/08_dqn.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/08_dqn.ipynb) | [![Open In SageMaker Studio 
Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/08_dqn.ipynb) | 36 | | 9장 정책 경사법 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/09_policy_gradient.ipynb) | [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/09_policy_gradient.ipynb) | [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/oreilly-japan/deep-learning-from-scratch-4/blob/master/notebooks/09_policy_gradient.ipynb) | 37 | 38 | 39 | ## 요구사항 40 | 소스 코드를 실행하려면 아래의 소프트웨어가 설치되어 있어야 합니다. 41 | 42 | * 파이썬 3.x 43 | * NumPy 44 | * Matplotlib 45 | * OpenAI Gym 46 | * DeZero (혹은 파이토치) 47 | 48 | 이 책은 딥러닝 프레임워크로 DeZero를 사용합니다. DeZero는 시리즈 3편에서 만든 프레임워크입니다('pip install dezero' 명령으로 설치할 수 있습니다). 49 | 50 | 파이토치를 사용한 구현은 [pytorch 디렉터리](https://github.com/WegraLee/deep-learning-from-scratch-4/tree/master/pytorch)에서 제공합니다. 51 | 52 | ## 실행 방법 53 | 54 | 예제 코드들은 장별로 나눠 저장되어 있습니다. 실행하려면 다음과 같이 파이썬 명령을 실행하세요. 55 | 56 | ``` 57 | $ python ch01/avg.py 58 | $ python ch08/dqn.py 59 | 60 | $ cd ch09 61 | $ python actor_critic.py 62 | ``` 63 | 64 | 보다시피 각 디렉터리로 이동 후 실행해도 되고, 상위 디렉터리에서 ch0x 디렉터리를 지정해 실행해도 됩니다. 65 | 66 | --- 67 | 68 | ## 팬픽 - 바닷속 딥러닝 어드벤처 (5부작) 69 | 70 | 71 | 72 | "<밑바닥부터 시작하는 딥러닝>의 주인공 생선들은 딥러닝 기술로 바닷속 생태계를 어떻게 혁신하고 있을까요? 어공지능의 첨단을 이끌어가는 밑시딥 생선들과 신나는 모험을 떠나보세요." 73 | 74 | 바닷속 세계를 배경으로, 해양 생물들이 자신의 특성과 필요에 맞는 딥러닝 기술을 개발하여 문제를 해결해 나가는 모험을 그린 연작 소설입니다. 시리즈를 읽으신 분은 더 많은 재미를 느끼실 수 있도록 딥러닝 요소들을 곳곳에 삽입하였습니다. 75 | 76 | 각 편의 주인공과 주제는 다음과 같습니다. 77 | 78 | 1. **시야를 찾아서**: 쏨뱅이(쏨)가 **이미지 처리 기술**을 개발하여 주변 환경을 선명하게 파악 79 | 1. 
class Bandit:
    """Multi-armed bandit whose arms each pay out 1 with a fixed hidden rate.

    Args:
        arms: Number of slot machines (arms) to simulate.
    """

    def __init__(self, arms=10):
        # Win rate of every machine, drawn uniformly at random from [0, 1).
        # (The original line was missing the closing parenthesis, which made
        # the whole module fail to import.)
        self.rates = np.random.rand(arms)

    def play(self, arm):
        """Pull machine `arm` once.

        Returns:
            1 if a fresh uniform random draw falls below the machine's win
            rate, otherwise 0.
        """
        rate = self.rates[arm]
        if rate > np.random.rand():
            return 1
        else:
            return 0
if __name__ == '__main__':
    steps = 1000
    epsilon = 0.1

    bandit = Bandit()
    agent = Agent(epsilon)

    total_reward = 0
    total_rewards = []  # running sum of rewards after each step
    rates = []          # running win rate after each step

    # `played` counts the pulls made so far (1-based), which is exactly the
    # denominator needed for the running win rate.
    for played in range(1, steps + 1):
        action = agent.get_action()    # choose an arm
        reward = bandit.play(action)   # pull it and observe the reward
        agent.update(action, reward)   # learn from (action, reward)

        total_reward += reward
        total_rewards.append(total_reward)
        rates.append(total_reward / played)

    print(total_reward)

    # [Fig. 1-12] cumulative reward per step, then [Fig. 1-13] win rate per step.
    for ylabel, series in (('Total reward', total_rewards), ('Rates', rates)):
        plt.ylabel(ylabel)
        plt.xlabel('Steps')
        plt.plot(series)
        plt.show()
class AlphaAgent:
    """Epsilon-greedy agent that updates its estimates with a fixed step size.

    Args:
        epsilon: Probability of picking a uniformly random action.
        alpha: Constant step size used by `update`.
        actions: Number of available actions.
    """

    def __init__(self, epsilon, alpha, actions=10):
        self.epsilon = epsilon
        self.alpha = alpha  # constant step size
        self.Qs = np.zeros(actions)

    def update(self, action, reward):
        """Move the estimate for `action` toward `reward` by a factor alpha."""
        error = reward - self.Qs[action]
        self.Qs[action] += error * self.alpha

    def get_action(self):
        """Return a random action with probability epsilon, else the greedy one."""
        explore = np.random.rand() < self.epsilon
        if not explore:
            return np.argmax(self.Qs)
        return np.random.randint(0, len(self.Qs))
# Iterative policy evaluation for the two-state problem (states L1 and L2),
# keeping the previous and the new value tables separate.
V = {'L1': 0.0, 'L2': 0.0}
new_V = dict(V)

cnt = 0  # number of sweeps performed
while True:
    v1, v2 = V['L1'], V['L2']
    new_V['L1'] = 0.5 * (-1 + 0.9 * v1) + 0.5 * (1 + 0.9 * v2)
    new_V['L2'] = 0.5 * (0 + 0.9 * v1) + 0.5 * (-1 + 0.9 * v2)

    # Largest change of any state value in this sweep.
    delta = max(abs(new_V[name] - V[name]) for name in V)
    V = new_V.copy()

    cnt += 1
    if delta < 0.0001:  # convergence threshold
        print(V)
        print('갱신 횟수:', cnt)
        break
def eval_onestep(pi, V, env, gamma=0.9):
    """Run one in-place sweep of iterative policy evaluation.

    Args:
        pi: Mapping state -> {action: probability}.
        V: Mapping state -> value; updated in place.
        env: Object providing `states()`, `goal_state`, `next_state(s, a)`
            and `reward(s, a, s_next)`.
        gamma: Discount factor.

    Returns:
        The same `V` object after the sweep.
    """
    for s in env.states():
        if s == env.goal_state:
            # The value of the goal state is always 0.
            V[s] = 0
        else:
            expected = 0
            for a, prob in pi[s].items():
                s_next = env.next_state(s, a)
                reward = env.reward(s, a, s_next)
                expected += prob * (reward + gamma * V[s_next])
            V[s] = expected
    return V
def greedy_policy(V, env, gamma):
    """Build the deterministic policy that is greedy with respect to `V`.

    Args:
        V: Mapping state -> value.
        env: Object providing `states()`, `actions()`, `next_state(s, a)`
            and `reward(s, a, s_next)`.
        gamma: Discount factor.

    Returns:
        dict mapping each state to {action: probability}, with probability
        1.0 on the action chosen by `argmax` and 0 on the others.
    """
    def q_value(state, action):
        # One-step lookahead value of taking `action` in `state`.
        next_state = env.next_state(state, action)
        return env.reward(state, action, next_state) + gamma * V[next_state]

    policy = {}
    for state in env.states():
        action_values = {action: q_value(state, action)
                         for action in env.actions()}
        best = argmax(action_values)

        probs = dict.fromkeys(range(4), 0)
        probs[best] = 1.0
        policy[state] = probs
    return policy
def sample(dices=2):
    """Roll `dices` six-sided dice and return the sum of the faces shown."""
    faces = [1, 2, 3, 4, 5, 6]
    # One independent draw per die, consumed in order by sum().
    rolls = (np.random.choice(faces) for _ in range(dices))
    return sum(rolls)
def greedy_probs(Q, state, epsilon=0, action_size=4):
    """Return epsilon-greedy action probabilities for `state`.

    Args:
        Q: Mapping (state, action) -> estimated action value.
        state: State whose action distribution is requested.
        epsilon: Total probability mass spread uniformly over all actions.
        action_size: Number of actions, indexed 0 .. action_size - 1.

    Returns:
        dict {action: probability}; every action gets epsilon / action_size
        and the highest-valued action (first one on ties) additionally gets
        1 - epsilon.
    """
    actions = range(action_size)
    best = np.argmax([Q[(state, a)] for a in actions])

    action_probs = dict.fromkeys(actions, epsilon / action_size)
    action_probs[best] += 1 - epsilon
    return action_probs
# Train the Monte Carlo control agent on the grid world.
env = GridWorld()
agent = McAgent()

episodes = 10000
for _ in range(episodes):
    state = env.reset()
    agent.reset()

    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        agent.add(state, action, reward)
        state = next_state

    # The episode has ended: learn from the recorded trajectory.
    agent.update()

# [Fig. 5-17] and [Fig. 5-18]
env.render_q(agent.Q)
class RandomAgent:
    """Agent that acts uniformly at random and evaluates that policy with
    every-visit Monte Carlo estimation of the state-value function."""

    def __init__(self):
        self.gamma = 0.9
        self.action_size = 4

        uniform = dict.fromkeys(range(4), 0.25)
        self.pi = defaultdict(lambda: uniform)   # state -> action probabilities
        self.V = defaultdict(lambda: 0)          # state -> value estimate
        self.cnts = defaultdict(lambda: 0)       # state -> number of visits
        self.memory = []                         # (state, action, reward) of the episode

    def get_action(self, state):
        """Sample an action according to the policy for `state`."""
        action_probs = self.pi[state]
        return np.random.choice(list(action_probs),
                                p=list(action_probs.values()))

    def add(self, state, action, reward):
        """Record one step of the current episode."""
        self.memory.append((state, action, reward))

    def reset(self):
        """Forget the steps recorded so far."""
        self.memory.clear()

    def eval(self):
        """Update the value estimates from the recorded episode."""
        ret = 0
        # Walk the episode in reverse so the return accumulates backwards.
        for state, _action, reward in reversed(self.memory):
            ret = self.gamma * ret + reward
            self.cnts[state] += 1
            visits = self.cnts[state]
            # Incremental sample average of the observed returns.
            self.V[state] += (ret - self.V[state]) / visits
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    ``Q`` maps ``(state, action)`` to a value and yields 0 for unseen pairs.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.8
        self.epsilon = 0.1
        self.action_size = 4
        self.Q = defaultdict(int)

    def get_action(self, state):
        """Return a random action with probability epsilon, else a greedy one."""
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(self.action_size)
        action_values = [self.Q[state, a] for a in range(self.action_size)]
        return np.argmax(action_values)

    def update(self, state, action, reward, next_state, done):
        """Move ``Q[state, action]`` toward the one-step Q-learning target.

        When ``done`` is truthy the bootstrap term is 0 and the next state is
        not looked up at all.
        """
        best_next = 0
        if not done:
            best_next = max(
                self.Q[next_state, a] for a in range(self.action_size)
            )

        td_target = reward + self.gamma * best_next
        td_error = td_target - self.Q[state, action]
        self.Q[state, action] += td_error * self.alpha
class SarsaOffPolicyAgent:
    """Off-policy SARSA agent with separate target (pi) and behaviour (b) policies.

    Actions are drawn from ``b``; the update weights the TD target by the
    ratio pi/b of the next action. A two-slot deque holds the current and the
    following step so the update can see the next action.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.8
        self.epsilon = 0.1
        self.action_size = 4

        # Both policies start from the same shared uniform table.
        uniform = {0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25}
        self.pi = defaultdict(lambda: uniform)
        self.b = defaultdict(lambda: uniform)
        self.Q = defaultdict(int)
        self.memory = deque(maxlen=2)

    def get_action(self, state):
        """Sample an action from the behaviour policy for ``state``."""
        distribution = self.b[state]
        return np.random.choice(list(distribution.keys()),
                                p=list(distribution.values()))

    def reset(self):
        """Drop the buffered steps at the start of an episode."""
        self.memory.clear()

    def update(self, state, action, reward, done):
        """Buffer a step and, once two steps are held, update the older one."""
        self.memory.append((state, action, reward, done))
        if len(self.memory) < 2:
            return

        # From here on the names refer to the OLDER buffered step.
        current, following = self.memory
        state, action, reward, done = current
        next_state, next_action, _, _ = following

        next_q, rho = 0, 1
        if not done:
            next_q = self.Q[next_state, next_action]
            # Importance weight of the next action under pi relative to b.
            rho = self.pi[next_state][next_action] / self.b[next_state][next_action]

        # TD target corrected by rho.
        target = rho * (reward + self.gamma * next_q)
        self.Q[state, action] += (target - self.Q[state, action]) * self.alpha

        # Improve both policies for the state that was just updated.
        self.pi[state] = greedy_probs(self.Q, state, 0)
        self.b[state] = greedy_probs(self.Q, state, self.epsilon)
def rosenbrock(x0, x1, a=1, b=100):
    """Return the Rosenbrock function value at ``(x0, x1)``.

    Computes ``b * (x1 - x0**2)**2 + (x0 - a)**2``. With the default
    ``a=1, b=100`` this is exactly the expression the function used before
    the parameters were added, so existing two-argument calls are unchanged.

    Args:
        x0: First coordinate; anything supporting ``-``, ``*`` and ``**``
            (the surrounding script passes dezero ``Variable`` objects).
        x1: Second coordinate, same kind of object as ``x0``.
        a: Offset subtracted from ``x0`` in the second term.
        b: Weight of the first term.

    Returns:
        The function value, of whatever type the arithmetic on the inputs
        produces.
    """
    return b * (x1 - x0 ** 2) ** 2 + (x0 - a) ** 2
# Mean squared error (Eq. 7.2)
def mean_squared_error(x0, x1):
    """Return the summed squared difference of ``x0`` and ``x1`` over ``len(diff)``.

    Note that the divisor is the length of the first axis of the difference,
    not its total element count; the two coincide for the (N, 1) data used in
    this script.
    """
    residual = x0 - x1
    squared_total = F.sum(residual ** 2)
    return squared_total / len(residual)
def one_hot(state, height=3, width=4):
    """Encode a grid coordinate as a one-hot row vector.

    Args:
        state: ``(y, x)`` pair, row index first.
        height: Number of grid rows (default 3, the size used previously).
        width: Number of grid columns (default 4, the size used previously).

    Returns:
        A ``float32`` array of shape ``(1, height * width)`` whose only
        non-zero entry is at index ``width * y + x``.

    Raises:
        IndexError: If ``state`` lies outside the grid. Previously an ``x``
            beyond the last column or a negative coordinate silently marked
            a different cell instead of failing.
    """
    y, x = state
    if not (0 <= y < height and 0 <= x < width):
        raise IndexError(
            "state {} is outside the {}x{} grid".format(state, height, width)
        )

    # Build the batch axis directly instead of adding it afterwards.
    vec = np.zeros((1, height * width), dtype=np.float32)
    vec[0, width * y + x] = 1.0
    return vec
class QLearningAgent:
    """Q-learning agent whose Q function is the ``QNet`` neural network.

    Actions are chosen epsilon-greedily from the network output, and every
    call to ``update`` performs one SGD step on a single transition.
    """

    def __init__(self):
        self.gamma = 0.9
        self.lr = 0.01
        self.epsilon = 0.1
        self.action_size = 4

        self.qnet = QNet()  # initialise the neural network
        self.optimizer = optimizers.SGD(self.lr)  # create the optimizer
        self.optimizer.setup(self.qnet)  # register the network with the optimizer

    def get_action(self, state_vec):
        """Return a random action with probability epsilon, else the argmax of the network output."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            qs = self.qnet(state_vec)
            return qs.data.argmax()

    def update(self, state, action, reward, next_state, done):
        """Run one training step on a transition and return the loss data.

        NOTE(review): ``state`` and ``next_state`` are presumably the (1, 12)
        vectors produced by ``one_hot`` -- confirm against the caller.
        """
        # Compute next_q, the largest Q value in the next state
        if done:  # reached the goal state
            next_q = np.zeros(1)  # [0.] (the Q function at the goal state is always 0)
        else:  # any other state
            next_qs = self.qnet(next_state)
            next_q = next_qs.max(axis=1)
            next_q.unchain()  # exclude next_q from backpropagation

        # Target
        target = self.gamma * next_q + reward
        # Q value of the taken action in the current state (q)
        qs = self.qnet(state)
        q = qs[:, action]
        # Error between the target and q
        loss = F.mean_squared_error(target, q)

        # Backpropagate, then update the parameters
        self.qnet.cleargrads()
        loss.backward()
        self.optimizer.update()

        return loss.data
class DQNAgent:  # agent class
    """DQN agent combining an experience replay buffer with a target network.

    ``qnet`` is the network being trained; ``qnet_target`` supplies the
    bootstrap values and only changes when ``sync_qnet`` is called.
    """

    def __init__(self):
        self.gamma = 0.98
        self.lr = 0.0005
        self.epsilon = 0.1
        self.buffer_size = 10000  # capacity of the experience replay buffer
        self.batch_size = 32  # mini-batch size
        self.action_size = 2

        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.qnet = QNet(self.action_size)  # online network
        self.qnet_target = QNet(self.action_size)  # target network
        self.optimizer = optimizers.Adam(self.lr)
        self.optimizer.setup(self.qnet)  # register qnet with the optimizer

    def get_action(self, state):
        """Return a random action with probability epsilon, else the argmax of ``qnet``'s output."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        else:
            state = state[np.newaxis, :]  # add a leading batch axis
            qs = self.qnet(state)
            return qs.data.argmax()

    def update(self, state, action, reward, next_state, done):
        """Store a transition and, once a batch is available, take one training step."""
        # Add the transition to the experience replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) < self.batch_size:
            return  # not enough data for a mini-batch yet, stop here

        # Enough data has accumulated: draw a mini-batch
        # (the argument names are rebound to the batched arrays from here on)
        state, action, reward, next_state, done = self.replay_buffer.get_batch()
        qs = self.qnet(state)
        q = qs[np.arange(self.batch_size), action]  # Q value of the action taken in each row

        next_qs = self.qnet_target(next_state)
        next_q = next_qs.max(axis=1)
        next_q.unchain()  # keep the target out of backpropagation
        target = reward + (1 - done) * self.gamma * next_q  # no bootstrap where done == 1

        loss = F.mean_squared_error(q, target)

        self.qnet.cleargrads()
        loss.backward()
        self.optimizer.update()

    def sync_qnet(self):  # synchronise the two networks
        """Replace the target network with a deep copy of the online network."""
        self.qnet_target = copy.deepcopy(self.qnet)
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random batch sampling.

    Once ``buffer_size`` transitions are held, appending another discards the
    oldest one.
    """

    def __init__(self, buffer_size, batch_size):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        return len(self.buffer)

    def get_batch(self):
        """Sample ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of arrays;
            the states are stacked along a new first axis and ``done`` is
            converted to ``int32``.
        """
        batch = random.sample(self.buffer, self.batch_size)

        # Transpose the list of transitions into one sequence per field.
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.stack(states),
            np.array(actions),
            np.array(rewards),
            np.stack(next_states),
            np.array(dones).astype(np.int32),
        )
class Agent:
    """Actor-critic agent with a policy network (actor) and a value network (critic).

    Each network has its own Adam optimizer, and ``update`` trains both from
    a single transition.
    """

    def __init__(self):
        self.gamma = 0.98
        self.lr_pi = 0.0002
        self.lr_v = 0.0005
        self.action_size = 2

        self.pi = PolicyNet()
        self.v = ValueNet()
        self.optimizer_pi = optimizers.Adam(self.lr_pi).setup(self.pi)
        self.optimizer_v = optimizers.Adam(self.lr_v).setup(self.v)

    def get_action(self, state):
        """Sample an action from the policy output and return it with its probability."""
        state = state[np.newaxis, :]  # add a leading batch axis
        probs = self.pi(state)
        probs = probs[0]
        action = np.random.choice(len(probs), p=probs.data)
        return action, probs[action]  # the chosen action and the probability of that action

    def update(self, state, action_prob, reward, next_state, done):
        """Train the value and policy networks on one transition.

        ``action_prob`` is the second value returned by ``get_action``; the
        policy loss is backpropagated through it.
        """
        # Add a leading batch axis
        state = state[np.newaxis, :]
        next_state = next_state[np.newaxis, :]

        # Loss of the value function (self.v)
        target = reward + self.gamma * self.v(next_state) * (1 - done)  # TD target
        target.unchain()
        v = self.v(state)  # value of the current state
        loss_v = F.mean_squared_error(v, target)  # mean squared error of the two

        # Loss of the policy (self.pi)
        delta = target - v
        delta.unchain()  # delta only scales the policy loss; no gradient flows through it
        loss_pi = -F.log(action_prob) * delta

        # Train both networks
        self.v.cleargrads()
        self.pi.cleargrads()
        loss_v.backward()
        loss_pi.backward()
        self.optimizer_v.update()
        self.optimizer_pi.update()
class Agent:
    """REINFORCE agent: a policy network trained once per episode.

    ``add`` records the reward and the probability of the chosen action for
    every step; ``update`` turns the recorded episode into a loss weighted by
    the discounted return from each step onward and takes one Adam step.
    """

    def __init__(self):
        self.gamma = 0.98
        self.lr = 0.0002
        self.action_size = 2

        self.memory = []  # (reward, prob) pairs recorded since the last update
        self.pi = Policy(self.action_size)
        self.optimizer = optimizers.Adam(self.lr)
        self.optimizer.setup(self.pi)

    def get_action(self, state):
        """Sample an action from the policy output and return it with its probability."""
        state = state[np.newaxis, :]  # add a leading batch axis
        probs = self.pi(state)
        probs = probs[0]
        action = np.random.choice(len(probs), p=probs.data)
        return action, probs[action]

    def add(self, reward, prob):
        """Record the reward and chosen-action probability of one step."""
        self.memory.append((reward, prob))

    def update(self):
        """Train the policy on the recorded episode, then clear the record.

        Does nothing when no step has been recorded: in that case ``loss``
        would remain the plain integer 0, which has no ``backward`` method.
        """
        if not self.memory:
            return

        self.pi.cleargrads()

        G, loss = 0, 0
        # Walk backwards so G is the discounted return from each step onward.
        for reward, prob in reversed(self.memory):
            G = reward + self.gamma * G
            loss += -F.log(prob) * G

        loss.backward()
        self.optimizer.update()
        self.memory = []
class GridWorld:
    """3x4 grid environment with one wall cell, a goal and a penalty cell.

    States are ``(row, column)`` tuples. Moving off the grid or into the wall
    leaves the agent where it is; the reward of a step is the reward-map
    entry of the cell the agent ends up in.
    """

    def __init__(self):
        self.action_space = [0, 1, 2, 3]  # the available actions
        self.action_meaning = {0: "UP", 1: "DOWN", 2: "LEFT", 3: "RIGHT"}

        # Reward of each cell; None marks the wall cell.
        self.reward_map = np.array([
            [0, 0, 0, 1.0],
            [0, None, 0, -1.0],
            [0, 0, 0, 0],
        ])
        self.goal_state = (0, 3)
        self.wall_state = (1, 1)
        self.start_state = (2, 0)
        self.agent_state = self.start_state  # where the agent currently is

    @property
    def height(self):
        """Number of rows in the reward map."""
        return len(self.reward_map)

    @property
    def width(self):
        """Number of columns in the reward map."""
        return len(self.reward_map[0])

    @property
    def shape(self):
        """Shape of the reward map array."""
        return self.reward_map.shape

    def actions(self):
        """Return the list of available actions."""
        return self.action_space

    def states(self):
        """Yield every ``(row, column)`` coordinate in row-major order."""
        for row in range(self.height):
            yield from ((row, col) for col in range(self.width))

    def next_state(self, state, action):
        """Return the state reached by taking ``action`` in ``state``."""
        d_row, d_col = [(-1, 0), (1, 0), (0, -1), (0, 1)][action]
        candidate = (state[0] + d_row, state[1] + d_col)
        row, col = candidate

        # Leaving the grid or hitting the wall keeps the agent in place.
        inside = 0 <= col < self.width and 0 <= row < self.height
        if not inside or candidate == self.wall_state:
            return state
        return candidate

    def reward(self, state, action, next_state):
        """Return the reward-map entry of ``next_state``."""
        return self.reward_map[next_state]

    def reset(self):
        """Put the agent back on the start state and return that state."""
        self.agent_state = self.start_state
        return self.agent_state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        ``done`` is True exactly when the agent lands on the goal state.
        """
        previous = self.agent_state
        landed = self.next_state(previous, action)
        gained = self.reward(previous, action, landed)
        self.agent_state = landed
        return landed, gained, landed == self.goal_state

    def render_v(self, v=None, policy=None, print_value=True):
        """Draw a state-value table (and optionally a policy) with a new Renderer."""
        render_helper.Renderer(
            self.reward_map, self.goal_state, self.wall_state
        ).render_v(v, policy, print_value)

    def render_q(self, q=None, print_value=True):
        """Draw an action-value table with a new Renderer."""
        render_helper.Renderer(
            self.reward_map, self.goal_state, self.wall_state
        ).render_q(q, print_value)
def render_v(self, v=None, policy=None, print_value=True):
    """Draw the grid, optionally with a value heat map and policy arrows.

    v: mapping from ``(row, col)`` state to value; None skips the heat map.
    policy: mapping from state to ``{action: probability}``; None skips arrows.
    print_value: when true and ``v`` is given, write each value in its cell.
    """
    self.set_figure()

    rows, cols = self.ys, self.xs
    axes = self.ax

    if v is not None:
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            'colormap_name', ['red', 'white', 'green'])

        # Turn the state -> value mapping into a dense array.
        value_grid = np.zeros(self.reward_map.shape)
        for cell, value in v.items():
            value_grid[cell] = value
        v = value_grid

        # Colour range is symmetric around zero and never narrower than [-1, 1].
        bound = max(v.max(), abs(v.min()))
        vmin = -1 * bound
        vmax = 1 if bound < 1 else bound
        vmin = -1 if vmin > -1 else vmin

        axes.pcolormesh(np.flipud(v), cmap=cmap, vmin=vmin, vmax=vmax)

    arrows = ["↑", "↓", "←", "→"]
    arrow_offsets = [(0, 0.1), (0, -0.1), (-0.1, 0), (0.1, 0)]

    for y in range(rows):
        for x in range(cols):
            state = (y, x)
            is_wall = state == self.wall_state

            # Label cells that carry a non-zero reward.
            r = self.reward_map[y, x]
            if r is not None and r != 0:
                txt = 'R ' + str(r)
                if state == self.goal_state:
                    txt = txt + ' (GOAL)'
                axes.text(x+.1, rows-y-0.9, txt)

            if v is not None and not is_wall and print_value:
                # A different text offset is used once the grid has more than 7 rows.
                dx, dy = (-0.15, -0.3) if v.shape[0] > 7 else (0.4, -0.15)
                axes.text(x+dx, rows-y+dy, "{:12.2f}".format(v[y, x]))

            if policy is not None and not is_wall:
                actions = policy[state]
                top = max(actions.values(), default=None)
                max_actions = [a for a, p in actions.items() if p == top]

                for action in max_actions:
                    arrow = arrows[action]
                    ox, oy = arrow_offsets[action]
                    if state == self.goal_state:
                        continue  # no arrow is drawn on the goal cell
                    axes.text(x+0.45+ox, rows-y-0.5+oy, arrow)

            if is_wall:
                axes.add_patch(plt.Rectangle((x, rows-y-1), 1, 1, fc=(0.4, 0.4, 0.4, 1.)))
    plt.show()
91 | self.set_figure() 92 | 93 | ys, xs = self.ys, self.xs 94 | ax = self.ax 95 | action_space = [0, 1, 2, 3] 96 | 97 | qmax, qmin = max(q.values()), min(q.values()) 98 | qmax = max(qmax, abs(qmin)) 99 | qmin = -1 * qmax 100 | qmax = 1 if qmax < 1 else qmax 101 | qmin = -1 if qmin > -1 else qmin 102 | 103 | 104 | color_list = ['red', 'white', 'green'] 105 | cmap = matplotlib.colors.LinearSegmentedColormap.from_list( 106 | 'colormap_name', color_list) 107 | 108 | for y in range(ys): 109 | for x in range(xs): 110 | for action in action_space: 111 | state = (y, x) 112 | r = self.reward_map[y, x] 113 | if r != 0 and r is not None: 114 | txt = 'R ' + str(r) 115 | if state == self.goal_state: 116 | txt = txt + ' (GOAL)' 117 | ax.text(x+.05, ys-y-0.95, txt) 118 | 119 | if state == self.goal_state: 120 | continue 121 | 122 | tx, ty = x, ys-y-1 123 | 124 | action_map = { 125 | 0: ((0.5+tx, 0.5+ty), (tx+1, ty+1), (tx, ty+1)), 126 | 1: ((tx, ty), (tx+1, ty), (tx+0.5, ty+0.5)), 127 | 2: ((tx, ty), (tx+0.5, ty+0.5), (tx, ty+1)), 128 | 3: ((0.5+tx, 0.5+ty), (tx+1, ty), (tx+1, ty+1)), 129 | } 130 | offset_map = { 131 | 0: (0.1, 0.8), 132 | 1: (0.1, 0.1), 133 | 2: (-0.2, 0.4), 134 | 3: (0.4, 0.4), 135 | } 136 | if state == self.wall_state: 137 | ax.add_patch(plt.Rectangle((tx, ty), 1, 1, fc=(0.4, 0.4, 0.4, 1.))) 138 | elif state in self.goal_state: 139 | ax.add_patch(plt.Rectangle((tx, ty), 1, 1, fc=(0., 1., 0., 1.))) 140 | else: 141 | 142 | tq = q[(state, action)] 143 | color_scale = 0.5 + (tq / qmax) / 2 # normalize: 0.0-1.0 144 | 145 | poly = plt.Polygon(action_map[action],fc=cmap(color_scale)) 146 | ax.add_patch(poly) 147 | 148 | offset= offset_map[action] 149 | ax.text(tx+offset[0], ty+offset[1], "{:12.2f}".format(tq)) 150 | plt.show() 151 | 152 | if show_greedy_policy: 153 | policy = {} 154 | for y in range(self.ys): 155 | for x in range(self.xs): 156 | state = (y, x) 157 | qs = [q[state, action] for action in range(4)] # action_size 158 | max_action = np.argmax(qs) 159 | 
def argmax(xs):
    """Return the index of the largest element of ``xs``, ties broken at random.

    When exactly one element equals the maximum its index is returned
    without touching the random generator. When several do, one of their
    indices is drawn with ``np.random.choice``.

    Raises:
        ValueError: if ``xs`` is empty.
    """
    # Compute the maximum once; the comprehension used to recompute it for
    # every element, which made the function quadratic in len(xs).
    best = max(xs)
    idxes = [i for i, x in enumerate(xs) if x == best]
    if len(idxes) == 1:
        return idxes[0]
    if not idxes:
        # No element compares equal to the maximum (e.g. the maximum is NaN):
        # fall back to a uniformly random index.
        return np.random.choice(len(xs))

    return np.random.choice(idxes)


def greedy_probs(Q, state, epsilon=0, action_size=4):
    """Return epsilon-greedy action probabilities for ``state``.

    ``Q`` is indexed with ``(state, action)`` keys for every action in
    ``range(action_size)``. Each action receives ``epsilon / action_size``
    and the greedy action additionally receives ``1 - epsilon``.
    """
    qs = [Q[(state, action)] for action in range(action_size)]
    max_action = argmax(qs)  # ties are broken at random by argmax
    base_prob = epsilon / action_size
    # e.g. {0: eps/4, 1: eps/4, 2: eps/4, 3: eps/4} for four actions
    action_probs = {action: base_prob for action in range(action_size)}
    action_probs[max_action] += (1 - epsilon)
    return action_probs


def plot_total_reward(reward_history):
    """Plot the per-episode total reward against the episode index."""
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.plot(range(len(reward_history)), reward_history)
    plt.show()
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "64a9da69", 6 | "metadata": {}, 7 | "source": [ 8 | "## SETUP" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "1e89c815", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install gym[classic_control]\n", 19 | "!pip install dezero" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "d354811c", 25 | "metadata": {}, 26 | "source": [ 27 | "## ch08/gym_play.py" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "id": "923b86c1", 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "action: 0\n", 41 | "action: 0\n", 42 | "action: 0\n", 43 | "action: 1\n", 44 | "action: 1\n", 45 | "action: 0\n", 46 | "action: 0\n", 47 | "action: 0\n", 48 | "action: 0\n", 49 | "action: 0\n", 50 | "action: 0\n", 51 | "action: 1\n", 52 | "action: 0\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "import numpy as np\n", 58 | "import gym\n", 59 | "\n", 60 | "\n", 61 | "env = gym.make('CartPole-v0')\n", 62 | "state = env.reset()\n", 63 | "done = False\n", 64 | "\n", 65 | "while not done:\n", 66 | " #env.render()\n", 67 | " action = np.random.choice([0, 1])\n", 68 | " next_state, reward, done, info = env.step(action)\n", 69 | " print('action:', action)\n", 70 | "#env.close()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "f3c0b72d", 76 | "metadata": {}, 77 | "source": [ 78 | "## ch08/replay_buffer.py" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "id": "41011871", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "(32, 4)\n", 92 | "(32,)\n", 93 | "(32,)\n", 94 | "(32, 4)\n", 95 | "(32,)\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "from collections import deque\n", 
101 | "import random\n", 102 | "import numpy as np\n", 103 | "import gym\n", 104 | "\n", 105 | "\n", 106 | "class ReplayBuffer:\n", 107 | " def __init__(self, buffer_size, batch_size):\n", 108 | " self.buffer = deque(maxlen=buffer_size)\n", 109 | " self.batch_size = batch_size\n", 110 | "\n", 111 | " def add(self, state, action, reward, next_state, done):\n", 112 | " data = (state, action, reward, next_state, done)\n", 113 | " self.buffer.append(data)\n", 114 | "\n", 115 | " def __len__(self):\n", 116 | " return len(self.buffer)\n", 117 | "\n", 118 | " def get_batch(self):\n", 119 | " data = random.sample(self.buffer, self.batch_size)\n", 120 | "\n", 121 | " state = np.stack([x[0] for x in data])\n", 122 | " action = np.array([x[1] for x in data])\n", 123 | " reward = np.array([x[2] for x in data])\n", 124 | " next_state = np.stack([x[3] for x in data])\n", 125 | " done = np.array([x[4] for x in data]).astype(np.int32)\n", 126 | " return state, action, reward, next_state, done\n", 127 | "\n", 128 | "\n", 129 | "env = gym.make('CartPole-v0')\n", 130 | "replay_buffer = ReplayBuffer(buffer_size=10000, batch_size=32)\n", 131 | "\n", 132 | "for episode in range(10):\n", 133 | " state = env.reset()\n", 134 | " done = False\n", 135 | "\n", 136 | " while not done:\n", 137 | " action = 0\n", 138 | " next_state, reward, done, info = env.step(action)\n", 139 | " replay_buffer.add(state, action, reward, next_state, done)\n", 140 | " state = next_state\n", 141 | "\n", 142 | "state, action, reward, next_state, done = replay_buffer.get_batch()\n", 143 | "print(state.shape) # (32, 4)\n", 144 | "print(action.shape) # (32,)\n", 145 | "print(reward.shape) # (32,)\n", 146 | "print(next_state.shape) # (32, 4)\n", 147 | "print(done.shape) # (32,)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "13c09484", 153 | "metadata": {}, 154 | "source": [ 155 | "## ch08/dqn.py" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "id": 
"5e448679", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "episode :0, total reward : 14.0\n", 169 | "episode :10, total reward : 98.0\n", 170 | "episode :20, total reward : 38.0\n", 171 | "episode :30, total reward : 20.0\n", 172 | "episode :40, total reward : 12.0\n", 173 | "episode :50, total reward : 10.0\n", 174 | "episode :60, total reward : 23.0\n", 175 | "episode :70, total reward : 17.0\n", 176 | "episode :80, total reward : 9.0\n", 177 | "episode :90, total reward : 124.0\n", 178 | "episode :100, total reward : 122.0\n", 179 | "episode :110, total reward : 186.0\n", 180 | "episode :120, total reward : 156.0\n", 181 | "episode :130, total reward : 198.0\n", 182 | "episode :140, total reward : 146.0\n", 183 | "episode :150, total reward : 200.0\n", 184 | "episode :160, total reward : 200.0\n", 185 | "episode :170, total reward : 193.0\n", 186 | "episode :180, total reward : 200.0\n", 187 | "episode :190, total reward : 142.0\n", 188 | "episode :200, total reward : 200.0\n", 189 | "episode :210, total reward : 200.0\n", 190 | "episode :220, total reward : 179.0\n", 191 | "episode :230, total reward : 149.0\n", 192 | "episode :240, total reward : 200.0\n", 193 | "episode :250, total reward : 200.0\n", 194 | "episode :260, total reward : 200.0\n", 195 | "episode :270, total reward : 200.0\n", 196 | "episode :280, total reward : 200.0\n", 197 | "episode :290, total reward : 200.0\n" 198 | ] 199 | }, 200 | { 201 | "data": { 202 | "image/png": "\n", 203 | "text/plain": [ 204 | "
" 205 | ] 206 | }, 207 | "metadata": { 208 | "needs_background": "light" 209 | }, 210 | "output_type": "display_data" 211 | }, 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Total Reward: 200.0\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "import copy\n", 222 | "from collections import deque\n", 223 | "import random\n", 224 | "import matplotlib.pyplot as plt\n", 225 | "import numpy as np\n", 226 | "import gym\n", 227 | "from dezero import Model\n", 228 | "from dezero import optimizers\n", 229 | "import dezero.functions as F\n", 230 | "import dezero.layers as L\n", 231 | "\n", 232 | "\n", 233 | "class ReplayBuffer:\n", 234 | " def __init__(self, buffer_size, batch_size):\n", 235 | " self.buffer = deque(maxlen=buffer_size)\n", 236 | " self.batch_size = batch_size\n", 237 | "\n", 238 | " def add(self, state, action, reward, next_state, done):\n", 239 | " data = (state, action, reward, next_state, done)\n", 240 | " self.buffer.append(data)\n", 241 | "\n", 242 | " def __len__(self):\n", 243 | " return len(self.buffer)\n", 244 | "\n", 245 | " def get_batch(self):\n", 246 | " data = random.sample(self.buffer, self.batch_size)\n", 247 | "\n", 248 | " state = np.stack([x[0] for x in data])\n", 249 | " action = np.array([x[1] for x in data])\n", 250 | " reward = np.array([x[2] for x in data])\n", 251 | " next_state = np.stack([x[3] for x in data])\n", 252 | " done = np.array([x[4] for x in data]).astype(np.int32)\n", 253 | " return state, action, reward, next_state, done\n", 254 | "\n", 255 | "\n", 256 | "class QNet(Model):\n", 257 | " def __init__(self, action_size):\n", 258 | " super().__init__()\n", 259 | " self.l1 = L.Linear(128)\n", 260 | " self.l2 = L.Linear(128)\n", 261 | " self.l3 = L.Linear(action_size)\n", 262 | "\n", 263 | " def forward(self, x):\n", 264 | " x = F.relu(self.l1(x))\n", 265 | " x = F.relu(self.l2(x))\n", 266 | " x = self.l3(x)\n", 267 | " return x\n", 268 | "\n", 269 | "\n", 270 | "class DQNAgent:\n", 
271 | " def __init__(self):\n", 272 | " self.gamma = 0.98\n", 273 | " self.lr = 0.0005\n", 274 | " self.epsilon = 0.1\n", 275 | " self.buffer_size = 10000\n", 276 | " self.batch_size = 32\n", 277 | " self.action_size = 2\n", 278 | "\n", 279 | " self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)\n", 280 | " self.qnet = QNet(self.action_size)\n", 281 | " self.qnet_target = QNet(self.action_size)\n", 282 | " self.optimizer = optimizers.Adam(self.lr)\n", 283 | " self.optimizer.setup(self.qnet)\n", 284 | "\n", 285 | " def get_action(self, state):\n", 286 | " if np.random.rand() < self.epsilon:\n", 287 | " return np.random.choice(self.action_size)\n", 288 | " else:\n", 289 | " state = state[np.newaxis, :]\n", 290 | " qs = self.qnet(state)\n", 291 | " return qs.data.argmax()\n", 292 | "\n", 293 | " def update(self, state, action, reward, next_state, done):\n", 294 | " self.replay_buffer.add(state, action, reward, next_state, done)\n", 295 | " if len(self.replay_buffer) < self.batch_size:\n", 296 | " return\n", 297 | "\n", 298 | " state, action, reward, next_state, done = self.replay_buffer.get_batch()\n", 299 | " qs = self.qnet(state)\n", 300 | " q = qs[np.arange(self.batch_size), action]\n", 301 | "\n", 302 | " next_qs = self.qnet_target(next_state)\n", 303 | " next_q = next_qs.max(axis=1)\n", 304 | " next_q.unchain()\n", 305 | " target = reward + (1 - done) * self.gamma * next_q\n", 306 | "\n", 307 | " loss = F.mean_squared_error(q, target)\n", 308 | "\n", 309 | " self.qnet.cleargrads()\n", 310 | " loss.backward()\n", 311 | " self.optimizer.update()\n", 312 | "\n", 313 | " def sync_qnet(self):\n", 314 | " self.qnet_target = copy.deepcopy(self.qnet)\n", 315 | "\n", 316 | "episodes = 300\n", 317 | "sync_interval = 20\n", 318 | "env = gym.make('CartPole-v0')\n", 319 | "agent = DQNAgent()\n", 320 | "reward_history = []\n", 321 | "\n", 322 | "for episode in range(episodes):\n", 323 | " state = env.reset()\n", 324 | " done = False\n", 325 | " total_reward = 
0\n", 326 | "\n", 327 | " while not done:\n", 328 | " action = agent.get_action(state)\n", 329 | " next_state, reward, done, info = env.step(action)\n", 330 | "\n", 331 | " agent.update(state, action, reward, next_state, done)\n", 332 | " state = next_state\n", 333 | " total_reward += reward\n", 334 | "\n", 335 | " if episode % sync_interval == 0:\n", 336 | " agent.sync_qnet()\n", 337 | "\n", 338 | " reward_history.append(total_reward)\n", 339 | " if episode % 10 == 0:\n", 340 | " print(\"episode :{}, total reward : {}\".format(episode, total_reward))\n", 341 | "\n", 342 | "\n", 343 | "# === Plot ===\n", 344 | "plt.xlabel('Episode')\n", 345 | "plt.ylabel('Total Reward')\n", 346 | "plt.plot(range(len(reward_history)), reward_history)\n", 347 | "plt.show()\n", 348 | "\n", 349 | "\n", 350 | "# === Play CartPole ===\n", 351 | "agent.epsilon = 0 # greedy policy\n", 352 | "state = env.reset()\n", 353 | "done = False\n", 354 | "total_reward = 0\n", 355 | "\n", 356 | "while not done:\n", 357 | " action = agent.get_action(state)\n", 358 | " next_state, reward, done, info = env.step(action)\n", 359 | " state = next_state\n", 360 | " total_reward += reward\n", 361 | " #env.render()\n", 362 | "print('Total Reward:', total_reward)" 363 | ] 364 | } 365 | ], 366 | "metadata": { 367 | "kernelspec": { 368 | "display_name": "Python 3 (ipykernel)", 369 | "language": "python", 370 | "name": "python3" 371 | }, 372 | "language_info": { 373 | "codemirror_mode": { 374 | "name": "ipython", 375 | "version": 3 376 | }, 377 | "file_extension": ".py", 378 | "mimetype": "text/x-python", 379 | "name": "python", 380 | "nbconvert_exporter": "python", 381 | "pygments_lexer": "ipython3", 382 | "version": "3.9.4" 383 | } 384 | }, 385 | "nbformat": 4, 386 | "nbformat_minor": 5 387 | } 388 | -------------------------------------------------------------------------------- /notebooks/09_policy_gradient.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dcad5835", 6 | "metadata": {}, 7 | "source": [ 8 | "## SETUP" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "cc983bba", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install gym[classic_control]\n", 19 | "!pip install dezero" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "758a1577", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import matplotlib.pyplot as plt\n", 30 | "\n", 31 | "# utility functions (common functions)\n", 32 | "def plot_total_reward(reward_history):\n", 33 | " plt.xlabel('Episode')\n", 34 | " plt.ylabel('Total Reward')\n", 35 | " plt.plot(range(len(reward_history)), reward_history)\n", 36 | " plt.show()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "ab0de828", 42 | "metadata": {}, 43 | "source": [ 44 | "## ch09/simple_pg.py" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "id": "b311d659", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "episode :0, total reward : 11.0\n", 58 | "episode :100, total reward : 26.0\n", 59 | "episode :200, total reward : 14.0\n", 60 | "episode :300, total reward : 22.0\n", 61 | "episode :400, total reward : 13.0\n", 62 | "episode :500, total reward : 13.0\n", 63 | "episode :600, total reward : 75.0\n", 64 | "episode :700, total reward : 57.0\n", 65 | "episode :800, total reward : 30.0\n", 66 | "episode :900, total reward : 82.0\n", 67 | "episode :1000, total reward : 74.0\n", 68 | "episode :1100, total reward : 176.0\n", 69 | "episode :1200, total reward : 74.0\n", 70 | "episode :1300, total reward : 41.0\n", 71 | "episode :1400, total reward : 76.0\n", 72 | "episode :1500, total reward : 79.0\n", 73 | "episode :1600, total reward : 94.0\n", 74 | "episode :1700, total reward : 187.0\n", 75 | "episode :1800, total reward : 
28.0\n", 76 | "episode :1900, total reward : 23.0\n", 77 | "episode :2000, total reward : 133.0\n", 78 | "episode :2100, total reward : 69.0\n", 79 | "episode :2200, total reward : 52.0\n", 80 | "episode :2300, total reward : 113.0\n", 81 | "episode :2400, total reward : 47.0\n", 82 | "episode :2500, total reward : 101.0\n", 83 | "episode :2600, total reward : 83.0\n", 84 | "episode :2700, total reward : 63.0\n", 85 | "episode :2800, total reward : 40.0\n", 86 | "episode :2900, total reward : 102.0\n" 87 | ] 88 | }, 89 | { 90 | "data": { 91 | "image/png": "\n", 92 | "text/plain": [ 93 | "
" 94 | ] 95 | }, 96 | "metadata": { 97 | "needs_background": "light" 98 | }, 99 | "output_type": "display_data" 100 | } 101 | ], 102 | "source": [ 103 | "import numpy as np\n", 104 | "import gym\n", 105 | "from dezero import Model\n", 106 | "from dezero import optimizers\n", 107 | "import dezero.functions as F\n", 108 | "import dezero.layers as L\n", 109 | "\n", 110 | "\n", 111 | "class Policy(Model):\n", 112 | " def __init__(self, action_size):\n", 113 | " super().__init__()\n", 114 | " self.l1 = L.Linear(128)\n", 115 | " self.l2 = L.Linear(action_size)\n", 116 | "\n", 117 | " def forward(self, x):\n", 118 | " x = F.relu(self.l1(x))\n", 119 | " x = F.softmax(self.l2(x))\n", 120 | " return x\n", 121 | "\n", 122 | "\n", 123 | "class Agent:\n", 124 | " def __init__(self):\n", 125 | " self.gamma = 0.98\n", 126 | " self.lr = 0.0002\n", 127 | " self.action_size = 2\n", 128 | "\n", 129 | " self.memory = []\n", 130 | " self.pi = Policy(self.action_size)\n", 131 | " self.optimizer = optimizers.Adam(self.lr)\n", 132 | " self.optimizer.setup(self.pi)\n", 133 | "\n", 134 | " def get_action(self, state):\n", 135 | " state = state[np.newaxis, :]\n", 136 | " probs = self.pi(state)\n", 137 | " probs = probs[0]\n", 138 | " action = np.random.choice(len(probs), p=probs.data)\n", 139 | " return action, probs[action]\n", 140 | "\n", 141 | " def add(self, reward, prob):\n", 142 | " data = (reward, prob)\n", 143 | " self.memory.append(data)\n", 144 | "\n", 145 | " def update(self):\n", 146 | " self.pi.cleargrads()\n", 147 | "\n", 148 | " G, loss = 0, 0\n", 149 | " for reward, prob in reversed(self.memory):\n", 150 | " G = reward + self.gamma * G\n", 151 | "\n", 152 | " for reward, prob in self.memory:\n", 153 | " loss += -F.log(prob) * G\n", 154 | "\n", 155 | " loss.backward()\n", 156 | " self.optimizer.update()\n", 157 | " self.memory = []\n", 158 | "\n", 159 | "\n", 160 | "episodes = 3000\n", 161 | "env = gym.make('CartPole-v0')\n", 162 | "agent = Agent()\n", 163 | "reward_history = 
[]\n", 164 | "\n", 165 | "for episode in range(episodes):\n", 166 | " state = env.reset()\n", 167 | " done = False\n", 168 | " total_reward = 0\n", 169 | "\n", 170 | " while not done:\n", 171 | " action, prob = agent.get_action(state)\n", 172 | " next_state, reward, done, info = env.step(action)\n", 173 | "\n", 174 | " agent.add(reward, prob)\n", 175 | " state = next_state\n", 176 | " total_reward += reward\n", 177 | "\n", 178 | " agent.update()\n", 179 | "\n", 180 | " reward_history.append(total_reward)\n", 181 | " if episode % 100 == 0:\n", 182 | " print(\"episode :{}, total reward : {:.1f}\".format(episode, total_reward))\n", 183 | "\n", 184 | "\n", 185 | "# plot\n", 186 | "plot_total_reward(reward_history)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "2acaed03", 192 | "metadata": {}, 193 | "source": [ 194 | "## ch09/reinforce.py" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 4, 200 | "id": "7d956534", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "episode :0, total reward : 13.0\n", 208 | "episode :100, total reward : 48.0\n", 209 | "episode :200, total reward : 36.0\n", 210 | "episode :300, total reward : 26.0\n", 211 | "episode :400, total reward : 48.0\n", 212 | "episode :500, total reward : 33.0\n", 213 | "episode :600, total reward : 193.0\n", 214 | "episode :700, total reward : 58.0\n", 215 | "episode :800, total reward : 124.0\n", 216 | "episode :900, total reward : 200.0\n", 217 | "episode :1000, total reward : 108.0\n", 218 | "episode :1100, total reward : 200.0\n", 219 | "episode :1200, total reward : 200.0\n", 220 | "episode :1300, total reward : 176.0\n", 221 | "episode :1400, total reward : 122.0\n", 222 | "episode :1500, total reward : 200.0\n", 223 | "episode :1600, total reward : 196.0\n", 224 | "episode :1700, total reward : 200.0\n", 225 | "episode :1800, total reward : 200.0\n", 226 | "episode :1900, total 
reward : 170.0\n", 227 | "episode :2000, total reward : 200.0\n", 228 | "episode :2100, total reward : 200.0\n", 229 | "episode :2200, total reward : 200.0\n", 230 | "episode :2300, total reward : 200.0\n", 231 | "episode :2400, total reward : 200.0\n", 232 | "episode :2500, total reward : 200.0\n", 233 | "episode :2600, total reward : 200.0\n", 234 | "episode :2700, total reward : 200.0\n", 235 | "episode :2800, total reward : 133.0\n", 236 | "episode :2900, total reward : 200.0\n" 237 | ] 238 | }, 239 | { 240 | "data": { 241 | "image/png": "\n", 242 | "text/plain": [ 243 | "
" 244 | ] 245 | }, 246 | "metadata": { 247 | "needs_background": "light" 248 | }, 249 | "output_type": "display_data" 250 | } 251 | ], 252 | "source": [ 253 | "class Policy(Model):\n", 254 | " def __init__(self, action_size):\n", 255 | " super().__init__()\n", 256 | " self.l1 = L.Linear(128)\n", 257 | " self.l2 = L.Linear(action_size)\n", 258 | "\n", 259 | " def forward(self, x):\n", 260 | " x = F.relu(self.l1(x))\n", 261 | " x = F.softmax(self.l2(x))\n", 262 | " return x\n", 263 | "\n", 264 | "\n", 265 | "class Agent:\n", 266 | " def __init__(self):\n", 267 | " self.gamma = 0.98\n", 268 | " self.lr = 0.0002\n", 269 | " self.action_size = 2\n", 270 | "\n", 271 | " self.memory = []\n", 272 | " self.pi = Policy(self.action_size)\n", 273 | " self.optimizer = optimizers.Adam(self.lr)\n", 274 | " self.optimizer.setup(self.pi)\n", 275 | "\n", 276 | " def get_action(self, state):\n", 277 | " state = state[np.newaxis, :]\n", 278 | " probs = self.pi(state)\n", 279 | " probs = probs[0]\n", 280 | " action = np.random.choice(len(probs), p=probs.data)\n", 281 | " return action, probs[action]\n", 282 | "\n", 283 | " def add(self, reward, prob):\n", 284 | " data = (reward, prob)\n", 285 | " self.memory.append(data)\n", 286 | "\n", 287 | " def update(self):\n", 288 | " self.pi.cleargrads()\n", 289 | "\n", 290 | " G, loss = 0, 0\n", 291 | " for reward, prob in reversed(self.memory):\n", 292 | " G = reward + self.gamma * G\n", 293 | " loss += -F.log(prob) * G\n", 294 | "\n", 295 | " loss.backward()\n", 296 | " self.optimizer.update()\n", 297 | " self.memory = []\n", 298 | "\n", 299 | "\n", 300 | "episodes = 3000\n", 301 | "env = gym.make('CartPole-v0')\n", 302 | "agent = Agent()\n", 303 | "reward_history = []\n", 304 | "\n", 305 | "for episode in range(episodes):\n", 306 | " state = env.reset()\n", 307 | " done = False\n", 308 | " sum_reward = 0\n", 309 | "\n", 310 | " while not done:\n", 311 | " action, prob = agent.get_action(state)\n", 312 | " next_state, reward, done, info = 
env.step(action)\n", 313 | "\n", 314 | " agent.add(reward, prob)\n", 315 | " state = next_state\n", 316 | " sum_reward += reward\n", 317 | "\n", 318 | " agent.update()\n", 319 | "\n", 320 | " reward_history.append(sum_reward)\n", 321 | " if episode % 100 == 0:\n", 322 | " print(\"episode :{}, total reward : {:.1f}\".format(episode, sum_reward))\n", 323 | "\n", 324 | "\n", 325 | "# plot\n", 326 | "plot_total_reward(reward_history)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "3372fddf", 332 | "metadata": {}, 333 | "source": [ 334 | "## ch09/actor_critic.py" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "id": "59a22521", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "episode :0, total reward : 41.0\n", 348 | "episode :100, total reward : 12.0\n", 349 | "episode :200, total reward : 10.0\n", 350 | "episode :300, total reward : 15.0\n", 351 | "episode :400, total reward : 67.0\n", 352 | "episode :500, total reward : 54.0\n", 353 | "episode :600, total reward : 166.0\n", 354 | "episode :700, total reward : 200.0\n", 355 | "episode :800, total reward : 151.0\n", 356 | "episode :900, total reward : 200.0\n", 357 | "episode :1000, total reward : 200.0\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "class PolicyNet(Model):\n", 363 | " def __init__(self, action_size=2):\n", 364 | " super().__init__()\n", 365 | " self.l1 = L.Linear(128)\n", 366 | " self.l2 = L.Linear(action_size)\n", 367 | "\n", 368 | " def forward(self, x):\n", 369 | " x = F.relu(self.l1(x))\n", 370 | " x = self.l2(x)\n", 371 | " x = F.softmax(x)\n", 372 | " return x\n", 373 | "\n", 374 | "\n", 375 | "class ValueNet(Model):\n", 376 | " def __init__(self):\n", 377 | " super().__init__()\n", 378 | " self.l1 = L.Linear(128)\n", 379 | " self.l2 = L.Linear(1)\n", 380 | "\n", 381 | " def forward(self, x):\n", 382 | " x = F.relu(self.l1(x))\n", 383 | " x = 
self.l2(x)\n", 384 | " return x\n", 385 | "\n", 386 | "\n", 387 | "class Agent:\n", 388 | " def __init__(self):\n", 389 | " self.gamma = 0.98\n", 390 | " self.lr_pi = 0.0002\n", 391 | " self.lr_v = 0.0005\n", 392 | " self.action_size = 2\n", 393 | "\n", 394 | " self.pi = PolicyNet()\n", 395 | " self.v = ValueNet()\n", 396 | " self.optimizer_pi = optimizers.Adam(self.lr_pi).setup(self.pi)\n", 397 | " self.optimizer_v = optimizers.Adam(self.lr_v).setup(self.v)\n", 398 | "\n", 399 | " def get_action(self, state):\n", 400 | " state = state[np.newaxis, :] # add batch axis\n", 401 | " probs = self.pi(state)\n", 402 | " probs = probs[0]\n", 403 | " action = np.random.choice(len(probs), p=probs.data)\n", 404 | " return action, probs[action]\n", 405 | "\n", 406 | " def update(self, state, action_prob, reward, next_state, done):\n", 407 | " state = state[np.newaxis, :] # add batch axis\n", 408 | " next_state = next_state[np.newaxis, :]\n", 409 | "\n", 410 | " # ========== (1) Update V network ===========\n", 411 | " target = reward + self.gamma * self.v(next_state) * (1 - done)\n", 412 | " target.unchain()\n", 413 | " v = self.v(state)\n", 414 | " loss_v = F.mean_squared_error(v, target)\n", 415 | "\n", 416 | " # ========== (2) Update pi network ===========\n", 417 | " delta = target - v\n", 418 | " delta.unchain()\n", 419 | " loss_pi = -F.log(action_prob) * delta\n", 420 | "\n", 421 | " self.v.cleargrads()\n", 422 | " self.pi.cleargrads()\n", 423 | " loss_v.backward()\n", 424 | " loss_pi.backward()\n", 425 | " self.optimizer_v.update()\n", 426 | " self.optimizer_pi.update()\n", 427 | "\n", 428 | "\n", 429 | "episodes = 3000\n", 430 | "env = gym.make('CartPole-v0')\n", 431 | "agent = Agent()\n", 432 | "reward_history = []\n", 433 | "\n", 434 | "for episode in range(episodes):\n", 435 | " state = env.reset()\n", 436 | " done = False\n", 437 | " total_reward = 0\n", 438 | "\n", 439 | " while not done:\n", 440 | " action, prob = agent.get_action(state)\n", 441 | " next_state, 
class PolicyNet(nn.Module):
    """Two-layer MLP mapping a batch of states to action probabilities.

    The input width is hard-coded to 4 features and the hidden width to 128.

    Args:
        action_size: Number of discrete actions (width of the output layer).
    """

    def __init__(self, action_size):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, action_size)

    def forward(self, x):
        """Return softmax probabilities over actions for each row of ``x``."""
        x = F.relu(self.l1(x))
        # dim=1 normalises across actions, so ``x`` must carry a batch axis.
        x = F.softmax(self.l2(x), dim=1)
        return x


class ValueNet(nn.Module):
    """Two-layer MLP mapping a batch of 4-feature states to a scalar value."""

    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, 1)

    def forward(self, x):
        """Return one state-value estimate per row of ``x`` (shape ``(N, 1)``)."""
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x


class Agent:
    """One-step actor-critic agent with separate policy and value networks."""

    def __init__(self):
        self.gamma = 0.98      # discount factor
        self.lr_pi = 0.0002    # learning rate of the policy network
        self.lr_v = 0.0005     # learning rate of the value network
        self.action_size = 2

        self.pi = PolicyNet(self.action_size)
        self.v = ValueNet()

        self.optimizer_pi = optim.Adam(self.pi.parameters(), lr=self.lr_pi)
        self.optimizer_v = optim.Adam(self.v.parameters(), lr=self.lr_v)

    def get_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: 1-D NumPy array holding a single observation.

        Returns:
            ``(action, prob)`` where ``action`` is a Python int and ``prob`` is
            the probability of that action as a tensor that is still attached
            to the policy graph (``update`` differentiates through it).
        """
        state = torch.tensor(state[np.newaxis, :])  # add batch axis
        probs = self.pi(state)[0]
        action = Categorical(probs).sample().item()
        return action, probs[action]

    def update(self, state, action_prob, reward, next_state, done):
        """Run one TD(0) update of the value and the policy networks.

        Args:
            state: 1-D NumPy array, observation before the step.
            action_prob: Tensor returned by ``get_action`` for the taken action.
            reward: Scalar reward received for the step.
            next_state: 1-D NumPy array, observation after the step.
            done: Truthy when the episode ended at this step.
        """
        state = torch.tensor(state[np.newaxis, :])  # add batch axis
        next_state = torch.tensor(next_state[np.newaxis, :])

        # ---- (1) value loss -------------------------------------------------
        # The TD target must be a constant. ``Tensor.detach()`` is not
        # in-place, so calling it without using the result (as the previous
        # code did) left the target in the graph and let the critic loss
        # back-propagate through V(next_state). Computing it under no_grad
        # gives the intended semi-gradient update.
        with torch.no_grad():
            target = reward + self.gamma * self.v(next_state) * (1 - done)
        v = self.v(state)
        loss_fn = nn.MSELoss()
        loss_v = loss_fn(v, target)

        # ---- (2) policy loss ------------------------------------------------
        # The TD error only scales the policy gradient, hence ``.item()``.
        delta = target - v
        loss_pi = -torch.log(action_prob) * delta.item()

        self.optimizer_v.zero_grad()
        self.optimizer_pi.zero_grad()
        loss_v.backward()
        loss_pi.backward()
        self.optimizer_v.step()
        self.optimizer_pi.step()
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Args:
        buffer_size: Maximum number of transitions kept; the oldest entries
            are dropped first once the buffer is full.
        batch_size: Number of transitions returned by ``get_batch``.
    """

    def __init__(self, buffer_size, batch_size):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        data = (state, action, reward, next_state, done)
        self.buffer.append(data)

    def __len__(self):
        return len(self.buffer)

    def get_batch(self):
        """Sample ``batch_size`` transitions without replacement.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` of tensors.
            ``state``/``next_state`` keep the dtype of the stored arrays;
            ``action`` is int64, ``reward`` float32 and ``done`` int32.

        Raises:
            ValueError: If fewer than ``batch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        data = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*data)

        state = torch.tensor(np.stack(states))
        # np.long was deprecated in NumPy 1.20 and removed in 1.24; int64 is
        # the dtype torch needs for index tensors anyway.
        action = torch.tensor(np.array(actions).astype(np.int64))
        reward = torch.tensor(np.array(rewards).astype(np.float32))
        next_state = torch.tensor(np.stack(next_states))
        done = torch.tensor(np.array(dones).astype(np.int32))
        return state, action, reward, next_state, done


class QNet(nn.Module):
    """Three-layer MLP mapping a batch of 4-feature states to Q-values.

    Args:
        action_size: Number of discrete actions (width of the output layer).
    """

    def __init__(self, action_size):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, 128)
        self.l3 = nn.Linear(128, action_size)

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = self.l3(x)
        return x


class DQNAgent:
    """DQN agent with experience replay and a separate target network."""

    def __init__(self):
        self.gamma = 0.98         # discount factor
        self.lr = 0.0005          # Adam learning rate
        self.epsilon = 0.1        # exploration rate of the epsilon-greedy policy
        self.buffer_size = 10000
        self.batch_size = 32
        self.action_size = 2

        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.qnet = QNet(self.action_size)
        self.qnet_target = QNet(self.action_size)
        # Only the online network is optimised; the target network changes
        # exclusively through ``sync_qnet``.
        self.optimizer = optim.Adam(self.qnet.parameters(), lr=self.lr)

    def get_action(self, state):
        """Pick an action epsilon-greedily for a single 1-D NumPy observation."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        state = torch.tensor(state[np.newaxis, :])  # add batch axis
        qs = self.qnet(state)
        return qs.argmax().item()

    def update(self, state, action, reward, next_state, done):
        """Store a transition and, once enough are stored, do one training step.

        No learning happens until the buffer holds at least ``batch_size``
        transitions.
        """
        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) < self.batch_size:
            return

        state, action, reward, next_state, done = self.replay_buffer.get_batch()
        qs = self.qnet(state)
        q = qs[torch.arange(len(action)), action]  # Q(s, a) of the taken actions

        # The bootstrap value is a constant target. The previous code called
        # ``next_q.detach()`` and dropped the result (detach is not in-place),
        # so every step back-propagated into the target network and kept
        # accumulating ``.grad`` on its parameters.
        with torch.no_grad():
            next_q = self.qnet_target(next_state).max(1)[0]
        target = reward + (1 - done) * self.gamma * next_q

        loss_fn = nn.MSELoss()
        loss = loss_fn(q, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def sync_qnet(self):
        """Copy the online network's weights into the target network."""
        self.qnet_target.load_state_dict(self.qnet.state_dict())
class Policy(nn.Module):
    """Two-layer MLP mapping a batch of 4-feature states to action probabilities.

    Args:
        action_size: Number of discrete actions (width of the output layer).
    """

    def __init__(self, action_size):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, action_size)

    def forward(self, x):
        """Return softmax probabilities over actions for each row of ``x``."""
        x = F.relu(self.l1(x))
        # dim=1 normalises across actions, so ``x`` must carry a batch axis.
        x = F.softmax(self.l2(x), dim=1)
        return x


class Agent:
    """REINFORCE agent: each step is weighted by its own discounted return."""

    def __init__(self):
        self.gamma = 0.98   # discount factor
        self.lr = 0.0002    # Adam learning rate
        self.action_size = 2

        self.memory = []    # (reward, prob) pairs of the current episode
        self.pi = Policy(self.action_size)
        self.optimizer = optim.Adam(self.pi.parameters(), lr=self.lr)

    def get_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: 1-D NumPy array holding a single observation.

        Returns:
            ``(action, prob)`` where ``action`` is a Python int and ``prob`` is
            the probability of that action, still attached to the policy graph.
        """
        state = torch.tensor(state[np.newaxis, :])  # add batch axis
        probs = self.pi(state)[0]
        action = Categorical(probs).sample().item()
        return action, probs[action]

    def add(self, reward, prob):
        """Record the reward and action probability of one step."""
        self.memory.append((reward, prob))

    def update(self):
        """Apply one policy-gradient step for the recorded episode, then clear it.

        Does nothing when no steps were recorded; previously that case crashed
        because the loss was still the integer ``0``.
        """
        if not self.memory:
            return

        G, loss = 0, 0
        # Walk backwards so G is the discounted return from each step onward.
        for reward, prob in reversed(self.memory):
            G = reward + self.gamma * G
            loss += -torch.log(prob) * G

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.memory = []
class Policy(nn.Module):
    """Two-layer MLP mapping a batch of 4-feature states to action probabilities.

    Args:
        action_size: Number of discrete actions (width of the output layer).
    """

    def __init__(self, action_size):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, action_size)

    def forward(self, x):
        """Return softmax probabilities over actions for each row of ``x``."""
        x = F.relu(self.l1(x))
        # dim=1 normalises across actions, so ``x`` must carry a batch axis.
        x = F.softmax(self.l2(x), dim=1)
        return x


class Agent:
    """Simplest policy-gradient agent: every step is weighted by the same
    whole-episode discounted return."""

    def __init__(self):
        self.gamma = 0.98   # discount factor
        self.lr = 0.0002    # Adam learning rate
        self.action_size = 2

        self.memory = []    # (reward, prob) pairs of the current episode
        self.pi = Policy(self.action_size)
        self.optimizer = optim.Adam(self.pi.parameters(), lr=self.lr)

    def get_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: 1-D NumPy array holding a single observation.

        Returns:
            ``(action, prob)`` where ``action`` is a Python int and ``prob`` is
            the probability of that action, still attached to the policy graph.
        """
        state = torch.tensor(state[np.newaxis, :])  # add batch axis
        probs = self.pi(state)[0]
        action = Categorical(probs).sample().item()
        return action, probs[action]

    def add(self, reward, prob):
        """Record the reward and action probability of one step."""
        self.memory.append((reward, prob))

    def update(self):
        """Apply one policy-gradient step for the recorded episode, then clear it.

        Does nothing when no steps were recorded; previously that case crashed
        because the loss was still the integer ``0``.
        """
        if not self.memory:
            return

        # Discounted return of the whole episode, measured from its first step.
        G = 0
        for reward, _ in reversed(self.memory):
            G = reward + self.gamma * G

        # Unlike REINFORCE, the same G weights the log-probability of every step.
        loss = 0
        for _, prob in self.memory:
            loss += -torch.log(prob) * G

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.memory = []
reward : {:.1f}".format(episode, total_reward)) -------------------------------------------------------------------------------- /series overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WegraLee/deep-learning-from-scratch-4/b82cd6432b4e63ce6a4ab2b925fc74a1227fb06a/series overview.png --------------------------------------------------------------------------------