首付(.*?)税费(.*?)万\(仅供参考\)\s+.*?   # 首付,税费 (down payment, taxes)
36 | .*?
37 | .*?
38 | (.*?)      # 户型 (layout)
39 | (.*?)      # 楼层 (floor)
40 | .*?
41 | .*?
42 | (.*?)      # 朝向 (orientation)
43 | (.*?)      # 装修 (decoration)
44 | .*?
45 | .*?
46 | (.*?)      # 面积 (area)
47 | (.*?)      # 建设时间 (year built / building type)
48 | .*?
49 | .*?
50 | .*?
51 | .*?        # 小区链接,小区名 (community link, community name)
52 | .*?        # 小区所在区域 (district): 昌平,北七家,五环到六环
53 | 看房时间(.*?)      # 看房时间 (viewing time)
54 | 链家编号(.*?).*?   # 链家编号 (Lianjia listing id)
55 | .*?
56 | ''',re.X|re.S)
57 | pattern_tag = re.compile(r'''(.*?)''',re.X|re.S)
58 | pattern_para = re.compile(r'''(.*?)''',re.X|re.S)
59 | # strip special markup
60 | #p_single = re.compile(r'<img.*?>') # strip image tags
61 | #p_pair = re.compile(r'<(.*?)\s?.*?>(.*?)</\1>') # strip paired tags (external links)
62 | p_html = re.compile(r'(<[^>]+>)|(&nbsp;)',re.S)
63 |
64 | # 抓取资讯首页
65 | # http://jiqizhixin.com/edge/p/1
66 | #["燕城苑南北两居,业主诚心出售,看房方便。", "南北通透两居,视野好,集中供暖!", "35", "2", "345", "40464", "121万 ", "13.8", "2室2厅", "高楼层/共6层", "南 北", "平层/简装", "85.26平米", "1995年建/板楼", "/xiaoqu/1111027381547/", "燕城苑北区", "/ershoufang/changping/", "昌平", "/ershoufang/beiqijia/", "北七家", " 五至六环", "有租户需要预约", "101100960378"]
67 | output_format = ['房源','备注','关注人数','看过人数','总价','均价','首付','税费','户型','楼层','朝向','装修','面积','年代','小区链接','小区名称','区链接','区名','镇链接','镇名','街道','看房','编号']
68 | print('\t'.join(output_format))
69 | page_list = ['101100791393','101100960378']
70 | #for page in page_list:
71 | for line in file('house_id.txt'):
72 | page = line.strip().strip('.html')
73 | #curl_path = 'curl http://bj.lianjia.com/ershoufang/101100960378.html'
74 | curl_path = 'curl http://bj.lianjia.com/ershoufang/%s.html'%(page)
75 | content = subprocess.check_output(curl_path,shell=True); #如果命令执行的返回值不为0,则会抛出CalledProcessError的错误
76 | #print(content)
77 | result = re.findall(pattern_article,content)
78 | output_dict = dict(zip(output_format,result[0]))
79 | #output = re.findall(pattern_article,content.decode('utf8'))
80 | print('\t'.join(result[0]))
81 | #print(json.dumps(result[0],ensure_ascii=False))
82 | #print(json.dumps(output_dict,ensure_ascii=False))
83 |
84 |
85 |
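
The loop above indexes result[0] directly, which raises IndexError whenever a listing page fails to download or its layout changes. A minimal sketch of a guarded version of the same zip-into-dict step; the helper name extract_fields is hypothetical and not part of the script:

# Minimal sketch: pair the captured groups with output_format and skip
# pages where pattern_article does not match. Helper name is hypothetical.
import re

def extract_fields(pattern, field_names, html):
    matches = re.findall(pattern, html)
    if not matches:                      # download failed or page layout changed
        return None
    return dict(zip(field_names, matches[0]))

# usage sketch:
#   info = extract_fields(pattern_article, output_format, content)
#   if info:
#       print('\t'.join(info[k] for k in output_format))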
--------------------------------------------------------------------------------
/python/mysql_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python2.7
2 | # coding=utf-8
3 | # Reference: http://www.cnblogs.com/fnng/p/3565912.html
4 | '''
5 | #mysql初始化-shell
6 | mysql=/usr/local/mysql/bin/mysql
7 | $mysql -uroot -pwqw < init.sql
8 | ------
9 | $mysql -uroot -p123456 < init.sql
20 | #df[... > 117]             # filter rows by a numeric threshold
21 | #df[df.time<'2016-07-20']  # filter rows by date
22 | #new.values.tolist()       # DataFrame -> nested list
23 | #df.sort(columns='time')   # sort (sort_values in newer pandas)
24 |
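
The commented lines above list the pandas operations this test touches; a small self-contained illustration of the same calls. The DataFrame and its column names are invented for the example:

# Self-contained illustration of the filters noted above; data is made up.
import pandas as pd

df = pd.DataFrame({'count': [90, 120, 130],
                   'time': ['2016-07-19', '2016-07-21', '2016-07-18']})
print(df[df['count'] > 117])            # filter by numeric value
print(df[df['time'] < '2016-07-20'])    # filter by (string) date
print(df.values.tolist())               # DataFrame -> list of rows
print(df.sort_values(by='time'))        # sort (df.sort was the older API)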
--------------------------------------------------------------------------------
/python/pylint.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf8
3 | """
4 | test sample. google编码规范:(URL不受80字符限制)
5 | https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/
6 | 2017-11-24
7 | wangqiwen@didichuxing.com
8 | """
9 | #import的包一定要使用;import包分成3部分,依次排序:①系统包②第三方包③自定义包。每部分按照字母顺序排序,一次不能导入多个包
10 | import sys
11 |
12 | class MyClass(object):
13 | """class测试: 类名满足Pascal风格"""
14 | public_name = '-public-' # public
15 | _myname = '-protected' # protected
16 | __private_name = '-private-' # private
17 |
18 | def __init__(self, name="wang"):
19 | self._myname = name
20 | print '我的名字是%s'%(self._myname)
21 |
22 | def say(self):
23 | """打招呼"""
24 | print '你好,我是%s,%s,%s'%(self._myname, self.public_name, self.__private_name)
25 | return 'yes'
26 |
27 | def modify(self, name="-"):
28 | """更改属性值"""
29 | self._myname = name
30 |
31 | def my_fun(value=0, delta=9):
32 | """
33 | 外部函数:名字_连接。多参数时,逗号后面加一个空格
34 | """
35 | res = value + delta
36 | return res
37 |
38 | def main():
39 | """main function"""
40 | # local variables inside main() stay lowercase; only module-level names are expected to be UPPER_CASE constants
41 | value = 3
42 | new = my_fun(value)
43 | v_result = MyClass("wqw")
44 | # protected/private attributes should not be accessed from outside the class: v_result._myname, v_result.__private_name
45 | #超过80字符时,可以用\换行,注:(),[]时可省略\
46 | print >> sys.stdout, 'hello,related values are listed as : %s , %s,I am \
47 | %s,%s ...'%(value, new, v_result.say(), v_result.public_name)
48 | print >> sys.stdout, 'hello,related values are listed as : %s , %s,I am %s,%s ...'%(value, new, v_result.say(), v_result.public_name) # pylint: disable=line-too-long
49 | #参考:How do I disable a Pylint warning?
50 | #https://stackoverflow.com/questions/4341746/how-do-i-disable-a-pylint-warning
51 |
52 | if __name__ == '__main__':
53 | A = 3 # 此处为全局变量,一律大写
54 | main()
55 |
56 | # */* vim: set expandtab ts=4 sw=4 sts=4 tw=400: */
57 |
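
The long-line example above silences one message with a trailing comment; pylint also accepts scoped disables. A short sketch, where invalid-name (C0103) is a real pylint message id and the variable is invented:

# pylint: disable=invalid-name    # placed at module level, applies to the module block
badName = 3                       # would otherwise trigger invalid-name (C0103)
print(badName)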
--------------------------------------------------------------------------------
/python/python-coding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/python/python-coding.png
--------------------------------------------------------------------------------
/python/python入门神图.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/python/python入门神图.jpg
--------------------------------------------------------------------------------
/python/samplt.py:
--------------------------------------------------------------------------------
1 | #https://bitbucket.org/gastlygem/documents/src/e4749bcf2e73/sample.py
2 | # -*- coding: utf-8 -*-
3 | # 给 Python 初学者的超快速脚本解说
4 |
5 | import os
6 |
7 | def main():
8 | print '你好, 世界!'
9 | print "单引号,双引号,其实是一码事"
10 | print '字符串内的引号需被转义(如 O\'Neil)'
11 | print "换个不同的引号就无需转义了(看 O'Neil)"
12 |
13 | print """三引号(亦可以是三个单引号)可以安全地处理单双引号混用,例如:
14 | O'Neil 说: "姚明太瘦。"
15 | 姚明说: "O'Neil 太老。"
16 | 而且还能跨行,跨行后的格式也能被保留。
17 | """
18 |
19 | print '=' * 10
20 | print '这将直接执行', os.getcwd()
21 |
22 | add(5, 10)
23 |
24 | counter = 0
25 | counter += 1
26 |
27 | food = ['苹果', '杏子', '李子', '梨']
28 | for i in food:
29 | print '俺就爱整只: %s' % i
30 |
31 | print '从0数到9'
32 | for i in range(10):
33 | print i
34 |
35 | def add(param1, param2):
36 | """做了点加法.
37 | 喔,其实还胡乱判断了一气。
38 | """
39 | # 这也是一个注释。
40 | res = param1 + param2
41 | print '%s + %s = %s' %(param1, param2, res)
42 |
43 | if res < 50:
44 | print '这个这个'
45 | elif res >= 50 and (param1 == 42 or param2 == 24):
46 | print '那个那个'
47 | else:
48 | print '嗯哼...'
49 |
50 | return res # 注释还可以像这样直接跟在一句代码的后面
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
--------------------------------------------------------------------------------
/python/w2v.py:
--------------------------------------------------------------------------------
1 | # 参考地址:http://www.52nlp.cn/%E4%B8%AD%E8%8B%B1%E6%96%87%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91%E8%AF%AD%E6%96%99%E4%B8%8A%E7%9A%84word2vec%E5%AE%9E%E9%AA%8C
2 | import logging
3 | import os
4 | import time
5 |
6 | import gensim
7 | from gensim.models import word2vec
8 | import jieba
9 | #import nltk
10 | import json
11 |
12 | #a=jieba.cut(str,cut_all=False)
13 | #print '/'.join(a)
14 |
15 | logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)
16 | start1 = time.clock()
17 | input_file_name = u'E:/百度云/IT技术_new/编程语言/python/demo/word/result.txt' # 原始文件Unicode编码
18 | input_file_f = open(input_file_name,'r')
19 | #contents = input_file_f.read() # 整个文件读到一个变量里
20 | print '读取文件耗时:',time.clock()
21 | #sentences = [i.strip().split(" ") for i in contents[:10]]
22 | sentences = []
23 | print '转换后:\n','|'.join(['&'.join(i) for i in sentences])
24 | # 开始逐行处理
25 | for line in input_file_f.readlines():
26 | #按行读取
27 | sentences.append(line.strip().split(" "))
28 | #print '行数:%s,内容:\n'%(len(sentences)),json.dumps(sentences,ensure_ascii=False)
29 | #sentences是句子序列,句子又是单词列表,比如,sentences = [['first', 'sentence'], ['second', 'sentence']]
30 | model = word2vec.Word2Vec(sentences,min_count=2,size=200) #min_count表示小于该数的单词会被剔除,默认值为5;size表示神经网络的隐藏层单元数,默认为100
31 | #保存生成的训练模型
32 | output_model = u'E:/百度云/IT技术_new/编程语言/python/demo/word/model'
33 | model.save(output_model)#加载模型文件new_model = gensim.models.Word2Vec.load('model/mymodel4')
34 | #=================
35 | #加载模型文件
36 | new_model = gensim.models.Word2Vec.load(output_model)
37 | dir(new_model) # 多种函数方法,
38 | print new_model.vector_size # 词向量维度
39 | print ','.join(new_model.index2word) # index2word保存单词
40 | # 计算指定词的所以相似词
41 | test_word = '经理'
42 | similar_word_list = new_model.most_similar(test_word)
43 | print json.dumps(similar_word_list,ensure_ascii=False)
44 | #print json.dumps(similar_word_list,ensure_ascii=False,indent=4)
45 | # 抽取北京的搜索session:select query_list from user_satisfy_query where dt=20160918 and province rlike '^010' and count > 1;
46 | #print json.dumps(new_model.most_similar(u'天安门'),ensure_ascii=False)
47 | #In [76]: print json.dumps(new_model.most_similar(u'旅店'),ensure_ascii=False)
48 | #[["莫泰", 0.8472937345504761], ["易佰", 0.8139138221740723], ["168", 0.7009128928184509], ["连锁", 0.6979336738586426], ["旅馆", 0.6874777674674988], ["旺子成", 0.6520262360572815], ["快捷", 0.6426747441291809], ["家庭旅馆", 0.6317397356033325], ["人在旅途", 0.6164605021476746], ["寺易佰", 0.6112728714942932]]
49 | #In [77]: print json.dumps(new_model.most_similar(u'菜馆'),ensure_ascii=False)
50 | #[["家常菜", 0.8295753598213196], ["风味", 0.8144116401672363], ["正宗", 0.8008058071136475], ["菜", 0.787124514579773], ["饺子馆", 0.7830443382263184], ["刀削面", 0.7752013802528381], ["特色", 0.7629570364952087], ["面馆", 0.7591361403465271], ["面", 0.7421250939369202], ["农家菜", 0.7410575747489929]]
51 | #In [158]: print json.dumps(new_model.most_similar(u'软件园'),ensure_ascii=False)
52 | #[["用友", 0.7017531991004944], ["金蝶", 0.6142528057098389], ["孵化器", 0.5947192907333374], ["网易", 0.5910834074020386], ["f11", 0.584527850151062], ["软件", 0.5816747546195984], ["租贷", 0.5489269495010376], ["卵", 0.5268262624740601], ["鲜花网", 0.5116425156593323], ["广联达", 0.507921576499939]]
53 | #In [171]: print json.dumps(new_model.most_similar(u'美食'),ensure_ascii=False)
54 | #[["中餐", 0.8337364196777344], ["川菜", 0.7456749677658081], ["快餐", 0.7315336465835571], ["西餐", 0.6596412658691406], ["自助餐", 0.6401817202568054], ["老姬", 0.6020432710647583], ["日本料理", 0.5849108099937439], ["合利屋", 0.5827316045761108], ["nokia", 0.5804284811019897], ["早点", 0.5785887241363525]]
55 | #In [176]: print json.dumps(new_model.most_similar(u'麦当劳'),ensure_ascii=False)
56 | #[["肯德基", 0.857654869556427], ["肯德鸡", 0.6457746028900146], ["KFC", 0.6434839963912964], ["kfc", 0.6308714151382446], ["街鼎", 0.6141167283058167], ["FSDT", 0.589178204536438], ["康得基", 0.5770742893218994], ["得来", 0.5747169852256775], ["十佛营", 0.5702893137931824], ["必胜客", 0.5698955655097961]]
57 | print '(1)找某个词的相似词汇如下:\n词汇\t相似度\n','\n'.join(['%s\t%s'%(i[0],i[1]) for i in similar_word_list])
58 | # 计算任意两个词的相似度
59 | word_1 = '经理';word_2 = '数据'
60 | print '(2)任意两个词汇的相似度(%s与%s)'%(word_1,word_2),new_model.similarity(word_1,word_2)
61 | word_set_1 = ['经理','效率'];word_set_2 = ['数据','流程','重复']
62 | print '(3)两个数据集间的余弦距离(%s)与(%s):'%(json.dumps(word_set_1,ensure_ascii=False),json.dumps(word_set_2,ensure_ascii=False)),new_model.n_similarity(word_set_1, word_set_2)
63 | print '(4)找集合中不同的一项:(%s)'%(json.dumps(word_set_2,ensure_ascii=False)),new_model.doesnt_match(word_set_2)
64 | # 独特的组合加减法
65 | print json.dumps(new_model.most_similar(positive=[u'麦当劳'],negative=[u'肯德基',u'真功夫']),ensure_ascii=False)
66 |
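
Because the script above depends on a local corpus path, here is a minimal self-contained sketch of the same gensim calls on a tiny invented corpus, using the pre-4.0 gensim interface assumed above (Word2Vec takes size= and exposes most_similar directly):

# Minimal word2vec sketch on a toy in-memory corpus; sentences are invented.
from gensim.models import word2vec

sentences = [['经理', '审批', '流程'],
             ['数据', '流程', '重复'],
             ['经理', '查看', '数据']] * 20   # repeat so every word passes min_count
model = word2vec.Word2Vec(sentences, min_count=2, size=50)
model.save('toy.model')                       # hypothetical path, for illustration
loaded = word2vec.Word2Vec.load('toy.model')
print(loaded.most_similar('经理'))            # nearest neighbours of one word
print(loaded.similarity('经理', '数据'))       # cosine similarity of two words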
--------------------------------------------------------------------------------
/qr_code.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/qr_code.gif
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/RL_cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/rl/Reinforcement-learning-with-tensorflow/RL_cover.jpg
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/11_Dyna_Q/RL_brain.py:
--------------------------------------------------------------------------------
1 | """
2 | This part of code is the Dyna-Q learning brain, which is a brain of the agent.
3 | All decisions and learning processes are made in here.
4 |
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 |
12 | class QLearningTable:
13 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
14 | self.actions = actions # a list
15 | self.lr = learning_rate
16 | self.gamma = reward_decay
17 | self.epsilon = e_greedy
18 | self.q_table = pd.DataFrame(columns=self.actions)
19 |
20 | def choose_action(self, observation):
21 | self.check_state_exist(observation)
22 | # action selection
23 | if np.random.uniform() < self.epsilon:
24 | # choose best action
25 | state_action = self.q_table.ix[observation, :]
26 | state_action = state_action.reindex(np.random.permutation(state_action.index)) # some actions have same value
27 | action = state_action.argmax()
28 | else:
29 | # choose random action
30 | action = np.random.choice(self.actions)
31 | return action
32 |
33 | def learn(self, s, a, r, s_):
34 | self.check_state_exist(s_)
35 | q_predict = self.q_table.ix[s, a]
36 | if s_ != 'terminal':
37 | q_target = r + self.gamma * self.q_table.ix[s_, :].max() # next state is not terminal
38 | else:
39 | q_target = r # next state is terminal
40 | self.q_table.ix[s, a] += self.lr * (q_target - q_predict) # update
41 |
42 | def check_state_exist(self, state):
43 | if state not in self.q_table.index:
44 | # append new state to q table
45 | self.q_table = self.q_table.append(
46 | pd.Series(
47 | [0]*len(self.actions),
48 | index=self.q_table.columns,
49 | name=state,
50 | )
51 | )
52 |
53 |
54 | class EnvModel:
55 | """Similar to the memory buffer in DQN, you can store past experiences in here.
56 | Alternatively, the model can generate next state and reward signal accurately."""
57 | def __init__(self, actions):
58 | # the simplest case is to think about the model is a memory which has all past transition information
59 | self.actions = actions
60 | self.database = pd.DataFrame(columns=actions, dtype=np.object)
61 |
62 | def store_transition(self, s, a, r, s_):
63 | if s not in self.database.index:
64 | self.database = self.database.append(
65 | pd.Series(
66 | [None] * len(self.actions),
67 | index=self.database.columns,
68 | name=s,
69 | ))
70 | self.database.set_value(s, a, (r, s_))
71 |
72 | def sample_s_a(self):
73 | s = np.random.choice(self.database.index)
74 | a = np.random.choice(self.database.ix[s].dropna().index) # filter out the None value
75 | return s, a
76 |
77 | def get_r_s_(self, s, a):
78 | r, s_ = self.database.ix[s, a]
79 | return r, s_
80 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/11_Dyna_Q/maze_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the environment part of this example. The RL is in RL_brain.py.
10 |
11 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
12 | """
13 |
14 |
15 | import numpy as np
16 | np.random.seed(1)
17 | import tkinter as tk
18 | import time
19 |
20 |
21 | UNIT = 40 # pixels
22 | MAZE_H = 4 # grid height
23 | MAZE_W = 4 # grid width
24 |
25 |
26 | class Maze(tk.Tk, object):
27 | def __init__(self):
28 | super(Maze, self).__init__()
29 | self.action_space = ['u', 'd', 'l', 'r']
30 | self.n_actions = len(self.action_space)
31 | self.title('maze')
32 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
33 | self._build_maze()
34 |
35 | def _build_maze(self):
36 | self.canvas = tk.Canvas(self, bg='white',
37 | height=MAZE_H * UNIT,
38 | width=MAZE_W * UNIT)
39 |
40 | # create grids
41 | for c in range(0, MAZE_W * UNIT, UNIT):
42 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
43 | self.canvas.create_line(x0, y0, x1, y1)
44 | for r in range(0, MAZE_H * UNIT, UNIT):
45 | x0, y0, x1, y1 = 0, r, MAZE_H * UNIT, r
46 | self.canvas.create_line(x0, y0, x1, y1)
47 |
48 | # create origin
49 | origin = np.array([20, 20])
50 |
51 | # hell
52 | hell1_center = origin + np.array([UNIT * 2, UNIT])
53 | self.hell1 = self.canvas.create_rectangle(
54 | hell1_center[0] - 15, hell1_center[1] - 15,
55 | hell1_center[0] + 15, hell1_center[1] + 15,
56 | fill='black')
57 | # hell
58 | hell2_center = origin + np.array([UNIT, UNIT * 2])
59 | self.hell2 = self.canvas.create_rectangle(
60 | hell2_center[0] - 15, hell2_center[1] - 15,
61 | hell2_center[0] + 15, hell2_center[1] + 15,
62 | fill='black')
63 |
64 | # create oval
65 | oval_center = origin + UNIT * 2
66 | self.oval = self.canvas.create_oval(
67 | oval_center[0] - 15, oval_center[1] - 15,
68 | oval_center[0] + 15, oval_center[1] + 15,
69 | fill='yellow')
70 |
71 | # create red rect
72 | self.rect = self.canvas.create_rectangle(
73 | origin[0] - 15, origin[1] - 15,
74 | origin[0] + 15, origin[1] + 15,
75 | fill='red')
76 |
77 | # pack all
78 | self.canvas.pack()
79 |
80 | def reset(self):
81 | self.update()
82 | time.sleep(0.5)
83 | self.canvas.delete(self.rect)
84 | origin = np.array([20, 20])
85 | self.rect = self.canvas.create_rectangle(
86 | origin[0] - 15, origin[1] - 15,
87 | origin[0] + 15, origin[1] + 15,
88 | fill='red')
89 | # return observation
90 | return self.canvas.coords(self.rect)
91 |
92 | def step(self, action):
93 | s = self.canvas.coords(self.rect)
94 | base_action = np.array([0, 0])
95 | if action == 0: # up
96 | if s[1] > UNIT:
97 | base_action[1] -= UNIT
98 | elif action == 1: # down
99 | if s[1] < (MAZE_H - 1) * UNIT:
100 | base_action[1] += UNIT
101 | elif action == 2: # right
102 | if s[0] < (MAZE_W - 1) * UNIT:
103 | base_action[0] += UNIT
104 | elif action == 3: # left
105 | if s[0] > UNIT:
106 | base_action[0] -= UNIT
107 |
108 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
109 |
110 | s_ = self.canvas.coords(self.rect) # next state
111 |
112 | # reward function
113 | if s_ == self.canvas.coords(self.oval):
114 | reward = 1
115 | done = True
116 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
117 | reward = -1
118 | done = True
119 | else:
120 | reward = 0
121 | done = False
122 |
123 | return s_, reward, done
124 |
125 | def render(self):
126 | # time.sleep(0.1)
127 | self.update()
128 |
129 |
130 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/11_Dyna_Q/run_this.py:
--------------------------------------------------------------------------------
1 | """
2 | Simplest model-based RL, Dyna-Q.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the main part which controls the update method of this example.
10 | The RL is in RL_brain.py.
11 |
12 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
13 | """
14 |
15 | from maze_env import Maze
16 | from RL_brain import QLearningTable, EnvModel
17 |
18 |
19 | def update():
20 | for episode in range(40):
21 | s = env.reset()
22 | while True:
23 | env.render()
24 | a = RL.choose_action(str(s))
25 | s_, r, done = env.step(a)
26 | RL.learn(str(s), a, r, str(s_))
27 |
28 | # use a model to output (r, s_) by inputting (s, a)
29 | # the model in dyna Q version is just like a memory replay buffer
30 | env_model.store_transition(str(s), a, r, s_)
31 | for n in range(10): # learn 10 more times using the env_model
32 | ms, ma = env_model.sample_s_a() # ms in here is a str
33 | mr, ms_ = env_model.get_r_s_(ms, ma)
34 | RL.learn(ms, ma, mr, str(ms_))
35 |
36 | s = s_
37 | if done:
38 | break
39 |
40 | # end of game
41 | print('game over')
42 | env.destroy()
43 |
44 |
45 | if __name__ == "__main__":
46 | env = Maze()
47 | RL = QLearningTable(actions=list(range(env.n_actions)))
48 | env_model = EnvModel(actions=list(range(env.n_actions)))
49 |
50 | env.after(0, update)
51 | env.mainloop()
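
The comments in update() describe the Dyna-Q idea: each real step feeds one direct Q-learning update plus n planning updates sampled from the learned model. A stripped-down tabular sketch of that loop on an invented 5-state chain; everything here (environment, constants, names) is illustrative and not taken from the repo:

# Stripped-down Dyna-Q on a toy chain: move right to reach the goal state.
import numpy as np

N_STATES = 5
Q = np.zeros((N_STATES, 2))              # actions: 0 = left, 1 = right
model = {}                               # (s, a) -> (r, s_), the "EnvModel"
alpha, gamma, eps, n_planning = 0.1, 0.9, 0.1, 10
rng = np.random.default_rng(0)

def step(s, a):
    s_ = min(s + 1, N_STATES - 1) if a == 1 else max(s - 1, 0)
    return (1.0 if s_ == N_STATES - 1 else 0.0), s_

for episode in range(30):
    s = 0
    while s != N_STATES - 1:
        a = rng.integers(2) if rng.random() < eps else int(Q[s].argmax())
        r, s_ = step(s, a)
        Q[s, a] += alpha * (r + gamma * Q[s_].max() - Q[s, a])   # direct RL
        model[(s, a)] = (r, s_)                                  # model learning
        for _ in range(n_planning):                              # planning from the model
            (ps, pa), (pr, ps_) = list(model.items())[rng.integers(len(model))]
            Q[ps, pa] += alpha * (pr + gamma * Q[ps_].max() - Q[ps, pa])
        s = s_

print(np.round(Q, 2))    # the "right" column should dominate in every state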
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/12_Proximal_Policy_Optimization/simply_PPO.py:
--------------------------------------------------------------------------------
1 | """
2 | A simple version of Proximal Policy Optimization (PPO) using single thread.
3 |
4 | Based on:
5 | 1. Emergence of Locomotion Behaviours in Rich Environments (Google Deepmind): [https://arxiv.org/abs/1707.02286]
6 | 2. Proximal Policy Optimization Algorithms (OpenAI): [https://arxiv.org/abs/1707.06347]
7 |
8 | View more on my tutorial website: https://morvanzhou.github.io/tutorials
9 |
10 | Dependencies:
11 | tensorflow r1.2
12 | gym 0.9.2
13 | """
14 |
15 | import tensorflow as tf
16 | import numpy as np
17 | import matplotlib.pyplot as plt
18 | import gym
19 |
20 | EP_MAX = 1000
21 | EP_LEN = 200
22 | GAMMA = 0.9
23 | A_LR = 0.0001
24 | C_LR = 0.0002
25 | BATCH = 32
26 | A_UPDATE_STEPS = 10
27 | C_UPDATE_STEPS = 10
28 | S_DIM, A_DIM = 3, 1
29 | METHOD = [
30 | dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
31 | dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
32 | ][1] # choose the method for optimization
33 |
34 |
35 | class PPO(object):
36 |
37 | def __init__(self):
38 | self.sess = tf.Session()
39 | self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
40 |
41 | # critic
42 | with tf.variable_scope('critic'):
43 | l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
44 | self.v = tf.layers.dense(l1, 1)
45 | self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
46 | self.advantage = self.tfdc_r - self.v
47 | self.closs = tf.reduce_mean(tf.square(self.advantage))
48 | self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)
49 |
50 | # actor
51 | pi, pi_params = self._build_anet('pi', trainable=True)
52 | oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
53 | with tf.variable_scope('sample_action'):
54 | self.sample_op = tf.squeeze(pi.sample(1), axis=0) # choosing action
55 | with tf.variable_scope('update_oldpi'):
56 | self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
57 |
58 | self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
59 | self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
60 | with tf.variable_scope('loss'):
61 | with tf.variable_scope('surrogate'):
62 | # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
63 | ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
64 | surr = ratio * self.tfadv
65 | if METHOD['name'] == 'kl_pen':
66 | self.tflam = tf.placeholder(tf.float32, None, 'lambda')
67 | kl = tf.distributions.kl_divergence(oldpi, pi)
68 | self.kl_mean = tf.reduce_mean(kl)
69 | self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
70 | else: # clipping method, find this is better
71 | self.aloss = -tf.reduce_mean(tf.minimum(
72 | surr,
73 | tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
74 |
75 | with tf.variable_scope('atrain'):
76 | self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
77 |
78 | tf.summary.FileWriter("log/", self.sess.graph)
79 |
80 | self.sess.run(tf.global_variables_initializer())
81 |
82 | def update(self, s, a, r):
83 | self.sess.run(self.update_oldpi_op)
84 | adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
85 | # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
86 |
87 | # update actor
88 | if METHOD['name'] == 'kl_pen':
89 | for _ in range(A_UPDATE_STEPS):
90 | _, kl = self.sess.run(
91 | [self.atrain_op, self.kl_mean],
92 | {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
93 | if kl > 4*METHOD['kl_target']: # this is in Google's paper
94 | break
95 | if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
96 | METHOD['lam'] /= 2
97 | elif kl > METHOD['kl_target'] * 1.5:
98 | METHOD['lam'] *= 2
99 | METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is my solution
100 | else: # clipping method, find this is better (OpenAI's paper)
101 | [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]
102 |
103 | # update critic
104 | [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]
105 |
106 | def _build_anet(self, name, trainable):
107 | with tf.variable_scope(name):
108 | l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
109 | mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
110 | sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
111 | norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
112 | params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
113 | return norm_dist, params
114 |
115 | def choose_action(self, s):
116 | s = s[np.newaxis, :]
117 | a = self.sess.run(self.sample_op, {self.tfs: s})[0]
118 | return np.clip(a, -2, 2)
119 |
120 | def get_v(self, s):
121 | if s.ndim < 2: s = s[np.newaxis, :]
122 | return self.sess.run(self.v, {self.tfs: s})[0, 0]
123 |
124 | env = gym.make('Pendulum-v0').unwrapped
125 | ppo = PPO()
126 | all_ep_r = []
127 |
128 | for ep in range(EP_MAX):
129 | s = env.reset()
130 | buffer_s, buffer_a, buffer_r = [], [], []
131 | ep_r = 0
132 | for t in range(EP_LEN): # in one episode
133 | env.render()
134 | a = ppo.choose_action(s)
135 | s_, r, done, _ = env.step(a)
136 | buffer_s.append(s)
137 | buffer_a.append(a)
138 | buffer_r.append((r+8)/8) # normalize reward, find to be useful
139 | s = s_
140 | ep_r += r
141 |
142 | # update ppo
143 | if (t+1) % BATCH == 0 or t == EP_LEN-1:
144 | v_s_ = ppo.get_v(s_)
145 | discounted_r = []
146 | for r in buffer_r[::-1]:
147 | v_s_ = r + GAMMA * v_s_
148 | discounted_r.append(v_s_)
149 | discounted_r.reverse()
150 |
151 | bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
152 | buffer_s, buffer_a, buffer_r = [], [], []
153 | ppo.update(bs, ba, br)
154 | if ep == 0: all_ep_r.append(ep_r)
155 | else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
156 | print(
157 | 'Ep: %i' % ep,
158 | "|Ep_r: %i" % ep_r,
159 | ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '',
160 | )
161 |
162 | plt.plot(np.arange(len(all_ep_r)), all_ep_r)
163 | plt.xlabel('Episode');plt.ylabel('Moving averaged episode reward');plt.show()
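
METHOD above selects between the KL-penalty and clipped-surrogate objectives; a tiny numpy illustration of what the clipping in the 'clip' branch does to the per-sample surrogate. The ratios and advantages are invented just to show the effect:

# Clipped surrogate: L = -mean(min(ratio*adv, clip(ratio, 1-eps, 1+eps)*adv))
import numpy as np

eps = 0.2
ratio = np.array([0.5, 0.9, 1.0, 1.3, 2.0])   # pi(a|s) / pi_old(a|s)
adv = np.array([1.0, 1.0, -1.0, 1.0, 1.0])    # advantage estimates
surr = ratio * adv
clipped = np.clip(ratio, 1 - eps, 1 + eps) * adv
loss = -np.mean(np.minimum(surr, clipped))
print(np.minimum(surr, clipped))   # gains from ratios beyond 1+eps are capped
print(loss)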
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/1_command_line_reinforcement_learning/draw.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import matplotlib.animation as animation
4 |
5 |
6 | def update_line(num, data, line):
7 | line.set_data(data[..., :num])
8 | return line,
9 |
10 | fig = plt.figure()
11 | ax = fig.add_subplot(111)
12 |
13 |
14 | def update(icon_list):
15 | #icon_list = ['-', 'o', '-', '-', 'T', '-']
16 | print(icon_list)
17 | # plt.clf()
18 | list_len = len(icon_list)
19 | delta = 0.9/list_len
20 | color_dict = {'-':'g', 'o':'b', 'T':'r'}
21 | start_point = (0.05, 0.05)
22 | plt.text(0.5, 0.6, 'Episode 5', horizontalalignment='center', fontsize=12,
23 | verticalalignment='center', transform=ax.transAxes)
24 | for idx, item in enumerate(icon_list):
25 | color_value = color_dict[item]
26 | end_point = (start_point[0]+idx*delta, 0.3)
27 | ax.add_patch(plt.Rectangle((end_point[0], end_point[1]+delta), delta, delta*0.5,
28 | linestyle='--', edgecolor='b', linewidth=1, alpha=0.5))
29 | plt.text(end_point[0]+0.5*delta, end_point[1]+1.25*delta, idx, horizontalalignment='center', fontsize=12,
30 | verticalalignment='center', transform=ax.transAxes)
31 | ax.add_patch(plt.Rectangle(end_point, delta, delta,
32 | color='%s'%(color_value), linestyle='--', edgecolor='y', linewidth=1, alpha=0.5)) #, fill=None
33 | plt.text(end_point[0]+0.5*delta, end_point[1]+0.5*delta, item, horizontalalignment='center', fontsize=12,
34 | verticalalignment='center', transform=ax.transAxes)
35 | # plt.show()
36 | #ax.add_patch(plt.Rectangle((0.1,0.1),0.3,0.3))
37 | def init():
38 | plt.xlim(0, 1)
39 | plt.ylim(0, 1)
40 | plt.xlabel('x')
41 | plt.title('test')
42 | return l,
43 |
44 | data = [['-', 'o', '-', '-', 'T', '-'],
45 | ['-', '-', 'o', '-', 'T', '-'],
46 | ['-', '-', '-', 'o', 'T', '-']]
47 | # l, = plt.plot([], [], 'r-')
48 | # line_ani = animation.FuncAnimation(fig, update, data, interval=10, blit=True)
49 | for item in data:
50 | update(item)
51 | plt.show()
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/1_command_line_reinforcement_learning/treasure_on_right.py:
--------------------------------------------------------------------------------
1 | """
2 | A simple example for Reinforcement Learning using table lookup Q-learning method.
3 | An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location.
4 | Run this program to see how the agent improves its strategy for finding the treasure.
5 |
6 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
7 | """
8 |
9 | import numpy as np
10 | import pandas as pd
11 | import time
12 |
13 | np.random.seed(2) # reproducible
14 |
15 |
16 | N_STATES = 6 # the length of the 1 dimensional world
17 | ACTIONS = ['left', 'right'] # available actions
18 | EPSILON = 0.9 # greedy police
19 | ALPHA = 0.1 # learning rate
20 | GAMMA = 0.9 # discount factor
21 | MAX_EPISODES = 13 # maximum episodes
22 | FRESH_TIME = 0.3 # fresh time for one move
23 |
24 |
25 | def build_q_table(n_states, actions):
26 | table = pd.DataFrame(
27 | np.zeros((n_states, len(actions))), # q_table initial values
28 | columns=actions, # actions's name
29 | )
30 | # print(table) # show table
31 | return table
32 |
33 |
34 | def choose_action(state, q_table):
35 | # This is how to choose an action
36 | state_actions = q_table.iloc[state, :]
37 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value
38 | action_name = np.random.choice(ACTIONS)
39 | else: # act greedy
40 | action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas
41 | return action_name
42 |
43 |
44 | def get_env_feedback(S, A):
45 | # This is how agent will interact with the environment
46 | if A == 'right': # move right
47 | if S == N_STATES - 2: # terminate
48 | S_ = 'terminal'
49 | R = 1
50 | else:
51 | S_ = S + 1
52 | R = 0
53 | else: # move left
54 | R = 0
55 | if S == 0:
56 | S_ = S # reach the wall
57 | else:
58 | S_ = S - 1
59 | return S_, R
60 |
61 |
62 | def update_env(S, episode, step_counter):
63 | # This is how environment be updated
64 | env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment
65 | if S == 'terminal':
66 | interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
67 | print('\r{}'.format(interaction), end='')
68 | time.sleep(2)
69 | print('\r ', end='')
70 | else:
71 | env_list[S] = 'o'
72 | interaction = ''.join(env_list)
73 | print('\r{}'.format(interaction), end='')
74 | time.sleep(FRESH_TIME)
75 |
76 |
77 | def rl():
78 | # main part of RL loop
79 | q_table = build_q_table(N_STATES, ACTIONS)
80 | for episode in range(MAX_EPISODES):
81 | step_counter = 0
82 | S = 0
83 | is_terminated = False
84 | update_env(S, episode, step_counter)
85 | while not is_terminated:
86 |
87 | A = choose_action(S, q_table)
88 | S_, R = get_env_feedback(S, A) # take action & get next state and reward
89 | q_predict = q_table.loc[S, A]
90 | if S_ != 'terminal':
91 | q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal
92 | else:
93 | q_target = R # next state is terminal
94 | is_terminated = True # terminate this episode
95 |
96 | q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update
97 | S = S_ # move to next state
98 |
99 | update_env(S, episode, step_counter+1)
100 | step_counter += 1
101 | return q_table
102 |
103 |
104 | if __name__ == "__main__":
105 | q_table = rl()
106 | print('\r\nQ-table:\n')
107 | print(q_table)
108 |
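
The core of rl() above is the tabular update q_table.loc[S, A] += ALPHA * (q_target - q_predict). A one-step numeric check with the script's ALPHA and GAMMA; the next-state Q values are invented:

# One-step numeric check of the tabular Q-learning update used in rl().
ALPHA, GAMMA = 0.1, 0.9
q_sa = 0.0                      # current Q(S, A)
r = 0                           # reward for a non-terminal move
next_q = [0.0, 0.5]             # invented Q(S_, left), Q(S_, right)
q_target = r + GAMMA * max(next_q)
q_sa += ALPHA * (q_target - q_sa)
print(q_sa)                     # 0.045: value propagates one step back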
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/1_command_line_reinforcement_learning/treasure_on_right_wqw.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | A simple example for Reinforcement Learning using table lookup Q-learning method.
4 | An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location.
5 | Run this program to see how the agent improves its strategy for finding the treasure.
6 |
7 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
8 | """
9 |
10 | import numpy as np
11 | import pandas as pd
12 | import time
13 |
14 | np.random.seed(2) # reproducible
15 |
16 |
17 | N_STATES = 6 # 状态数目 the length of the 1 dimensional world
18 | ACTIONS = ['left', 'right'] # 可行动作列表 available actions
19 | EPSILON = 0.9 # 贪心因子 greedy police
20 | ALPHA = 0.1 # 学习率 learning rate
21 | GAMMA = 0.9 # 折扣损失 discount factor
22 | MAX_EPISODES = 13 # 最大训练回合 maximum episodes
23 | FRESH_TIME = 0.3 # 每回合休息时间 fresh time for one move
24 |
25 |
26 | import numpy as np
27 | import matplotlib.pyplot as plt
28 | import matplotlib.animation as animation
29 |
30 |
31 | def render():
32 | """
33 | 绘制网格图
34 | """
35 | def update_line(num, data, line):
36 | line.set_data(data[..., :num])
37 | return line,
38 |
39 | fig1 = plt.figure()
40 | data = np.random.rand(2, 25)
41 | l, = plt.plot([], [], 'r-')
42 | plt.xlim(0, 1)
43 | plt.ylim(0, 1)
44 | plt.xlabel('x')
45 | plt.title('test')
46 | line_ani = animation.FuncAnimation(fig1, update_line, 25, fargs=(data, l), interval=50, blit=True)
47 | plt.show()
48 |
49 |
50 | def build_q_table(n_states, actions):
51 | """
52 | 构建Q表, nXa
53 | """
54 | # Q表初始化
55 | table = pd.DataFrame(
56 | np.zeros((n_states, len(actions))), # q_table initial values
57 | columns=actions, # actions's name
58 | )
59 | # print(table) # show table
60 | return table
61 |
62 |
63 | def choose_action(state, q_table):
64 | """
65 | 策略函数:如何选择下一步动作 This is how to choose an action
66 | """
67 | # 获取该状态下所有动作奖励列表
68 | state_actions = q_table.iloc[state, :]
69 | if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
70 | # 随机模式(探索)act non-greedy or state-action have no value
71 | action_name = np.random.choice(ACTIONS)
72 | else: # 贪婪模式(利用)act greedy
73 | action_name = state_actions.idxmax()
74 | # replace argmax to idxmax as argmax means a different function in newer version of pandas
75 | return action_name
76 |
77 |
78 | def get_env_feedback(S, A):
79 | """
80 | agent从环境中获取反馈,S状态下采取A获得的奖励R (S, A) -> R
81 | This is how agent will interact with the environment
82 | """
83 | if A == 'right':
84 | # move right
85 | if S == N_STATES - 2: # terminate
86 | S_ = 'terminal'
87 | R = 1
88 | else:
89 | S_ = S + 1
90 | R = 0
91 | else: # move left
92 | R = 0
93 | if S == 0:
94 | S_ = S # reach the wall
95 | else:
96 | S_ = S - 1
97 | return S_, R
98 |
99 |
100 | def update_env(S, episode, step_counter):
101 | # This is how environment be updated
102 | env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment
103 | if S == 'terminal':
104 | interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
105 | print(' => {}'.format(interaction))
106 | #print('\r{}'.format(interaction), end='')
107 | time.sleep(1)
108 | #print('\r ', end='')
109 | else:
110 | env_list[S] = 'o'
111 | interaction = ''.join(env_list)
112 | print('\r{}'.format(interaction), end='')
113 | time.sleep(FRESH_TIME)
114 |
115 |
116 | def rl():
117 | """ 强化学习程序主体 """
118 | # main part of RL loop
119 | q_table = build_q_table(N_STATES, ACTIONS)
120 | # 最多玩MAX_EPISODE局
121 | for episode in range(MAX_EPISODES):
122 | step_counter = 0
123 | S = 0
124 | is_terminated = False
125 | update_env(S, episode, step_counter)
126 | while not is_terminated:
127 | A = choose_action(S, q_table)
128 | S_, R = get_env_feedback(S, A) # take action & get next state and reward
129 | q_predict = q_table.loc[S, A]
130 | # Q Learning算法
131 | if S_ != 'terminal':
132 | q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal
133 | else:
134 | q_target = R # next state is terminal
135 | is_terminated = True # terminate this episode
136 | # 输出Q表
137 | print('\rQ-table: %s\n' % (q_table))
138 | q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update
139 | S = S_ # move to next state
140 | update_env(S, episode, step_counter+1)
141 | step_counter += 1
142 | return q_table
143 |
144 |
145 | if __name__ == "__main__":
146 | q_table = rl()
147 | print('\r\nQ-table:')
148 | print(q_table)
149 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/2_Q_Learning_maze/RL_brain.py:
--------------------------------------------------------------------------------
1 | """
2 | This part of code is the Q learning brain, which is a brain of the agent.
3 | All decisions are made in here.
4 |
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 |
12 | class QLearningTable:
13 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
14 | self.actions = actions # a list
15 | self.lr = learning_rate
16 | self.gamma = reward_decay
17 | self.epsilon = e_greedy
18 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
19 |
20 | def choose_action(self, observation):
21 | self.check_state_exist(observation)
22 | # action selection
23 | if np.random.uniform() < self.epsilon:
24 | # choose best action
25 | state_action = self.q_table.loc[observation, :]
26 | # some actions may have the same value, randomly choose one of these actions
27 | action = np.random.choice(state_action[state_action == np.max(state_action)].index)
28 | else:
29 | # choose random action
30 | action = np.random.choice(self.actions)
31 | return action
32 |
33 | def learn(self, s, a, r, s_):
34 | self.check_state_exist(s_)
35 | q_predict = self.q_table.loc[s, a]
36 | if s_ != 'terminal':
37 | q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal
38 | else:
39 | q_target = r # next state is terminal
40 | self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
41 |
42 | def check_state_exist(self, state):
43 | if state not in self.q_table.index:
44 | # append new state to q table
45 | self.q_table = self.q_table.append(
46 | pd.Series(
47 | [0]*len(self.actions),
48 | index=self.q_table.columns,
49 | name=state,
50 | )
51 | )
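
choose_action() above breaks ties among equally valued actions at random instead of always taking the first argmax. A tiny illustration of that idiom; the Series values are invented:

# Tie-breaking idiom from choose_action(): sample uniformly among argmax ties.
import numpy as np
import pandas as pd

state_action = pd.Series([0.5, 0.5, 0.1, 0.5], index=['u', 'd', 'l', 'r'])
best = state_action[state_action == np.max(state_action)].index   # ['u', 'd', 'r']
print(np.random.choice(best))   # each tied action is equally likely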
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/2_Q_Learning_maze/maze_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the environment part of this example. The RL is in RL_brain.py.
10 |
11 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
12 | """
13 |
14 | import numpy as np
15 | import time
16 | import sys
17 |
18 | if sys.version_info.major == 2:
19 | import Tkinter as tk
20 | else:
21 | import tkinter as tk
22 |
23 |
24 | UNIT = 100 # pixels 格子大小,初始值为40
25 | HALF_UNIT = UNIT/2 # 格子位置基准
26 | MOVE_HALF = UNIT/2.5 # 运动节点的半径
27 | BOUND = 5 # 边界
28 | MAZE_H = 4 # grid height
29 | MAZE_W = 4 # grid width
30 |
31 |
32 | class Maze(tk.Tk, object):
33 | def __init__(self):
34 | super(Maze, self).__init__()
35 | self.action_space = ['u', 'd', 'l', 'r']
36 | self.n_actions = len(self.action_space)
37 | self.title('maze迷宫问题')
38 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
39 | self._build_maze()
40 |
41 | def _build_maze(self):
42 | self.canvas = tk.Canvas(self, bg='white',
43 | height=MAZE_H * UNIT,
44 | width=MAZE_W * UNIT)
45 | # create grids 画网格
46 | for c in range(0, MAZE_W * UNIT, UNIT):
47 | x0, y0, x1, y1 = c, 0+BOUND, c, MAZE_H * UNIT
48 | self.canvas.create_line(x0, y0, x1, y1)
49 | for r in range(0, MAZE_H * UNIT, UNIT):
50 | x0, y0, x1, y1 = 0+BOUND, r, MAZE_W * UNIT, r
51 | self.canvas.create_line(x0, y0, x1, y1)
52 | # create origin
53 | origin = np.array([HALF_UNIT, HALF_UNIT])
54 | # hell
55 | hell1_center = origin + np.array([UNIT * 2, UNIT])
56 | self.hell1 = self.canvas.create_rectangle(
57 | hell1_center[0] - MOVE_HALF, hell1_center[1] - MOVE_HALF,
58 | hell1_center[0] + MOVE_HALF, hell1_center[1] + MOVE_HALF,
59 | fill='black')
60 | # hell
61 | hell2_center = origin + np.array([UNIT, UNIT * 2])
62 | self.hell2 = self.canvas.create_rectangle(
63 | hell2_center[0] - MOVE_HALF, hell2_center[1] - MOVE_HALF,
64 | hell2_center[0] + MOVE_HALF, hell2_center[1] + MOVE_HALF,
65 | fill='black')
66 | # create oval
67 | oval_center = origin + UNIT * 2
68 | self.oval = self.canvas.create_oval(
69 | oval_center[0] - MOVE_HALF, oval_center[1] - MOVE_HALF,
70 | oval_center[0] + MOVE_HALF, oval_center[1] + MOVE_HALF,
71 | fill='yellow')
72 | # create red rect
73 | self.rect = self.canvas.create_rectangle(
74 | origin[0] - MOVE_HALF, origin[1] - MOVE_HALF,
75 | origin[0] + MOVE_HALF, origin[1] + MOVE_HALF,
76 | fill='red')
77 | # pack all
78 | self.canvas.pack()
79 |
80 | def reset(self):
81 | self.update()
82 | time.sleep(0.5)
83 | self.canvas.delete(self.rect)
84 | origin = np.array([HALF_UNIT, HALF_UNIT])
85 | self.rect = self.canvas.create_rectangle(
86 | origin[0] - MOVE_HALF, origin[1] - MOVE_HALF,
87 | origin[0] + MOVE_HALF, origin[1] + MOVE_HALF,
88 | fill='red')
89 | # return observation
90 | return self.canvas.coords(self.rect)
91 |
92 | def step(self, action):
93 | s = self.canvas.coords(self.rect)
94 | base_action = np.array([0, 0])
95 | if action == 0: # up
96 | if s[1] > UNIT:
97 | base_action[1] -= UNIT
98 | elif action == 1: # down
99 | if s[1] < (MAZE_H - 1) * UNIT:
100 | base_action[1] += UNIT
101 | elif action == 2: # right
102 | if s[0] < (MAZE_W - 1) * UNIT:
103 | base_action[0] += UNIT
104 | elif action == 3: # left
105 | if s[0] > UNIT:
106 | base_action[0] -= UNIT
107 |
108 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
109 |
110 | s_ = self.canvas.coords(self.rect) # next state
111 |
112 | # reward function
113 | if s_ == self.canvas.coords(self.oval):
114 | reward = 1
115 | done = True
116 | s_ = 'terminal'
117 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
118 | reward = -1
119 | done = True
120 | s_ = 'terminal'
121 | else:
122 | reward = 0
123 | done = False
124 |
125 | return s_, reward, done
126 |
127 | def render(self):
128 | time.sleep(0.1)
129 | self.update()
130 |
131 |
132 | def update():
133 | for t in range(10):
134 | s = env.reset()
135 | while True:
136 | env.render()
137 | a = 1
138 | s, r, done = env.step(a)
139 | if done:
140 | break
141 |
142 | if __name__ == '__main__':
143 | env = Maze()
144 | env.after(100, update)
145 | env.mainloop()
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/2_Q_Learning_maze/run_this.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the main part which controls the update method of this example.
10 | The RL is in RL_brain.py.
11 |
12 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
13 | """
14 |
15 | from maze_env import Maze
16 | from RL_brain import QLearningTable
17 |
18 |
19 | def update():
20 | for episode in range(100):
21 | # initial observation
22 | observation = env.reset()
23 |
24 | while True:
25 | # fresh env
26 | env.render()
27 |
28 | # RL choose action based on observation
29 | action = RL.choose_action(str(observation))
30 |
31 | # RL take action and get next observation and reward
32 | observation_, reward, done = env.step(action)
33 |
34 | # RL learn from this transition
35 | RL.learn(str(observation), action, reward, str(observation_))
36 |
37 | # swap observation
38 | observation = observation_
39 |
40 | # break while loop when end of this episode
41 | if done:
42 | break
43 |
44 | # end of game
45 | print('game over')
46 | env.destroy()
47 |
48 | if __name__ == "__main__":
49 | env = Maze()
50 | RL = QLearningTable(actions=list(range(env.n_actions)))
51 |
52 | env.after(100, update)
53 | env.mainloop()
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/3_Sarsa_maze/RL_brain.py:
--------------------------------------------------------------------------------
1 | """
2 | This part of code is the Q learning brain, which is a brain of the agent.
3 | All decisions are made in here.
4 |
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 |
12 | class RL(object):
13 | def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
14 | self.actions = action_space # a list
15 | self.lr = learning_rate
16 | self.gamma = reward_decay
17 | self.epsilon = e_greedy
18 |
19 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
20 |
21 | def check_state_exist(self, state):
22 | if state not in self.q_table.index:
23 | # append new state to q table
24 | self.q_table = self.q_table.append(
25 | pd.Series(
26 | [0]*len(self.actions),
27 | index=self.q_table.columns,
28 | name=state,
29 | )
30 | )
31 |
32 | def choose_action(self, observation):
33 | self.check_state_exist(observation)
34 | # action selection
35 | if np.random.rand() < self.epsilon:
36 | # choose best action
37 | state_action = self.q_table.loc[observation, :]
38 | # some actions may have the same value, randomly choose one of these actions
39 | action = np.random.choice(state_action[state_action == np.max(state_action)].index)
40 | else:
41 | # choose random action
42 | action = np.random.choice(self.actions)
43 | return action
44 |
45 | def learn(self, *args):
46 | pass
47 |
48 |
49 | # off-policy
50 | class QLearningTable(RL):
51 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
52 | super(QLearningTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
53 |
54 | def learn(self, s, a, r, s_):
55 | self.check_state_exist(s_)
56 | q_predict = self.q_table.loc[s, a]
57 | if s_ != 'terminal':
58 | q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal
59 | else:
60 | q_target = r # next state is terminal
61 | self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
62 |
63 |
64 | # on-policy
65 | class SarsaTable(RL):
66 |
67 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
68 | super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
69 |
70 | def learn(self, s, a, r, s_, a_):
71 | self.check_state_exist(s_)
72 | q_predict = self.q_table.loc[s, a]
73 | if s_ != 'terminal':
74 | q_target = r + self.gamma * self.q_table.loc[s_, a_] # next state is not terminal
75 | else:
76 | q_target = r # next state is terminal
77 | self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
78 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/3_Sarsa_maze/maze_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the environment part of this example. The RL is in RL_brain.py.
10 |
11 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
12 | """
13 |
14 | import numpy as np
15 | import time
16 | import sys
17 |
18 | if sys.version_info.major == 2:
19 | import Tkinter as tk
20 | else:
21 | import tkinter as tk
22 |
23 |
24 | UNIT = 100 # pixels 格子大小,初始值为40
25 | HALF_UNIT = UNIT/2 # 格子位置基准
26 | MOVE_HALF = UNIT/2.5 # 运动节点的半径
27 | BOUND = 5 # 边界
28 | MAZE_H = 4 # grid height
29 | MAZE_W = 4 # grid width
30 |
31 |
32 | class Maze(tk.Tk, object):
33 | def __init__(self):
34 | super(Maze, self).__init__()
35 | self.action_space = ['u', 'd', 'l', 'r']
36 | self.n_actions = len(self.action_space)
37 | self.title('maze迷宫问题')
38 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
39 | self._build_maze()
40 |
41 | def _build_maze(self):
42 | self.canvas = tk.Canvas(self, bg='white',
43 | height=MAZE_H * UNIT,
44 | width=MAZE_W * UNIT)
45 | # create grids 画网格
46 | for c in range(0, MAZE_W * UNIT, UNIT):
47 | x0, y0, x1, y1 = c, 0+BOUND, c, MAZE_H * UNIT
48 | self.canvas.create_line(x0, y0, x1, y1)
49 | for r in range(0, MAZE_H * UNIT, UNIT):
50 | x0, y0, x1, y1 = 0+BOUND, r, MAZE_W * UNIT, r
51 | self.canvas.create_line(x0, y0, x1, y1)
52 | # create origin
53 | origin = np.array([HALF_UNIT, HALF_UNIT])
54 | # hell
55 | hell1_center = origin + np.array([UNIT * 2, UNIT])
56 | self.hell1 = self.canvas.create_rectangle(
57 | hell1_center[0] - MOVE_HALF, hell1_center[1] - MOVE_HALF,
58 | hell1_center[0] + MOVE_HALF, hell1_center[1] + MOVE_HALF,
59 | fill='black')
60 | # hell
61 | hell2_center = origin + np.array([UNIT, UNIT * 2])
62 | self.hell2 = self.canvas.create_rectangle(
63 | hell2_center[0] - MOVE_HALF, hell2_center[1] - MOVE_HALF,
64 | hell2_center[0] + MOVE_HALF, hell2_center[1] + MOVE_HALF,
65 | fill='black')
66 | # create oval
67 | oval_center = origin + UNIT * 2
68 | self.oval = self.canvas.create_oval(
69 | oval_center[0] - MOVE_HALF, oval_center[1] - MOVE_HALF,
70 | oval_center[0] + MOVE_HALF, oval_center[1] + MOVE_HALF,
71 | fill='yellow')
72 | # create red rect
73 | self.rect = self.canvas.create_rectangle(
74 | origin[0] - MOVE_HALF, origin[1] - MOVE_HALF,
75 | origin[0] + MOVE_HALF, origin[1] + MOVE_HALF,
76 | fill='red')
77 | # pack all
78 | self.canvas.pack()
79 |
80 | def reset(self):
81 | self.update()
82 | time.sleep(0.5)
83 | self.canvas.delete(self.rect)
84 | origin = np.array([HALF_UNIT, HALF_UNIT])
85 | self.rect = self.canvas.create_rectangle(
86 | origin[0] - MOVE_HALF, origin[1] - MOVE_HALF,
87 | origin[0] + MOVE_HALF, origin[1] + MOVE_HALF,
88 | fill='red')
89 | # return observation
90 | return self.canvas.coords(self.rect)
91 |
92 | def step(self, action):
93 | s = self.canvas.coords(self.rect)
94 | base_action = np.array([0, 0])
95 | if action == 0: # up
96 | if s[1] > UNIT:
97 | base_action[1] -= UNIT
98 | elif action == 1: # down
99 | if s[1] < (MAZE_H - 1) * UNIT:
100 | base_action[1] += UNIT
101 | elif action == 2: # right
102 | if s[0] < (MAZE_W - 1) * UNIT:
103 | base_action[0] += UNIT
104 | elif action == 3: # left
105 | if s[0] > UNIT:
106 | base_action[0] -= UNIT
107 |
108 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
109 |
110 | s_ = self.canvas.coords(self.rect) # next state
111 |
112 | # reward function
113 | if s_ == self.canvas.coords(self.oval):
114 | reward = 1
115 | done = True
116 | s_ = 'terminal'
117 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
118 | reward = -1
119 | done = True
120 | s_ = 'terminal'
121 | else:
122 | reward = 0
123 | done = False
124 |
125 | return s_, reward, done
126 |
127 | def render(self):
128 | time.sleep(0.1)
129 | self.update()
130 |
131 |
132 | def update():
133 | for t in range(10):
134 | s = env.reset()
135 | while True:
136 | env.render()
137 | a = 1
138 | s, r, done = env.step(a)
139 | if done:
140 | break
141 |
142 | if __name__ == '__main__':
143 | env = Maze()
144 | env.after(100, update)
145 | env.mainloop()
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/3_Sarsa_maze/run_this.py:
--------------------------------------------------------------------------------
1 | """
2 | Sarsa is an on-policy (online) updating method for reinforcement learning.
3 |
4 | Unlike Q-learning, which updates off-policy, Sarsa updates with the action actually taken in the current trajectory.
5 |
6 | You will see that Sarsa is more cautious when the punishment is close, because it accounts for the exploratory behaviour it actually follows,
7 | while Q-learning is bolder because it only cares about the maximum-value behaviour.
8 | """
9 |
10 | from maze_env import Maze
11 | from RL_brain import SarsaTable
12 |
13 |
14 | def update():
15 | for episode in range(100):
16 | # initial observation
17 | observation = env.reset()
18 |
19 | # RL choose action based on observation
20 | action = RL.choose_action(str(observation))
21 |
22 | while True:
23 | # fresh env
24 | env.render()
25 |
26 | # RL take action and get next observation and reward
27 | observation_, reward, done = env.step(action)
28 |
29 | # RL choose action based on next observation
30 | action_ = RL.choose_action(str(observation_))
31 |
32 | # RL learn from this transition (s, a, r, s, a) ==> Sarsa
33 | RL.learn(str(observation), action, reward, str(observation_), action_)
34 |
35 | # swap observation and action
36 | observation = observation_
37 | action = action_
38 |
39 | # break while loop when end of this episode
40 | if done:
41 | break
42 |
43 | # end of game
44 | print('game over')
45 | env.destroy()
46 |
47 | if __name__ == "__main__":
48 | env = Maze()
49 | RL = SarsaTable(actions=list(range(env.n_actions)))
50 |
51 | env.after(100, update)
52 | env.mainloop()
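
The docstring above contrasts on-policy Sarsa with off-policy Q-learning; in RL_brain.py the only difference is the bootstrap term of q_target. A tiny numeric illustration with invented values for one transition (s, a, r, s_, a_):

# Same transition, two targets: Q-learning bootstraps on the max,
# Sarsa bootstraps on the action a_ the epsilon-greedy policy actually took.
GAMMA = 0.9
r = 0
q_next = {'safe': 0.2, 'risky': 0.8}   # invented Q(s_, .); 'risky' passes near a hell
a_ = 'safe'                            # action actually chosen in s_

q_target_qlearning = r + GAMMA * max(q_next.values())   # 0.72
q_target_sarsa = r + GAMMA * q_next[a_]                 # 0.18
print(q_target_qlearning, q_target_sarsa)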
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/4_Sarsa_lambda_maze/RL_brain.py:
--------------------------------------------------------------------------------
1 | """
2 | This part of code is the Q learning brain, which is a brain of the agent.
3 | All decisions are made in here.
4 |
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 |
12 | class RL(object):
13 | def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
14 | self.actions = action_space # a list
15 | self.lr = learning_rate
16 | self.gamma = reward_decay
17 | self.epsilon = e_greedy
18 |
19 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
20 |
21 | def check_state_exist(self, state):
22 | if state not in self.q_table.index:
23 | # append new state to q table
24 | self.q_table = self.q_table.append(
25 | pd.Series(
26 | [0]*len(self.actions),
27 | index=self.q_table.columns,
28 | name=state,
29 | )
30 | )
31 |
32 | def choose_action(self, observation):
33 | self.check_state_exist(observation)
34 | # action selection
35 | if np.random.rand() < self.epsilon:
36 | # choose best action
37 | state_action = self.q_table.loc[observation, :]
38 | # some actions may have the same value; randomly choose one of these actions
39 | action = np.random.choice(state_action[state_action == np.max(state_action)].index)
40 | else:
41 | # choose random action
42 | action = np.random.choice(self.actions)
43 | return action
44 |
45 | def learn(self, *args):
46 | pass
47 |
48 |
49 | # backward eligibility traces
50 | class SarsaLambdaTable(RL):
51 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9):
52 | super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
53 |
54 | # backward view, eligibility trace.
55 | self.lambda_ = trace_decay
56 | self.eligibility_trace = self.q_table.copy()
57 |
58 | def check_state_exist(self, state):
59 | if state not in self.q_table.index:
60 | # append new state to q table
61 | to_be_append = pd.Series(
62 | [0] * len(self.actions),
63 | index=self.q_table.columns,
64 | name=state,
65 | )
66 | self.q_table = self.q_table.append(to_be_append)
67 |
68 | # also update eligibility trace
69 | self.eligibility_trace = self.eligibility_trace.append(to_be_append)
70 |
71 | def learn(self, s, a, r, s_, a_):
72 | self.check_state_exist(s_)
73 | q_predict = self.q_table.loc[s, a]
74 | if s_ != 'terminal':
75 | q_target = r + self.gamma * self.q_table.loc[s_, a_] # next state is not terminal
76 | else:
77 | q_target = r # next state is terminal
78 | error = q_target - q_predict
79 |
80 | # increase trace amount for visited state-action pair
81 |
82 | # Method 1:
83 | # self.eligibility_trace.loc[s, a] += 1
84 |
85 | # Method 2:
86 | self.eligibility_trace.loc[s, :] *= 0
87 | self.eligibility_trace.loc[s, a] = 1
88 |
89 | # Q update
90 | self.q_table += self.lr * error * self.eligibility_trace
91 |
92 | # decay eligibility trace after update
93 | self.eligibility_trace *= self.gamma*self.lambda_
94 |
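# Illustrative sketch of the two trace-update options commented in learn() above
# ("Method 1" accumulating vs. "Method 2" replacing), using a plain dict in place of
# the pandas DataFrame; the numbers are made up.
gamma, lambda_ = 0.9, 0.9
trace = {('s1', 'a0'): 0.5, ('s1', 'a1'): 0.2}

# Method 1 (accumulating trace): every visit adds 1, so revisits can push it above 1.
trace_acc = dict(trace)
trace_acc[('s1', 'a0')] += 1                      # -> 1.5

# Method 2 (replacing trace): zero the whole row for s1, then set the visited pair to 1.
trace_rep = {k: (0.0 if k[0] == 's1' else v) for k, v in trace.items()}
trace_rep[('s1', 'a0')] = 1.0                     # -> 1.0, ('s1', 'a1') -> 0.0

# After each learn() call both variants decay by gamma * lambda_.
trace_rep = {k: v * gamma * lambda_ for k, v in trace_rep.items()}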
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/4_Sarsa_lambda_maze/maze_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the environment part of this example.
10 | The RL is in RL_brain.py.
11 |
12 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
13 | """
14 |
15 |
16 | import numpy as np
17 | import time
18 | import sys
19 | if sys.version_info.major == 2:
20 | import Tkinter as tk
21 | else:
22 | import tkinter as tk
23 |
24 |
25 | UNIT = 40 # pixels
26 | MAZE_H = 4 # grid height
27 | MAZE_W = 4 # grid width
28 |
29 |
30 | class Maze(tk.Tk, object):
31 | def __init__(self):
32 | super(Maze, self).__init__()
33 | self.action_space = ['u', 'd', 'l', 'r']
34 | self.n_actions = len(self.action_space)
35 | self.title('maze')
36 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
37 | self._build_maze()
38 |
39 | def _build_maze(self):
40 | self.canvas = tk.Canvas(self, bg='white',
41 | height=MAZE_H * UNIT,
42 | width=MAZE_W * UNIT)
43 |
44 | # create grids
45 | for c in range(0, MAZE_W * UNIT, UNIT):
46 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
47 | self.canvas.create_line(x0, y0, x1, y1)
48 | for r in range(0, MAZE_H * UNIT, UNIT):
49 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
50 | self.canvas.create_line(x0, y0, x1, y1)
51 |
52 | # create origin
53 | origin = np.array([20, 20])
54 |
55 | # hell
56 | hell1_center = origin + np.array([UNIT * 2, UNIT])
57 | self.hell1 = self.canvas.create_rectangle(
58 | hell1_center[0] - 15, hell1_center[1] - 15,
59 | hell1_center[0] + 15, hell1_center[1] + 15,
60 | fill='black')
61 | # hell
62 | hell2_center = origin + np.array([UNIT, UNIT * 2])
63 | self.hell2 = self.canvas.create_rectangle(
64 | hell2_center[0] - 15, hell2_center[1] - 15,
65 | hell2_center[0] + 15, hell2_center[1] + 15,
66 | fill='black')
67 |
68 | # create oval
69 | oval_center = origin + UNIT * 2
70 | self.oval = self.canvas.create_oval(
71 | oval_center[0] - 15, oval_center[1] - 15,
72 | oval_center[0] + 15, oval_center[1] + 15,
73 | fill='yellow')
74 |
75 | # create red rect
76 | self.rect = self.canvas.create_rectangle(
77 | origin[0] - 15, origin[1] - 15,
78 | origin[0] + 15, origin[1] + 15,
79 | fill='red')
80 |
81 | # pack all
82 | self.canvas.pack()
83 |
84 | def reset(self):
85 | self.update()
86 | time.sleep(0.5)
87 | self.canvas.delete(self.rect)
88 | origin = np.array([20, 20])
89 | self.rect = self.canvas.create_rectangle(
90 | origin[0] - 15, origin[1] - 15,
91 | origin[0] + 15, origin[1] + 15,
92 | fill='red')
93 | # return observation
94 | return self.canvas.coords(self.rect)
95 |
96 | def step(self, action):
97 | s = self.canvas.coords(self.rect)
98 | base_action = np.array([0, 0])
99 | if action == 0: # up
100 | if s[1] > UNIT:
101 | base_action[1] -= UNIT
102 | elif action == 1: # down
103 | if s[1] < (MAZE_H - 1) * UNIT:
104 | base_action[1] += UNIT
105 | elif action == 2: # right
106 | if s[0] < (MAZE_W - 1) * UNIT:
107 | base_action[0] += UNIT
108 | elif action == 3: # left
109 | if s[0] > UNIT:
110 | base_action[0] -= UNIT
111 |
112 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
113 |
114 | s_ = self.canvas.coords(self.rect) # next state
115 |
116 | # reward function
117 | if s_ == self.canvas.coords(self.oval):
118 | reward = 1
119 | done = True
120 | s_ = 'terminal'
121 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
122 | reward = -1
123 | done = True
124 | s_ = 'terminal'
125 | else:
126 | reward = 0
127 | done = False
128 |
129 | return s_, reward, done
130 |
131 | def render(self):
132 | time.sleep(0.05)
133 | self.update()
134 |
135 |
136 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/4_Sarsa_lambda_maze/run_this.py:
--------------------------------------------------------------------------------
1 | """
2 | Sarsa is an online (on-policy) updating method for reinforcement learning.
3 |
4 | Unlike Q-learning, which is off-policy and bootstraps from the greedy action, Sarsa updates along the trajectory it actually follows.
5 |
6 | You will see that Sarsa is more cautious when punishment is close, because it accounts for all of its behaviour,
7 | while Q-learning is bolder because it only cares about the value-maximizing behaviour.
8 | """
9 |
10 | from maze_env import Maze
11 | from RL_brain import SarsaLambdaTable
12 |
13 |
14 | def update():
15 | for episode in range(100):
16 | # initial observation
17 | observation = env.reset()
18 |
19 | # RL choose action based on observation
20 | action = RL.choose_action(str(observation))
21 |
22 | # initial all zero eligibility trace
23 | RL.eligibility_trace *= 0
24 |
25 | while True:
26 | # fresh env
27 | env.render()
28 |
29 | # RL take action and get next observation and reward
30 | observation_, reward, done = env.step(action)
31 |
32 | # RL choose action based on next observation
33 | action_ = RL.choose_action(str(observation_))
34 |
35 | # RL learn from this transition (s, a, r, s, a) ==> Sarsa
36 | RL.learn(str(observation), action, reward, str(observation_), action_)
37 |
38 | # swap observation and action
39 | observation = observation_
40 | action = action_
41 |
42 | # break while loop when end of this episode
43 | if done:
44 | break
45 |
46 | # end of game
47 | print('game over')
48 | env.destroy()
49 |
50 | if __name__ == "__main__":
51 | env = Maze()
52 | RL = SarsaLambdaTable(actions=list(range(env.n_actions)))
53 |
54 | env.after(100, update)
55 | env.mainloop()
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/5.1_Double_DQN/run_Pendulum.py:
--------------------------------------------------------------------------------
1 | """
2 | Double DQN & Natural DQN comparison,
3 | The Pendulum example.
4 |
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
6 |
7 | Using:
8 | Tensorflow: 1.0
9 | gym: 0.8.0
10 | """
11 |
12 |
13 | import gym
14 | from RL_brain import DoubleDQN
15 | import numpy as np
16 | import matplotlib.pyplot as plt
17 | import tensorflow as tf
18 |
19 |
20 | env = gym.make('Pendulum-v0')
21 | env = env.unwrapped
22 | env.seed(1)
23 | MEMORY_SIZE = 3000
24 | ACTION_SPACE = 11
25 |
26 | sess = tf.Session()
27 | with tf.variable_scope('Natural_DQN'):
28 | natural_DQN = DoubleDQN(
29 | n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
30 | e_greedy_increment=0.001, double_q=False, sess=sess
31 | )
32 |
33 | with tf.variable_scope('Double_DQN'):
34 | double_DQN = DoubleDQN(
35 | n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
36 | e_greedy_increment=0.001, double_q=True, sess=sess, output_graph=True)
37 |
38 | sess.run(tf.global_variables_initializer())
39 |
40 |
41 | def train(RL):
42 | total_steps = 0
43 | observation = env.reset()
44 | while True:
45 | # if total_steps - MEMORY_SIZE > 8000: env.render()
46 |
47 | action = RL.choose_action(observation)
48 |
49 | f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions
50 | observation_, reward, done, info = env.step(np.array([f_action]))
51 |
52 | reward /= 10 # normalize to a range of (-1, 0); r = 0 when the pendulum is upright
53 | # the Q target at upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0
54 | # so when Q at this state is greater than 0, the agent overestimates the Q. Please refer to the final result.
55 |
56 | RL.store_transition(observation, action, reward, observation_)
57 |
58 | if total_steps > MEMORY_SIZE: # learning
59 | RL.learn()
60 |
61 | if total_steps - MEMORY_SIZE > 20000: # stop game
62 | break
63 |
64 | observation = observation_
65 | total_steps += 1
66 | return RL.q
67 |
68 | q_natural = train(natural_DQN)
69 | q_double = train(double_DQN)
70 |
71 | plt.plot(np.array(q_natural), c='r', label='natural')
72 | plt.plot(np.array(q_double), c='b', label='double')
73 | plt.legend(loc='best')
74 | plt.ylabel('Q eval')
75 | plt.xlabel('training steps')
76 | plt.grid()
77 | plt.show()
78 |
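# Illustrative sketch of the target that distinguishes the two agents compared above.
# DoubleDQN's learn() is defined in RL_brain.py (not shown here); the array names below
# are placeholders, not the class's actual attributes, and the values are made up.
import numpy as np

gamma, r = 0.9, -0.5
q_eval_next = np.array([0.2, 1.3, 0.7])     # online net's Q(s', .)
q_target_next = np.array([0.1, 0.9, 1.1])   # target net's Q(s', .)

# Natural DQN: take the max over the target net (tends to overestimate).
natural_target = r + gamma * q_target_next.max()
# Double DQN: select the action with the online net, evaluate it with the target net.
double_target = r + gamma * q_target_next[q_eval_next.argmax()]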
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/5.2_Prioritized_Replay_DQN/run_MountainCar.py:
--------------------------------------------------------------------------------
1 | """
2 | The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952)
3 |
4 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
5 |
6 | Using:
7 | Tensorflow: 1.0
8 | gym: 0.8.0
9 | """
10 |
11 |
12 | import gym
13 | from RL_brain import DQNPrioritizedReplay
14 | import matplotlib.pyplot as plt
15 | import tensorflow as tf
16 | import numpy as np
17 |
18 | env = gym.make('MountainCar-v0')
19 | env = env.unwrapped
20 | env.seed(21)
21 | MEMORY_SIZE = 10000
22 |
23 | sess = tf.Session()
24 | with tf.variable_scope('natural_DQN'):
25 | RL_natural = DQNPrioritizedReplay(
26 | n_actions=3, n_features=2, memory_size=MEMORY_SIZE,
27 | e_greedy_increment=0.00005, sess=sess, prioritized=False,
28 | )
29 |
30 | with tf.variable_scope('DQN_with_prioritized_replay'):
31 | RL_prio = DQNPrioritizedReplay(
32 | n_actions=3, n_features=2, memory_size=MEMORY_SIZE,
33 | e_greedy_increment=0.00005, sess=sess, prioritized=True, output_graph=True,
34 | )
35 | sess.run(tf.global_variables_initializer())
36 |
37 |
38 | def train(RL):
39 | total_steps = 0
40 | steps = []
41 | episodes = []
42 | for i_episode in range(20):
43 | observation = env.reset()
44 | while True:
45 | # env.render()
46 |
47 | action = RL.choose_action(observation)
48 |
49 | observation_, reward, done, info = env.step(action)
50 |
51 | if done: reward = 10
52 |
53 | RL.store_transition(observation, action, reward, observation_)
54 |
55 | if total_steps > MEMORY_SIZE:
56 | RL.learn()
57 |
58 | if done:
59 | print('episode ', i_episode, ' finished')
60 | steps.append(total_steps)
61 | episodes.append(i_episode)
62 | break
63 |
64 | observation = observation_
65 | total_steps += 1
66 | return np.vstack((episodes, steps))
67 |
68 | his_natural = train(RL_natural)
69 | his_prio = train(RL_prio)
70 |
71 | # compare based on first success
72 | plt.plot(his_natural[0, :], his_natural[1, :] - his_natural[1, 0], c='b', label='natural DQN')
73 | plt.plot(his_prio[0, :], his_prio[1, :] - his_prio[1, 0], c='r', label='DQN with prioritized replay')
74 | plt.legend(loc='best')
75 | plt.ylabel('total training time')
76 | plt.xlabel('episode')
77 | plt.grid()
78 | plt.show()
79 |
80 |
81 |
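# Illustrative sketch of proportional prioritized sampling, the idea behind
# DQNPrioritizedReplay (its implementation in RL_brain.py is not shown here and is
# typically backed by a sum tree for efficiency). Values below are made up.
import numpy as np

alpha, beta = 0.6, 0.4
abs_td_errors = np.array([0.01, 0.5, 1.2, 0.05])   # |TD error| per stored transition

priorities = (abs_td_errors + 1e-6) ** alpha
probs = priorities / priorities.sum()              # sampling probabilities
batch_idx = np.random.choice(len(probs), size=2, p=probs)

# Importance-sampling weights correct the bias from non-uniform sampling.
is_weights = (len(probs) * probs[batch_idx]) ** (-beta)
is_weights /= is_weights.max()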
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/5.3_Dueling_DQN/run_Pendulum.py:
--------------------------------------------------------------------------------
1 | """
2 | Dueling DQN & Natural DQN comparison
3 |
4 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
5 |
6 | Using:
7 | Tensorflow: 1.0
8 | gym: 0.8.0
9 | """
10 |
11 |
12 | import gym
13 | from RL_brain import DuelingDQN
14 | import numpy as np
15 | import matplotlib.pyplot as plt
16 | import tensorflow as tf
17 |
18 |
19 | env = gym.make('Pendulum-v0')
20 | env = env.unwrapped
21 | env.seed(1)
22 | MEMORY_SIZE = 3000
23 | ACTION_SPACE = 25
24 |
25 | sess = tf.Session()
26 | with tf.variable_scope('natural'):
27 | natural_DQN = DuelingDQN(
28 | n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
29 | e_greedy_increment=0.001, sess=sess, dueling=False)
30 |
31 | with tf.variable_scope('dueling'):
32 | dueling_DQN = DuelingDQN(
33 | n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE,
34 | e_greedy_increment=0.001, sess=sess, dueling=True, output_graph=True)
35 |
36 | sess.run(tf.global_variables_initializer())
37 |
38 |
39 | def train(RL):
40 | acc_r = [0]
41 | total_steps = 0
42 | observation = env.reset()
43 | while True:
44 | # if total_steps-MEMORY_SIZE > 9000: env.render()
45 |
46 | action = RL.choose_action(observation)
47 |
48 | f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # [-2 ~ 2] float actions
49 | observation_, reward, done, info = env.step(np.array([f_action]))
50 |
51 | reward /= 10 # normalize to a range of (-1, 0)
52 | acc_r.append(reward + acc_r[-1]) # accumulated reward
53 |
54 | RL.store_transition(observation, action, reward, observation_)
55 |
56 | if total_steps > MEMORY_SIZE:
57 | RL.learn()
58 |
59 | if total_steps-MEMORY_SIZE > 15000:
60 | break
61 |
62 | observation = observation_
63 | total_steps += 1
64 | return RL.cost_his, acc_r
65 |
66 | c_natural, r_natural = train(natural_DQN)
67 | c_dueling, r_dueling = train(dueling_DQN)
68 |
69 | plt.figure(1)
70 | plt.plot(np.array(c_natural), c='r', label='natural')
71 | plt.plot(np.array(c_dueling), c='b', label='dueling')
72 | plt.legend(loc='best')
73 | plt.ylabel('cost')
74 | plt.xlabel('training steps')
75 | plt.grid()
76 |
77 | plt.figure(2)
78 | plt.plot(np.array(r_natural), c='r', label='natural')
79 | plt.plot(np.array(r_dueling), c='b', label='dueling')
80 | plt.legend(loc='best')
81 | plt.ylabel('accumulated reward')
82 | plt.xlabel('training steps')
83 | plt.grid()
84 |
85 | plt.show()
86 |
87 |
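# Illustrative sketch of the dueling head used by DuelingDQN (defined in RL_brain.py,
# not shown here): Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). Numbers are made up.
import numpy as np

V = 1.5                              # state value
A = np.array([0.2, -0.1, 0.4])       # one advantage per action
Q = V + (A - A.mean())               # subtracting the mean keeps V and A identifiable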
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/5_Deep_Q_Network/maze_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 |
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 |
9 | This script is the environment part of this example.
10 | The RL is in RL_brain.py.
11 |
12 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
13 | """
14 | import numpy as np
15 | import time
16 | import sys
17 | if sys.version_info.major == 2:
18 | import Tkinter as tk
19 | else:
20 | import tkinter as tk
21 |
22 | UNIT = 40 # pixels
23 | MAZE_H = 4 # grid height
24 | MAZE_W = 4 # grid width
25 |
26 |
27 | class Maze(tk.Tk, object):
28 | def __init__(self):
29 | super(Maze, self).__init__()
30 | self.action_space = ['u', 'd', 'l', 'r']
31 | self.n_actions = len(self.action_space)
32 | self.n_features = 2
33 | self.title('maze')
34 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
35 | self._build_maze()
36 |
37 | def _build_maze(self):
38 | self.canvas = tk.Canvas(self, bg='white',
39 | height=MAZE_H * UNIT,
40 | width=MAZE_W * UNIT)
41 |
42 | # create grids
43 | for c in range(0, MAZE_W * UNIT, UNIT):
44 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
45 | self.canvas.create_line(x0, y0, x1, y1)
46 | for r in range(0, MAZE_H * UNIT, UNIT):
47 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
48 | self.canvas.create_line(x0, y0, x1, y1)
49 |
50 | # create origin
51 | origin = np.array([20, 20])
52 |
53 | # hell
54 | hell1_center = origin + np.array([UNIT * 2, UNIT])
55 | self.hell1 = self.canvas.create_rectangle(
56 | hell1_center[0] - 15, hell1_center[1] - 15,
57 | hell1_center[0] + 15, hell1_center[1] + 15,
58 | fill='black')
59 | # hell
60 | # hell2_center = origin + np.array([UNIT, UNIT * 2])
61 | # self.hell2 = self.canvas.create_rectangle(
62 | # hell2_center[0] - 15, hell2_center[1] - 15,
63 | # hell2_center[0] + 15, hell2_center[1] + 15,
64 | # fill='black')
65 |
66 | # create oval
67 | oval_center = origin + UNIT * 2
68 | self.oval = self.canvas.create_oval(
69 | oval_center[0] - 15, oval_center[1] - 15,
70 | oval_center[0] + 15, oval_center[1] + 15,
71 | fill='yellow')
72 |
73 | # create red rect
74 | self.rect = self.canvas.create_rectangle(
75 | origin[0] - 15, origin[1] - 15,
76 | origin[0] + 15, origin[1] + 15,
77 | fill='red')
78 |
79 | # pack all
80 | self.canvas.pack()
81 |
82 | def reset(self):
83 | self.update()
84 | time.sleep(0.1)
85 | self.canvas.delete(self.rect)
86 | origin = np.array([20, 20])
87 | self.rect = self.canvas.create_rectangle(
88 | origin[0] - 15, origin[1] - 15,
89 | origin[0] + 15, origin[1] + 15,
90 | fill='red')
91 | # return observation
92 | return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
93 |
94 | def step(self, action):
95 | s = self.canvas.coords(self.rect)
96 | base_action = np.array([0, 0])
97 | if action == 0: # up
98 | if s[1] > UNIT:
99 | base_action[1] -= UNIT
100 | elif action == 1: # down
101 | if s[1] < (MAZE_H - 1) * UNIT:
102 | base_action[1] += UNIT
103 | elif action == 2: # right
104 | if s[0] < (MAZE_W - 1) * UNIT:
105 | base_action[0] += UNIT
106 | elif action == 3: # left
107 | if s[0] > UNIT:
108 | base_action[0] -= UNIT
109 |
110 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
111 |
112 | next_coords = self.canvas.coords(self.rect) # next state
113 |
114 | # reward function
115 | if next_coords == self.canvas.coords(self.oval):
116 | reward = 1
117 | done = True
118 | elif next_coords in [self.canvas.coords(self.hell1)]:
119 | reward = -1
120 | done = True
121 | else:
122 | reward = 0
123 | done = False
124 | s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
125 | return s_, reward, done
126 |
127 | def render(self):
128 | # time.sleep(0.01)
129 | self.update()
130 |
131 |
132 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/5_Deep_Q_Network/run_this.py:
--------------------------------------------------------------------------------
1 | from maze_env import Maze
2 | from RL_brain import DeepQNetwork
3 |
4 |
5 | def run_maze():
6 | step = 0
7 | for episode in range(300):
8 | # initial observation
9 | observation = env.reset()
10 |
11 | while True:
12 | # fresh env
13 | env.render()
14 |
15 | # RL choose action based on observation
16 | action = RL.choose_action(observation)
17 |
18 | # RL take action and get next observation and reward
19 | observation_, reward, done = env.step(action)
20 |
21 | RL.store_transition(observation, action, reward, observation_)
22 |
23 | if (step > 200) and (step % 5 == 0):
24 | RL.learn()
25 |
26 | # swap observation
27 | observation = observation_
28 |
29 | # break while loop when end of this episode
30 | if done:
31 | break
32 | step += 1
33 |
34 | # end of game
35 | print('game over')
36 | env.destroy()
37 |
38 |
39 | if __name__ == "__main__":
40 | # maze game
41 | env = Maze()
42 | RL = DeepQNetwork(env.n_actions, env.n_features,
43 | learning_rate=0.01,
44 | reward_decay=0.9,
45 | e_greedy=0.9,
46 | replace_target_iter=200,
47 | memory_size=2000,
48 | # output_graph=True
49 | )
50 | env.after(100, run_maze)
51 | env.mainloop()
52 | RL.plot_cost()
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/6_OpenAI_gym/run_CartPole.py:
--------------------------------------------------------------------------------
1 | """
2 | Deep Q network,
3 |
4 | Using:
5 | Tensorflow: 1.0
6 | gym: 0.7.3
7 | """
8 |
9 |
10 | import gym
11 | from RL_brain import DeepQNetwork
12 |
13 | env = gym.make('CartPole-v0')
14 | env = env.unwrapped
15 |
16 | print(env.action_space)
17 | print(env.observation_space)
18 | print(env.observation_space.high)
19 | print(env.observation_space.low)
20 |
21 | RL = DeepQNetwork(n_actions=env.action_space.n,
22 | n_features=env.observation_space.shape[0],
23 | learning_rate=0.01, e_greedy=0.9,
24 | replace_target_iter=100, memory_size=2000,
25 | e_greedy_increment=0.001,)
26 |
27 | total_steps = 0
28 |
29 |
30 | for i_episode in range(100):
31 |
32 | observation = env.reset()
33 | ep_r = 0
34 | while True:
35 | env.render()
36 |
37 | action = RL.choose_action(observation)
38 |
39 | observation_, reward, done, info = env.step(action)
40 |
41 | # the smaller the pole angle and the closer the cart is to the center, the better
42 | x, x_dot, theta, theta_dot = observation_
43 | r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
44 | r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
45 | reward = r1 + r2
46 |
47 | RL.store_transition(observation, action, reward, observation_)
48 |
49 | ep_r += reward
50 | if total_steps > 1000:
51 | RL.learn()
52 |
53 | if done:
54 | print('episode: ', i_episode,
55 | 'ep_r: ', round(ep_r, 2),
56 | ' epsilon: ', round(RL.epsilon, 2))
57 | break
58 |
59 | observation = observation_
60 | total_steps += 1
61 |
62 | RL.plot_cost()
63 |
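# Quick numeric check of the shaped reward used in the training loop above, assuming
# CartPole-v0's default thresholds (x_threshold = 2.4, theta_threshold_radians = 12
# degrees expressed in radians).
x_threshold = 2.4
theta_threshold = 12 * 2 * 3.141592653589793 / 360    # ~0.209 rad

def shaped_reward(x, theta):
    r1 = (x_threshold - abs(x)) / x_threshold - 0.8
    r2 = (theta_threshold - abs(theta)) / theta_threshold - 0.5
    return r1 + r2

print(shaped_reward(0.0, 0.0))                         # 0.7: centred cart, upright pole
print(shaped_reward(x_threshold, theta_threshold))     # -1.3: about to terminate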
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/6_OpenAI_gym/run_MountainCar.py:
--------------------------------------------------------------------------------
1 | """
2 | Deep Q network,
3 |
4 | Using:
5 | Tensorflow: 1.0
6 | gym: 0.8.0
7 | """
8 |
9 |
10 | import gym
11 | from RL_brain import DeepQNetwork
12 |
13 | env = gym.make('MountainCar-v0')
14 | env = env.unwrapped
15 |
16 | print(env.action_space)
17 | print(env.observation_space)
18 | print(env.observation_space.high)
19 | print(env.observation_space.low)
20 |
21 | RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, e_greedy=0.9,
22 | replace_target_iter=300, memory_size=3000,
23 | e_greedy_increment=0.0002,)
24 |
25 | total_steps = 0
26 |
27 |
28 | for i_episode in range(10):
29 |
30 | observation = env.reset()
31 | ep_r = 0
32 | while True:
33 | env.render()
34 |
35 | action = RL.choose_action(observation)
36 |
37 | observation_, reward, done, info = env.step(action)
38 |
39 | position, velocity = observation_
40 |
41 | # the higher the better
42 | reward = abs(position - (-0.5)) # r in [0, 1]
43 |
44 | RL.store_transition(observation, action, reward, observation_)
45 |
46 | if total_steps > 1000:
47 | RL.learn()
48 |
49 | ep_r += reward
50 | if done:
51 | get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----'
52 | print('Epi: ', i_episode,
53 | get,
54 | '| Ep_r: ', round(ep_r, 4),
55 | '| Epsilon: ', round(RL.epsilon, 2))
56 | break
57 |
58 | observation = observation_
59 | total_steps += 1
60 |
61 | RL.plot_cost()
62 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/7_Policy_gradient_softmax/RL_brain.py:
--------------------------------------------------------------------------------
1 | """
2 | This part of code is the reinforcement learning brain, which is a brain of the agent.
3 | All decisions are made in here.
4 |
5 | Policy Gradient, Reinforcement Learning.
6 |
7 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
8 |
9 | Using:
10 | Tensorflow: 1.0
11 | gym: 0.8.0
12 | """
13 |
14 | import numpy as np
15 | import tensorflow as tf
16 |
17 | # reproducible
18 | np.random.seed(1)
19 | tf.set_random_seed(1)
20 |
21 |
22 | class PolicyGradient:
23 | def __init__(
24 | self,
25 | n_actions,
26 | n_features,
27 | learning_rate=0.01,
28 | reward_decay=0.95,
29 | output_graph=False,
30 | ):
31 | self.n_actions = n_actions
32 | self.n_features = n_features
33 | self.lr = learning_rate
34 | self.gamma = reward_decay
35 |
36 | self.ep_obs, self.ep_as, self.ep_rs = [], [], []
37 |
38 | self._build_net()
39 |
40 | self.sess = tf.Session()
41 |
42 | if output_graph:
43 | # $ tensorboard --logdir=logs
44 | # http://0.0.0.0:6006/
45 | # tf.train.SummaryWriter will soon be deprecated; use the following
46 | tf.summary.FileWriter("logs/", self.sess.graph)
47 |
48 | self.sess.run(tf.global_variables_initializer())
49 |
50 | def _build_net(self):
51 | with tf.name_scope('inputs'):
52 | self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
53 | self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
54 | self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
55 | # fc1
56 | layer = tf.layers.dense(
57 | inputs=self.tf_obs,
58 | units=10,
59 | activation=tf.nn.tanh, # tanh activation
60 | kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
61 | bias_initializer=tf.constant_initializer(0.1),
62 | name='fc1'
63 | )
64 | # fc2
65 | all_act = tf.layers.dense(
66 | inputs=layer,
67 | units=self.n_actions,
68 | activation=None,
69 | kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
70 | bias_initializer=tf.constant_initializer(0.1),
71 | name='fc2'
72 | )
73 |
74 | self.all_act_prob = tf.nn.softmax(all_act, name='act_prob') # use softmax to convert to probability
75 |
76 | with tf.name_scope('loss'):
77 | # maximizing total reward (log_p * R) is the same as minimizing -(log_p * R), and TF only provides minimize(loss)
78 | neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts) # this is negative log of chosen action
79 | # or in this way:
80 | # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1)
81 | loss = tf.reduce_mean(neg_log_prob * self.tf_vt) # reward guided loss
82 |
83 | with tf.name_scope('train'):
84 | self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
85 |
86 | def choose_action(self, observation):
87 | prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})
88 | action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) # sample an action according to the action probabilities
89 | return action
90 |
91 | def store_transition(self, s, a, r):
92 | self.ep_obs.append(s)
93 | self.ep_as.append(a)
94 | self.ep_rs.append(r)
95 |
96 | def learn(self):
97 | # discount and normalize episode reward
98 | discounted_ep_rs_norm = self._discount_and_norm_rewards()
99 |
100 | # train on episode
101 | self.sess.run(self.train_op, feed_dict={
102 | self.tf_obs: np.vstack(self.ep_obs), # shape=[None, n_obs]
103 | self.tf_acts: np.array(self.ep_as), # shape=[None, ]
104 | self.tf_vt: discounted_ep_rs_norm, # shape=[None, ]
105 | })
106 |
107 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data
108 | return discounted_ep_rs_norm
109 |
110 | def _discount_and_norm_rewards(self):
111 | # discount episode rewards
112 | discounted_ep_rs = np.zeros_like(self.ep_rs)
113 | running_add = 0
114 | for t in reversed(range(0, len(self.ep_rs))):
115 | running_add = running_add * self.gamma + self.ep_rs[t]
116 | discounted_ep_rs[t] = running_add
117 |
118 | # normalize episode rewards
119 | discounted_ep_rs -= np.mean(discounted_ep_rs)
120 | discounted_ep_rs /= np.std(discounted_ep_rs)
121 | return discounted_ep_rs
122 |
123 |
124 |
125 |
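# Worked example of _discount_and_norm_rewards above for a 3-step episode:
# returns are accumulated backwards through the episode, then standardised.
import numpy as np

ep_rs, gamma = [1.0, 1.0, 1.0], 0.99
discounted = np.zeros_like(ep_rs)
running_add = 0.0
for t in reversed(range(len(ep_rs))):
    running_add = running_add * gamma + ep_rs[t]
    discounted[t] = running_add
# discounted == [2.9701, 1.99, 1.0]
normalized = (discounted - discounted.mean()) / discounted.std()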
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/7_Policy_gradient_softmax/run_CartPole.py:
--------------------------------------------------------------------------------
1 | """
2 | Policy Gradient, Reinforcement Learning.
3 |
4 | The cart pole example
5 |
6 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
7 |
8 | Using:
9 | Tensorflow: 1.0
10 | gym: 0.8.0
11 | """
12 |
13 | import gym
14 | from RL_brain import PolicyGradient
15 | import matplotlib.pyplot as plt
16 |
17 | DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater than this threshold
18 | RENDER = False # rendering wastes time
19 |
20 | env = gym.make('CartPole-v0')
21 | env.seed(1) # reproducible, general Policy gradient has high variance
22 | env = env.unwrapped
23 |
24 | print(env.action_space)
25 | print(env.observation_space)
26 | print(env.observation_space.high)
27 | print(env.observation_space.low)
28 |
29 | RL = PolicyGradient(
30 | n_actions=env.action_space.n,
31 | n_features=env.observation_space.shape[0],
32 | learning_rate=0.02,
33 | reward_decay=0.99,
34 | # output_graph=True,
35 | )
36 |
37 | for i_episode in range(3000):
38 |
39 | observation = env.reset()
40 |
41 | while True:
42 | if RENDER: env.render()
43 |
44 | action = RL.choose_action(observation)
45 |
46 | observation_, reward, done, info = env.step(action)
47 |
48 | RL.store_transition(observation, action, reward)
49 |
50 | if done:
51 | ep_rs_sum = sum(RL.ep_rs)
52 |
53 | if 'running_reward' not in globals():
54 | running_reward = ep_rs_sum
55 | else:
56 | running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
57 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
58 | print("episode:", i_episode, " reward:", int(running_reward))
59 |
60 | vt = RL.learn()
61 |
62 | if i_episode == 0:
63 | plt.plot(vt) # plot the episode vt
64 | plt.xlabel('episode steps')
65 | plt.ylabel('normalized state-action value')
66 | plt.show()
67 | break
68 |
69 | observation = observation_
70 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/7_Policy_gradient_softmax/run_MountainCar.py:
--------------------------------------------------------------------------------
1 | """
2 | Policy Gradient, Reinforcement Learning.
3 |
4 | The mountain car example
5 |
6 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
7 |
8 | Using:
9 | Tensorflow: 1.0
10 | gym: 0.8.0
11 | """
12 |
13 | import gym
14 | from RL_brain import PolicyGradient
15 | import matplotlib.pyplot as plt
16 |
17 | DISPLAY_REWARD_THRESHOLD = -2000 # renders environment if total episode reward is greater than this threshold
18 | # episode: 154 reward: -10667
19 | # episode: 387 reward: -2009
20 | # episode: 489 reward: -1006
21 | # episode: 628 reward: -502
22 |
23 | RENDER = False # rendering wastes time
24 |
25 | env = gym.make('MountainCar-v0')
26 | env.seed(1) # reproducible, general Policy gradient has high variance
27 | env = env.unwrapped
28 |
29 | print(env.action_space)
30 | print(env.observation_space)
31 | print(env.observation_space.high)
32 | print(env.observation_space.low)
33 |
34 | RL = PolicyGradient(
35 | n_actions=env.action_space.n,
36 | n_features=env.observation_space.shape[0],
37 | learning_rate=0.02,
38 | reward_decay=0.995,
39 | # output_graph=True,
40 | )
41 |
42 | for i_episode in range(1000):
43 |
44 | observation = env.reset()
45 |
46 | while True:
47 | if RENDER: env.render()
48 |
49 | action = RL.choose_action(observation)
50 |
51 | observation_, reward, done, info = env.step(action) # reward = -1 in all cases
52 |
53 | RL.store_transition(observation, action, reward)
54 |
55 | if done:
56 | # calculate running reward
57 | ep_rs_sum = sum(RL.ep_rs)
58 | if 'running_reward' not in globals():
59 | running_reward = ep_rs_sum
60 | else:
61 | running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
62 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
63 |
64 | print("episode:", i_episode, " reward:", int(running_reward))
65 |
66 | vt = RL.learn() # train
67 |
68 | if i_episode == 30:
69 | plt.plot(vt) # plot the episode vt
70 | plt.xlabel('episode steps')
71 | plt.ylabel('normalized state-action value')
72 | plt.show()
73 |
74 | break
75 |
76 | observation = observation_
77 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/8_Actor_Critic_Advantage/AC_CartPole.py:
--------------------------------------------------------------------------------
1 | """
2 | Actor-Critic using TD-error as the Advantage, Reinforcement Learning.
3 |
4 | The cart pole example. The policy oscillates.
5 |
6 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
7 |
8 | Using:
9 | tensorflow 1.0
10 | gym 0.8.0
11 | """
12 |
13 | import numpy as np
14 | import tensorflow as tf
15 | import gym
16 |
17 | np.random.seed(2)
18 | tf.set_random_seed(2) # reproducible
19 |
20 | # Hyperparameters
21 | OUTPUT_GRAPH = False
22 | MAX_EPISODE = 3000
23 | DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater than this threshold
24 | MAX_EP_STEPS = 1000 # maximum time step in one episode
25 | RENDER = False # rendering wastes time
26 | GAMMA = 0.9 # reward discount in TD error
27 | LR_A = 0.001 # learning rate for actor
28 | LR_C = 0.01 # learning rate for critic
29 |
30 | env = gym.make('CartPole-v0')
31 | env.seed(1) # reproducible
32 | env = env.unwrapped
33 |
34 | N_F = env.observation_space.shape[0]
35 | N_A = env.action_space.n
36 |
37 |
38 | class Actor(object):
39 | def __init__(self, sess, n_features, n_actions, lr=0.001):
40 | self.sess = sess
41 |
42 | self.s = tf.placeholder(tf.float32, [1, n_features], "state")
43 | self.a = tf.placeholder(tf.int32, None, "act")
44 | self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error
45 |
46 | with tf.variable_scope('Actor'):
47 | l1 = tf.layers.dense(
48 | inputs=self.s,
49 | units=20, # number of hidden units
50 | activation=tf.nn.relu,
51 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
52 | bias_initializer=tf.constant_initializer(0.1), # biases
53 | name='l1'
54 | )
55 |
56 | self.acts_prob = tf.layers.dense(
57 | inputs=l1,
58 | units=n_actions, # output units
59 | activation=tf.nn.softmax, # get action probabilities
60 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
61 | bias_initializer=tf.constant_initializer(0.1), # biases
62 | name='acts_prob'
63 | )
64 |
65 | with tf.variable_scope('exp_v'):
66 | log_prob = tf.log(self.acts_prob[0, self.a])
67 | self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss
68 |
69 | with tf.variable_scope('train'):
70 | self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v)
71 |
72 | def learn(self, s, a, td):
73 | s = s[np.newaxis, :]
74 | feed_dict = {self.s: s, self.a: a, self.td_error: td}
75 | _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
76 | return exp_v
77 |
78 | def choose_action(self, s):
79 | s = s[np.newaxis, :]
80 | probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions
81 | return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return an int
82 |
83 |
84 | class Critic(object):
85 | def __init__(self, sess, n_features, lr=0.01):
86 | self.sess = sess
87 |
88 | self.s = tf.placeholder(tf.float32, [1, n_features], "state")
89 | self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
90 | self.r = tf.placeholder(tf.float32, None, 'r')
91 |
92 | with tf.variable_scope('Critic'):
93 | l1 = tf.layers.dense(
94 | inputs=self.s,
95 | units=20, # number of hidden units
96 | activation=tf.nn.relu, # None
97 | # in theory this should be linear to guarantee the actor's convergence,
98 | # but a linear approximator hardly seems to learn the correct Q.
99 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
100 | bias_initializer=tf.constant_initializer(0.1), # biases
101 | name='l1'
102 | )
103 |
104 | self.v = tf.layers.dense(
105 | inputs=l1,
106 | units=1, # output units
107 | activation=None,
108 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
109 | bias_initializer=tf.constant_initializer(0.1), # biases
110 | name='V'
111 | )
112 |
113 | with tf.variable_scope('squared_TD_error'):
114 | self.td_error = self.r + GAMMA * self.v_ - self.v
115 | self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
116 | with tf.variable_scope('train'):
117 | self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
118 |
119 | def learn(self, s, r, s_):
120 | s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
121 |
122 | v_ = self.sess.run(self.v, {self.s: s_})
123 | td_error, _ = self.sess.run([self.td_error, self.train_op],
124 | {self.s: s, self.v_: v_, self.r: r})
125 | return td_error
126 |
127 |
128 | sess = tf.Session()
129 |
130 | actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
131 | critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor
132 |
133 | sess.run(tf.global_variables_initializer())
134 |
135 | if OUTPUT_GRAPH:
136 | tf.summary.FileWriter("logs/", sess.graph)
137 |
138 | for i_episode in range(MAX_EPISODE):
139 | s = env.reset()
140 | t = 0
141 | track_r = []
142 | while True:
143 | if RENDER: env.render()
144 |
145 | a = actor.choose_action(s)
146 |
147 | s_, r, done, info = env.step(a)
148 |
149 | if done: r = -20
150 |
151 | track_r.append(r)
152 |
153 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)]
154 | actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error]
155 |
156 | s = s_
157 | t += 1
158 |
159 | if done or t >= MAX_EP_STEPS:
160 | ep_rs_sum = sum(track_r)
161 |
162 | if 'running_reward' not in globals():
163 | running_reward = ep_rs_sum
164 | else:
165 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
166 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
167 | print("episode:", i_episode, " reward:", int(running_reward))
168 | break
169 |
170 |
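# Schematic sketch of the actor/critic coupling in the loop above, with scalars
# standing in for the network outputs (values are made up).
gamma, r = 0.9, 1.0
V_s, V_s_ = 2.0, 2.5          # critic's value estimates for s and s_

# Critic: regress V(s) towards r + gamma * V(s_); the residual is the TD error.
td_error = r + gamma * V_s_ - V_s            # 1.25
# Actor: use that TD error as the advantage, i.e. maximize log(pi(a|s)) * td_error
# (the code above minimizes its negative with Adam).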
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/8_Actor_Critic_Advantage/AC_continue_Pendulum.py:
--------------------------------------------------------------------------------
1 | """
2 | Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
3 |
4 | The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)
5 |
6 | Does not converge reliably; the policy oscillates.
7 |
8 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
9 |
10 | Using:
11 | tensorflow r1.3
12 | gym 0.8.0
13 | """
14 |
15 | import tensorflow as tf
16 | import numpy as np
17 | import gym
18 |
19 | np.random.seed(2)
20 | tf.set_random_seed(2) # reproducible
21 |
22 |
23 | class Actor(object):
24 | def __init__(self, sess, n_features, action_bound, lr=0.0001):
25 | self.sess = sess
26 |
27 | self.s = tf.placeholder(tf.float32, [1, n_features], "state")
28 | self.a = tf.placeholder(tf.float32, None, name="act")
29 | self.td_error = tf.placeholder(tf.float32, None, name="td_error") # TD_error
30 |
31 | l1 = tf.layers.dense(
32 | inputs=self.s,
33 | units=30, # number of hidden units
34 | activation=tf.nn.relu,
35 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
36 | bias_initializer=tf.constant_initializer(0.1), # biases
37 | name='l1'
38 | )
39 |
40 | mu = tf.layers.dense(
41 | inputs=l1,
42 | units=1, # number of hidden units
43 | activation=tf.nn.tanh,
44 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
45 | bias_initializer=tf.constant_initializer(0.1), # biases
46 | name='mu'
47 | )
48 |
49 | sigma = tf.layers.dense(
50 | inputs=l1,
51 | units=1, # output units
52 | activation=tf.nn.softplus, # get action probabilities
53 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
54 | bias_initializer=tf.constant_initializer(1.), # biases
55 | name='sigma'
56 | )
57 | global_step = tf.Variable(0, trainable=False)
58 | # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
59 | self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
60 | self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
61 |
62 | self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])
63 |
64 | with tf.name_scope('exp_v'):
65 | log_prob = self.normal_dist.log_prob(self.a) # loss without advantage
66 | self.exp_v = log_prob * self.td_error # advantage (TD_error) guided loss
67 | # Add cross entropy cost to encourage exploration
68 | self.exp_v += 0.01*self.normal_dist.entropy()
69 |
70 | with tf.name_scope('train'):
71 | self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step) # minimize(-exp_v) = maximize(exp_v)
72 |
73 | def learn(self, s, a, td):
74 | s = s[np.newaxis, :]
75 | feed_dict = {self.s: s, self.a: a, self.td_error: td}
76 | _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
77 | return exp_v
78 |
79 | def choose_action(self, s):
80 | s = s[np.newaxis, :]
81 | return self.sess.run(self.action, {self.s: s}) # get probabilities for all actions
82 |
83 |
84 | class Critic(object):
85 | def __init__(self, sess, n_features, lr=0.01):
86 | self.sess = sess
87 | with tf.name_scope('inputs'):
88 | self.s = tf.placeholder(tf.float32, [1, n_features], "state")
89 | self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
90 | self.r = tf.placeholder(tf.float32, name='r')
91 |
92 | with tf.variable_scope('Critic'):
93 | l1 = tf.layers.dense(
94 | inputs=self.s,
95 | units=30, # number of hidden units
96 | activation=tf.nn.relu,
97 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
98 | bias_initializer=tf.constant_initializer(0.1), # biases
99 | name='l1'
100 | )
101 |
102 | self.v = tf.layers.dense(
103 | inputs=l1,
104 | units=1, # output units
105 | activation=None,
106 | kernel_initializer=tf.random_normal_initializer(0., .1), # weights
107 | bias_initializer=tf.constant_initializer(0.1), # biases
108 | name='V'
109 | )
110 |
111 | with tf.variable_scope('squared_TD_error'):
112 | self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
113 | self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval
114 | with tf.variable_scope('train'):
115 | self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
116 |
117 | def learn(self, s, r, s_):
118 | s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
119 |
120 | v_ = self.sess.run(self.v, {self.s: s_})
121 | td_error, _ = self.sess.run([self.td_error, self.train_op],
122 | {self.s: s, self.v_: v_, self.r: r})
123 | return td_error
124 |
125 |
126 | OUTPUT_GRAPH = False
127 | MAX_EPISODE = 1000
128 | MAX_EP_STEPS = 200
129 | DISPLAY_REWARD_THRESHOLD = -100 # renders environment if total episode reward is greater than this threshold
130 | RENDER = False # rendering wastes time
131 | GAMMA = 0.9
132 | LR_A = 0.001 # learning rate for actor
133 | LR_C = 0.01 # learning rate for critic
134 |
135 | env = gym.make('Pendulum-v0')
136 | env.seed(1) # reproducible
137 | env = env.unwrapped
138 |
139 | N_S = env.observation_space.shape[0]
140 | A_BOUND = env.action_space.high
141 |
142 | sess = tf.Session()
143 |
144 | actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
145 | critic = Critic(sess, n_features=N_S, lr=LR_C)
146 |
147 | sess.run(tf.global_variables_initializer())
148 |
149 | if OUTPUT_GRAPH:
150 | tf.summary.FileWriter("logs/", sess.graph)
151 |
152 | for i_episode in range(MAX_EPISODE):
153 | s = env.reset()
154 | t = 0
155 | ep_rs = []
156 | while True:
157 | # if RENDER:
158 | env.render()
159 | a = actor.choose_action(s)
160 |
161 | s_, r, done, info = env.step(a)
162 | r /= 10
163 |
164 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)]
165 | actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error]
166 |
167 | s = s_
168 | t += 1
169 | ep_rs.append(r)
170 | if t > MAX_EP_STEPS:
171 | ep_rs_sum = sum(ep_rs)
172 | if 'running_reward' not in globals():
173 | running_reward = ep_rs_sum
174 | else:
175 | running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
176 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering
177 | print("episode:", i_episode, " reward:", int(running_reward))
178 | break
179 |
180 |
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update.py:
--------------------------------------------------------------------------------
1 | """
2 | Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
3 | DDPG is an Actor-Critic based algorithm.
4 | Pendulum example.
5 |
6 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
7 |
8 | Using:
9 | tensorflow 1.0
10 | gym 0.8.0
11 | """
12 |
13 | import tensorflow as tf
14 | import numpy as np
15 | import gym
16 | import time
17 |
18 |
19 | ##################### hyper parameters ####################
20 |
21 | MAX_EPISODES = 200
22 | MAX_EP_STEPS = 200
23 | LR_A = 0.001 # learning rate for actor
24 | LR_C = 0.002 # learning rate for critic
25 | GAMMA = 0.9 # reward discount
26 | TAU = 0.01 # soft replacement
27 | MEMORY_CAPACITY = 10000
28 | BATCH_SIZE = 32
29 |
30 | RENDER = False
31 | ENV_NAME = 'Pendulum-v0'
32 |
33 | ############################### DDPG ####################################
34 |
35 | class DDPG(object):
36 | def __init__(self, a_dim, s_dim, a_bound,):
37 | self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
38 | self.pointer = 0
39 | self.sess = tf.Session()
40 |
41 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
42 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
43 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
44 | self.R = tf.placeholder(tf.float32, [None, 1], 'r')
45 |
46 | with tf.variable_scope('Actor'):
47 | self.a = self._build_a(self.S, scope='eval', trainable=True)
48 | a_ = self._build_a(self.S_, scope='target', trainable=False)
49 | with tf.variable_scope('Critic'):
50 | # when calculating q for the td_error, self.a is fed with the actions stored in memory;
51 | # when updating the Actor, self.a comes from the Actor network itself
52 | q = self._build_c(self.S, self.a, scope='eval', trainable=True)
53 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False)
54 |
55 | # networks parameters
56 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
57 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
58 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
59 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
60 |
61 | # target net replacement
62 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
63 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]
64 |
65 | q_target = self.R + GAMMA * q_
66 | # in the feed_dict for the td_error, self.a should be replaced with the actions stored in memory
67 | td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
68 | self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)
69 |
70 | a_loss = - tf.reduce_mean(q) # maximize the q
71 | self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
72 |
73 | self.sess.run(tf.global_variables_initializer())
74 |
75 | def choose_action(self, s):
76 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]
77 |
78 | def learn(self):
79 | # soft target replacement
80 | self.sess.run(self.soft_replace)
81 |
82 | indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
83 | bt = self.memory[indices, :]
84 | bs = bt[:, :self.s_dim]
85 | ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
86 | br = bt[:, -self.s_dim - 1: -self.s_dim]
87 | bs_ = bt[:, -self.s_dim:]
88 |
89 | self.sess.run(self.atrain, {self.S: bs})
90 | self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
91 |
92 | def store_transition(self, s, a, r, s_):
93 | transition = np.hstack((s, a, [r], s_))
94 | index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
95 | self.memory[index, :] = transition
96 | self.pointer += 1
97 |
98 | def _build_a(self, s, scope, trainable):
99 | with tf.variable_scope(scope):
100 | net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
101 | a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
102 | return tf.multiply(a, self.a_bound, name='scaled_a')
103 |
104 | def _build_c(self, s, a, scope, trainable):
105 | with tf.variable_scope(scope):
106 | n_l1 = 30
107 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
108 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
109 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
110 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
111 | return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)
112 |
113 | ############################### training ####################################
114 |
115 | env = gym.make(ENV_NAME)
116 | env = env.unwrapped
117 | env.seed(1)
118 |
119 | s_dim = env.observation_space.shape[0]
120 | a_dim = env.action_space.shape[0]
121 | a_bound = env.action_space.high
122 |
123 | ddpg = DDPG(a_dim, s_dim, a_bound)
124 |
125 | var = 3 # control exploration
126 | t1 = time.time()
127 | for i in range(MAX_EPISODES):
128 | s = env.reset()
129 | ep_reward = 0
130 | for j in range(MAX_EP_STEPS):
131 | if RENDER:
132 | env.render()
133 |
134 | # Add exploration noise
135 | a = ddpg.choose_action(s)
136 | a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
137 | s_, r, done, info = env.step(a)
138 |
139 | ddpg.store_transition(s, a, r / 10, s_)
140 |
141 | if ddpg.pointer > MEMORY_CAPACITY:
142 | var *= .9995 # decay the action randomness
143 | ddpg.learn()
144 |
145 | s = s_
146 | ep_reward += r
147 | if j == MAX_EP_STEPS-1:
148 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
149 | # if ep_reward > -300:RENDER = True
150 | break
151 | print('Running time: ', time.time() - t1)
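# Equivalent NumPy sketch of the soft_replace op above: each learning step every target
# parameter moves a fraction TAU towards its eval-network counterpart (values made up).
import numpy as np

TAU = 0.01
t = np.array([0.5, -1.0])     # target-network parameter
e = np.array([0.8, -0.4])     # eval-network parameter
t = (1 - TAU) * t + TAU * e   # -> [0.503, -0.994]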
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update2.py:
--------------------------------------------------------------------------------
1 | """
2 | Note: This is an updated version of my previous code.
3 | For the target network, I use a moving average to softly replace the target parameters instead of using the assign function.
4 | This gives roughly a 20% speed-up on my machine (CPU).
5 |
6 | Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
7 | DDPG is an Actor-Critic based algorithm.
8 | Pendulum example.
9 |
10 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/
11 |
12 | Using:
13 | tensorflow 1.0
14 | gym 0.8.0
15 | """
16 |
17 | import tensorflow as tf
18 | import numpy as np
19 | import gym
20 | import time
21 |
22 |
23 | ##################### hyper parameters ####################
24 |
25 | MAX_EPISODES = 200
26 | MAX_EP_STEPS = 200
27 | LR_A = 0.001 # learning rate for actor
28 | LR_C = 0.002 # learning rate for critic
29 | GAMMA = 0.9 # reward discount
30 | TAU = 0.01 # soft replacement
31 | MEMORY_CAPACITY = 10000
32 | BATCH_SIZE = 32
33 |
34 | RENDER = False
35 | ENV_NAME = 'Pendulum-v0'
36 |
37 |
38 | ############################### DDPG ####################################
39 |
40 |
41 | class DDPG(object):
42 | def __init__(self, a_dim, s_dim, a_bound,):
43 | self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
44 | self.pointer = 0
45 | self.sess = tf.Session()
46 |
47 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
48 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
49 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
50 | self.R = tf.placeholder(tf.float32, [None, 1], 'r')
51 |
52 | self.a = self._build_a(self.S,)
53 | q = self._build_c(self.S, self.a, )
54 | a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
55 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
56 | ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement
57 |
58 | def ema_getter(getter, name, *args, **kwargs):
59 | return ema.average(getter(name, *args, **kwargs))
60 |
61 | target_update = [ema.apply(a_params), ema.apply(c_params)] # soft update operation
62 | a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter) # replaced target parameters
63 | q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)
64 |
65 | a_loss = - tf.reduce_mean(q) # maximize the q
66 | self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
67 |
68 | with tf.control_dependencies(target_update): # soft replacement happens here
69 | q_target = self.R + GAMMA * q_
70 | td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
71 | self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)
72 |
73 | self.sess.run(tf.global_variables_initializer())
74 |
75 | def choose_action(self, s):
76 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]
77 |
78 | def learn(self):
79 | indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
80 | bt = self.memory[indices, :]
81 | bs = bt[:, :self.s_dim]
82 | ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
83 | br = bt[:, -self.s_dim - 1: -self.s_dim]
84 | bs_ = bt[:, -self.s_dim:]
85 |
86 | self.sess.run(self.atrain, {self.S: bs})
87 | self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
88 |
89 | def store_transition(self, s, a, r, s_):
90 | transition = np.hstack((s, a, [r], s_))
91 | index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
92 | self.memory[index, :] = transition
93 | self.pointer += 1
94 |
95 | def _build_a(self, s, reuse=None, custom_getter=None):
96 | trainable = True if reuse is None else False
97 | with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
98 | net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
99 | a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
100 | return tf.multiply(a, self.a_bound, name='scaled_a')
101 |
102 | def _build_c(self, s, a, reuse=None, custom_getter=None):
103 | trainable = True if reuse is None else False
104 | with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
105 | n_l1 = 30
106 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
107 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
108 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
109 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
110 | return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)
111 |
112 |
113 | ############################### training ####################################
114 |
115 | env = gym.make(ENV_NAME)
116 | env = env.unwrapped
117 | env.seed(1)
118 |
119 | s_dim = env.observation_space.shape[0]
120 | a_dim = env.action_space.shape[0]
121 | a_bound = env.action_space.high
122 |
123 | ddpg = DDPG(a_dim, s_dim, a_bound)
124 |
125 | var = 3 # control exploration
126 | t1 = time.time()
127 | for i in range(MAX_EPISODES):
128 | s = env.reset()
129 | ep_reward = 0
130 | for j in range(MAX_EP_STEPS):
131 | if RENDER:
132 | env.render()
133 |
134 | # Add exploration noise
135 | a = ddpg.choose_action(s)
136 | a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
137 | s_, r, done, info = env.step(a)
138 |
139 | ddpg.store_transition(s, a, r / 10, s_)
140 |
141 | if ddpg.pointer > MEMORY_CAPACITY:
142 | var *= .9995 # decay the action randomness
143 | ddpg.learn()
144 |
145 | s = s_
146 | ep_reward += r
147 | if j == MAX_EP_STEPS-1:
148 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
149 | # if ep_reward > -300:RENDER = True
150 | break
151 |
152 | print('Running time: ', time.time() - t1)
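The target networks above never get an explicit assign op: wrapping `a_params` and `c_params` in `tf.train.ExponentialMovingAverage(decay=1 - TAU)` and reading them back through `ema_getter` amounts to the standard DDPG soft update

$$\theta' \leftarrow (1-\tau)\,\theta' + \tau\,\theta, \qquad \tau = \text{TAU},$$

and the `tf.control_dependencies(target_update)` block simply forces this shadow-copy update to run together with every critic training step.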
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/Curiosity_Model/Curiosity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/rl/Reinforcement-learning-with-tensorflow/contents/Curiosity_Model/Curiosity.png
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/Curiosity_Model/Curiosity.py:
--------------------------------------------------------------------------------
1 | """This is a simple implementation of [Large-Scale Study of Curiosity-Driven Learning](https://arxiv.org/abs/1808.04355)"""
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 | import gym
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | class CuriosityNet:
10 | def __init__(
11 | self,
12 | n_a,
13 | n_s,
14 | lr=0.01,
15 | gamma=0.98,
16 | epsilon=0.95,
17 | replace_target_iter=300,
18 | memory_size=10000,
19 | batch_size=128,
20 | output_graph=False,
21 | ):
22 | self.n_a = n_a
23 | self.n_s = n_s
24 | self.lr = lr
25 | self.gamma = gamma
26 | self.epsilon = epsilon
27 | self.replace_target_iter = replace_target_iter
28 | self.memory_size = memory_size
29 | self.batch_size = batch_size
30 |
31 | # total learning step
32 | self.learn_step_counter = 0
33 | self.memory_counter = 0
34 |
35 | # initialize zero memory [s, a, r, s_]
36 | self.memory = np.zeros((self.memory_size, n_s * 2 + 2))
37 | self.tfs, self.tfa, self.tfr, self.tfs_, self.dyn_train, self.dqn_train, self.q, self.int_r = \
38 | self._build_nets()
39 |
40 | t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
41 | e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
42 |
43 | with tf.variable_scope('hard_replacement'):
44 | self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
45 |
46 | self.sess = tf.Session()
47 |
48 | if output_graph:
49 | tf.summary.FileWriter("logs/", self.sess.graph)
50 |
51 | self.sess.run(tf.global_variables_initializer())
52 |
53 | def _build_nets(self):
54 | tfs = tf.placeholder(tf.float32, [None, self.n_s], name="s") # input State
55 | tfa = tf.placeholder(tf.int32, [None, ], name="a") # input Action
56 | tfr = tf.placeholder(tf.float32, [None, ], name="ext_r") # extrinsic reward
57 | tfs_ = tf.placeholder(tf.float32, [None, self.n_s], name="s_") # input Next State
58 |
59 | # dynamics net
60 | dyn_s_, curiosity, dyn_train = self._build_dynamics_net(tfs, tfa, tfs_)
61 |
62 | # normal RL model
63 | total_reward = tf.add(curiosity, tfr, name="total_r")
64 | q, dqn_loss, dqn_train = self._build_dqn(tfs, tfa, total_reward, tfs_)
65 | return tfs, tfa, tfr, tfs_, dyn_train, dqn_train, q, curiosity
66 |
67 | def _build_dynamics_net(self, s, a, s_):
68 | with tf.variable_scope("dyn_net"):
69 | float_a = tf.expand_dims(tf.cast(a, dtype=tf.float32, name="float_a"), axis=1, name="2d_a")
70 | sa = tf.concat((s, float_a), axis=1, name="sa")
71 | encoded_s_ = s_ # here we use s_ as the encoded s_
72 |
73 | dyn_l = tf.layers.dense(sa, 32, activation=tf.nn.relu)
74 | dyn_s_ = tf.layers.dense(dyn_l, self.n_s) # predicted s_
75 | with tf.name_scope("int_r"):
76 | squared_diff = tf.reduce_sum(tf.square(encoded_s_ - dyn_s_), axis=1) # intrinsic reward
77 |
78 | # It is better to reduce the learning rate in order to stay curious
79 | train_op = tf.train.RMSPropOptimizer(self.lr, name="dyn_opt").minimize(tf.reduce_mean(squared_diff))
80 | return dyn_s_, squared_diff, train_op
81 |
82 | def _build_dqn(self, s, a, r, s_):
83 | with tf.variable_scope('eval_net'):
84 | e1 = tf.layers.dense(s, 128, tf.nn.relu)
85 | q = tf.layers.dense(e1, self.n_a, name="q")
86 | with tf.variable_scope('target_net'):
87 | t1 = tf.layers.dense(s_, 128, tf.nn.relu)
88 | q_ = tf.layers.dense(t1, self.n_a, name="q_")
89 |
90 | with tf.variable_scope('q_target'):
91 | q_target = r + self.gamma * tf.reduce_max(q_, axis=1, name="Qmax_s_")
92 |
93 | with tf.variable_scope('q_wrt_a'):
94 | a_indices = tf.stack([tf.range(tf.shape(a)[0], dtype=tf.int32), a], axis=1)
95 | q_wrt_a = tf.gather_nd(params=q, indices=a_indices)
96 |
97 | loss = tf.losses.mean_squared_error(labels=q_target, predictions=q_wrt_a) # TD error
98 | train_op = tf.train.RMSPropOptimizer(self.lr, name="dqn_opt").minimize(
99 | loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "eval_net"))
100 | return q, loss, train_op
101 |
102 | def store_transition(self, s, a, r, s_):
103 | transition = np.hstack((s, [a, r], s_))
104 | # replace the old memory with new memory
105 | index = self.memory_counter % self.memory_size
106 | self.memory[index, :] = transition
107 | self.memory_counter += 1
108 |
109 | def choose_action(self, observation):
110 | # to have batch dimension when feed into tf placeholder
111 | s = observation[np.newaxis, :]
112 |
113 | if np.random.uniform() < self.epsilon:
114 | # forward feed the observation and get q value for every actions
115 | actions_value = self.sess.run(self.q, feed_dict={self.tfs: s})
116 | action = np.argmax(actions_value)
117 | else:
118 | action = np.random.randint(0, self.n_a)
119 | return action
120 |
121 | def learn(self):
122 | # check to replace target parameters
123 | if self.learn_step_counter % self.replace_target_iter == 0:
124 | self.sess.run(self.target_replace_op)
125 |
126 | # sample batch memory from all memory
127 | top = self.memory_size if self.memory_counter > self.memory_size else self.memory_counter
128 | sample_index = np.random.choice(top, size=self.batch_size)
129 | batch_memory = self.memory[sample_index, :]
130 |
131 | bs, ba, br, bs_ = batch_memory[:, :self.n_s], batch_memory[:, self.n_s], \
132 | batch_memory[:, self.n_s + 1], batch_memory[:, -self.n_s:]
133 | self.sess.run(self.dqn_train, feed_dict={self.tfs: bs, self.tfa: ba, self.tfr: br, self.tfs_: bs_})
134 | if self.learn_step_counter % 1000 == 0: # delay training in order to stay curious
135 | self.sess.run(self.dyn_train, feed_dict={self.tfs: bs, self.tfa: ba, self.tfs_: bs_})
136 | self.learn_step_counter += 1
137 |
138 |
139 | env = gym.make('MountainCar-v0')
140 | env = env.unwrapped
141 |
142 | dqn = CuriosityNet(n_a=3, n_s=2, lr=0.01, output_graph=False)
143 | ep_steps = []
144 | for epi in range(200):
145 | s = env.reset()
146 | steps = 0
147 | while True:
148 | env.render()
149 | a = dqn.choose_action(s)
150 | s_, r, done, info = env.step(a)
151 | dqn.store_transition(s, a, r, s_)
152 | dqn.learn()
153 | if done:
154 | print('Epi: ', epi, "| steps: ", steps)
155 | ep_steps.append(steps)
156 | break
157 | s = s_
158 | steps += 1
159 |
160 | plt.plot(ep_steps)
161 | plt.ylabel("steps")
162 | plt.xlabel("episode")
163 | plt.show()
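In the dynamics-based curiosity model above, the intrinsic reward is the forward model's prediction error on the (un-encoded) next state, and it is added to the environment reward before the DQN target is formed:

$$r^{int}_t = \lVert \hat{s}_{t+1} - s_{t+1} \rVert_2^2, \qquad y_t = (r^{ext}_t + r^{int}_t) + \gamma \max_{a'} Q'(s_{t+1}, a'),$$

where $\hat{s}_{t+1}$ is `dyn_s_`, the output of the small dense network fed with the state-action pair `sa`.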
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/contents/Curiosity_Model/Random_Network_Distillation.py:
--------------------------------------------------------------------------------
1 | """This is a simple implementation of [Exploration by Random Network Distillation](https://arxiv.org/abs/1810.12894)"""
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 | import gym
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | class CuriosityNet:
10 | def __init__(
11 | self,
12 | n_a,
13 | n_s,
14 | lr=0.01,
15 | gamma=0.95,
16 | epsilon=1.,
17 | replace_target_iter=300,
18 | memory_size=10000,
19 | batch_size=128,
20 | output_graph=False,
21 | ):
22 | self.n_a = n_a
23 | self.n_s = n_s
24 | self.lr = lr
25 | self.gamma = gamma
26 | self.epsilon = epsilon
27 | self.replace_target_iter = replace_target_iter
28 | self.memory_size = memory_size
29 | self.batch_size = batch_size
30 | self.s_encode_size = 1000 # give a hard job for predictor to learn
31 |
32 | # total learning step
33 | self.learn_step_counter = 0
34 | self.memory_counter = 0
35 |
36 | # initialize zero memory [s, a, r, s_]
37 | self.memory = np.zeros((self.memory_size, n_s * 2 + 2))
38 | self.tfs, self.tfa, self.tfr, self.tfs_, self.pred_train, self.dqn_train, self.q = \
39 | self._build_nets()
40 |
41 | t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
42 | e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
43 |
44 | with tf.variable_scope('hard_replacement'):
45 | self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
46 |
47 | self.sess = tf.Session()
48 |
49 | if output_graph:
50 | tf.summary.FileWriter("logs/", self.sess.graph)
51 |
52 | self.sess.run(tf.global_variables_initializer())
53 |
54 | def _build_nets(self):
55 | tfs = tf.placeholder(tf.float32, [None, self.n_s], name="s") # input State
56 | tfa = tf.placeholder(tf.int32, [None, ], name="a") # input Action
57 | tfr = tf.placeholder(tf.float32, [None, ], name="ext_r") # extrinsic reward
58 | tfs_ = tf.placeholder(tf.float32, [None, self.n_s], name="s_") # input Next State
59 |
60 | # fixed random net
61 | with tf.variable_scope("random_net"):
62 | rand_encode_s_ = tf.layers.dense(tfs_, self.s_encode_size)
63 |
64 | # predictor
65 | ri, pred_train = self._build_predictor(tfs_, rand_encode_s_)
66 |
67 | # normal RL model
68 | q, dqn_loss, dqn_train = self._build_dqn(tfs, tfa, ri, tfr, tfs_)
69 | return tfs, tfa, tfr, tfs_, pred_train, dqn_train, q
70 |
71 | def _build_predictor(self, s_, rand_encode_s_):
72 | with tf.variable_scope("predictor"):
73 | net = tf.layers.dense(s_, 128, tf.nn.relu)
74 | out = tf.layers.dense(net, self.s_encode_size)
75 |
76 | with tf.name_scope("int_r"):
77 | ri = tf.reduce_sum(tf.square(rand_encode_s_ - out), axis=1) # intrinsic reward
78 | train_op = tf.train.RMSPropOptimizer(self.lr, name="predictor_opt").minimize(
79 | tf.reduce_mean(ri), var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "predictor"))
80 |
81 | return ri, train_op
82 |
83 | def _build_dqn(self, s, a, ri, re, s_):
84 | with tf.variable_scope('eval_net'):
85 | e1 = tf.layers.dense(s, 128, tf.nn.relu)
86 | q = tf.layers.dense(e1, self.n_a, name="q")
87 | with tf.variable_scope('target_net'):
88 | t1 = tf.layers.dense(s_, 128, tf.nn.relu)
89 | q_ = tf.layers.dense(t1, self.n_a, name="q_")
90 |
91 | with tf.variable_scope('q_target'):
92 | q_target = re + ri + self.gamma * tf.reduce_max(q_, axis=1, name="Qmax_s_")
93 |
94 | with tf.variable_scope('q_wrt_a'):
95 | a_indices = tf.stack([tf.range(tf.shape(a)[0], dtype=tf.int32), a], axis=1)
96 | q_wrt_a = tf.gather_nd(params=q, indices=a_indices)
97 |
98 | loss = tf.losses.mean_squared_error(labels=q_target, predictions=q_wrt_a) # TD error
99 | train_op = tf.train.RMSPropOptimizer(self.lr, name="dqn_opt").minimize(
100 | loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "eval_net"))
101 | return q, loss, train_op
102 |
103 | def store_transition(self, s, a, r, s_):
104 | transition = np.hstack((s, [a, r], s_))
105 | # replace the old memory with new memory
106 | index = self.memory_counter % self.memory_size
107 | self.memory[index, :] = transition
108 | self.memory_counter += 1
109 |
110 | def choose_action(self, observation):
111 | # to have batch dimension when feed into tf placeholder
112 | s = observation[np.newaxis, :]
113 |
114 | if np.random.uniform() < self.epsilon:
115 | # forward feed the observation and get q value for every actions
116 | actions_value = self.sess.run(self.q, feed_dict={self.tfs: s})
117 | action = np.argmax(actions_value)
118 | else:
119 | action = np.random.randint(0, self.n_a)
120 | return action
121 |
122 | def learn(self):
123 | # check to replace target parameters
124 | if self.learn_step_counter % self.replace_target_iter == 0:
125 | self.sess.run(self.target_replace_op)
126 |
127 | # sample batch memory from all memory
128 | top = self.memory_size if self.memory_counter > self.memory_size else self.memory_counter
129 | sample_index = np.random.choice(top, size=self.batch_size)
130 | batch_memory = self.memory[sample_index, :]
131 |
132 | bs, ba, br, bs_ = batch_memory[:, :self.n_s], batch_memory[:, self.n_s], \
133 | batch_memory[:, self.n_s + 1], batch_memory[:, -self.n_s:]
134 | self.sess.run(self.dqn_train, feed_dict={self.tfs: bs, self.tfa: ba, self.tfr: br, self.tfs_: bs_})
135 | if self.learn_step_counter % 100 == 0: # delay training in order to stay curious
136 | self.sess.run(self.pred_train, feed_dict={self.tfs_: bs_})
137 | self.learn_step_counter += 1
138 |
139 |
140 | env = gym.make('MountainCar-v0')
141 | env = env.unwrapped
142 |
143 | dqn = CuriosityNet(n_a=3, n_s=2, lr=0.01, output_graph=False)
144 | ep_steps = []
145 | for epi in range(200):
146 | s = env.reset()
147 | steps = 0
148 | while True:
149 | # env.render()
150 | a = dqn.choose_action(s)
151 | s_, r, done, info = env.step(a)
152 | dqn.store_transition(s, a, r, s_)
153 | dqn.learn()
154 | if done:
155 | print('Epi: ', epi, "| steps: ", steps)
156 | ep_steps.append(steps)
157 | break
158 | s = s_
159 | steps += 1
160 |
161 | plt.plot(ep_steps)
162 | plt.ylabel("steps")
163 | plt.xlabel("episode")
164 | plt.show()
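Random Network Distillation replaces the forward model with a fixed, randomly initialized encoder $\phi$ (the `random_net` scope, never trained) and a predictor $\hat{\phi}$ trained to imitate it; novelty of a next state shows up as predictor error:

$$r^{int}_t = \lVert \phi(s_{t+1}) - \hat{\phi}(s_{t+1}) \rVert_2^2, \qquad y_t = r^{ext}_t + r^{int}_t + \gamma \max_{a'} Q'(s_{t+1}, a').$$

Because `pred_train` only optimizes variables in the `predictor` scope, the random target network stays fixed, which is what makes this error a usable novelty signal.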
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/experiments/2D_car/collision.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def intersection():
4 | p = np.array([0, 0])
5 | r = np.array([1, 1])
6 | q = np.array([0.1, 0.1])
7 | s = np.array([.1, .1])
8 |
9 | if np.cross(r, s) == 0 and np.cross((q-p), r) == 0: # collinear
10 | # t0 = (q − p) · r / (r · r)
11 | # t1 = (q + s − p) · r / (r · r) = t0 + s · r / (r · r)
12 | t0 = np.dot(q-p, r)/np.dot(r, r)
13 | t1 = t0 + np.dot(s, r)/np.dot(r, r)
14 | print(t1, t0)
15 | if ((np.dot(s, r) > 0) and (0 <= t1 - t0 <= 1)) or ((np.dot(s, r) <= 0) and (0 <= t0 - t1 <= 1)):
16 | print('collinear and overlapping, q_s in p_r')
17 | else:
18 | print('collinear and disjoint')
19 | elif np.cross(r, s) == 0 and np.cross((q-p), r) != 0: # parallel r × s = 0 and (q − p) × r ≠ 0,
20 | print('parallel')
21 | else:
22 | t = np.cross((q - p), s) / np.cross(r, s)
23 | u = np.cross((q - p), r) / np.cross(r, s)
24 | if 0 <= t <= 1 and 0 <= u <= 1:
25 | # If r × s ≠ 0 and 0 ≤ t ≤ 1 and 0 ≤ u ≤ 1, the two line segments meet at the point p + t r = q + u s
26 | print('intersection: ', p + t*r)
27 | else:
28 | print('not parallel and not intersect')
29 |
30 |
31 | def point2segment():
32 | p = np.array([-1, 1]) # coordination of point
33 | a = np.array([0, 1]) # coordination of line segment end 1
34 | b = np.array([1, 0]) # coordination of line segment end 2
35 | ab = b-a # line ab
36 | ap = p-a
37 |     distance = np.abs(np.cross(ab, ap)/np.linalg.norm(ab))  # d = |AB x AP| / |AB|
38 | print(distance)
39 |
40 | # angle Cos(θ) = A dot B /(|A||B|)
41 | bp = p-b
42 | cosTheta1 = np.dot(ap, ab) / (np.linalg.norm(ap) * np.linalg.norm(ab))
43 | theta1 = np.arccos(cosTheta1)
44 | cosTheta2 = np.dot(bp, ab) / (np.linalg.norm(bp) * np.linalg.norm(ab))
45 | theta2 = np.arccos(cosTheta2)
46 |     if np.pi/2 <= (theta1 % (np.pi*2)) <= 3 * np.pi / 2:
47 | print('out of a')
48 | elif -np.pi/2 <= (theta2 % (np.pi*2)) <= np.pi/2:
49 | print('out of b')
50 | else:
51 | print('between a and b')
52 |
53 |
54 |
55 | if __name__ == '__main__':
56 | point2segment()
57 | # intersection()
58 |
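For reference, `intersection()` is the usual 2-D cross-product test for segments $p + t\,r$ and $q + u\,s$ with $t, u \in [0, 1]$: if $r \times s \neq 0$,

$$t = \frac{(q - p) \times s}{r \times s}, \qquad u = \frac{(q - p) \times r}{r \times s},$$

and the segments meet at $p + t\,r$ iff $0 \le t \le 1$ and $0 \le u \le 1$; $r \times s = 0$ with $(q - p) \times r = 0$ is the collinear case, otherwise the segments are parallel and disjoint. `point2segment()` uses the point-to-line distance $d = |AB \times AP| / |AB|$ plus the angles at the two end points to decide whether the perpendicular foot actually falls between $A$ and $B$.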
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/rl/Reinforcement-learning-with-tensorflow/experiments/Solve_BipedalWalker/log/events.out.tfevents.1490801027.Morvan
--------------------------------------------------------------------------------
/rl/Reinforcement-learning-with-tensorflow/experiments/Solve_LunarLander/run_LunarLander.py:
--------------------------------------------------------------------------------
1 | """
2 | Deep Q network,
3 |
4 | LunarLander-v2 example
5 |
6 | Using:
7 | Tensorflow: 1.0
8 | gym: 0.8.0
9 | """
10 |
11 |
12 | import gym
13 | from gym import wrappers
14 | from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay
15 |
16 | env = gym.make('LunarLander-v2')
17 | # env = env.unwrapped
18 | env.seed(1)
19 |
20 | N_A = env.action_space.n
21 | N_S = env.observation_space.shape[0]
22 | MEMORY_CAPACITY = 50000
23 | TARGET_REP_ITER = 2000
24 | MAX_EPISODES = 900
25 | E_GREEDY = 0.95
26 | E_INCREMENT = 0.00001
27 | GAMMA = 0.99
28 | LR = 0.0001
29 | BATCH_SIZE = 32
30 | HIDDEN = [400, 400]
31 | RENDER = True
32 |
33 | RL = DuelingDQNPrioritizedReplay(
34 | n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA,
35 | hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER,
36 | memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,)
37 |
38 |
39 | total_steps = 0
40 | running_r = 0
41 | r_scale = 100
42 | for i_episode in range(MAX_EPISODES):
43 | s = env.reset() # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
44 | ep_r = 0
45 | while True:
46 | if total_steps > MEMORY_CAPACITY: env.render()
47 | a = RL.choose_action(s)
48 | s_, r, done, _ = env.step(a)
49 | if r == -100: r = -30
50 | r /= r_scale
51 |
52 | ep_r += r
53 | RL.store_transition(s, a, r, s_)
54 | if total_steps > MEMORY_CAPACITY:
55 | RL.learn()
56 | if done:
57 | land = '| Landed' if r == 100/r_scale else '| ------'
58 | running_r = 0.99 * running_r + 0.01 * ep_r
59 | print('Epi: ', i_episode,
60 | land,
61 | '| Epi_R: ', round(ep_r, 2),
62 | '| Running_R: ', round(running_r, 2),
63 | '| Epsilon: ', round(RL.epsilon, 3))
64 | break
65 |
66 | s = s_
67 | total_steps += 1
68 |
69 |
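Two details worth noting in the loop above: the crash reward of −100 is clipped to −30 and every reward is divided by `r_scale = 100` before entering the replay buffer, and progress is tracked with an exponential moving average over episodes,

$$\bar{R} \leftarrow 0.99\,\bar{R} + 0.01\,R_{ep},$$

so `Running_R` reacts slowly and gives a smoother learning curve than the raw episode return.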
--------------------------------------------------------------------------------
/rl/openai/gym_hello.py:
--------------------------------------------------------------------------------
1 | import gym
2 | env = gym.make('CartPole-v0')
3 | env.reset()  # reset the environment to its initial state
4 | for _ in range(1000):
5 |     env.render()  # redraw the current frame
6 | env.step(env.action_space.sample()) # take a random action
--------------------------------------------------------------------------------
/rl/openai/gym_test.py:
--------------------------------------------------------------------------------
1 | import gym
2 | env = gym.make("Taxi-v1")
3 | observation = env.reset()
4 | for _ in range(1000):
5 | env.render()
6 | action = env.action_space.sample() # your agent here (this takes random actions)
7 | observation, reward, done, info = env.step(action)
--------------------------------------------------------------------------------
/rl/readme.md:
--------------------------------------------------------------------------------
1 | ## Reinforcement Learning
2 |
3 | - Install the DeepMind trfl toolkit:
4 | ```shell
5 | pip install git+git://github.com/deepmind/trfl.git
6 | ```
7 |
8 |
9 |
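After installing, a quick sanity check (a hypothetical snippet, not part of this repo; it assumes TensorFlow ≥ 1.8 is already installed) is just to import the package:

```python
# hypothetical install check: confirm trfl imports against the local TensorFlow
import tensorflow as tf  # trfl requires TF >= 1.8
import trfl

print(tf.__version__)   # installed TensorFlow version
print(trfl.__file__)    # where trfl was installed
```

See `rl/ucl/test.py` below for a small graph built with `trfl.qlearning`.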
--------------------------------------------------------------------------------
/rl/ucl/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf8
3 |
4 | # pip install tensorflow  # requires TensorFlow 1.8 or later
5 | # pip install git+git://github.com/deepmind/trfl.git
6 | import tensorflow as tf
7 | import trfl
8 |
9 | # Q-values for the previous and next timesteps, shape [batch_size, num_actions].
10 | q_tm1 = tf.get_variable(
11 | "q_tm1", initializer=[[1., 1., 0.], [1., 2., 0.]], dtype=tf.float32)
12 | q_t = tf.get_variable(
13 | "q_t", initializer=[[0., 1., 0.], [1., 2., 0.]], dtype=tf.float32)
14 |
15 | # Action indices, discounts and rewards, shape [batch_size].
16 | a_tm1 = tf.constant([0, 1], dtype=tf.int32)
17 | r_t = tf.constant([1, 1], dtype=tf.float32)
18 | pcont_t = tf.constant([0, 1], dtype=tf.float32) # the discount factor
19 |
20 | # Q-learning loss, and auxiliary data.
21 | loss, q_learning = trfl.qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t)
22 |
23 | reduced_loss = tf.reduce_mean(loss)
24 | optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
25 | train_op = optimizer.minimize(reduced_loss)
26 |
27 |
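The script above only constructs the graph; no session is ever run. A minimal sketch of executing it (assuming TF 1.x session semantics and reusing the names defined above) could look like this:

```python
# hypothetical continuation of rl/ucl/test.py: run a few optimization steps on the toy batch
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10):
        _, loss_value = sess.run([train_op, reduced_loss])
        print('step %d  q-learning loss %.4f' % (step, loss_value))
```

Since `q_tm1` and `q_t` are variables and the optimizer minimizes the TD loss, the printed loss should shrink over these few steps.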
--------------------------------------------------------------------------------
/web/Untitled Diagram.drawio:
--------------------------------------------------------------------------------
1 | 1ddNc6IwGAfwT8N1B5KAcFTb7vbQS7sze85ChExDHifGt/30+yBB0eBMZ2odykXyzwvhF4wS0Hm9+2n4snqBQqiAhMUuoA8BISSNCX40yb5NIpqFbVIaWbjsFLzJf8KFXbO1LMTqrKEFUFYuz8MctBa5Pcu4MbA9b7YAdX7VJS+FF7zlXPnpH1nYyqVRkp0qfglZVu7SKZm0FX95/l4aWGt3vYDQxeFoq2vejeVudFXxAra9iD4GdG4AbHtW7+ZCNbgdW9vv6Urtcd5GaPuRDiSKqeu04Wotulkf5mb3nQd2QnoszBZSqTkoMIcK+hSmkzDGHJY8l7ZZ7yTE4soaeBddQw266VvZWmEpaoYBbfvDHA7M3VSEsWJ39ZaiIxQ+gQJqYc0em7gOlGY/4raTe/xomHbJ9rSacejWoOot5DHk7gkqj+OfFPHEQV5HJZ9Ajch0OhsXajS5RI3ub0o/YSrSOGFkVKYsHYEp80yf9QZvQIL2cPFWrSNxWzZhWOZKlhoLOXYT6DRrVCRuplNXUcuiUPdzJdmlK27QA7BRNgA7uZFr7Lm+ig2o9XeGTcYAm3iwvw0v5Hd2ZZ5rHN/ddeK5vkglLXcjDbB6jB/Q6i1FlNxGL068bTQb0mMDeOxGeKmHNwe9UBL/mo4bj7FLPDb4I/SVeJmPV3FYjVzO3wzvLtcN05NDHa5zMXI7/ytL2RfaYfH0RnWo67230sf/
--------------------------------------------------------------------------------
/web/pydown/pydown.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | pydown
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | Markdown + deck.js
34 | Simple html presentation maker
35 | You just need to write markdown file
36 | Support code highlighting
37 |
def hello():
38 | print 'hello world'
39 |
40 | You can customize with css
41 | like dropping the centering
42 | Easy to use
43 |
44 | - write your slides markdown file
45 | - python main.py md directory
46 |
47 | The previous slide just looks like this
48 |
!SLIDE left
49 |
50 | ##Easy to use
51 |
52 | 1. write your slides markdown file
53 | 2. python main.py md directory
54 |
55 | Just simple
56 | and enjoy yourself
57 |
61 |
62 |
63 |
64 |
65 | 8
66 | /
67 | 8
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/web/pydown/pydown_files/deck.scale.css:
--------------------------------------------------------------------------------
1 | /* Remove this line if you are embedding deck.js in a page and
2 | using the scale extension. */
3 | .csstransforms {
4 | overflow: hidden;
5 | }
6 |
7 | .csstransforms .deck-container.deck-scale:not(.deck-menu) > .slide {
8 | -webkit-box-sizing: padding-box;
9 | -moz-box-sizing: padding-box;
10 | box-sizing: padding-box;
11 | width: 100%;
12 | padding-bottom: 20px;
13 | }
14 | .csstransforms .deck-container.deck-scale:not(.deck-menu) > .slide > .deck-slide-scaler {
15 | -webkit-transform-origin: 50% 0;
16 | -moz-transform-origin: 50% 0;
17 | -o-transform-origin: 50% 0;
18 | -ms-transform-origin: 50% 0;
19 | transform-origin: 50% 0;
20 | }
21 |
22 | .csstransforms .deck-container.deck-menu .deck-slide-scaler {
23 | -webkit-transform: none !important;
24 | -moz-transform: none !important;
25 | -o-transform: none !important;
26 | -ms-transform: none !important;
27 | transform: none !important;
28 | }
29 |
--------------------------------------------------------------------------------
/web/pydown/pydown_files/deck.scale.js.下载:
--------------------------------------------------------------------------------
1 | /*!
2 | Deck JS - deck.scale
3 | Copyright (c) 2011-2012 Caleb Troughton
4 | Dual licensed under the MIT license and GPL license.
5 | https://github.com/imakewebthings/deck.js/blob/master/MIT-license.txt
6 | https://github.com/imakewebthings/deck.js/blob/master/GPL-license.txt
7 | */
8 |
9 | /*
10 | This module adds automatic scaling to the deck. Slides are scaled down
11 | using CSS transforms to fit within the deck container. If the container is
12 | big enough to hold the slides without scaling, no scaling occurs. The user
13 | can disable and enable scaling with a keyboard shortcut.
14 |
15 | Note: CSS transforms may make Flash videos render incorrectly. Presenters
16 | that need to use video may want to disable scaling to play them. HTML5 video
17 | works fine.
18 | */
19 | (function($, deck, window, undefined) {
20 | var $d = $(document),
21 | $w = $(window),
22 | baseHeight, // Value to scale against
23 | timer, // Timeout id for debouncing
24 | rootSlides,
25 |
26 | /*
27 | Internal function to do all the dirty work of scaling the slides.
28 | */
29 | scaleDeck = function() {
30 | var opts = $[deck]('getOptions'),
31 | obh = opts.baseHeight,
32 | $container = $[deck]('getContainer'),
33 | baseHeight = obh ? obh : $container.height();
34 |
35 | // Scale each slide down if necessary (but don't scale up)
36 | $.each(rootSlides, function(i, $slide) {
37 | var slideHeight = $slide.innerHeight(),
38 | $scaler = $slide.find('.' + opts.classes.scaleSlideWrapper),
39 | scale = $container.hasClass(opts.classes.scale) ?
40 | baseHeight / slideHeight :
41 | 1;
42 |
43 | $.each('Webkit Moz O ms Khtml'.split(' '), function(i, prefix) {
44 | if (scale === 1) {
45 | $scaler.css(prefix + 'Transform', '');
46 | }
47 | else {
48 | $scaler.css(prefix + 'Transform', 'scale(' + scale + ')');
49 | }
50 | });
51 | });
52 | }
53 |
54 | /*
55 | Extends defaults/options.
56 |
57 | options.classes.scale
58 | This class is added to the deck container when scaling is enabled.
59 | It is enabled by default when the module is included.
60 |
61 | options.classes.scaleSlideWrapper
62 | Scaling is done using a wrapper around the contents of each slide. This
63 | class is applied to that wrapper.
64 |
65 | options.keys.scale
66 | The numeric keycode used to toggle enabling and disabling scaling.
67 |
68 | options.baseHeight
69 | When baseHeight is falsy, as it is by default, the deck is scaled in
70 | proportion to the height of the deck container. You may instead specify
71 | a height as a number of px, and slides will be scaled against this
72 | height regardless of the container size.
73 |
74 | options.scaleDebounce
75 | Scaling on the browser resize event is debounced. This number is the
76 | threshold in milliseconds. You can learn more about debouncing here:
77 | http://unscriptable.com/index.php/2009/03/20/debouncing-javascript-methods/
78 |
79 | */
80 | $.extend(true, $[deck].defaults, {
81 | classes: {
82 | scale: 'deck-scale',
83 | scaleSlideWrapper: 'deck-slide-scaler'
84 | },
85 |
86 | keys: {
87 | scale: 83 // s
88 | },
89 |
90 | baseHeight: null,
91 | scaleDebounce: 200
92 | });
93 |
94 | /*
95 | jQuery.deck('disableScale')
96 |
97 | Disables scaling and removes the scale class from the deck container.
98 | */
99 | $[deck]('extend', 'disableScale', function() {
100 | $[deck]('getContainer').removeClass($[deck]('getOptions').classes.scale);
101 | scaleDeck();
102 | });
103 |
104 | /*
105 | jQuery.deck('enableScale')
106 |
107 | Enables scaling and adds the scale class to the deck container.
108 | */
109 | $[deck]('extend', 'enableScale', function() {
110 | $[deck]('getContainer').addClass($[deck]('getOptions').classes.scale);
111 | scaleDeck();
112 | });
113 |
114 | /*
115 | jQuery.deck('toggleScale')
116 |
117 | Toggles between enabling and disabling scaling.
118 | */
119 | $[deck]('extend', 'toggleScale', function() {
120 | var $c = $[deck]('getContainer');
121 | $[deck]($c.hasClass($[deck]('getOptions').classes.scale) ?
122 | 'disableScale' : 'enableScale');
123 | });
124 |
125 | $d.bind('deck.init', function() {
126 | var opts = $[deck]('getOptions'),
127 | slideTest = $.map([
128 | opts.classes.before,
129 | opts.classes.previous,
130 | opts.classes.current,
131 | opts.classes.next,
132 | opts.classes.after
133 | ], function(el, i) {
134 | return '.' + el;
135 | }).join(', ');
136 |
137 | // Build top level slides array
138 | rootSlides = [];
139 | $.each($[deck]('getSlides'), function(i, $el) {
140 | if (!$el.parentsUntil(opts.selectors.container, slideTest).length) {
141 | rootSlides.push($el);
142 | }
143 | });
144 |
145 | // Use a wrapper on each slide to handle content scaling
146 | $.each(rootSlides, function(i, $slide) {
147 | $slide.children().wrapAll('');
148 | });
149 |
150 | // Debounce the resize scaling
151 | $w.unbind('resize.deckscale').bind('resize.deckscale', function() {
152 | window.clearTimeout(timer);
153 | timer = window.setTimeout(scaleDeck, opts.scaleDebounce);
154 | })
155 | // Scale once on load, in case images or something change layout
156 | .unbind('load.deckscale').bind('load.deckscale', scaleDeck);
157 |
158 | // Bind key events
159 | $d.unbind('keydown.deckscale').bind('keydown.deckscale', function(e) {
160 | if (e.which === opts.keys.scale || $.inArray(e.which, opts.keys.scale) > -1) {
161 | $[deck]('toggleScale');
162 | e.preventDefault();
163 | }
164 | });
165 |
166 | // Enable scale on init
167 | $[deck]('enableScale');
168 | });
169 | })(jQuery, 'deck', this);
170 |
171 |
--------------------------------------------------------------------------------
/web/pydown/pydown_files/deck.status.css:
--------------------------------------------------------------------------------
1 | .deck-container .deck-status {
2 | position: absolute;
3 | bottom: 10px;
4 | right: 5px;
5 | color: #888;
6 | z-index: 3;
7 | margin: 0;
8 | }
9 |
10 | body.deck-container .deck-status {
11 | position: fixed;
12 | }
13 |
14 | @media print {
15 | .deck-status {
16 | display: none;
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/web/pydown/pydown_files/deck.status.js.下载:
--------------------------------------------------------------------------------
1 | /*!
2 | Deck JS - deck.status
3 | Copyright (c) 2011 Caleb Troughton
4 | Dual licensed under the MIT license and GPL license.
5 | https://github.com/imakewebthings/deck.js/blob/master/MIT-license.txt
6 | https://github.com/imakewebthings/deck.js/blob/master/GPL-license.txt
7 | */
8 |
9 | /*
10 | This module adds a (current)/(total) style status indicator to the deck.
11 | */
12 | (function($, deck, undefined) {
13 | var $d = $(document),
14 |
15 | updateCurrent = function(e, from, to) {
16 | var opts = $[deck]('getOptions');
17 |
18 | $(opts.selectors.statusCurrent).text(opts.countNested ?
19 | to + 1 :
20 | $[deck]('getSlide', to).data('rootSlide')
21 | );
22 | };
23 |
24 | /*
25 | Extends defaults/options.
26 |
27 | options.selectors.statusCurrent
28 | The element matching this selector displays the current slide number.
29 |
30 | options.selectors.statusTotal
31 | The element matching this selector displays the total number of slides.
32 |
33 | options.countNested
34 | If false, only top level slides will be counted in the current and
35 | total numbers.
36 | */
37 | $.extend(true, $[deck].defaults, {
38 | selectors: {
39 | statusCurrent: '.deck-status-current',
40 | statusTotal: '.deck-status-total'
41 | },
42 |
43 | countNested: true
44 | });
45 |
46 | $d.bind('deck.init', function() {
47 | var opts = $[deck]('getOptions'),
48 | slides = $[deck]('getSlides'),
49 | $current = $[deck]('getSlide'),
50 | ndx;
51 |
52 | // Set total slides once
53 | if (opts.countNested) {
54 | $(opts.selectors.statusTotal).text(slides.length);
55 | }
56 | else {
57 | /* Determine root slides by checking each slide's ancestor tree for
58 | any of the slide classes. */
59 | var rootIndex = 1,
60 | slideTest = $.map([
61 | opts.classes.before,
62 | opts.classes.previous,
63 | opts.classes.current,
64 | opts.classes.next,
65 | opts.classes.after
66 | ], function(el, i) {
67 | return '.' + el;
68 | }).join(', ');
69 |
70 | /* Store the 'real' root slide number for use during slide changes. */
71 | $.each(slides, function(i, $el) {
72 | var $parentSlides = $el.parentsUntil(opts.selectors.container, slideTest);
73 |
74 | $el.data('rootSlide', $parentSlides.length ?
75 | $parentSlides.last().data('rootSlide') :
76 | rootIndex++
77 | );
78 | });
79 |
80 | $(opts.selectors.statusTotal).text(rootIndex - 1);
81 | }
82 |
83 | // Find where we started in the deck and set initial state
84 | $.each(slides, function(i, $el) {
85 | if ($el === $current) {
86 | ndx = i;
87 | return false;
88 | }
89 | });
90 | updateCurrent(null, ndx, ndx);
91 | })
92 | /* Update current slide number with each change event */
93 | .bind('deck.change', updateCurrent);
94 | })(jQuery, 'deck');
95 |
96 |
--------------------------------------------------------------------------------
/web/pydown/pydown_files/horizontal-slide.css:
--------------------------------------------------------------------------------
1 | .csstransitions.csstransforms {
2 | overflow-x: hidden;
3 | }
4 | .csstransitions.csstransforms .deck-container > .slide {
5 | -webkit-transition: -webkit-transform 500ms ease-in-out;
6 | -moz-transition: -moz-transform 500ms ease-in-out;
7 | -ms-transition: -ms-transform 500ms ease-in-out;
8 | -o-transition: -o-transform 500ms ease-in-out;
9 | transition: transform 500ms ease-in-out;
10 | }
11 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .slide {
12 | position: absolute;
13 | top: 0;
14 | left: 0;
15 | -webkit-box-sizing: border-box;
16 | -moz-box-sizing: border-box;
17 | box-sizing: border-box;
18 | width: 100%;
19 | padding: 0 48px;
20 | }
21 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .slide .slide {
22 | position: relative;
23 | left: 0;
24 | top: 0;
25 | -webkit-transition: -webkit-transform 500ms ease-in-out, opacity 500ms ease-in-out;
26 | -moz-transition: -moz-transform 500ms ease-in-out, opacity 500ms ease-in-out;
27 | -ms-transition: -ms-transform 500ms ease-in-out, opacity 500ms ease-in-out;
28 | -o-transition: -o-transform 500ms ease-in-out, opacity 500ms ease-in-out;
29 | transition: -webkit-transform 500ms ease-in-out, opacity 500ms ease-in-out;
30 | }
31 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .slide .deck-next, .csstransitions.csstransforms .deck-container:not(.deck-menu) > .slide .deck-after {
32 | visibility: visible;
33 | -webkit-transform: translate3d(200%, 0, 0);
34 | -moz-transform: translate(200%, 0);
35 | -ms-transform: translate(200%, 0);
36 | -o-transform: translate(200%, 0);
37 | transform: translate3d(200%, 0, 0);
38 | }
39 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-previous {
40 | -webkit-transform: translate3d(-200%, 0, 0);
41 | -moz-transform: translate(-200%, 0);
42 | -ms-transform: translate(-200%, 0);
43 | -o-transform: translate(-200%, 0);
44 | transform: translate3d(-200%, 0, 0);
45 | }
46 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-before {
47 | -webkit-transform: translate3d(-400%, 0, 0);
48 | -moz-transform: translate(-400%, 0);
49 | -ms-transform: translate(-400%, 0);
50 | -o-transform: translate(-400%, 0);
51 | transform: translate3d(-400%, 0, 0);
52 | }
53 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-next {
54 | -webkit-transform: translate3d(200%, 0, 0);
55 | -moz-transform: translate(200%, 0);
56 | -ms-transform: translate(200%, 0);
57 | -o-transform: translate(200%, 0);
58 | transform: translate3d(200%, 0, 0);
59 | }
60 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-after {
61 | -webkit-transform: translate3d(400%, 0, 0);
62 | -moz-transform: translate(400%, 0);
63 | -ms-transform: translate(400%, 0);
64 | -o-transform: translate(400%, 0);
65 | transform: translate3d(400%, 0, 0);
66 | }
67 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-before .slide, .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-previous .slide {
68 | visibility: visible;
69 | }
70 | .csstransitions.csstransforms .deck-container:not(.deck-menu) > .deck-child-current {
71 | -webkit-transform: none;
72 | -moz-transform: none;
73 | -ms-transform: none;
74 | -o-transform: none;
75 | transform: none;
76 | }
77 |
--------------------------------------------------------------------------------
/web/pydown/pydown_files/md_hl.css:
--------------------------------------------------------------------------------
1 | /*pygments style from Armin Ronacher*/
2 | pre .hll { background-color: #ffffcc }
3 | pre .c { color: #8f5902; font-style: italic } /* Comment */
4 | pre .err { color: #a40000; border: 1px solid #ef2929 } /* Error */
5 | pre .g { color: #000000 } /* Generic */
6 | pre .k { color: #204a87; font-weight: bold } /* Keyword */
7 | pre .l { color: #000000 } /* Literal */
8 | pre .n { color: #000000 } /* Name */
9 | pre .o { color: #ce5c00; font-weight: bold } /* Operator */
10 | pre .x { color: #000000 } /* Other */
11 | pre .p { color: #000000; font-weight: bold } /* Punctuation */
12 | pre .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */
13 | pre .cp { color: #8f5902; font-style: italic } /* Comment.Preproc */
14 | pre .c1 { color: #8f5902; font-style: italic } /* Comment.Single */
15 | pre .cs { color: #8f5902; font-style: italic } /* Comment.Special */
16 | pre .gd { color: #a40000 } /* Generic.Deleted */
17 | pre .ge { color: #000000; font-style: italic } /* Generic.Emph */
18 | pre .gr { color: #ef2929 } /* Generic.Error */
19 | pre .gh { color: #000080; font-weight: bold } /* Generic.Heading */
20 | pre .gi { color: #00A000 } /* Generic.Inserted */
21 | pre .go { color: #000000; font-style: italic } /* Generic.Output */
22 | pre .gp { color: #8f5902 } /* Generic.Prompt */
23 | pre .gs { color: #000000; font-weight: bold } /* Generic.Strong */
24 | pre .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
25 | pre .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */
26 | pre .kc { color: #204a87; font-weight: bold } /* Keyword.Constant */
27 | pre .kd { color: #204a87; font-weight: bold } /* Keyword.Declaration */
28 | pre .kn { color: #204a87; font-weight: bold } /* Keyword.Namespace */
29 | pre .kp { color: #204a87; font-weight: bold } /* Keyword.Pseudo */
30 | pre .kr { color: #204a87; font-weight: bold } /* Keyword.Reserved */
31 | pre .kt { color: #204a87; font-weight: bold } /* Keyword.Type */
32 | pre .ld { color: #000000 } /* Literal.Date */
33 | pre .m { color: #0000cf; font-weight: bold } /* Literal.Number */
34 | pre .s { color: #4e9a06 } /* Literal.String */
35 | pre .na { color: #c4a000 } /* Name.Attribute */
36 | pre .nb { color: #204a87 } /* Name.Builtin */
37 | pre .nc { color: #000000 } /* Name.Class */
38 | pre .no { color: #000000 } /* Name.Constant */
39 | pre .nd { color: #5c35cc; font-weight: bold } /* Name.Decorator */
40 | pre .ni { color: #ce5c00 } /* Name.Entity */
41 | pre .ne { color: #cc0000; font-weight: bold } /* Name.Exception */
42 | pre .nf { color: #000000 } /* Name.Function */
43 | pre .nl { color: #f57900 } /* Name.Label */
44 | pre .nn { color: #000000 } /* Name.Namespace */
45 | pre .nx { color: #000000 } /* Name.Other */
46 | pre .py { color: #000000 } /* Name.Property */
47 | pre .nt { color: #204a87; font-weight: bold } /* Name.Tag */
48 | pre .nv { color: #000000 } /* Name.Variable */
49 | pre .ow { color: #204a87; font-weight: bold } /* Operator.Word */
50 | pre .w { color: #f8f8f8; text-decoration: underline } /* Text.Whitespace */
51 | pre .mf { color: #0000cf; font-weight: bold } /* Literal.Number.Float */
52 | pre .mh { color: #0000cf; font-weight: bold } /* Literal.Number.Hex */
53 | pre .mi { color: #0000cf; font-weight: bold } /* Literal.Number.Integer */
54 | pre .mo { color: #0000cf; font-weight: bold } /* Literal.Number.Oct */
55 | pre .sb { color: #4e9a06 } /* Literal.String.Backtick */
56 | pre .sc { color: #4e9a06 } /* Literal.String.Char */
57 | pre .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */
58 | pre .s2 { color: #4e9a06 } /* Literal.String.Double */
59 | pre .se { color: #4e9a06 } /* Literal.String.Escape */
60 | pre .sh { color: #4e9a06 } /* Literal.String.Heredoc */
61 | pre .si { color: #4e9a06 } /* Literal.String.Interpol */
62 | pre .sx { color: #4e9a06 } /* Literal.String.Other */
63 | pre .sr { color: #4e9a06 } /* Literal.String.Regex */
64 | pre .s1 { color: #4e9a06 } /* Literal.String.Single */
65 | pre .ss { color: #4e9a06 } /* Literal.String.Symbol */
66 | pre .bp { color: #3465a4 } /* Name.Builtin.Pseudo */
67 | pre .vc { color: #000000 } /* Name.Variable.Class */
68 | pre .vg { color: #000000 } /* Name.Variable.Global */
69 | pre .vi { color: #000000 } /* Name.Variable.Instance */
70 | pre .il { color: #0000cf; font-weight: bold } /* Literal.Number.Integer.Long */
71 |
--------------------------------------------------------------------------------
/web/timeline/643.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/web/timeline/643.jpg
--------------------------------------------------------------------------------
/web/timeline/css/about.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/web/timeline/css/about.css
--------------------------------------------------------------------------------
/web/timeline/timeline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
12 | 静态可以折叠时光轴
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
20170111
25 |
-
26 | 20170111 22:32:45
27 |
出入口系统
28 |
29 |
-
30 | 20170111 21:00:31
31 |
停车场系统
32 |
33 |
-
34 | 20170111 17:30:45
35 |
楼宇门禁系统
36 |
37 |
38 |
39 |
40 |
20170112
41 |
-
42 | 20170112 14:03:41
43 |
视频监控系统
44 |
45 |
-
46 | 20170112 11:24:47
47 |
电子巡更系统
48 |
49 |
50 |
51 |
20170113
52 |
- 20170112 14:03:41
视频监控系统
53 |
- 20170112 14:03:41
视频监控系统
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/web/timeline/tl.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
28 |
29 |
30 |
31 |
32 |
33 | -
34 |
35 |
36 |
2018-05-01
37 |
天气晴朗
38 |
适合跑步
39 |
查看更多>>
40 |
41 |
42 | -
43 |
44 |
45 |
2018-05-02
46 |
天气晴朗
47 |
适合跑步
48 |
查看更多>>
49 |
50 |
51 | -
52 |
53 |
54 |
2018-05-03
55 |
天气晴朗
56 |
适合跑步
57 |
查看更多>>
58 |
59 |
60 | -
61 |
62 |
63 |
2018-05-04
64 |
天气晴朗
65 |
适合跑步
66 |
查看更多>>
67 |
68 |
69 |
70 |
71 |
72 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/学习资料汇总:
--------------------------------------------------------------------------------
1 | Collected study materials
2 |
3 | [2017-03-18] [Sequence learning: Markov models](http://blog.csdn.net/dark_scope/article/details/61417336)
4 | [2017-03-27] [Algorithm optimization: how to escape saddle points](http://www.csuldw.com/2016/07/10/2016-07-10-saddlepoints/?utm_source=tuicool&utm_medium=referral)
5 |
--------------------------------------------------------------------------------
/微软-ML算法指南.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wqw547243068/wangqiwen/07b64ae47d91581e1c339f40bc765fd7815b47ff/微软-ML算法指南.png
--------------------------------------------------------------------------------