├── main_results_plot.png ├── double_dqn_architecture.png ├── LSTM_based_ddqn_architecture.png ├── high_level_play_environment.png ├── figure_LSTM_lkbk100_unit50_SEQ1.png ├── figure_LSTM_lkbk200_unit100_1stage.png ├── .gitattributes ├── decay.py ├── distribution_plot.py ├── lfsr_test.py ├── randmove.py ├── Readme.md ├── rps_deepRL.py └── rps_deepRL_withLSTM.py /main_results_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/main_results_plot.png -------------------------------------------------------------------------------- /double_dqn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/double_dqn_architecture.png -------------------------------------------------------------------------------- /LSTM_based_ddqn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/LSTM_based_ddqn_architecture.png -------------------------------------------------------------------------------- /high_level_play_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/high_level_play_environment.png -------------------------------------------------------------------------------- /figure_LSTM_lkbk100_unit50_SEQ1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/figure_LSTM_lkbk100_unit50_SEQ1.png -------------------------------------------------------------------------------- /figure_LSTM_lkbk200_unit100_1stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/figure_LSTM_lkbk200_unit100_1stage.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /decay.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib import style 3 | #from matplotlib import cm 4 | #from matplotlib.ticker import LinearLocator, FormatStrFormatter 5 | style.use('ggplot') 6 | 7 | lsta, lstb, lstc, lstd = [], [], [], [] 8 | a , b, c, d = 1, 1, 1, 1 9 | decay1 = 0.998 10 | decay2 = 0.997 11 | decay3 = 0.996 12 | decay4 = 0.992 13 | 14 | 15 | for i in range(1000): 16 | a, b, c, d = a*decay1, b*decay2, c*decay3, d*decay4 17 | lsta.append(a) 18 | lstb.append(b) 19 | lstc.append(c) 20 | lstd.append(d) 21 | 22 | plt.title('Decay curve', loc='center', weight='bold', color='Black') 23 | plt.plot(lsta, color='blue') 24 | 
plt.plot(lstb, color='orange') 25 | plt.plot(lstc, color='green') 26 | plt.plot(lstd, color='black') 27 | plt.show() -------------------------------------------------------------------------------- /distribution_plot.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.mlab as mlab 5 | from matplotlib import style 6 | style.use('ggplot') 7 | 8 | num_bins = 100 9 | sigma1, sigma2 = 2.0, 0.5 10 | norm_mu = 0 11 | a, b = [], [] 12 | for i in range(10000): 13 | a.append(random.gauss(norm_mu, sigma1)) 14 | b.append(random.gauss(norm_mu, sigma2)) 15 | 16 | plt.title('Distribution Plots', loc='center', weight='bold', color='Black') 17 | 18 | #n, bins1, patches1 = plt.hist(a, num_bins, facecolor='blue', alpha=0.5) 19 | m, bins2, patches2 = plt.hist(b, num_bins, facecolor='red', alpha=0.5) 20 | plt.show(block = False) 21 | 22 | # some test code 23 | teststr = 'start' 24 | for i in range(50): 25 | j = 61 26 | k = 2 27 | teststr = 'start' 28 | if i % (j // k) == 0: teststr = 'change' 29 | print ('i', i, teststr, i % (j // k)) 30 | -------------------------------------------------------------------------------- /lfsr_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib import style 4 | style.use('ggplot') 5 | 6 | 7 | def lfsr2(seed, taps, nbits): 8 | sr = seed 9 | while 1: 10 | xor = 1 11 | for t in taps: 12 | if (sr & (1<<(t-1))) != 0: 13 | xor ^= 1 14 | sr = (xor << nbits-1) + (sr >> 1) 15 | yield xor, sr 16 | if sr == seed: 17 | break 18 | # ---------------------------- main program --------------------------------------------- 19 | # nbits, tapindex, seed = 12, (12,11,10,4,1), 0b11001001 20 | nbits, tapindex, seed = 8, (8,6,5,4,1), 0b00001001 21 | datalist, movelist = [], [] 22 | 23 | # -------------------------- generate the random sequence -------------------------------- 24 | 25 | for xor, sr in lfsr2(seed, tapindex, nbits): # feed the configured seed into the generator 26 | lfsr_gen = int(bin(2**nbits+sr)[3:], base=2) 27 | datalist.append(lfsr_gen) 28 | print (xor, lfsr_gen) 29 | 30 | 31 | for i in datalist: 32 | move = i % 3 # use a mod 3 to create 3 bins 33 | movelist.append(move) 34 | 35 | print ('Player1 RPS LFSR distribution') 36 | print ('Player1 rock:', movelist.count(0)) # count the num of zeros in list (rock) 37 | print ('Player1 paper:', movelist.count(1)) # count the num of ones in list (paper) 38 | print ('Player1 scissors:', movelist.count(2)) # count the num of twos in list (scissors) 39 | print ('total moves:', len(movelist)) 40 | print ('sample moves', movelist[:20]) 41 | 42 | #---------------- print the PDF chart -------------------------------- 43 | 44 | x = np.array(datalist) 45 | nbins = 20 46 | n, bins = np.histogram(x, nbins, density=1) 47 | pdfx = np.zeros(n.size) 48 | pdfy = np.zeros(n.size) 49 | for k in range(n.size): 50 | pdfx[k] = 0.5*(bins[k]+bins[k+1]) 51 | pdfy[k] = n[k] 52 | plt.plot(pdfx, pdfy) # plot the probability distribution function 53 | plt.show(block = False) 54 | -------------------------------------------------------------------------------- /randmove.py: -------------------------------------------------------------------------------- 1 | # This module generates one rock-paper-scissors move based on the selected mode. 
2 | import random 3 | 4 | def lfsr2(seed, taps, nbits): 5 | sr = seed 6 | while 1: 7 | xor = 1 8 | for t in taps: 9 | if (sr & (1<<(t-1))) != 0: 10 | xor ^= 1 11 | sr = (xor << nbits-1) + (sr >> 1) 12 | yield xor, sr 13 | if sr == seed: 14 | break 15 | 16 | def genOneMove(self, mode, stage): 17 | if mode == 'PRNG': 18 | # change play strategy for player2 along the way 19 | lowPlay = {0:0, 1:1, 2:2, 3:0, 4:2} # key = stage number, value = r(0), p(1), s(2) 20 | meanPlay = {0:1, 1:2, 2:0, 3:2, 4:1} # key = stage number, value = r(0), p(1), s(2) 21 | hiPlay = {0:2, 1:0, 2:1, 3:1, 4:0} # key = stage number, value = r(0), p(1), s(2) 22 | # generate a random number from the Gaussian distribution & quantize it 23 | a = random.gauss(self.norm_mu, self.norm_sigma) 24 | if a**2 < 1: # the middle of the bell is the paper move 25 | play = meanPlay[stage] 26 | elif a < -1: # lower than the -1 cutoff is the rock move 27 | play = lowPlay[stage] 28 | else: 29 | play = hiPlay[stage] # else higher than +1 is the scissors move 30 | return play 31 | 32 | elif mode == 'SEQ': # simple repeating pattern as 'random generator' 33 | rpsmap = {'r':0, 'p':1, 's': 2} 34 | seqlist = 'rpprsspsrsrpprspsprspsppsrrspsprrsspsrpsrpsrsps' # the pattern sequence here 35 | self.seqIndex = 0 if self.seqIndex == len(seqlist)-1 else self.seqIndex + 1 36 | return rpsmap[seqlist[self.seqIndex]] 37 | 38 | elif mode == 'LFSR': 39 | nbits, tapindex, seed = 12, (12,11,10,4,1), 0b11001001 40 | #nbits, tapindex, seed = 8, (8,6,5,4,1), 0b11001001 41 | lfsrlist = [] 42 | for xor, sr in lfsr2(seed, tapindex, nbits): 43 | lfsr_gen = int(bin(2**nbits+sr)[3:], base=2) 44 | lfsrlist.append(lfsr_gen % 3) 45 | self.seqIndex = 0 if self.seqIndex == len(lfsrlist)-1 else self.seqIndex + 1 46 | return lfsrlist[self.seqIndex] 47 | 48 | else: 49 | print('Error: random mode does not exist!') 50 | 51 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | The objective of this project is to construct an AI agent that plays a simple two-player rock-paper-scissors game using reinforcement learning (RL) techniques - particularly the double-DQN algorithm. 4 | 5 | In a previous design project, we built a player using a simple LSTM-based neural network trained with a traditional supervised learning method. The downside is that the model is effectively static and does not adapt to model drift or fundamental behavioral changes. With an RL-based approach, we would like to see the AI agent demonstrate some ability to counter the changing strategy of the opponent and continue to generate a better win rate than the opposing player. 6 | 7 | Note that unlike other RL projects in which the AI agent is to learn a task successfully and generalize to future data (i.e. a form of semi-supervised learning using rewards as labels - this is my analogy and is technically not precise), in this particular setup there is no completion per se. That is, the game never ends. The RL agent simply continues to adapt and adjust as best as it can. 8 | 9 | ## Q-Learning Basics 10 | 11 | This project utilizes the double-DQN RL algorithm as the basis for the AI agent playing as player 1. DeepMind was very instrumental in popularizing Q-learning recently and has contributed multiple architectural enhancements in recent years - do note that Q-learning was first pioneered by Watkins (see Watkins & Dayan, 1992). See the popular papers on DQN (deep Q-learning networks) in the reference section. 
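For reference, the tabular Q-learning update that these works build on can be sketched as follows. This is an illustration only and is not part of this repository's code; the deep variants simply replace the Q table with a neural network, and names such as alpha, gamma, and epsilon are the usual learning rate, discount factor, and exploration rate.

```python
# Minimal tabular Q-learning sketch (illustration only, not used by this repo).
import random
from collections import defaultdict

Q = defaultdict(float)             # Q[(state, action)] -> estimated action value
actions = [0, 1, 2]                # rock, paper, scissors
alpha, gamma, epsilon = 0.1, 0.9, 0.1

def choose_action(state):
    if random.random() < epsilon:                        # explore (off-policy move)
        return random.choice(actions)
    return max(actions, key=lambda a: Q[(state, a)])     # exploit (greedy move)

def q_update(state, action, reward, next_state):
    best_next = max(Q[(next_state, a)] for a in actions)
    td_target = reward + gamma * best_next               # Bellman / TD target
    Q[(state, action)] += alpha * (td_target - Q[(state, action)])
```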
12 | 13 | In its essence, Deep Q-Learning learns a "policy" (which in practice is a deep neural network) that maximizes the expected return of future actions. The Q-value is the "action value" function. Its absolute magnitude has no real meaning, but it is recursively derived to guide the system on which action to take at any moment such that the chosen action is believed to lead to the maximum future rewards. 14 | 15 | The enhancements utilized in this design that go above and beyond basic Q-learning are as follows. Most of these were introduced by DeepMind in recent years. 16 | 17 | 1) Dual models (hence the term "double") - one model drives the action decision (also sometimes referred to as the online model or behavior model) and one acts as the target model. The inner weights are "transferred" from the action model to the target model on every move cycle on a discounted basis; a short sketch of this soft update and the TD target follows this list. 18 | 2) Exploitation vs. exploration using the epsilon-greedy method. This is essential in any RL design since exploration is important, especially at the start of the process, for searching different areas of the state space that might lead to a better operating position. Exploring is often termed "off-policy", whereas taking an action according to exploitation is often termed "on-policy". 19 | 3) Experience replay - instead of using only the most recent history as the learning space, DeepMind introduced the concept of experience replay in which past passes through the state space are stored in memory. Such memories are recalled (sampled) at each move and used in the SGD process (thus achieving the reinforcement notion). 
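A minimal sketch of items 1 and 3 above, simplified from the replay() and target_train() methods that appear later in rps_deepRL.py (gamma, tau, the batch size, and the memory layout follow that file):

```python
# Sketch of the double-DQN update and the soft target transfer,
# simplified from DDQN.replay() and DDQN.target_train() in rps_deepRL.py.
import random

def replay_once(model, target_model, memory, batch_size=32, gamma=0.9):
    """Sample past transitions and fit the action model toward the TD target."""
    if len(memory) < batch_size:
        return                                           # wait until enough experience exists
    for state, action, reward, new_state, done in random.sample(memory, batch_size):
        target = target_model.predict(state)             # current Q estimates for this state
        if done:
            target[0][action] = reward
        else:
            q_future = max(target_model.predict(new_state)[0])
            target[0][action] = reward + gamma * q_future # TD target
        model.fit(state, target, epochs=1, verbose=0)     # one SGD pass on the action model

def soft_update(model, target_model, tau=0.125):
    """Blend action-model weights into the target model (discounted transfer)."""
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [w * tau + tw * (1 - tau) for w, tw in zip(weights, target_weights)]
    target_model.set_weights(new_weights)
```

In the main loop of rps_deepRL.py these two steps run once per move: the agent acts, remembers the transition, replays a batch, and then softly updates the target model.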
20 | 21 | ## Acknowledgement 22 | 23 | Much of the code is adapted from A. Oppermann's blog on Medium. It is an excellent tutorial with a detailed walkthrough. You can find it [here](https://towardsdatascience.com/self-learning-ai-agents-part-ii-deep-q-learning-b5ac60c3f47). 24 | 25 | # RPS High Level Environment 26 | 27 | The environment of this game play is depicted below. It follows the classical RL environment definition. 28 | 29 | ![pic1](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/high_level_play_environment.png) 30 | 31 | Inside the "environment" is the embedded player 2 (i.e. the opponent). This player might adopt different types of play strategy. The interaction contains the following: 32 | 33 | 1. action space: either rock, paper, or scissors, which the AI agent (player 1) puts out. 34 | 2. rewards: this is an indication from the environment back to player 1. The reward is simply a value of 1 if it is a win for player 1, and 0 otherwise. 35 | 3. state: this is where the fun is and some creativity comes into play (which might affect the player's winning outcome). In this setup, we have designed the state space as follows (a small sketch of the resulting state vector follows this list): 36 | - win, tie, lost indicators: only one of the three is set to 1 for a particular state 37 | - winRateTrend, tieRateTrend, lostRateTrend: indicators which reflect a positively-trending moving average (set to 1) or not (set to 0). All three indicators are assessed independently. 38 | - winRateMovingAvg, tieRateMovingAvg, lostRateMovingAvg: floating point values between 0 and 1 which indicate the rates. These rates are calculated over a configured moving average window size. 
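For concreteness, a minimal sketch of how these nine values end up as the Keras input, mirroring how RPSenv.step() in rps_deepRL.py packs the state; the helper name build_state is illustrative only:

```python
# Sketch of the nine-element state vector assembled after each move,
# mirroring RPSenv.step() in rps_deepRL.py (shape (1, 9) for the Keras input).
import numpy as np

def build_state(win, tie, lost,
                win_trend, tie_trend, lost_trend,
                win_avg, tie_avg, lost_avg):
    state = np.array([win, tie, lost,
                      win_trend, tie_trend, lost_trend,
                      win_avg, tie_avg, lost_avg], dtype=float)
    return state.reshape(1, -1)          # Keras expects a leading batch dimension

# Example: player 1 just won, win rate trending up, current moving averages
example = build_state(1, 0, 0, 1, 0, 0, 0.42, 0.30, 0.28)
```

The (1, 9) shape matches the input_dim of the Dense Q-network defined later in rps_deepRL.py.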
39 | 40 | 41 | ## RPS AI player architecture using Double-DQN 42 | 43 | The overall architecture of the double-DQN agent design is depicted below. The yellow section is coded inside the step method and is iterated over the experience replay batch of N samples. The green section is the design that controls the exploration vs. exploitation decision. The orange section is the main action model (on-policy). This is the model with which we ultimately want to achieve the optimal policy for making the best possible action. The action model's weights are periodically transferred, with a discount, to the target model. 44 | 45 | ![pic2](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/double_dqn_architecture.png) 46 | 47 | (Nothing really special here in the illustration - just a block diagram version of the commonly published Double-DQN pseudo code) 48 | 49 | ## Player 2 ("the opponent") behaviors 50 | 51 | The player 2 code is embedded in a separate module called randmove. Player 2 can play in 3 modes, which need to be manually configured in the environment class setup: 52 | 53 | 1. "PRNG" uses a Python Gaussian distribution with a controllable mean and sigma value. The larger the sigma, the larger the spread and the more random the sequence appears (which makes it harder to predict). 54 | 2. "SEQ" is a simple manually typed-in sequence of rock-paper-scissors. 55 | 3. "LFSR" is an N-bit pseudo-random generator implemented as a linear feedback shift register of a certain length. Depending on the tap locations, it might or might not exhibit a maximal-length cycle. 56 | 57 | The results below are based mainly on PRNG, in which the play strategy is as follows: 58 | 59 | 1) the overall game play is divided into N (=5) stages, each with an equal number of moves 60 | 2) the moves of player 2 in each stage are generated using the PRNG Gaussian distribution with a different dominant move type (i.e. rock or paper or scissors) 61 | 3) the sigma value is also decreased as the stages progress. 62 | 63 | The overall game objective is to observe if the RL agent can adjust to the changing behaviors and maintain a good win rate. 64 | 65 | ## DNN design 66 | 67 | In this project, we used a simple fully connected DNN, which proves to be one of the main limitations since the RPS game is essentially a time series (sequence) problem. The DNN model can learn a distribution, but it has no notion of sequence. 68 | 69 | For a high dimensional PRNG like the Gaussian generator, a DNN (especially a small one) has no hope of detecting the sequential pattern coming from a complex PRNG generator. However, it is capable of understanding the distribution statistically, and its play strategy is effectively based on observing shifts in statistics. 70 | 71 | ## Hyperparameters 72 | 73 | The following is the set of hyperparameters in the game which can alter the resulting win rate and, in general, the adaptability of the AI agent. 74 | 75 | 1. memory batch size = 32: the larger the batch, the more reinforcement is used on each round, making convergence faster but adaptation slower. 76 | 2. memory maxlen: this is done using a Python collections deque object. The longer the memory, the more experience it retains, which could lead to faster convergence but can negatively affect adaptability since it takes time to flush out the memory when the opponent's behavior changes. 77 | 3. moving window size = 8: changes the smoothing factor of the moving average state variables. 78 | 4. gamma: a factor that determines how much of the future action-value the algorithm incorporates at each step. 79 | 5. DNN layers = 3: effectively the capacity of the NN and its ability to learn the distribution 80 | 6. DNN nodes = 64: same as above 81 | 7. action to target model transfer rate (tau): how fast the target model mimics the action model. (we did not experiment much with this one) 82 | 8. reward system = 1 for win, 0 otherwise: a simple reward that leaves the complexity of adaptation to the NN design and the state space design. 83 | 9. state design: too many state indicators is a waste of computation resources since not all state variables are important. Too few would hinder what the RL agent observes and thus limit its policy-forming ability. 84 | 10. epsilon: exploration percentage, controlled in conjunction with the decay rate. Note that there is a minimum value which is important to adaptability, since the system will continue to explore off-policy moves, and that is how it discovers the opponent's new behavior. 85 | 86 | # Results 87 | 88 | Player 2's move percentage clearly depicts the changing strategy across the different stages of the game - each stage with a different dominant move type and a different spread due to the decreasing sigma value. Player 1 (the RL agent) seems to successfully adapt to the change and eventually wins more due to the weakened player 2 behavior (i.e. it gets less random over time). From that perspective, the code has successfully achieved its main objective. 89 | 90 | However, it is somewhat disappointing that the win rate at any stage does not outperform the statistical behavior of the opponent, since the DNN architecture has the inherent limitation mentioned in the earlier section. 91 | 92 | ![pic3](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/main_results_plot.png) 93 | 94 | The max Q-value of the action model is also plotted. It shows a nice convergence over the length of the game. Correspondingly, the Q-value is disturbed during the transition from one stage to another, when the opponent changes its play strategy. During that time, rewards are not received under the previous policy's strategy and the Q-value suffers accordingly. However, exploration allows a new policy to be learned by building up new memories; the new winning counter-move eventually bubbles back up to the top and the win rate recovers. 95 | 96 | # Using LSTM as policy and target network 97 | 98 | As a second phase to this project, I have evolved the design to utilize an LSTM-based RNN in both the policy (action) and target networks. The architecture is rather brute force and is illustrated below. The intuition is that an LSTM should over time learn to perform better than a simple DNN given its innate ability to recognize sequential patterns. In a nutshell, the DDQN's original DNN model (for both the action and target model) is swapped with an LSTM. Since the LSTM deals with, and is trained on, sequences, the overall data preparation design is changed to adjust for this arrangement: 99 | 1) the input to the LSTM is a sequence of states. The sequence is of length 'lookback' 100 | 2) the DDQN experience replay concept is still retained. However, each experience is now a sequence of states of length lookback. For each experience, the code samples a location from the deque memory and then retrieves the immediately prior lookback number of states. The code then repeats the same retrieval process RL_batch number of times (see the sketch below). 
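A minimal sketch of this per-sample retrieval, simplified from DDQN.replay() in rps_deepRL_withLSTM.py; the helper name sample_sequence is illustrative, the deque entries are [state, action, reward, new_state, done], and the sequences are stacked newest-first as in that file:

```python
# Sketch of assembling one lookback-length experience from the replay deque,
# simplified from DDQN.replay() in rps_deepRL_withLSTM.py.
import random
import numpy as np

def sample_sequence(memory, lookback):
    """Return (state_seq, new_state_seq) shaped (1, lookback, state_dim) for the LSTM."""
    if len(memory) <= lookback:
        return None                                      # not enough history yet
    a = random.randint(0, len(memory) - lookback)        # random anchor within the deque
    states, new_states = [], []
    for j in range(lookback):
        state, _, _, new_state, _ = memory[-(a + j + 1)] # walk backwards through consecutive moves
        states.append(state[0])
        new_states.append(new_state[0])
    # stack into the 3-D tensors Keras expects: (batch=1, timesteps=lookback, features)
    return np.array(states)[None, :, :], np.array(new_states)[None, :, :]
```

The action model is then fit on the state sequence with a TD target computed from the new-state sequence, exactly as in the DNN version but with 3-D tensors.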
101 | 102 | ![pic4](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/LSTM_based_ddqn_architecture.png) 103 | 104 | In general, this change in architecture did not yield any significant breakthrough in results (and to some degree, it performs worse than a simple classically trained static LSTM model). 105 | 106 | 1) It did not perform well (meaning the win-tie-lost rates are roughly 33% each) when the opponent is 'PRNG'. The opponent is simply too high dimensional. 107 | 2) Some testing was done against a simpler 12-bit 'LFSR', which means the sequence repeats itself after at most 4,095 moves. 108 | - GRU vs LSTM were used with no apparent difference 109 | - the hyperparameters varied are (a) lookback length, (b) inner LSTM unit size, and (c) experience replay batch size. 110 | - a flattened dense layer architecture versus a basic many-to-one LSTM architecture were tried, and neither provides any apparent advantage (neither one improves the performance) 111 | - see the captured results below. 112 | 3) Further testing was conducted on a short (approx. 30 moves) self-entered r-p-s sequence. With a fairly small LSTM, a consistently higher win rate is observed. But this does not surpass the performance of a classical (supervised learning) approach using a statically-trained LSTM. 113 | 114 | All in all, this LSTM architecture working within a DDQN structure performs rather poorly, and a better design is desirable. 115 | 116 | ![pic5](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/figure_LSTM_lkbk200_unit100_1stage.png) 117 | 118 | ![pic6](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/figure_LSTM_lkbk100_unit50_SEQ1.png) 119 | 120 | # Future Works 121 | 122 | We will need to rethink a more appropriate LSTM-based solution to improve the win rate performance. 123 | 124 | # Reference 125 | 126 | 1. Mnih et al., "Playing Atari with Deep Reinforcement Learning", 2013 127 | 2. van Hasselt et al., "Deep Reinforcement Learning with Double Q-learning", 2015 128 | 3. Mnih et al., "Human-Level Control through Deep Reinforcement Learning", 2015 129 | 4. Packer et al., "Assessing Generalization in Deep Reinforcement Learning" 130 | 5. A. Oppermann, "Self-Learning AI Agents Part II: Deep Q-Learning", 2018 131 | 6. 
Watkins & Dayan, "Q-Learning", 1992 132 | 133 | -------------------------------------------------------------------------------- /rps_deepRL.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | from keras.models import Sequential 6 | from keras.layers import Dense, Dropout 7 | from keras.optimizers import Adam 8 | 9 | from collections import deque 10 | from randmove import genOneMove 11 | 12 | import matplotlib.pyplot as plt 13 | from matplotlib import style 14 | style.use('ggplot') 15 | 16 | # -------------------------- SETTING UP THE ENVIRONMENT -------------------------------------- 17 | # simple game, therefore we are not using the open gym custom set up 18 | #--------------------------------------------------------------------------------------------- 19 | class RPSenv(): 20 | def __init__ (self): 21 | self.action_space = [0,1,2] # integer representation of r/p/s 22 | self.seed = random.seed(4) # make it deterministic 23 | self.norm_mu = 0 # center point for guassian distribution 24 | self.norm_sigma = 2.0 # sigma for std distribution 25 | self.seqIndex = 0 # index for pointing to the SEQ sequnce 26 | self.p2Mode = 'PRNG' # SEQ or PRNG or LFSR 27 | self.p2Count = [0, 0, 0] # player 2 win tie lost count 28 | self.p1Count = [0, 0, 0] # player 1 win tie lost count 29 | self.window = 10 # window size for rate trending calc 30 | self.cumWinRate, self.cumTieRate, self.cumLostRate = None, None, None 31 | self.cumWinCount, self.cumTieCount, self.cumLostCount = None, None, None 32 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 33 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 34 | self.winRateBuf, self.tieRateBuf, self.lostRateBuf \ 35 | = deque(maxlen=self.window), deque(maxlen=self.window), deque(maxlen=self.window) 36 | # put all the observation state in here; shape in Keras input format 37 | self.state = np.array([[ \ 38 | None, None, None, \ 39 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 40 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 41 | ]]) 42 | 43 | def reset(self): 44 | # reset all the state 45 | self.cumWinRate, self.cumTieRate, self.cumLostRate = 0, 0, 0 46 | self.cumWinCount, self.cumTieCount, self.cumLostCount = 0, 0, 0 47 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 48 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 49 | return np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) 50 | 51 | def step(self, action, moveCount, stage): 52 | # value mode is PRNG or SEQ 53 | p2Move = genOneMove(self, self.p2Mode, stage) # play one move from player2 54 | self.p2Count[p2Move] += 1 55 | p1Move = action 56 | self.p1Count[p1Move] += 1 57 | 58 | # check who won, set flag and assign reward 59 | win, tie, lost = 0, 0, 0 60 | if p1Move == p2Move: 61 | self.cumTieCount, tie = self.cumTieCount + 1, 1 62 | elif (p1Move - p2Move == 1) or (p1Move - p2Move == -2): 63 | self.cumWinCount, win = self.cumWinCount + 1, 1 64 | else: 65 | self.cumLostCount, lost = self.cumLostCount + 1, 1 66 | 67 | # update the running rates 68 | self.cumWinRate = self.cumWinCount / moveCount 69 | self.cumTieRate = self.cumTieCount / moveCount 70 | self.cumLostRate = self.cumLostCount / moveCount 71 | # update moving avg buffer 72 | self.winRateBuf.append(self.cumWinRate) 73 | self.tieRateBuf.append(self.cumTieRate) 74 | self.lostRateBuf.append(self.cumLostRate) 75 | # calculate trend 76 
| tmp = [0, 0, 0] 77 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 78 | if moveCount >= self.window: 79 | tmp[0] = sum(self.winRateBuf[i] for i in range(self.window)) / self.window 80 | tmp[1] = sum(self.tieRateBuf[i] for i in range(self.window)) / self.window 81 | tmp[2] = sum(self.lostRateBuf[i] for i in range(self.window)) / self.window 82 | # win rate trend analysis 83 | if self.winRateMovingAvg < tmp[0]: 84 | self.winRateTrend = 1 # win rate trending up. That's good 85 | else: 86 | self.winRateTrend = 0 # win rate trending down. That's bad 87 | # tie rate trend analysis 88 | if self.tieRateMovingAvg < tmp[1]: 89 | self.tieRateTrend = 1 # tie rate trending up. That's bad 90 | else: 91 | self.tieRateTrend = 0 # tie rate trending down. Neutral 92 | # lost rate trend analysis 93 | if self.lostRateMovingAvg < tmp[2]: 94 | self.lostRateTrend = 1 # lst rate trending up. That's bad 95 | else: 96 | self.lostRateTrend = 0 # lost rate trending down. That's good 97 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = tmp[0], tmp[1], tmp[2] 98 | # net reward in this round 99 | reward = win 100 | # record the state and reshape it for Keras input format 101 | dim = self.state.shape[1] 102 | self.state = np.array([\ 103 | win, tie, lost, \ 104 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 105 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 106 | ]).reshape(1, dim) 107 | # this game is done when it hits this goal 108 | done = False 109 | return self.state, reward, done, dim 110 | 111 | # ------------------------- class for the Double-DQN agent --------------------------------- 112 | # facilities utilized here: 113 | # 1) Double DQN networks: one for behavior policy, one for target policy 114 | # 2) Learn from sample from pool of memories 115 | # 3) Basic TD-Learning stuff: learning rate, gamma for discounting future rewards 116 | # 4) Use of epsilon-greedy policy for controlling exploration vs exploitation 117 | #------------------------------------------------------------------------------------------- 118 | class DDQN: 119 | def __init__(self, env): 120 | self.env = env 121 | # initialize the memory and auto drop when memory exceeds maxlen 122 | # this controls how far out in history the "expeience replay" can select from 123 | self.memory = deque(maxlen=2000) 124 | # future reward discount rate of the max Q of next state 125 | self.gamma = 0.9 126 | # epsilon denotes the fraction of time dedicated to exploration (as oppse to exploitation) 127 | self.epsilon = 1.0 128 | self.epsilon_min = 0.01 129 | self.epsilon_decay = 0.9910 130 | # model learning rate (use in backprop SGD process) 131 | self.learning_rate = 0.005 132 | # transfer learning proportion contrl between the target and action/behavioral NN 133 | self.tau = .125 134 | # create two models for double-DQN implementation 135 | self.model = self.create_model() 136 | self.target_model = self.create_model() 137 | # some space to collect TD target for instrumentaion 138 | self.TDtargetdelta, self.TDtarget = [], [] 139 | self.Qmax =[] 140 | 141 | def create_model(self): 142 | model = Sequential() 143 | state_shape = self.env.state.shape[1] 144 | model.add(Dense(24, input_dim=state_shape, activation="relu")) 145 | model.add(Dense(24, activation="relu")) 146 | model.add(Dense(24, activation="relu")) 147 | # let the output be the predicted target value. NOTE: do not use activation to squash it! 
148 | model.add(Dense(len(self.env.action_space))) 149 | model.compile(loss="mean_squared_error", optimizer=Adam(lr=self.learning_rate)) 150 | print(model.summary()) 151 | 152 | return model 153 | 154 | def act(self, state): 155 | # this is to take one action 156 | self.epsilon *= self.epsilon_decay 157 | self.epsilon = max(self.epsilon_min, self.epsilon) 158 | # decide to take a random exploration or make a policy-based action (thru NN prediction) 159 | if np.random.random() < self.epsilon: 160 | # return a random move from action space 161 | return random.choice(self.env.action_space) 162 | else: 163 | # return a policy move 164 | self.Qmax.append(max(self.model.predict(state)[0])) 165 | return np.argmax(self.model.predict(state)[0]) 166 | 167 | def remember(self, state, action, reward, new_state, done): 168 | # store up a big pool of memory 169 | self.memory.append([state, action, reward, new_state, done]) 170 | 171 | def replay(self): # DeepMind "experience replay" method 172 | # the sample size from memory to learn from 173 | batch_size = 32 174 | # do nothing untl the memory is large enough 175 | if len(self.memory) < batch_size: return 176 | # get the samples 177 | samples = random.sample(self.memory, batch_size) 178 | # do the training (learning); this is DeepMind tricks of using "Double" model (Mnih 2015) 179 | for sample in samples: 180 | state, action, reward, new_state, done = sample 181 | target = self.target_model.predict(state) 182 | #print('target at state is ', target) 183 | if done: 184 | target[0][action] = reward 185 | else: 186 | Q_future = max(self.target_model.predict(new_state)[0]) 187 | TDtarget = reward + Q_future * self.gamma 188 | self.TDtarget.append(TDtarget) 189 | self.TDtargetdelta.append(TDtarget - target[0][action]) 190 | target[0][action] = TDtarget 191 | # do one pass gradient descend using target as 'label' to train the action model 192 | self.model.fit(state, target, epochs=1, verbose=0) 193 | 194 | def target_train(self): 195 | # transfer weights proportionally from the action/behave model to the target model 196 | weights = self.model.get_weights() 197 | target_weights = self.target_model.get_weights() 198 | for i in range(len(target_weights)): 199 | target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau) 200 | self.target_model.set_weights(target_weights) 201 | 202 | def save_model(self, fn): 203 | self.model.save(fn) 204 | 205 | # ------------------------- MAIN BODY ---------------------------------------- 206 | 207 | def main(): 208 | episodes, trial_len = 150, 300 # lenght of game play 209 | stage, totalStages = 0, 5 # # of stages with change distribution 210 | sigma_reduce = -0.1 # sigma change amount at each stage 211 | cumReward, argmax = 0, 0 # init for intrumentation 212 | steps, rateTrack = [], [] 213 | avgQmaxList, avgQ_futureList,avgQ_targetmaxList, avgTDtargetList = [], [], [], [] 214 | avgCumRewardList = [] 215 | p1Rate, p2Rate = [], [] 216 | # declare the game play environment and AI agent 217 | env = RPSenv() 218 | dqn_agent = DDQN(env=env) 219 | # ------------------------------------------ start the game ----------------------------------------- 220 | print('STARTING THE GAME with %s episodes each with %s moves' % (episodes, trial_len), '\n') 221 | for episode in range(episodes): 222 | cur_state = env.reset().reshape(1,env.state.shape[1]) # reset and get initial state in Keras shape 223 | cumReward = 0 224 | # ----------------- Select play strategy (apply to PRNG mode only) ------------------------------ 225 | 
# this change strategy affects distribution of r-p-s 226 | # ----------------------------------------------------------------------------------------------- 227 | if (episode+1) % (episodes // totalStages) == 0: env.norm_sigma += sigma_reduce # step repsonse change the gaussian distribution 228 | # this change strategy affects which move is dominant (control in randmove module) 229 | stage = episode // (episodes // totalStages) # divide total episodes into 3 equal length stages 230 | 231 | for step in range(trial_len): 232 | # AI agent take one action 233 | action = dqn_agent.act(cur_state) 234 | # play the one move and see how the environment reacts to it 235 | new_state, reward, done, info = env.step(action, step + 1, stage) 236 | cumReward += reward 237 | # record the play into memory pool 238 | dqn_agent.remember(cur_state, action, reward, new_state, done) 239 | # perform Q-learning from using |"experience replay": learn from random samples in memory 240 | dqn_agent.replay() 241 | # apply tranfer learning from actions model to the target model. 242 | dqn_agent.target_train() 243 | # update the current state with environment new state 244 | cur_state = new_state 245 | if done: break 246 | #-------------------------------- INSTRUMENTAL AND PLOTTING ------------------------------------------- 247 | # the instrumental are performed at the end of each episode 248 | # store epsiode #, winr rate, tie rate, lost rate, etc. etc. 249 | #------------------------------------------------------------------------------------------------------ 250 | rateTrack.append([episode+1, env.cumWinRate, env.cumTieRate, env.cumLostRate]) 251 | if True: # print ongoing performance 252 | print('EPISODE ', episode + 1), 253 | if env.p2Mode == 'PRNG': print('stage:', stage, ' sigma:', env.norm_sigma) 254 | print(' WIN RATE %.2f ' % env.cumWinRate, \ 255 | ' tie rate %.2f' % env.cumTieRate, \ 256 | 'lose rate %.2f' % env.cumLostRate) 257 | 258 | # print move distribution between the players 259 | if True: 260 | p1Rate.append([env.p1Count[0] / trial_len, env.p1Count[1] / trial_len, env.p1Count[2] / trial_len]) 261 | p2Rate.append([env.p2Count[0] / trial_len, env.p2Count[1] / trial_len, env.p2Count[2] / trial_len]) 262 | print (' P1 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p1Rate[-1][0], p1Rate[-1][1], p1Rate[-1][2])) 263 | print (' P2 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p2Rate[-1][0], p2Rate[-1][1], p2Rate[-1][2])) 264 | env.p1Count, env.p2Count = [0,0,0], [0,0,0] 265 | 266 | # summarize Qmax from action model and reward 267 | avgQmax = sum(dqn_agent.Qmax) / trial_len # from action model 268 | avgQmaxList.append(avgQmax) 269 | 270 | avgCumReward = cumReward / trial_len 271 | avgCumRewardList.append(avgCumReward) 272 | if True: 273 | print(' Avg reward: %.2f Avg Qmax: %.2f' % (avgCumReward, avgQmax)) 274 | dqn_agent.Qmax=[] # reset for next episode 275 | 276 | 277 | # ---------------- plot the main plot when all the episodes are done --------------------------- 278 | # 279 | if True: 280 | fig = plt.figure(figsize=(12,5)) 281 | plt.subplots_adjust(wspace = 0.2, hspace = 0.2) 282 | 283 | # plot the average Qmax 284 | rpsplot = fig.add_subplot(321) 285 | plt.title('Average Qmax from action model', loc='Left', weight='bold', color='Black', \ 286 | fontdict = {'fontsize' : 10}) 287 | rpsplot.plot(avgQmaxList, color='blue') 288 | 289 | # plot the TDtarget 290 | rpsplot = fig.add_subplot(323) 291 | plt.title('TD target minus Q target from experience replay', loc='Left', weight='bold', \ 
292 | color='Black', fontdict = {'fontsize' : 10}) 293 | rpsplot.plot(dqn_agent.TDtarget, color='blue') 294 | 295 | # plot the TDtarget 296 | rpsplot = fig.add_subplot(325) 297 | plt.title('TD target from experience replay', loc='Left', weight='bold', color='Black', \ 298 | fontdict = {'fontsize' : 10}) 299 | rpsplot.plot(dqn_agent.TDtargetdelta, color='blue') 300 | 301 | # plot thte win rate 302 | rpsplot = fig.add_subplot(322) 303 | plt.title('Win-Tie-Lost Rate', loc='Left', weight='bold', color='Black', \ 304 | fontdict = {'fontsize' : 10}) 305 | rpsplot.plot([i[1] for i in rateTrack], color='green') 306 | rpsplot.plot([i[2] for i in rateTrack], color='blue') 307 | rpsplot.plot([i[3] for i in rateTrack], color='red') 308 | 309 | # plot thte win rate 310 | rpsplot = fig.add_subplot(324) 311 | plt.title('Player 2 move percentage', loc='Left', weight='bold', color='Black', \ 312 | fontdict = {'fontsize' : 10}) 313 | rpsplot.plot([i[0] for i in p2Rate], color='orange') 314 | rpsplot.plot([i[1] for i in p2Rate], color='red') 315 | rpsplot.plot([i[2] for i in p2Rate], color='green') 316 | 317 | # plot the reward 318 | rpsplot = fig.add_subplot(326) 319 | plt.title('Average Reward per Episode', loc='Left', weight='bold', color='Black', \ 320 | fontdict = {'fontsize' : 10}) 321 | rpsplot.plot(avgCumRewardList, color='green') 322 | plt.show(block = False) 323 | 324 | if __name__ == "__main__": 325 | main() 326 | -------------------------------------------------------------------------------- /rps_deepRL_withLSTM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | from keras.models import Sequential 6 | from keras.layers import Dense, Dropout, LSTM, GRU, TimeDistributed, Flatten 7 | from keras.optimizers import Adam 8 | 9 | from collections import deque 10 | from randmove import genOneMove 11 | 12 | import matplotlib.pyplot as plt 13 | from matplotlib import style 14 | style.use('ggplot') 15 | 16 | # -------------------------- SETTING UP THE ENVIRONMENT -------------------------------------- 17 | # simple game, therefore we are not using the open gym custom set up 18 | #--------------------------------------------------------------------------------------------- 19 | class RPSenv(): 20 | def __init__ (self): 21 | self.action_space = [0,1,2] # integer representation of r/p/s 22 | self.seed = random.seed(4) # make it deterministic 23 | self.norm_mu = 0 # center point for guassian distribution 24 | self.norm_sigma = 2.0 # sigma for std distribution 25 | self.seqIndex = 0 # index for pointing to the SEQ sequnce 26 | self.p2Mode = 'SEQ' # SEQ or PRNG or LFSR 27 | self.p2Count = [0, 0, 0] # player 2 win tie lost count 28 | self.p1Count = [0, 0, 0] # player 1 win tie lost count 29 | self.window = 10 # window size for rate trending calc 30 | self.cumWinRate, self.cumTieRate, self.cumLostRate = None, None, None 31 | self.overallWinRate, self.overallTieRate, self.overallLostRate = 0, 0, 0 32 | self.cumWinCount, self.cumTieCount, self.cumLostCount = None, None, None 33 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 34 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 35 | self.winRateBuf, self.tieRateBuf, self.lostRateBuf \ 36 | = deque(maxlen=self.window), deque(maxlen=self.window), deque(maxlen=self.window) 37 | # put all the observation state in here; shape in Keras input format 38 | self.state = np.array([[ \ 39 | None, None, None, \ 40 | 
self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 41 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 42 | ]]) 43 | 44 | def reset(self): 45 | # reset all the state 46 | self.cumWinRate, self.cumTieRate, self.cumLostRate = 0, 0, 0 47 | self.cumWinCount, self.cumTieCount, self.cumLostCount = 0, 0, 0 48 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 49 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 50 | return np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) 51 | 52 | def step(self, action, moveCount, stage): 53 | # value mode is PRNG or SEQ 54 | p2Move = genOneMove(self, self.p2Mode, stage) # play one move from player2 55 | self.p2Count[p2Move] += 1 56 | p1Move = action 57 | self.p1Count[p1Move] += 1 58 | 59 | # check who won, set flag and assign reward 60 | win, tie, lost = 0, 0, 0 61 | if p1Move == p2Move: 62 | self.cumTieCount, tie = self.cumTieCount + 1, 1 63 | elif (p1Move - p2Move == 1) or (p1Move - p2Move == -2): 64 | self.cumWinCount, win = self.cumWinCount + 1, 1 65 | else: 66 | self.cumLostCount, lost = self.cumLostCount + 1, 1 67 | 68 | # update the running rates 69 | self.cumWinRate = self.cumWinCount / moveCount 70 | self.cumTieRate = self.cumTieCount / moveCount 71 | self.cumLostRate = self.cumLostCount / moveCount 72 | # update moving avg buffer 73 | self.winRateBuf.append(self.cumWinRate) 74 | self.tieRateBuf.append(self.cumTieRate) 75 | self.lostRateBuf.append(self.cumLostRate) 76 | # calculate trend 77 | tmp = [0, 0, 0] 78 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 79 | if moveCount >= self.window: 80 | tmp[0] = sum(self.winRateBuf[i] for i in range(self.window)) / self.window 81 | tmp[1] = sum(self.tieRateBuf[i] for i in range(self.window)) / self.window 82 | tmp[2] = sum(self.lostRateBuf[i] for i in range(self.window)) / self.window 83 | # win rate trend analysis 84 | if self.winRateMovingAvg < tmp[0]: 85 | self.winRateTrend = 1 # win rate trending up. That's good 86 | else: 87 | self.winRateTrend = 0 # win rate trending down. That's bad 88 | # tie rate trend analysis 89 | if self.tieRateMovingAvg < tmp[1]: 90 | self.tieRateTrend = 1 # tie rate trending up. That's bad 91 | else: 92 | self.tieRateTrend = 0 # tie rate trending down. Neutral 93 | # lost rate trend analysis 94 | if self.lostRateMovingAvg < tmp[2]: 95 | self.lostRateTrend = 1 # lst rate trending up. That's bad 96 | else: 97 | self.lostRateTrend = 0 # lost rate trending down. 
That's good 98 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = tmp[0], tmp[1], tmp[2] 99 | # net reward in this round 100 | reward = win 101 | # record the state and reshape it for Keras input format 102 | dim = self.state.shape[1] 103 | self.state = np.array([\ 104 | win, tie, lost, \ 105 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 106 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 107 | ]).reshape(1, dim) 108 | # this game is done when it hits this goal 109 | done = False 110 | return self.state, reward, done, dim 111 | 112 | # ------------------------- class for the Double-DQN agent --------------------------------- 113 | # facilities utilized here: 114 | # 1) Double DQN networks: one for behavior policy, one for target policy 115 | # 2) Learn from sample from pool of memories 116 | # 3) Basic TD-Learning stuff: learning rate, gamma for discounting future rewards 117 | # 4) Use of epsilon-greedy policy for controlling exploration vs exploitation 118 | #------------------------------------------------------------------------------------------- 119 | class DDQN: 120 | def __init__(self, env): 121 | self.env = env 122 | # initialize the memory and auto drop when memory exceeds maxlen 123 | # this controls how far out in history the "expeience replay" can select from 124 | self.maxlen = 3000 125 | self.memory = deque(maxlen = self.maxlen) 126 | # future reward discount rate of the max Q of next state 127 | self.gamma = 0.9 128 | # epsilon denotes the fraction of time dedicated to exploration (as oppse to exploitation) 129 | self.epsilon = 1.0 130 | self.epsilon_min = 0.01 131 | self.epsilon_decay = 0.9910 132 | # model learning rate (use in backprop SGD process) 133 | self.learning_rate = 0.005 134 | # transfer learning proportion contrl between the target and action/behavioral NN 135 | self.tau = .125 136 | # hyyperparameters for LSTM 137 | self.lookback = 100 138 | self.hiddenUnits = 50 139 | # create two models for double-DQN implementation 140 | self.model = self.create_model() 141 | self.target_model = self.create_model() 142 | # some space to collect TD target for instrumentaion 143 | self.TDtargetdelta, self.TDtarget = [], [] 144 | self.Qmax =[] 145 | 146 | 147 | def create_model(self): 148 | input_feature_dim = self.env.state.shape[1] 149 | output_feature_dim = len(self.env.action_space) 150 | model = Sequential() 151 | #model.add(GRU(self.hiddenUnits,\ 152 | model.add(LSTM(self.hiddenUnits,\ 153 | return_sequences = False,\ 154 | #activation = None, \ 155 | #recurrent_activation = None, \ 156 | input_shape = (self.lookback, input_feature_dim))) 157 | # let the output be the predicted target value. NOTE: do not use activation to squash it! 
158 | #model.add(TimeDistributed(Dense(4))) 159 | #model.add(Flatten()) 160 | model.add(Dense(output_feature_dim)) 161 | model.compile(loss="mean_squared_error", optimizer=Adam(lr=self.learning_rate)) 162 | print(model.summary()) 163 | return model 164 | 165 | def act(self, state, step): 166 | # this is to take one action 167 | self.epsilon *= self.epsilon_decay 168 | self.epsilon = max(self.epsilon_min, self.epsilon) 169 | # decide to take a random exploration or make a policy-based action (thru NN prediction) 170 | # with a LSTM design, delay on policy prediction after at least lookback steps have accumlated 171 | if np.random.random() < self.epsilon or step < self.lookback + 1: 172 | # return a random move from action space 173 | return random.choice(self.env.action_space) 174 | else: 175 | # return a policy move 176 | state_set = np.empty((1, self.env.state.shape[1])) # iniitial with 2 dims 177 | for j in range(self.lookback): 178 | state_tmp, _, _, _, _ = self.memory[-(j+1)] # get the most recent state and the previous N states 179 | if j == 0: 180 | state_set[0] = state_tmp # iniitalize the first record 181 | else: 182 | state_set = np.concatenate((state_set, state_tmp), axis = 0) # get a consecutive set of states for LSTM prediction 183 | state_set = state_set[None, :, :] # make the tensor 3 dim to align with Keras reqmt 184 | #print(state_set) 185 | #print(state_set.shape) 186 | self.Qmax.append(max(self.model.predict(state_set)[0])) 187 | return np.argmax(self.model.predict(state_set)[0]) 188 | 189 | def remember(self, state, action, reward, new_state, done): 190 | # store up a big pool of memory 191 | self.memory.append([state, action, reward, new_state, done]) 192 | 193 | def replay(self): 194 | # DeepMind "experience replay" method 195 | # do the training (learning); this is DeepMind tricks of using "Double" model (Mnih 2015) 196 | # the sample size from memory to learn from 197 | #------------------------ 198 | # do nothing untl the memory is large enough 199 | RL_batch_size = 24 # this is experience replay batch_size (not the LSTM fitting batch size) 200 | if len(self.memory) < RL_batch_size: return 201 | # get the samples; each sample is a sequence of consecutive states with same lookback length as LSTM definition 202 | for i in range(RL_batch_size): 203 | state_set = np.empty((1, self.env.state.shape[1])) 204 | new_state_set = np.empty((1, self.env.state.shape[1])) 205 | if len(self.memory) <= self.lookback: # check if memory is large enough to retrieve the time sequence 206 | return 207 | else: 208 | a = random.randint(0, len(self.memory) - self.lookback) # first get a random location 209 | state, action, reward, new_state, done = self.memory[-(a+1)] # retrieve a sample from memory at that loc; latest element at the end of deque 210 | for j in range(self.lookback): 211 | state_tmp, _, _, new_state_tmp, _ = self.memory[-(a+j+1)] # get a consecutive set of states 212 | if j == 0: 213 | state_set[0] = state_tmp 214 | new_state_set[0] = new_state_tmp 215 | else: 216 | state_set = np.concatenate((state_set, state_tmp), axis = 0) # get a consecutive set of states for LSTM prediction 217 | new_state_set = np.concatenate((new_state_set, new_state_tmp), axis = 0) 218 | # do the prediction from current state 219 | state_set = state_set[None, :, :] # make the tensor 3 dim to align with Keras reqmt 220 | new_state_set = new_state_set[None, :, :] # make the tensor 3 dim to align with Keras reqmt 221 | target = self.target_model.predict(state_set) 222 | # do the Q leanring 223 | if 
done: 224 | target[0][action] = reward 225 | else: 226 | Q_future = max(self.target_model.predict(new_state_set)[0]) 227 | TDtarget = reward + Q_future * self.gamma 228 | self.TDtarget.append(TDtarget) 229 | self.TDtargetdelta.append(TDtarget - target[0][action]) 230 | target[0][action] = TDtarget 231 | # do one pass gradient descend using target as 'label' to train the action model 232 | self.model.fit(state_set, target, batch_size = 1, epochs = 1, verbose = 0) 233 | 234 | def target_train(self): 235 | # transfer weights proportionally from the action/behave model to the target model 236 | weights = self.model.get_weights() 237 | target_weights = self.target_model.get_weights() 238 | for i in range(len(target_weights)): 239 | target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau) 240 | self.target_model.set_weights(target_weights) 241 | 242 | def save_model(self, fn): 243 | self.model.save(fn) 244 | 245 | # ------------------------- MAIN BODY ---------------------------------------- 246 | 247 | def main(): 248 | episodes, trial_len = 50, 200 # lenght of game play 249 | stage, totalStages = 0, 2 # # of stages with change distribution 250 | sigma_reduce = -0.1 # sigma change amount at each stage 251 | cumReward, argmax = 0, 0 # init for intrumentation 252 | steps, rateTrack, overallRateTrack = [], [], [] 253 | avgQmaxList, avgQ_futureList,avgQ_targetmaxList, avgTDtargetList = [], [], [], [] 254 | avgCumRewardList = [] 255 | p1Rate, p2Rate = [], [] 256 | # declare the game play environment and AI agent 257 | env = RPSenv() 258 | dqn_agent = DDQN(env=env) 259 | # ------------------------------------------ start the game ----------------------------------------- 260 | print('STARTING THE GAME with %s episodes each with %s moves' % (episodes, trial_len), '\n') 261 | for episode in range(episodes): 262 | cur_state = env.reset().reshape(1,env.state.shape[1]) # reset and get initial state in Keras shape 263 | cumReward = 0 264 | # ----------------- Select play strategy (apply to PRNG mode only) ------------------------------ 265 | # this change strategy affects distribution of r-p-s 266 | # ----------------------------------------------------------------------------------------------- 267 | if (episode+1) % (episodes // totalStages) == 0: env.norm_sigma += sigma_reduce # step repsonse change the gaussian distribution 268 | # this change strategy affects which move is dominant (control in randmove module) 269 | stage = episode // (episodes // totalStages) # divide total episodes into 3 equal length stages 270 | 271 | for step in range(trial_len): 272 | # AI agent take one action 273 | action = dqn_agent.act(cur_state, step) 274 | # play the one move and see how the environment reacts to it 275 | new_state, reward, done, info = env.step(action, step + 1, stage) 276 | cumReward += reward 277 | # record the play into memory pool 278 | dqn_agent.remember(cur_state, action, reward, new_state, done) 279 | # perform Q-learning from using |"experience replay": learn from random samples in memory 280 | dqn_agent.replay() 281 | # apply tranfer learning from actions model to the target model. 282 | dqn_agent.target_train() 283 | # update the current state with environment new state 284 | cur_state = new_state 285 | if done: break 286 | #-------------------------------- INSTRUMENTAL AND PLOTTING ------------------------------------------- 287 | # the instrumental are performed at the end of each episode 288 | # store epsiode #, winr rate, tie rate, lost rate, etc. etc. 
289 | #------------------------------------------------------------------------------------------------------ 290 | rateTrack.append([episode+1, env.cumWinRate, env.cumTieRate, env.cumLostRate]) 291 | env.overallWinRate += env.cumWinRate 292 | env.overallTieRate += env.cumTieRate 293 | env.overallLostRate += env.cumLostRate 294 | overallRateTrack.append([episode+1, 295 | env.overallWinRate / (episode +1), \ 296 | env.overallTieRate / (episode +1), \ 297 | env.overallLostRate / (episode +1),]) 298 | if True: # print ongoing performance 299 | print('EPISODE ', episode + 1), 300 | if env.p2Mode == 'PRNG': print('stage:', stage, ' sigma:', env.norm_sigma) 301 | print(' WIN RATE %.2f ' % env.cumWinRate, \ 302 | ' tie rate %.2f' % env.cumTieRate, \ 303 | 'lose rate %.2f' % env.cumLostRate) 304 | 305 | # print move distribution between the players 306 | if True: 307 | p1Rate.append([env.p1Count[0] / trial_len, env.p1Count[1] / trial_len, env.p1Count[2] / trial_len]) 308 | p2Rate.append([env.p2Count[0] / trial_len, env.p2Count[1] / trial_len, env.p2Count[2] / trial_len]) 309 | print (' P1 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p1Rate[-1][0], p1Rate[-1][1], p1Rate[-1][2])) 310 | print (' P2 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p2Rate[-1][0], p2Rate[-1][1], p2Rate[-1][2])) 311 | env.p1Count, env.p2Count = [0,0,0], [0,0,0] 312 | 313 | # summarize Qmax from action model and reward 314 | avgQmax = sum(dqn_agent.Qmax) / trial_len # from action model 315 | avgQmaxList.append(avgQmax) 316 | 317 | avgCumReward = cumReward / trial_len 318 | avgCumRewardList.append(avgCumReward) 319 | if True: 320 | print(' Avg reward: %.2f Avg Qmax: %.2f' % (avgCumReward, avgQmax)) 321 | dqn_agent.Qmax=[] # reset for next episode 322 | 323 | 324 | # ---------------- plot the main plot when all the episodes are done --------------------------- 325 | # 326 | if True: 327 | fig = plt.figure(figsize=(12,5)) 328 | plt.subplots_adjust(wspace = 0.2, hspace = 0.2) 329 | 330 | # plot the average Qmax 331 | rpsplot = fig.add_subplot(321) 332 | plt.title('Average Qmax from action model', loc='Left', weight='bold', color='Black', \ 333 | fontdict = {'fontsize' : 10}) 334 | rpsplot.plot(avgQmaxList, color='blue') 335 | 336 | # plot the TDtarget 337 | rpsplot = fig.add_subplot(323) 338 | plt.title('TD target minus Q target from experience replay', loc='Left', weight='bold', \ 339 | color='Black', fontdict = {'fontsize' : 10}) 340 | rpsplot.plot(dqn_agent.TDtarget, color='blue') 341 | 342 | # plot the TDtarget 343 | #rpsplot = fig.add_subplot(325) 344 | #plt.title('TD target from experience replay', loc='Left', weight='bold', color='Black', \ 345 | # fontdict = {'fontsize' : 10}) 346 | #rpsplot.plot(dqn_agent.TDtargetdelta, color='blue') 347 | 348 | # plot thte win rate 349 | rpsplot = fig.add_subplot(322) 350 | plt.title('Win-Tie-Lost Rate', loc='Left', weight='bold', color='Black', \ 351 | fontdict = {'fontsize' : 10}) 352 | rpsplot.plot([i[1] for i in rateTrack], color='green') 353 | rpsplot.plot([i[2] for i in rateTrack], color='blue') 354 | rpsplot.plot([i[3] for i in rateTrack], color='red') 355 | 356 | # plot thte win rate 357 | rpsplot = fig.add_subplot(324) 358 | plt.title('Player-1 Overall Win-Tie-Lost Rate', loc='Left', weight='bold', color='Black', \ 359 | fontdict = {'fontsize' : 10}) 360 | rpsplot.plot([i[1] for i in overallRateTrack], color='green') 361 | rpsplot.plot([i[2] for i in overallRateTrack], color='blue') 362 | rpsplot.plot([i[3] for i in overallRateTrack], color='red') 363 | 
364 | # plot thte win rate 365 | rpsplot = fig.add_subplot(326) 366 | plt.title('Player 2 move percentage', loc='Left', weight='bold', color='Black', \ 367 | fontdict = {'fontsize' : 10}) 368 | rpsplot.plot([i[0] for i in p2Rate], color='orange') 369 | rpsplot.plot([i[1] for i in p2Rate], color='red') 370 | rpsplot.plot([i[2] for i in p2Rate], color='green') 371 | 372 | # plot the reward 373 | rpsplot = fig.add_subplot(325) 374 | plt.title('Average Reward per Episode', loc='Left', weight='bold', color='Black', \ 375 | fontdict = {'fontsize' : 10}) 376 | rpsplot.plot(avgCumRewardList, color='green') 377 | plt.show(block = False) 378 | 379 | if __name__ == "__main__": 380 | main() 381 | --------------------------------------------------------------------------------