├── main_results_plot.png ├── double_dqn_architecture.png ├── LSTM_based_ddqn_architecture.png ├── high_level_play_environment.png ├── figure_LSTM_lkbk100_unit50_SEQ1.png ├── figure_LSTM_lkbk200_unit100_1stage.png ├── .gitattributes ├── decay.py ├── distribution_plot.py ├── lfsr_test.py ├── randmove.py ├── Readme.md ├── rps_deepRL.py └── rps_deepRL_withLSTM.py /main_results_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/main_results_plot.png -------------------------------------------------------------------------------- /double_dqn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/double_dqn_architecture.png -------------------------------------------------------------------------------- /LSTM_based_ddqn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/LSTM_based_ddqn_architecture.png -------------------------------------------------------------------------------- /high_level_play_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/high_level_play_environment.png -------------------------------------------------------------------------------- /figure_LSTM_lkbk100_unit50_SEQ1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/figure_LSTM_lkbk100_unit50_SEQ1.png -------------------------------------------------------------------------------- /figure_LSTM_lkbk200_unit100_1stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dennylslee/rock-paper-scissors-DeepRL/HEAD/figure_LSTM_lkbk200_unit100_1stage.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /decay.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib import style 3 | #from matplotlib import cm 4 | #from matplotlib.ticker import LinearLocator, FormatStrFormatter 5 | style.use('ggplot') 6 | 7 | lsta, lstb, lstc, lstd = [], [], [], [] 8 | a , b, c, d = 1, 1, 1, 1 9 | decay1 = 0.998 10 | decay2 = 0.997 11 | decay3 = 0.996 12 | decay4 = 0.992 13 | 14 | 15 | for i in range(1000): 16 | a, b, c, d = a*decay1, b*decay2, c*decay3, d*decay4 17 | lsta.append(a) 18 | lstb.append(b) 19 | lstc.append(c) 20 | lstd.append(d) 21 | 22 | plt.title('Decay curve', loc='center', weight='bold', color='Black') 23 | plt.plot(lsta, color='blue') 24 | 
plt.plot(lstb, color='orange') 25 | plt.plot(lstc, color='green') 26 | plt.plot(lstd, color='black') 27 | plt.show() -------------------------------------------------------------------------------- /distribution_plot.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.mlab as mlab 5 | from matplotlib import style 6 | style.use('ggplot') 7 | 8 | num_bins = 100 9 | sigma1, sigma2 = 2.0, 0.5 10 | norm_mu = 0 11 | a, b = [], [] 12 | for i in range(10000): 13 | a.append(random.gauss(norm_mu, sigma1)) 14 | b.append(random.gauss(norm_mu, sigma2)) 15 | 16 | plt.title('Distribution Plots', loc='center', weight='bold', color='Black') 17 | 18 | #n, bins1, patches1 = plt.hist(a, num_bins, facecolor='blue', alpha=0.5) 19 | m, bins2, patches2 = plt.hist(b, num_bins, facecolor='red', alpha=0.5) 20 | plt.show(block = False) 21 | 22 | # some test code 23 | teststr = 'start' 24 | for i in range(50): 25 | j = 61 26 | k = 2 27 | teststr = 'start' 28 | if i % (j // k) == 0: teststr = 'change' 29 | print ('i', i, teststr, i % (j // k)) 30 | -------------------------------------------------------------------------------- /lfsr_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib import style 4 | style.use('ggplot') 5 | 6 | 7 | def lfsr2(seed, taps, nbits): 8 | sr = seed 9 | while 1: 10 | xor = 1 11 | for t in taps: 12 | if (sr & (1<<(t-1))) != 0: 13 | xor ^= 1 14 | sr = (xor << nbits-1) + (sr >> 1) 15 | yield xor, sr 16 | if sr == seed: 17 | break 18 | # ---------------------------- main program --------------------------------------------- 19 | # nbits, tapindex, seed = 12, (12,11,10,4,1), 0b11001001 20 | nbits, tapindex, seed = 8, (8,6,5,4,1), 0b00001001 21 | datalist, movelist = [], [] 22 | 23 | # -------------------------- generate the random sequence -------------------------------- 24 | 25 | for xor, sr in lfsr2(seed, tapindex, nbits): # feed the configured seed into the generator 26 | lfsr_gen = int(bin(2**nbits+sr)[3:], base=2) 27 | datalist.append(lfsr_gen) 28 | print (xor, lfsr_gen) 29 | 30 | 31 | for i in datalist: 32 | move = i % 3 # use a mod 3 to create 3 bins 33 | movelist.append(move) 34 | 35 | print ('Player1 RPS LFSR distribution') 36 | print ('Player1 rock:', movelist.count(0)) # count the num of zeros in list (rock) 37 | print ('Player1 paper:', movelist.count(1)) # count the num of ones in list (paper) 38 | print ('Player1 scissors:', movelist.count(2)) # count the num of twos in list (scissors) 39 | print ('total moves:', len(movelist)) 40 | print ('sample moves', movelist[:20]) 41 | 42 | #---------------- print the PDF chart -------------------------------- 43 | 44 | x = np.array(datalist) 45 | nbins = 20 46 | n, bins = np.histogram(x, nbins, density=1) 47 | pdfx = np.zeros(n.size) 48 | pdfy = np.zeros(n.size) 49 | for k in range(n.size): 50 | pdfx[k] = 0.5*(bins[k]+bins[k+1]) 51 | pdfy[k] = n[k] 52 | plt.plot(pdfx, pdfy) # plot the probability distribution function 53 | plt.show(block = False) 54 | -------------------------------------------------------------------------------- /randmove.py: -------------------------------------------------------------------------------- 1 | # This module generates one rock-paper-scissors move based on the selected mode. 
2 | import random 3 | 4 | def lfsr2(seed, taps, nbits): 5 | sr = seed 6 | while 1: 7 | xor = 1 8 | for t in taps: 9 | if (sr & (1<<(t-1))) != 0: 10 | xor ^= 1 11 | sr = (xor << nbits-1) + (sr >> 1) 12 | yield xor, sr 13 | if sr == seed: 14 | break 15 | 16 | def genOneMove(self, mode, stage): 17 | if mode == 'PRNG': 18 | # change play strategy for player2 along the way 19 | lowPlay = {0:0, 1:1, 2:2, 3:0, 4:2} # key = stage number, value = r(0), p(1), s(2) 20 | meanPlay = {0:1, 1:2, 2:0, 3:2, 4:1} # key = stage number, value = r(0), p(1), s(2) 21 | hiPlay = {0:2, 1:0, 2:1, 3:1, 4:0} # key = stage number, value = r(0), p(1), s(2) 22 | # generate a random number from the Gaussian distribution & quantize it 23 | a = random.gauss(self.norm_mu, self.norm_sigma) 24 | if a**2 < 1: # the middle of the bell is the paper move 25 | play = meanPlay[stage] 26 | elif a < -1: # lower than the -1 cutoff is the rock move 27 | play = lowPlay[stage] 28 | else: 29 | play = hiPlay[stage] # else higher than +1 is the scissors move 30 | return play 31 | 32 | elif mode == 'SEQ': # simple repeating pattern as 'random generator' 33 | rpsmap = {'r':0, 'p':1, 's': 2} 34 | seqlist = 'rpprsspsrsrpprspsprspsppsrrspsprrsspsrpsrpsrsps' # the pattern sequence here 35 | self.seqIndex = 0 if self.seqIndex == len(seqlist)-1 else self.seqIndex + 1 36 | return rpsmap[seqlist[self.seqIndex]] 37 | 38 | elif mode == 'LFSR': 39 | nbits, tapindex, seed = 12, (12,11,10,4,1), 0b11001001 40 | #nbits, tapindex, seed = 8, (8,6,5,4,1), 0b11001001 41 | lfsrlist = [] 42 | for xor, sr in lfsr2(seed, tapindex, nbits): 43 | lfsr_gen = int(bin(2**nbits+sr)[3:], base=2) 44 | lfsrlist.append(lfsr_gen % 3) 45 | self.seqIndex = 0 if self.seqIndex == len(lfsrlist)-1 else self.seqIndex + 1 46 | return lfsrlist[self.seqIndex] 47 | 48 | else: 49 | print('Error: random mode does not exist!') 50 | 51 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | The objective of this project is to construct an AI agent that plays a simple two-player rock-paper-scissors game using reinforcement learning (RL) techniques - particularly the double-DQN algorithm. 4 | 5 | In a previous design project, we built a player using a simple LSTM-based neural network trained with a traditional supervised learning method. The downside is that the model is effectively static and does not adapt to model drift or fundamental behavioral changes. With an RL-based approach, we would like to see the AI agent demonstrate some ability to counter the changing strategy of the opponent and continue to generate a better win rate than the opposing player. 6 | 7 | Note that unlike other RL projects in which the AI agent is to learn a task successfully and generalize to future data (i.e. a form of semi-supervised learning using rewards as labels - this is my analogy and is technically not precise), in this particular setup there is no completion per se. That is, the game never ends. The RL agent simply continues to adapt and adjust as best as it can. 8 | 9 | ## Q-Learning Basics 10 | 11 | This project utilizes the double-DQN RL algorithm as the basis for the AI agent playing as player 1. DeepMind was very instrumental in popularizing Q-learning recently and has contributed multiple architectural enhancements in recent years - do note that Q-learning was first pioneered by Watkins (see Watkins & Dayan, 1992). See the popular papers on DQN (deep Q-learning networks) in the reference section. 
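For reference, the tabular Q-learning update that these works build on can be sketched as follows. This is an illustration only and is not part of this repository's code; the deep variants simply replace the Q table with a neural network, and names such as alpha, gamma, and epsilon are the usual learning rate, discount factor, and exploration rate.

```python
# Minimal tabular Q-learning sketch (illustration only, not used by this repo).
import random
from collections import defaultdict

Q = defaultdict(float)             # Q[(state, action)] -> estimated action value
actions = [0, 1, 2]                # rock, paper, scissors
alpha, gamma, epsilon = 0.1, 0.9, 0.1

def choose_action(state):
    if random.random() < epsilon:                        # explore (off-policy move)
        return random.choice(actions)
    return max(actions, key=lambda a: Q[(state, a)])     # exploit (greedy move)

def q_update(state, action, reward, next_state):
    best_next = max(Q[(next_state, a)] for a in actions)
    td_target = reward + gamma * best_next               # Bellman / TD target
    Q[(state, action)] += alpha * (td_target - Q[(state, action)])
```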
12 | 13 | In its essence, Deep Q-Learning learns a "policy" (which in practice is a deep neural network) that maximizes the expected return of future actions. The Q-value is the "action value" function. Its absolute magnitude has no real meaning, but it is recursively derived to guide the system on which action to take at any moment such that the chosen action is believed to lead to the maximum future rewards. 14 | 15 | The enhancements utilized in this design that go above and beyond basic Q-learning are as follows. Most of these were introduced by DeepMind in recent years. 16 | 17 | 1) Dual models (hence the term "double") - one model drives the action decision (also sometimes referred to as the online model or behavior model) and one acts as the target model. The inner weights are "transferred" from the action model to the target model on every move cycle on a discounted basis; a short sketch of this soft update and the TD target follows this list. 18 | 2) Exploitation vs. exploration using the epsilon-greedy method. This is essential in any RL design since exploration is important, especially at the start of the process, for searching different areas of the state space that might lead to a better operating position. Exploring is often termed "off-policy", whereas taking an action according to exploitation is often termed "on-policy". 19 | 3) Experience replay - instead of using only the most recent history as the learning space, DeepMind introduced the concept of experience replay in which past passes through the state space are stored in memory. Such memories are recalled (sampled) at each move and used in the SGD process (thus achieving the reinforcement notion). 
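A minimal sketch of items 1 and 3 above, simplified from the replay() and target_train() methods that appear later in rps_deepRL.py (gamma, tau, the batch size, and the memory layout follow that file):

```python
# Sketch of the double-DQN update and the soft target transfer,
# simplified from DDQN.replay() and DDQN.target_train() in rps_deepRL.py.
import random

def replay_once(model, target_model, memory, batch_size=32, gamma=0.9):
    """Sample past transitions and fit the action model toward the TD target."""
    if len(memory) < batch_size:
        return                                           # wait until enough experience exists
    for state, action, reward, new_state, done in random.sample(memory, batch_size):
        target = target_model.predict(state)             # current Q estimates for this state
        if done:
            target[0][action] = reward
        else:
            q_future = max(target_model.predict(new_state)[0])
            target[0][action] = reward + gamma * q_future # TD target
        model.fit(state, target, epochs=1, verbose=0)     # one SGD pass on the action model

def soft_update(model, target_model, tau=0.125):
    """Blend action-model weights into the target model (discounted transfer)."""
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [w * tau + tw * (1 - tau) for w, tw in zip(weights, target_weights)]
    target_model.set_weights(new_weights)
```

In the main loop of rps_deepRL.py these two steps run once per move: the agent acts, remembers the transition, replays a batch, and then softly updates the target model.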
20 | 21 | ## Acknowledgement 22 | 23 | Much of the code is adapted from A. Oppermann's blog on Medium. It is an excellent tutorial with a detailed walkthrough. You can find it [here](https://towardsdatascience.com/self-learning-ai-agents-part-ii-deep-q-learning-b5ac60c3f47). 24 | 25 | # RPS High Level Environment 26 | 27 | The environment of this game play is depicted below. It follows the classical RL environment definition. 28 | 29 | ![pic1](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/high_level_play_environment.png) 30 | 31 | Inside the "environment" is the embedded player 2 (i.e. the opponent). This player might adopt different types of play strategy. The interaction contains the following: 32 | 33 | 1. action space: either rock, paper, or scissors, which the AI agent (player 1) puts out. 34 | 2. rewards: this is an indication from the environment back to player 1. The reward is simply a value of 1 if it is a win for player 1, and 0 otherwise. 35 | 3. state: this is where the fun is and some creativity comes into play (which might affect the player's winning outcome). In this setup, we have designed the state space as follows (a small sketch of the resulting state vector follows this list): 36 | - win, tie, lost indicators: only one of the three is set to 1 for a particular state 37 | - winRateTrend, tieRateTrend, lostRateTrend: indicators which reflect a positively-trending moving average (set to 1) or not (set to 0). All three indicators are assessed independently. 38 | - winRateMovingAvg, tieRateMovingAvg, lostRateMovingAvg: floating point values between 0 and 1 which indicate the rates. These rates are calculated over a configured moving average window size. 
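For concreteness, a minimal sketch of how these nine values end up as the Keras input, mirroring how RPSenv.step() in rps_deepRL.py packs the state; the helper name build_state is illustrative only:

```python
# Sketch of the nine-element state vector assembled after each move,
# mirroring RPSenv.step() in rps_deepRL.py (shape (1, 9) for the Keras input).
import numpy as np

def build_state(win, tie, lost,
                win_trend, tie_trend, lost_trend,
                win_avg, tie_avg, lost_avg):
    state = np.array([win, tie, lost,
                      win_trend, tie_trend, lost_trend,
                      win_avg, tie_avg, lost_avg], dtype=float)
    return state.reshape(1, -1)          # Keras expects a leading batch dimension

# Example: player 1 just won, win rate trending up, current moving averages
example = build_state(1, 0, 0, 1, 0, 0, 0.42, 0.30, 0.28)
```

The (1, 9) shape matches the input_dim of the Dense Q-network defined later in rps_deepRL.py.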
39 | 40 | 41 | ## RPS AI player architecture using Double-DQN 42 | 43 | The overall architecture of the double-DQN agent design is depicted below. The yellow section is coded inside the step method and is iterated over the experience replay batch of N samples. The green section is the design that controls the exploration vs. exploitation decision. The orange section is the main action model (on-policy). This is the model with which we ultimately want to achieve the optimal policy for making the best possible action. The action model's weights are periodically transferred, with a discount, to the target model. 44 | 45 | ![pic2](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/double_dqn_architecture.png) 46 | 47 | (Nothing really special here in the illustration - just a block diagram version of the commonly published Double-DQN pseudo code) 48 | 49 | ## Player 2 ("the opponent") behaviors 50 | 51 | The player 2 code is embedded in a separate module called randmove. Player 2 can play in 3 modes, which need to be manually configured in the environment class setup: 52 | 53 | 1. "PRNG" uses a Python Gaussian distribution with a controllable mean and sigma value. The larger the sigma, the larger the spread and the more random the sequence appears (which makes it harder to predict). 54 | 2. "SEQ" is a simple manually typed-in sequence of rock-paper-scissors. 55 | 3. "LFSR" is an N-bit pseudo-random generator implemented as a linear feedback shift register of a certain length. Depending on the tap locations, it might or might not exhibit a maximal-length cycle. 56 | 57 | The results below are based mainly on PRNG, in which the play strategy is as follows: 58 | 59 | 1) the overall game play is divided into N (=5) stages, each with an equal number of moves 60 | 2) the moves of player 2 in each stage are generated using the PRNG Gaussian distribution with a different dominant move type (i.e. rock or paper or scissors) 61 | 3) the sigma value is also decreased as the stages progress. 62 | 63 | The overall game objective is to observe if the RL agent can adjust to the changing behaviors and maintain a good win rate. 64 | 65 | ## DNN design 66 | 67 | In this project, we used a simple fully connected DNN, which proves to be one of the main limitations since the RPS game is essentially a time series (sequence) problem. The DNN model can learn a distribution, but it has no notion of sequence. 68 | 69 | For a high dimensional PRNG like the Gaussian generator, a DNN (especially a small one) has no hope of detecting the sequential pattern coming from a complex PRNG generator. However, it is capable of understanding the distribution statistically, and its play strategy is effectively based on observing shifts in statistics. 70 | 71 | ## Hyperparameters 72 | 73 | The following is the set of hyperparameters in the game which can alter the resulting win rate and, in general, the adaptability of the AI agent. 74 | 75 | 1. memory batch size = 32: the larger the batch, the more reinforcement is used on each round, making convergence faster but adaptation slower. 76 | 2. memory maxlen: this is done using a Python collections deque object. The longer the memory, the more experience it retains, which could lead to faster convergence but can negatively affect adaptability since it takes time to flush out the memory when the opponent's behavior changes. 77 | 3. moving window size = 8: changes the smoothing factor of the moving average state variables. 78 | 4. gamma: a factor that determines how much of the future action-value the algorithm incorporates at each step. 79 | 5. DNN layers = 3: effectively the capacity of the NN and its ability to learn the distribution 80 | 6. DNN nodes = 64: same as above 81 | 7. action to target model transfer rate (tau): how fast the target model mimics the action model. (we did not experiment much with this one) 82 | 8. reward system = 1 for win, 0 otherwise: a simple reward that leaves the complexity of adaptation to the NN design and the state space design. 83 | 9. state design: too many state indicators is a waste of computation resources since not all state variables are important. Too few would hinder what the RL agent observes and thus limit its policy-forming ability. 84 | 10. epsilon: exploration percentage, controlled in conjunction with the decay rate. Note that there is a minimum value which is important to adaptability, since the system will continue to explore off-policy moves, and that is how it discovers the opponent's new behavior. 85 | 86 | # Results 87 | 88 | Player 2's move percentage clearly depicts the changing strategy across the different stages of the game - each stage with a different dominant move type and a different spread due to the decreasing sigma value. Player 1 (the RL agent) seems to successfully adapt to the change and eventually wins more due to the weakened player 2 behavior (i.e. it gets less random over time). From that perspective, the code has successfully achieved its main objective. 89 | 90 | However, it is somewhat disappointing that the win rate at any stage does not outperform the statistical behavior of the opponent, since the DNN architecture has the inherent limitation mentioned in the earlier section. 91 | 92 | ![pic3](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/main_results_plot.png) 93 | 94 | The max Q-value of the action model is also plotted. It shows a nice convergence over the length of the game. Correspondingly, the Q-value is disturbed during the transition from one stage to another, when the opponent changes its play strategy. During that time, rewards are not received under the previous policy's strategy and the Q-value suffers accordingly. However, exploration allows a new policy to be learned by building up new memories; the new winning counter-move eventually bubbles back up to the top and the win rate recovers. 95 | 96 | # Using LSTM as policy and target network 97 | 98 | As a second phase to this project, I have evolved the design to utilize an LSTM-based RNN in both the policy (action) and target networks. The architecture is rather brute force and is illustrated below. The intuition is that an LSTM should over time learn to perform better than a simple DNN given its innate ability to recognize sequential patterns. In a nutshell, the DDQN's original DNN model (for both the action and target model) is swapped with an LSTM. Since the LSTM deals with, and is trained on, sequences, the overall data preparation design is changed to adjust for this arrangement: 99 | 1) the input to the LSTM is a sequence of states. The sequence is of length 'lookback' 100 | 2) the DDQN experience replay concept is still retained. However, each experience is now a sequence of states of length lookback. For each experience, the code samples a location from the deque memory and then retrieves the immediately prior lookback number of states. The code then repeats the same retrieval process RL_batch number of times (see the sketch below). 
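A minimal sketch of this per-sample retrieval, simplified from DDQN.replay() in rps_deepRL_withLSTM.py; the helper name sample_sequence is illustrative, the deque entries are [state, action, reward, new_state, done], and the sequences are stacked newest-first as in that file:

```python
# Sketch of assembling one lookback-length experience from the replay deque,
# simplified from DDQN.replay() in rps_deepRL_withLSTM.py.
import random
import numpy as np

def sample_sequence(memory, lookback):
    """Return (state_seq, new_state_seq) shaped (1, lookback, state_dim) for the LSTM."""
    if len(memory) <= lookback:
        return None                                      # not enough history yet
    a = random.randint(0, len(memory) - lookback)        # random anchor within the deque
    states, new_states = [], []
    for j in range(lookback):
        state, _, _, new_state, _ = memory[-(a + j + 1)] # walk backwards through consecutive moves
        states.append(state[0])
        new_states.append(new_state[0])
    # stack into the 3-D tensors Keras expects: (batch=1, timesteps=lookback, features)
    return np.array(states)[None, :, :], np.array(new_states)[None, :, :]
```

The action model is then fit on the state sequence with a TD target computed from the new-state sequence, exactly as in the DNN version but with 3-D tensors.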
101 | 102 | ![pic4](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/LSTM_based_ddqn_architecture.png) 103 | 104 | In general, this change in architecture did not yield any significant breakthrough in results (and to some degree, it performs worse than a simple classically trained static LSTM model). 105 | 106 | 1) It did not perform well (meaning the win-tie-lost rates are roughly 33% each) when the opponent is 'PRNG'. The opponent is simply too high dimensional. 107 | 2) Some testing was done against a simpler 12-bit 'LFSR', which means the sequence repeats itself after at most 4,095 moves. 108 | - GRU vs LSTM were used with no apparent difference 109 | - the hyperparameters varied are (a) lookback length, (b) inner LSTM unit size, and (c) experience replay batch size. 110 | - a flattened dense layer architecture versus a basic many-to-one LSTM architecture were tried, and neither provides any apparent advantage (neither one improves the performance) 111 | - see the captured results below. 112 | 3) Further testing was conducted on a short (approx. 30 moves) self-entered r-p-s sequence. With a fairly small LSTM, a consistently higher win rate is observed. But this does not surpass the performance of a classical (supervised learning) approach using a statically-trained LSTM. 113 | 114 | All in all, this LSTM architecture working within a DDQN structure performs rather poorly, and a better design is desirable. 115 | 116 | ![pic5](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/figure_LSTM_lkbk200_unit100_1stage.png) 117 | 118 | ![pic6](https://github.com/dennylslee/rock-paper-scissors-DeepRL/blob/master/figure_LSTM_lkbk100_unit50_SEQ1.png) 119 | 120 | # Future Works 121 | 122 | We will need to rethink a more appropriate LSTM-based solution to improve the win rate performance. 123 | 124 | # Reference 125 | 126 | 1. Mnih et al., "Playing Atari with Deep Reinforcement Learning", 2013 127 | 2. van Hasselt et al., "Deep Reinforcement Learning with Double Q-learning", 2015 128 | 3. Mnih et al., "Human-Level Control through Deep Reinforcement Learning", 2015 129 | 4. Packer et al., "Assessing Generalization in Deep Reinforcement Learning" 130 | 5. A. Oppermann, "Self-Learning AI Agents Part II: Deep Q-Learning", 2018 131 | 6. 
Watkins & Dayan, "Q-Learning", 1992 132 | 133 | -------------------------------------------------------------------------------- /rps_deepRL.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | from keras.models import Sequential 6 | from keras.layers import Dense, Dropout 7 | from keras.optimizers import Adam 8 | 9 | from collections import deque 10 | from randmove import genOneMove 11 | 12 | import matplotlib.pyplot as plt 13 | from matplotlib import style 14 | style.use('ggplot') 15 | 16 | # -------------------------- SETTING UP THE ENVIRONMENT -------------------------------------- 17 | # simple game, therefore we are not using the open gym custom set up 18 | #--------------------------------------------------------------------------------------------- 19 | class RPSenv(): 20 | def __init__ (self): 21 | self.action_space = [0,1,2] # integer representation of r/p/s 22 | self.seed = random.seed(4) # make it deterministic 23 | self.norm_mu = 0 # center point for guassian distribution 24 | self.norm_sigma = 2.0 # sigma for std distribution 25 | self.seqIndex = 0 # index for pointing to the SEQ sequnce 26 | self.p2Mode = 'PRNG' # SEQ or PRNG or LFSR 27 | self.p2Count = [0, 0, 0] # player 2 win tie lost count 28 | self.p1Count = [0, 0, 0] # player 1 win tie lost count 29 | self.window = 10 # window size for rate trending calc 30 | self.cumWinRate, self.cumTieRate, self.cumLostRate = None, None, None 31 | self.cumWinCount, self.cumTieCount, self.cumLostCount = None, None, None 32 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 33 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 34 | self.winRateBuf, self.tieRateBuf, self.lostRateBuf \ 35 | = deque(maxlen=self.window), deque(maxlen=self.window), deque(maxlen=self.window) 36 | # put all the observation state in here; shape in Keras input format 37 | self.state = np.array([[ \ 38 | None, None, None, \ 39 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 40 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 41 | ]]) 42 | 43 | def reset(self): 44 | # reset all the state 45 | self.cumWinRate, self.cumTieRate, self.cumLostRate = 0, 0, 0 46 | self.cumWinCount, self.cumTieCount, self.cumLostCount = 0, 0, 0 47 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 48 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 49 | return np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) 50 | 51 | def step(self, action, moveCount, stage): 52 | # value mode is PRNG or SEQ 53 | p2Move = genOneMove(self, self.p2Mode, stage) # play one move from player2 54 | self.p2Count[p2Move] += 1 55 | p1Move = action 56 | self.p1Count[p1Move] += 1 57 | 58 | # check who won, set flag and assign reward 59 | win, tie, lost = 0, 0, 0 60 | if p1Move == p2Move: 61 | self.cumTieCount, tie = self.cumTieCount + 1, 1 62 | elif (p1Move - p2Move == 1) or (p1Move - p2Move == -2): 63 | self.cumWinCount, win = self.cumWinCount + 1, 1 64 | else: 65 | self.cumLostCount, lost = self.cumLostCount + 1, 1 66 | 67 | # update the running rates 68 | self.cumWinRate = self.cumWinCount / moveCount 69 | self.cumTieRate = self.cumTieCount / moveCount 70 | self.cumLostRate = self.cumLostCount / moveCount 71 | # update moving avg buffer 72 | self.winRateBuf.append(self.cumWinRate) 73 | self.tieRateBuf.append(self.cumTieRate) 74 | self.lostRateBuf.append(self.cumLostRate) 75 | # calculate trend 76 
| tmp = [0, 0, 0] 77 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 78 | if moveCount >= self.window: 79 | tmp[0] = sum(self.winRateBuf[i] for i in range(self.window)) / self.window 80 | tmp[1] = sum(self.tieRateBuf[i] for i in range(self.window)) / self.window 81 | tmp[2] = sum(self.lostRateBuf[i] for i in range(self.window)) / self.window 82 | # win rate trend analysis 83 | if self.winRateMovingAvg < tmp[0]: 84 | self.winRateTrend = 1 # win rate trending up. That's good 85 | else: 86 | self.winRateTrend = 0 # win rate trending down. That's bad 87 | # tie rate trend analysis 88 | if self.tieRateMovingAvg < tmp[1]: 89 | self.tieRateTrend = 1 # tie rate trending up. That's bad 90 | else: 91 | self.tieRateTrend = 0 # tie rate trending down. Neutral 92 | # lost rate trend analysis 93 | if self.lostRateMovingAvg < tmp[2]: 94 | self.lostRateTrend = 1 # lst rate trending up. That's bad 95 | else: 96 | self.lostRateTrend = 0 # lost rate trending down. That's good 97 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = tmp[0], tmp[1], tmp[2] 98 | # net reward in this round 99 | reward = win 100 | # record the state and reshape it for Keras input format 101 | dim = self.state.shape[1] 102 | self.state = np.array([\ 103 | win, tie, lost, \ 104 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 105 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 106 | ]).reshape(1, dim) 107 | # this game is done when it hits this goal 108 | done = False 109 | return self.state, reward, done, dim 110 | 111 | # ------------------------- class for the Double-DQN agent --------------------------------- 112 | # facilities utilized here: 113 | # 1) Double DQN networks: one for behavior policy, one for target policy 114 | # 2) Learn from sample from pool of memories 115 | # 3) Basic TD-Learning stuff: learning rate, gamma for discounting future rewards 116 | # 4) Use of epsilon-greedy policy for controlling exploration vs exploitation 117 | #------------------------------------------------------------------------------------------- 118 | class DDQN: 119 | def __init__(self, env): 120 | self.env = env 121 | # initialize the memory and auto drop when memory exceeds maxlen 122 | # this controls how far out in history the "expeience replay" can select from 123 | self.memory = deque(maxlen=2000) 124 | # future reward discount rate of the max Q of next state 125 | self.gamma = 0.9 126 | # epsilon denotes the fraction of time dedicated to exploration (as oppse to exploitation) 127 | self.epsilon = 1.0 128 | self.epsilon_min = 0.01 129 | self.epsilon_decay = 0.9910 130 | # model learning rate (use in backprop SGD process) 131 | self.learning_rate = 0.005 132 | # transfer learning proportion contrl between the target and action/behavioral NN 133 | self.tau = .125 134 | # create two models for double-DQN implementation 135 | self.model = self.create_model() 136 | self.target_model = self.create_model() 137 | # some space to collect TD target for instrumentaion 138 | self.TDtargetdelta, self.TDtarget = [], [] 139 | self.Qmax =[] 140 | 141 | def create_model(self): 142 | model = Sequential() 143 | state_shape = self.env.state.shape[1] 144 | model.add(Dense(24, input_dim=state_shape, activation="relu")) 145 | model.add(Dense(24, activation="relu")) 146 | model.add(Dense(24, activation="relu")) 147 | # let the output be the predicted target value. NOTE: do not use activation to squash it! 
148 | model.add(Dense(len(self.env.action_space))) 149 | model.compile(loss="mean_squared_error", optimizer=Adam(lr=self.learning_rate)) 150 | print(model.summary()) 151 | 152 | return model 153 | 154 | def act(self, state): 155 | # this is to take one action 156 | self.epsilon *= self.epsilon_decay 157 | self.epsilon = max(self.epsilon_min, self.epsilon) 158 | # decide to take a random exploration or make a policy-based action (thru NN prediction) 159 | if np.random.random() < self.epsilon: 160 | # return a random move from action space 161 | return random.choice(self.env.action_space) 162 | else: 163 | # return a policy move 164 | self.Qmax.append(max(self.model.predict(state)[0])) 165 | return np.argmax(self.model.predict(state)[0]) 166 | 167 | def remember(self, state, action, reward, new_state, done): 168 | # store up a big pool of memory 169 | self.memory.append([state, action, reward, new_state, done]) 170 | 171 | def replay(self): # DeepMind "experience replay" method 172 | # the sample size from memory to learn from 173 | batch_size = 32 174 | # do nothing untl the memory is large enough 175 | if len(self.memory) < batch_size: return 176 | # get the samples 177 | samples = random.sample(self.memory, batch_size) 178 | # do the training (learning); this is DeepMind tricks of using "Double" model (Mnih 2015) 179 | for sample in samples: 180 | state, action, reward, new_state, done = sample 181 | target = self.target_model.predict(state) 182 | #print('target at state is ', target) 183 | if done: 184 | target[0][action] = reward 185 | else: 186 | Q_future = max(self.target_model.predict(new_state)[0]) 187 | TDtarget = reward + Q_future * self.gamma 188 | self.TDtarget.append(TDtarget) 189 | self.TDtargetdelta.append(TDtarget - target[0][action]) 190 | target[0][action] = TDtarget 191 | # do one pass gradient descend using target as 'label' to train the action model 192 | self.model.fit(state, target, epochs=1, verbose=0) 193 | 194 | def target_train(self): 195 | # transfer weights proportionally from the action/behave model to the target model 196 | weights = self.model.get_weights() 197 | target_weights = self.target_model.get_weights() 198 | for i in range(len(target_weights)): 199 | target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau) 200 | self.target_model.set_weights(target_weights) 201 | 202 | def save_model(self, fn): 203 | self.model.save(fn) 204 | 205 | # ------------------------- MAIN BODY ---------------------------------------- 206 | 207 | def main(): 208 | episodes, trial_len = 150, 300 # lenght of game play 209 | stage, totalStages = 0, 5 # # of stages with change distribution 210 | sigma_reduce = -0.1 # sigma change amount at each stage 211 | cumReward, argmax = 0, 0 # init for intrumentation 212 | steps, rateTrack = [], [] 213 | avgQmaxList, avgQ_futureList,avgQ_targetmaxList, avgTDtargetList = [], [], [], [] 214 | avgCumRewardList = [] 215 | p1Rate, p2Rate = [], [] 216 | # declare the game play environment and AI agent 217 | env = RPSenv() 218 | dqn_agent = DDQN(env=env) 219 | # ------------------------------------------ start the game ----------------------------------------- 220 | print('STARTING THE GAME with %s episodes each with %s moves' % (episodes, trial_len), '\n') 221 | for episode in range(episodes): 222 | cur_state = env.reset().reshape(1,env.state.shape[1]) # reset and get initial state in Keras shape 223 | cumReward = 0 224 | # ----------------- Select play strategy (apply to PRNG mode only) ------------------------------ 225 | 
# this change strategy affects distribution of r-p-s 226 | # ----------------------------------------------------------------------------------------------- 227 | if (episode+1) % (episodes // totalStages) == 0: env.norm_sigma += sigma_reduce # step repsonse change the gaussian distribution 228 | # this change strategy affects which move is dominant (control in randmove module) 229 | stage = episode // (episodes // totalStages) # divide total episodes into 3 equal length stages 230 | 231 | for step in range(trial_len): 232 | # AI agent take one action 233 | action = dqn_agent.act(cur_state) 234 | # play the one move and see how the environment reacts to it 235 | new_state, reward, done, info = env.step(action, step + 1, stage) 236 | cumReward += reward 237 | # record the play into memory pool 238 | dqn_agent.remember(cur_state, action, reward, new_state, done) 239 | # perform Q-learning from using |"experience replay": learn from random samples in memory 240 | dqn_agent.replay() 241 | # apply tranfer learning from actions model to the target model. 242 | dqn_agent.target_train() 243 | # update the current state with environment new state 244 | cur_state = new_state 245 | if done: break 246 | #-------------------------------- INSTRUMENTAL AND PLOTTING ------------------------------------------- 247 | # the instrumental are performed at the end of each episode 248 | # store epsiode #, winr rate, tie rate, lost rate, etc. etc. 249 | #------------------------------------------------------------------------------------------------------ 250 | rateTrack.append([episode+1, env.cumWinRate, env.cumTieRate, env.cumLostRate]) 251 | if True: # print ongoing performance 252 | print('EPISODE ', episode + 1), 253 | if env.p2Mode == 'PRNG': print('stage:', stage, ' sigma:', env.norm_sigma) 254 | print(' WIN RATE %.2f ' % env.cumWinRate, \ 255 | ' tie rate %.2f' % env.cumTieRate, \ 256 | 'lose rate %.2f' % env.cumLostRate) 257 | 258 | # print move distribution between the players 259 | if True: 260 | p1Rate.append([env.p1Count[0] / trial_len, env.p1Count[1] / trial_len, env.p1Count[2] / trial_len]) 261 | p2Rate.append([env.p2Count[0] / trial_len, env.p2Count[1] / trial_len, env.p2Count[2] / trial_len]) 262 | print (' P1 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p1Rate[-1][0], p1Rate[-1][1], p1Rate[-1][2])) 263 | print (' P2 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p2Rate[-1][0], p2Rate[-1][1], p2Rate[-1][2])) 264 | env.p1Count, env.p2Count = [0,0,0], [0,0,0] 265 | 266 | # summarize Qmax from action model and reward 267 | avgQmax = sum(dqn_agent.Qmax) / trial_len # from action model 268 | avgQmaxList.append(avgQmax) 269 | 270 | avgCumReward = cumReward / trial_len 271 | avgCumRewardList.append(avgCumReward) 272 | if True: 273 | print(' Avg reward: %.2f Avg Qmax: %.2f' % (avgCumReward, avgQmax)) 274 | dqn_agent.Qmax=[] # reset for next episode 275 | 276 | 277 | # ---------------- plot the main plot when all the episodes are done --------------------------- 278 | # 279 | if True: 280 | fig = plt.figure(figsize=(12,5)) 281 | plt.subplots_adjust(wspace = 0.2, hspace = 0.2) 282 | 283 | # plot the average Qmax 284 | rpsplot = fig.add_subplot(321) 285 | plt.title('Average Qmax from action model', loc='Left', weight='bold', color='Black', \ 286 | fontdict = {'fontsize' : 10}) 287 | rpsplot.plot(avgQmaxList, color='blue') 288 | 289 | # plot the TDtarget 290 | rpsplot = fig.add_subplot(323) 291 | plt.title('TD target minus Q target from experience replay', loc='Left', weight='bold', \ 
292 | color='Black', fontdict = {'fontsize' : 10}) 293 | rpsplot.plot(dqn_agent.TDtarget, color='blue') 294 | 295 | # plot the TDtarget 296 | rpsplot = fig.add_subplot(325) 297 | plt.title('TD target from experience replay', loc='Left', weight='bold', color='Black', \ 298 | fontdict = {'fontsize' : 10}) 299 | rpsplot.plot(dqn_agent.TDtargetdelta, color='blue') 300 | 301 | # plot thte win rate 302 | rpsplot = fig.add_subplot(322) 303 | plt.title('Win-Tie-Lost Rate', loc='Left', weight='bold', color='Black', \ 304 | fontdict = {'fontsize' : 10}) 305 | rpsplot.plot([i[1] for i in rateTrack], color='green') 306 | rpsplot.plot([i[2] for i in rateTrack], color='blue') 307 | rpsplot.plot([i[3] for i in rateTrack], color='red') 308 | 309 | # plot thte win rate 310 | rpsplot = fig.add_subplot(324) 311 | plt.title('Player 2 move percentage', loc='Left', weight='bold', color='Black', \ 312 | fontdict = {'fontsize' : 10}) 313 | rpsplot.plot([i[0] for i in p2Rate], color='orange') 314 | rpsplot.plot([i[1] for i in p2Rate], color='red') 315 | rpsplot.plot([i[2] for i in p2Rate], color='green') 316 | 317 | # plot the reward 318 | rpsplot = fig.add_subplot(326) 319 | plt.title('Average Reward per Episode', loc='Left', weight='bold', color='Black', \ 320 | fontdict = {'fontsize' : 10}) 321 | rpsplot.plot(avgCumRewardList, color='green') 322 | plt.show(block = False) 323 | 324 | if __name__ == "__main__": 325 | main() 326 | -------------------------------------------------------------------------------- /rps_deepRL_withLSTM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | from keras.models import Sequential 6 | from keras.layers import Dense, Dropout, LSTM, GRU, TimeDistributed, Flatten 7 | from keras.optimizers import Adam 8 | 9 | from collections import deque 10 | from randmove import genOneMove 11 | 12 | import matplotlib.pyplot as plt 13 | from matplotlib import style 14 | style.use('ggplot') 15 | 16 | # -------------------------- SETTING UP THE ENVIRONMENT -------------------------------------- 17 | # simple game, therefore we are not using the open gym custom set up 18 | #--------------------------------------------------------------------------------------------- 19 | class RPSenv(): 20 | def __init__ (self): 21 | self.action_space = [0,1,2] # integer representation of r/p/s 22 | self.seed = random.seed(4) # make it deterministic 23 | self.norm_mu = 0 # center point for guassian distribution 24 | self.norm_sigma = 2.0 # sigma for std distribution 25 | self.seqIndex = 0 # index for pointing to the SEQ sequnce 26 | self.p2Mode = 'SEQ' # SEQ or PRNG or LFSR 27 | self.p2Count = [0, 0, 0] # player 2 win tie lost count 28 | self.p1Count = [0, 0, 0] # player 1 win tie lost count 29 | self.window = 10 # window size for rate trending calc 30 | self.cumWinRate, self.cumTieRate, self.cumLostRate = None, None, None 31 | self.overallWinRate, self.overallTieRate, self.overallLostRate = 0, 0, 0 32 | self.cumWinCount, self.cumTieCount, self.cumLostCount = None, None, None 33 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 34 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 35 | self.winRateBuf, self.tieRateBuf, self.lostRateBuf \ 36 | = deque(maxlen=self.window), deque(maxlen=self.window), deque(maxlen=self.window) 37 | # put all the observation state in here; shape in Keras input format 38 | self.state = np.array([[ \ 39 | None, None, None, \ 40 | 
self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 41 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 42 | ]]) 43 | 44 | def reset(self): 45 | # reset all the state 46 | self.cumWinRate, self.cumTieRate, self.cumLostRate = 0, 0, 0 47 | self.cumWinCount, self.cumTieCount, self.cumLostCount = 0, 0, 0 48 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 49 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = 0, 0, 0 50 | return np.array([0, 0, 0, 0, 0, 0, 0, 0, 0]) 51 | 52 | def step(self, action, moveCount, stage): 53 | # value mode is PRNG or SEQ 54 | p2Move = genOneMove(self, self.p2Mode, stage) # play one move from player2 55 | self.p2Count[p2Move] += 1 56 | p1Move = action 57 | self.p1Count[p1Move] += 1 58 | 59 | # check who won, set flag and assign reward 60 | win, tie, lost = 0, 0, 0 61 | if p1Move == p2Move: 62 | self.cumTieCount, tie = self.cumTieCount + 1, 1 63 | elif (p1Move - p2Move == 1) or (p1Move - p2Move == -2): 64 | self.cumWinCount, win = self.cumWinCount + 1, 1 65 | else: 66 | self.cumLostCount, lost = self.cumLostCount + 1, 1 67 | 68 | # update the running rates 69 | self.cumWinRate = self.cumWinCount / moveCount 70 | self.cumTieRate = self.cumTieCount / moveCount 71 | self.cumLostRate = self.cumLostCount / moveCount 72 | # update moving avg buffer 73 | self.winRateBuf.append(self.cumWinRate) 74 | self.tieRateBuf.append(self.cumTieRate) 75 | self.lostRateBuf.append(self.cumLostRate) 76 | # calculate trend 77 | tmp = [0, 0, 0] 78 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend = 0, 0, 0 79 | if moveCount >= self.window: 80 | tmp[0] = sum(self.winRateBuf[i] for i in range(self.window)) / self.window 81 | tmp[1] = sum(self.tieRateBuf[i] for i in range(self.window)) / self.window 82 | tmp[2] = sum(self.lostRateBuf[i] for i in range(self.window)) / self.window 83 | # win rate trend analysis 84 | if self.winRateMovingAvg < tmp[0]: 85 | self.winRateTrend = 1 # win rate trending up. That's good 86 | else: 87 | self.winRateTrend = 0 # win rate trending down. That's bad 88 | # tie rate trend analysis 89 | if self.tieRateMovingAvg < tmp[1]: 90 | self.tieRateTrend = 1 # tie rate trending up. That's bad 91 | else: 92 | self.tieRateTrend = 0 # tie rate trending down. Neutral 93 | # lost rate trend analysis 94 | if self.lostRateMovingAvg < tmp[2]: 95 | self.lostRateTrend = 1 # lst rate trending up. That's bad 96 | else: 97 | self.lostRateTrend = 0 # lost rate trending down. 
That's good 98 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg = tmp[0], tmp[1], tmp[2] 99 | # net reward in this round 100 | reward = win 101 | # record the state and reshape it for Keras input format 102 | dim = self.state.shape[1] 103 | self.state = np.array([\ 104 | win, tie, lost, \ 105 | self.winRateTrend, self.tieRateTrend, self.lostRateTrend, \ 106 | self.winRateMovingAvg, self.tieRateMovingAvg, self.lostRateMovingAvg \ 107 | ]).reshape(1, dim) 108 | # this game is done when it hits this goal 109 | done = False 110 | return self.state, reward, done, dim 111 | 112 | # ------------------------- class for the Double-DQN agent --------------------------------- 113 | # facilities utilized here: 114 | # 1) Double DQN networks: one for behavior policy, one for target policy 115 | # 2) Learn from sample from pool of memories 116 | # 3) Basic TD-Learning stuff: learning rate, gamma for discounting future rewards 117 | # 4) Use of epsilon-greedy policy for controlling exploration vs exploitation 118 | #------------------------------------------------------------------------------------------- 119 | class DDQN: 120 | def __init__(self, env): 121 | self.env = env 122 | # initialize the memory and auto drop when memory exceeds maxlen 123 | # this controls how far out in history the "expeience replay" can select from 124 | self.maxlen = 3000 125 | self.memory = deque(maxlen = self.maxlen) 126 | # future reward discount rate of the max Q of next state 127 | self.gamma = 0.9 128 | # epsilon denotes the fraction of time dedicated to exploration (as oppse to exploitation) 129 | self.epsilon = 1.0 130 | self.epsilon_min = 0.01 131 | self.epsilon_decay = 0.9910 132 | # model learning rate (use in backprop SGD process) 133 | self.learning_rate = 0.005 134 | # transfer learning proportion contrl between the target and action/behavioral NN 135 | self.tau = .125 136 | # hyyperparameters for LSTM 137 | self.lookback = 100 138 | self.hiddenUnits = 50 139 | # create two models for double-DQN implementation 140 | self.model = self.create_model() 141 | self.target_model = self.create_model() 142 | # some space to collect TD target for instrumentaion 143 | self.TDtargetdelta, self.TDtarget = [], [] 144 | self.Qmax =[] 145 | 146 | 147 | def create_model(self): 148 | input_feature_dim = self.env.state.shape[1] 149 | output_feature_dim = len(self.env.action_space) 150 | model = Sequential() 151 | #model.add(GRU(self.hiddenUnits,\ 152 | model.add(LSTM(self.hiddenUnits,\ 153 | return_sequences = False,\ 154 | #activation = None, \ 155 | #recurrent_activation = None, \ 156 | input_shape = (self.lookback, input_feature_dim))) 157 | # let the output be the predicted target value. NOTE: do not use activation to squash it! 
158 | #model.add(TimeDistributed(Dense(4))) 159 | #model.add(Flatten()) 160 | model.add(Dense(output_feature_dim)) 161 | model.compile(loss="mean_squared_error", optimizer=Adam(lr=self.learning_rate)) 162 | print(model.summary()) 163 | return model 164 | 165 | def act(self, state, step): 166 | # this is to take one action 167 | self.epsilon *= self.epsilon_decay 168 | self.epsilon = max(self.epsilon_min, self.epsilon) 169 | # decide to take a random exploration or make a policy-based action (thru NN prediction) 170 | # with a LSTM design, delay on policy prediction after at least lookback steps have accumlated 171 | if np.random.random() < self.epsilon or step < self.lookback + 1: 172 | # return a random move from action space 173 | return random.choice(self.env.action_space) 174 | else: 175 | # return a policy move 176 | state_set = np.empty((1, self.env.state.shape[1])) # iniitial with 2 dims 177 | for j in range(self.lookback): 178 | state_tmp, _, _, _, _ = self.memory[-(j+1)] # get the most recent state and the previous N states 179 | if j == 0: 180 | state_set[0] = state_tmp # iniitalize the first record 181 | else: 182 | state_set = np.concatenate((state_set, state_tmp), axis = 0) # get a consecutive set of states for LSTM prediction 183 | state_set = state_set[None, :, :] # make the tensor 3 dim to align with Keras reqmt 184 | #print(state_set) 185 | #print(state_set.shape) 186 | self.Qmax.append(max(self.model.predict(state_set)[0])) 187 | return np.argmax(self.model.predict(state_set)[0]) 188 | 189 | def remember(self, state, action, reward, new_state, done): 190 | # store up a big pool of memory 191 | self.memory.append([state, action, reward, new_state, done]) 192 | 193 | def replay(self): 194 | # DeepMind "experience replay" method 195 | # do the training (learning); this is DeepMind tricks of using "Double" model (Mnih 2015) 196 | # the sample size from memory to learn from 197 | #------------------------ 198 | # do nothing untl the memory is large enough 199 | RL_batch_size = 24 # this is experience replay batch_size (not the LSTM fitting batch size) 200 | if len(self.memory) < RL_batch_size: return 201 | # get the samples; each sample is a sequence of consecutive states with same lookback length as LSTM definition 202 | for i in range(RL_batch_size): 203 | state_set = np.empty((1, self.env.state.shape[1])) 204 | new_state_set = np.empty((1, self.env.state.shape[1])) 205 | if len(self.memory) <= self.lookback: # check if memory is large enough to retrieve the time sequence 206 | return 207 | else: 208 | a = random.randint(0, len(self.memory) - self.lookback) # first get a random location 209 | state, action, reward, new_state, done = self.memory[-(a+1)] # retrieve a sample from memory at that loc; latest element at the end of deque 210 | for j in range(self.lookback): 211 | state_tmp, _, _, new_state_tmp, _ = self.memory[-(a+j+1)] # get a consecutive set of states 212 | if j == 0: 213 | state_set[0] = state_tmp 214 | new_state_set[0] = new_state_tmp 215 | else: 216 | state_set = np.concatenate((state_set, state_tmp), axis = 0) # get a consecutive set of states for LSTM prediction 217 | new_state_set = np.concatenate((new_state_set, new_state_tmp), axis = 0) 218 | # do the prediction from current state 219 | state_set = state_set[None, :, :] # make the tensor 3 dim to align with Keras reqmt 220 | new_state_set = new_state_set[None, :, :] # make the tensor 3 dim to align with Keras reqmt 221 | target = self.target_model.predict(state_set) 222 | # do the Q leanring 223 | if 
done: 224 | target[0][action] = reward 225 | else: 226 | Q_future = max(self.target_model.predict(new_state_set)[0]) 227 | TDtarget = reward + Q_future * self.gamma 228 | self.TDtarget.append(TDtarget) 229 | self.TDtargetdelta.append(TDtarget - target[0][action]) 230 | target[0][action] = TDtarget 231 | # do one pass gradient descend using target as 'label' to train the action model 232 | self.model.fit(state_set, target, batch_size = 1, epochs = 1, verbose = 0) 233 | 234 | def target_train(self): 235 | # transfer weights proportionally from the action/behave model to the target model 236 | weights = self.model.get_weights() 237 | target_weights = self.target_model.get_weights() 238 | for i in range(len(target_weights)): 239 | target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau) 240 | self.target_model.set_weights(target_weights) 241 | 242 | def save_model(self, fn): 243 | self.model.save(fn) 244 | 245 | # ------------------------- MAIN BODY ---------------------------------------- 246 | 247 | def main(): 248 | episodes, trial_len = 50, 200 # lenght of game play 249 | stage, totalStages = 0, 2 # # of stages with change distribution 250 | sigma_reduce = -0.1 # sigma change amount at each stage 251 | cumReward, argmax = 0, 0 # init for intrumentation 252 | steps, rateTrack, overallRateTrack = [], [], [] 253 | avgQmaxList, avgQ_futureList,avgQ_targetmaxList, avgTDtargetList = [], [], [], [] 254 | avgCumRewardList = [] 255 | p1Rate, p2Rate = [], [] 256 | # declare the game play environment and AI agent 257 | env = RPSenv() 258 | dqn_agent = DDQN(env=env) 259 | # ------------------------------------------ start the game ----------------------------------------- 260 | print('STARTING THE GAME with %s episodes each with %s moves' % (episodes, trial_len), '\n') 261 | for episode in range(episodes): 262 | cur_state = env.reset().reshape(1,env.state.shape[1]) # reset and get initial state in Keras shape 263 | cumReward = 0 264 | # ----------------- Select play strategy (apply to PRNG mode only) ------------------------------ 265 | # this change strategy affects distribution of r-p-s 266 | # ----------------------------------------------------------------------------------------------- 267 | if (episode+1) % (episodes // totalStages) == 0: env.norm_sigma += sigma_reduce # step repsonse change the gaussian distribution 268 | # this change strategy affects which move is dominant (control in randmove module) 269 | stage = episode // (episodes // totalStages) # divide total episodes into 3 equal length stages 270 | 271 | for step in range(trial_len): 272 | # AI agent take one action 273 | action = dqn_agent.act(cur_state, step) 274 | # play the one move and see how the environment reacts to it 275 | new_state, reward, done, info = env.step(action, step + 1, stage) 276 | cumReward += reward 277 | # record the play into memory pool 278 | dqn_agent.remember(cur_state, action, reward, new_state, done) 279 | # perform Q-learning from using |"experience replay": learn from random samples in memory 280 | dqn_agent.replay() 281 | # apply tranfer learning from actions model to the target model. 282 | dqn_agent.target_train() 283 | # update the current state with environment new state 284 | cur_state = new_state 285 | if done: break 286 | #-------------------------------- INSTRUMENTAL AND PLOTTING ------------------------------------------- 287 | # the instrumental are performed at the end of each episode 288 | # store epsiode #, winr rate, tie rate, lost rate, etc. etc. 
289 | #------------------------------------------------------------------------------------------------------ 290 | rateTrack.append([episode+1, env.cumWinRate, env.cumTieRate, env.cumLostRate]) 291 | env.overallWinRate += env.cumWinRate 292 | env.overallTieRate += env.cumTieRate 293 | env.overallLostRate += env.cumLostRate 294 | overallRateTrack.append([episode+1, 295 | env.overallWinRate / (episode +1), \ 296 | env.overallTieRate / (episode +1), \ 297 | env.overallLostRate / (episode +1),]) 298 | if True: # print ongoing performance 299 | print('EPISODE ', episode + 1), 300 | if env.p2Mode == 'PRNG': print('stage:', stage, ' sigma:', env.norm_sigma) 301 | print(' WIN RATE %.2f ' % env.cumWinRate, \ 302 | ' tie rate %.2f' % env.cumTieRate, \ 303 | 'lose rate %.2f' % env.cumLostRate) 304 | 305 | # print move distribution between the players 306 | if True: 307 | p1Rate.append([env.p1Count[0] / trial_len, env.p1Count[1] / trial_len, env.p1Count[2] / trial_len]) 308 | p2Rate.append([env.p2Count[0] / trial_len, env.p2Count[1] / trial_len, env.p2Count[2] / trial_len]) 309 | print (' P1 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p1Rate[-1][0], p1Rate[-1][1], p1Rate[-1][2])) 310 | print (' P2 rock rate: %.2f paper rate: %.2f scissors rate: %.2f' % (p2Rate[-1][0], p2Rate[-1][1], p2Rate[-1][2])) 311 | env.p1Count, env.p2Count = [0,0,0], [0,0,0] 312 | 313 | # summarize Qmax from action model and reward 314 | avgQmax = sum(dqn_agent.Qmax) / trial_len # from action model 315 | avgQmaxList.append(avgQmax) 316 | 317 | avgCumReward = cumReward / trial_len 318 | avgCumRewardList.append(avgCumReward) 319 | if True: 320 | print(' Avg reward: %.2f Avg Qmax: %.2f' % (avgCumReward, avgQmax)) 321 | dqn_agent.Qmax=[] # reset for next episode 322 | 323 | 324 | # ---------------- plot the main plot when all the episodes are done --------------------------- 325 | # 326 | if True: 327 | fig = plt.figure(figsize=(12,5)) 328 | plt.subplots_adjust(wspace = 0.2, hspace = 0.2) 329 | 330 | # plot the average Qmax 331 | rpsplot = fig.add_subplot(321) 332 | plt.title('Average Qmax from action model', loc='Left', weight='bold', color='Black', \ 333 | fontdict = {'fontsize' : 10}) 334 | rpsplot.plot(avgQmaxList, color='blue') 335 | 336 | # plot the TDtarget 337 | rpsplot = fig.add_subplot(323) 338 | plt.title('TD target minus Q target from experience replay', loc='Left', weight='bold', \ 339 | color='Black', fontdict = {'fontsize' : 10}) 340 | rpsplot.plot(dqn_agent.TDtarget, color='blue') 341 | 342 | # plot the TDtarget 343 | #rpsplot = fig.add_subplot(325) 344 | #plt.title('TD target from experience replay', loc='Left', weight='bold', color='Black', \ 345 | # fontdict = {'fontsize' : 10}) 346 | #rpsplot.plot(dqn_agent.TDtargetdelta, color='blue') 347 | 348 | # plot thte win rate 349 | rpsplot = fig.add_subplot(322) 350 | plt.title('Win-Tie-Lost Rate', loc='Left', weight='bold', color='Black', \ 351 | fontdict = {'fontsize' : 10}) 352 | rpsplot.plot([i[1] for i in rateTrack], color='green') 353 | rpsplot.plot([i[2] for i in rateTrack], color='blue') 354 | rpsplot.plot([i[3] for i in rateTrack], color='red') 355 | 356 | # plot thte win rate 357 | rpsplot = fig.add_subplot(324) 358 | plt.title('Player-1 Overall Win-Tie-Lost Rate', loc='Left', weight='bold', color='Black', \ 359 | fontdict = {'fontsize' : 10}) 360 | rpsplot.plot([i[1] for i in overallRateTrack], color='green') 361 | rpsplot.plot([i[2] for i in overallRateTrack], color='blue') 362 | rpsplot.plot([i[3] for i in overallRateTrack], color='red') 363 | 
364 | # plot thte win rate 365 | rpsplot = fig.add_subplot(326) 366 | plt.title('Player 2 move percentage', loc='Left', weight='bold', color='Black', \ 367 | fontdict = {'fontsize' : 10}) 368 | rpsplot.plot([i[0] for i in p2Rate], color='orange') 369 | rpsplot.plot([i[1] for i in p2Rate], color='red') 370 | rpsplot.plot([i[2] for i in p2Rate], color='green') 371 | 372 | # plot the reward 373 | rpsplot = fig.add_subplot(325) 374 | plt.title('Average Reward per Episode', loc='Left', weight='bold', color='Black', \ 375 | fontdict = {'fontsize' : 10}) 376 | rpsplot.plot(avgCumRewardList, color='green') 377 | plt.show(block = False) 378 | 379 | if __name__ == "__main__": 380 | main() 381 | --------------------------------------------------------------------------------