├── ActorCritic ├── BepedalWalker_ActorCritic.py ├── Mujoco_HalfCheetah_v1.0.py └── README.md ├── DeepDeterministicSeletiveMemory ├── AntBulletEnv.py ├── BipedalWalker_v3.0.py ├── HalfCheetah.v1.0.py ├── Hopper_v1.0.py ├── LunarLander_v1.py ├── MainAlgo_PR_v1.0.py ├── No_Exp_replay_v1.0.py ├── Q_as_discr │ ├── LunarLanderContinuous_v1.0.py │ └── Main_algo.py ├── README.md ├── ROBOTIC_Template_Experimental_v0.1.py ├── RoboschoolHalfCheetah_v1.py ├── Tensorflow │ ├── BipedalWalker_v2.0.py │ ├── Hopper_v1.py │ ├── LunarLander_v1.py │ ├── Rocker_lander.py │ └── _Main_Algo_v1.py ├── Walker2D.v2.0.py ├── Walker2DBulletEnv.py ├── _MainAlgo_v4.2.py └── test_algo.py ├── DeepQN ├── AtariSpaceInvadors_CNN.py ├── Atari_Pong_DeepQN.py ├── CartPole_QLearning.py ├── Examples │ ├── Breakout.py │ ├── Breakout2.py │ ├── PixelCopter_.py │ └── Pong.py ├── Main_Gym.py ├── Main_Gym_2_Channels.py ├── Main_Gym_Channel_Last.py ├── Main_Gym_Param_Noise.py ├── Main_Gym_v2.py └── Main_PLE.py ├── DeepQNPyTorch ├── DQN_torch.py ├── Lander_torch.py └── Readme.md ├── Experimental ├── AtariPong_v0.8.5.py ├── AtariPong_v0.8.7.py ├── AtariPong_v0.9.1.py ├── AtariPong_v0.9.2.py ├── AtariPong_v0.9.5.py ├── Atari_Pong_DeepQN.py ├── Atari_Pong_v0.4.3.py ├── Atari_pong_v0.4.1_1hot.py ├── Atari_pong_v0.4_QN.py ├── Atari_pong_v0.8.1.py ├── BipedalWalker_PG_v0.2.py ├── BipedalWalker_SM.py ├── BipedalWalker_SM_v0.7.py ├── BipedalWalker_SM_v0.8.py ├── BipedalWalker_SM_v0.9.1.py ├── BipedalWalker_SM_v0.9.py ├── BipedalWalker_SM_v2.py ├── BipedalWalker_SelectiveMemory_V5.py ├── BipedalWalker_Selective_Memory.py ├── BipedalWalker_policyGradient_v0.1.py ├── BipedalWalker_v1.0.py ├── BipedalWalker_v3.py ├── BipedalWalker_v4.py ├── CNN_test.py ├── Cartpol_CNN_RL.py ├── Cartpole_Highest_Reward_mem.py ├── Cartpole_simple_CNN.py ├── LunarLander_PG_v0.6.py ├── LunarLander_SM_v0.3.py ├── MNIST_image_Classification.py ├── Readme.md ├── Walker_A3C.py └── image_rescale.py ├── LICENSE ├── NeuroEvolution ├── Implementations │ ├── BipedalWalker.py │ ├── LunarLander.py │ ├── Pendulum.py │ ├── RS_Ant.py │ └── RS_Hopper.py ├── Lib │ └── Individual.py ├── Main_TF_0.7.py ├── Main_v4.py └── old │ ├── Main.py │ ├── Main2.py │ ├── Main3.py │ └── Main4_Multy_Gen.py ├── OptimalPolicyTreeSearch └── Cartpole_OPTS.py ├── ParameterNoising ├── AntBulletEnv.py ├── BipedalWalker_PN_v3.0.py ├── Hopper_v1.py ├── InvertedDoublePendulum.py ├── InvertedPendulum.py ├── LunarLanding_v3.0.py ├── NoisingFunction.py ├── Q_as_disc │ ├── Chettah_v1.0.py │ ├── LunarLander_v1.py │ └── _MainAlgo_Deep_Adaptive_noise_v1.py ├── Tensorflow │ ├── BipedalWalker_v1.0.py │ ├── LunarLander_Continuous.py │ ├── LunarLander_V2_wGameAdv.py │ └── Main_Algo.py ├── Test │ ├── BipedalWalker.py │ ├── DDSM_Custom_Loss.py │ ├── LunarLander_v0.4_w_adaptive_noise.py │ └── test.py ├── Walker2D.py ├── _MainAlgo.py └── _MainAlgo_Adaptive_Noise_v2.0.py ├── PolicyGradient ├── BipedalWalker_v1.0.py ├── HalfCheetah_V1.0.py ├── Hopper_v0.9.7.py └── Walker2D_v0.9.5.py ├── Pytorch ├── ActorCritic │ ├── Actor_Critic.py │ ├── Advantage_Actor_Critic.py │ ├── Load_AC_model.py │ ├── Output_noising │ │ ├── Actor_Critic_BipedalWalker.py │ │ ├── Actor_Critic_Noisy_output_Mem_fix.py │ │ ├── Readme.md │ │ └── load_AC_Model.py │ ├── Parameter_Noising │ │ ├── Advantage_Actor_Critic.py │ │ ├── Load_AC_model.py │ │ ├── Readme.md │ │ └── agent_and_model.py │ ├── Readme.md │ ├── actor01TDQN_RL_MODEL.trl │ ├── agent_and_model.py │ └── critic01TDQN_RL_MODEL.trl ├── DQN │ ├── DQN_Cartpol_old_1.py │ ├── DQN_Lander.py │ ├── 
DQN_Lander_Old_1.py │ ├── DQN_tut.py │ ├── DQN_tut_2.py │ ├── Load_Agent.py │ └── Readme.md ├── DQN_CNN │ ├── ATARI_DQN_CNN.py │ ├── Load_ATARI_AGENT.py │ ├── Readme.md │ └── agent_and_model.py ├── PPO │ ├── PPO_LunarLander.py │ └── Readme.md └── PolicyGradient │ ├── Load_model.py │ ├── Readme.md │ ├── agent_and_model.py │ └── policy_gradien_2.py ├── QLearning ├── LunarLander_QL.py └── LunarLander_v2.py ├── README.md ├── SelectiveMemory ├── Ant_SMA_V1.py ├── BipedalWalker_v5.py ├── BipedalWalker_v6.py ├── BipedalWalker_v7.py ├── CartPole_SelectiveMemory.py ├── DDSM │ └── BipedalWalker_DDSM.py ├── HalfCheetah_SMA_v1.py ├── Hopper_SMA_v1.0.py ├── Hopper_SMA_v2.0.py ├── LunarLanderContinuous_V1.py ├── LunarLander_Selective_Memory.py ├── MountainCarContinuous_SMA.py ├── MujocoHalfCheetah_v1.0.py ├── QasFeature │ ├── BipedalWalker_v3.py │ ├── BipedalWalker_v4.py │ ├── BipedalWalker_v7.py │ ├── HalfCheetah_SMQ_V1.py │ ├── Hopper_v2.0.py │ ├── LunarLanderContinuous_SMQ_v1.py │ └── LunarLander_SMQ_V1.py └── README.md ├── SimpleNN ├── MinFinder.py ├── Sonar.csv └── data-01-test-score.csv ├── SkillPolicyLearning ├── CartPole.py └── LunarLander.py ├── Stable_baselines3 ├── Readme.md ├── ppo_load.py └── ppo_main2.py ├── Tensorforce ├── Readme.md ├── tf_LunarLanderContinuous_ppo.py ├── tf_LunarLander_ppo.py ├── tf_loader.py └── tf_main.py └── img ├── DeepQN.png ├── LunarLandQLearning.png ├── Readme.md ├── Screen Shot 2017-11-01 at 7.41.58 PM.png ├── ScreenShot1.jpg ├── Walker.png ├── cCartPole.jpg ├── cPong.jpg └── cWalker.jpg /ActorCritic/README.md: -------------------------------------------------------------------------------- 1 | # Solving Bipedal Walker with Actor Critic with Python and Keras 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | ### What is Fit ML 13 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom-implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 14 | 15 | ### What is Bipedal Walker anyway? 16 | Bipedal Walker is an OpenAI Gym environment where an agent learns to control a bipedal walker in order to reach the end of an obstacle course. What makes this challenging is that: 17 | 1) The agent only receives limb coordinates along with Lidar information. 18 | 2) Actions are vectors of 4 real numbers. 19 | So our agent has to learn to balance, walk, run and jump on its own without any human intervention. 20 | 21 | ### Why Q-Learning alone doesn't work 22 | For those acquainted with Q-Learning, it quickly becomes clear that we cannot apply a greedy policy here. Relying on a Q-value function approximator and polling over a non-discrete action space, let alone a vector of continuous actions, is simply impractical. To overcome this challenge we use the Actor-Critic method, where one Neural Network approximates how good an action is, and the other learns what to do in any given situation. 23 | 24 | Let's see how this is implemented using Keras. 25 | 26 | ### Creating The Actor and the Critic 27 | Since we don't know how good an action is going to be until we have taken it, a common technique in Reinforcement Learning is to predict/approximate this with a function approximator, a.k.a. a Neural Network. We will call this first network QModel. It takes a state-action combination as input and estimates how good that action is. 28 | 29 | ```python 30 | #Initialize the Reward predictor model 31 | Qmodel = Sequential() 32 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 33 | Qmodel.add(Dense(4096, activation='tanh', input_dim=dataX.shape[1])) 34 | Qmodel.add(Dense(dataY.shape[1])) #dataY.shape[1] is 1, corresponding to the single real-valued approximation 35 | 36 | ``` 37 | 38 | We now need a way to act optimally at every state. This is where the Actor comes in: another function approximator that takes a state as input and outputs an action. 39 | 40 | ### Helper functions 41 | We then declare a set of helper functions that are going to be used to optimize our actions at every state. 42 | 43 | ```python 44 | def predictTotalRewards(qstate, action): 45 | qs_a = np.concatenate((qstate,action), axis=0) 46 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 47 | predX[0] = qs_a 48 | 49 | #print("trying to predict reward at qs_a", predX[0]) 50 | pred = Qmodel.predict(predX[0].reshape(1,predX.shape[1])) 51 | remembered_total_reward = pred[0][0] 52 | return remembered_total_reward 53 | 54 | 55 | def GetRememberedOptimalPolicy(qstate): 56 | predX = np.zeros(shape=(1,num_env_variables)) 57 | predX[0] = qstate 58 | 59 | #print("trying to predict the optimal action at qstate", predX[0]) 60 | pred = action_predictor_model.predict(predX[0].reshape(1,predX.shape[1])) 61 | r_remembered_optimal_policy = pred[0] 62 | return r_remembered_optimal_policy 63 | ``` 64 | 65 | ### Exploration 66 | As we initially have no concept of an optimal policy, we need to ensure that some actions are taken stochastically. 
This will prevent our model from stagnating in its improvement. 67 | 68 | ```python 69 | prob = np.random.rand(1) 70 | explore_prob = starting_explore_prob-(starting_explore_prob/num_games_to_play)*game 71 | 72 | #Choose between prediction and chance 73 | if prob < explore_prob: 74 | #take a random action 75 | a = env.action_space.sample() 76 | ``` 77 | 78 | 79 | #### A very good course on Actor-Critic 80 | http://mi.eng.cam.ac.uk/~mg436/LectureSlides/MLSALT7/L6.pdf 81 | 82 | ### Who is Michel Aka 83 | *Michel is an AI researcher and a graduate of the University of Montreal who currently works in the Healthcare industry.* 84 | -------------------------------------------------------------------------------- /DeepDeterministicSeletiveMemory/README.md: -------------------------------------------------------------------------------- 1 | # FitML 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | https://youtu.be/hKrFFeZqq3E 13 | 14 | ### How does Selective Memory work? 15 | 16 | The intuition behind Policy Gradient is that it optimizes the parameters of the network in the direction of a higher expected sum of rewards. What if we could do the same in a computationally more effective way that also turns out to be more intuitive? Enter what I am calling Selective Memory. 17 | 18 | 1) Our objective here is to ensure that the Policy function converges towards higher rewards. 19 | 20 | 2) We know that Neural Networks will converge towards the assigned labels of our data set and will also generalize (function approximation). 21 | 22 | 3) What if there was a way to select our training (reinforcement) data set so that it ensures that we converge towards our objective: higher expected rewards? 23 | 24 | Here we propose the approach of selectively remembering actions based on how high a reward was. In other words, the probability *P* of recording an action state (or a rollout) into memory depends on the actual sum of rewards yielded by this action trajectory. (Notice that we are not using the expected sum of rewards here but the actual value computed at the end of the rollout.) 25 | 26 | What does this look like in code? 27 | 28 | First we create our function approximators (Neural Networks): 29 | ```python 30 | #Initialize the Reward predictor model 31 | model = Sequential() 32 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 33 | model.add(Dense(1024, activation='relu', input_dim=dataX.shape[1])) 34 | model.add(Dense(256, activation='tanh')) 35 | model.add(Dense(dataY.shape[1])) 36 | opt = optimizers.adam(lr=learning_rate) 37 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 38 | 39 | 40 | #initialize the action predictor model 41 | action_predictor_model = Sequential() 42 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 43 | action_predictor_model.add(Dense(1024, activation='relu', input_dim=apdataX.shape[1])) 44 | action_predictor_model.add(Dense(512, activation='relu')) 45 | action_predictor_model.add(Dense(apdataY.shape[1],activation='tanh')) 46 | ``` 47 | 48 | Then we calculate the discounted sum of rewards at the end of each rollout using the Bellman equation. 49 | 50 | Then we carefully select what we want to remember, i.e. store in memory. 51 | 52 | There are a number of approaches we have used to discriminate on which State-Actions or State-Action-Rewards we keep in memory to train our Actor. One discriminates on each individual action state, the other on an entire rollout batch. Regardless, the principle is the same: we determine how good an action is compared to the average of the remembered good actions. 
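Before the selection step, the actual discounted return of every step in a finished rollout has to be computed. Below is a minimal, hypothetical sketch of that backward pass, not the exact code of this repository: the helper name `discounted_returns` and the variable `average_reward` are illustrative, `gameR` is assumed to hold one rollout's per-step rewards as an (n, 1) array, and `b_discount` is the discount factor.

```python
import numpy as np

def discounted_returns(gameR, b_discount=0.98):
    # Walk the finished rollout backwards, accumulating the
    # discounted sum of rewards for every step.
    returns = np.array(gameR, dtype=float)
    for i in range(returns.shape[0] - 2, -1, -1):
        returns[i][0] += b_discount * returns[i + 1][0]
    return returns

# Usage sketch: compute the returns for one rollout, then the average
# used as the comparison point for the selection rule.
# returns = discounted_returns(gameR)
# average_reward = returns.mean()
```

With these actual returns in hand, the selection rule itself can be as simple as the probability test below.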
53 | 54 | ```python 55 | def addToMemory(reward,averageReward): 56 | 57 | prob = 0.1 58 | if( reward > averageReward): 59 | prob = prob + 0.9 * math.tanh(reward - averageReward) 60 | else: 61 | prob = prob + 0.1 * math.tanh(reward - averageReward) 62 | 63 | if np.random.rand(1)<=prob : 64 | print("Adding reward",reward," based on prob ", prob) 65 | return True 66 | else: 67 | return False 68 | ``` 69 | 70 | ```python 71 | for i in range(0,gameR.shape[0]): 72 | if addToMemory(gameR[i][0], average_reward): 73 | tempGameSA = np.vstack((tempGameSA, gameSA[i])) 74 | tempGameA = np.vstack((tempGameA,gameA[i])) 75 | tempGameR = np.vstack((tempGameR,gameR[i])) 76 | tempGameS = np.vstack((tempGameS,gameS[i])) 77 | ``` 78 | 79 | Here gameSA, gameA, gameR and gameS represent the State-Action pairs, Actions, actual discounted sums of rewards and States respectively. 80 | 81 | When we get a new state, we then act based on the optimal policy, which has been trained on memory primed with only the best reward-yielding actions. 82 | ```python 83 | #Get the remembered optimal policy 84 | remembered_optimal_policy = GetRememberedOptimalPolicy(qs) 85 | a = remembered_optimal_policy 86 | ``` 87 | 88 | ### What type of results do we get? 89 | Our agent is able to crawl, stand up, walk, run and jump after 500 episodes in the famous OpenAI BipedalWalker test. After 3000 iterations, our agent is able to advance fast and is very stable on its feet. 90 | You can watch it in action here: https://youtu.be/hKrFFeZqq3E. 91 | 92 | 93 | ### What is Fit ML 94 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom-implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 
95 | 96 | ### Who is Michel Aka 97 | *Michel is an AI researcher and a graduate from University of Montreal who currently works in the Healthcare industry.* 98 | -------------------------------------------------------------------------------- /DeepQN/CartPole_QLearning.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole-v0 solution by Michel Aka 3 | https://github.com/FitMachineLearning/FitML/ 4 | https://www.youtube.com/channel/UCi7_WxajoowBl4_9P0DhzzA/featured 5 | Using DeepQ Learning 6 | 7 | ''' 8 | import numpy as np 9 | import keras 10 | import gym 11 | import os 12 | import h5py 13 | 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout 16 | from keras.layers import Embedding 17 | from keras.layers import LSTM 18 | from keras import optimizers 19 | 20 | 21 | num_env_variables = 4 22 | num_env_actions = 2 23 | num_training_exmaples = 30 24 | timesteps = 1 25 | num_initial_observation = 60 26 | learning_rate = 0.001 27 | weigths_filename = "Cartpole-weights_DQN_Mem_1Loop.h5" 28 | 29 | b_discount = 0.95 30 | max_memory_len = 5000 31 | num_failures_for_retrain = 10 32 | starting_explore_prob = 0.05 33 | initial_training_epochs = 2 34 | RL_training_eporcs = 2 35 | num_anticipation_steps = 6 36 | load_previous_weights = False 37 | observe_and_train = True 38 | Do_RL = True 39 | save_weights = True 40 | Learning_cycles = 1500 41 | 42 | 43 | #One hot encoding array 44 | possible_actions = np.arange(0,num_env_actions) 45 | actions_1_hot = np.zeros((num_env_actions,num_env_actions)) 46 | actions_1_hot[np.arange(num_env_actions),possible_actions] = 1 47 | 48 | #Create testing enviroment 49 | env = gym.make('CartPole-v0') 50 | env.reset() 51 | 52 | #initialize training matrix with random states and actions 53 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 54 | #Only one output for the total score 55 | dataY = np.random.random((num_training_exmaples,1)) 56 | 57 | 58 | 59 | #nitialize the LSTM with random weights 60 | 61 | model = Sequential() 62 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 63 | model.add(Dense(512, activation='relu', input_dim=dataX.shape[1])) 64 | model.add(Dense(dataY.shape[1])) 65 | 66 | opt = optimizers.adam(lr=learning_rate) 67 | 68 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 69 | 70 | #load previous model weights if they exist 71 | if load_previous_weights: 72 | dir_path = os.path.realpath(".") 73 | fn = dir_path + "/"+weigths_filename 74 | print("filepath ", fn) 75 | if os.path.isfile(fn): 76 | print("loading weights") 77 | model.load_weights(weigths_filename) 78 | else: 79 | print("File ",weigths_filename," does not exis. Retraining... 
") 80 | 81 | #Record first 500 in a sequence and add them to the training sequence 82 | total_steps = 0 83 | dataX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 84 | dataY = np.zeros(shape=(1,1)) 85 | 86 | memoryX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 87 | memoryY = np.zeros(shape=(1,1)) 88 | 89 | 90 | print("dataX shape", dataX.shape) 91 | print("dataY shape", dataY.shape) 92 | 93 | 94 | 95 | def predictTotalRewards(qstate, action): 96 | qs_a = np.concatenate((qstate,actions_1_hot[action]), axis=0) 97 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 98 | predX[0] = qs_a 99 | 100 | #print("trying to predict reward at qs_a", predX[0]) 101 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 102 | remembered_total_reward = pred[0][0] 103 | return remembered_total_reward 104 | 105 | 106 | 107 | if observe_and_train: 108 | #observe for 100 games 109 | 110 | 111 | for game in range(500): 112 | gameX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 113 | gameY = np.zeros(shape=(1,1)) 114 | #Get the Q state 115 | qs = env.reset() 116 | for step in range (1000): 117 | 118 | if game < num_initial_observation: 119 | #take a radmon action 120 | a = env.action_space.sample() 121 | else: 122 | prob = np.random.rand(1) 123 | explore_prob = starting_explore_prob-(starting_explore_prob/Learning_cycles)*game 124 | 125 | #Chose between prediction and chance 126 | if prob < explore_prob: 127 | #take a random action 128 | a=env.action_space.sample() 129 | #print("taking random action",a, "at total_steps" , total_steps) 130 | #print("prob ", prob, "explore_prob", explore_prob) 131 | 132 | else: 133 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 134 | #works best with looking 6 steps ahead 135 | #Also works best if you train the model more itterations 136 | utility_possible_actions = np.zeros(shape=(num_env_actions)) 137 | 138 | utility_possible_actions[0] = predictTotalRewards(qs,0) 139 | utility_possible_actions[1] = predictTotalRewards(qs,1) 140 | 141 | 142 | #chose argmax action of estimated anticipated rewards 143 | #print("utility_possible_actions ",utility_possible_actions) 144 | #print("argmax of utitity", np.argmax(utility_possible_actions)) 145 | a = np.argmax(utility_possible_actions) 146 | 147 | 148 | 149 | env.render() 150 | qs_a = np.concatenate((qs,actions_1_hot[a]), axis=0) 151 | 152 | #print("action",a," qs_a",qs_a) 153 | #get the target state and reward 154 | s,r,done,info = env.step(a) 155 | #record only the first x number of states 156 | 157 | if done and step <=196: 158 | r = -1 159 | 160 | if step ==0: 161 | gameX[0] = qs_a 162 | gameY[0] = np.array([r]) 163 | memoryX[0] = qs_a 164 | memoryY[0] = np.array([r]) 165 | 166 | gameX = np.vstack((gameX,qs_a)) 167 | gameY = np.vstack((gameY,np.array([r]))) 168 | 169 | 170 | if done : 171 | #GAME ENDED 172 | #Calculate Q values from end to start of game 173 | for i in range(0,gameY.shape[0]): 174 | #print("Updating total_reward at game epoch ",(gameY.shape[0]-1) - i) 175 | if i==0: 176 | #print("reward at the last step ",gameY[(gameY.shape[0]-1)-i][0]) 177 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0] 178 | else: 179 | #print("local error before Bellman", gameY[(gameY.shape[0]-1)-i][0],"Next error ", gameY[(gameY.shape[0]-1)-i+1][0]) 180 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0]+b_discount*gameY[(gameY.shape[0]-1)-i+1][0] 181 | #print("reward at step",i,"away from the end 
is",gameY[(gameY.shape[0]-1)-i][0]) 182 | if i==gameY.shape[0]-1: 183 | print("Training Game #",game, " steps = ", step ,"last reward", r," finished with headscore ", gameY[(gameY.shape[0]-1)-i][0]) 184 | 185 | if memoryX.shape[0] ==1: 186 | memoryX = gameX 187 | memoryY = gameY 188 | else: 189 | #Add experience to memory 190 | memoryX = np.concatenate((memoryX,gameX),axis=0) 191 | memoryY = np.concatenate((memoryY,gameY),axis=0) 192 | 193 | #if memory is full remove first element 194 | if np.alen(memoryX) >= max_memory_len: 195 | print("memory full. mem len ", np.alen(memoryX)) 196 | for l in range(np.alen(gameX)): 197 | memoryX = np.delete(memoryX, 0, axis=0) 198 | memoryY = np.delete(memoryY, 0, axis=0) 199 | 200 | #Update the states 201 | qs=s 202 | 203 | #Retrain every X failures after num_initial_observation 204 | if done and game >= num_env_actions: 205 | if game%10 == 0: 206 | print("Training game# ", game,"momory size", memoryX.shape[0]) 207 | model.fit(memoryX,memoryY, batch_size=32,epochs=initial_training_epochs,verbose=2) 208 | 209 | if done: 210 | if r > 0: 211 | print("Game ",game," WON***") 212 | #Game ended - Break 213 | break 214 | 215 | print("Observation complete. - Begin LSTM training") 216 | 217 | print("dataX shape", dataX.shape) 218 | print(dataX[0:20]) 219 | print("dataY shape", dataY.shape) 220 | print(dataY) 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | if save_weights: 230 | #Save model 231 | print("Saving weights") 232 | model.save_weights(weigths_filename) 233 | -------------------------------------------------------------------------------- /DeepQNPyTorch/DQN_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | import warnings 7 | warnings.filterwarnings("ignore", category=UserWarning) 8 | 9 | class DeepQNetwork(nn.Module): 10 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims,n_actions): 11 | super(DeepQNetwork, self).__init__() 12 | # print("input_dims ", input_dims[0], " n_actions ",n_actions) 13 | self.input_dims = input_dims[0]+n_actions 14 | self.fc1_dims = fc1_dims 15 | self.fc2_dims = fc2_dims 16 | self.n_actions = 1 17 | self.fc1 = nn.Linear(self.input_dims,self.fc1_dims) 18 | self.fc2 = nn.Linear(self.fc1_dims,self.fc2_dims) 19 | self.fc3 = nn.Linear(self.fc2_dims,self.n_actions) 20 | self.optimizer = optim.Adam(self.parameters(),lr=lr) 21 | self.loss = nn.MSELoss() 22 | if torch.cuda.is_available(): 23 | print("Using CUDA") 24 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 25 | 26 | # print("Cuda device ",self.device) 27 | self.to(self.device) 28 | 29 | def forward(self,state_action): 30 | #action to 1 hot 31 | # action_1hot = np.zeros(self.n_actions) 32 | # action_1hot[action] = 1.0 33 | # observation_state = np.append(observation,action_1hot) 34 | state = torch.Tensor(state_action).to(self.device) 35 | x = F.relu(self.fc1(state)) 36 | x = F.relu((self.fc2(x))) 37 | predicted_Q_value = self.fc3(x) 38 | return predicted_Q_value 39 | 40 | 41 | class Agent(object): 42 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, 43 | max_mem_size=100000, eps_end=0.01, eps_dec=0.996): 44 | self.gamma = gamma 45 | self.epsilon = epsilon 46 | self.eps_end = eps_end 47 | self.eps_dec = eps_dec 48 | self.lr = lr 49 | self.n_actions = n_actions 50 | # print("input_dims ", input_dims) 51 | self.input_dims = input_dims[0] 52 
| self.batch_size = batch_size 53 | self.action_space = [i for i in range(n_actions)] 54 | self.max_mem_size = max_mem_size 55 | self.mem_counter = 0 56 | self.Q_eval = DeepQNetwork(lr=lr,n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256) 57 | self.state_memory = np.zeros((self.max_mem_size,self.input_dims)) 58 | self.new_state_memory = np.zeros((self.max_mem_size,self.input_dims)) 59 | self.action_memory = np.zeros((self.max_mem_size,n_actions)) 60 | self.action_state_memory = np.zeros((self.max_mem_size,self.input_dims+n_actions)) 61 | self.reward_memory = np.zeros(self.max_mem_size) 62 | self.Q_memory = np.zeros(self.max_mem_size) 63 | self.terminal_memory = np.zeros(self.max_mem_size) 64 | 65 | def store_transition(self, state, action, reward, state_, terminal): 66 | index = self.mem_counter % self.max_mem_size 67 | self.state_memory[index] = state 68 | actions = np.zeros(self.n_actions) 69 | actions[action] = 1.0 70 | self.action_memory[index] = actions 71 | self.reward_memory[index] = reward 72 | self.terminal_memory[index] = terminal 73 | self.new_state_memory[index] = state_ 74 | self.action_state_memory[index] = np.append(state,actions) 75 | self.mem_counter+=1 76 | 77 | def calculate_bellman(self,episode_len): 78 | for i in range(episode_len): 79 | index = ((self.mem_counter-1)-i) % self.max_mem_size 80 | next_index = ((self.mem_counter)-i) % self.max_mem_size 81 | if i==0: 82 | self.Q_memory[index] = self.reward_memory[index] 83 | # print("last Q ", self.Q_memory[index]) 84 | else: 85 | self.Q_memory[index] = self.reward_memory[index] + self.gamma * self.Q_memory[next_index] 86 | # print("Q ", self.Q_memory[index]) 87 | 88 | def update_epsilon(self): 89 | self.epsilon = self.epsilon * self.eps_dec if self.epsilon > self.eps_end else self.eps_end 90 | 91 | def process_end_of_episode(self,episode_len): 92 | self.calculate_bellman(episode_len) 93 | self.update_epsilon() 94 | 95 | def choose_action(self, observation): 96 | rand = np.random.random() 97 | if rand < self.epsilon: 98 | action = np.random.choice(self.action_space) 99 | else: 100 | estimated_Q_values = torch.Tensor( np.zeros(self.n_actions)).to(self.Q_eval.device) 101 | for i in range (self.n_actions): 102 | action_1hot = np.zeros(self.n_actions) 103 | action_1hot[i] = 1.0 104 | # print("about to evaulate action ",i, " array of action ", action_1hot," with observation", observation[:10]) 105 | # print("concatanated sate action vector ", np.append(observation,action_1hot) ) 106 | estimated_Q_values[i] = self.Q_eval.forward( np.append(observation,action_1hot) ) 107 | 108 | 109 | # actions = self.Q_eval.forward(observation) 110 | 111 | # print("estimated Q values", estimated_Q_values) 112 | action = torch.argmax(estimated_Q_values).item() 113 | return action 114 | 115 | def learn(self, step_counter): 116 | if self.mem_counter > self.batch_size: 117 | self.Q_eval.optimizer.zero_grad() 118 | max_mem = self.mem_counter if self.mem_counter < self.max_mem_size else self.max_mem_size 119 | batch = np.random.choice(max_mem,self.batch_size) 120 | #print("batch size", batch.size()) 121 | state_batch = self.state_memory[batch] 122 | action_batch = self.action_memory[batch] 123 | action_values = np.array(self.action_space, dtype=np.uint8) 124 | action_indices = np.dot(action_batch, action_values) 125 | reward_batch = self.reward_memory[batch] 126 | q_batch = self.Q_memory[batch] 127 | terminal_batch = self.terminal_memory[batch] 128 | new_state_batch = self.new_state_memory[batch] 129 | action_state_batch = 
self.action_state_memory[batch] 130 | 131 | reward_batch = torch.Tensor(reward_batch).to(self.Q_eval.device) 132 | terminal_batch = torch.Tensor(terminal_batch).to(self.Q_eval.device) 133 | 134 | q_eval = self.Q_eval.forward(action_state_batch).to(self.Q_eval.device) 135 | q_target = torch.Tensor(q_batch).to(self.Q_eval.device) 136 | # q_next = self.Q_eval.forward(new_state_batch).to(self.Q_eval.device) 137 | 138 | # batch_index = np.arange(self.batch_size, dtype=np.int32) 139 | # q_target[action_batch] = reward_batch + self.gamma*torch.max(q_next, dim=1)[0]*terminal_batch 140 | 141 | # self.epsilon = self.epsilon * self.eps_dec if self.epsilon > self.eps_end else self.eps_end 142 | # if (step_counter%50)==49: 143 | # print("Q eval ",q_eval, "q target", q_target) 144 | loss = self.Q_eval.loss(q_eval,q_target) 145 | loss.backward() 146 | self.Q_eval.optimizer.step() 147 | print(torch.cuda.is_available()) 148 | -------------------------------------------------------------------------------- /DeepQNPyTorch/Lander_torch.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from DQN_torch import Agent 3 | # from utils import plotLearning 4 | import numpy as np 5 | import warnings 6 | warnings.filterwarnings("ignore", category=UserWarning) 7 | 8 | if __name__ =='__main__': 9 | env=gym.make('LunarLander-v2') 10 | brain = Agent(gamma=0.98, epsilon=0.7, batch_size=2, n_actions=4, 11 | input_dims=[8], lr=0.01, eps_end=0.02,eps_dec=0.992) 12 | scores = [] 13 | eps_history = [] 14 | n_games = 50000 15 | score = 0 16 | 17 | for i in range (n_games): 18 | if i%10 == 0 and i>0 and i>10: 19 | avg_score = np.mean(scores[:-10]) 20 | print('epside ', i, 'score', score, 21 | 'average score %3f' % avg_score, 22 | 'epsilon %3f' % brain.epsilon) 23 | else: 24 | print('episode ',i, 'score', score) 25 | score = 0 26 | 27 | eps_history.append(brain.epsilon) 28 | observation = env.reset() 29 | done = False 30 | step_counter = 0 31 | while not done: 32 | env.render() 33 | action = brain.choose_action(observation) 34 | observation_, reward, done, info = env.step(action) 35 | # print("chosen action after chose action ", action) 36 | score+=reward 37 | brain.store_transition(observation,action, reward, observation_,done) 38 | if i>5: 39 | brain.learn(step_counter) 40 | observation = observation_ 41 | step_counter += 1 42 | # EPISODE done 43 | # "CALCULATE BELL MAN IN AGENT CLASS brain.compute_reward()" 44 | brain.process_end_of_episode(step_counter) 45 | scores.append(score) 46 | 47 | # x = [i+1 for i in range(n_games)] 48 | # filename = 'lunar-lander.png' 49 | # plotLearning(x, scores, eps_history, filename,) 50 | -------------------------------------------------------------------------------- /DeepQNPyTorch/Readme.md: -------------------------------------------------------------------------------- 1 | # This forlder contains RL pytorch agents for open AI 2 | -------------------------------------------------------------------------------- /Experimental/CNN_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | https://hackernoon.com/visualizing-parts-of-convolutional-neural-networks-using-keras-and-cats-5cc01b214e59 4 | https://stackoverflow.com/questions/43895750/keras-input-shape-for-conv2d-and-manually-loaded-images 5 | ''' 6 | 7 | import matplotlib.pylab as plt 8 | import matplotlib.image as mpimg 9 | import numpy as np 10 | import scipy 11 | import keras 12 | 13 | from keras.models import Sequential 14 | from 
keras.layers import Conv2D 15 | 16 | cat = mpimg.imread('cat.png') 17 | print("Shape", cat.shape) 18 | plt.imshow(cat) 19 | plt.show() 20 | 21 | def show_cat(cat_batch): 22 | print("cat shape before transfo",cat_batch.shape) 23 | cat = np.squeeze(cat_batch,axis=0) 24 | print( "cat.shape", cat.shape) 25 | plt.imshow(cat) 26 | plt.show() 27 | 28 | cat_batch = cat.reshape(1,cat.shape[0],cat.shape[1],4) 29 | 30 | input_shape = ( cat.shape[0], cat.shape[1], 4 ) 31 | 32 | model = Sequential() 33 | model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', input_shape=input_shape)) 34 | 35 | print("predicting ... ") 36 | conv_cat = model.predict(cat_batch) 37 | show_cat(conv_cat) 38 | -------------------------------------------------------------------------------- /Experimental/Cartpol_CNN_RL.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole solution by the Author of the Fit Machine Learning Blog 3 | https://github.com/FitMachineLearning/FitML/ 4 | 5 | This solution oberserves the first 30 games 6 | Then plays after being trained from first 30 games. 7 | 8 | While it plays it will take actions/policies that are sometimes random enabling 9 | it to explore solution it hasn't yet explored. 10 | 11 | The proability of random action reduces over each episode as it gets more and more comfortable with the rules of the game. 12 | 13 | ''' 14 | import numpy as np 15 | import keras 16 | import gym 17 | import os 18 | import h5py 19 | 20 | from keras.models import Sequential 21 | from keras.layers import Dense, Dropout 22 | from keras.layers import Embedding 23 | from keras.layers import LSTM 24 | from keras import optimizers 25 | 26 | 27 | num_env_variables = 4 28 | num_env_actions = 1 29 | num_training_exmaples = 100 30 | timesteps = 1 31 | num_initial_observation = 4000 32 | 33 | 34 | num_failures_for_retrain = 5 35 | starting_explore_prob = 0.30 36 | training_epochs = 500 37 | num_anticipation_steps = 8 38 | load_previous_weights = False 39 | observe_and_train = True 40 | save_weights = True 41 | 42 | 43 | #Create testing enviroment 44 | env = gym.make('CartPole-v0') 45 | env.reset() 46 | 47 | #initialize training matrix with random states and actions 48 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 49 | #initize training matrix corresponding expected states and expected rewards (random) 50 | dataY = np.random.random((num_training_exmaples,num_env_variables+1)) 51 | 52 | 53 | 54 | #nitialize the LSTM with random weights 55 | 56 | model = Sequential() 57 | model.add(Dense(20, activation='relu', input_dim=dataX.shape[1])) 58 | model.add(Dense(20)) 59 | model.add(Dense(dataY.shape[1])) 60 | 61 | opt = optimizers.adam(lr=0.01) 62 | 63 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 64 | 65 | #load previous model weights if they exist 66 | if load_previous_weights: 67 | dir_path = os.path.realpath(".") 68 | fn = dir_path + "/CP-weights_RL_CNN.h5" 69 | print("filepath ", fn) 70 | if os.path.isfile(fn): 71 | print("loading weights") 72 | model.load_weights("CP-weights_RL_CNN.h5") 73 | else: 74 | print("File CP-weights_RL_CNN.h5 does not exis. Retraining... 
") 75 | 76 | #Record first 500 in a sequence and add them to the training sequence 77 | total_steps = 0 78 | dataX = np.zeros(shape=(1,5)) 79 | dataY = np.zeros(shape=(1,5)) 80 | 81 | print("dataX shape", dataX.shape) 82 | print("dataY shape", dataY.shape) 83 | 84 | if observe_and_train: 85 | #observe for 100 games 86 | for game in range(30): 87 | 88 | if total_steps >= num_initial_observation: 89 | break 90 | #Get the Q state 91 | qs = env.reset() 92 | for step in range (200): 93 | a=0 94 | if np.random.rand(1) < 0.5: 95 | a=0 96 | else: 97 | a=1 98 | env.render() 99 | qs_a = np.concatenate((qs,np.array([a])), axis=0) 100 | 101 | #get the target state and reward 102 | s,r,done,info = env.step(a) 103 | 104 | #set reward in case of failure 105 | if done: 106 | r = -1 107 | 108 | #concatenate target state and reward 109 | s_r = np.concatenate((s,np.array([r])), axis=0) 110 | 111 | if done: 112 | #print negative reward array 113 | print("Negative reward s_r: ", s_r) 114 | 115 | #print("reward = ", r) 116 | #print("target state", s) 117 | #print("concatenate(s,r)", s_r) 118 | 119 | 120 | #record only the first x number of states 121 | if total_steps ==0: 122 | dataX[0] = qs_a 123 | dataY[0] = s_r 124 | 125 | if total_steps < (num_initial_observation-1): 126 | dataX = np.vstack((dataX,qs_a)) 127 | dataY = np.vstack((dataY,s_r)) 128 | 129 | #Update the states 130 | qs=s 131 | 132 | 133 | total_steps += 1 134 | if done : 135 | break 136 | 137 | print("Observation complete. - Begin LSTM training") 138 | 139 | print("dataX shape", dataX.shape) 140 | print(dataX[0:5]) 141 | print("dataY shape", dataY.shape) 142 | print(dataY[0:5]) 143 | 144 | 145 | #feedX = np.reshape(dataX, (dataX.shape[0], 1, dataX.shape[1] )) 146 | #feedY = np.reshape(dataY, (dataY.shape[0], 1, dataY.shape[1] )) 147 | feedX = dataX 148 | feedY = dataY 149 | 150 | 151 | #The more epochs you train the model, the better is becomes at predicting future states 152 | #This in turn will improve the results of the Bellman equation and thus will lead us to 153 | # better decisions in our MDP process 154 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 155 | 156 | print("total_steps ", total_steps) 157 | print("dataX ", dataX[0:10]) 158 | print("dataY ", dataY[0:10]) 159 | #print("dataY ", dataY) 160 | 161 | print("Initial training complete. 
Begin tentative exploration.") 162 | 163 | ''' 164 | dataX = np.random.random((1,5)) 165 | res = model.predict(dataX[0].reshape(1,dataX.shape[1])) 166 | nstate = res[0][:-1] 167 | print("predicted output ", res) 168 | print("expected reward ", res[0][4]) 169 | print("expected state ", nstate) 170 | ''' 171 | 172 | def estimateReward(qstate,action, depth): 173 | if depth <= 0: 174 | return 0 175 | #calculate/estimate reward at this state and get the next state 176 | qs_a = np.concatenate((qstate,np.array([action])), axis=0) 177 | predX = np.zeros(shape=(1,5)) 178 | predX[0] = qs_a 179 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 180 | reward = pred[0][4] 181 | expected_state = pred[0][:-1] 182 | 183 | ''' 184 | print("depth -- ", depth) 185 | print("qstate", qstate) 186 | print("action", action) 187 | print("pred", pred) 188 | print("expected_state", expected_state) 189 | print("reward", reward) 190 | ''' 191 | # Bellman -- reward at this state = reward + Sum of discounted expected rewards for all actions (recursively) 192 | #recursively calculate the reward at future states for all possible actions 193 | discounted_future_rewards = 0.95*estimateReward(expected_state,0,depth-1)+ 0.95*estimateReward(expected_state,1,depth-1) 194 | 195 | #print("discounted_future_rewards", discounted_future_rewards) 196 | #add current state and discounted future state reward 197 | return reward + discounted_future_rewards 198 | 199 | 200 | print("** Estimating reward for dataX[0] with action 1 usint Bellman", estimateReward(dataX[0][:-1],1,2)) 201 | print("** Estimating reward for dataX[0] with action 0 usint Bellman", estimateReward(dataX[0][:-1],0,2)) 202 | 203 | 204 | 205 | ##### 206 | ##### 207 | #Play the game for X rounds using the Bellman with LSTM anticipation model 208 | 209 | explore_prob = starting_explore_prob 210 | failures = 0 211 | for game in range(20): 212 | total_steps =0 213 | #Get the Q state 214 | qs = env.reset() 215 | #over the next 50 games reduce the probability of explore_prob 216 | explore_prob = starting_explore_prob-(starting_explore_prob/20.0)*game 217 | print("- Episode", game, " explore_prob",explore_prob) 218 | for step in range (300): 219 | 220 | prob = np.random.rand(1) 221 | 222 | #Chose between prediction and chance 223 | if prob < explore_prob: 224 | #take a random action 225 | #print("taking random action ", total_steps) 226 | #print("prob ", prob, "explore_prob", explore_prob) 227 | if np.random.rand(1) < 0.5: 228 | a=0 229 | else: 230 | a=1 231 | else: 232 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 233 | #works best with looking 6 steps ahead 234 | #Also works best if you train the model more itterations 235 | estimated_anticipated_reward_a = estimateReward(qs,1,num_anticipation_steps) 236 | estimated_anticipated_reward_b = estimateReward(qs,0,num_anticipation_steps) 237 | #print(" estimated rewards a and b", estimated_anticipated_reward_a, estimated_anticipated_reward_b) 238 | #chose argmax action of estimated anticipated rewards 239 | if estimated_anticipated_reward_a > estimated_anticipated_reward_b: 240 | a = 1 241 | else: 242 | a = 0 243 | 244 | 245 | env.render() 246 | #get the target state and reward 247 | s,r,done,info = env.step(a) 248 | qs=s 249 | 250 | 251 | #set reward in case of failure 252 | if done: 253 | if total_steps >= 198: 254 | print("*** Game Won after ", total_steps, " steps") 255 | else: 256 | r = -1 257 | print("*** failed after ", total_steps, " steps") 258 | 259 | 
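# Note: qs has already been updated to the post-step state (qs = s) above, so the
# state-action pair stored below pairs the chosen action with the *resulting* state,
# unlike the initial observation loop earlier in this file, which concatenates the
# pre-step state before calling env.step.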
qs_a = np.concatenate((qs,np.array([a])), axis=0) 260 | s_r = np.concatenate((s,np.array([r])), axis=0) 261 | #add event to training set 262 | dataX = np.vstack((dataX,qs_a)) 263 | dataY = np.vstack((dataY,s_r)) 264 | 265 | total_steps += 1 266 | if done and r==-1: 267 | #If you fail, add new knowledge to NN 268 | #Retrain the NN 269 | #break 270 | print("Retraining the network") 271 | feedX = dataX 272 | feedY = dataY 273 | failures +=1 274 | 275 | #retrain after every 6 failures 276 | if failures >= num_failures_for_retrain: 277 | print("Retraining the network after failure ", failures) 278 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 279 | failures = 0 280 | 281 | 282 | if done: 283 | break 284 | 285 | if save_weights: 286 | #Save model 287 | print("Saving weights") 288 | model.save_weights("CP-weights_RL_CNN.h5") 289 | -------------------------------------------------------------------------------- /Experimental/Cartpole_simple_CNN.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole solution by the Author of the Fit Machine Learning Blog 3 | https://github.com/FitMachineLearning/FitML/ 4 | 5 | This is the simplified version of the open AI problem. This solution does not use LSTMs but uses 6 | One Dense RELU and one Dense sigmoid instead. 7 | 8 | It does not perform as well as the LSTM model which is better able to do longer term predictions. 9 | 10 | ''' 11 | import numpy as np 12 | import keras 13 | import gym 14 | import os 15 | import h5py 16 | 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout 19 | from keras.layers import Embedding 20 | from keras.layers import LSTM 21 | from keras import optimizers 22 | 23 | 24 | num_env_variables = 4 25 | num_env_actions = 1 26 | num_training_exmaples = 100 27 | timesteps = 1 28 | num_initial_observation = 4000 29 | training_epochs = 500 30 | num_anticipation_steps = 7 31 | load_previous_weights = True 32 | observe_and_train = False 33 | save_weights = False 34 | 35 | 36 | #Create testing enviroment 37 | env = gym.make('CartPole-v0') 38 | env.reset() 39 | 40 | #initialize training matrix with random states and actions 41 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 42 | #initize training matrix corresponding expected states and expected rewards (random) 43 | dataY = np.random.random((num_training_exmaples,num_env_variables+1)) 44 | 45 | 46 | 47 | #nitialize the LSTM with random weights 48 | 49 | model = Sequential() 50 | model.add(Dense(16, activation='relu', input_dim=dataX.shape[1])) 51 | model.add(Dense(dataY.shape[1])) 52 | 53 | opt = optimizers.adam(lr=0.01) 54 | 55 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 56 | 57 | #load previous model weights if they exist 58 | if load_previous_weights: 59 | dir_path = os.path.realpath(".") 60 | fn = dir_path + "/CP-weights_Simple_CNN.h5" 61 | print("filepath ", fn) 62 | if os.path.isfile(fn): 63 | print("loading weights") 64 | model.load_weights("CP-weights_Simple_CNN.h5") 65 | else: 66 | print("File CP-weights.h5 does not exis. Retraining... 
") 67 | 68 | #Record first 500 in a sequence and add them to the training sequence 69 | total_steps = 0 70 | dataX = np.zeros(shape=(1,5)) 71 | dataY = np.zeros(shape=(1,5)) 72 | 73 | print("dataX shape", dataX.shape) 74 | print("dataY shape", dataY.shape) 75 | 76 | if observe_and_train: 77 | #observe for 100 games 78 | for game in range(100): 79 | 80 | if total_steps >= num_initial_observation: 81 | break 82 | #Get the Q state 83 | qs = env.reset() 84 | for step in range (200): 85 | a=0 86 | if np.random.rand(1) < 0.5: 87 | a=0 88 | else: 89 | a=1 90 | env.render() 91 | qs_a = np.concatenate((qs,np.array([a])), axis=0) 92 | 93 | #get the target state and reward 94 | s,r,done,info = env.step(a) 95 | 96 | #set reward in case of failure 97 | if done: 98 | r = -1 99 | 100 | #concatenate target state and reward 101 | s_r = np.concatenate((s,np.array([r])), axis=0) 102 | 103 | if done: 104 | #print negative reward array 105 | print("Negative reward s_r: ", s_r) 106 | 107 | #print("reward = ", r) 108 | #print("target state", s) 109 | #print("concatenate(s,r)", s_r) 110 | 111 | 112 | #record only the first x number of states 113 | if total_steps ==0: 114 | dataX[0] = qs_a 115 | dataY[0] = s_r 116 | 117 | if total_steps < (num_initial_observation-1): 118 | dataX = np.vstack((dataX,qs_a)) 119 | dataY = np.vstack((dataY,s_r)) 120 | 121 | #Update the states 122 | qs=s 123 | 124 | 125 | total_steps += 1 126 | if done : 127 | break 128 | 129 | print("Observation complete. - Begin LSTM training") 130 | 131 | print("dataX shape", dataX.shape) 132 | print(dataX[0:5]) 133 | print("dataY shape", dataY.shape) 134 | print(dataY[0:5]) 135 | 136 | 137 | #feedX = np.reshape(dataX, (dataX.shape[0], 1, dataX.shape[1] )) 138 | #feedY = np.reshape(dataY, (dataY.shape[0], 1, dataY.shape[1] )) 139 | feedX = dataX 140 | feedY = dataY 141 | 142 | 143 | #The more epochs you train the model, the better is becomes at predicting future states 144 | #This in turn will improve the results of the Bellman equation and thus will lead us to 145 | # better decisions in our MDP process 146 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 147 | 148 | print("total_steps ", total_steps) 149 | print("dataX ", dataX[0:10]) 150 | print("dataY ", dataY[0:10]) 151 | #print("dataY ", dataY) 152 | 153 | 154 | dataX = np.random.random((1,5)) 155 | 156 | 157 | 158 | res = model.predict(dataX[0].reshape(1,dataX.shape[1])) 159 | nstate = res[0][:-1] 160 | 161 | print("predicted output ", res) 162 | print("expected reward ", res[0][4]) 163 | print("expected state ", nstate) 164 | 165 | def estimateReward(qstate,action, depth): 166 | if depth <= 0: 167 | return 0 168 | #calculate/estimate reward at this state and get the next state 169 | qs_a = np.concatenate((qstate,np.array([action])), axis=0) 170 | predX = np.zeros(shape=(1,5)) 171 | predX[0] = qs_a 172 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 173 | reward = pred[0][4] 174 | expected_state = pred[0][:-1] 175 | 176 | ''' 177 | print("depth -- ", depth) 178 | print("qstate", qstate) 179 | print("action", action) 180 | print("pred", pred) 181 | print("expected_state", expected_state) 182 | print("reward", reward) 183 | ''' 184 | # Bellman -- reward at this state = reward + Sum of discounted expected rewards for all actions (recursively) 185 | #recursively calculate the reward at future states for all possible actions 186 | discounted_future_rewards = 0.95*estimateReward(expected_state,0,depth-1)+ 0.95*estimateReward(expected_state,1,depth-1) 187 | 188 | 
#print("discounted_future_rewards", discounted_future_rewards) 189 | #add current state and discounted future state reward 190 | return reward + discounted_future_rewards 191 | 192 | 193 | print("** Estimating reward for dataX[0] with action 1 usint Bellman", estimateReward(dataX[0][:-1],1,2)) 194 | print("** Estimating reward for dataX[0] with action 0 usint Bellman", estimateReward(dataX[0][:-1],0,2)) 195 | 196 | 197 | 198 | ##### 199 | ##### 200 | #Play the game for X rounds using the Bellman with LSTM anticipation model 201 | 202 | 203 | for game in range(3): 204 | total_steps =0 205 | #Get the Q state 206 | qs = env.reset() 207 | for step in range (300): 208 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 209 | #works best with looking 6 steps ahead 210 | #Also works best if you train the model more itterations 211 | estimated_anticipated_reward_a = estimateReward(qs,1,num_anticipation_steps) 212 | estimated_anticipated_reward_b = estimateReward(qs,0,num_anticipation_steps) 213 | #print(" estimated rewards a and b", estimated_anticipated_reward_a, estimated_anticipated_reward_b) 214 | 215 | #chose argmax action of estimated anticipated rewards 216 | if estimated_anticipated_reward_a > estimated_anticipated_reward_b: 217 | a = 1 218 | else: 219 | a = 0 220 | 221 | env.render() 222 | 223 | 224 | #get the target state and reward 225 | s,r,done,info = env.step(a) 226 | 227 | 228 | 229 | qs=s 230 | #set reward in case of failure 231 | if done: 232 | r = -1 233 | if total_steps >= 198: 234 | print("*** Game Won after ", total_steps, " steps") 235 | else: 236 | print("** failed after ", total_steps, " steps") 237 | 238 | 239 | total_steps += 1 240 | if done : 241 | break 242 | 243 | if save_weights: 244 | #Save model 245 | print("Saving weights") 246 | model.save_weights("CP-weights_Simple_CNN.h5") 247 | -------------------------------------------------------------------------------- /Experimental/MNIST_image_Classification.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | MNIST dataset classification with Keras. 4 | Credits to fchollet. Find him on github. 
5 | ''' 6 | 7 | from __future__ import print_function 8 | import keras 9 | from keras.datasets import mnist 10 | from keras.models import Sequential 11 | from keras.layers import Dense,Dropout,Flatten 12 | from keras.layers import Conv2D, MaxPooling2D 13 | from keras import backend as K 14 | 15 | batch_size = 128 16 | num_class = 10 17 | epochs = 3 18 | 19 | #input image dimensions 20 | img_rows, img_cols = 28,28 21 | 22 | #seperate train and test dataset 23 | (x_train,y_train), (x_test,y_test) = mnist.load_data() 24 | 25 | if K.image_data_format() == 'channels_first': 26 | x_train = x_train.reshape(x_train.shape[0],1,img_rows,img_cols) 27 | x_test = x_test.reshape(x_test.shape[0],1,img_rows,img_cols) 28 | input_shape = (1,img_rows,img_cols) 29 | else: 30 | x_train = x_train.reshape(x_train.shape[0],img_rows,img_cols,1) 31 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 32 | input_shape = (img_rows,img_cols,1) 33 | 34 | x_train = x_train.astype('float32') 35 | x_test = x_test.astype('float32') 36 | x_train /=255 37 | x_test /=255 38 | print('x_train shape', x_train.shape) 39 | print('x_test shape', x_test.shape) 40 | print('y_train', y_train) 41 | 42 | 43 | # convert class to binanry class 44 | y_train = keras.utils.to_categorical(y_train,num_class) 45 | y_test = keras.utils.to_categorical(y_test,num_class) 46 | 47 | #print('y_train', y_train) 48 | 49 | #Declare the model 50 | model = Sequential() 51 | model.add(Conv2D(32,kernel_size=(3,3), activation='relu',input_shape=input_shape)) 52 | model.add(Conv2D(64,(3,3), activation='relu')) 53 | model.add(MaxPooling2D(pool_size=(2, 2))) 54 | model.add(Dropout(0.25)) 55 | model.add(Flatten()) 56 | model.add(Dense(128,activation='relu')) 57 | model.add((Dropout(0.5))) 58 | model.add(Dense(num_class,activation='softmax')) 59 | 60 | model.compile(loss=keras.losses.categorical_crossentropy, 61 | optimizer=keras.optimizers.Adadelta(), 62 | metrics=['accuracy']) 63 | 64 | model.fit(x_train,y_train, 65 | batch_size=batch_size, 66 | epochs=epochs, 67 | verbose=1, 68 | validation_data=(x_test,y_test)) 69 | 70 | score = model.evaluate(x_test[1000:],y_test[1000:],verbose=0) 71 | print('Test loss:', score[0]) 72 | print('Test accuracy:', score[1]) 73 | -------------------------------------------------------------------------------- /Experimental/Readme.md: -------------------------------------------------------------------------------- 1 | # Collection of experimental code samples 2 | -------------------------------------------------------------------------------- /Experimental/image_rescale.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | https://hackernoon.com/visualizing-parts-of-convolutional-neural-networks-using-keras-and-cats-5cc01b214e59 4 | https://stackoverflow.com/questions/43895750/keras-input-shape-for-conv2d-and-manually-loaded-images 5 | ''' 6 | 7 | import matplotlib.pylab as plt 8 | import matplotlib.image as mpimg 9 | import numpy as np 10 | import scipy 11 | import keras 12 | 13 | from scipy import misc 14 | from keras.models import Sequential 15 | from keras.layers import Conv2D 16 | 17 | 18 | 19 | def show_cat(cat_batch): 20 | print("cat shape before transfo",cat_batch.shape) 21 | cat = np.squeeze(cat_batch,axis=0) 22 | print( "cat.shape", cat.shape) 23 | plt.imshow(cat) 24 | plt.show() 25 | 26 | def resize_cat(cat): 27 | cat = scipy.misc.imresize(cat,size=(cat.shape[0]/2,cat.shape[1]/2)) 28 | plt.imshow(cat) 29 | plt.show() 30 | 31 | cat = mpimg.imread('cat.png') 32 | 
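# Note: matplotlib's imread returns a float image array for PNG files; this script
# assumes 'cat.png' is RGBA, i.e. shape (height, width, 4), which is why the
# reshape and the Conv2D input_shape below use 4 channels.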
print("Shape", cat.shape) 33 | plt.imshow(cat) 34 | plt.show() 35 | resize_cat(cat) 36 | 37 | cat_batch = cat.reshape(1,cat.shape[0],cat.shape[1],4) 38 | 39 | input_shape = ( cat.shape[0], cat.shape[1], 4 ) 40 | 41 | model = Sequential() 42 | model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', input_shape=input_shape)) 43 | 44 | print("predicting ... ") 45 | conv_cat = model.predict(cat_batch) 46 | show_cat(conv_cat) 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 FitMachineLearning 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NeuroEvolution/Implementations/RS_Hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | import roboschool 5 | 6 | from keras.layers.advanced_activations import LeakyReLU, PReLU 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Dropout 9 | from keras import optimizers 10 | 11 | from Lib.Individual import Individual 12 | 13 | ENVIRONMENT_NAME = "RoboschoolHopper-v1" 14 | OBSERVATION_SPACE = 15 15 | ACTION_SPACE = 3 16 | 17 | B_DISCOUNT = 0.98 18 | 19 | POPULATION_SIZE = 10 20 | NETWORK_WIDTH = 512 21 | NUM_TEST_EPISODES = 3 22 | NUM_SELECTED_FOR_REPRODUCTION = 2 23 | NOISE_SIGMA = 0.3 24 | MUTATION_PROB = 0.05 25 | 26 | 27 | MAX_GENERATIONS = 20000 28 | 29 | CLIP_ACTIONS = True 30 | MAX_STEPS = 996 31 | 32 | all_individuals = [] 33 | generations_count = 0 34 | total_population_counter = 0 35 | 36 | 37 | 38 | 39 | 40 | '''---------ENVIRONMENT INITIALIZATION--------''' 41 | 42 | env = gym.make(ENVIRONMENT_NAME) 43 | #env.render(mode="human") 44 | env.reset() 45 | 46 | print("-- Observations",env.observation_space) 47 | print("-- actionspace",env.action_space) 48 | 49 | 50 | #initialize training matrix with random states and actions 51 | apdataX = np.random.random(( 5,OBSERVATION_SPACE )) 52 | apdataY = np.random.random((5,ACTION_SPACE)) 53 | 54 | 55 | '''---------------------''' 56 | 57 | def GetRememberedOptimalPolicy(targetModel,qstate): 58 | predX = np.zeros(shape=(1,OBSERVATION_SPACE)) 59 | predX[0] = qstate 60 | 61 | #print("trying to predict reward at qs_a", predX[0]) 62 | pred = targetModel.predict(predX[0].reshape(1,predX.shape[1])) 63 | 
r_remembered_optimal_policy = pred[0] 64 | return r_remembered_optimal_policy 65 | 66 | 67 | def create_model(network_width, observation_space, action_space): 68 | action_predictor_model = Sequential() 69 | action_predictor_model.add(Dense(network_width, activation='relu', input_dim=observation_space)) 70 | action_predictor_model.add(Dense(action_space)) 71 | return action_predictor_model 72 | 73 | def initialize_population(population_size,network_width, observation_space, action_space, environment_name,total_population_counter): 74 | initial_population = [] 75 | for i in range (population_size): 76 | action_predictor_model = create_model(network_width, observation_space, action_space) 77 | indiv = Individual(generationID=0, indivID=total_population_counter , network = action_predictor_model) 78 | total_population_counter += 1 79 | initial_population.append(indiv) 80 | return initial_population, total_population_counter 81 | 82 | def test_individual(indiv,num_test_episodes): 83 | indiv.lifeScore = 0 84 | allRewards = [] 85 | for i in range(num_test_episodes): 86 | episodeRewards = [] 87 | #print("episode "+str(i)+" performing test for indiv ",indiv.printme()) 88 | qs = env.reset() 89 | for step in range (5000): 90 | a = GetRememberedOptimalPolicy(indiv.network, qs) 91 | if CLIP_ACTIONS: 92 | for i in range (np.alen(a)): 93 | if a[i] < -1: a[i]=-0.99999999999 94 | if a[i] > 1: a[i] = 0.99999999999 95 | qs,r,done,info = env.step(a) 96 | episodeRewards.append(r) 97 | #indiv.lifeScore += r 98 | env.render() 99 | if step > MAX_STEPS: 100 | done = True 101 | if done: 102 | episodeRewards.reverse() 103 | for j in range(len(episodeRewards)): 104 | #if j ==0: 105 | # print("last reward ",episodeRewards[j]) 106 | if j > 0: 107 | episodeRewards[j] = episodeRewards[j] + B_DISCOUNT * episodeRewards[j-1] 108 | #avg = sum(episodeRewards)/len(episodeRewards) 109 | #print("episode average ", avg) 110 | for j in range(len(episodeRewards)): 111 | allRewards.append(episodeRewards[j]) 112 | #allRewards = allRewards + episodeRewards 113 | break 114 | epAvg = sum(episodeRewards) / len(episodeRewards) 115 | print("generationID",indiv.generationID,"IndivID",indiv.indivID,"episodeRewards rewards ",epAvg) 116 | 117 | avg = sum(allRewards) / len(allRewards) 118 | indiv.lifeScore = avg 119 | #indiv.lifeScore = np.random.rand(1)[0]*50 120 | print("indivID - ",indiv.indivID,"lifeScore =",indiv.lifeScore) 121 | 122 | 123 | def test_all_individuals(num_test_episodes): 124 | for i in range(len(all_individuals)): 125 | test_individual(all_individuals[i],NUM_TEST_EPISODES) 126 | 127 | 128 | def select_top_individuals(num_selected,population_size): 129 | scores = np.zeros(population_size) 130 | for i in range(np.alen(scores)): 131 | scores[i] = all_individuals[i].lifeScore 132 | 133 | print( scores ) 134 | topScores = scores[ scores.argsort()[-num_selected:][::-1] ] 135 | #print ("Top Scores ", topScores) 136 | selected_individuals = [] 137 | for i in range(len(all_individuals)): 138 | if all_individuals[i].lifeScore >= topScores.min(): 139 | #print("Selecting individual",i," with score ", all_individuals[i].lifeScore,"cuttoff ", topScores.min()) 140 | selected_individuals.append(all_individuals[i]) 141 | 142 | for i in range (len(selected_individuals)): 143 | print(selected_individuals[i].printme()) 144 | 145 | return selected_individuals 146 | 147 | # --- Parameter Noising 148 | def add_noise_simple(mu,noiseSigma, largeNoise=False): 149 | x = np.random.rand(1) - 0.5 #probability of doing x 150 | if np.random.rand(1) < 
MUTATION_PROB: 151 | #print("mutating") 152 | if not largeNoise: 153 | x = x*noiseSigma 154 | else: 155 | x = x*noiseSigma #Sigma = width of the standard deviaion 156 | else: 157 | x = 0 158 | #print ("x/200",x,"big_sigma",big_sigma) 159 | return mu + x 160 | 161 | 162 | add_noise_simple = np.vectorize(add_noise_simple,otypes=[np.float]) 163 | 164 | 165 | def add_noise_to_model(targetModel,noiseSigma=NOISE_SIGMA,largeNoise = True): 166 | 167 | sz = len(targetModel.layers) 168 | #if largeNoise: 169 | # print("Setting Large Noise!") 170 | for k in range(sz): 171 | w = targetModel.layers[k].get_weights() 172 | if np.alen(w) >0 : 173 | #print("k==>",k) 174 | w[0] = add_noise_simple(w[0],noiseSigma,largeNoise) 175 | 176 | targetModel.layers[k].set_weights(w) 177 | return targetModel 178 | 179 | def add_mutations(individuals,noiseSigma=NOISE_SIGMA): 180 | for i in range (len(individuals)): 181 | individuals[i].network = add_noise_to_model(individuals[i].network,noiseSigma,True) 182 | 183 | 184 | def populate_next_generation(generationID,top_individuals,population_size, network_width, observation_space, action_space,total_population_counter): 185 | newPop = top_individuals 186 | for i in range( population_size - len(top_individuals)): 187 | newModel = create_model(network_width, observation_space, action_space) 188 | model1 = top_individuals[0].network 189 | model2 = top_individuals[1].network 190 | sz = len(newModel.layers) 191 | #if largeNoise: 192 | # print("Setting Large Noise!") 193 | for k in range(sz): 194 | w = newModel.layers[k].get_weights() 195 | w1 = model1.layers[k].get_weights() 196 | w2 = model2.layers[k].get_weights() 197 | 198 | if np.alen(w) >0 : 199 | #print("k==>",k) 200 | #w[0][0] = combine_weights(w[0][0],w1[0][0],w2[0][0]) 201 | for j in range(np.alen(w[0])): 202 | y=w[0][j] 203 | y1 = w1[0][j] 204 | y2 = w2[0][j] 205 | for l in range (np.alen(y)): 206 | z=y[l] 207 | z1=y1[l] 208 | z2=y2[l] 209 | if np.random.rand(1)>0.5: 210 | z=z1+0.0 211 | else: 212 | z=z2+0.0 213 | y[l]=z 214 | w[0][j]=y 215 | 216 | newModel.layers[k].set_weights(w) 217 | top_individuals.append( Individual(generationID,total_population_counter,newModel) ) 218 | total_population_counter+=1 219 | return top_individuals,total_population_counter 220 | 221 | 222 | 223 | 224 | ''' ------------------''' 225 | 226 | all_individuals,total_population_counter = initialize_population(population_size=POPULATION_SIZE, 227 | network_width=NETWORK_WIDTH, 228 | observation_space=OBSERVATION_SPACE, 229 | action_space=ACTION_SPACE, 230 | environment_name=ENVIRONMENT_NAME, 231 | total_population_counter=total_population_counter) 232 | 233 | 234 | for gens in range (MAX_GENERATIONS): 235 | test_all_individuals(NUM_TEST_EPISODES) 236 | top_individuals = select_top_individuals(NUM_SELECTED_FOR_REPRODUCTION,POPULATION_SIZE) 237 | generations_count += 1 238 | print("Generating next Gen ",generations_count) 239 | all_individuals,total_population_counter = populate_next_generation(generations_count,top_individuals, 240 | POPULATION_SIZE,NETWORK_WIDTH, 241 | OBSERVATION_SPACE, 242 | ACTION_SPACE, 243 | total_population_counter) 244 | print("@@@@ Adding Noise @@@@") 245 | add_mutations(all_individuals) 246 | 247 | 248 | 249 | #for i in range (len(all_individuals)): 250 | # all_individuals[i].printNetwork() 251 | -------------------------------------------------------------------------------- /NeuroEvolution/Lib/Individual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 
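# Lib/Individual.py: container classes holding one candidate policy network together with
# its generation ID, individual ID and fitness score (lifeScore) used by the neuro-evolution loop.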
import keras 3 | from keras.layers.advanced_activations import LeakyReLU, PReLU 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Dropout 6 | 7 | class Individual: 8 | 9 | def __init__(self, generationID,indivID, network): 10 | self.generationID = generationID 11 | self.indivID = indivID 12 | self.network = network 13 | #self.mutationSigma = mutationSigma 14 | self.lifeScore = -10000 15 | 16 | def printme(self): 17 | return "Generation %2d Individual %4d life score %4.3f network %s"%(self.generationID,+self.indivID,self.lifeScore,self.network) 18 | #print("say what?",self.network) 19 | 20 | def printNetwork(self): 21 | print("--- ID",self.indivID,"lifeScore ",self.lifeScore) 22 | sz = len(self.network.layers) 23 | #if largeNoise: 24 | # print("Setting Large Noise!") 25 | for k in range(sz): 26 | w = self.network.layers[k].get_weights() 27 | if np.alen(w) >0 : 28 | print("k==>",k) 29 | print("w[0]",w[0]) 30 | #print("w[1]",w[1]) 31 | #print("w[3]",w[3]) 32 | 33 | 34 | class IndividualTF: 35 | def __init__(self, generationID,indivID, apw_h,apw_h2, 36 | apw_h3,apw_o, appy_x): 37 | self.generationID = generationID 38 | self.indivID = indivID 39 | #self.network = network 40 | self.apw_h = apw_h 41 | self.apw_h2 = apw_h2 42 | self.apw_h3 = apw_h3 43 | self.apw_o = apw_o 44 | self.appy_x = appy_x 45 | 46 | #self.mutationSigma = mutationSigma 47 | self.lifeScore = -10000 48 | 49 | def printme(self): 50 | return "Generation %2d Individual %4d life score %4.3f network "%(self.generationID,+self.indivID,self.lifeScore) 51 | #print("say what?",self.network) 52 | 53 | def printNetwork(self): 54 | print("--- ID",self.indivID,"lifeScore ",self.lifeScore) 55 | print("apw_h ", self.apw_h) 56 | print("apw_o ", self.apw_o) 57 | -------------------------------------------------------------------------------- /NeuroEvolution/old/Main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | import roboschool 5 | 6 | from keras.layers.advanced_activations import LeakyReLU, PReLU 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Dropout 9 | from keras import optimizers 10 | 11 | from Lib.Individual import Individual 12 | 13 | ENVIRONMENT_NAME = "RoboschoolHopper-v1" 14 | OBSERVATION_SPACE = 15 15 | ACTION_SPACE = 3 16 | 17 | B_DISCOUNT = 0.98 18 | 19 | POPULATION_SIZE = 10 20 | NETWORK_WIDTH = 512 21 | NUM_TEST_EPISODES = 3 22 | NUM_SELECTED_FOR_REPRODUCTION = 2 23 | NOISE_SIGMA = 0.06 24 | 25 | MAX_GENERATIONS = 20000 26 | 27 | CLIP_ACTIONS = True 28 | MAX_STEPS = 996 29 | 30 | all_individuals = [] 31 | generations_count = 0 32 | total_population_counter = 0 33 | 34 | 35 | 36 | 37 | 38 | '''---------ENVIRONMENT INITIALIZATION--------''' 39 | 40 | env = gym.make(ENVIRONMENT_NAME) 41 | #env.render(mode="human") 42 | env.reset() 43 | 44 | print("-- Observations",env.observation_space) 45 | print("-- actionspace",env.action_space) 46 | 47 | 48 | #initialize training matrix with random states and actions 49 | apdataX = np.random.random(( 5,OBSERVATION_SPACE )) 50 | apdataY = np.random.random((5,ACTION_SPACE)) 51 | 52 | 53 | '''---------------------''' 54 | 55 | def GetRememberedOptimalPolicy(targetModel,qstate): 56 | predX = np.zeros(shape=(1,OBSERVATION_SPACE)) 57 | predX[0] = qstate 58 | 59 | #print("trying to predict reward at qs_a", predX[0]) 60 | pred = targetModel.predict(predX[0].reshape(1,predX.shape[1])) 61 | r_remembered_optimal_policy = pred[0] 62 | return 
r_remembered_optimal_policy 63 | 64 | 65 | def create_model(network_width, observation_space, action_space): 66 | action_predictor_model = Sequential() 67 | action_predictor_model.add(Dense(network_width, activation='relu', input_dim=observation_space)) 68 | action_predictor_model.add(Dense(action_space)) 69 | return action_predictor_model 70 | 71 | def initialize_population(population_size,network_width, observation_space, action_space, environment_name,total_population_counter): 72 | initial_population = [] 73 | for i in range (population_size): 74 | action_predictor_model = create_model(network_width, observation_space, action_space) 75 | indiv = Individual(generationID=0, indivID=total_population_counter , network = action_predictor_model) 76 | total_population_counter += 1 77 | initial_population.append(indiv) 78 | return initial_population, total_population_counter 79 | 80 | def test_individual(indiv,num_test_episodes): 81 | indiv.lifeScore = 0 82 | allRewards = [] 83 | for i in range(num_test_episodes): 84 | episodeRewards = [] 85 | #print("episode "+str(i)+" performing test for indiv ",indiv.printme()) 86 | qs = env.reset() 87 | for step in range (5000): 88 | a = GetRememberedOptimalPolicy(indiv.network, qs) 89 | if CLIP_ACTIONS: 90 | for i in range (np.alen(a)): 91 | if a[i] < -1: a[i]=-0.99999999999 92 | if a[i] > 1: a[i] = 0.99999999999 93 | qs,r,done,info = env.step(a) 94 | episodeRewards.append(r) 95 | #indiv.lifeScore += r 96 | env.render() 97 | if step > MAX_STEPS: 98 | done = True 99 | if done: 100 | episodeRewards.reverse() 101 | for j in range(len(episodeRewards)): 102 | #if j ==0: 103 | # print("last reward ",episodeRewards[j]) 104 | if j > 0: 105 | episodeRewards[j] = episodeRewards[j] + B_DISCOUNT * episodeRewards[j-1] 106 | #avg = sum(episodeRewards)/len(episodeRewards) 107 | #print("episode average ", avg) 108 | for j in range(len(episodeRewards)): 109 | allRewards.append(episodeRewards[j]) 110 | #allRewards = allRewards + episodeRewards 111 | break 112 | epAvg = sum(episodeRewards) / len(episodeRewards) 113 | print("generationID",indiv.generationID,"IndivID",indiv.indivID,"episodeRewards rewards ",epAvg) 114 | 115 | avg = sum(allRewards) / len(allRewards) 116 | indiv.lifeScore = avg 117 | #indiv.lifeScore = np.random.rand(1)[0]*50 118 | print("indivID - ",indiv.indivID,"lifeScore =",indiv.lifeScore) 119 | 120 | 121 | def test_all_individuals(num_test_episodes): 122 | for i in range(len(all_individuals)): 123 | test_individual(all_individuals[i],NUM_TEST_EPISODES) 124 | 125 | 126 | def select_top_individuals(num_selected,population_size): 127 | scores = np.zeros(population_size) 128 | for i in range(np.alen(scores)): 129 | scores[i] = all_individuals[i].lifeScore 130 | 131 | print( scores ) 132 | topScores = scores[ scores.argsort()[-num_selected:][::-1] ] 133 | #print ("Top Scores ", topScores) 134 | selected_individuals = [] 135 | for i in range(len(all_individuals)): 136 | if all_individuals[i].lifeScore >= topScores.min(): 137 | #print("Selecting individual",i," with score ", all_individuals[i].lifeScore,"cuttoff ", topScores.min()) 138 | selected_individuals.append(all_individuals[i]) 139 | 140 | for i in range (len(selected_individuals)): 141 | print(selected_individuals[i].printme()) 142 | 143 | return selected_individuals 144 | 145 | # --- Parameter Noising 146 | def add_noise_simple(mu,noiseSigma, largeNoise=False): 147 | x = np.random.rand(1) - 0.5 #probability of doing x 148 | if not largeNoise: 149 | x = x*noiseSigma 150 | else: 151 | x = x*noiseSigma 
#Sigma = width of the standard deviaion 152 | #print ("x/200",x,"big_sigma",big_sigma) 153 | return mu + x 154 | 155 | 156 | add_noise_simple = np.vectorize(add_noise_simple,otypes=[np.float]) 157 | 158 | 159 | def add_noise_to_model(targetModel,noiseSigma=NOISE_SIGMA,largeNoise = True): 160 | 161 | sz = len(targetModel.layers) 162 | #if largeNoise: 163 | # print("Setting Large Noise!") 164 | for k in range(sz): 165 | w = targetModel.layers[k].get_weights() 166 | if np.alen(w) >0 : 167 | #print("k==>",k) 168 | w[0] = add_noise_simple(w[0],noiseSigma,largeNoise) 169 | 170 | targetModel.layers[k].set_weights(w) 171 | return targetModel 172 | 173 | def add_mutations(individuals,noiseSigma=NOISE_SIGMA): 174 | for i in range (len(individuals)): 175 | individuals[i].network = add_noise_to_model(individuals[i].network,noiseSigma,True) 176 | 177 | 178 | def populate_next_generation(generationID,top_individuals,population_size, network_width, observation_space, action_space,total_population_counter): 179 | newPop = top_individuals 180 | for i in range( population_size - len(top_individuals)): 181 | newModel = create_model(network_width, observation_space, action_space) 182 | model1 = top_individuals[0].network 183 | model2 = top_individuals[1].network 184 | sz = len(newModel.layers) 185 | #if largeNoise: 186 | # print("Setting Large Noise!") 187 | for k in range(sz): 188 | w = newModel.layers[k].get_weights() 189 | w1 = model1.layers[k].get_weights() 190 | w2 = model2.layers[k].get_weights() 191 | 192 | if np.alen(w) >0 : 193 | #print("k==>",k) 194 | #w[0][0] = combine_weights(w[0][0],w1[0][0],w2[0][0]) 195 | for j in range(np.alen(w[0])): 196 | y=w[0][j] 197 | y1 = w1[0][j] 198 | y2 = w2[0][j] 199 | for l in range (np.alen(y)): 200 | z=y[l] 201 | z1=y1[l] 202 | z2=y2[l] 203 | if np.random.rand(1)>0.5: 204 | z=z1+0.0 205 | else: 206 | z=z2+0.0 207 | y[l]=z 208 | w[0][j]=y 209 | 210 | newModel.layers[k].set_weights(w) 211 | top_individuals.append( Individual(generationID,total_population_counter,newModel) ) 212 | total_population_counter+=1 213 | return top_individuals,total_population_counter 214 | 215 | 216 | 217 | 218 | ''' ------------------''' 219 | 220 | all_individuals,total_population_counter = initialize_population(population_size=POPULATION_SIZE, 221 | network_width=NETWORK_WIDTH, 222 | observation_space=OBSERVATION_SPACE, 223 | action_space=ACTION_SPACE, 224 | environment_name=ENVIRONMENT_NAME, 225 | total_population_counter=total_population_counter) 226 | 227 | 228 | for gens in range (MAX_GENERATIONS): 229 | test_all_individuals(NUM_TEST_EPISODES) 230 | top_individuals = select_top_individuals(NUM_SELECTED_FOR_REPRODUCTION,POPULATION_SIZE) 231 | generations_count += 1 232 | print("Generating next Gen ",generations_count) 233 | all_individuals,total_population_counter = populate_next_generation(generations_count,top_individuals, 234 | POPULATION_SIZE,NETWORK_WIDTH, 235 | OBSERVATION_SPACE, 236 | ACTION_SPACE, 237 | total_population_counter) 238 | print("@@@@ Adding Noise @@@@") 239 | add_mutations(all_individuals) 240 | 241 | 242 | 243 | #for i in range (len(all_individuals)): 244 | # all_individuals[i].printNetwork() 245 | -------------------------------------------------------------------------------- /NeuroEvolution/old/Main2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | import roboschool 5 | 6 | from keras.layers.advanced_activations import LeakyReLU, PReLU 7 | from keras.models import 
Sequential 8 | from keras.layers import Dense, Dropout 9 | from keras import optimizers 10 | 11 | from Lib.Individual import Individual 12 | ''' 13 | ENVIRONMENT_NAME = "RoboschoolAnt-v1" 14 | OBSERVATION_SPACE = 28 15 | ACTION_SPACE = 8 16 | ''' 17 | ENVIRONMENT_NAME = "RoboschoolHopper-v1" 18 | OBSERVATION_SPACE = 15 19 | ACTION_SPACE = 3 20 | 21 | B_DISCOUNT = 0.98 22 | 23 | POPULATION_SIZE = 15 24 | NETWORK_WIDTH = 32 25 | NETWORK_HIDDEN_LAYERS = 1 26 | NUM_TEST_EPISODES = 1 27 | NUM_SELECTED_FOR_REPRODUCTION = 2 28 | NOISE_SIGMA = 0.01 29 | MUTATION_PROB = 0.05 30 | 31 | MAX_GENERATIONS = 200000 32 | 33 | USE_GAUSSIAN_NOISE = False 34 | HAS_EARLY_TERMINATION_REWARD = False 35 | EARLY_TERMINATION_REWARD = -2 36 | CLIP_ACTIONS = False 37 | MAX_STEPS = 650 38 | 39 | all_individuals = [] 40 | generations_count = 0 41 | total_population_counter = 0 42 | #numLandings = 0 43 | 44 | 45 | 46 | 47 | 48 | '''---------ENVIRONMENT INITIALIZATION--------''' 49 | 50 | env = gym.make(ENVIRONMENT_NAME) 51 | #env.render(mode="human") 52 | env.reset() 53 | 54 | print("-- Observations",env.observation_space) 55 | print("-- actionspace",env.action_space) 56 | 57 | 58 | #initialize training matrix with random states and actions 59 | apdataX = np.random.random(( 5,OBSERVATION_SPACE )) 60 | apdataY = np.random.random((5,ACTION_SPACE)) 61 | 62 | 63 | '''---------------------''' 64 | 65 | def GetRememberedOptimalPolicy(targetModel,qstate): 66 | predX = np.zeros(shape=(1,OBSERVATION_SPACE)) 67 | predX[0] = qstate 68 | 69 | #print("trying to predict reward at qs_a", predX[0]) 70 | pred = targetModel.predict(predX[0].reshape(1,predX.shape[1])) 71 | r_remembered_optimal_policy = pred[0] 72 | return r_remembered_optimal_policy 73 | 74 | 75 | def create_model(network_width, network_hidden_layers, observation_space, action_space): 76 | action_predictor_model = Sequential() 77 | action_predictor_model.add(Dense(network_width, activation='relu', input_dim=observation_space)) 78 | for i in range(network_hidden_layers): 79 | action_predictor_model.add(Dense(network_width, activation='relu')) 80 | 81 | action_predictor_model.add(Dense(action_space)) 82 | return action_predictor_model 83 | 84 | def initialize_population(population_size,network_width,network_hidden_layers, observation_space, action_space, environment_name,total_population_counter): 85 | initial_population = [] 86 | for i in range (population_size): 87 | action_predictor_model = create_model(network_width,network_hidden_layers, observation_space, action_space) 88 | indiv = Individual(generationID=0, indivID=total_population_counter , network = action_predictor_model) 89 | total_population_counter += 1 90 | initial_population.append(indiv) 91 | return initial_population, total_population_counter 92 | 93 | def test_individual(indiv,num_test_episodes): 94 | indiv.lifeScore = 0 95 | allRewards = [] 96 | for i in range(num_test_episodes): 97 | episodeRewards = [] 98 | #print("episode "+str(i)+" performing test for indiv ",indiv.printme()) 99 | qs = env.reset() 100 | for step in range (5000): 101 | a = GetRememberedOptimalPolicy(indiv.network, qs) 102 | if CLIP_ACTIONS: 103 | for i in range (np.alen(a)): 104 | if a[i] < -1: a[i]=-0.99999999999 105 | if a[i] > 1: a[i] = 0.99999999999 106 | qs,r,done,info = env.step(a) 107 | if HAS_EARLY_TERMINATION_REWARD and done and step MAX_STEPS: 114 | done = True 115 | if done: 116 | episodeRewards.reverse() 117 | for j in range(len(episodeRewards)): 118 | #if j ==0: 119 | # print("last reward ",episodeRewards[j]) 120 | if j > 
0: 121 | episodeRewards[j] = episodeRewards[j] + B_DISCOUNT * episodeRewards[j-1] 122 | #avg = sum(episodeRewards)/len(episodeRewards) 123 | #print("episode average ", avg) 124 | for j in range(len(episodeRewards)): 125 | allRewards.append(episodeRewards[j]) 126 | #allRewards = allRewards + episodeRewards 127 | epAvg = sum(episodeRewards) / len(episodeRewards) 128 | allRewards.append(epAvg) 129 | #if epAvg >0: 130 | # numLandings = numLandings+1 131 | 132 | break 133 | #print("generationID",indiv.generationID,"IndivID",indiv.indivID,"episodeRewards rewards ",epAvg) 134 | 135 | avg = sum(allRewards) / len(allRewards) 136 | indiv.lifeScore = avg 137 | #indiv.lifeScore = np.random.rand(1)[0]*50 138 | print("generationID",indiv.generationID,"indivID - ",indiv.indivID,"numLandings ",0,"lifeScore =",indiv.lifeScore) 139 | 140 | 141 | def test_all_individuals(num_test_episodes): 142 | for i in range(len(all_individuals)): 143 | test_individual(all_individuals[i],NUM_TEST_EPISODES) 144 | 145 | 146 | def select_top_individuals(num_selected,population_size): 147 | scores = np.zeros(population_size) 148 | for i in range(np.alen(scores)): 149 | scores[i] = all_individuals[i].lifeScore 150 | 151 | print( scores ) 152 | topScores = scores[ scores.argsort()[-num_selected:][::-1] ] 153 | #print ("Top Scores ", topScores) 154 | selected_individuals = [] 155 | for i in range(len(all_individuals)): 156 | if all_individuals[i].lifeScore >= topScores.min(): 157 | #print("Selecting individual",i," with score ", all_individuals[i].lifeScore,"cuttoff ", topScores.min()) 158 | selected_individuals.append(all_individuals[i]) 159 | 160 | 161 | print("Selected individuals ") 162 | for i in range (len(selected_individuals)): 163 | print(selected_individuals[i].printme()) 164 | 165 | return selected_individuals 166 | 167 | # --- Parameter Noising 168 | 169 | def add_noise(mu,noiseSigma, largeNoise=False): 170 | 171 | if largeNoise: 172 | sig = noiseSigma 173 | else: 174 | #print("Adding Large parameter noise") 175 | sig = noiseSigma #Sigma = width of the standard deviaion 176 | #mu = means 177 | x = np.random.rand(1) #probability of doing x 178 | #print ("x prob ",x) 179 | if x >0.5: 180 | return mu + np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.))) 181 | else: 182 | return mu - np.exp(-np.power(x - mu, 2.) 
/ (2 * np.power(sig, 2.))) 183 | 184 | def add_noise_simple(mu,noiseSigma, largeNoise=False): 185 | x = np.random.rand(1) - 0.5 #probability of doing x 186 | if np.random.rand(1) < MUTATION_PROB: 187 | print("mutating") 188 | if not largeNoise: 189 | x = x*noiseSigma 190 | else: 191 | x = x*noiseSigma #Sigma = width of the standard deviaion 192 | else: 193 | x = 0 194 | #print ("x/200",x,"big_sigma",big_sigma) 195 | return mu + x 196 | 197 | 198 | add_noise_simple = np.vectorize(add_noise_simple,otypes=[np.float]) 199 | add_noise = np.vectorize(add_noise,otypes=[np.float]) 200 | 201 | def add_noise_to_model(targetModel,noiseSigma=NOISE_SIGMA,largeNoise = True): 202 | 203 | sz = len(targetModel.layers) 204 | #if largeNoise: 205 | # print("Setting Large Noise!") 206 | for k in range(sz): 207 | w = targetModel.layers[k].get_weights() 208 | if np.alen(w) >0 : 209 | #print("k==>",k) 210 | w[0] = add_noise_simple(w[0],noiseSigma,largeNoise) 211 | 212 | targetModel.layers[k].set_weights(w) 213 | return targetModel 214 | 215 | 216 | ''' MUTATIONS ''' 217 | def add_mutations(individuals,noiseSigma=NOISE_SIGMA): 218 | for i in range (len(individuals)): 219 | if i >2 and i%5==0: 220 | individuals[i].network = add_noise_to_model(individuals[i].network,noiseSigma*2,True) 221 | 222 | 223 | def populate_next_generation(generationID,top_individuals,population_size, network_width,network_hidden_layers, observation_space, action_space,total_population_counter): 224 | newPop = top_individuals 225 | for i in range( population_size - len(top_individuals)): 226 | newModel = create_model(network_width, network_hidden_layers, observation_space, action_space) 227 | model1 = top_individuals[0].network 228 | model2 = top_individuals[1].network 229 | sz = len(newModel.layers) 230 | #if largeNoise: 231 | # print("Setting Large Noise!") 232 | for k in range(sz): 233 | w = newModel.layers[k].get_weights() 234 | w1 = model1.layers[k].get_weights() 235 | w2 = model2.layers[k].get_weights() 236 | 237 | if np.alen(w) >0 : 238 | #print("k==>",k) 239 | #w[0][0] = combine_weights(w[0][0],w1[0][0],w2[0][0]) 240 | for j in range(np.alen(w[0])): 241 | y=w[0][j] 242 | y1 = w1[0][j] 243 | y2 = w2[0][j] 244 | for l in range (np.alen(y)): 245 | z=y[l] 246 | z1=y1[l] 247 | z2=y2[l] 248 | if np.random.rand(1)>0.5: 249 | z=z1+0.0 250 | else: 251 | z=z2+0.0 252 | y[l]=z 253 | w[0][j]=y 254 | 255 | newModel.layers[k].set_weights(w) 256 | top_individuals.append( Individual(generationID,total_population_counter,newModel) ) 257 | total_population_counter+=1 258 | return top_individuals,total_population_counter 259 | 260 | 261 | 262 | 263 | ''' ------------------''' 264 | 265 | all_individuals,total_population_counter = initialize_population(population_size=POPULATION_SIZE, 266 | network_width=NETWORK_WIDTH, 267 | network_hidden_layers = NETWORK_HIDDEN_LAYERS, 268 | observation_space=OBSERVATION_SPACE, 269 | action_space=ACTION_SPACE, 270 | environment_name=ENVIRONMENT_NAME, 271 | total_population_counter=total_population_counter) 272 | 273 | 274 | for gens in range (MAX_GENERATIONS): 275 | test_all_individuals(NUM_TEST_EPISODES) 276 | top_individuals = select_top_individuals(NUM_SELECTED_FOR_REPRODUCTION,POPULATION_SIZE) 277 | generations_count += 1 278 | print("Generating next Gen ",generations_count) 279 | all_individuals,total_population_counter = populate_next_generation(generations_count,top_individuals, 280 | POPULATION_SIZE,NETWORK_WIDTH, NETWORK_HIDDEN_LAYERS, 281 | OBSERVATION_SPACE, 282 | ACTION_SPACE, 283 | 
total_population_counter) 284 | #print("@@@@ Adding Noise @@@@") 285 | add_mutations(all_individuals) 286 | 287 | 288 | 289 | #for i in range (len(all_individuals)): 290 | # all_individuals[i].printNetwork() 291 | -------------------------------------------------------------------------------- /OptimalPolicyTreeSearch/Cartpole_OPTS.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole solution by Michel Aka the Author of the Fit Machine Learning Blog 3 | 4 | Demo here 5 | https://www.youtube.com/watch?v=TguWjWvRp8c 6 | ''' 7 | import numpy as np 8 | import keras 9 | import gym 10 | import os 11 | import h5py 12 | 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout 15 | from keras.layers import Embedding 16 | from keras.layers import LSTM 17 | from keras import optimizers 18 | 19 | 20 | num_env_variables = 4 21 | num_env_actions = 1 22 | num_training_exmaples = 100 23 | timesteps = 1 24 | num_initial_observation = 4000 25 | 26 | #recommend 850 for tensorflow backend , but 500 should be enough for theano backend 27 | training_epochs = 850 28 | num_anticipation_steps = 6 29 | load_previous_weights = True 30 | observe_and_train = False 31 | save_weights = False 32 | 33 | 34 | #Create testing enviroment 35 | env = gym.make('CartPole-v0') 36 | env.reset() 37 | 38 | #initialize training matrix with random states and actions 39 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 40 | #initize training matrix corresponding expected states and expected rewards (random) 41 | dataY = np.random.random((num_training_exmaples,num_env_variables+1)) 42 | 43 | 44 | 45 | #nitialize the LSTM with random weights 46 | 47 | model = Sequential() 48 | model.add(LSTM(16,return_sequences=True, stateful=True , batch_size=1, input_shape=(timesteps, dataX.shape[1]))) 49 | model.add(LSTM(16, return_sequences=True)) 50 | model.add(Dense(16, activation='relu')) 51 | model.add(Dense(dataY.shape[1])) 52 | 53 | opt = optimizers.adam(lr=0.01) 54 | 55 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 56 | 57 | #load previous model weights if they exist 58 | if load_previous_weights: 59 | dir_path = os.path.realpath(".") 60 | fn = dir_path + "/CP-weights.h5" 61 | print("filepath ", fn) 62 | if os.path.isfile(fn): 63 | print("loading weights") 64 | model.load_weights("CP-weights.h5") 65 | else: 66 | print("File CP-weights.h5 does not exis. Retraining... 
") 67 | 68 | #Record first 500 in a sequence and add them to the training sequence 69 | total_steps = 0 70 | dataX = np.zeros(shape=(1,5)) 71 | dataY = np.zeros(shape=(1,5)) 72 | 73 | print("dataX shape", dataX.shape) 74 | print("dataY shape", dataY.shape) 75 | 76 | if observe_and_train: 77 | #observe for 100 games 78 | for game in range(100): 79 | 80 | if total_steps >= num_initial_observation: 81 | break 82 | #Get the Q state 83 | qs = env.reset() 84 | for step in range (200): 85 | a=0 86 | if np.random.rand(1) < 0.5: 87 | a=0 88 | else: 89 | a=1 90 | env.render() 91 | qs_a = np.concatenate((qs,np.array([a])), axis=0) 92 | 93 | #get the target state and reward 94 | s,r,done,info = env.step(a) 95 | 96 | #set reward in case of failure 97 | if done: 98 | r = -1 99 | 100 | #concatenate target state and reward 101 | s_r = np.concatenate((s,np.array([r])), axis=0) 102 | 103 | if done: 104 | #print negative reward array 105 | print("Negative reward s_r: ", s_r) 106 | 107 | #print("reward = ", r) 108 | #print("target state", s) 109 | #print("concatenate(s,r)", s_r) 110 | 111 | 112 | #record only the first x number of states 113 | if total_steps ==0: 114 | dataX[0] = qs_a 115 | dataY[0] = s_r 116 | 117 | if total_steps < (num_initial_observation-1): 118 | dataX = np.vstack((dataX,qs_a)) 119 | dataY = np.vstack((dataY,s_r)) 120 | 121 | #Update the states 122 | qs=s 123 | 124 | 125 | total_steps += 1 126 | if done : 127 | break 128 | 129 | print("Observation complete. - Begin LSTM training") 130 | 131 | print("dataX shape", dataX.shape) 132 | print(dataX[0:5]) 133 | print("dataY shape", dataY.shape) 134 | print(dataY[0:5]) 135 | 136 | feedX = np.reshape(dataX, (dataX.shape[0], 1, dataX.shape[1] )) 137 | feedY = np.reshape(dataY, (dataY.shape[0], 1, dataY.shape[1] )) 138 | 139 | 140 | #The more epochs you train the model, the better is becomes at predicting future states 141 | #This in turn will improve the results of the Bellman equation and thus will lead us to 142 | # better decisions in our MDP process 143 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 144 | 145 | print("total_steps ", total_steps) 146 | print("dataX ", dataX[0:10]) 147 | print("dataY ", dataY[0:10]) 148 | #print("dataY ", dataY) 149 | 150 | dataX = np.random.random((1,5)) 151 | 152 | res = model.predict(dataX[0].reshape(1,1,dataX.shape[1])) 153 | nstate = res[0][0][:-1] 154 | 155 | print("predicted output ", res) 156 | print("expected reward ", res[0][0][4]) 157 | print("expected state ", nstate) 158 | 159 | def estimateReward(qstate,action, depth): 160 | if depth <= 0: 161 | return 0 162 | #calculate/estimate reward at this state and get the next state 163 | qs_a = np.concatenate((qstate,np.array([action])), axis=0) 164 | predX = np.zeros(shape=(1,5)) 165 | predX[0] = qs_a 166 | pred = model.predict(predX[0].reshape(1,1,predX.shape[1])) 167 | reward = pred[0][0][4] 168 | expected_state = pred[0][0][:-1] 169 | 170 | ''' 171 | print("depth -- ", depth) 172 | print("qstate", qstate) 173 | print("action", action) 174 | print("pred", pred) 175 | print("expected_state", expected_state) 176 | print("reward", reward) 177 | ''' 178 | # Bellman -- reward at this state = reward + Sum of discounted expected rewards for all actions (recursively) 179 | #recursively calculate the reward at future states for all possible actions 180 | discounted_future_rewards = 0.95*estimateReward(expected_state,0,depth-1)+ 0.95*estimateReward(expected_state,1,depth-1) 181 | 182 | #print("discounted_future_rewards", 
discounted_future_rewards) 183 | #add current state and discounted future state reward 184 | return reward + discounted_future_rewards 185 | 186 | 187 | print("** Estimating reward for dataX[0] with action 1 usint Bellman", estimateReward(dataX[0][:-1],1,2)) 188 | print("** Estimating reward for dataX[0] with action 0 usint Bellman", estimateReward(dataX[0][:-1],0,2)) 189 | 190 | 191 | 192 | ##### 193 | ##### 194 | #Play the game for X rounds using the Bellman with LSTM anticipation model 195 | 196 | 197 | for game in range(3): 198 | total_steps =0 199 | #Get the Q state 200 | qs = env.reset() 201 | for step in range (300): 202 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 203 | #works best with looking 6 steps ahead 204 | #Also works best if you train the model more itterations 205 | estimated_anticipated_reward_a = estimateReward(qs,1,num_anticipation_steps) 206 | estimated_anticipated_reward_b = estimateReward(qs,0,num_anticipation_steps) 207 | #print(" estimated rewards a and b", estimated_anticipated_reward_a, estimated_anticipated_reward_b) 208 | 209 | #chose argmax action of estimated anticipated rewards 210 | if estimated_anticipated_reward_a > estimated_anticipated_reward_b: 211 | a = 1 212 | else: 213 | a = 0 214 | 215 | env.render() 216 | 217 | 218 | #get the target state and reward 219 | s,r,done,info = env.step(a) 220 | 221 | 222 | 223 | qs=s 224 | #set reward in case of failure 225 | if done: 226 | r = -1 227 | if total_steps >= 198: 228 | print("*** Game Won after ", total_steps, " steps") 229 | else: 230 | print("** failed after ", total_steps, " steps") 231 | 232 | 233 | total_steps += 1 234 | if done : 235 | break 236 | 237 | if save_weights: 238 | #Save model 239 | print("Saving weights") 240 | model.save_weights("CP-weights.h5") 241 | -------------------------------------------------------------------------------- /ParameterNoising/NoisingFunction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | 5 | import pygal 6 | import os 7 | import h5py 8 | import matplotlib.pyplot as plt 9 | import math 10 | 11 | from keras.layers.advanced_activations import LeakyReLU, PReLU 12 | from keras.models import Sequential 13 | from keras.layers import Dense, Dropout 14 | from keras.layers import Embedding 15 | from keras import optimizers 16 | 17 | 18 | #nitialize the Reward predictor model 19 | Qmodel = Sequential() 20 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 21 | Qmodel.add(Dense(32, activation='relu', input_dim=1)) 22 | Qmodel.add(Dropout(0.5)) 23 | Qmodel.add(Dense(2, activation='relu')) 24 | #Qmodel.add(Dropout(0.5)) 25 | #Qmodel.add(Dense(256, activation='tanh')) 26 | #Qmodel.add(Dropout(0.5)) 27 | #Qmodel.add(Dense(256, activation='relu')) 28 | #Qmodel.add(Dropout(0.5)) 29 | #Qmodel.add(Dense(512, activation='relu')) 30 | #Qmodel.add(Dropout(0.2)) 31 | #Qmodel.add(Dense(256, activation='relu')) 32 | #Qmodel.add(Dropout(0.2)) 33 | 34 | Qmodel.add(Dense(1)) 35 | #opt = optimizers.adam(lr=learning_rate) 36 | opt = optimizers.RMSprop() 37 | Qmodel.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 38 | 39 | 40 | def add_noise(mu): 41 | sig = 0.15 #Sigma = width of the standard deviaion 42 | #mu = means 43 | x = np.random.rand(1) #probability of doing x 44 | return mu + np.exp(-np.power(x - mu, 2.) 
/ (2 * np.power(sig, 2.))) 45 | 46 | add_noise = np.vectorize(add_noise,otypes=[np.float]) 47 | 48 | def add_noise_to_model(model_to_scramble): 49 | sz = len(model_to_scramble.layers) 50 | for k in range(sz): 51 | w = model_to_scramble.layers[k].get_weights() 52 | print("w ==>",w) 53 | if np.alen(w) >0: 54 | w[0] = add_noise(w[0]) 55 | print("w / noise ==>",w) 56 | model_to_scramble.layers[k].set_weights(w ) 57 | 58 | 59 | 60 | print("end") 61 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Load_AC_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | 4 | import torch 5 | import gym 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | import numpy as np 10 | from dataclasses import dataclass 11 | from typing import Any 12 | from random import random 13 | from agent_and_model import sars,DQNAgent,CriticModel,ActorModel, ReplayBuffer 14 | 15 | 16 | 17 | 18 | 19 | 20 | def get_one_hot(action,n_dim): 21 | retval = np.zeros(n_dim) 22 | retval[action] = 1.0 23 | return retval 24 | 25 | 26 | def train_actor(actor_model, critic_model, state_transitions, num_actor_training_samples, num_actions): 27 | #for each observation get the critic to generate the Q value corresponding to each action_space 28 | #retain action observation pairs corresponding to the highest Q values 29 | #train the actor to converge towards that set 30 | 31 | #Generate random actions 32 | random_actions = [] 33 | for i in range(num_actor_training_samples): 34 | random_actions.append( np.random.rand(num_actions)*2-1 ) 35 | #Get random observations 36 | random_states = [s.state for s in state_transitions] 37 | 38 | # import ipdb; ipdb.set_trace() 39 | 40 | # for earch state add the best corresponding action to random actions 41 | for i in range(len(random_states)): 42 | with torch.no_grad(): 43 | act = actor_model(torch.Tensor(random_states[i]).to(actor_model.device)) .cpu().detach().numpy() 44 | random_actions.append(act) 45 | 46 | 47 | 48 | best_state_action = [] 49 | for i_states in range(len(random_states)): 50 | QAs = [] 51 | 52 | # get the Qvalues from the random actions 53 | for i_actions in range(len(random_actions)): 54 | with torch.no_grad(): 55 | qval = critic_model( torch.Tensor( torch.cat( (torch.Tensor(random_states[i_states]),torch.Tensor(random_actions[i_actions])),0 ) ).to(critic_model.device) ).cpu() 56 | QAs.append( qval ) 57 | # get index for best actions between all random actions and the actor's predicted actions 58 | #_sars = sars(observation,action,reward,observation_next,done,0.0) 59 | best_state_action.append(sars(random_states[i_states], random_actions[np.argmax(QAs)],0.0,None,False,np.max(QAs) )) 60 | # import ipdb;ipdb.set_trace() 61 | 62 | t_random_states = torch.stack( ([torch.Tensor(s.state) for s in best_state_action]) ).to(actor_model.device) 63 | target_actions = torch.stack( ([torch.Tensor(s.action) for s in best_state_action]) ).to(actor_model.device) 64 | actor_model.zero_grad() 65 | predicted_actions = actor_model(t_random_states) 66 | 67 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values.view(-1) ) 68 | loss = F.smooth_l1_loss(predicted_actions, target_actions ).mean() 69 | loss.backward() 70 | actor_model.opt.step() 71 | return loss 72 | 73 | def train_critic(critic_model, state_transitions, num_actions): 74 | if 
len(state_transitions) <=0: 75 | print("empty state transitions") 76 | return 77 | 78 | 79 | cur_states = torch.stack( ([torch.Tensor(torch.cat((torch.Tensor(s.state),torch.Tensor(s.action)),0)) for s in state_transitions]) ).to(critic_model.device) 80 | 81 | 82 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(critic_model.device) 83 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(critic_model.device) 84 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(critic_model.device) 85 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in state_transitions]) ).to(critic_model.device) 86 | actions = [s.action for s in state_transitions] 87 | # import ipdb; ipdb.set_trace() 88 | with torch.no_grad(): 89 | actual_Q_values = Qs 90 | # pred_qvals_next = critic_model(next_states)[0] 91 | critic_model.opt.zero_grad() 92 | pred_qvals = critic_model(cur_states) 93 | 94 | # one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 95 | # loss = torch.mean(torch.sqrt((torch.sum(pred_qvals*one_hot_actions,-1) - actual_Q_values.view(-1) )**2)).to(model.device) 96 | loss = F.smooth_l1_loss(pred_qvals.view(-1), actual_Q_values.view(-1) ) 97 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals,-1), rewards.view(-1)+0.98*mask[:,0]*pred_qvals_next.view(-1) ).mean() 98 | loss.backward() 99 | critic_model.opt.step() 100 | return loss 101 | 102 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 103 | for i in range(episode_len): 104 | # if(step_counter > buffer_size): 105 | # import ipdb; ipdb.set_trace() 106 | index = episode_len-i 107 | next_index = index+1 108 | if i==0: 109 | replay_buffer[index].qval = replay_buffer[index].reward 110 | if(step_counter%2000==0): 111 | print("i",i,"q ",replay_buffer[index].qval) 112 | else: 113 | replay_buffer[index].qval = replay_buffer[index].reward + 0.98 * replay_buffer[next_index].qval 114 | if(step_counter%2000==0): 115 | print("i",i,"q ",replay_buffer[index].qval) 116 | return replay_buffer 117 | 118 | 119 | 120 | 121 | if __name__=='__main__': 122 | DEBUGER_ON = True 123 | NUM_GAMES = 100 124 | MAX_EPISODE_STEPS = 10000 125 | TARGET_MODEL_UPDATE_INTERVAL = 50 126 | EPSILON_MIN = 0.05 127 | EPSILON_START = 0.5 128 | EPSLILON_COUNT = 6000 #Games 129 | INITIAL_RANDOM_STEPS = 5000 130 | RANDOM_GAME_EVERY = 20 131 | TRAIN_CRITIC_EVERY_N_STEP = 300 132 | CRITIC_TRAINING_SAMPLE_SIZE = 256 133 | TRAIN_ACTOR_EVERY_N_GAME = 1 134 | ACTOR_TRAINING_SAMPLE_SIZE = 8 135 | NUM_ACTOR_TRAINING_SAMPLES = 40 136 | TRAINING_ITTERATIONS = 1 137 | NUM_ACTOR_TRAINING_SAMPLES = 128 138 | PRINT_EVERY = 1 139 | RENDER_ENV = True 140 | LOAD_MODEL = True 141 | SAVE_MODEL = False 142 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 143 | MODEL_ID = "01" 144 | SAVE_MODEL_EVERY = 25 145 | 146 | epsilon = EPSILON_START 147 | env = gym.make('LunarLanderContinuous-v2') 148 | # env = gym.make('BipedalWalker-v3') 149 | 150 | observation = env.reset() 151 | print("env action space ", env.action_space.shape) 152 | am = ActorModel(env.observation_space.shape,env.action_space.shape,lr=0.008) 153 | cm = CriticModel(env.observation_space.shape,env.action_space.shape,lr=0.01) 154 | agent = DQNAgent( am , cm ) 155 | # import ipdb;ipdb.set_trace() 156 | 157 | if LOAD_MODEL: 158 | agent.actor_model = torch.load("A2C_actor"+MODEL_ID+MODEL_FILE_NAME) 159 | agent.critic_model = torch.load("A2C_critic"+MODEL_ID+MODEL_FILE_NAME) 160 | 161 | 
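    # switch the restored actor and critic to inference mode; this loader script only replays the saved policy and never trains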
agent.actor_model.eval() 162 | agent.critic_model.eval() 163 | 164 | step_counter = 0 165 | last_step_count = 0 166 | 167 | 168 | action = [] 169 | for game in range (NUM_GAMES): 170 | episode_sars = [] 171 | score = 0 172 | for step in range (MAX_EPISODE_STEPS): 173 | if RENDER_ENV: 174 | env.render() 175 | 176 | if random()<-0.1: 177 | action = env.action_space.sample() 178 | else: 179 | # import ipdb; ipdb.set_trace() 180 | action = agent.get_actions(observation).cpu().detach().numpy() 181 | # print("action ", action) 182 | observation_next, reward, done, info = env.step(action) 183 | score += reward 184 | 185 | observation = observation_next 186 | step_counter+=1 187 | last_step_count = step 188 | if done: 189 | 190 | break 191 | 192 | observation = env.reset() 193 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 194 | if (game%PRINT_EVERY==0): 195 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 196 | avg_reward = [] 197 | # print("epsilon ", epsilon) 198 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Output_noising/Readme.md: -------------------------------------------------------------------------------- 1 | Actor Critic implementation with Actor Noising. 2 | The idea behind Actor noising is as follows, during training of the actor, in addition to randomly generated actions, 3 | a set of action is also generated that correspond to the actor's predicted output where gaussian noise has been added. 4 | 5 | This idea, similar to parameter noising, adds significan performance improvement over simple actor critic method. 6 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Parameter_Noising/Load_AC_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | 4 | import torch 5 | import gym 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | import numpy as np 10 | from dataclasses import dataclass 11 | from typing import Any 12 | from random import random 13 | from agent_and_model import sars,DQNAgent,CriticModel,ActorModel, ReplayBuffer 14 | 15 | 16 | 17 | 18 | 19 | 20 | def get_one_hot(action,n_dim): 21 | retval = np.zeros(n_dim) 22 | retval[action] = 1.0 23 | return retval 24 | 25 | 26 | def train_actor(actor_model, critic_model, state_transitions, num_actor_training_samples, num_actions): 27 | #for each observation get the critic to generate the Q value corresponding to each action_space 28 | #retain action observation pairs corresponding to the highest Q values 29 | #train the actor to converge towards that set 30 | 31 | #Generate random actions 32 | random_actions = [] 33 | for i in range(num_actor_training_samples): 34 | random_actions.append( np.random.rand(num_actions)*2-1 ) 35 | #Get random observations 36 | random_states = [s.state for s in state_transitions] 37 | 38 | # import ipdb; ipdb.set_trace() 39 | 40 | # for earch state add the best corresponding action to random actions 41 | for i in range(len(random_states)): 42 | with torch.no_grad(): 43 | act = actor_model(torch.Tensor(random_states[i]).to(actor_model.device)) .cpu().detach().numpy() 44 | random_actions.append(act) 45 | 46 | 47 | 48 | best_state_action = [] 49 | for i_states in range(len(random_states)): 50 | QAs = [] 51 | 52 | # get the 
Qvalues from the random actions 53 | for i_actions in range(len(random_actions)): 54 | with torch.no_grad(): 55 | qval = critic_model( torch.Tensor( torch.cat( (torch.Tensor(random_states[i_states]),torch.Tensor(random_actions[i_actions])),0 ) ).to(critic_model.device) ).cpu() 56 | QAs.append( qval ) 57 | # get index for best actions between all random actions and the actor's predicted actions 58 | #_sars = sars(observation,action,reward,observation_next,done,0.0) 59 | best_state_action.append(sars(random_states[i_states], random_actions[np.argmax(QAs)],0.0,None,False,np.max(QAs) )) 60 | # import ipdb;ipdb.set_trace() 61 | 62 | t_random_states = torch.stack( ([torch.Tensor(s.state) for s in best_state_action]) ).to(actor_model.device) 63 | target_actions = torch.stack( ([torch.Tensor(s.action) for s in best_state_action]) ).to(actor_model.device) 64 | actor_model.zero_grad() 65 | predicted_actions = actor_model(t_random_states) 66 | 67 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values.view(-1) ) 68 | loss = F.smooth_l1_loss(predicted_actions, target_actions ).mean() 69 | loss.backward() 70 | actor_model.opt.step() 71 | return loss 72 | 73 | def train_critic(critic_model, state_transitions, num_actions): 74 | if len(state_transitions) <=0: 75 | print("empty state transitions") 76 | return 77 | 78 | 79 | cur_states = torch.stack( ([torch.Tensor(torch.cat((torch.Tensor(s.state),torch.Tensor(s.action)),0)) for s in state_transitions]) ).to(critic_model.device) 80 | 81 | 82 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(critic_model.device) 83 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(critic_model.device) 84 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(critic_model.device) 85 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in state_transitions]) ).to(critic_model.device) 86 | actions = [s.action for s in state_transitions] 87 | # import ipdb; ipdb.set_trace() 88 | with torch.no_grad(): 89 | actual_Q_values = Qs 90 | # pred_qvals_next = critic_model(next_states)[0] 91 | critic_model.opt.zero_grad() 92 | pred_qvals = critic_model(cur_states) 93 | 94 | # one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 95 | # loss = torch.mean(torch.sqrt((torch.sum(pred_qvals*one_hot_actions,-1) - actual_Q_values.view(-1) )**2)).to(model.device) 96 | loss = F.smooth_l1_loss(pred_qvals.view(-1), actual_Q_values.view(-1) ) 97 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals,-1), rewards.view(-1)+0.98*mask[:,0]*pred_qvals_next.view(-1) ).mean() 98 | loss.backward() 99 | critic_model.opt.step() 100 | return loss 101 | 102 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 103 | for i in range(episode_len): 104 | # if(step_counter > buffer_size): 105 | # import ipdb; ipdb.set_trace() 106 | index = episode_len-i 107 | next_index = index+1 108 | if i==0: 109 | replay_buffer[index].qval = replay_buffer[index].reward 110 | if(step_counter%2000==0): 111 | print("i",i,"q ",replay_buffer[index].qval) 112 | else: 113 | replay_buffer[index].qval = replay_buffer[index].reward + 0.98 * replay_buffer[next_index].qval 114 | if(step_counter%2000==0): 115 | print("i",i,"q ",replay_buffer[index].qval) 116 | return replay_buffer 117 | 118 | 119 | 120 | 121 | if __name__=='__main__': 122 | DEBUGER_ON = True 123 | NUM_GAMES = 100 124 | MAX_EPISODE_STEPS = 10000 125 | TARGET_MODEL_UPDATE_INTERVAL = 
50 126 | EPSILON_MIN = 0.05 127 | EPSILON_START = 0.5 128 | EPSLILON_COUNT = 6000 #Games 129 | INITIAL_RANDOM_STEPS = 5000 130 | RANDOM_GAME_EVERY = 20 131 | TRAIN_CRITIC_EVERY_N_STEP = 300 132 | CRITIC_TRAINING_SAMPLE_SIZE = 256 133 | TRAIN_ACTOR_EVERY_N_GAME = 1 134 | ACTOR_TRAINING_SAMPLE_SIZE = 8 135 | NUM_ACTOR_TRAINING_SAMPLES = 40 136 | TRAINING_ITTERATIONS = 1 137 | NUM_ACTOR_TRAINING_SAMPLES = 128 138 | PRINT_EVERY = 1 139 | RENDER_ENV = True 140 | LOAD_MODEL = True 141 | SAVE_MODEL = False 142 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 143 | MODEL_ID = "01" 144 | SAVE_MODEL_EVERY = 25 145 | 146 | epsilon = EPSILON_START 147 | env = gym.make('LunarLanderContinuous-v2') 148 | # env = gym.make('BipedalWalker-v3') 149 | 150 | observation = env.reset() 151 | print("env action space ", env.action_space.shape) 152 | am = ActorModel(env.observation_space.shape,env.action_space.shape,lr=0.008) 153 | cm = CriticModel(env.observation_space.shape,env.action_space.shape,lr=0.01) 154 | agent = DQNAgent( am , cm ) 155 | # import ipdb;ipdb.set_trace() 156 | 157 | if LOAD_MODEL: 158 | agent.actor_model = torch.load("A2C_actor"+MODEL_ID+MODEL_FILE_NAME) 159 | agent.critic_model = torch.load("A2C_critic"+MODEL_ID+MODEL_FILE_NAME) 160 | 161 | agent.actor_model.eval() 162 | agent.critic_model.eval() 163 | 164 | step_counter = 0 165 | last_step_count = 0 166 | 167 | 168 | action = [] 169 | for game in range (NUM_GAMES): 170 | episode_sars = [] 171 | score = 0 172 | for step in range (MAX_EPISODE_STEPS): 173 | if RENDER_ENV: 174 | env.render() 175 | 176 | if random()<-0.1: 177 | action = env.action_space.sample() 178 | else: 179 | # import ipdb; ipdb.set_trace() 180 | action = agent.get_actions(observation).cpu().detach().numpy() 181 | # print("action ", action) 182 | observation_next, reward, done, info = env.step(action) 183 | score += reward 184 | 185 | observation = observation_next 186 | step_counter+=1 187 | last_step_count = step 188 | if done: 189 | 190 | break 191 | 192 | observation = env.reset() 193 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 194 | if (game%PRINT_EVERY==0): 195 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 196 | avg_reward = [] 197 | # print("epsilon ", epsilon) 198 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Parameter_Noising/Readme.md: -------------------------------------------------------------------------------- 1 | Actor Critic implementation with Parameter noising 2 | 3 | Traing the agent with *python Advantage_Actor_Critic.py*. This saves the agent every 10 episodes. 4 | View the saved agent with *python Load_AC_model.py". 
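For reference, below is a minimal sketch of the kind of parameter perturbation this implementation relies on: zero-mean Gaussian noise added to a copy of the actor's weights before collecting a rollout (the PyTorch counterpart of the Keras *add_noise_to_model* helpers used elsewhere in this repo). The helper name *perturb_actor* and the sigma value are illustrative and are not the exact code in *Advantage_Actor_Critic.py*.

```python
# Illustrative sketch only -- not the exact implementation in Advantage_Actor_Critic.py.
import copy
import torch
import torch.nn as nn

NOISE_SIGMA = 0.01  # assumed value; tune per environment

def perturb_actor(actor_model: nn.Module, noise_sigma: float = NOISE_SIGMA) -> nn.Module:
    """Return a noisy copy of the actor; the original network is left untouched."""
    noisy_actor = copy.deepcopy(actor_model)
    with torch.no_grad():
        for param in noisy_actor.parameters():
            # add zero-mean Gaussian noise directly to the weights (exploration in parameter space)
            param.add_(torch.randn_like(param) * noise_sigma)
    return noisy_actor
```

Episodes collected with the noisy copy explore in parameter space, while the unperturbed actor remains the network that is trained and saved every 10 episodes.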
5 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Parameter_Noising/agent_and_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | 14 | @dataclass 15 | class sars: 16 | state: Any 17 | action: Any 18 | reward: float 19 | next_state: Any 20 | done: bool 21 | qval: float 22 | advantage: float = 0.0 23 | 24 | class DQNAgent: 25 | def __init__(self,actor_model,critic_model): 26 | self.actor_model = actor_model 27 | self.critic_model = critic_model 28 | 29 | def get_actions(self, observations): 30 | # import ipdb; ipdb.set_trace() 31 | guessed_actions = self.actor_model(torch.Tensor(observations).to(self.actor_model.device)) 32 | return guessed_actions 33 | 34 | def get_predicted_Q_values(self,observation_and_action): 35 | guessed_Qs = self.critic_model(torch.Tensor(observation_and_action)) 36 | return guessed_Qs(-1)[1] 37 | 38 | def update_target_model(self): 39 | self.targetModel.load_state_dict(self.model.state_dict()) 40 | 41 | class ActorModel(nn.Module): 42 | def __init__(self, obs_shape, action_shape,lr): 43 | super(ActorModel,self).__init__() 44 | assert len(obs_shape) ==1, "This network only works on flat observations" 45 | self.obs_shape = obs_shape 46 | self.action_shape = action_shape 47 | 48 | # import ipdb; ipdb.set_trace() 49 | self.net = torch.nn.Sequential( 50 | torch.nn.Linear(obs_shape[0],1024), 51 | torch.nn.ReLU(), 52 | torch.nn.Linear(1024,512), 53 | torch.nn.ReLU(), 54 | torch.nn.Linear(512,256), 55 | torch.nn.ReLU(), 56 | torch.nn.Linear(256,128), 57 | torch.nn.ReLU(), 58 | torch.nn.Linear(128,action_shape[0]) 59 | ) 60 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 61 | if torch.cuda.is_available(): 62 | print("Using CUDA") 63 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 64 | self.to(self.device) 65 | 66 | def forward(self, x): 67 | return self.net(x) 68 | 69 | 70 | class CriticModel(nn.Module): 71 | def __init__(self, obs_shape, action_shape,lr): 72 | super(CriticModel,self).__init__() 73 | assert len(obs_shape) ==1, "This network only works on flat observations" 74 | self.obs_shape = obs_shape 75 | self.action_shape = action_shape 76 | 77 | self.net = torch.nn.Sequential( 78 | torch.nn.Linear(obs_shape[0]+action_shape[0],1024), 79 | torch.nn.ReLU(), 80 | torch.nn.Linear(1024,512), 81 | torch.nn.ReLU(), 82 | torch.nn.Linear(512,256), 83 | torch.nn.ReLU(), 84 | torch.nn.Linear(256,128), 85 | torch.nn.ReLU(), 86 | torch.nn.Linear(128,1) # one out put because we are predicting Q values 87 | ) 88 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 89 | if torch.cuda.is_available(): 90 | print("Using CUDA") 91 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 92 | self.to(self.device) 93 | 94 | def forward(self, x): 95 | return self.net(x) 96 | 97 | class ReplayBuffer: 98 | def __init__(self, buffer_size = 1000): 99 | # self.buffer_size = buffer_size 100 | self.buffer_size = buffer_size 101 | self.buffer = np.empty((buffer_size),dtype=object) 102 | 103 | # self.buffer = [] 104 | self.index = 0 105 | 106 | def insert(self, sars): 107 | # self.buffer.append(sars) 108 | # 
print("inserting index ", self.index, "@",self.index%self.buffer_size) 109 | if(self.index == 10): 110 | print("first 10 ",self.buffer[0:10]) 111 | # import ipdb; ipdb.set_trace() 112 | 113 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 114 | # print("first 10 ",self.buffer[0:10]) 115 | # print("last 10 ",self.buffer[-10:]) 116 | # print("") 117 | # import ipdb; ipdb.set_trace() 118 | self.buffer[self.index%self.buffer_size] = sars 119 | self.index+=1 120 | # self.buffer.append(sars) 121 | # if(len(self.buffer)>self.buffer_size): 122 | # self.buffer = self.buffer[1:] 123 | # # print("Clipping Buffer at size", len(self.buffer)) 124 | 125 | def sample(self, num_samples,current_episode_steps): 126 | # assert num_samples < min(len(self.buffer),self.index) 127 | # if num_samples>self.index: 128 | # print("sampling n ",min(num_samples,self.index)) 129 | a = self.buffer[0:min(self.index,self.buffer_size)] 130 | if len(self.buffer) > 0: 131 | return np.random.choice(a, min(num_samples,self.index)) 132 | else: 133 | return [] 134 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Readme.md: -------------------------------------------------------------------------------- 1 | Implementation of Actor Critic RL argorithm by. 2 | To Train run **python ActorCritic.py** 3 | This will save your trained model in a local directory every 50 game 4 | to view the model in action run **python load_AC_model.py**, note that if you run load_AC_model.py first, it will automatically load the saved agents. I have included a version of the saved actor network (optimal policy) in the project folder. 5 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/actor01TDQN_RL_MODEL.trl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/Pytorch/ActorCritic/actor01TDQN_RL_MODEL.trl -------------------------------------------------------------------------------- /Pytorch/ActorCritic/agent_and_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | 14 | @dataclass 15 | class sars: 16 | state: Any 17 | action: Any 18 | reward: float 19 | next_state: Any 20 | done: bool 21 | qval: float 22 | advantage: float = 0.0 23 | 24 | class DQNAgent: 25 | def __init__(self,actor_model,critic_model): 26 | self.actor_model = actor_model 27 | self.critic_model = critic_model 28 | 29 | def get_actions(self, observations): 30 | # import ipdb; ipdb.set_trace() 31 | guessed_actions = self.actor_model(torch.Tensor(observations).to(self.actor_model.device)) 32 | return guessed_actions 33 | 34 | def get_predicted_Q_values(self,observation_and_action): 35 | guessed_Qs = self.critic_model(torch.Tensor(observation_and_action)) 36 | return guessed_Qs(-1)[1] 37 | 38 | def update_target_model(self): 39 | self.targetModel.load_state_dict(self.model.state_dict()) 40 | 41 | class ActorModel(nn.Module): 42 | def __init__(self, obs_shape, action_shape,lr): 43 | super(ActorModel,self).__init__() 44 | assert len(obs_shape) ==1, "This network 
only works on flat observations" 45 | self.obs_shape = obs_shape 46 | self.action_shape = action_shape 47 | 48 | # import ipdb; ipdb.set_trace() 49 | self.net = torch.nn.Sequential( 50 | torch.nn.Linear(obs_shape[0],512), 51 | torch.nn.ReLU(), 52 | # torch.nn.Linear(1024,256), 53 | # torch.nn.ReLU(), 54 | torch.nn.Linear(512,action_shape[0]) 55 | ) 56 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 57 | if torch.cuda.is_available(): 58 | print("Using CUDA") 59 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 60 | self.to(self.device) 61 | 62 | def forward(self, x): 63 | return self.net(x) 64 | 65 | 66 | class CriticModel(nn.Module): 67 | def __init__(self, obs_shape, action_shape,lr): 68 | super(CriticModel,self).__init__() 69 | assert len(obs_shape) ==1, "This network only works on flat observations" 70 | self.obs_shape = obs_shape 71 | self.action_shape = action_shape 72 | 73 | self.net = torch.nn.Sequential( 74 | torch.nn.Linear(obs_shape[0]+action_shape[0],512), 75 | torch.nn.ReLU(), 76 | # torch.nn.Linear(2048,512), 77 | # torch.nn.ReLU(), 78 | torch.nn.Linear(512,1) # one out put because we are predicting Q values 79 | ) 80 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 81 | if torch.cuda.is_available(): 82 | print("Using CUDA") 83 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 84 | self.to(self.device) 85 | 86 | def forward(self, x): 87 | return self.net(x) 88 | 89 | class ReplayBuffer: 90 | def __init__(self, buffer_size = 1000): 91 | # self.buffer_size = buffer_size 92 | self.buffer_size = buffer_size 93 | self.buffer = np.empty((buffer_size),dtype=object) 94 | 95 | # self.buffer = [] 96 | self.index = 0 97 | 98 | def insert(self, sars): 99 | # self.buffer.append(sars) 100 | # print("inserting index ", self.index, "@",self.index%self.buffer_size) 101 | if(self.index == 10): 102 | print("first 10 ",self.buffer[0:10]) 103 | # import ipdb; ipdb.set_trace() 104 | 105 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 106 | # print("first 10 ",self.buffer[0:10]) 107 | # print("last 10 ",self.buffer[-10:]) 108 | # print("") 109 | # import ipdb; ipdb.set_trace() 110 | self.buffer[self.index%self.buffer_size] = sars 111 | self.index+=1 112 | # self.buffer.append(sars) 113 | # if(len(self.buffer)>self.buffer_size): 114 | # self.buffer = self.buffer[1:] 115 | # # print("Clipping Buffer at size", len(self.buffer)) 116 | 117 | def sample(self, num_samples,current_episode_steps): 118 | # assert num_samples < min(len(self.buffer),self.index) 119 | # if num_samples>self.index: 120 | # print("sampling n ",min(num_samples,self.index)) 121 | a = self.buffer[0:min(self.index,self.buffer_size)] 122 | if len(self.buffer) > 0: 123 | return np.random.choice(a, min(num_samples,self.index)) 124 | else: 125 | return [] 126 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/critic01TDQN_RL_MODEL.trl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/Pytorch/ActorCritic/critic01TDQN_RL_MODEL.trl -------------------------------------------------------------------------------- /Pytorch/DQN/DQN_Cartpol_old_1.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## https://www.youtube.com/watch?v=WHRQUZrxxGw 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import 
torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | @dataclass 14 | class sars: 15 | state: Any 16 | action: int 17 | reward: float 18 | next_state: Any 19 | done: bool 20 | qval: float 21 | 22 | class DQNAgent: 23 | def __init__(self,model,targetModel): 24 | self.model = model 25 | self.targetModel = targetModel 26 | 27 | def get_actions(self, observations): 28 | q_vals = self.model(torch.Tensor(observations).to(self.model.device)) 29 | 30 | 31 | return q_vals.max(-1)[1] 32 | 33 | def update_target_model(self): 34 | self.targetModel.load_state_dict(self.model.state_dict()) 35 | 36 | class Model(nn.Module): 37 | def __init__(self, obs_shape, num_actions,lr): 38 | super(Model,self).__init__() 39 | assert len(obs_shape) ==1, "This network only works on flat observations" 40 | self.obs_shape = obs_shape 41 | self.num_action = num_actions 42 | 43 | self.net = torch.nn.Sequential( 44 | torch.nn.Linear(obs_shape[0],32), 45 | torch.nn.ReLU(), 46 | torch.nn.Linear(32,num_actions) 47 | ) 48 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 49 | if torch.cuda.is_available(): 50 | print("Using CUDA") 51 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 52 | self.to(self.device) 53 | 54 | 55 | def forward(self, x): 56 | return self.net(x) 57 | 58 | 59 | 60 | class ReplayBuffer: 61 | def __init__(self, buffer_size = 1000): 62 | self.buffer_size = buffer_size 63 | # self.buffer = [None]*buffer_size 64 | self.buffer = [] 65 | self.index = 0 66 | 67 | def insert(self, sars): 68 | # self.buffer.append(sars) 69 | # print("inserting index ", self.index, "@",self.index%self.buffer_size) 70 | if(self.index == 10): 71 | print("first 10 ",self.buffer[0:10]) 72 | # import ipdb; ipdb.set_trace() 73 | 74 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 75 | # print("first 10 ",self.buffer[0:10]) 76 | # print("last 10 ",self.buffer[-10:]) 77 | # print("") 78 | # import ipdb; ipdb.set_trace() 79 | # self.buffer[self.index%self.buffer_size] = sars 80 | self.index+=1 81 | self.buffer.append(sars) 82 | if(len(self.buffer)>self.buffer_size): 83 | self.buffer = self.buffer[1:] 84 | # print("Clipping Buffer at size", len(self.buffer)) 85 | 86 | def sample(self, num_samples,current_episode_steps): 87 | # assert num_samples < min(len(self.buffer),self.index) 88 | # if num_samples>self.index: 89 | # print("sampling n ",min(num_samples,self.index)) 90 | # a = self.buffer[0:((self.index-current_episode_steps)%self.buffer_size)] 91 | if len(self.buffer) > 0: 92 | return np.random.choice(self.buffer, min(num_samples,self.index)) 93 | else: 94 | return [] 95 | 96 | 97 | 98 | def get_one_hot(action,n_dim): 99 | retval = np.zeros(n_dim) 100 | retval[action] = 1.0 101 | return retval 102 | 103 | 104 | def train_step(model, state_transitions, tgt, num_actions): 105 | if len(state_transitions) <=0: 106 | print("empty state transitions") 107 | return 108 | cur_states = torch.stack( ([torch.Tensor(s.state) for s in state_transitions]) ).to(model.device) 109 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(model.device) 110 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(model.device) 111 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(model.device) 112 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in 
state_transitions]) ).to(model.device) 113 | actions = [s.action for s in state_transitions] 114 | 115 | with torch.no_grad(): 116 | actual_Q_values = Qs 117 | # pred_qvals_next = tgt(next_states) 118 | # pred_qvals_next = pred_qvals_next.max(axis=1)[0] 119 | 120 | model.opt.zero_grad() 121 | pred_qvals = model(cur_states) 122 | one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 123 | 124 | 125 | # loss = (rewards + mask[:,0]*pred_qvals_next - torch.sum(pred_qvals*one_hot_actions,-1)).mean() 126 | # print("loss input", torch.sum(pred_qvals*one_hot_actions,-1)) 127 | # print("loss target", (rewards + 0.98*mask[:,0]*pred_qvals_next)) 128 | 129 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), (rewards + 0.98*mask[:,0]*pred_qvals_next)[0] ) 130 | loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values[0] ) 131 | 132 | loss.backward() 133 | model.opt.step() 134 | print("loss ", loss) 135 | return loss 136 | 137 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 138 | for i in range(episode_len): 139 | # if(step_counter > buffer_size): 140 | # import ipdb; ipdb.set_trace() 141 | index = episode_len-i 142 | next_index = index+1 143 | if i==0: 144 | replay_buffer[index].qval = replay_buffer[index].reward 145 | # print("i",i,"q ",replay_buffer[index].qval) 146 | else: 147 | replay_buffer[index].qval = replay_buffer[index].reward + 0.98 * replay_buffer[next_index].qval 148 | # print("i",i,"q ",replay_buffer[index].qval) 149 | return replay_buffer 150 | 151 | def train_step2(model,state_transitions,targetModel,num_actions): 152 | # print("state_transitions" , state_transitions) 153 | cur_states = torch.stack(([torch.Tensor(s.state) for s in state_transitions])) 154 | next_states = torch.stack(([torch.Tensor(s.next_state) for s in state_transitions])) 155 | 156 | rewards = torch.stack(([torch.Tensor([s.reward]) for s in state_transitions])) 157 | # act = torch.Tensor(np.zeros(num_actions)) 158 | actions = torch.stack([torch.Tensor(get_one_hot(action,num_actions)) for s in state_transitions]) 159 | 160 | mask = torch.stack([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions]) 161 | 162 | with torch.no_grad(): 163 | # qevals_next = targetModel(next_states).max(-1) 164 | qevals_next = targetModel(next_states) 165 | # print("qevals_next",qevals_next) 166 | qevals_next = qevals_next.max(axis=1)[0] 167 | # print("qevals_next . max",qevals_next) 168 | 169 | model.opt.zero_grad() 170 | qevals = model(cur_states) 171 | 172 | # print("rewards ",rewards.shape, rewards) 173 | # print("qevals ",qevals.shape,qevals) 174 | # # print("maks ",mask.shape,mask) 175 | # print("actions ",actions.shape,actions) 176 | print("qevals_next",qevals_next) 177 | # 178 | print("qeval*actions ", torch.sum(qevals*actions,axis=1) ) 179 | # print("qeval*actions . 
mean() ", torch.sum(qevals*actions,axis=1).mean() ) 180 | 181 | 182 | loss = ( (rewards + 0.98 * qevals_next*mask[:,0] ) - (torch.sum(qevals*actions,axis=1)) ).mean() 183 | # loss = ( (rewards + 0.98 * qevals_next*mask) - qevals*actions ).mean() 184 | loss.backward() 185 | model.opt.step() 186 | 187 | print("Loss ", loss) 188 | return loss 189 | 190 | 191 | if __name__=='__main__': 192 | NUM_GAMES = 50000 193 | MAX_EPISODE_STEPS = 600 194 | TARGET_MODEL_UPDATE_INTERVAL = 50 195 | EPSILON_MIN = 0.01 196 | EPSILON_START = 0.3 197 | EPSLILON_COUNT = 2000 #Games 198 | RANDOM_GAME_EVERY = 20 199 | TRAIN_EVERY_N_STEPS = 15 200 | PRINT_EVERY = 10 201 | 202 | epsilon = EPSILON_START 203 | # env = gym.make('LunarLander-v2') 204 | env = gym.make('CartPole-v1') 205 | 206 | observation = env.reset() 207 | # obs2 = np.random.random(4) 208 | # allObs = np.array([observation,obs2]) 209 | m = Model(env.observation_space.shape,env.action_space.n,lr=0.01) 210 | rb = ReplayBuffer(3000) 211 | agent = DQNAgent(m, Model(env.observation_space.shape,env.action_space.n,lr=0.01) ) 212 | step_counter = 0 213 | avg_reward = [] 214 | # qeval = m(torch.Tensor(allObs)) 215 | # # print("allObs ", allObs) 216 | # # print("qeval ",qeval) 217 | 218 | 219 | for game in range (NUM_GAMES): 220 | # if game == 8: 221 | # print("rb ",rb.buffer) 222 | episode_sars = [] 223 | # # if game%TARGET_MODEL_UPDATE_INTERVAL == 0 : 224 | # # # print("game", game," updating target model") 225 | # agent.update_target_model() 226 | for step in range (MAX_EPISODE_STEPS): 227 | env.render() 228 | # import ipdb; ipdb.set_trace() 229 | action = 0 230 | if step_counter<1000 or random() 3000 and step_counter%TRAIN_EVERY_N_STEPS==0: 247 | # print("rb sample", rb.sample(1)) 248 | train_step(agent.model,rb.sample(1,step),agent.targetModel,env.action_space.n) 249 | # print("training size ",rb.index%rb.buffer_size, " - ",rb.index , "") 250 | observation = observation_next 251 | step_counter+=1 252 | if done: 253 | 254 | # reward = -100 255 | # print("last reward ", reward) 256 | 257 | rb.episode_sars = update_Qs(episode_sars,step_counter,step,len(episode_sars)) 258 | for j in range(len(episode_sars)): 259 | rb.insert(episode_sars[j]) 260 | 261 | observation = env.reset() 262 | break 263 | 264 | 265 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 266 | if (game%PRINT_EVERY==0): 267 | print("episide ", game,"score", np.average( avg_reward), "epsilon",epsilon ) 268 | avg_reward = [] 269 | # print("epsilon ", epsilon) 270 | -------------------------------------------------------------------------------- /Pytorch/DQN/Load_Agent.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## https://www.youtube.com/watch?v=WHRQUZrxxGw 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | @dataclass 14 | class sars: 15 | state: Any 16 | action: int 17 | reward: float 18 | next_state: Any 19 | done: bool 20 | qval: float 21 | 22 | class DQNAgent: 23 | def __init__(self,model,targetModel): 24 | self.model = model 25 | self.targetModel = targetModel 26 | 27 | def get_actions(self, observations): 28 | q_vals = self.model(torch.Tensor(observations).to(self.model.device)) 29 | return q_vals.max(-1)[1] 30 | 31 | def update_target_model(self): 32 | 
self.targetModel.load_state_dict(self.model.state_dict()) 33 | 34 | class Model(nn.Module): 35 | def __init__(self, obs_shape, num_actions,lr): 36 | super(Model,self).__init__() 37 | assert len(obs_shape) ==1, "This network only works on flat observations" 38 | self.obs_shape = obs_shape 39 | self.num_action = num_actions 40 | 41 | self.net = torch.nn.Sequential( 42 | torch.nn.Linear(obs_shape[0],128), 43 | torch.nn.ReLU(), 44 | torch.nn.Linear(128,num_actions) 45 | ) 46 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 47 | if torch.cuda.is_available(): 48 | print("Using CUDA") 49 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 50 | self.to(self.device) 51 | 52 | 53 | def forward(self, x): 54 | return self.net(x) 55 | 56 | 57 | 58 | class ReplayBuffer: 59 | def __init__(self, buffer_size = 1000): 60 | self.buffer_size = buffer_size 61 | # self.buffer = [None]*buffer_size 62 | self.buffer = [] 63 | self.index = 0 64 | 65 | def insert(self, sars): 66 | # self.buffer.append(sars) 67 | # print("inserting index ", self.index, "@",self.index%self.buffer_size) 68 | if(self.index == 10): 69 | print("first 10 ",self.buffer[0:10]) 70 | # import ipdb; ipdb.set_trace() 71 | 72 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 73 | # print("first 10 ",self.buffer[0:10]) 74 | # print("last 10 ",self.buffer[-10:]) 75 | # print("") 76 | # import ipdb; ipdb.set_trace() 77 | # self.buffer[self.index%self.buffer_size] = sars 78 | self.index+=1 79 | self.buffer.append(sars) 80 | if(len(self.buffer)>self.buffer_size): 81 | self.buffer = self.buffer[1:] 82 | # print("Clipping Buffer at size", len(self.buffer)) 83 | 84 | def sample(self, num_samples,current_episode_steps): 85 | # assert num_samples < min(len(self.buffer),self.index) 86 | # if num_samples>self.index: 87 | # print("sampling n ",min(num_samples,self.index)) 88 | # a = self.buffer[0:((self.index-current_episode_steps)%self.buffer_size)] 89 | if len(self.buffer) > 0: 90 | return np.random.choice(self.buffer, min(num_samples,self.index)) 91 | else: 92 | return [] 93 | 94 | 95 | 96 | def get_one_hot(action,n_dim): 97 | retval = np.zeros(n_dim) 98 | retval[action] = 1.0 99 | return retval 100 | 101 | 102 | def train_step(model, state_transitions, tgt, num_actions): 103 | if len(state_transitions) <=0: 104 | print("empty state transitions") 105 | return 106 | cur_states = torch.stack( ([torch.Tensor(s.state) for s in state_transitions]) ).to(model.device) 107 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(model.device) 108 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(model.device) 109 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(model.device) 110 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in state_transitions]) ).to(model.device) 111 | actions = [s.action for s in state_transitions] 112 | # import ipdb; ipdb.set_trace() 113 | with torch.no_grad(): 114 | # actual_Q_values = Qs 115 | pred_qvals_next = model(next_states).max(-1)[0] 116 | model.opt.zero_grad() 117 | pred_qvals = model(cur_states) 118 | 119 | one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 120 | # loss = torch.mean(torch.sqrt((torch.sum(pred_qvals*one_hot_actions,-1) - actual_Q_values.view(-1) )**2)).to(model.device) 121 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values.view(-1) ) 122 | loss = 
F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), rewards.view(-1)+0.99*mask[:,0]*pred_qvals_next.view(-1) ).mean() 123 | loss.backward() 124 | model.opt.step() 125 | return loss 126 | 127 | 128 | 129 | 130 | 131 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 132 | for i in range(episode_len): 133 | # if(step_counter > buffer_size): 134 | # import ipdb; ipdb.set_trace() 135 | index = episode_len-i 136 | next_index = index+1 137 | if i==0: 138 | replay_buffer[index].qval = replay_buffer[index].reward 139 | if(step_counter%2000==0): 140 | print("i",i,"q ",replay_buffer[index].qval) 141 | else: 142 | replay_buffer[index].qval = replay_buffer[index].reward + 0.99 * replay_buffer[next_index].qval 143 | if(step_counter%2000==0): 144 | print("i",i,"q ",replay_buffer[index].qval) 145 | return replay_buffer 146 | 147 | 148 | if __name__=='__main__': 149 | DEBUGER_ON = True 150 | NUM_GAMES = 50000 151 | MAX_EPISODE_STEPS = 1490 152 | TARGET_MODEL_UPDATE_INTERVAL = 50 153 | EPSILON_MIN = 0.05 154 | EPSILON_START = 0.5 155 | EPSLILON_COUNT = 6000 #Games 156 | RANDOM_GAME_EVERY = 20 157 | TRAIN_EVERY_N_STEPS = 25 158 | TRAINING_SAMPLE_SIZE = 256 159 | TRAINING_ITTERATIONS = 1 160 | PRINT_EVERY = 1 161 | RENDER_ENV = True 162 | LOAD_MODEL = True 163 | SAVE_MODEL = False 164 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 165 | MODEL_ID = "01" 166 | SAVE_MODEL_EVERY = 25 167 | 168 | epsilon = EPSILON_START 169 | env = gym.make('LunarLander-v2') 170 | # env = gym.make('CartPole-v1') 171 | 172 | observation = env.reset() 173 | agent = DQNAgent(Model(env.observation_space.shape,env.action_space.n,lr=0.0001), Model(env.observation_space.shape,env.action_space.n,lr=0.0001) ) 174 | if LOAD_MODEL: 175 | print("Loading Model ", ""+MODEL_ID+MODEL_FILE_NAME) 176 | agent.model = torch.load(""+MODEL_ID+MODEL_FILE_NAME) 177 | agent.model.eval() 178 | step_counter = 0 179 | avg_reward = [] 180 | last_step_count = 0 181 | # qeval = m(torch.Tensor(allObs)) 182 | # # print("allObs ", allObs) 183 | # # print("qeval ",qeval) 184 | 185 | 186 | for game in range (NUM_GAMES): 187 | # if game == 8: 188 | # print("rb ",rb.buffer) 189 | score = 0 190 | 191 | for step in range (MAX_EPISODE_STEPS): 192 | if RENDER_ENV: 193 | env.render() 194 | # import ipdb; ipdb.set_trace() 195 | action = 0 196 | action = agent.get_actions(observation).item() 197 | 198 | 199 | observation_next, reward, done, info = env.step(action) 200 | 201 | score += reward 202 | 203 | observation = observation_next 204 | step_counter+=1 205 | last_step_count = step 206 | if done: 207 | 208 | observation = env.reset() 209 | break 210 | 211 | 212 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 213 | if (game%PRINT_EVERY==0): 214 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 215 | avg_reward = [] 216 | # print("epsilon ", epsilon) 217 | -------------------------------------------------------------------------------- /Pytorch/DQN/Readme.md: -------------------------------------------------------------------------------- 1 | Minimalist implementation of DQN with Pytorch. 2 | 3 | How to use? 4 | 5 | You need DQN_Lander.py to train the agent. 6 | **python DQN_lander.py** 7 | 8 | This will save the latest version of the model on a local file. 9 | You can Load the model and view it in action with 10 | **python load_DQN_Model.py** 11 | 12 | Hyper parameters are in the __main__ section. 
13 | Both agent and loader need to the same environment name. 14 | 15 | -------------------------------------------------------------------------------- /Pytorch/DQN_CNN/Load_ATARI_AGENT.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | import torch 3 | import gym 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | import numpy as np 8 | from dataclasses import dataclass 9 | from typing import Any 10 | from random import random 11 | from PIL import Image 12 | from agent_and_model import DQNAgent,sars, Model, ReplayBuffer 13 | import plotly.express as px 14 | 15 | def get_one_hot(action,n_dim): 16 | retval = np.zeros(n_dim) 17 | retval[action] = 1.0 18 | return retval 19 | 20 | 21 | def plot_score(all_scores): 22 | fig = px.line(x=np.arange(len(all_scores)),y=all_scores) 23 | fig.write_html('Play_DQN_CNN_Trend_figure.html') 24 | 25 | if __name__=='__main__': 26 | DEBUGER_ON = True 27 | NUM_GAMES = 50000 28 | MAX_EPISODE_STEPS = 10000 29 | TARGET_MODEL_UPDATE_INTERVAL = 50 30 | EPSILON_MIN = 0.05 31 | EPSILON_START = 0.3 32 | EPSLILON_COUNT = 4000 #Games 33 | RANDOM_GAME_EVERY = 10 34 | TRAIN_EVERY_N_STEPS = 10 35 | TRAINING_SAMPLE_SIZE = 1 36 | TRAINING_ITTERATIONS = 1 37 | PRINT_EVERY = 1 38 | RENDER_ENV = True 39 | LOAD_MODEL = True 40 | SAVE_MODEL = True 41 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 42 | MODEL_ID = "01" 43 | SAVE_MODEL_EVERY = 25 44 | 45 | epsilon = EPSILON_START 46 | env = gym.make('Pong-v0') 47 | # env = gym.make('CartPole-v1') 48 | 49 | agent = DQNAgent(Model(env.observation_space.shape,env.action_space.n,lr=0.0001), Model(env.observation_space.shape,env.action_space.n,lr=0.0001) ) 50 | 51 | observation = env.reset() 52 | frame1 = [] 53 | frame2 = [] 54 | frame3 = [] 55 | frame1 = agent.process_frame(observation) 56 | frame2 = agent.process_frame(observation) 57 | frame3 = agent.process_frame(observation) 58 | # import ipdb; ipdb.set_trace() 59 | observation = np.concatenate((frame1,frame2,frame3),axis=1) 60 | observation = observation.reshape((1,3,160,140*3)) 61 | 62 | if LOAD_MODEL: 63 | print("Loading Model ", ""+MODEL_ID+MODEL_FILE_NAME) 64 | agent.model = torch.load(""+MODEL_ID+MODEL_FILE_NAME) 65 | # agent.model.load_state_dict(torch.load(""+MODEL_ID+MODEL_FILE_NAME)) 66 | agent.model.eval() 67 | step_counter = 0 68 | avg_reward = [] 69 | rolling_average = 0 70 | 71 | 72 | for game in range (NUM_GAMES): 73 | episode_steps = 0 74 | score = 0.0 75 | all_scores = [] 76 | for step in range (MAX_EPISODE_STEPS): 77 | if RENDER_ENV: 78 | env.render() 79 | # import ipdb; ipdb.set_trace() 80 | action = 0 81 | 82 | action = agent.get_actions(observation).item() 83 | 84 | frame3 = frame2 85 | frame2 = frame1 86 | frame1, reward, done, info = env.step(action) 87 | 88 | 89 | score += reward 90 | # print("frame1", frame1.shape) 91 | frame1 = agent.process_frame(frame1) 92 | observation_next = np.concatenate((frame1,frame2,frame3),axis=1) 93 | 94 | # print("obs - concatenate", observation_next.shape) 95 | # if True or step%100==99: 96 | # img = Image.fromarray(observation_next, 'RGB') 97 | # img.save('my.png') 98 | # img.show() 99 | # if done and reward <=-100: 100 | # reward = -300 101 | observation_next = observation_next.reshape((1,3,160,140*3)) 102 | 103 | # reward *= 100 104 | avg_reward.append([reward]) 105 | 106 | observation = observation_next 107 | step_counter+=1 108 | episode_steps = step 109 | if done: 110 | observation = env.reset() 111 | frame1 = [] 112 | frame2 
= [] 113 | frame3 = [] 114 | frame1 = agent.process_frame(observation) 115 | frame2 = agent.process_frame(observation) 116 | frame3 = agent.process_frame(observation) 117 | # import ipdb; ipdb.set_trace() 118 | observation = np.concatenate((frame1,frame2,frame3),axis=1) 119 | observation = observation.reshape((1,3,160,140*3)) 120 | break 121 | 122 | rolling_average = 0.05*score + (1-0.05)*rolling_average 123 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 124 | all_scores.append(score) 125 | if (game%PRINT_EVERY==0): 126 | plot_score(all_scores) 127 | print("episide ", game,"last score",reward,"rolling score ", rolling_average ,"episode_len", episode_steps , "score", score, "epsilon",epsilon ) 128 | avg_reward = [] 129 | # print("epsilon ", epsilon) 130 | -------------------------------------------------------------------------------- /Pytorch/DQN_CNN/Readme.md: -------------------------------------------------------------------------------- 1 | DQN Algorithm that takes images as input. 2 | You can train the agent using **python ATARI_DQN_CNN.py** 3 | You can see the agent in action **using python Load_ATARI_AGENT.py** 4 | -------------------------------------------------------------------------------- /Pytorch/DQN_CNN/agent_and_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | from PIL import Image 13 | 14 | 15 | @dataclass 16 | class sars: 17 | state: Any 18 | action: int 19 | reward: float 20 | next_state: Any 21 | done: bool 22 | qval: float 23 | 24 | class DQNAgent: 25 | def __init__(self,model,targetModel): 26 | self.model = model 27 | self.targetModel = targetModel 28 | 29 | def get_actions(self, observations): 30 | q_vals = self.model(torch.Tensor(observations).to(self.model.device)) 31 | return q_vals.max(-1)[1] 32 | 33 | def update_target_model(self): 34 | self.targetModel.load_state_dict(self.model.state_dict()) 35 | 36 | def process_frame(self,frame): 37 | img = Image.fromarray(frame, 'RGB') 38 | width, height = img.size 39 | frame = img.crop((5,35,width-15,height-15)) 40 | return frame 41 | 42 | class Model(nn.Module): 43 | def __init__(self, obs_shape, num_actions,lr): 44 | super(Model,self).__init__() 45 | # assert len(obs_shape) ==1, "This network only works on flat observations" 46 | self.obs_shape = obs_shape 47 | self.num_action = num_actions 48 | # import ipdb; ipdb.set_trace() 49 | 50 | self.conv_net = torch.nn.Sequential( 51 | nn.BatchNorm2d(3), 52 | nn.Conv2d(3, 32, 8, 4), 53 | # nn.MaxPool2d(4), 54 | # nn.Dropout(0.2), 55 | nn.ReLU(), 56 | nn.Conv2d(32, 64, 4, 2), 57 | # nn.Dropout(0.2), 58 | nn.ReLU(), 59 | nn.Conv2d(64, 64, 3,1), 60 | # nn.MaxPool2d(4), 61 | # nn.Dropout(0.2), 62 | nn.ReLU() 63 | 64 | ) 65 | self.linear_layer = torch.nn.Sequential( 66 | torch.nn.Linear(50176,256), 67 | # nn.Dropout(0.6), 68 | # torch.nn.ReLU(), 69 | # torch.nn.Linear(128,256), 70 | # nn.Dropout(0.2), 71 | torch.nn.ReLU(), 72 | torch.nn.Linear(256,num_actions) 73 | ) 74 | self.opt = optim.Adam(self.conv_net.parameters(),lr=lr) 75 | self.opt2 = optim.Adam(self.linear_layer.parameters(),lr=lr) 76 | 77 | if torch.cuda.is_available(): 78 | print("Using CUDA") 79 | self.device = 
torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 80 | self.to(self.device) 81 | 82 | 83 | def forward(self, x): 84 | x = self.conv_net(x) 85 | x = x.view(x.size(0),-1) 86 | x = self.linear_layer(x) 87 | return x 88 | 89 | 90 | 91 | class ReplayBuffer: 92 | def __init__(self, buffer_size = 1000): 93 | # self.buffer_size = buffer_size 94 | self.buffer_size = buffer_size 95 | self.buffer = np.empty((buffer_size),dtype=object) 96 | 97 | # self.buffer = [] 98 | self.index = 0 99 | 100 | def insert(self, sars): 101 | 102 | # if self.index>1000: 103 | # # import ipdb; ipdb.set_trace() 104 | # Qs = np.array([s.qval for s in self.buffer[0:(min(self.index,self.buffer_size))]]) 105 | # Qs_threshold = Qs.mean() + Qs.var()/4 106 | # select_prob = 1 - ( ( sars.qval - Qs_threshold ) / Qs_threshold) 107 | # select_prob = max(0.15,select_prob) 108 | # if random()self.index: 119 | # print("sampling n ",min(num_samples,self.index)) 120 | a = self.buffer[0:min(self.index,self.buffer_size)] 121 | if len(self.buffer) > 0: 122 | return np.random.choice(a, min(num_samples,self.index)) 123 | else: 124 | return [] 125 | 126 | def sample_top(self, num_samples,current_episode_steps): 127 | import ipdb; ipdb.set_trace() 128 | Qs = np.array([s.qvals for s in self.buffer]) 129 | Qs_threshold = Qs.mean() + Qs.var()/3 130 | 131 | # if num_samples>self.index: 132 | # print("sampling n ",min(num_samples,self.index)) 133 | a = self.buffer[0:min(self.index,self.buffer_size)] 134 | if len(self.buffer) > 0: 135 | return np.random.choice(a, min(num_samples,self.index)) 136 | else: 137 | return [] 138 | -------------------------------------------------------------------------------- /Pytorch/PPO/PPO_LunarLander.py: -------------------------------------------------------------------------------- 1 | # Working PPO implmentation 2 | 3 | # Modifed from 4 | # https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/ 5 | 6 | import argparse 7 | import pickle 8 | from collections import namedtuple 9 | from itertools import count 10 | 11 | import os, time 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | import gym 16 | import torch 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | import torch.optim as optim 20 | from torch.distributions import Normal, Categorical 21 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 22 | # from tensorboardX import SummaryWriter 23 | 24 | # Parameters 25 | gamma = 0.99 26 | render = True 27 | seed = 1 28 | log_interval = 10 29 | 30 | # env = gym.make('CartPole-v0').unwrapped 31 | env = gym.make('LunarLander-v2').unwrapped 32 | num_state = env.observation_space.shape[0] 33 | num_action = env.action_space.n 34 | torch.manual_seed(seed) 35 | env.seed(seed) 36 | Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state']) 37 | 38 | class Actor(nn.Module): 39 | def __init__(self): 40 | super(Actor, self).__init__() 41 | self.fc1 = nn.Linear(num_state, 100) 42 | self.action_head = nn.Linear(100, num_action) 43 | 44 | def forward(self, x): 45 | x = F.relu(self.fc1(x)) 46 | action_prob = F.softmax(self.action_head(x), dim=1) 47 | return action_prob 48 | 49 | 50 | class Critic(nn.Module): 51 | def __init__(self): 52 | super(Critic, self).__init__() 53 | self.fc1 = nn.Linear(num_state, 100) 54 | self.state_value = nn.Linear(100, 1) 55 | 56 | def forward(self, x): 57 | x = F.relu(self.fc1(x)) 58 | value = self.state_value(x) 59 | return value 60 | 61 | 62 | class PPO(): 63 | clip_param = 
0.2 64 | max_grad_norm = 0.5 65 | ppo_update_time = 10 66 | buffer_capacity = 1000 67 | batch_size = 32 68 | 69 | def __init__(self): 70 | super(PPO, self).__init__() 71 | self.actor_net = Actor() 72 | self.critic_net = Critic() 73 | self.buffer = [] 74 | self.counter = 0 75 | self.training_step = 0 76 | # self.writer = SummaryWriter('../exp') 77 | 78 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 79 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3) 80 | if not os.path.exists('../param'): 81 | os.makedirs('../param/net_param') 82 | os.makedirs('../param/img') 83 | 84 | def select_action(self, state): 85 | state = torch.from_numpy(state).float().unsqueeze(0) 86 | with torch.no_grad(): 87 | action_prob = self.actor_net(state) 88 | c = Categorical(action_prob) 89 | action = c.sample() 90 | return action.item(), action_prob[:,action.item()].item() 91 | 92 | def get_value(self, state): 93 | state = torch.from_numpy(state) 94 | with torch.no_grad(): 95 | value = self.critic_net(state) 96 | return value.item() 97 | 98 | def save_param(self): 99 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net' + str(time.time())[:10], +'.pkl') 100 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net' + str(time.time())[:10], +'.pkl') 101 | 102 | def store_transition(self, transition): 103 | self.buffer.append(transition) 104 | self.counter += 1 105 | 106 | 107 | def update(self, i_ep): 108 | state = torch.tensor([t.state for t in self.buffer], dtype=torch.float) 109 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1) 110 | reward = [t.reward for t in self.buffer] 111 | # update: don't need next_state 112 | #reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 113 | #next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 114 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 115 | 116 | R = 0 117 | Gt = [] 118 | for r in reward[::-1]: 119 | R = r + gamma * R 120 | Gt.insert(0, R) 121 | Gt = torch.tensor(Gt, dtype=torch.float) 122 | #print("The agent is updateing....") 123 | for i in range(self.ppo_update_time): 124 | for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False): 125 | if self.training_step % 1000 ==0: 126 | print('I_ep {} ,train {} times'.format(i_ep,self.training_step)) 127 | #with torch.no_grad(): 128 | Gt_index = Gt[index].view(-1, 1) 129 | V = self.critic_net(state[index]) 130 | delta = Gt_index - V 131 | advantage = delta.detach() 132 | # epoch iteration, PPO core!!! 
133 | action_prob = self.actor_net(state[index]).gather(1, action[index]) # new policy 134 | 135 | ratio = (action_prob/old_action_log_prob[index]) 136 | surr1 = ratio * advantage 137 | surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage 138 | 139 | # update actor network 140 | action_loss = -torch.min(surr1, surr2).mean() # MAX->MIN desent 141 | # self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step) 142 | self.actor_optimizer.zero_grad() 143 | action_loss.backward() 144 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 145 | self.actor_optimizer.step() 146 | 147 | #update critic network 148 | value_loss = F.mse_loss(Gt_index, V) 149 | # self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step) 150 | self.critic_net_optimizer.zero_grad() 151 | value_loss.backward() 152 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 153 | self.critic_net_optimizer.step() 154 | self.training_step += 1 155 | 156 | del self.buffer[:] # clear experience 157 | 158 | 159 | def main(): 160 | agent = PPO() 161 | for i_epoch in range(1000): 162 | state = env.reset() 163 | if render: env.render() 164 | 165 | for t in count(): 166 | action, action_prob = agent.select_action(state) 167 | next_state, reward, done, _ = env.step(action) 168 | if t>800: 169 | done = True 170 | trans = Transition(state, action, action_prob, reward, next_state) 171 | if render: env.render() 172 | agent.store_transition(trans) 173 | state = next_state 174 | 175 | if done : 176 | if len(agent.buffer) >= agent.batch_size:agent.update(i_epoch) 177 | # agent.writer.add_scalar('liveTime/livestep', t, global_step=i_epoch) 178 | break 179 | 180 | 181 | if __name__ == '__main__': 182 | main() 183 | print("end") 184 | -------------------------------------------------------------------------------- /Pytorch/PPO/Readme.md: -------------------------------------------------------------------------------- 1 | Simple implementation PPO, that fixes a few errors. 
Now runs on pytorch 1.5.0 2 | 3 | 4 | Credit: Code modified from sweetice 's original version 5 | 6 | -------------------------------------------------------------------------------- /Pytorch/PolicyGradient/Load_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | 4 | import torch 5 | import gym 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torch.distributions import Categorical 10 | import numpy as np 11 | from dataclasses import dataclass 12 | from typing import Any 13 | from random import random 14 | 15 | from agent_and_model import Policy 16 | 17 | 18 | def select_action(state, policy): 19 | state = torch.from_numpy(state).float().unsqueeze(0) 20 | probs = policy(state.to(policy.device)) 21 | m = Categorical(probs) 22 | action = m.sample() 23 | policy.saved_log_probs.append(m.log_prob(action)) 24 | return action.item() 25 | 26 | 27 | 28 | 29 | if __name__=='__main__': 30 | DEBUGER_ON = True 31 | NUM_GAMES = 100 32 | MAX_EPISODE_STEPS = 10000 33 | TARGET_MODEL_UPDATE_INTERVAL = 50 34 | EPSILON_MIN = 0.05 35 | EPSILON_START = 0.5 36 | EPSLILON_COUNT = 6000 #Games 37 | INITIAL_RANDOM_STEPS = 5000 38 | RANDOM_GAME_EVERY = 20 39 | TRAIN_CRITIC_EVERY_N_STEP = 300 40 | CRITIC_TRAINING_SAMPLE_SIZE = 256 41 | TRAIN_ACTOR_EVERY_N_GAME = 1 42 | ACTOR_TRAINING_SAMPLE_SIZE = 8 43 | NUM_ACTOR_TRAINING_SAMPLES = 40 44 | TRAINING_ITTERATIONS = 1 45 | NUM_ACTOR_TRAINING_SAMPLES = 128 46 | PRINT_EVERY = 1 47 | RENDER_ENV = True 48 | LOAD_MODEL = True 49 | SAVE_MODEL = False 50 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 51 | MODEL_ID = "01" 52 | SAVE_MODEL_EVERY = 25 53 | 54 | epsilon = EPSILON_START 55 | env = gym.make('LunarLander-v2') 56 | # env = gym.make('BipedalWalker-v3') 57 | 58 | observation = env.reset() 59 | print("env action space ", env.action_space.shape) 60 | policy=Policy() 61 | 62 | # import ipdb;ipdb.set_trace() 63 | 64 | if LOAD_MODEL: 65 | policy = torch.load("pg_policy.trl") 66 | policy.eval() 67 | 68 | step_counter = 0 69 | last_step_count = 0 70 | 71 | 72 | action = [] 73 | for game in range (NUM_GAMES): 74 | episode_sars = [] 75 | score = 0 76 | for step in range (MAX_EPISODE_STEPS): 77 | if RENDER_ENV: 78 | env.render() 79 | 80 | if random()<-0.1: 81 | action = env.action_space.sample() 82 | else: 83 | # import ipdb; ipdb.set_trace() 84 | action = select_action(observation,policy) 85 | # print("action ", action) 86 | observation_next, reward, done, info = env.step(action) 87 | score += reward 88 | 89 | observation = observation_next 90 | step_counter+=1 91 | last_step_count = step 92 | if done: 93 | 94 | break 95 | 96 | observation = env.reset() 97 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 98 | if (game%PRINT_EVERY==0): 99 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 100 | avg_reward = [] 101 | # print("epsilon ", epsilon) 102 | -------------------------------------------------------------------------------- /Pytorch/PolicyGradient/Readme.md: -------------------------------------------------------------------------------- 1 | # Implementation from FitMachineLeaning 2 | # Modified from the reinforce aglorimth 3 | @ https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py 4 | -------------------------------------------------------------------------------- 
/Pytorch/PolicyGradient/agent_and_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | from itertools import count 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torch.distributions import Categorical 11 | 12 | class Policy(nn.Module): 13 | def __init__(self): 14 | super(Policy, self).__init__() 15 | self.affine1 = nn.Linear(8, 64) 16 | self.dropout1 = nn.Dropout(p=0.1) 17 | self.affine2 = nn.Linear(64, 64) 18 | self.dropout2 = nn.Dropout(p=0.2) 19 | self.affine3 = nn.Linear(64, 4) 20 | 21 | self.saved_log_probs = [] 22 | self.rewards = [] 23 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 24 | self.to(self.device) 25 | 26 | 27 | def forward(self, x): 28 | x = self.affine1(x) 29 | # x = self.dropout1(x) 30 | x = self.affine2(x) 31 | x = self.dropout2(x) 32 | x = F.relu(x) 33 | action_scores = self.affine3(x) 34 | return F.softmax(action_scores, dim=1) 35 | -------------------------------------------------------------------------------- /Pytorch/PolicyGradient/policy_gradien_2.py: -------------------------------------------------------------------------------- 1 | # Implementation from FitMachineLeaning 2 | # Modified from the reinforce aglorimth 3 | # @ https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py 4 | import argparse 5 | import gym 6 | import numpy as np 7 | from itertools import count 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Categorical 14 | from agent_and_model import Policy 15 | 16 | 17 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example') 18 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G', 19 | help='discount factor (default: 0.99)') 20 | parser.add_argument('--seed', type=int, default=543, metavar='N', 21 | help='random seed (default: 543)') 22 | parser.add_argument('--render', action='store_true', 23 | help='render the environment') 24 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 25 | help='interval between training status logs (default: 10)') 26 | args = parser.parse_args() 27 | 28 | 29 | # env = gym.make('CartPole-v1') 30 | env = gym.make('LunarLander-v2') 31 | env.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | 34 | 35 | policy = Policy() 36 | optimizer = optim.Adam(policy.parameters(), lr=1e-3) 37 | eps = np.finfo(np.float32).eps.item() 38 | 39 | 40 | def select_action(state): 41 | state = torch.from_numpy(state).float().unsqueeze(0) 42 | probs = policy(state.to(policy.device)) 43 | m = Categorical(probs) 44 | action = m.sample() 45 | policy.saved_log_probs.append(m.log_prob(action)) 46 | return action.item() 47 | 48 | 49 | def finish_episode(): 50 | R = 0 51 | policy_loss = [] 52 | returns = [] 53 | for r in policy.rewards[::-1]: 54 | R = r + args.gamma * R 55 | returns.insert(0, R) 56 | returns = torch.tensor(returns).to(policy.device) 57 | returns = (returns - returns.mean()) / (returns.std() + eps) 58 | for log_prob, R in zip(policy.saved_log_probs, returns): 59 | policy_loss.append(-log_prob * R) 60 | optimizer.zero_grad() 61 | policy_loss = torch.cat(policy_loss).sum() 62 | policy_loss.backward() 63 | optimizer.step() 64 | del policy.rewards[:] 65 | del policy.saved_log_probs[:] 66 | 67 | 68 | def main(): 69 | running_reward = 10 70 | 
for i_episode in count(1): 71 | state, ep_reward = env.reset(), 0 72 | for t in range(1, 20000): # Don't infinite loop while learning 73 | action = select_action(state) 74 | state, reward, done, _ = env.step(action) 75 | # if True or args.render: 76 | # env.render() 77 | policy.rewards.append(reward) 78 | ep_reward += reward 79 | if done: 80 | break 81 | 82 | running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward 83 | finish_episode() 84 | # if i_episode % args.log_interval == 0: 85 | if i_episode % 1 == 0: 86 | print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( 87 | i_episode, ep_reward, running_reward)) 88 | 89 | if i_episode % 10 ==0: 90 | torch.save(policy,"pg_policy.trl") 91 | if running_reward > env.spec.reward_threshold: 92 | print("Solved! Running reward is now {} and " 93 | "the last episode runs to {} time steps!".format(running_reward, t)) 94 | break 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /QLearning/LunarLander_QL.py: -------------------------------------------------------------------------------- 1 | ''' 2 | LunarLander-v2 solution by Michel Aka 3 | https://github.com/FitMachineLearning/FitML/ 4 | https://www.youtube.com/channel/UCi7_WxajoowBl4_9P0DhzzA/featured 5 | Using Modified Q Learning, Bellman, Reinforcement Learning, RL memory 6 | 7 | ''' 8 | import numpy as np 9 | import keras 10 | import gym 11 | import os 12 | import h5py 13 | 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout 16 | from keras.layers import Embedding 17 | from keras import optimizers 18 | 19 | 20 | num_env_variables = 8 21 | num_env_actions = 4 22 | num_initial_observation = 0 23 | learning_rate = 0.001 24 | weigths_filename = "LL-QL-v2-weights.h5" 25 | 26 | b_discount = 0.98 27 | max_memory_len = 60000 28 | starting_explore_prob = 0.05 29 | training_epochs = 2 30 | load_previous_weights = True 31 | observe_and_train = True 32 | save_weights = True 33 | num_games_to_play = 50 34 | 35 | 36 | #One hot encoding array 37 | possible_actions = np.arange(0,num_env_actions) 38 | actions_1_hot = np.zeros((num_env_actions,num_env_actions)) 39 | actions_1_hot[np.arange(num_env_actions),possible_actions] = 1 40 | 41 | #Create testing enviroment 42 | env = gym.make('LunarLander-v2') 43 | env.reset() 44 | 45 | #initialize training matrix with random states and actions 46 | dataX = np.random.random(( 5,num_env_variables+num_env_actions )) 47 | #Only one output for the total score 48 | dataY = np.random.random((5,1)) 49 | 50 | 51 | 52 | #nitialize the Neural Network with random weights 53 | 54 | model = Sequential() 55 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 56 | model.add(Dense(512, activation='relu', input_dim=dataX.shape[1])) 57 | model.add(Dense(dataY.shape[1])) 58 | 59 | opt = optimizers.adam(lr=learning_rate) 60 | 61 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 62 | 63 | #load previous model weights if they exist 64 | if load_previous_weights: 65 | dir_path = os.path.realpath(".") 66 | fn = dir_path + "/"+weigths_filename 67 | print("filepath ", fn) 68 | if os.path.isfile(fn): 69 | print("loading weights") 70 | model.load_weights(weigths_filename) 71 | else: 72 | print("File ",weigths_filename," does not exis. Retraining... 
") 73 | 74 | #Initialize training data array 75 | total_steps = 0 76 | dataX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 77 | dataY = np.zeros(shape=(1,1)) 78 | 79 | #Initialize Memory Array data array 80 | memoryX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 81 | memoryY = np.zeros(shape=(1,1)) 82 | 83 | 84 | print("dataX shape", dataX.shape) 85 | print("dataY shape", dataY.shape) 86 | 87 | 88 | #This function predicts the reward that will result from taking an "action" at a state "qstate" 89 | def predictTotalRewards(qstate, action): 90 | qs_a = np.concatenate((qstate,actions_1_hot[action]), axis=0) 91 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 92 | predX[0] = qs_a 93 | 94 | #print("trying to predict reward at qs_a", predX[0]) 95 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 96 | remembered_total_reward = pred[0][0] 97 | return remembered_total_reward 98 | 99 | 100 | 101 | if observe_and_train: 102 | 103 | #Play the game a determine number of times 104 | for game in range(num_games_to_play): 105 | gameX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 106 | gameY = np.zeros(shape=(1,1)) 107 | #Get the initial Q state 108 | qs = env.reset() 109 | for step in range (40000): 110 | 111 | #Learn from observation and not playing 112 | if game < num_initial_observation: 113 | #take a radmon action 114 | a = env.action_space.sample() 115 | else: 116 | #Now playing and also learning from experience during play 117 | 118 | #Calculate probability to take deterministic action vs random action (epsilon) 119 | prob = np.random.rand(1) 120 | explore_prob = starting_explore_prob-(starting_explore_prob/num_games_to_play)*game 121 | 122 | #Chose between prediction and chance 123 | if prob < explore_prob: 124 | #take a random action 125 | a=env.action_space.sample() 126 | #print("taking random action",a, "at total_steps" , total_steps) 127 | #print("prob ", prob, "explore_prob", explore_prob) 128 | 129 | else: 130 | ##chose an action by estimating the function-estimator remembered consequences of all possible actions 131 | ## Bellman states that the best policy (i.e. 
action) is the one that maximizez expected rewards for future states 132 | ## to caculate rewards we compute the reward a this state t + the discounted (b_discount) reward at all possible state t+1 133 | ## all states t+1 are estimated by our function estimator (our Neural Network) 134 | 135 | 136 | utility_possible_actions = np.zeros(shape=(num_env_actions)) 137 | 138 | utility_possible_actions[0] = predictTotalRewards(qs,0) 139 | utility_possible_actions[1] = predictTotalRewards(qs,1) 140 | utility_possible_actions[2] = predictTotalRewards(qs,2) 141 | utility_possible_actions[3] = predictTotalRewards(qs,3) 142 | 143 | 144 | #chose argmax action of estimated anticipated rewards 145 | #print("utility_possible_actions ",utility_possible_actions) 146 | #print("argmax of utitity", np.argmax(utility_possible_actions)) 147 | a = np.argmax(utility_possible_actions) 148 | 149 | 150 | 151 | env.render() 152 | qs_a = np.concatenate((qs,actions_1_hot[a]), axis=0) 153 | 154 | #print("action",a," qs_a",qs_a) 155 | #Perform the optimal action and get the target state and reward 156 | s,r,done,info = env.step(a) 157 | 158 | 159 | #record information for training and memory 160 | if step ==0: 161 | gameX[0] = qs_a 162 | gameY[0] = np.array([r]) 163 | memoryX[0] = qs_a 164 | memoryY[0] = np.array([r]) 165 | 166 | gameX = np.vstack((gameX,qs_a)) 167 | gameY = np.vstack((gameY,np.array([r]))) 168 | 169 | 170 | if done : 171 | #GAME ENDED 172 | #Calculate Q values from end to start of game (From last step to first) 173 | for i in range(0,gameY.shape[0]): 174 | #print("Updating total_reward at game epoch ",(gameY.shape[0]-1) - i) 175 | if i==0: 176 | #print("reward at the last step ",gameY[(gameY.shape[0]-1)-i][0]) 177 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0] 178 | else: 179 | #print("local error before Bellman", gameY[(gameY.shape[0]-1)-i][0],"Next error ", gameY[(gameY.shape[0]-1)-i+1][0]) 180 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0]+b_discount*gameY[(gameY.shape[0]-1)-i+1][0] 181 | #print("reward at step",i,"away from the end is",gameY[(gameY.shape[0]-1)-i][0]) 182 | if i==gameY.shape[0]-1: 183 | print("Training Game #",game, " steps = ", step ,"last reward", r," finished with headscore ", gameY[(gameY.shape[0]-1)-i][0]) 184 | 185 | if memoryX.shape[0] ==1: 186 | memoryX = gameX 187 | memoryY = gameY 188 | else: 189 | #Add experience to memory 190 | memoryX = np.concatenate((memoryX,gameX),axis=0) 191 | memoryY = np.concatenate((memoryY,gameY),axis=0) 192 | 193 | #if memory is full remove first element 194 | if np.alen(memoryX) >= max_memory_len: 195 | #print("memory full. 
mem len ", np.alen(memoryX)) 196 | for l in range(np.alen(gameX)): 197 | memoryX = np.delete(memoryX, 0, axis=0) 198 | memoryY = np.delete(memoryY, 0, axis=0) 199 | 200 | #Update the states 201 | qs=s 202 | 203 | #Retrain every X game after num_initial_observation 204 | if done and game >= num_initial_observation: 205 | if game%10 == 0: 206 | print("Training game# ", game,"momory size", memoryX.shape[0]) 207 | model.fit(memoryX,memoryY, batch_size=32,epochs=training_epochs,verbose=2) 208 | 209 | if done: 210 | if r >= 0 and r <99: 211 | print("Game ",game," ended with positive reward ") 212 | if r > 50: 213 | print("Game ", game," WON *** " ) 214 | #Game ended - Break 215 | break 216 | 217 | 218 | 219 | 220 | 221 | if save_weights: 222 | #Save model 223 | print("Saving weights") 224 | model.save_weights(weigths_filename) 225 | -------------------------------------------------------------------------------- /QLearning/LunarLander_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | LunarLander-v2 solution by Michel Aka 3 | https://github.com/FitMachineLearning/FitML/ 4 | https://www.youtube.com/channel/UCi7_WxajoowBl4_9P0DhzzA/featured 5 | Using Modified Q Learning, Bellman, Reinforcement Learning, RL memory 6 | 7 | ''' 8 | import numpy as np 9 | import keras 10 | import gym 11 | import os 12 | import h5py 13 | 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout 16 | from keras.layers import Embedding 17 | from keras import optimizers 18 | 19 | 20 | num_env_variables = 8 21 | num_env_actions = 4 22 | num_initial_observation = 15 23 | learning_rate = 0.003 24 | weigths_filename = "LL-QL-v2-weights.h5" 25 | 26 | b_discount = 0.99 27 | max_memory_len = 60000 28 | starting_explore_prob = 0.05 29 | training_epochs = 3 30 | load_previous_weights = True 31 | observe_and_train = True 32 | save_weights = True 33 | num_games_to_play = 1000 34 | 35 | 36 | #One hot encoding array 37 | possible_actions = np.arange(0,num_env_actions) 38 | actions_1_hot = np.zeros((num_env_actions,num_env_actions)) 39 | actions_1_hot[np.arange(num_env_actions),possible_actions] = 1 40 | 41 | #Create testing enviroment 42 | env = gym.make('LunarLander-v2') 43 | env.reset() 44 | 45 | #initialize training matrix with random states and actions 46 | dataX = np.random.random(( 5,num_env_variables+num_env_actions )) 47 | #Only one output for the total score 48 | dataY = np.random.random((5,1)) 49 | 50 | 51 | 52 | #nitialize the Neural Network with random weights 53 | 54 | model = Sequential() 55 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 56 | model.add(Dense(512, activation='relu', input_dim=dataX.shape[1])) 57 | model.add(Dense(256, activation='relu' )) 58 | model.add(Dense(256, activation='relu')) 59 | model.add(Dense(dataY.shape[1])) 60 | 61 | opt = optimizers.adam(lr=learning_rate) 62 | 63 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 64 | 65 | #load previous model weights if they exist 66 | if load_previous_weights: 67 | dir_path = os.path.realpath(".") 68 | fn = dir_path + "/"+weigths_filename 69 | print("filepath ", fn) 70 | if os.path.isfile(fn): 71 | print("loading weights") 72 | model.load_weights(weigths_filename) 73 | else: 74 | print("File ",weigths_filename," does not exis. Retraining... 
") 75 | 76 | #Initialize training data array 77 | total_steps = 0 78 | dataX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 79 | dataY = np.zeros(shape=(1,1)) 80 | 81 | #Initialize Memory Array data array 82 | memoryX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 83 | memoryY = np.zeros(shape=(1,1)) 84 | 85 | 86 | print("dataX shape", dataX.shape) 87 | print("dataY shape", dataY.shape) 88 | 89 | 90 | #This function predicts the reward that will result from taking an "action" at a state "qstate" 91 | def predictTotalRewards(qstate, action): 92 | qs_a = np.concatenate((qstate,actions_1_hot[action]), axis=0) 93 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 94 | predX[0] = qs_a 95 | 96 | #print("trying to predict reward at qs_a", predX[0]) 97 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 98 | remembered_total_reward = pred[0][0] 99 | return remembered_total_reward 100 | 101 | 102 | 103 | if observe_and_train: 104 | 105 | #Play the game a determine number of times 106 | for game in range(num_games_to_play): 107 | gameX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 108 | gameY = np.zeros(shape=(1,1)) 109 | #Get the initial Q state 110 | qs = env.reset() 111 | for step in range (40000): 112 | 113 | #Learn from observation and not playing 114 | if game < num_initial_observation: 115 | #take a radmon action 116 | a = env.action_space.sample() 117 | else: 118 | #Now playing and also learning from experience during play 119 | 120 | #Calculate probability to take deterministic action vs random action (epsilon) 121 | prob = np.random.rand(1) 122 | explore_prob = starting_explore_prob-(starting_explore_prob/num_games_to_play)*game 123 | 124 | #Chose between prediction and chance 125 | if prob < explore_prob: 126 | #take a random action 127 | a=env.action_space.sample() 128 | #print("taking random action",a, "at total_steps" , total_steps) 129 | #print("prob ", prob, "explore_prob", explore_prob) 130 | 131 | else: 132 | ##chose an action by estimating the function-estimator remembered consequences of all possible actions 133 | ## Bellman states that the best policy (i.e. 
action) is the one that maximizez expected rewards for future states 134 | ## to caculate rewards we compute the reward a this state t + the discounted (b_discount) reward at all possible state t+1 135 | ## all states t+1 are estimated by our function estimator (our Neural Network) 136 | 137 | 138 | utility_possible_actions = np.zeros(shape=(num_env_actions)) 139 | 140 | utility_possible_actions[0] = predictTotalRewards(qs,0) 141 | utility_possible_actions[1] = predictTotalRewards(qs,1) 142 | utility_possible_actions[2] = predictTotalRewards(qs,2) 143 | utility_possible_actions[3] = predictTotalRewards(qs,3) 144 | 145 | 146 | #chose argmax action of estimated anticipated rewards 147 | #print("utility_possible_actions ",utility_possible_actions) 148 | #print("argmax of utitity", np.argmax(utility_possible_actions)) 149 | a = np.argmax(utility_possible_actions) 150 | 151 | 152 | 153 | env.render() 154 | qs_a = np.concatenate((qs,actions_1_hot[a]), axis=0) 155 | 156 | #print("action",a," qs_a",qs_a) 157 | #Perform the optimal action and get the target state and reward 158 | s,r,done,info = env.step(a) 159 | 160 | 161 | #record information for training and memory 162 | if step ==0: 163 | gameX[0] = qs_a 164 | gameY[0] = np.array([r]) 165 | memoryX[0] = qs_a 166 | memoryY[0] = np.array([r]) 167 | 168 | gameX = np.vstack((gameX,qs_a)) 169 | gameY = np.vstack((gameY,np.array([r]))) 170 | 171 | 172 | if done : 173 | #GAME ENDED 174 | #Calculate Q values from end to start of game (From last step to first) 175 | for i in range(0,gameY.shape[0]): 176 | #print("Updating total_reward at game epoch ",(gameY.shape[0]-1) - i) 177 | if i==0: 178 | #print("reward at the last step ",gameY[(gameY.shape[0]-1)-i][0]) 179 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0] 180 | else: 181 | #print("local error before Bellman", gameY[(gameY.shape[0]-1)-i][0],"Next error ", gameY[(gameY.shape[0]-1)-i+1][0]) 182 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0]+b_discount*gameY[(gameY.shape[0]-1)-i+1][0] 183 | #print("reward at step",i,"away from the end is",gameY[(gameY.shape[0]-1)-i][0]) 184 | if i==gameY.shape[0]-1 and game%5==0: 185 | print("Training Game #",game, " steps = ", step ,"last reward", r," finished with headscore ", gameY[(gameY.shape[0]-1)-i][0]) 186 | 187 | if memoryX.shape[0] ==1: 188 | memoryX = gameX 189 | memoryY = gameY 190 | else: 191 | #Add experience to memory 192 | memoryX = np.concatenate((memoryX,gameX),axis=0) 193 | memoryY = np.concatenate((memoryY,gameY),axis=0) 194 | 195 | #if memory is full remove first element 196 | if np.alen(memoryX) >= max_memory_len: 197 | #print("memory full. 
mem len ", np.alen(memoryX)) 198 | for l in range(np.alen(gameX)): 199 | memoryX = np.delete(memoryX, 0, axis=0) 200 | memoryY = np.delete(memoryY, 0, axis=0) 201 | 202 | #Update the states 203 | qs=s 204 | 205 | #Retrain every X game after num_initial_observation 206 | if done and game >= num_initial_observation: 207 | if game%10 == 0: 208 | print("Training game# ", game,"momory size", memoryX.shape[0]) 209 | model.fit(memoryX,memoryY, batch_size=256,nb_epoch=training_epochs,verbose=0) 210 | 211 | if done: 212 | if r >= 0 and r <99: 213 | print("Game ",game," ended with positive reward ") 214 | if r > 50: 215 | print("Game ", game," WON *** " ) 216 | #Game ended - Break 217 | break 218 | 219 | 220 | 221 | 222 | 223 | if save_weights: 224 | #Save model 225 | print("Saving weights") 226 | model.save_weights(weigths_filename) 227 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FitML 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 14 | ### What is Fit ML 15 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 16 | 17 | ### Who is Michel Aka 18 | *Michel is an AI researcher and a graduate from University of Montreal who currently works in the Healthcare industry.* 19 | 20 | ### How to use for Reinforcement Learning Algorithm 21 | - (Optional) Clone the repo 22 | - Select the algorithm that you need (folders are named by RL algorithm): Policy Gradient / Parameter Noising / Actor Critic / Selective Memory 23 | - Get an instance of the algorithm with the environment you need. If the one you are looking for isn't there, get any environment.py file from the algorithm folder of choice and follow the steps below. 24 | - Install the dependencies 25 | - - Usually "pip install ". Example "pip install pygal" 26 | - Replace the name of the environment in line 81 of the code. 27 | ```Python 28 | env = gym.make('BipedalWalker-v2') 29 | # replace with 30 | env = gym.make('') 31 | ``` 32 | or set the ```ENVIRONMENT_NAME =``` to your environment name. Example ```ENVIRONMENT_NAME = "BipedalWalker-v2"```. 33 | 34 | - Set the environment's observation and action space variables. If you don't know them, run the script once and they will be printed in the first lines of your output. 35 | ```Python 36 | num_env_variables = 37 | num_env_actions = 38 | ``` 39 | - (Optional) You can check the results of your agent as it progresses with the .svg file in the same directory as your script. Any modern browser can view them. 40 | 41 | ### RL Approaches 42 | 43 | #### Optimal Policy Tree Search 44 | 45 | This is an RL technique characterized by computing the estimated value of the expected sum of rewards for n time steps ahead. This technique has the advantage of yielding a better estimate of the value of a specific policy; however, it is computationally expensive and memory inefficient. If one had a supercomputer and a very large amount of memory, this technique would do extremely well on discrete action space problems/environments. I believe AlphaGo uses a variant of this technique. 46 | 47 | See examples and find out more about Optimal Policy Tree Search here . 48 | 49 | #### Selective Memory 50 | 51 | As far as I know, I haven't seen anyone in the literature implement this technique before. 52 | 53 | The intuition behind Policy Gradient is that it optimizes the parameters of the network in the direction of higher expected sum of rewards. What if we could do the same in a computationally more effective way that also turns out to be more intuitive: enter what I am calling Selective Memory. 54 | 55 | We choose what to commit to memory based on the actual sum of rewards. 56 | 57 | Find out more here . 58 | 59 | 60 | #### Q-Learning 61 | 62 | Q-Learning is a well known Reinforcement Learning approach, popularized by Google DeepMind when they used it to master multiple early console era games. Q-Learning focuses on estimating the expected sum of rewards using the Bellman equation in order to determine which action to take. Q-Learning works especially well in discrete action spaces and on problems where the mapping *f(S)->Q* is differentiable, which is not always the case. 63 | 64 | Find out more about Q-Learning here .
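To make the Bellman idea concrete, here is a minimal sketch of the two pieces involved: the target for the expected sum of rewards, and the greedy action choice over a learned estimator. The `q_estimate` callable, `gamma` and `num_actions` names are illustrative placeholders, not the exact identifiers used in the scripts of this repository:

```python
import numpy as np

def bellman_target(reward, next_state, q_estimate, num_actions, gamma=0.98, done=False):
    # Q-Learning target: r + gamma * max_a' Q(s', a'), with no bootstrapping on terminal states.
    if done:
        return reward
    return reward + gamma * max(q_estimate(next_state, a) for a in range(num_actions))

def greedy_action(state, q_estimate, num_actions):
    # Act by picking the action with the highest estimated expected sum of rewards.
    utilities = [q_estimate(state, a) for a in range(num_actions)]
    return int(np.argmax(utilities))
```

In the scripts of this repository the estimator is typically a small neural network that takes the state concatenated with a one-hot encoded action and predicts the discounted return.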
65 | 66 | 67 | #### Actor-Critic Approaches 68 | 69 | Actor-Critic is an RL technique which combines the Policy Gradient approach with a Critic (a Q-value estimator). 70 | 71 | Find out more about Actor-Critic here . 72 | 73 | ### Recommended Progression for the Newcomer 74 | 75 | [coming soon] 76 | 77 | ### 78 | 79 | 80 | -------------------------------------------------------------------------------- /SelectiveMemory/README.md: -------------------------------------------------------------------------------- 1 | # FitML 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | https://youtu.be/hKrFFeZqq3E 13 | 14 | #### How does Selective Memory work? 15 | 16 | The intuition behind Policy Gradient is that it optimizes the parameters of the network in the direction of higher expected sum of rewards. What if we could do the same in a computationally more effective way that also turns out to be more intuitive: enter what I am calling Selective Memory. 17 | 18 | 1) Our objective here is to ensure that the Policy function converges towards higher rewards. 19 | 20 | 2) We know that Neural Networks will converge towards the assigned labels of our data set and will also generalize (function approximation). 21 | 22 | 3) What if there was a way to select our training (reinforcement) data set so that it ensures that we converge towards our objective: higher expected rewards. 23 | 24 | Here we propose the approach of selectively remembering actions based on how high a reward was. In other words, the probability *P* of recording an action state into memory (or a rollout) is dependent on the actual sum of rewards yielded by this action trajectory. (Notice that we are not using the expected sum of rewards here but the actual computed value at the end of the rollout.) 25 | 26 | What does this look like in code? 27 | 28 | First we create our function approximator Neural Networks: 29 | ```python 30 | #initialize the Reward predictor model 31 | model = Sequential() 32 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 33 | model.add(Dense(1024, activation='relu', input_dim=dataX.shape[1])) 34 | model.add(Dense(256, activation='tanh')) 35 | model.add(Dense(dataY.shape[1])) 36 | opt = optimizers.adam(lr=learning_rate) 37 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 38 | 39 | 40 | #initialize the action predictor model 41 | action_predictor_model = Sequential() 42 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 43 | action_predictor_model.add(Dense(1024, activation='relu', input_dim=apdataX.shape[1])) 44 | action_predictor_model.add(Dense(512, activation='relu')) 45 | action_predictor_model.add(Dense(apdataY.shape[1],activation='tanh')) 46 | ``` 47 | 48 | Then we calculate the discounted sum of rewards at the end of each rollout using the Bellman equation (a minimal sketch of this backward pass is shown below). 49 | 50 | Then we carefully select what we want to remember, i.e. store in memory. 51 | 52 | There are a number of approaches we have used to discriminate on the nature of the State-Actions or State-Action-Rewards that we will be keeping in memory to train our Actor. One discriminates on each individual action state, the other discriminates on an entire rollout batch. Regardless, the principle is the same. We determine how good an action is compared to the average remembered good actions.
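As a reference, here is a minimal sketch of the backward pass mentioned above. The names `gameR` (the per-step rewards collected during the rollout) and `b_discount` (the discount factor) mirror variables used in the scripts, but this snippet is illustrative rather than a copy of any one file:

```python
import numpy as np

def discounted_rollout_returns(gameR, b_discount=0.98):
    # Walk backwards from the last step so that each entry becomes
    # r[t] + b_discount * R[t+1], i.e. the actual discounted sum of rewards.
    returns = np.array(gameR, dtype=np.float64).flatten()
    for t in reversed(range(len(returns) - 1)):
        returns[t] = returns[t] + b_discount * returns[t + 1]
    return returns
```

The per-step selection function that decides whether a given step is committed to memory looks like this: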
53 | 54 | ```python 55 | def addToMemory(reward,averageReward): 56 | 57 | prob = 0.1 58 | if( reward > averageReward): 59 | prob = prob + 0.9 * math.tanh(reward - averageReward) 60 | else: 61 | prob = prob + 0.1 * math.tanh(reward - averageReward) 62 | 63 | if np.random.rand(1)<=prob : 64 | print("Adding reward",reward," based on prob ", prob) 65 | return True 66 | else: 67 | return False 68 | ``` 69 | 70 | ```python 71 | for i in range(0,gameR.shape[0]): 72 | if addToMemory(gameR[i][0],averageReward): 73 | tempGameSA = np.vstack((tempGameSA, gameSA[i])) 74 | tempGameA = np.vstack((tempGameA,gameA[i])) 75 | tempGameR = np.vstack((tempGameR,gameR[i])) 76 | tempGameS = np.vstack((tempGameS,gameS[i])) 77 | ``` 78 | 79 | Here gameSA, gameA, gameR and gameS represent the State-Action pairs, Actions, actual discounted sums of rewards and States respectively. 80 | 81 | When we get a new state we then act based on the optimal policy, which has been trained on memory primed with only the best reward-yielding actions. 82 | ```python 83 | #Get remembered optimal policy 84 | remembered_optimal_policy = GetRememberedOptimalPolicy(qs) 85 | a = remembered_optimal_policy 86 | ``` 87 | 88 | ### What type of results do we get? 89 | Our agent is able to crawl, stand up, walk, run and jump after 500 episodes in the famous OpenAI BipedalWalker test. After 3000 iterations, our agent is able to advance fast and be very stable on its feet. 90 | You can watch it in action here: https://youtu.be/hKrFFeZqq3E. 91 | 92 | 93 | ### What is Fit ML 94 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 95 | 96 | ### Who is Michel Aka 97 | *Michel is an AI researcher and a graduate from University of Montreal who currently works in the Healthcare industry.* 98 | -------------------------------------------------------------------------------- /SimpleNN/MinFinder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # # Annotations for the Sirajology Python NN Example 4 | # 5 | # This code comes from a demo NN program from the YouTube video https://youtu.be/h3l4qz76JhQ. The original demo creates a neural network that simulates the exclusive OR function with two inputs and one output; this version has been adapted to classify the sonar.csv data set (60 inputs, one binary output). 6 | # 7 | # 8 | 9 | # In[23]: 10 | 11 | import numpy as np # Note: there is a typo on this line in the video 12 | import pandas as pd 13 | import os 14 | 15 | 16 | # The following is a function definition of the sigmoid function, which is the type of non-linearity chosen for this neural net. It is not the only type of non-linearity that can be chosen, but it has nice analytical features and is easy to teach with. In practice, large-scale deep learning systems use piecewise-linear functions because they are much less expensive to evaluate. 17 | # 18 | # The implementation of this function does double duty. If the deriv=True flag is passed in, the function instead calculates the derivative of the function, which is used in the error backpropagation step.
19 | 20 | # In[24]: 21 | 22 | 23 | def nonlin(x, deriv=False): # Note: there is a typo on this line in the video 24 | if(deriv==True): 25 | return (x*(1-x)) 26 | 27 | return 1/(1+np.exp(-x)) # Note: there is a typo on this line in the video 28 | 29 | 30 | # The following code loads the input matrix from the sonar data set; the hard-coded XOR inputs from the original video are kept further below as comments. 31 | 32 | # In[25]: 33 | 34 | def read_inputs_and_outputs(): 35 | fullDataSet = pd.read_csv(dir_path + "/sonar.csv") 36 | #load input data 37 | X = fullDataSet[fullDataSet.columns[0:60]].values 38 | Y_ = fullDataSet[fullDataSet.columns[60]] 39 | Y = np.zeros((X.shape[0],1)) 40 | 41 | print("X.shape", X.shape) 42 | print("Y.shape", Y.shape) 43 | print(Y[0:2]) 44 | 45 | for i in range(X.shape[0]): 46 | if Y_[i] == 'R': 47 | Y[i,0] = 1 48 | else: 49 | Y[i,0] = 0 50 | 51 | return (X, Y) 52 | 53 | 54 | dir_path = os.path.dirname(os.path.realpath(__file__)) 55 | 56 | X, Y = read_inputs_and_outputs() 57 | #input data 58 | #X = np.array([[0,0,1], # Note: there is a typo on this line in the video 59 | # [0,1,1], 60 | # [1,0,1], 61 | # [1,1,1]]) 62 | 63 | 64 | # The hard-coded output of the original exclusive OR example is kept below as a comment. 65 | 66 | # In[26]: 67 | 68 | #output data 69 | #y = np.array([[0], 70 | # [1], 71 | # [1], 72 | # [0]]) 73 | 74 | 75 | # The seed for the random generator is set so that it will return the same random numbers each time, which is sometimes useful for debugging. 76 | 77 | # In[27]: 78 | 79 | np.random.seed(6) 80 | 81 | 82 | # Now we initialize the weights to random values. syn0 are the weights between the input layer and the first hidden layer (60x60 here), syn1 are the weights between the two hidden layers (60x60), and syn2 are the weights between the second hidden layer and the output (60x1). Note that there is no bias term feeding the output layer in this example. The weights are initially generated randomly because optimization tends not to work well when all the weights start at the same value. Note that neither of the neural networks shown in the video describes this example. 83 | 84 | #Declare constants 85 | 86 | num_nodes_input_layer = 60 87 | num_nodes_hl1 = 60 88 | num_nodes_hl2 = 60 89 | num_nodes_output_layer = 1 90 | learning_rate = 0.005 91 | 92 | 93 | # In[28]: 94 | 95 | #synapses 96 | syn0 = 2*np.random.random((num_nodes_input_layer,num_nodes_hl1)) - 1 # input layer x first hidden layer weights (60x60) 97 | syn1 = 2*np.random.random((num_nodes_hl1, num_nodes_hl2)) - 1 # first hidden layer x second hidden layer weights (60x60) 98 | syn2 = 2*np.random.random((num_nodes_hl2,num_nodes_output_layer)) -1 # second hidden layer x output weights (60x1) 99 | 100 | print("syn2 = ", syn2.shape) 101 | print(syn2[0:10]) 102 | 103 | 104 | # This is the main training loop. The output shows the evolution of the error between the model output and the desired output. The error steadily decreases. 105 | 106 | # In[29]: 107 | 108 | #training step 109 | # Python2 Note: In the following command, you may improve 110 | # performance by replacing 'range' with 'xrange'. 111 | for j in range(3000): 112 | 113 | # Calculate forward through the network. 114 | l0 = X 115 | l1 = nonlin(np.dot(l0, syn0)) 116 | l2 = nonlin(np.dot(l1, syn1)) 117 | l_out = nonlin(np.dot(l2, syn2)) 118 | 119 | 120 | 121 | # Back propagation of errors using the chain rule.
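# The next few lines spell that out: l_out_error is the gap between the labels Y and the
# network output, each *_delta is a layer's error scaled by the sigmoid derivative
# (nonlin(..., deriv=True)), and each error is pushed back through the transposed
# weight matrix of the layer above before the weights are updated.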
122 | l_out_error = Y - l_out 123 | cost = np.sum(l_out_error**2)/2 124 | 125 | if j == 0 : 126 | print("layer out output", l_out.shape) 127 | print(l_out) 128 | 129 | print("iteration ", j, " overall cost = ", cost) 130 | 131 | 132 | if j == 0 or j==1: # Only print the error on the first couple of iterations, to save time and limit the amount of output. 133 | print("layer error", l_out_error) 134 | print(l_out_error) 135 | 136 | 137 | 138 | if j == 0 or j==1: # Only print the error on the first couple of iterations, to save time and limit the amount of output. 139 | print("Error: " + str(np.mean(np.abs(l_out_error)))) 140 | 141 | l_out_delta = l_out_error*nonlin(l_out, deriv=True) 142 | if j == 0 or j==1 : 143 | print("layer 1 output", l1.shape) 144 | #print(l1) 145 | 146 | 147 | l2_error = l_out_delta.dot(syn2.T) 148 | l2_delta = l2_error * nonlin(l2,deriv=True) 149 | if j == 0 or j==1 : 150 | print("l2 ") 151 | #print(l2) 152 | 153 | 154 | 155 | l1_error = l2_delta.dot(syn1.T) 156 | 157 | l1_delta = l1_error * nonlin(l1,deriv=True) 158 | 159 | #update weights, scaled by the learning rate 160 | syn2 += learning_rate * l2.T.dot(l_out_delta) 161 | syn1 += learning_rate * l1.T.dot(l2_delta) 162 | syn0 += learning_rate * l0.T.dot(l1_delta) 163 | 164 | print("Output after training") 165 | print(l_out[80:120]) 166 | 167 | 168 | 169 | 170 | # See how the final output closely approximates the true labels. If you increase the number of iterations in the training loop (currently 3000), the final output will be even closer. 171 | 172 | # In[30]: 173 | 174 | #get_ipython().run_cell_magic(u'HTML', u'', u'#The following line is for embedding the YouTube video \n# in this Jupyter Notebook. You may remove it without peril. \n') 175 | 176 | 177 | # In[ ]: 178 | -------------------------------------------------------------------------------- /SimpleNN/data-01-test-score.csv: -------------------------------------------------------------------------------- 1 | 73,80,75,152 2 | 93,88,93,185 3 | 89,91,90,180 4 | 96,98,100,196 5 | 73,66,70,142 6 | 53,46,55,101 7 | 69,74,77,149 8 | 47,56,60,115 9 | 87,79,90,175 10 | 79,70,88,164 11 | 69,70,73,141 12 | 70,65,74,141 13 | 93,95,91,184 14 | 79,80,73,152 15 | 70,73,78,148 16 | 93,89,96,192 17 | 78,75,68,147 18 | 81,90,93,183 19 | 88,92,86,177 20 | 78,83,77,159 21 | 82,86,90,177 22 | 86,82,89,175 23 | 78,83,85,175 24 | 76,83,71,149 25 | 96,93,95,192 26 | -------------------------------------------------------------------------------- /Stable_baselines3/Readme.md: -------------------------------------------------------------------------------- 1 | Simple implementation of Stable Baselines3 algorithms.
2 | 3 | **To train:** 4 | 5 | *python ppo_main2.py* 6 | 7 | This will save the model in the local directory. 8 | 9 | **To view the agent:** 10 | 11 | *python ppo_load.py* 12 | -------------------------------------------------------------------------------- /Stable_baselines3/ppo_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pybullet, pybullet_envs 3 | import torch as th 4 | 5 | from stable_baselines3 import PPO 6 | from stable_baselines3.common.evaluation import evaluate_policy 7 | 8 | # Create environment 9 | # env = gym.make('LunarLanderContinuous-v2') 10 | 11 | env = gym.make('BipedalWalker-v3') 12 | env.render(mode="human") 13 | 14 | policy_kwargs = dict(activation_fn=th.nn.LeakyReLU, net_arch=[512, 512]) 15 | # Instantiate the agent 16 | model = PPO('MlpPolicy', env,learning_rate=0.0003,policy_kwargs=policy_kwargs, verbose=1) 17 | 18 | del model # delete trained model to demonstrate loading 19 | # Load the trained agent 20 | model = PPO.load("ppo_Ant") 21 | 22 | # # Evaluate the agent 23 | # mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10) 24 | 25 | # Enjoy trained agent 26 | obs = env.reset() 27 | for i in range(100): 28 | dones = False 29 | game_score = 0 30 | steps = 0 31 | while not dones: 32 | action, _states = model.predict(obs, deterministic=True) 33 | obs, rewards, dones, info = env.step(action) 34 | # import ipdb;ipdb.set_trace() 35 | game_score+=rewards 36 | steps+=1 37 | env.render() 38 | print("game ", i ," steps ",steps, " game score %.3f"%game_score) 39 | obs = env.reset() 40 | # break 41 | -------------------------------------------------------------------------------- /Stable_baselines3/ppo_main2.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pybullet, pybullet_envs 3 | import torch as th 4 | 5 | from stable_baselines3 import PPO 6 | from stable_baselines3.common.evaluation import evaluate_policy 7 | 8 | 9 | # Create environment 10 | # env = gym.make('LunarLanderContinuous-v2') 11 | 12 | env = gym.make('BipedalWalker-v3') 13 | # env.render(mode="human") 14 | 15 | policy_kwargs = dict(activation_fn=th.nn.LeakyReLU, net_arch=[512, 512]) 16 | # Instantiate the agent 17 | model = PPO('MlpPolicy', env,learning_rate=0.0003,policy_kwargs=policy_kwargs, verbose=1) 18 | # Train the agent 19 | for i in range(8000): 20 | print("Training iteration ",i) 21 | model.learn(total_timesteps=10000) 22 | # Save the agent 23 | model.save("ppo_Ant") 24 | mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=5) 25 | print("mean_reward ", mean_reward) 26 | if mean_reward >= 270: 27 | print("***Agent Trained with average reward ", mean_reward) 28 | break 29 | 30 | del model # delete trained model to demonstrate loading 31 | # Load the trained agent 32 | # model = PPO.load("ppo_Ant") 33 | 34 | # Evaluate the agent 35 | # mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10) 36 | 37 | # Enjoy trained agent 38 | # obs = env.reset() 39 | # for i in range(100): 40 | # action, _states = model.predict(obs, deterministic=True) 41 | # obs, rewards, dones, info = env.step(action) 42 | # env.render() 43 | -------------------------------------------------------------------------------- /Tensorforce/Readme.md: -------------------------------------------------------------------------------- 1 | Simple implementation of RL algorithms using the Tensorforce library.
2 | 3 | This will serve as a starting point for RL beginners. 4 | 5 | **To train the model:** 6 | *python tf_LunarLanderContinuous_ppo.py* 7 | 8 | **To load the saved model:** 9 | *python tf_loader.py* 10 | 11 | Make sure you do not run the trainer and the loader at the same time (especially on GPU); this will fail. 12 | -------------------------------------------------------------------------------- /Tensorforce/tf_LunarLanderContinuous_ppo.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | # Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | 11 | # environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 12 | environment = OpenAIGym('LunarLanderContinuous-v2', visualize=False, max_episode_steps=500) 13 | # environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500) 14 | 15 | 16 | agent = Agent.create( 17 | agent='ppo', environment=environment, batch_size=10, 18 | network=[ 19 | dict(type='dense', size=64), 20 | dict(type='dense', size=64) 21 | ], 22 | learning_rate=1e-3 23 | 24 | ) 25 | 26 | 27 | running_score = 0.0 28 | # Train for up to 50000 episodes 29 | for i_epoch in range(50000): 30 | game_score = 0.0 31 | # Initialize episode 32 | states = environment.reset() 33 | terminal = False 34 | 35 | while not terminal: 36 | # Episode timestep 37 | actions = agent.act(states=states) 38 | states, terminal, reward = environment.execute(actions=actions) 39 | game_score+=reward 40 | agent.observe(terminal=terminal, reward=reward) 41 | 42 | running_score = 0.95*running_score + 0.05*game_score 43 | if i_epoch%5==0: 44 | print("Game ", i_epoch, " game score %.2f"%game_score," running score %.2f"%running_score) 45 | 46 | if i_epoch%10==0 and i_epoch>20: 47 | agent.save() 48 | if running_score >= 250: 49 | agent.save() 50 | break 51 | 52 | agent.close() 53 | environment.close() 54 | -------------------------------------------------------------------------------- /Tensorforce/tf_LunarLander_ppo.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | # Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | # Network as list of layers 11 | network_spec = [ 12 | # dict(type='dense', size=32, activation='tanh'), 13 | dict(type='dense', size=128, activation='tanh') 14 | ] 15 | 16 | # environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 17 | environment = OpenAIGym('LunarLander-v2', visualize=True, max_episode_steps=500) 18 | 19 | 20 | # Instantiate a Tensorforce agent 21 | # agent = Agent.create( 22 | # agent='tensorforce', 23 | # environment=environment, # alternatively: states, actions, (max_episode_timesteps) 24 | # memory=10000, 25 | # update=dict(unit='timesteps', batch_size=64), 26 | # optimizer=dict(type='adam', learning_rate=3e-4), 27 | # policy=dict(network='auto'), 28 | # objective='policy_gradient', 29 | # reward_estimation=dict(horizon=20) 30 | # ) 31 | 32 | agent = Agent.create( 33 | agent='ppo', environment=environment, batch_size=10, 34 | learning_rate=1e-3, 35 |
36 | ) 37 | 38 | # agent = PPOAgent( 39 | # states_spec=environment.states, 40 | # actions_spec=environment.actions, 41 | # network_spec=network_spec, 42 | # batch_size=4096, 43 | # # BatchAgent 44 | # keep_last_timestep=True, 45 | # # PPOAgent 46 | # step_optimizer=dict( 47 | # type='adam', 48 | # learning_rate=1e-3 49 | # ), 50 | # optimization_steps=10, 51 | # # Model 52 | # scope='ppo', 53 | # discount=0.99, 54 | # # DistributionModel 55 | # distributions_spec=None, 56 | # entropy_regularization=0.01, 57 | # # PGModel 58 | # baseline_mode=None, 59 | # baseline=None, 60 | # baseline_optimizer=None, 61 | # gae_lambda=None, 62 | # # PGLRModel 63 | # likelihood_ratio_clipping=0.2, 64 | # # summary_spec=None, 65 | # # distributed_spec=None 66 | # ) 67 | running_score = 0.0 68 | # Train for 3000 episodes 69 | for i_epoch in range(3000): 70 | game_score = 0.0 71 | # Initialize episode 72 | states = environment.reset() 73 | terminal = False 74 | 75 | while not terminal: 76 | # Episode timestep 77 | actions = agent.act(states=states) 78 | states, terminal, reward = environment.execute(actions=actions) 79 | game_score+=reward 80 | agent.observe(terminal=terminal, reward=reward) 81 | 82 | running_score = 0.95*running_score + 0.05*game_score 83 | if i_epoch%2==0: 84 | print("Game ", i_epoch, " game score %.2f"%game_score," running score %.2f"%running_score) 85 | 86 | 87 | agent.close() 88 | environment.close() 89 | -------------------------------------------------------------------------------- /Tensorforce/tf_loader.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | # Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | 11 | # environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 12 | environment = OpenAIGym('LunarLanderContinuous-v2', visualize=True, max_episode_steps=500) 13 | # environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500) 14 | 15 | 16 | agent = Agent.create( 17 | agent='ppo', environment=environment, batch_size=10, 18 | network=[ 19 | dict(type='dense', size=64), 20 | dict(type='dense', size=64) 21 | ], 22 | learning_rate=1e-3, 23 | name='agent_loader' 24 | 25 | ) 26 | # import ipdb;ipdb.set_trace() 27 | agent = agent.load() 28 | 29 | running_score = 0.0 30 | # Run the trained agent for up to 50000 episodes (no further training here) 31 | for i_epoch in range(50000): 32 | game_score = 0.0 33 | # Initialize episode 34 | states = environment.reset() 35 | terminal = False 36 | 37 | while not terminal: 38 | # Episode timestep 39 | actions = agent.act(states=states,evaluation=True) 40 | states, terminal, reward = environment.execute(actions=actions) 41 | game_score+=reward 42 | # agent.observe(terminal=terminal, reward=reward) 43 | 44 | running_score = 0.95*running_score + 0.05*game_score 45 | if i_epoch%5==0: 46 | print("Game ", i_epoch, " game score %.2f"%game_score," running score %.2f"%running_score) 47 | 48 | 49 | 50 | agent.close() 51 | environment.close() 52 | -------------------------------------------------------------------------------- /Tensorforce/tf_main.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | #
Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | # Network as list of layers 11 | network_spec = [ 12 | dict(type='dense', size=32, activation='tanh'), 13 | dict(type='dense', size=32, activation='tanh') 14 | ] 15 | 16 | environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 17 | 18 | 19 | # Instantiate a Tensorforce agent 20 | # agent = Agent.create( 21 | # agent='tensorforce', 22 | # environment=environment, # alternatively: states, actions, (max_episode_timesteps) 23 | # memory=10000, 24 | # update=dict(unit='timesteps', batch_size=64), 25 | # optimizer=dict(type='adam', learning_rate=3e-4), 26 | # policy=dict(network='auto'), 27 | # objective='policy_gradient', 28 | # reward_estimation=dict(horizon=20) 29 | # ) 30 | 31 | agent = Agent.create( 32 | agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3 33 | ) 34 | 35 | # agent = PPOAgent( 36 | # states_spec=environment.states, 37 | # actions_spec=environment.actions, 38 | # network_spec=network_spec, 39 | # batch_size=4096, 40 | # # BatchAgent 41 | # keep_last_timestep=True, 42 | # # PPOAgent 43 | # step_optimizer=dict( 44 | # type='adam', 45 | # learning_rate=1e-3 46 | # ), 47 | # optimization_steps=10, 48 | # # Model 49 | # scope='ppo', 50 | # discount=0.99, 51 | # # DistributionModel 52 | # distributions_spec=None, 53 | # entropy_regularization=0.01, 54 | # # PGModel 55 | # baseline_mode=None, 56 | # baseline=None, 57 | # baseline_optimizer=None, 58 | # gae_lambda=None, 59 | # # PGLRModel 60 | # likelihood_ratio_clipping=0.2, 61 | # # summary_spec=None, 62 | # # distributed_spec=None 63 | # ) 64 | 65 | # Train for 300 episodes 66 | for _ in range(300): 67 | 68 | # Initialize episode 69 | states = environment.reset() 70 | terminal = False 71 | 72 | while not terminal: 73 | # Episode timestep 74 | actions = agent.act(states=states) 75 | states, terminal, reward = environment.execute(actions=actions) 76 | agent.observe(terminal=terminal, reward=reward) 77 | 78 | agent.close() 79 | environment.close() 80 | -------------------------------------------------------------------------------- /img/DeepQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/DeepQN.png -------------------------------------------------------------------------------- /img/LunarLandQLearning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/LunarLandQLearning.png -------------------------------------------------------------------------------- /img/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /img/Screen Shot 2017-11-01 at 7.41.58 PM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/Screen Shot 2017-11-01 at 7.41.58 PM.png -------------------------------------------------------------------------------- /img/ScreenShot1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/ScreenShot1.jpg -------------------------------------------------------------------------------- /img/Walker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/Walker.png -------------------------------------------------------------------------------- /img/cCartPole.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/cCartPole.jpg -------------------------------------------------------------------------------- /img/cPong.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/cPong.jpg -------------------------------------------------------------------------------- /img/cWalker.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/cWalker.jpg --------------------------------------------------------------------------------