├── ActorCritic ├── BepedalWalker_ActorCritic.py ├── Mujoco_HalfCheetah_v1.0.py └── README.md ├── DeepDeterministicSeletiveMemory ├── AntBulletEnv.py ├── BipedalWalker_v3.0.py ├── HalfCheetah.v1.0.py ├── Hopper_v1.0.py ├── LunarLander_v1.py ├── MainAlgo_PR_v1.0.py ├── No_Exp_replay_v1.0.py ├── Q_as_discr │ ├── LunarLanderContinuous_v1.0.py │ └── Main_algo.py ├── README.md ├── ROBOTIC_Template_Experimental_v0.1.py ├── RoboschoolHalfCheetah_v1.py ├── Tensorflow │ ├── BipedalWalker_v2.0.py │ ├── Hopper_v1.py │ ├── LunarLander_v1.py │ ├── Rocker_lander.py │ └── _Main_Algo_v1.py ├── Walker2D.v2.0.py ├── Walker2DBulletEnv.py ├── _MainAlgo_v4.2.py └── test_algo.py ├── DeepQN ├── AtariSpaceInvadors_CNN.py ├── Atari_Pong_DeepQN.py ├── CartPole_QLearning.py ├── Examples │ ├── Breakout.py │ ├── Breakout2.py │ ├── PixelCopter_.py │ └── Pong.py ├── Main_Gym.py ├── Main_Gym_2_Channels.py ├── Main_Gym_Channel_Last.py ├── Main_Gym_Param_Noise.py ├── Main_Gym_v2.py └── Main_PLE.py ├── DeepQNPyTorch ├── DQN_torch.py ├── Lander_torch.py └── Readme.md ├── Experimental ├── AtariPong_v0.8.5.py ├── AtariPong_v0.8.7.py ├── AtariPong_v0.9.1.py ├── AtariPong_v0.9.2.py ├── AtariPong_v0.9.5.py ├── Atari_Pong_DeepQN.py ├── Atari_Pong_v0.4.3.py ├── Atari_pong_v0.4.1_1hot.py ├── Atari_pong_v0.4_QN.py ├── Atari_pong_v0.8.1.py ├── BipedalWalker_PG_v0.2.py ├── BipedalWalker_SM.py ├── BipedalWalker_SM_v0.7.py ├── BipedalWalker_SM_v0.8.py ├── BipedalWalker_SM_v0.9.1.py ├── BipedalWalker_SM_v0.9.py ├── BipedalWalker_SM_v2.py ├── BipedalWalker_SelectiveMemory_V5.py ├── BipedalWalker_Selective_Memory.py ├── BipedalWalker_policyGradient_v0.1.py ├── BipedalWalker_v1.0.py ├── BipedalWalker_v3.py ├── BipedalWalker_v4.py ├── CNN_test.py ├── Cartpol_CNN_RL.py ├── Cartpole_Highest_Reward_mem.py ├── Cartpole_simple_CNN.py ├── LunarLander_PG_v0.6.py ├── LunarLander_SM_v0.3.py ├── MNIST_image_Classification.py ├── Readme.md ├── Walker_A3C.py └── image_rescale.py ├── LICENSE ├── NeuroEvolution ├── Implementations │ ├── BipedalWalker.py │ ├── LunarLander.py │ ├── Pendulum.py │ ├── RS_Ant.py │ └── RS_Hopper.py ├── Lib │ └── Individual.py ├── Main_TF_0.7.py ├── Main_v4.py └── old │ ├── Main.py │ ├── Main2.py │ ├── Main3.py │ └── Main4_Multy_Gen.py ├── OptimalPolicyTreeSearch └── Cartpole_OPTS.py ├── ParameterNoising ├── AntBulletEnv.py ├── BipedalWalker_PN_v3.0.py ├── Hopper_v1.py ├── InvertedDoublePendulum.py ├── InvertedPendulum.py ├── LunarLanding_v3.0.py ├── NoisingFunction.py ├── Q_as_disc │ ├── Chettah_v1.0.py │ ├── LunarLander_v1.py │ └── _MainAlgo_Deep_Adaptive_noise_v1.py ├── Tensorflow │ ├── BipedalWalker_v1.0.py │ ├── LunarLander_Continuous.py │ ├── LunarLander_V2_wGameAdv.py │ └── Main_Algo.py ├── Test │ ├── BipedalWalker.py │ ├── DDSM_Custom_Loss.py │ ├── LunarLander_v0.4_w_adaptive_noise.py │ └── test.py ├── Walker2D.py ├── _MainAlgo.py └── _MainAlgo_Adaptive_Noise_v2.0.py ├── PolicyGradient ├── BipedalWalker_v1.0.py ├── HalfCheetah_V1.0.py ├── Hopper_v0.9.7.py └── Walker2D_v0.9.5.py ├── Pytorch ├── ActorCritic │ ├── Actor_Critic.py │ ├── Advantage_Actor_Critic.py │ ├── Load_AC_model.py │ ├── Output_noising │ │ ├── Actor_Critic_BipedalWalker.py │ │ ├── Actor_Critic_Noisy_output_Mem_fix.py │ │ ├── Readme.md │ │ └── load_AC_Model.py │ ├── Parameter_Noising │ │ ├── Advantage_Actor_Critic.py │ │ ├── Load_AC_model.py │ │ ├── Readme.md │ │ └── agent_and_model.py │ ├── Readme.md │ ├── actor01TDQN_RL_MODEL.trl │ ├── agent_and_model.py │ └── critic01TDQN_RL_MODEL.trl ├── DQN │ ├── DQN_Cartpol_old_1.py │ ├── DQN_Lander.py │ ├── 
DQN_Lander_Old_1.py │ ├── DQN_tut.py │ ├── DQN_tut_2.py │ ├── Load_Agent.py │ └── Readme.md ├── DQN_CNN │ ├── ATARI_DQN_CNN.py │ ├── Load_ATARI_AGENT.py │ ├── Readme.md │ └── agent_and_model.py ├── PPO │ ├── PPO_LunarLander.py │ └── Readme.md └── PolicyGradient │ ├── Load_model.py │ ├── Readme.md │ ├── agent_and_model.py │ └── policy_gradien_2.py ├── QLearning ├── LunarLander_QL.py └── LunarLander_v2.py ├── README.md ├── SelectiveMemory ├── Ant_SMA_V1.py ├── BipedalWalker_v5.py ├── BipedalWalker_v6.py ├── BipedalWalker_v7.py ├── CartPole_SelectiveMemory.py ├── DDSM │ └── BipedalWalker_DDSM.py ├── HalfCheetah_SMA_v1.py ├── Hopper_SMA_v1.0.py ├── Hopper_SMA_v2.0.py ├── LunarLanderContinuous_V1.py ├── LunarLander_Selective_Memory.py ├── MountainCarContinuous_SMA.py ├── MujocoHalfCheetah_v1.0.py ├── QasFeature │ ├── BipedalWalker_v3.py │ ├── BipedalWalker_v4.py │ ├── BipedalWalker_v7.py │ ├── HalfCheetah_SMQ_V1.py │ ├── Hopper_v2.0.py │ ├── LunarLanderContinuous_SMQ_v1.py │ └── LunarLander_SMQ_V1.py └── README.md ├── SimpleNN ├── MinFinder.py ├── Sonar.csv └── data-01-test-score.csv ├── SkillPolicyLearning ├── CartPole.py └── LunarLander.py ├── Stable_baselines3 ├── Readme.md ├── ppo_load.py └── ppo_main2.py ├── Tensorforce ├── Readme.md ├── tf_LunarLanderContinuous_ppo.py ├── tf_LunarLander_ppo.py ├── tf_loader.py └── tf_main.py └── img ├── DeepQN.png ├── LunarLandQLearning.png ├── Readme.md ├── Screen Shot 2017-11-01 at 7.41.58 PM.png ├── ScreenShot1.jpg ├── Walker.png ├── cCartPole.jpg ├── cPong.jpg └── cWalker.jpg /ActorCritic/README.md: -------------------------------------------------------------------------------- 1 | # Solving Bipedal Walker with Actor Critic with Python and Keras 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | ### What is Fit ML 13 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom-implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 14 | 15 | ### What is Bipedal Walker anyway? 16 | Bipedal Walker is an OpenAI Gym environment where an agent learns to control a bipedal walker in order to reach the end of an obstacle course. What makes this challenging is that: 17 | 1) The agent only receives limb coordinates along with Lidar information. 18 | 2) Actions are vectors of 4 real numbers. 19 | So our agent has to learn to balance, walk, run and jump on its own without any human intervention. 20 | 21 | ### Why Q-Learning alone doesn't work 22 | For those acquainted with Q-Learning, it quickly becomes clear that we cannot apply a greedy policy here. Relying on a Q-value function approximator and polling over a non-discrete action space, let alone a vector of continuous actions, is simply impractical. To overcome this challenge we use the Actor-Critic method, where one Neural Network approximates how good an action is, and the other learns what to do in any given situation. 23 | 24 | Let's see how this is implemented using Keras. 25 | 26 | ### Creating The Actor and the Critic 27 | Since we don't know how good an action is going to be until we have taken it, a common technique in Reinforcement Learning is to predict/approximate this with a function approximator, a.k.a. a Neural Network. We will call this first network QModel. It takes a state-action combination as input and estimates how good that action is. 28 | 29 | ```python 30 | #Initialize the Reward predictor model 31 | Qmodel = Sequential() 32 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 33 | Qmodel.add(Dense(4096, activation='tanh', input_dim=dataX.shape[1])) 34 | Qmodel.add(Dense(dataY.shape[1])) #dataY.shape[1] is 1, corresponding to the single real-valued approximation 35 | 36 | ``` 37 | 38 | We now need a way to act optimally at every state. This is where the Actor comes in: another function approximator that takes a state as input and outputs an action. 39 | 40 | ### Helper functions 41 | We then declare a set of helper functions that are going to be used to optimize our actions at every state. 42 | 43 | ```python 44 | def predictTotalRewards(qstate, action): 45 | qs_a = np.concatenate((qstate,action), axis=0) 46 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 47 | predX[0] = qs_a 48 | 49 | #print("trying to predict reward at qs_a", predX[0]) 50 | pred = Qmodel.predict(predX[0].reshape(1,predX.shape[1])) 51 | remembered_total_reward = pred[0][0] 52 | return remembered_total_reward 53 | 54 | 55 | def GetRememberedOptimalPolicy(qstate): 56 | predX = np.zeros(shape=(1,num_env_variables)) 57 | predX[0] = qstate 58 | 59 | #print("trying to predict the optimal action at qstate", predX[0]) 60 | pred = action_predictor_model.predict(predX[0].reshape(1,predX.shape[1])) 61 | r_remembered_optimal_policy = pred[0] 62 | return r_remembered_optimal_policy 63 | ``` 64 | 65 | ### Exploration 66 | As we initially have no concept of an optimal policy, we need to ensure that some actions are taken stochastically. 
This will prevent our model from stagnating in its improvement. 67 | 68 | ```python 69 | prob = np.random.rand(1) 70 | explore_prob = starting_explore_prob-(starting_explore_prob/num_games_to_play)*game 71 | 72 | #Choose between prediction and chance 73 | if prob < explore_prob: 74 | #take a random action 75 | a = env.action_space.sample() 76 | ``` 77 | 78 | 79 | #### A very good course on Actor-Critic 80 | http://mi.eng.cam.ac.uk/~mg436/LectureSlides/MLSALT7/L6.pdf 81 | 82 | ### Who is Michel Aka 83 | *Michel is an AI researcher and a graduate of the University of Montreal who currently works in the Healthcare industry.* 84 | -------------------------------------------------------------------------------- /DeepDeterministicSeletiveMemory/README.md: -------------------------------------------------------------------------------- 1 | # FitML 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | https://youtu.be/hKrFFeZqq3E 13 | 14 | ### How does Selective Memory work? 15 | 16 | The intuition behind Policy Gradient is that it optimizes the parameters of the network in the direction of a higher expected sum of rewards. What if we could do the same in a computationally more effective way that also turns out to be more intuitive? Enter what I am calling Selective Memory. 17 | 18 | 1) Our objective here is to ensure that the Policy function converges towards higher rewards. 19 | 20 | 2) We know that Neural Networks will converge towards the assigned labels of our data set and will also generalize (function approximation). 21 | 22 | 3) What if there was a way to select our training (reinforcement) data set so that it ensures that we converge towards our objective: higher expected rewards? 23 | 24 | Here we propose the approach of selectively remembering actions based on how high a reward was. In other words, the probability *P* of recording an action state (or a rollout) into memory depends on the actual sum of rewards yielded by this action trajectory. (Notice that we are not using the expected sum of rewards here but the actual value computed at the end of the rollout.) 25 | 26 | What does this look like in code? 27 | 28 | First we create our function approximators (Neural Networks): 29 | ```python 30 | #Initialize the Reward predictor model 31 | model = Sequential() 32 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 33 | model.add(Dense(1024, activation='relu', input_dim=dataX.shape[1])) 34 | model.add(Dense(256, activation='tanh')) 35 | model.add(Dense(dataY.shape[1])) 36 | opt = optimizers.adam(lr=learning_rate) 37 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 38 | 39 | 40 | #initialize the action predictor model 41 | action_predictor_model = Sequential() 42 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 43 | action_predictor_model.add(Dense(1024, activation='relu', input_dim=apdataX.shape[1])) 44 | action_predictor_model.add(Dense(512, activation='relu')) 45 | action_predictor_model.add(Dense(apdataY.shape[1],activation='tanh')) 46 | ``` 47 | 48 | Then we calculate the discounted sum of rewards at the end of each rollout using the Bellman equation. 49 | 50 | Then we carefully select what we want to remember, i.e. store in memory. 51 | 52 | There are a number of approaches we have used to discriminate on which State-Actions or State-Action-Rewards we keep in memory to train our Actor. One discriminates on each individual action state, the other on an entire rollout batch. Regardless, the principle is the same: we determine how good an action is compared to the average of the remembered good actions. 
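Before the selection step, the actual discounted return of every step in a finished rollout has to be computed. Below is a minimal, hypothetical sketch of that backward pass, not the exact code of this repository: the helper name `discounted_returns` and the variable `average_reward` are illustrative, `gameR` is assumed to hold one rollout's per-step rewards as an (n, 1) array, and `b_discount` is the discount factor.

```python
import numpy as np

def discounted_returns(gameR, b_discount=0.98):
    # Walk the finished rollout backwards, accumulating the
    # discounted sum of rewards for every step.
    returns = np.array(gameR, dtype=float)
    for i in range(returns.shape[0] - 2, -1, -1):
        returns[i][0] += b_discount * returns[i + 1][0]
    return returns

# Usage sketch: compute the returns for one rollout, then the average
# used as the comparison point for the selection rule.
# returns = discounted_returns(gameR)
# average_reward = returns.mean()
```

With these actual returns in hand, the selection rule itself can be as simple as the probability test below.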
53 | 54 | ```python 55 | def addToMemory(reward,averageReward): 56 | 57 | prob = 0.1 58 | if( reward > averageReward): 59 | prob = prob + 0.9 * math.tanh(reward - averageReward) 60 | else: 61 | prob = prob + 0.1 * math.tanh(reward - averageReward) 62 | 63 | if np.random.rand(1)<=prob : 64 | print("Adding reward",reward," based on prob ", prob) 65 | return True 66 | else: 67 | return False 68 | ``` 69 | 70 | ```python 71 | for i in range(0,gameR.shape[0]): 72 | if addToMemory(gameR[i][0], average_reward): 73 | tempGameSA = np.vstack((tempGameSA, gameSA[i])) 74 | tempGameA = np.vstack((tempGameA,gameA[i])) 75 | tempGameR = np.vstack((tempGameR,gameR[i])) 76 | tempGameS = np.vstack((tempGameS,gameS[i])) 77 | ``` 78 | 79 | Here gameSA, gameA, gameR and gameS represent the State-Action pairs, Actions, actual discounted sums of rewards and States respectively. 80 | 81 | When we get a new state, we then act based on the optimal policy, which has been trained on memory primed with only the best reward-yielding actions. 82 | ```python 83 | #Get the remembered optimal policy 84 | remembered_optimal_policy = GetRememberedOptimalPolicy(qs) 85 | a = remembered_optimal_policy 86 | ``` 87 | 88 | ### What type of results do we get? 89 | Our agent is able to crawl, stand up, walk, run and jump after 500 episodes in the famous OpenAI BipedalWalker test. After 3000 iterations, our agent is able to advance fast and is very stable on its feet. 90 | You can watch it in action here: https://youtu.be/hKrFFeZqq3E. 91 | 92 | 93 | ### What is Fit ML 94 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom-implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 
95 | 96 | ### Who is Michel Aka 97 | *Michel is an AI researcher and a graduate from University of Montreal who currently works in the Healthcare industry.* 98 | -------------------------------------------------------------------------------- /DeepQN/CartPole_QLearning.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole-v0 solution by Michel Aka 3 | https://github.com/FitMachineLearning/FitML/ 4 | https://www.youtube.com/channel/UCi7_WxajoowBl4_9P0DhzzA/featured 5 | Using DeepQ Learning 6 | 7 | ''' 8 | import numpy as np 9 | import keras 10 | import gym 11 | import os 12 | import h5py 13 | 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout 16 | from keras.layers import Embedding 17 | from keras.layers import LSTM 18 | from keras import optimizers 19 | 20 | 21 | num_env_variables = 4 22 | num_env_actions = 2 23 | num_training_exmaples = 30 24 | timesteps = 1 25 | num_initial_observation = 60 26 | learning_rate = 0.001 27 | weigths_filename = "Cartpole-weights_DQN_Mem_1Loop.h5" 28 | 29 | b_discount = 0.95 30 | max_memory_len = 5000 31 | num_failures_for_retrain = 10 32 | starting_explore_prob = 0.05 33 | initial_training_epochs = 2 34 | RL_training_eporcs = 2 35 | num_anticipation_steps = 6 36 | load_previous_weights = False 37 | observe_and_train = True 38 | Do_RL = True 39 | save_weights = True 40 | Learning_cycles = 1500 41 | 42 | 43 | #One hot encoding array 44 | possible_actions = np.arange(0,num_env_actions) 45 | actions_1_hot = np.zeros((num_env_actions,num_env_actions)) 46 | actions_1_hot[np.arange(num_env_actions),possible_actions] = 1 47 | 48 | #Create testing enviroment 49 | env = gym.make('CartPole-v0') 50 | env.reset() 51 | 52 | #initialize training matrix with random states and actions 53 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 54 | #Only one output for the total score 55 | dataY = np.random.random((num_training_exmaples,1)) 56 | 57 | 58 | 59 | #nitialize the LSTM with random weights 60 | 61 | model = Sequential() 62 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 63 | model.add(Dense(512, activation='relu', input_dim=dataX.shape[1])) 64 | model.add(Dense(dataY.shape[1])) 65 | 66 | opt = optimizers.adam(lr=learning_rate) 67 | 68 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 69 | 70 | #load previous model weights if they exist 71 | if load_previous_weights: 72 | dir_path = os.path.realpath(".") 73 | fn = dir_path + "/"+weigths_filename 74 | print("filepath ", fn) 75 | if os.path.isfile(fn): 76 | print("loading weights") 77 | model.load_weights(weigths_filename) 78 | else: 79 | print("File ",weigths_filename," does not exis. Retraining... 
") 80 | 81 | #Record first 500 in a sequence and add them to the training sequence 82 | total_steps = 0 83 | dataX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 84 | dataY = np.zeros(shape=(1,1)) 85 | 86 | memoryX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 87 | memoryY = np.zeros(shape=(1,1)) 88 | 89 | 90 | print("dataX shape", dataX.shape) 91 | print("dataY shape", dataY.shape) 92 | 93 | 94 | 95 | def predictTotalRewards(qstate, action): 96 | qs_a = np.concatenate((qstate,actions_1_hot[action]), axis=0) 97 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 98 | predX[0] = qs_a 99 | 100 | #print("trying to predict reward at qs_a", predX[0]) 101 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 102 | remembered_total_reward = pred[0][0] 103 | return remembered_total_reward 104 | 105 | 106 | 107 | if observe_and_train: 108 | #observe for 100 games 109 | 110 | 111 | for game in range(500): 112 | gameX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 113 | gameY = np.zeros(shape=(1,1)) 114 | #Get the Q state 115 | qs = env.reset() 116 | for step in range (1000): 117 | 118 | if game < num_initial_observation: 119 | #take a radmon action 120 | a = env.action_space.sample() 121 | else: 122 | prob = np.random.rand(1) 123 | explore_prob = starting_explore_prob-(starting_explore_prob/Learning_cycles)*game 124 | 125 | #Chose between prediction and chance 126 | if prob < explore_prob: 127 | #take a random action 128 | a=env.action_space.sample() 129 | #print("taking random action",a, "at total_steps" , total_steps) 130 | #print("prob ", prob, "explore_prob", explore_prob) 131 | 132 | else: 133 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 134 | #works best with looking 6 steps ahead 135 | #Also works best if you train the model more itterations 136 | utility_possible_actions = np.zeros(shape=(num_env_actions)) 137 | 138 | utility_possible_actions[0] = predictTotalRewards(qs,0) 139 | utility_possible_actions[1] = predictTotalRewards(qs,1) 140 | 141 | 142 | #chose argmax action of estimated anticipated rewards 143 | #print("utility_possible_actions ",utility_possible_actions) 144 | #print("argmax of utitity", np.argmax(utility_possible_actions)) 145 | a = np.argmax(utility_possible_actions) 146 | 147 | 148 | 149 | env.render() 150 | qs_a = np.concatenate((qs,actions_1_hot[a]), axis=0) 151 | 152 | #print("action",a," qs_a",qs_a) 153 | #get the target state and reward 154 | s,r,done,info = env.step(a) 155 | #record only the first x number of states 156 | 157 | if done and step <=196: 158 | r = -1 159 | 160 | if step ==0: 161 | gameX[0] = qs_a 162 | gameY[0] = np.array([r]) 163 | memoryX[0] = qs_a 164 | memoryY[0] = np.array([r]) 165 | 166 | gameX = np.vstack((gameX,qs_a)) 167 | gameY = np.vstack((gameY,np.array([r]))) 168 | 169 | 170 | if done : 171 | #GAME ENDED 172 | #Calculate Q values from end to start of game 173 | for i in range(0,gameY.shape[0]): 174 | #print("Updating total_reward at game epoch ",(gameY.shape[0]-1) - i) 175 | if i==0: 176 | #print("reward at the last step ",gameY[(gameY.shape[0]-1)-i][0]) 177 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0] 178 | else: 179 | #print("local error before Bellman", gameY[(gameY.shape[0]-1)-i][0],"Next error ", gameY[(gameY.shape[0]-1)-i+1][0]) 180 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0]+b_discount*gameY[(gameY.shape[0]-1)-i+1][0] 181 | #print("reward at step",i,"away from the end 
is",gameY[(gameY.shape[0]-1)-i][0]) 182 | if i==gameY.shape[0]-1: 183 | print("Training Game #",game, " steps = ", step ,"last reward", r," finished with headscore ", gameY[(gameY.shape[0]-1)-i][0]) 184 | 185 | if memoryX.shape[0] ==1: 186 | memoryX = gameX 187 | memoryY = gameY 188 | else: 189 | #Add experience to memory 190 | memoryX = np.concatenate((memoryX,gameX),axis=0) 191 | memoryY = np.concatenate((memoryY,gameY),axis=0) 192 | 193 | #if memory is full remove first element 194 | if np.alen(memoryX) >= max_memory_len: 195 | print("memory full. mem len ", np.alen(memoryX)) 196 | for l in range(np.alen(gameX)): 197 | memoryX = np.delete(memoryX, 0, axis=0) 198 | memoryY = np.delete(memoryY, 0, axis=0) 199 | 200 | #Update the states 201 | qs=s 202 | 203 | #Retrain every X failures after num_initial_observation 204 | if done and game >= num_env_actions: 205 | if game%10 == 0: 206 | print("Training game# ", game,"momory size", memoryX.shape[0]) 207 | model.fit(memoryX,memoryY, batch_size=32,epochs=initial_training_epochs,verbose=2) 208 | 209 | if done: 210 | if r > 0: 211 | print("Game ",game," WON***") 212 | #Game ended - Break 213 | break 214 | 215 | print("Observation complete. - Begin LSTM training") 216 | 217 | print("dataX shape", dataX.shape) 218 | print(dataX[0:20]) 219 | print("dataY shape", dataY.shape) 220 | print(dataY) 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | if save_weights: 230 | #Save model 231 | print("Saving weights") 232 | model.save_weights(weigths_filename) 233 | -------------------------------------------------------------------------------- /DeepQNPyTorch/DQN_torch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | import warnings 7 | warnings.filterwarnings("ignore", category=UserWarning) 8 | 9 | class DeepQNetwork(nn.Module): 10 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims,n_actions): 11 | super(DeepQNetwork, self).__init__() 12 | # print("input_dims ", input_dims[0], " n_actions ",n_actions) 13 | self.input_dims = input_dims[0]+n_actions 14 | self.fc1_dims = fc1_dims 15 | self.fc2_dims = fc2_dims 16 | self.n_actions = 1 17 | self.fc1 = nn.Linear(self.input_dims,self.fc1_dims) 18 | self.fc2 = nn.Linear(self.fc1_dims,self.fc2_dims) 19 | self.fc3 = nn.Linear(self.fc2_dims,self.n_actions) 20 | self.optimizer = optim.Adam(self.parameters(),lr=lr) 21 | self.loss = nn.MSELoss() 22 | if torch.cuda.is_available(): 23 | print("Using CUDA") 24 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 25 | 26 | # print("Cuda device ",self.device) 27 | self.to(self.device) 28 | 29 | def forward(self,state_action): 30 | #action to 1 hot 31 | # action_1hot = np.zeros(self.n_actions) 32 | # action_1hot[action] = 1.0 33 | # observation_state = np.append(observation,action_1hot) 34 | state = torch.Tensor(state_action).to(self.device) 35 | x = F.relu(self.fc1(state)) 36 | x = F.relu((self.fc2(x))) 37 | predicted_Q_value = self.fc3(x) 38 | return predicted_Q_value 39 | 40 | 41 | class Agent(object): 42 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, 43 | max_mem_size=100000, eps_end=0.01, eps_dec=0.996): 44 | self.gamma = gamma 45 | self.epsilon = epsilon 46 | self.eps_end = eps_end 47 | self.eps_dec = eps_dec 48 | self.lr = lr 49 | self.n_actions = n_actions 50 | # print("input_dims ", input_dims) 51 | self.input_dims = input_dims[0] 52 
| self.batch_size = batch_size 53 | self.action_space = [i for i in range(n_actions)] 54 | self.max_mem_size = max_mem_size 55 | self.mem_counter = 0 56 | self.Q_eval = DeepQNetwork(lr=lr,n_actions=n_actions, input_dims=input_dims, fc1_dims=256, fc2_dims=256) 57 | self.state_memory = np.zeros((self.max_mem_size,self.input_dims)) 58 | self.new_state_memory = np.zeros((self.max_mem_size,self.input_dims)) 59 | self.action_memory = np.zeros((self.max_mem_size,n_actions)) 60 | self.action_state_memory = np.zeros((self.max_mem_size,self.input_dims+n_actions)) 61 | self.reward_memory = np.zeros(self.max_mem_size) 62 | self.Q_memory = np.zeros(self.max_mem_size) 63 | self.terminal_memory = np.zeros(self.max_mem_size) 64 | 65 | def store_transition(self, state, action, reward, state_, terminal): 66 | index = self.mem_counter % self.max_mem_size 67 | self.state_memory[index] = state 68 | actions = np.zeros(self.n_actions) 69 | actions[action] = 1.0 70 | self.action_memory[index] = actions 71 | self.reward_memory[index] = reward 72 | self.terminal_memory[index] = terminal 73 | self.new_state_memory[index] = state_ 74 | self.action_state_memory[index] = np.append(state,actions) 75 | self.mem_counter+=1 76 | 77 | def calculate_bellman(self,episode_len): 78 | for i in range(episode_len): 79 | index = ((self.mem_counter-1)-i) % self.max_mem_size 80 | next_index = ((self.mem_counter)-i) % self.max_mem_size 81 | if i==0: 82 | self.Q_memory[index] = self.reward_memory[index] 83 | # print("last Q ", self.Q_memory[index]) 84 | else: 85 | self.Q_memory[index] = self.reward_memory[index] + self.gamma * self.Q_memory[next_index] 86 | # print("Q ", self.Q_memory[index]) 87 | 88 | def update_epsilon(self): 89 | self.epsilon = self.epsilon * self.eps_dec if self.epsilon > self.eps_end else self.eps_end 90 | 91 | def process_end_of_episode(self,episode_len): 92 | self.calculate_bellman(episode_len) 93 | self.update_epsilon() 94 | 95 | def choose_action(self, observation): 96 | rand = np.random.random() 97 | if rand < self.epsilon: 98 | action = np.random.choice(self.action_space) 99 | else: 100 | estimated_Q_values = torch.Tensor( np.zeros(self.n_actions)).to(self.Q_eval.device) 101 | for i in range (self.n_actions): 102 | action_1hot = np.zeros(self.n_actions) 103 | action_1hot[i] = 1.0 104 | # print("about to evaulate action ",i, " array of action ", action_1hot," with observation", observation[:10]) 105 | # print("concatanated sate action vector ", np.append(observation,action_1hot) ) 106 | estimated_Q_values[i] = self.Q_eval.forward( np.append(observation,action_1hot) ) 107 | 108 | 109 | # actions = self.Q_eval.forward(observation) 110 | 111 | # print("estimated Q values", estimated_Q_values) 112 | action = torch.argmax(estimated_Q_values).item() 113 | return action 114 | 115 | def learn(self, step_counter): 116 | if self.mem_counter > self.batch_size: 117 | self.Q_eval.optimizer.zero_grad() 118 | max_mem = self.mem_counter if self.mem_counter < self.max_mem_size else self.max_mem_size 119 | batch = np.random.choice(max_mem,self.batch_size) 120 | #print("batch size", batch.size()) 121 | state_batch = self.state_memory[batch] 122 | action_batch = self.action_memory[batch] 123 | action_values = np.array(self.action_space, dtype=np.uint8) 124 | action_indices = np.dot(action_batch, action_values) 125 | reward_batch = self.reward_memory[batch] 126 | q_batch = self.Q_memory[batch] 127 | terminal_batch = self.terminal_memory[batch] 128 | new_state_batch = self.new_state_memory[batch] 129 | action_state_batch = 
self.action_state_memory[batch] 130 | 131 | reward_batch = torch.Tensor(reward_batch).to(self.Q_eval.device) 132 | terminal_batch = torch.Tensor(terminal_batch).to(self.Q_eval.device) 133 | 134 | q_eval = self.Q_eval.forward(action_state_batch).to(self.Q_eval.device) 135 | q_target = torch.Tensor(q_batch).to(self.Q_eval.device) 136 | # q_next = self.Q_eval.forward(new_state_batch).to(self.Q_eval.device) 137 | 138 | # batch_index = np.arange(self.batch_size, dtype=np.int32) 139 | # q_target[action_batch] = reward_batch + self.gamma*torch.max(q_next, dim=1)[0]*terminal_batch 140 | 141 | # self.epsilon = self.epsilon * self.eps_dec if self.epsilon > self.eps_end else self.eps_end 142 | # if (step_counter%50)==49: 143 | # print("Q eval ",q_eval, "q target", q_target) 144 | loss = self.Q_eval.loss(q_eval,q_target) 145 | loss.backward() 146 | self.Q_eval.optimizer.step() 147 | print(torch.cuda.is_available()) 148 | -------------------------------------------------------------------------------- /DeepQNPyTorch/Lander_torch.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from DQN_torch import Agent 3 | # from utils import plotLearning 4 | import numpy as np 5 | import warnings 6 | warnings.filterwarnings("ignore", category=UserWarning) 7 | 8 | if __name__ =='__main__': 9 | env=gym.make('LunarLander-v2') 10 | brain = Agent(gamma=0.98, epsilon=0.7, batch_size=2, n_actions=4, 11 | input_dims=[8], lr=0.01, eps_end=0.02,eps_dec=0.992) 12 | scores = [] 13 | eps_history = [] 14 | n_games = 50000 15 | score = 0 16 | 17 | for i in range (n_games): 18 | if i%10 == 0 and i>0 and i>10: 19 | avg_score = np.mean(scores[:-10]) 20 | print('epside ', i, 'score', score, 21 | 'average score %3f' % avg_score, 22 | 'epsilon %3f' % brain.epsilon) 23 | else: 24 | print('episode ',i, 'score', score) 25 | score = 0 26 | 27 | eps_history.append(brain.epsilon) 28 | observation = env.reset() 29 | done = False 30 | step_counter = 0 31 | while not done: 32 | env.render() 33 | action = brain.choose_action(observation) 34 | observation_, reward, done, info = env.step(action) 35 | # print("chosen action after chose action ", action) 36 | score+=reward 37 | brain.store_transition(observation,action, reward, observation_,done) 38 | if i>5: 39 | brain.learn(step_counter) 40 | observation = observation_ 41 | step_counter += 1 42 | # EPISODE done 43 | # "CALCULATE BELL MAN IN AGENT CLASS brain.compute_reward()" 44 | brain.process_end_of_episode(step_counter) 45 | scores.append(score) 46 | 47 | # x = [i+1 for i in range(n_games)] 48 | # filename = 'lunar-lander.png' 49 | # plotLearning(x, scores, eps_history, filename,) 50 | -------------------------------------------------------------------------------- /DeepQNPyTorch/Readme.md: -------------------------------------------------------------------------------- 1 | # This forlder contains RL pytorch agents for open AI 2 | -------------------------------------------------------------------------------- /Experimental/CNN_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | https://hackernoon.com/visualizing-parts-of-convolutional-neural-networks-using-keras-and-cats-5cc01b214e59 4 | https://stackoverflow.com/questions/43895750/keras-input-shape-for-conv2d-and-manually-loaded-images 5 | ''' 6 | 7 | import matplotlib.pylab as plt 8 | import matplotlib.image as mpimg 9 | import numpy as np 10 | import scipy 11 | import keras 12 | 13 | from keras.models import Sequential 14 | from 
keras.layers import Conv2D 15 | 16 | cat = mpimg.imread('cat.png') 17 | print("Shape", cat.shape) 18 | plt.imshow(cat) 19 | plt.show() 20 | 21 | def show_cat(cat_batch): 22 | print("cat shape before transfo",cat_batch.shape) 23 | cat = np.squeeze(cat_batch,axis=0) 24 | print( "cat.shape", cat.shape) 25 | plt.imshow(cat) 26 | plt.show() 27 | 28 | cat_batch = cat.reshape(1,cat.shape[0],cat.shape[1],4) 29 | 30 | input_shape = ( cat.shape[0], cat.shape[1], 4 ) 31 | 32 | model = Sequential() 33 | model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', input_shape=input_shape)) 34 | 35 | print("predicting ... ") 36 | conv_cat = model.predict(cat_batch) 37 | show_cat(conv_cat) 38 | -------------------------------------------------------------------------------- /Experimental/Cartpol_CNN_RL.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole solution by the Author of the Fit Machine Learning Blog 3 | https://github.com/FitMachineLearning/FitML/ 4 | 5 | This solution oberserves the first 30 games 6 | Then plays after being trained from first 30 games. 7 | 8 | While it plays it will take actions/policies that are sometimes random enabling 9 | it to explore solution it hasn't yet explored. 10 | 11 | The proability of random action reduces over each episode as it gets more and more comfortable with the rules of the game. 12 | 13 | ''' 14 | import numpy as np 15 | import keras 16 | import gym 17 | import os 18 | import h5py 19 | 20 | from keras.models import Sequential 21 | from keras.layers import Dense, Dropout 22 | from keras.layers import Embedding 23 | from keras.layers import LSTM 24 | from keras import optimizers 25 | 26 | 27 | num_env_variables = 4 28 | num_env_actions = 1 29 | num_training_exmaples = 100 30 | timesteps = 1 31 | num_initial_observation = 4000 32 | 33 | 34 | num_failures_for_retrain = 5 35 | starting_explore_prob = 0.30 36 | training_epochs = 500 37 | num_anticipation_steps = 8 38 | load_previous_weights = False 39 | observe_and_train = True 40 | save_weights = True 41 | 42 | 43 | #Create testing enviroment 44 | env = gym.make('CartPole-v0') 45 | env.reset() 46 | 47 | #initialize training matrix with random states and actions 48 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 49 | #initize training matrix corresponding expected states and expected rewards (random) 50 | dataY = np.random.random((num_training_exmaples,num_env_variables+1)) 51 | 52 | 53 | 54 | #nitialize the LSTM with random weights 55 | 56 | model = Sequential() 57 | model.add(Dense(20, activation='relu', input_dim=dataX.shape[1])) 58 | model.add(Dense(20)) 59 | model.add(Dense(dataY.shape[1])) 60 | 61 | opt = optimizers.adam(lr=0.01) 62 | 63 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 64 | 65 | #load previous model weights if they exist 66 | if load_previous_weights: 67 | dir_path = os.path.realpath(".") 68 | fn = dir_path + "/CP-weights_RL_CNN.h5" 69 | print("filepath ", fn) 70 | if os.path.isfile(fn): 71 | print("loading weights") 72 | model.load_weights("CP-weights_RL_CNN.h5") 73 | else: 74 | print("File CP-weights_RL_CNN.h5 does not exis. Retraining... 
") 75 | 76 | #Record first 500 in a sequence and add them to the training sequence 77 | total_steps = 0 78 | dataX = np.zeros(shape=(1,5)) 79 | dataY = np.zeros(shape=(1,5)) 80 | 81 | print("dataX shape", dataX.shape) 82 | print("dataY shape", dataY.shape) 83 | 84 | if observe_and_train: 85 | #observe for 100 games 86 | for game in range(30): 87 | 88 | if total_steps >= num_initial_observation: 89 | break 90 | #Get the Q state 91 | qs = env.reset() 92 | for step in range (200): 93 | a=0 94 | if np.random.rand(1) < 0.5: 95 | a=0 96 | else: 97 | a=1 98 | env.render() 99 | qs_a = np.concatenate((qs,np.array([a])), axis=0) 100 | 101 | #get the target state and reward 102 | s,r,done,info = env.step(a) 103 | 104 | #set reward in case of failure 105 | if done: 106 | r = -1 107 | 108 | #concatenate target state and reward 109 | s_r = np.concatenate((s,np.array([r])), axis=0) 110 | 111 | if done: 112 | #print negative reward array 113 | print("Negative reward s_r: ", s_r) 114 | 115 | #print("reward = ", r) 116 | #print("target state", s) 117 | #print("concatenate(s,r)", s_r) 118 | 119 | 120 | #record only the first x number of states 121 | if total_steps ==0: 122 | dataX[0] = qs_a 123 | dataY[0] = s_r 124 | 125 | if total_steps < (num_initial_observation-1): 126 | dataX = np.vstack((dataX,qs_a)) 127 | dataY = np.vstack((dataY,s_r)) 128 | 129 | #Update the states 130 | qs=s 131 | 132 | 133 | total_steps += 1 134 | if done : 135 | break 136 | 137 | print("Observation complete. - Begin LSTM training") 138 | 139 | print("dataX shape", dataX.shape) 140 | print(dataX[0:5]) 141 | print("dataY shape", dataY.shape) 142 | print(dataY[0:5]) 143 | 144 | 145 | #feedX = np.reshape(dataX, (dataX.shape[0], 1, dataX.shape[1] )) 146 | #feedY = np.reshape(dataY, (dataY.shape[0], 1, dataY.shape[1] )) 147 | feedX = dataX 148 | feedY = dataY 149 | 150 | 151 | #The more epochs you train the model, the better is becomes at predicting future states 152 | #This in turn will improve the results of the Bellman equation and thus will lead us to 153 | # better decisions in our MDP process 154 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 155 | 156 | print("total_steps ", total_steps) 157 | print("dataX ", dataX[0:10]) 158 | print("dataY ", dataY[0:10]) 159 | #print("dataY ", dataY) 160 | 161 | print("Initial training complete. 
Begin tentative exploration.") 162 | 163 | ''' 164 | dataX = np.random.random((1,5)) 165 | res = model.predict(dataX[0].reshape(1,dataX.shape[1])) 166 | nstate = res[0][:-1] 167 | print("predicted output ", res) 168 | print("expected reward ", res[0][4]) 169 | print("expected state ", nstate) 170 | ''' 171 | 172 | def estimateReward(qstate,action, depth): 173 | if depth <= 0: 174 | return 0 175 | #calculate/estimate reward at this state and get the next state 176 | qs_a = np.concatenate((qstate,np.array([action])), axis=0) 177 | predX = np.zeros(shape=(1,5)) 178 | predX[0] = qs_a 179 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 180 | reward = pred[0][4] 181 | expected_state = pred[0][:-1] 182 | 183 | ''' 184 | print("depth -- ", depth) 185 | print("qstate", qstate) 186 | print("action", action) 187 | print("pred", pred) 188 | print("expected_state", expected_state) 189 | print("reward", reward) 190 | ''' 191 | # Bellman -- reward at this state = reward + Sum of discounted expected rewards for all actions (recursively) 192 | #recursively calculate the reward at future states for all possible actions 193 | discounted_future_rewards = 0.95*estimateReward(expected_state,0,depth-1)+ 0.95*estimateReward(expected_state,1,depth-1) 194 | 195 | #print("discounted_future_rewards", discounted_future_rewards) 196 | #add current state and discounted future state reward 197 | return reward + discounted_future_rewards 198 | 199 | 200 | print("** Estimating reward for dataX[0] with action 1 usint Bellman", estimateReward(dataX[0][:-1],1,2)) 201 | print("** Estimating reward for dataX[0] with action 0 usint Bellman", estimateReward(dataX[0][:-1],0,2)) 202 | 203 | 204 | 205 | ##### 206 | ##### 207 | #Play the game for X rounds using the Bellman with LSTM anticipation model 208 | 209 | explore_prob = starting_explore_prob 210 | failures = 0 211 | for game in range(20): 212 | total_steps =0 213 | #Get the Q state 214 | qs = env.reset() 215 | #over the next 50 games reduce the probability of explore_prob 216 | explore_prob = starting_explore_prob-(starting_explore_prob/20.0)*game 217 | print("- Episode", game, " explore_prob",explore_prob) 218 | for step in range (300): 219 | 220 | prob = np.random.rand(1) 221 | 222 | #Chose between prediction and chance 223 | if prob < explore_prob: 224 | #take a random action 225 | #print("taking random action ", total_steps) 226 | #print("prob ", prob, "explore_prob", explore_prob) 227 | if np.random.rand(1) < 0.5: 228 | a=0 229 | else: 230 | a=1 231 | else: 232 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 233 | #works best with looking 6 steps ahead 234 | #Also works best if you train the model more itterations 235 | estimated_anticipated_reward_a = estimateReward(qs,1,num_anticipation_steps) 236 | estimated_anticipated_reward_b = estimateReward(qs,0,num_anticipation_steps) 237 | #print(" estimated rewards a and b", estimated_anticipated_reward_a, estimated_anticipated_reward_b) 238 | #chose argmax action of estimated anticipated rewards 239 | if estimated_anticipated_reward_a > estimated_anticipated_reward_b: 240 | a = 1 241 | else: 242 | a = 0 243 | 244 | 245 | env.render() 246 | #get the target state and reward 247 | s,r,done,info = env.step(a) 248 | qs=s 249 | 250 | 251 | #set reward in case of failure 252 | if done: 253 | if total_steps >= 198: 254 | print("*** Game Won after ", total_steps, " steps") 255 | else: 256 | r = -1 257 | print("*** failed after ", total_steps, " steps") 258 | 259 | 
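# Note: qs has already been updated to the post-step state (qs = s) above, so the
# state-action pair stored below pairs the chosen action with the *resulting* state,
# unlike the initial observation loop earlier in this file, which concatenates the
# pre-step state before calling env.step.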
qs_a = np.concatenate((qs,np.array([a])), axis=0) 260 | s_r = np.concatenate((s,np.array([r])), axis=0) 261 | #add event to training set 262 | dataX = np.vstack((dataX,qs_a)) 263 | dataY = np.vstack((dataY,s_r)) 264 | 265 | total_steps += 1 266 | if done and r==-1: 267 | #If you fail, add new knowledge to NN 268 | #Retrain the NN 269 | #break 270 | print("Retraining the network") 271 | feedX = dataX 272 | feedY = dataY 273 | failures +=1 274 | 275 | #retrain after every 6 failures 276 | if failures >= num_failures_for_retrain: 277 | print("Retraining the network after failure ", failures) 278 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 279 | failures = 0 280 | 281 | 282 | if done: 283 | break 284 | 285 | if save_weights: 286 | #Save model 287 | print("Saving weights") 288 | model.save_weights("CP-weights_RL_CNN.h5") 289 | -------------------------------------------------------------------------------- /Experimental/Cartpole_simple_CNN.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole solution by the Author of the Fit Machine Learning Blog 3 | https://github.com/FitMachineLearning/FitML/ 4 | 5 | This is the simplified version of the open AI problem. This solution does not use LSTMs but uses 6 | One Dense RELU and one Dense sigmoid instead. 7 | 8 | It does not perform as well as the LSTM model which is better able to do longer term predictions. 9 | 10 | ''' 11 | import numpy as np 12 | import keras 13 | import gym 14 | import os 15 | import h5py 16 | 17 | from keras.models import Sequential 18 | from keras.layers import Dense, Dropout 19 | from keras.layers import Embedding 20 | from keras.layers import LSTM 21 | from keras import optimizers 22 | 23 | 24 | num_env_variables = 4 25 | num_env_actions = 1 26 | num_training_exmaples = 100 27 | timesteps = 1 28 | num_initial_observation = 4000 29 | training_epochs = 500 30 | num_anticipation_steps = 7 31 | load_previous_weights = True 32 | observe_and_train = False 33 | save_weights = False 34 | 35 | 36 | #Create testing enviroment 37 | env = gym.make('CartPole-v0') 38 | env.reset() 39 | 40 | #initialize training matrix with random states and actions 41 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 42 | #initize training matrix corresponding expected states and expected rewards (random) 43 | dataY = np.random.random((num_training_exmaples,num_env_variables+1)) 44 | 45 | 46 | 47 | #nitialize the LSTM with random weights 48 | 49 | model = Sequential() 50 | model.add(Dense(16, activation='relu', input_dim=dataX.shape[1])) 51 | model.add(Dense(dataY.shape[1])) 52 | 53 | opt = optimizers.adam(lr=0.01) 54 | 55 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 56 | 57 | #load previous model weights if they exist 58 | if load_previous_weights: 59 | dir_path = os.path.realpath(".") 60 | fn = dir_path + "/CP-weights_Simple_CNN.h5" 61 | print("filepath ", fn) 62 | if os.path.isfile(fn): 63 | print("loading weights") 64 | model.load_weights("CP-weights_Simple_CNN.h5") 65 | else: 66 | print("File CP-weights.h5 does not exis. Retraining... 
") 67 | 68 | #Record first 500 in a sequence and add them to the training sequence 69 | total_steps = 0 70 | dataX = np.zeros(shape=(1,5)) 71 | dataY = np.zeros(shape=(1,5)) 72 | 73 | print("dataX shape", dataX.shape) 74 | print("dataY shape", dataY.shape) 75 | 76 | if observe_and_train: 77 | #observe for 100 games 78 | for game in range(100): 79 | 80 | if total_steps >= num_initial_observation: 81 | break 82 | #Get the Q state 83 | qs = env.reset() 84 | for step in range (200): 85 | a=0 86 | if np.random.rand(1) < 0.5: 87 | a=0 88 | else: 89 | a=1 90 | env.render() 91 | qs_a = np.concatenate((qs,np.array([a])), axis=0) 92 | 93 | #get the target state and reward 94 | s,r,done,info = env.step(a) 95 | 96 | #set reward in case of failure 97 | if done: 98 | r = -1 99 | 100 | #concatenate target state and reward 101 | s_r = np.concatenate((s,np.array([r])), axis=0) 102 | 103 | if done: 104 | #print negative reward array 105 | print("Negative reward s_r: ", s_r) 106 | 107 | #print("reward = ", r) 108 | #print("target state", s) 109 | #print("concatenate(s,r)", s_r) 110 | 111 | 112 | #record only the first x number of states 113 | if total_steps ==0: 114 | dataX[0] = qs_a 115 | dataY[0] = s_r 116 | 117 | if total_steps < (num_initial_observation-1): 118 | dataX = np.vstack((dataX,qs_a)) 119 | dataY = np.vstack((dataY,s_r)) 120 | 121 | #Update the states 122 | qs=s 123 | 124 | 125 | total_steps += 1 126 | if done : 127 | break 128 | 129 | print("Observation complete. - Begin LSTM training") 130 | 131 | print("dataX shape", dataX.shape) 132 | print(dataX[0:5]) 133 | print("dataY shape", dataY.shape) 134 | print(dataY[0:5]) 135 | 136 | 137 | #feedX = np.reshape(dataX, (dataX.shape[0], 1, dataX.shape[1] )) 138 | #feedY = np.reshape(dataY, (dataY.shape[0], 1, dataY.shape[1] )) 139 | feedX = dataX 140 | feedY = dataY 141 | 142 | 143 | #The more epochs you train the model, the better is becomes at predicting future states 144 | #This in turn will improve the results of the Bellman equation and thus will lead us to 145 | # better decisions in our MDP process 146 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 147 | 148 | print("total_steps ", total_steps) 149 | print("dataX ", dataX[0:10]) 150 | print("dataY ", dataY[0:10]) 151 | #print("dataY ", dataY) 152 | 153 | 154 | dataX = np.random.random((1,5)) 155 | 156 | 157 | 158 | res = model.predict(dataX[0].reshape(1,dataX.shape[1])) 159 | nstate = res[0][:-1] 160 | 161 | print("predicted output ", res) 162 | print("expected reward ", res[0][4]) 163 | print("expected state ", nstate) 164 | 165 | def estimateReward(qstate,action, depth): 166 | if depth <= 0: 167 | return 0 168 | #calculate/estimate reward at this state and get the next state 169 | qs_a = np.concatenate((qstate,np.array([action])), axis=0) 170 | predX = np.zeros(shape=(1,5)) 171 | predX[0] = qs_a 172 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 173 | reward = pred[0][4] 174 | expected_state = pred[0][:-1] 175 | 176 | ''' 177 | print("depth -- ", depth) 178 | print("qstate", qstate) 179 | print("action", action) 180 | print("pred", pred) 181 | print("expected_state", expected_state) 182 | print("reward", reward) 183 | ''' 184 | # Bellman -- reward at this state = reward + Sum of discounted expected rewards for all actions (recursively) 185 | #recursively calculate the reward at future states for all possible actions 186 | discounted_future_rewards = 0.95*estimateReward(expected_state,0,depth-1)+ 0.95*estimateReward(expected_state,1,depth-1) 187 | 188 | 
#print("discounted_future_rewards", discounted_future_rewards) 189 | #add current state and discounted future state reward 190 | return reward + discounted_future_rewards 191 | 192 | 193 | print("** Estimating reward for dataX[0] with action 1 usint Bellman", estimateReward(dataX[0][:-1],1,2)) 194 | print("** Estimating reward for dataX[0] with action 0 usint Bellman", estimateReward(dataX[0][:-1],0,2)) 195 | 196 | 197 | 198 | ##### 199 | ##### 200 | #Play the game for X rounds using the Bellman with LSTM anticipation model 201 | 202 | 203 | for game in range(3): 204 | total_steps =0 205 | #Get the Q state 206 | qs = env.reset() 207 | for step in range (300): 208 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 209 | #works best with looking 6 steps ahead 210 | #Also works best if you train the model more itterations 211 | estimated_anticipated_reward_a = estimateReward(qs,1,num_anticipation_steps) 212 | estimated_anticipated_reward_b = estimateReward(qs,0,num_anticipation_steps) 213 | #print(" estimated rewards a and b", estimated_anticipated_reward_a, estimated_anticipated_reward_b) 214 | 215 | #chose argmax action of estimated anticipated rewards 216 | if estimated_anticipated_reward_a > estimated_anticipated_reward_b: 217 | a = 1 218 | else: 219 | a = 0 220 | 221 | env.render() 222 | 223 | 224 | #get the target state and reward 225 | s,r,done,info = env.step(a) 226 | 227 | 228 | 229 | qs=s 230 | #set reward in case of failure 231 | if done: 232 | r = -1 233 | if total_steps >= 198: 234 | print("*** Game Won after ", total_steps, " steps") 235 | else: 236 | print("** failed after ", total_steps, " steps") 237 | 238 | 239 | total_steps += 1 240 | if done : 241 | break 242 | 243 | if save_weights: 244 | #Save model 245 | print("Saving weights") 246 | model.save_weights("CP-weights_Simple_CNN.h5") 247 | -------------------------------------------------------------------------------- /Experimental/MNIST_image_Classification.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | MNIST dataset classification with Keras. 4 | Credits to fchollet. Find him on github. 
5 | ''' 6 | 7 | from __future__ import print_function 8 | import keras 9 | from keras.datasets import mnist 10 | from keras.models import Sequential 11 | from keras.layers import Dense,Dropout,Flatten 12 | from keras.layers import Conv2D, MaxPooling2D 13 | from keras import backend as K 14 | 15 | batch_size = 128 16 | num_class = 10 17 | epochs = 3 18 | 19 | #input image dimensions 20 | img_rows, img_cols = 28,28 21 | 22 | #seperate train and test dataset 23 | (x_train,y_train), (x_test,y_test) = mnist.load_data() 24 | 25 | if K.image_data_format() == 'channels_first': 26 | x_train = x_train.reshape(x_train.shape[0],1,img_rows,img_cols) 27 | x_test = x_test.reshape(x_test.shape[0],1,img_rows,img_cols) 28 | input_shape = (1,img_rows,img_cols) 29 | else: 30 | x_train = x_train.reshape(x_train.shape[0],img_rows,img_cols,1) 31 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 32 | input_shape = (img_rows,img_cols,1) 33 | 34 | x_train = x_train.astype('float32') 35 | x_test = x_test.astype('float32') 36 | x_train /=255 37 | x_test /=255 38 | print('x_train shape', x_train.shape) 39 | print('x_test shape', x_test.shape) 40 | print('y_train', y_train) 41 | 42 | 43 | # convert class to binanry class 44 | y_train = keras.utils.to_categorical(y_train,num_class) 45 | y_test = keras.utils.to_categorical(y_test,num_class) 46 | 47 | #print('y_train', y_train) 48 | 49 | #Declare the model 50 | model = Sequential() 51 | model.add(Conv2D(32,kernel_size=(3,3), activation='relu',input_shape=input_shape)) 52 | model.add(Conv2D(64,(3,3), activation='relu')) 53 | model.add(MaxPooling2D(pool_size=(2, 2))) 54 | model.add(Dropout(0.25)) 55 | model.add(Flatten()) 56 | model.add(Dense(128,activation='relu')) 57 | model.add((Dropout(0.5))) 58 | model.add(Dense(num_class,activation='softmax')) 59 | 60 | model.compile(loss=keras.losses.categorical_crossentropy, 61 | optimizer=keras.optimizers.Adadelta(), 62 | metrics=['accuracy']) 63 | 64 | model.fit(x_train,y_train, 65 | batch_size=batch_size, 66 | epochs=epochs, 67 | verbose=1, 68 | validation_data=(x_test,y_test)) 69 | 70 | score = model.evaluate(x_test[1000:],y_test[1000:],verbose=0) 71 | print('Test loss:', score[0]) 72 | print('Test accuracy:', score[1]) 73 | -------------------------------------------------------------------------------- /Experimental/Readme.md: -------------------------------------------------------------------------------- 1 | # Collection of experimental code samples 2 | -------------------------------------------------------------------------------- /Experimental/image_rescale.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | https://hackernoon.com/visualizing-parts-of-convolutional-neural-networks-using-keras-and-cats-5cc01b214e59 4 | https://stackoverflow.com/questions/43895750/keras-input-shape-for-conv2d-and-manually-loaded-images 5 | ''' 6 | 7 | import matplotlib.pylab as plt 8 | import matplotlib.image as mpimg 9 | import numpy as np 10 | import scipy 11 | import keras 12 | 13 | from scipy import misc 14 | from keras.models import Sequential 15 | from keras.layers import Conv2D 16 | 17 | 18 | 19 | def show_cat(cat_batch): 20 | print("cat shape before transfo",cat_batch.shape) 21 | cat = np.squeeze(cat_batch,axis=0) 22 | print( "cat.shape", cat.shape) 23 | plt.imshow(cat) 24 | plt.show() 25 | 26 | def resize_cat(cat): 27 | cat = scipy.misc.imresize(cat,size=(cat.shape[0]/2,cat.shape[1]/2)) 28 | plt.imshow(cat) 29 | plt.show() 30 | 31 | cat = mpimg.imread('cat.png') 32 | 
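# Note: matplotlib's imread returns a float image array for PNG files; this script
# assumes 'cat.png' is RGBA, i.e. shape (height, width, 4), which is why the
# reshape and the Conv2D input_shape below use 4 channels.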
print("Shape", cat.shape) 33 | plt.imshow(cat) 34 | plt.show() 35 | resize_cat(cat) 36 | 37 | cat_batch = cat.reshape(1,cat.shape[0],cat.shape[1],4) 38 | 39 | input_shape = ( cat.shape[0], cat.shape[1], 4 ) 40 | 41 | model = Sequential() 42 | model.add(Conv2D(4, kernel_size=(3, 3), activation='relu', input_shape=input_shape)) 43 | 44 | print("predicting ... ") 45 | conv_cat = model.predict(cat_batch) 46 | show_cat(conv_cat) 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 FitMachineLearning 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NeuroEvolution/Implementations/RS_Hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | import roboschool 5 | 6 | from keras.layers.advanced_activations import LeakyReLU, PReLU 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Dropout 9 | from keras import optimizers 10 | 11 | from Lib.Individual import Individual 12 | 13 | ENVIRONMENT_NAME = "RoboschoolHopper-v1" 14 | OBSERVATION_SPACE = 15 15 | ACTION_SPACE = 3 16 | 17 | B_DISCOUNT = 0.98 18 | 19 | POPULATION_SIZE = 10 20 | NETWORK_WIDTH = 512 21 | NUM_TEST_EPISODES = 3 22 | NUM_SELECTED_FOR_REPRODUCTION = 2 23 | NOISE_SIGMA = 0.3 24 | MUTATION_PROB = 0.05 25 | 26 | 27 | MAX_GENERATIONS = 20000 28 | 29 | CLIP_ACTIONS = True 30 | MAX_STEPS = 996 31 | 32 | all_individuals = [] 33 | generations_count = 0 34 | total_population_counter = 0 35 | 36 | 37 | 38 | 39 | 40 | '''---------ENVIRONMENT INITIALIZATION--------''' 41 | 42 | env = gym.make(ENVIRONMENT_NAME) 43 | #env.render(mode="human") 44 | env.reset() 45 | 46 | print("-- Observations",env.observation_space) 47 | print("-- actionspace",env.action_space) 48 | 49 | 50 | #initialize training matrix with random states and actions 51 | apdataX = np.random.random(( 5,OBSERVATION_SPACE )) 52 | apdataY = np.random.random((5,ACTION_SPACE)) 53 | 54 | 55 | '''---------------------''' 56 | 57 | def GetRememberedOptimalPolicy(targetModel,qstate): 58 | predX = np.zeros(shape=(1,OBSERVATION_SPACE)) 59 | predX[0] = qstate 60 | 61 | #print("trying to predict reward at qs_a", predX[0]) 62 | pred = targetModel.predict(predX[0].reshape(1,predX.shape[1])) 63 | 
r_remembered_optimal_policy = pred[0] 64 | return r_remembered_optimal_policy 65 | 66 | 67 | def create_model(network_width, observation_space, action_space): 68 | action_predictor_model = Sequential() 69 | action_predictor_model.add(Dense(network_width, activation='relu', input_dim=observation_space)) 70 | action_predictor_model.add(Dense(action_space)) 71 | return action_predictor_model 72 | 73 | def initialize_population(population_size,network_width, observation_space, action_space, environment_name,total_population_counter): 74 | initial_population = [] 75 | for i in range (population_size): 76 | action_predictor_model = create_model(network_width, observation_space, action_space) 77 | indiv = Individual(generationID=0, indivID=total_population_counter , network = action_predictor_model) 78 | total_population_counter += 1 79 | initial_population.append(indiv) 80 | return initial_population, total_population_counter 81 | 82 | def test_individual(indiv,num_test_episodes): 83 | indiv.lifeScore = 0 84 | allRewards = [] 85 | for i in range(num_test_episodes): 86 | episodeRewards = [] 87 | #print("episode "+str(i)+" performing test for indiv ",indiv.printme()) 88 | qs = env.reset() 89 | for step in range (5000): 90 | a = GetRememberedOptimalPolicy(indiv.network, qs) 91 | if CLIP_ACTIONS: 92 | for i in range (np.alen(a)): 93 | if a[i] < -1: a[i]=-0.99999999999 94 | if a[i] > 1: a[i] = 0.99999999999 95 | qs,r,done,info = env.step(a) 96 | episodeRewards.append(r) 97 | #indiv.lifeScore += r 98 | env.render() 99 | if step > MAX_STEPS: 100 | done = True 101 | if done: 102 | episodeRewards.reverse() 103 | for j in range(len(episodeRewards)): 104 | #if j ==0: 105 | # print("last reward ",episodeRewards[j]) 106 | if j > 0: 107 | episodeRewards[j] = episodeRewards[j] + B_DISCOUNT * episodeRewards[j-1] 108 | #avg = sum(episodeRewards)/len(episodeRewards) 109 | #print("episode average ", avg) 110 | for j in range(len(episodeRewards)): 111 | allRewards.append(episodeRewards[j]) 112 | #allRewards = allRewards + episodeRewards 113 | break 114 | epAvg = sum(episodeRewards) / len(episodeRewards) 115 | print("generationID",indiv.generationID,"IndivID",indiv.indivID,"episodeRewards rewards ",epAvg) 116 | 117 | avg = sum(allRewards) / len(allRewards) 118 | indiv.lifeScore = avg 119 | #indiv.lifeScore = np.random.rand(1)[0]*50 120 | print("indivID - ",indiv.indivID,"lifeScore =",indiv.lifeScore) 121 | 122 | 123 | def test_all_individuals(num_test_episodes): 124 | for i in range(len(all_individuals)): 125 | test_individual(all_individuals[i],NUM_TEST_EPISODES) 126 | 127 | 128 | def select_top_individuals(num_selected,population_size): 129 | scores = np.zeros(population_size) 130 | for i in range(np.alen(scores)): 131 | scores[i] = all_individuals[i].lifeScore 132 | 133 | print( scores ) 134 | topScores = scores[ scores.argsort()[-num_selected:][::-1] ] 135 | #print ("Top Scores ", topScores) 136 | selected_individuals = [] 137 | for i in range(len(all_individuals)): 138 | if all_individuals[i].lifeScore >= topScores.min(): 139 | #print("Selecting individual",i," with score ", all_individuals[i].lifeScore,"cuttoff ", topScores.min()) 140 | selected_individuals.append(all_individuals[i]) 141 | 142 | for i in range (len(selected_individuals)): 143 | print(selected_individuals[i].printme()) 144 | 145 | return selected_individuals 146 | 147 | # --- Parameter Noising 148 | def add_noise_simple(mu,noiseSigma, largeNoise=False): 149 | x = np.random.rand(1) - 0.5 #probability of doing x 150 | if np.random.rand(1) < 
MUTATION_PROB: 151 | #print("mutating") 152 | if not largeNoise: 153 | x = x*noiseSigma 154 | else: 155 | x = x*noiseSigma #Sigma = width of the standard deviaion 156 | else: 157 | x = 0 158 | #print ("x/200",x,"big_sigma",big_sigma) 159 | return mu + x 160 | 161 | 162 | add_noise_simple = np.vectorize(add_noise_simple,otypes=[np.float]) 163 | 164 | 165 | def add_noise_to_model(targetModel,noiseSigma=NOISE_SIGMA,largeNoise = True): 166 | 167 | sz = len(targetModel.layers) 168 | #if largeNoise: 169 | # print("Setting Large Noise!") 170 | for k in range(sz): 171 | w = targetModel.layers[k].get_weights() 172 | if np.alen(w) >0 : 173 | #print("k==>",k) 174 | w[0] = add_noise_simple(w[0],noiseSigma,largeNoise) 175 | 176 | targetModel.layers[k].set_weights(w) 177 | return targetModel 178 | 179 | def add_mutations(individuals,noiseSigma=NOISE_SIGMA): 180 | for i in range (len(individuals)): 181 | individuals[i].network = add_noise_to_model(individuals[i].network,noiseSigma,True) 182 | 183 | 184 | def populate_next_generation(generationID,top_individuals,population_size, network_width, observation_space, action_space,total_population_counter): 185 | newPop = top_individuals 186 | for i in range( population_size - len(top_individuals)): 187 | newModel = create_model(network_width, observation_space, action_space) 188 | model1 = top_individuals[0].network 189 | model2 = top_individuals[1].network 190 | sz = len(newModel.layers) 191 | #if largeNoise: 192 | # print("Setting Large Noise!") 193 | for k in range(sz): 194 | w = newModel.layers[k].get_weights() 195 | w1 = model1.layers[k].get_weights() 196 | w2 = model2.layers[k].get_weights() 197 | 198 | if np.alen(w) >0 : 199 | #print("k==>",k) 200 | #w[0][0] = combine_weights(w[0][0],w1[0][0],w2[0][0]) 201 | for j in range(np.alen(w[0])): 202 | y=w[0][j] 203 | y1 = w1[0][j] 204 | y2 = w2[0][j] 205 | for l in range (np.alen(y)): 206 | z=y[l] 207 | z1=y1[l] 208 | z2=y2[l] 209 | if np.random.rand(1)>0.5: 210 | z=z1+0.0 211 | else: 212 | z=z2+0.0 213 | y[l]=z 214 | w[0][j]=y 215 | 216 | newModel.layers[k].set_weights(w) 217 | top_individuals.append( Individual(generationID,total_population_counter,newModel) ) 218 | total_population_counter+=1 219 | return top_individuals,total_population_counter 220 | 221 | 222 | 223 | 224 | ''' ------------------''' 225 | 226 | all_individuals,total_population_counter = initialize_population(population_size=POPULATION_SIZE, 227 | network_width=NETWORK_WIDTH, 228 | observation_space=OBSERVATION_SPACE, 229 | action_space=ACTION_SPACE, 230 | environment_name=ENVIRONMENT_NAME, 231 | total_population_counter=total_population_counter) 232 | 233 | 234 | for gens in range (MAX_GENERATIONS): 235 | test_all_individuals(NUM_TEST_EPISODES) 236 | top_individuals = select_top_individuals(NUM_SELECTED_FOR_REPRODUCTION,POPULATION_SIZE) 237 | generations_count += 1 238 | print("Generating next Gen ",generations_count) 239 | all_individuals,total_population_counter = populate_next_generation(generations_count,top_individuals, 240 | POPULATION_SIZE,NETWORK_WIDTH, 241 | OBSERVATION_SPACE, 242 | ACTION_SPACE, 243 | total_population_counter) 244 | print("@@@@ Adding Noise @@@@") 245 | add_mutations(all_individuals) 246 | 247 | 248 | 249 | #for i in range (len(all_individuals)): 250 | # all_individuals[i].printNetwork() 251 | -------------------------------------------------------------------------------- /NeuroEvolution/Lib/Individual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 
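# Lib/Individual.py: container classes holding one candidate policy network together with
# its generation ID, individual ID and fitness score (lifeScore) used by the neuro-evolution loop.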
import keras 3 | from keras.layers.advanced_activations import LeakyReLU, PReLU 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Dropout 6 | 7 | class Individual: 8 | 9 | def __init__(self, generationID,indivID, network): 10 | self.generationID = generationID 11 | self.indivID = indivID 12 | self.network = network 13 | #self.mutationSigma = mutationSigma 14 | self.lifeScore = -10000 15 | 16 | def printme(self): 17 | return "Generation %2d Individual %4d life score %4.3f network %s"%(self.generationID,+self.indivID,self.lifeScore,self.network) 18 | #print("say what?",self.network) 19 | 20 | def printNetwork(self): 21 | print("--- ID",self.indivID,"lifeScore ",self.lifeScore) 22 | sz = len(self.network.layers) 23 | #if largeNoise: 24 | # print("Setting Large Noise!") 25 | for k in range(sz): 26 | w = self.network.layers[k].get_weights() 27 | if np.alen(w) >0 : 28 | print("k==>",k) 29 | print("w[0]",w[0]) 30 | #print("w[1]",w[1]) 31 | #print("w[3]",w[3]) 32 | 33 | 34 | class IndividualTF: 35 | def __init__(self, generationID,indivID, apw_h,apw_h2, 36 | apw_h3,apw_o, appy_x): 37 | self.generationID = generationID 38 | self.indivID = indivID 39 | #self.network = network 40 | self.apw_h = apw_h 41 | self.apw_h2 = apw_h2 42 | self.apw_h3 = apw_h3 43 | self.apw_o = apw_o 44 | self.appy_x = appy_x 45 | 46 | #self.mutationSigma = mutationSigma 47 | self.lifeScore = -10000 48 | 49 | def printme(self): 50 | return "Generation %2d Individual %4d life score %4.3f network "%(self.generationID,+self.indivID,self.lifeScore) 51 | #print("say what?",self.network) 52 | 53 | def printNetwork(self): 54 | print("--- ID",self.indivID,"lifeScore ",self.lifeScore) 55 | print("apw_h ", self.apw_h) 56 | print("apw_o ", self.apw_o) 57 | -------------------------------------------------------------------------------- /NeuroEvolution/old/Main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | import roboschool 5 | 6 | from keras.layers.advanced_activations import LeakyReLU, PReLU 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Dropout 9 | from keras import optimizers 10 | 11 | from Lib.Individual import Individual 12 | 13 | ENVIRONMENT_NAME = "RoboschoolHopper-v1" 14 | OBSERVATION_SPACE = 15 15 | ACTION_SPACE = 3 16 | 17 | B_DISCOUNT = 0.98 18 | 19 | POPULATION_SIZE = 10 20 | NETWORK_WIDTH = 512 21 | NUM_TEST_EPISODES = 3 22 | NUM_SELECTED_FOR_REPRODUCTION = 2 23 | NOISE_SIGMA = 0.06 24 | 25 | MAX_GENERATIONS = 20000 26 | 27 | CLIP_ACTIONS = True 28 | MAX_STEPS = 996 29 | 30 | all_individuals = [] 31 | generations_count = 0 32 | total_population_counter = 0 33 | 34 | 35 | 36 | 37 | 38 | '''---------ENVIRONMENT INITIALIZATION--------''' 39 | 40 | env = gym.make(ENVIRONMENT_NAME) 41 | #env.render(mode="human") 42 | env.reset() 43 | 44 | print("-- Observations",env.observation_space) 45 | print("-- actionspace",env.action_space) 46 | 47 | 48 | #initialize training matrix with random states and actions 49 | apdataX = np.random.random(( 5,OBSERVATION_SPACE )) 50 | apdataY = np.random.random((5,ACTION_SPACE)) 51 | 52 | 53 | '''---------------------''' 54 | 55 | def GetRememberedOptimalPolicy(targetModel,qstate): 56 | predX = np.zeros(shape=(1,OBSERVATION_SPACE)) 57 | predX[0] = qstate 58 | 59 | #print("trying to predict reward at qs_a", predX[0]) 60 | pred = targetModel.predict(predX[0].reshape(1,predX.shape[1])) 61 | r_remembered_optimal_policy = pred[0] 62 | return 
r_remembered_optimal_policy 63 | 64 | 65 | def create_model(network_width, observation_space, action_space): 66 | action_predictor_model = Sequential() 67 | action_predictor_model.add(Dense(network_width, activation='relu', input_dim=observation_space)) 68 | action_predictor_model.add(Dense(action_space)) 69 | return action_predictor_model 70 | 71 | def initialize_population(population_size,network_width, observation_space, action_space, environment_name,total_population_counter): 72 | initial_population = [] 73 | for i in range (population_size): 74 | action_predictor_model = create_model(network_width, observation_space, action_space) 75 | indiv = Individual(generationID=0, indivID=total_population_counter , network = action_predictor_model) 76 | total_population_counter += 1 77 | initial_population.append(indiv) 78 | return initial_population, total_population_counter 79 | 80 | def test_individual(indiv,num_test_episodes): 81 | indiv.lifeScore = 0 82 | allRewards = [] 83 | for i in range(num_test_episodes): 84 | episodeRewards = [] 85 | #print("episode "+str(i)+" performing test for indiv ",indiv.printme()) 86 | qs = env.reset() 87 | for step in range (5000): 88 | a = GetRememberedOptimalPolicy(indiv.network, qs) 89 | if CLIP_ACTIONS: 90 | for i in range (np.alen(a)): 91 | if a[i] < -1: a[i]=-0.99999999999 92 | if a[i] > 1: a[i] = 0.99999999999 93 | qs,r,done,info = env.step(a) 94 | episodeRewards.append(r) 95 | #indiv.lifeScore += r 96 | env.render() 97 | if step > MAX_STEPS: 98 | done = True 99 | if done: 100 | episodeRewards.reverse() 101 | for j in range(len(episodeRewards)): 102 | #if j ==0: 103 | # print("last reward ",episodeRewards[j]) 104 | if j > 0: 105 | episodeRewards[j] = episodeRewards[j] + B_DISCOUNT * episodeRewards[j-1] 106 | #avg = sum(episodeRewards)/len(episodeRewards) 107 | #print("episode average ", avg) 108 | for j in range(len(episodeRewards)): 109 | allRewards.append(episodeRewards[j]) 110 | #allRewards = allRewards + episodeRewards 111 | break 112 | epAvg = sum(episodeRewards) / len(episodeRewards) 113 | print("generationID",indiv.generationID,"IndivID",indiv.indivID,"episodeRewards rewards ",epAvg) 114 | 115 | avg = sum(allRewards) / len(allRewards) 116 | indiv.lifeScore = avg 117 | #indiv.lifeScore = np.random.rand(1)[0]*50 118 | print("indivID - ",indiv.indivID,"lifeScore =",indiv.lifeScore) 119 | 120 | 121 | def test_all_individuals(num_test_episodes): 122 | for i in range(len(all_individuals)): 123 | test_individual(all_individuals[i],NUM_TEST_EPISODES) 124 | 125 | 126 | def select_top_individuals(num_selected,population_size): 127 | scores = np.zeros(population_size) 128 | for i in range(np.alen(scores)): 129 | scores[i] = all_individuals[i].lifeScore 130 | 131 | print( scores ) 132 | topScores = scores[ scores.argsort()[-num_selected:][::-1] ] 133 | #print ("Top Scores ", topScores) 134 | selected_individuals = [] 135 | for i in range(len(all_individuals)): 136 | if all_individuals[i].lifeScore >= topScores.min(): 137 | #print("Selecting individual",i," with score ", all_individuals[i].lifeScore,"cuttoff ", topScores.min()) 138 | selected_individuals.append(all_individuals[i]) 139 | 140 | for i in range (len(selected_individuals)): 141 | print(selected_individuals[i].printme()) 142 | 143 | return selected_individuals 144 | 145 | # --- Parameter Noising 146 | def add_noise_simple(mu,noiseSigma, largeNoise=False): 147 | x = np.random.rand(1) - 0.5 #probability of doing x 148 | if not largeNoise: 149 | x = x*noiseSigma 150 | else: 151 | x = x*noiseSigma 
#Sigma = width of the standard deviaion 152 | #print ("x/200",x,"big_sigma",big_sigma) 153 | return mu + x 154 | 155 | 156 | add_noise_simple = np.vectorize(add_noise_simple,otypes=[np.float]) 157 | 158 | 159 | def add_noise_to_model(targetModel,noiseSigma=NOISE_SIGMA,largeNoise = True): 160 | 161 | sz = len(targetModel.layers) 162 | #if largeNoise: 163 | # print("Setting Large Noise!") 164 | for k in range(sz): 165 | w = targetModel.layers[k].get_weights() 166 | if np.alen(w) >0 : 167 | #print("k==>",k) 168 | w[0] = add_noise_simple(w[0],noiseSigma,largeNoise) 169 | 170 | targetModel.layers[k].set_weights(w) 171 | return targetModel 172 | 173 | def add_mutations(individuals,noiseSigma=NOISE_SIGMA): 174 | for i in range (len(individuals)): 175 | individuals[i].network = add_noise_to_model(individuals[i].network,noiseSigma,True) 176 | 177 | 178 | def populate_next_generation(generationID,top_individuals,population_size, network_width, observation_space, action_space,total_population_counter): 179 | newPop = top_individuals 180 | for i in range( population_size - len(top_individuals)): 181 | newModel = create_model(network_width, observation_space, action_space) 182 | model1 = top_individuals[0].network 183 | model2 = top_individuals[1].network 184 | sz = len(newModel.layers) 185 | #if largeNoise: 186 | # print("Setting Large Noise!") 187 | for k in range(sz): 188 | w = newModel.layers[k].get_weights() 189 | w1 = model1.layers[k].get_weights() 190 | w2 = model2.layers[k].get_weights() 191 | 192 | if np.alen(w) >0 : 193 | #print("k==>",k) 194 | #w[0][0] = combine_weights(w[0][0],w1[0][0],w2[0][0]) 195 | for j in range(np.alen(w[0])): 196 | y=w[0][j] 197 | y1 = w1[0][j] 198 | y2 = w2[0][j] 199 | for l in range (np.alen(y)): 200 | z=y[l] 201 | z1=y1[l] 202 | z2=y2[l] 203 | if np.random.rand(1)>0.5: 204 | z=z1+0.0 205 | else: 206 | z=z2+0.0 207 | y[l]=z 208 | w[0][j]=y 209 | 210 | newModel.layers[k].set_weights(w) 211 | top_individuals.append( Individual(generationID,total_population_counter,newModel) ) 212 | total_population_counter+=1 213 | return top_individuals,total_population_counter 214 | 215 | 216 | 217 | 218 | ''' ------------------''' 219 | 220 | all_individuals,total_population_counter = initialize_population(population_size=POPULATION_SIZE, 221 | network_width=NETWORK_WIDTH, 222 | observation_space=OBSERVATION_SPACE, 223 | action_space=ACTION_SPACE, 224 | environment_name=ENVIRONMENT_NAME, 225 | total_population_counter=total_population_counter) 226 | 227 | 228 | for gens in range (MAX_GENERATIONS): 229 | test_all_individuals(NUM_TEST_EPISODES) 230 | top_individuals = select_top_individuals(NUM_SELECTED_FOR_REPRODUCTION,POPULATION_SIZE) 231 | generations_count += 1 232 | print("Generating next Gen ",generations_count) 233 | all_individuals,total_population_counter = populate_next_generation(generations_count,top_individuals, 234 | POPULATION_SIZE,NETWORK_WIDTH, 235 | OBSERVATION_SPACE, 236 | ACTION_SPACE, 237 | total_population_counter) 238 | print("@@@@ Adding Noise @@@@") 239 | add_mutations(all_individuals) 240 | 241 | 242 | 243 | #for i in range (len(all_individuals)): 244 | # all_individuals[i].printNetwork() 245 | -------------------------------------------------------------------------------- /NeuroEvolution/old/Main2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | import roboschool 5 | 6 | from keras.layers.advanced_activations import LeakyReLU, PReLU 7 | from keras.models import 
Sequential 8 | from keras.layers import Dense, Dropout 9 | from keras import optimizers 10 | 11 | from Lib.Individual import Individual 12 | ''' 13 | ENVIRONMENT_NAME = "RoboschoolAnt-v1" 14 | OBSERVATION_SPACE = 28 15 | ACTION_SPACE = 8 16 | ''' 17 | ENVIRONMENT_NAME = "RoboschoolHopper-v1" 18 | OBSERVATION_SPACE = 15 19 | ACTION_SPACE = 3 20 | 21 | B_DISCOUNT = 0.98 22 | 23 | POPULATION_SIZE = 15 24 | NETWORK_WIDTH = 32 25 | NETWORK_HIDDEN_LAYERS = 1 26 | NUM_TEST_EPISODES = 1 27 | NUM_SELECTED_FOR_REPRODUCTION = 2 28 | NOISE_SIGMA = 0.01 29 | MUTATION_PROB = 0.05 30 | 31 | MAX_GENERATIONS = 200000 32 | 33 | USE_GAUSSIAN_NOISE = False 34 | HAS_EARLY_TERMINATION_REWARD = False 35 | EARLY_TERMINATION_REWARD = -2 36 | CLIP_ACTIONS = False 37 | MAX_STEPS = 650 38 | 39 | all_individuals = [] 40 | generations_count = 0 41 | total_population_counter = 0 42 | #numLandings = 0 43 | 44 | 45 | 46 | 47 | 48 | '''---------ENVIRONMENT INITIALIZATION--------''' 49 | 50 | env = gym.make(ENVIRONMENT_NAME) 51 | #env.render(mode="human") 52 | env.reset() 53 | 54 | print("-- Observations",env.observation_space) 55 | print("-- actionspace",env.action_space) 56 | 57 | 58 | #initialize training matrix with random states and actions 59 | apdataX = np.random.random(( 5,OBSERVATION_SPACE )) 60 | apdataY = np.random.random((5,ACTION_SPACE)) 61 | 62 | 63 | '''---------------------''' 64 | 65 | def GetRememberedOptimalPolicy(targetModel,qstate): 66 | predX = np.zeros(shape=(1,OBSERVATION_SPACE)) 67 | predX[0] = qstate 68 | 69 | #print("trying to predict reward at qs_a", predX[0]) 70 | pred = targetModel.predict(predX[0].reshape(1,predX.shape[1])) 71 | r_remembered_optimal_policy = pred[0] 72 | return r_remembered_optimal_policy 73 | 74 | 75 | def create_model(network_width, network_hidden_layers, observation_space, action_space): 76 | action_predictor_model = Sequential() 77 | action_predictor_model.add(Dense(network_width, activation='relu', input_dim=observation_space)) 78 | for i in range(network_hidden_layers): 79 | action_predictor_model.add(Dense(network_width, activation='relu')) 80 | 81 | action_predictor_model.add(Dense(action_space)) 82 | return action_predictor_model 83 | 84 | def initialize_population(population_size,network_width,network_hidden_layers, observation_space, action_space, environment_name,total_population_counter): 85 | initial_population = [] 86 | for i in range (population_size): 87 | action_predictor_model = create_model(network_width,network_hidden_layers, observation_space, action_space) 88 | indiv = Individual(generationID=0, indivID=total_population_counter , network = action_predictor_model) 89 | total_population_counter += 1 90 | initial_population.append(indiv) 91 | return initial_population, total_population_counter 92 | 93 | def test_individual(indiv,num_test_episodes): 94 | indiv.lifeScore = 0 95 | allRewards = [] 96 | for i in range(num_test_episodes): 97 | episodeRewards = [] 98 | #print("episode "+str(i)+" performing test for indiv ",indiv.printme()) 99 | qs = env.reset() 100 | for step in range (5000): 101 | a = GetRememberedOptimalPolicy(indiv.network, qs) 102 | if CLIP_ACTIONS: 103 | for i in range (np.alen(a)): 104 | if a[i] < -1: a[i]=-0.99999999999 105 | if a[i] > 1: a[i] = 0.99999999999 106 | qs,r,done,info = env.step(a) 107 | if HAS_EARLY_TERMINATION_REWARD and done and step MAX_STEPS: 114 | done = True 115 | if done: 116 | episodeRewards.reverse() 117 | for j in range(len(episodeRewards)): 118 | #if j ==0: 119 | # print("last reward ",episodeRewards[j]) 120 | if j > 
0: 121 | episodeRewards[j] = episodeRewards[j] + B_DISCOUNT * episodeRewards[j-1] 122 | #avg = sum(episodeRewards)/len(episodeRewards) 123 | #print("episode average ", avg) 124 | for j in range(len(episodeRewards)): 125 | allRewards.append(episodeRewards[j]) 126 | #allRewards = allRewards + episodeRewards 127 | epAvg = sum(episodeRewards) / len(episodeRewards) 128 | allRewards.append(epAvg) 129 | #if epAvg >0: 130 | # numLandings = numLandings+1 131 | 132 | break 133 | #print("generationID",indiv.generationID,"IndivID",indiv.indivID,"episodeRewards rewards ",epAvg) 134 | 135 | avg = sum(allRewards) / len(allRewards) 136 | indiv.lifeScore = avg 137 | #indiv.lifeScore = np.random.rand(1)[0]*50 138 | print("generationID",indiv.generationID,"indivID - ",indiv.indivID,"numLandings ",0,"lifeScore =",indiv.lifeScore) 139 | 140 | 141 | def test_all_individuals(num_test_episodes): 142 | for i in range(len(all_individuals)): 143 | test_individual(all_individuals[i],NUM_TEST_EPISODES) 144 | 145 | 146 | def select_top_individuals(num_selected,population_size): 147 | scores = np.zeros(population_size) 148 | for i in range(np.alen(scores)): 149 | scores[i] = all_individuals[i].lifeScore 150 | 151 | print( scores ) 152 | topScores = scores[ scores.argsort()[-num_selected:][::-1] ] 153 | #print ("Top Scores ", topScores) 154 | selected_individuals = [] 155 | for i in range(len(all_individuals)): 156 | if all_individuals[i].lifeScore >= topScores.min(): 157 | #print("Selecting individual",i," with score ", all_individuals[i].lifeScore,"cuttoff ", topScores.min()) 158 | selected_individuals.append(all_individuals[i]) 159 | 160 | 161 | print("Selected individuals ") 162 | for i in range (len(selected_individuals)): 163 | print(selected_individuals[i].printme()) 164 | 165 | return selected_individuals 166 | 167 | # --- Parameter Noising 168 | 169 | def add_noise(mu,noiseSigma, largeNoise=False): 170 | 171 | if largeNoise: 172 | sig = noiseSigma 173 | else: 174 | #print("Adding Large parameter noise") 175 | sig = noiseSigma #Sigma = width of the standard deviaion 176 | #mu = means 177 | x = np.random.rand(1) #probability of doing x 178 | #print ("x prob ",x) 179 | if x >0.5: 180 | return mu + np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.))) 181 | else: 182 | return mu - np.exp(-np.power(x - mu, 2.) 
/ (2 * np.power(sig, 2.))) 183 | 184 | def add_noise_simple(mu,noiseSigma, largeNoise=False): 185 | x = np.random.rand(1) - 0.5 #probability of doing x 186 | if np.random.rand(1) < MUTATION_PROB: 187 | print("mutating") 188 | if not largeNoise: 189 | x = x*noiseSigma 190 | else: 191 | x = x*noiseSigma #Sigma = width of the standard deviaion 192 | else: 193 | x = 0 194 | #print ("x/200",x,"big_sigma",big_sigma) 195 | return mu + x 196 | 197 | 198 | add_noise_simple = np.vectorize(add_noise_simple,otypes=[np.float]) 199 | add_noise = np.vectorize(add_noise,otypes=[np.float]) 200 | 201 | def add_noise_to_model(targetModel,noiseSigma=NOISE_SIGMA,largeNoise = True): 202 | 203 | sz = len(targetModel.layers) 204 | #if largeNoise: 205 | # print("Setting Large Noise!") 206 | for k in range(sz): 207 | w = targetModel.layers[k].get_weights() 208 | if np.alen(w) >0 : 209 | #print("k==>",k) 210 | w[0] = add_noise_simple(w[0],noiseSigma,largeNoise) 211 | 212 | targetModel.layers[k].set_weights(w) 213 | return targetModel 214 | 215 | 216 | ''' MUTATIONS ''' 217 | def add_mutations(individuals,noiseSigma=NOISE_SIGMA): 218 | for i in range (len(individuals)): 219 | if i >2 and i%5==0: 220 | individuals[i].network = add_noise_to_model(individuals[i].network,noiseSigma*2,True) 221 | 222 | 223 | def populate_next_generation(generationID,top_individuals,population_size, network_width,network_hidden_layers, observation_space, action_space,total_population_counter): 224 | newPop = top_individuals 225 | for i in range( population_size - len(top_individuals)): 226 | newModel = create_model(network_width, network_hidden_layers, observation_space, action_space) 227 | model1 = top_individuals[0].network 228 | model2 = top_individuals[1].network 229 | sz = len(newModel.layers) 230 | #if largeNoise: 231 | # print("Setting Large Noise!") 232 | for k in range(sz): 233 | w = newModel.layers[k].get_weights() 234 | w1 = model1.layers[k].get_weights() 235 | w2 = model2.layers[k].get_weights() 236 | 237 | if np.alen(w) >0 : 238 | #print("k==>",k) 239 | #w[0][0] = combine_weights(w[0][0],w1[0][0],w2[0][0]) 240 | for j in range(np.alen(w[0])): 241 | y=w[0][j] 242 | y1 = w1[0][j] 243 | y2 = w2[0][j] 244 | for l in range (np.alen(y)): 245 | z=y[l] 246 | z1=y1[l] 247 | z2=y2[l] 248 | if np.random.rand(1)>0.5: 249 | z=z1+0.0 250 | else: 251 | z=z2+0.0 252 | y[l]=z 253 | w[0][j]=y 254 | 255 | newModel.layers[k].set_weights(w) 256 | top_individuals.append( Individual(generationID,total_population_counter,newModel) ) 257 | total_population_counter+=1 258 | return top_individuals,total_population_counter 259 | 260 | 261 | 262 | 263 | ''' ------------------''' 264 | 265 | all_individuals,total_population_counter = initialize_population(population_size=POPULATION_SIZE, 266 | network_width=NETWORK_WIDTH, 267 | network_hidden_layers = NETWORK_HIDDEN_LAYERS, 268 | observation_space=OBSERVATION_SPACE, 269 | action_space=ACTION_SPACE, 270 | environment_name=ENVIRONMENT_NAME, 271 | total_population_counter=total_population_counter) 272 | 273 | 274 | for gens in range (MAX_GENERATIONS): 275 | test_all_individuals(NUM_TEST_EPISODES) 276 | top_individuals = select_top_individuals(NUM_SELECTED_FOR_REPRODUCTION,POPULATION_SIZE) 277 | generations_count += 1 278 | print("Generating next Gen ",generations_count) 279 | all_individuals,total_population_counter = populate_next_generation(generations_count,top_individuals, 280 | POPULATION_SIZE,NETWORK_WIDTH, NETWORK_HIDDEN_LAYERS, 281 | OBSERVATION_SPACE, 282 | ACTION_SPACE, 283 | 
total_population_counter) 284 | #print("@@@@ Adding Noise @@@@") 285 | add_mutations(all_individuals) 286 | 287 | 288 | 289 | #for i in range (len(all_individuals)): 290 | # all_individuals[i].printNetwork() 291 | -------------------------------------------------------------------------------- /OptimalPolicyTreeSearch/Cartpole_OPTS.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cartpole solution by Michel Aka the Author of the Fit Machine Learning Blog 3 | 4 | Demo here 5 | https://www.youtube.com/watch?v=TguWjWvRp8c 6 | ''' 7 | import numpy as np 8 | import keras 9 | import gym 10 | import os 11 | import h5py 12 | 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout 15 | from keras.layers import Embedding 16 | from keras.layers import LSTM 17 | from keras import optimizers 18 | 19 | 20 | num_env_variables = 4 21 | num_env_actions = 1 22 | num_training_exmaples = 100 23 | timesteps = 1 24 | num_initial_observation = 4000 25 | 26 | #recommend 850 for tensorflow backend , but 500 should be enough for theano backend 27 | training_epochs = 850 28 | num_anticipation_steps = 6 29 | load_previous_weights = True 30 | observe_and_train = False 31 | save_weights = False 32 | 33 | 34 | #Create testing enviroment 35 | env = gym.make('CartPole-v0') 36 | env.reset() 37 | 38 | #initialize training matrix with random states and actions 39 | dataX = np.random.random(( num_training_exmaples,num_env_variables+num_env_actions )) 40 | #initize training matrix corresponding expected states and expected rewards (random) 41 | dataY = np.random.random((num_training_exmaples,num_env_variables+1)) 42 | 43 | 44 | 45 | #nitialize the LSTM with random weights 46 | 47 | model = Sequential() 48 | model.add(LSTM(16,return_sequences=True, stateful=True , batch_size=1, input_shape=(timesteps, dataX.shape[1]))) 49 | model.add(LSTM(16, return_sequences=True)) 50 | model.add(Dense(16, activation='relu')) 51 | model.add(Dense(dataY.shape[1])) 52 | 53 | opt = optimizers.adam(lr=0.01) 54 | 55 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 56 | 57 | #load previous model weights if they exist 58 | if load_previous_weights: 59 | dir_path = os.path.realpath(".") 60 | fn = dir_path + "/CP-weights.h5" 61 | print("filepath ", fn) 62 | if os.path.isfile(fn): 63 | print("loading weights") 64 | model.load_weights("CP-weights.h5") 65 | else: 66 | print("File CP-weights.h5 does not exis. Retraining... 
") 67 | 68 | #Record first 500 in a sequence and add them to the training sequence 69 | total_steps = 0 70 | dataX = np.zeros(shape=(1,5)) 71 | dataY = np.zeros(shape=(1,5)) 72 | 73 | print("dataX shape", dataX.shape) 74 | print("dataY shape", dataY.shape) 75 | 76 | if observe_and_train: 77 | #observe for 100 games 78 | for game in range(100): 79 | 80 | if total_steps >= num_initial_observation: 81 | break 82 | #Get the Q state 83 | qs = env.reset() 84 | for step in range (200): 85 | a=0 86 | if np.random.rand(1) < 0.5: 87 | a=0 88 | else: 89 | a=1 90 | env.render() 91 | qs_a = np.concatenate((qs,np.array([a])), axis=0) 92 | 93 | #get the target state and reward 94 | s,r,done,info = env.step(a) 95 | 96 | #set reward in case of failure 97 | if done: 98 | r = -1 99 | 100 | #concatenate target state and reward 101 | s_r = np.concatenate((s,np.array([r])), axis=0) 102 | 103 | if done: 104 | #print negative reward array 105 | print("Negative reward s_r: ", s_r) 106 | 107 | #print("reward = ", r) 108 | #print("target state", s) 109 | #print("concatenate(s,r)", s_r) 110 | 111 | 112 | #record only the first x number of states 113 | if total_steps ==0: 114 | dataX[0] = qs_a 115 | dataY[0] = s_r 116 | 117 | if total_steps < (num_initial_observation-1): 118 | dataX = np.vstack((dataX,qs_a)) 119 | dataY = np.vstack((dataY,s_r)) 120 | 121 | #Update the states 122 | qs=s 123 | 124 | 125 | total_steps += 1 126 | if done : 127 | break 128 | 129 | print("Observation complete. - Begin LSTM training") 130 | 131 | print("dataX shape", dataX.shape) 132 | print(dataX[0:5]) 133 | print("dataY shape", dataY.shape) 134 | print(dataY[0:5]) 135 | 136 | feedX = np.reshape(dataX, (dataX.shape[0], 1, dataX.shape[1] )) 137 | feedY = np.reshape(dataY, (dataY.shape[0], 1, dataY.shape[1] )) 138 | 139 | 140 | #The more epochs you train the model, the better is becomes at predicting future states 141 | #This in turn will improve the results of the Bellman equation and thus will lead us to 142 | # better decisions in our MDP process 143 | model.fit(feedX,feedY, batch_size=1,epochs=training_epochs,verbose=2) 144 | 145 | print("total_steps ", total_steps) 146 | print("dataX ", dataX[0:10]) 147 | print("dataY ", dataY[0:10]) 148 | #print("dataY ", dataY) 149 | 150 | dataX = np.random.random((1,5)) 151 | 152 | res = model.predict(dataX[0].reshape(1,1,dataX.shape[1])) 153 | nstate = res[0][0][:-1] 154 | 155 | print("predicted output ", res) 156 | print("expected reward ", res[0][0][4]) 157 | print("expected state ", nstate) 158 | 159 | def estimateReward(qstate,action, depth): 160 | if depth <= 0: 161 | return 0 162 | #calculate/estimate reward at this state and get the next state 163 | qs_a = np.concatenate((qstate,np.array([action])), axis=0) 164 | predX = np.zeros(shape=(1,5)) 165 | predX[0] = qs_a 166 | pred = model.predict(predX[0].reshape(1,1,predX.shape[1])) 167 | reward = pred[0][0][4] 168 | expected_state = pred[0][0][:-1] 169 | 170 | ''' 171 | print("depth -- ", depth) 172 | print("qstate", qstate) 173 | print("action", action) 174 | print("pred", pred) 175 | print("expected_state", expected_state) 176 | print("reward", reward) 177 | ''' 178 | # Bellman -- reward at this state = reward + Sum of discounted expected rewards for all actions (recursively) 179 | #recursively calculate the reward at future states for all possible actions 180 | discounted_future_rewards = 0.95*estimateReward(expected_state,0,depth-1)+ 0.95*estimateReward(expected_state,1,depth-1) 181 | 182 | #print("discounted_future_rewards", 
discounted_future_rewards) 183 | #add current state and discounted future state reward 184 | return reward + discounted_future_rewards 185 | 186 | 187 | print("** Estimating reward for dataX[0] with action 1 usint Bellman", estimateReward(dataX[0][:-1],1,2)) 188 | print("** Estimating reward for dataX[0] with action 0 usint Bellman", estimateReward(dataX[0][:-1],0,2)) 189 | 190 | 191 | 192 | ##### 193 | ##### 194 | #Play the game for X rounds using the Bellman with LSTM anticipation model 195 | 196 | 197 | for game in range(3): 198 | total_steps =0 199 | #Get the Q state 200 | qs = env.reset() 201 | for step in range (300): 202 | ##chose an action by estimating consequences of actions for the next num_anticipation_steps steps ahead 203 | #works best with looking 6 steps ahead 204 | #Also works best if you train the model more itterations 205 | estimated_anticipated_reward_a = estimateReward(qs,1,num_anticipation_steps) 206 | estimated_anticipated_reward_b = estimateReward(qs,0,num_anticipation_steps) 207 | #print(" estimated rewards a and b", estimated_anticipated_reward_a, estimated_anticipated_reward_b) 208 | 209 | #chose argmax action of estimated anticipated rewards 210 | if estimated_anticipated_reward_a > estimated_anticipated_reward_b: 211 | a = 1 212 | else: 213 | a = 0 214 | 215 | env.render() 216 | 217 | 218 | #get the target state and reward 219 | s,r,done,info = env.step(a) 220 | 221 | 222 | 223 | qs=s 224 | #set reward in case of failure 225 | if done: 226 | r = -1 227 | if total_steps >= 198: 228 | print("*** Game Won after ", total_steps, " steps") 229 | else: 230 | print("** failed after ", total_steps, " steps") 231 | 232 | 233 | total_steps += 1 234 | if done : 235 | break 236 | 237 | if save_weights: 238 | #Save model 239 | print("Saving weights") 240 | model.save_weights("CP-weights.h5") 241 | -------------------------------------------------------------------------------- /ParameterNoising/NoisingFunction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import gym 4 | 5 | import pygal 6 | import os 7 | import h5py 8 | import matplotlib.pyplot as plt 9 | import math 10 | 11 | from keras.layers.advanced_activations import LeakyReLU, PReLU 12 | from keras.models import Sequential 13 | from keras.layers import Dense, Dropout 14 | from keras.layers import Embedding 15 | from keras import optimizers 16 | 17 | 18 | #nitialize the Reward predictor model 19 | Qmodel = Sequential() 20 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 21 | Qmodel.add(Dense(32, activation='relu', input_dim=1)) 22 | Qmodel.add(Dropout(0.5)) 23 | Qmodel.add(Dense(2, activation='relu')) 24 | #Qmodel.add(Dropout(0.5)) 25 | #Qmodel.add(Dense(256, activation='tanh')) 26 | #Qmodel.add(Dropout(0.5)) 27 | #Qmodel.add(Dense(256, activation='relu')) 28 | #Qmodel.add(Dropout(0.5)) 29 | #Qmodel.add(Dense(512, activation='relu')) 30 | #Qmodel.add(Dropout(0.2)) 31 | #Qmodel.add(Dense(256, activation='relu')) 32 | #Qmodel.add(Dropout(0.2)) 33 | 34 | Qmodel.add(Dense(1)) 35 | #opt = optimizers.adam(lr=learning_rate) 36 | opt = optimizers.RMSprop() 37 | Qmodel.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 38 | 39 | 40 | def add_noise(mu): 41 | sig = 0.15 #Sigma = width of the standard deviaion 42 | #mu = means 43 | x = np.random.rand(1) #probability of doing x 44 | return mu + np.exp(-np.power(x - mu, 2.) 
/ (2 * np.power(sig, 2.))) 45 | 46 | add_noise = np.vectorize(add_noise,otypes=[np.float]) 47 | 48 | def add_noise_to_model(model_to_scramble): 49 | sz = len(model_to_scramble.layers) 50 | for k in range(sz): 51 | w = model_to_scramble.layers[k].get_weights() 52 | print("w ==>",w) 53 | if np.alen(w) >0: 54 | w[0] = add_noise(w[0]) 55 | print("w / noise ==>",w) 56 | model_to_scramble.layers[k].set_weights(w ) 57 | 58 | 59 | 60 | print("end") 61 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Load_AC_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | 4 | import torch 5 | import gym 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | import numpy as np 10 | from dataclasses import dataclass 11 | from typing import Any 12 | from random import random 13 | from agent_and_model import sars,DQNAgent,CriticModel,ActorModel, ReplayBuffer 14 | 15 | 16 | 17 | 18 | 19 | 20 | def get_one_hot(action,n_dim): 21 | retval = np.zeros(n_dim) 22 | retval[action] = 1.0 23 | return retval 24 | 25 | 26 | def train_actor(actor_model, critic_model, state_transitions, num_actor_training_samples, num_actions): 27 | #for each observation get the critic to generate the Q value corresponding to each action_space 28 | #retain action observation pairs corresponding to the highest Q values 29 | #train the actor to converge towards that set 30 | 31 | #Generate random actions 32 | random_actions = [] 33 | for i in range(num_actor_training_samples): 34 | random_actions.append( np.random.rand(num_actions)*2-1 ) 35 | #Get random observations 36 | random_states = [s.state for s in state_transitions] 37 | 38 | # import ipdb; ipdb.set_trace() 39 | 40 | # for earch state add the best corresponding action to random actions 41 | for i in range(len(random_states)): 42 | with torch.no_grad(): 43 | act = actor_model(torch.Tensor(random_states[i]).to(actor_model.device)) .cpu().detach().numpy() 44 | random_actions.append(act) 45 | 46 | 47 | 48 | best_state_action = [] 49 | for i_states in range(len(random_states)): 50 | QAs = [] 51 | 52 | # get the Qvalues from the random actions 53 | for i_actions in range(len(random_actions)): 54 | with torch.no_grad(): 55 | qval = critic_model( torch.Tensor( torch.cat( (torch.Tensor(random_states[i_states]),torch.Tensor(random_actions[i_actions])),0 ) ).to(critic_model.device) ).cpu() 56 | QAs.append( qval ) 57 | # get index for best actions between all random actions and the actor's predicted actions 58 | #_sars = sars(observation,action,reward,observation_next,done,0.0) 59 | best_state_action.append(sars(random_states[i_states], random_actions[np.argmax(QAs)],0.0,None,False,np.max(QAs) )) 60 | # import ipdb;ipdb.set_trace() 61 | 62 | t_random_states = torch.stack( ([torch.Tensor(s.state) for s in best_state_action]) ).to(actor_model.device) 63 | target_actions = torch.stack( ([torch.Tensor(s.action) for s in best_state_action]) ).to(actor_model.device) 64 | actor_model.zero_grad() 65 | predicted_actions = actor_model(t_random_states) 66 | 67 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values.view(-1) ) 68 | loss = F.smooth_l1_loss(predicted_actions, target_actions ).mean() 69 | loss.backward() 70 | actor_model.opt.step() 71 | return loss 72 | 73 | def train_critic(critic_model, state_transitions, num_actions): 74 | if 
len(state_transitions) <=0: 75 | print("empty state transitions") 76 | return 77 | 78 | 79 | cur_states = torch.stack( ([torch.Tensor(torch.cat((torch.Tensor(s.state),torch.Tensor(s.action)),0)) for s in state_transitions]) ).to(critic_model.device) 80 | 81 | 82 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(critic_model.device) 83 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(critic_model.device) 84 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(critic_model.device) 85 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in state_transitions]) ).to(critic_model.device) 86 | actions = [s.action for s in state_transitions] 87 | # import ipdb; ipdb.set_trace() 88 | with torch.no_grad(): 89 | actual_Q_values = Qs 90 | # pred_qvals_next = critic_model(next_states)[0] 91 | critic_model.opt.zero_grad() 92 | pred_qvals = critic_model(cur_states) 93 | 94 | # one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 95 | # loss = torch.mean(torch.sqrt((torch.sum(pred_qvals*one_hot_actions,-1) - actual_Q_values.view(-1) )**2)).to(model.device) 96 | loss = F.smooth_l1_loss(pred_qvals.view(-1), actual_Q_values.view(-1) ) 97 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals,-1), rewards.view(-1)+0.98*mask[:,0]*pred_qvals_next.view(-1) ).mean() 98 | loss.backward() 99 | critic_model.opt.step() 100 | return loss 101 | 102 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 103 | for i in range(episode_len): 104 | # if(step_counter > buffer_size): 105 | # import ipdb; ipdb.set_trace() 106 | index = episode_len-i 107 | next_index = index+1 108 | if i==0: 109 | replay_buffer[index].qval = replay_buffer[index].reward 110 | if(step_counter%2000==0): 111 | print("i",i,"q ",replay_buffer[index].qval) 112 | else: 113 | replay_buffer[index].qval = replay_buffer[index].reward + 0.98 * replay_buffer[next_index].qval 114 | if(step_counter%2000==0): 115 | print("i",i,"q ",replay_buffer[index].qval) 116 | return replay_buffer 117 | 118 | 119 | 120 | 121 | if __name__=='__main__': 122 | DEBUGER_ON = True 123 | NUM_GAMES = 100 124 | MAX_EPISODE_STEPS = 10000 125 | TARGET_MODEL_UPDATE_INTERVAL = 50 126 | EPSILON_MIN = 0.05 127 | EPSILON_START = 0.5 128 | EPSLILON_COUNT = 6000 #Games 129 | INITIAL_RANDOM_STEPS = 5000 130 | RANDOM_GAME_EVERY = 20 131 | TRAIN_CRITIC_EVERY_N_STEP = 300 132 | CRITIC_TRAINING_SAMPLE_SIZE = 256 133 | TRAIN_ACTOR_EVERY_N_GAME = 1 134 | ACTOR_TRAINING_SAMPLE_SIZE = 8 135 | NUM_ACTOR_TRAINING_SAMPLES = 40 136 | TRAINING_ITTERATIONS = 1 137 | NUM_ACTOR_TRAINING_SAMPLES = 128 138 | PRINT_EVERY = 1 139 | RENDER_ENV = True 140 | LOAD_MODEL = True 141 | SAVE_MODEL = False 142 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 143 | MODEL_ID = "01" 144 | SAVE_MODEL_EVERY = 25 145 | 146 | epsilon = EPSILON_START 147 | env = gym.make('LunarLanderContinuous-v2') 148 | # env = gym.make('BipedalWalker-v3') 149 | 150 | observation = env.reset() 151 | print("env action space ", env.action_space.shape) 152 | am = ActorModel(env.observation_space.shape,env.action_space.shape,lr=0.008) 153 | cm = CriticModel(env.observation_space.shape,env.action_space.shape,lr=0.01) 154 | agent = DQNAgent( am , cm ) 155 | # import ipdb;ipdb.set_trace() 156 | 157 | if LOAD_MODEL: 158 | agent.actor_model = torch.load("A2C_actor"+MODEL_ID+MODEL_FILE_NAME) 159 | agent.critic_model = torch.load("A2C_critic"+MODEL_ID+MODEL_FILE_NAME) 160 | 161 | 
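    # switch the restored actor and critic to inference mode; this loader script only replays the saved policy and never trains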
agent.actor_model.eval() 162 | agent.critic_model.eval() 163 | 164 | step_counter = 0 165 | last_step_count = 0 166 | 167 | 168 | action = [] 169 | for game in range (NUM_GAMES): 170 | episode_sars = [] 171 | score = 0 172 | for step in range (MAX_EPISODE_STEPS): 173 | if RENDER_ENV: 174 | env.render() 175 | 176 | if random()<-0.1: 177 | action = env.action_space.sample() 178 | else: 179 | # import ipdb; ipdb.set_trace() 180 | action = agent.get_actions(observation).cpu().detach().numpy() 181 | # print("action ", action) 182 | observation_next, reward, done, info = env.step(action) 183 | score += reward 184 | 185 | observation = observation_next 186 | step_counter+=1 187 | last_step_count = step 188 | if done: 189 | 190 | break 191 | 192 | observation = env.reset() 193 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 194 | if (game%PRINT_EVERY==0): 195 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 196 | avg_reward = [] 197 | # print("epsilon ", epsilon) 198 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Output_noising/Readme.md: -------------------------------------------------------------------------------- 1 | Actor Critic implementation with Actor Noising. 2 | The idea behind Actor noising is as follows, during training of the actor, in addition to randomly generated actions, 3 | a set of action is also generated that correspond to the actor's predicted output where gaussian noise has been added. 4 | 5 | This idea, similar to parameter noising, adds significan performance improvement over simple actor critic method. 6 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Parameter_Noising/Load_AC_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | 4 | import torch 5 | import gym 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | import numpy as np 10 | from dataclasses import dataclass 11 | from typing import Any 12 | from random import random 13 | from agent_and_model import sars,DQNAgent,CriticModel,ActorModel, ReplayBuffer 14 | 15 | 16 | 17 | 18 | 19 | 20 | def get_one_hot(action,n_dim): 21 | retval = np.zeros(n_dim) 22 | retval[action] = 1.0 23 | return retval 24 | 25 | 26 | def train_actor(actor_model, critic_model, state_transitions, num_actor_training_samples, num_actions): 27 | #for each observation get the critic to generate the Q value corresponding to each action_space 28 | #retain action observation pairs corresponding to the highest Q values 29 | #train the actor to converge towards that set 30 | 31 | #Generate random actions 32 | random_actions = [] 33 | for i in range(num_actor_training_samples): 34 | random_actions.append( np.random.rand(num_actions)*2-1 ) 35 | #Get random observations 36 | random_states = [s.state for s in state_transitions] 37 | 38 | # import ipdb; ipdb.set_trace() 39 | 40 | # for earch state add the best corresponding action to random actions 41 | for i in range(len(random_states)): 42 | with torch.no_grad(): 43 | act = actor_model(torch.Tensor(random_states[i]).to(actor_model.device)) .cpu().detach().numpy() 44 | random_actions.append(act) 45 | 46 | 47 | 48 | best_state_action = [] 49 | for i_states in range(len(random_states)): 50 | QAs = [] 51 | 52 | # get the 
Qvalues from the random actions 53 | for i_actions in range(len(random_actions)): 54 | with torch.no_grad(): 55 | qval = critic_model( torch.Tensor( torch.cat( (torch.Tensor(random_states[i_states]),torch.Tensor(random_actions[i_actions])),0 ) ).to(critic_model.device) ).cpu() 56 | QAs.append( qval ) 57 | # get index for best actions between all random actions and the actor's predicted actions 58 | #_sars = sars(observation,action,reward,observation_next,done,0.0) 59 | best_state_action.append(sars(random_states[i_states], random_actions[np.argmax(QAs)],0.0,None,False,np.max(QAs) )) 60 | # import ipdb;ipdb.set_trace() 61 | 62 | t_random_states = torch.stack( ([torch.Tensor(s.state) for s in best_state_action]) ).to(actor_model.device) 63 | target_actions = torch.stack( ([torch.Tensor(s.action) for s in best_state_action]) ).to(actor_model.device) 64 | actor_model.zero_grad() 65 | predicted_actions = actor_model(t_random_states) 66 | 67 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values.view(-1) ) 68 | loss = F.smooth_l1_loss(predicted_actions, target_actions ).mean() 69 | loss.backward() 70 | actor_model.opt.step() 71 | return loss 72 | 73 | def train_critic(critic_model, state_transitions, num_actions): 74 | if len(state_transitions) <=0: 75 | print("empty state transitions") 76 | return 77 | 78 | 79 | cur_states = torch.stack( ([torch.Tensor(torch.cat((torch.Tensor(s.state),torch.Tensor(s.action)),0)) for s in state_transitions]) ).to(critic_model.device) 80 | 81 | 82 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(critic_model.device) 83 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(critic_model.device) 84 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(critic_model.device) 85 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in state_transitions]) ).to(critic_model.device) 86 | actions = [s.action for s in state_transitions] 87 | # import ipdb; ipdb.set_trace() 88 | with torch.no_grad(): 89 | actual_Q_values = Qs 90 | # pred_qvals_next = critic_model(next_states)[0] 91 | critic_model.opt.zero_grad() 92 | pred_qvals = critic_model(cur_states) 93 | 94 | # one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 95 | # loss = torch.mean(torch.sqrt((torch.sum(pred_qvals*one_hot_actions,-1) - actual_Q_values.view(-1) )**2)).to(model.device) 96 | loss = F.smooth_l1_loss(pred_qvals.view(-1), actual_Q_values.view(-1) ) 97 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals,-1), rewards.view(-1)+0.98*mask[:,0]*pred_qvals_next.view(-1) ).mean() 98 | loss.backward() 99 | critic_model.opt.step() 100 | return loss 101 | 102 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 103 | for i in range(episode_len): 104 | # if(step_counter > buffer_size): 105 | # import ipdb; ipdb.set_trace() 106 | index = episode_len-i 107 | next_index = index+1 108 | if i==0: 109 | replay_buffer[index].qval = replay_buffer[index].reward 110 | if(step_counter%2000==0): 111 | print("i",i,"q ",replay_buffer[index].qval) 112 | else: 113 | replay_buffer[index].qval = replay_buffer[index].reward + 0.98 * replay_buffer[next_index].qval 114 | if(step_counter%2000==0): 115 | print("i",i,"q ",replay_buffer[index].qval) 116 | return replay_buffer 117 | 118 | 119 | 120 | 121 | if __name__=='__main__': 122 | DEBUGER_ON = True 123 | NUM_GAMES = 100 124 | MAX_EPISODE_STEPS = 10000 125 | TARGET_MODEL_UPDATE_INTERVAL = 
50 126 | EPSILON_MIN = 0.05 127 | EPSILON_START = 0.5 128 | EPSLILON_COUNT = 6000 #Games 129 | INITIAL_RANDOM_STEPS = 5000 130 | RANDOM_GAME_EVERY = 20 131 | TRAIN_CRITIC_EVERY_N_STEP = 300 132 | CRITIC_TRAINING_SAMPLE_SIZE = 256 133 | TRAIN_ACTOR_EVERY_N_GAME = 1 134 | ACTOR_TRAINING_SAMPLE_SIZE = 8 135 | NUM_ACTOR_TRAINING_SAMPLES = 40 136 | TRAINING_ITTERATIONS = 1 137 | NUM_ACTOR_TRAINING_SAMPLES = 128 138 | PRINT_EVERY = 1 139 | RENDER_ENV = True 140 | LOAD_MODEL = True 141 | SAVE_MODEL = False 142 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 143 | MODEL_ID = "01" 144 | SAVE_MODEL_EVERY = 25 145 | 146 | epsilon = EPSILON_START 147 | env = gym.make('LunarLanderContinuous-v2') 148 | # env = gym.make('BipedalWalker-v3') 149 | 150 | observation = env.reset() 151 | print("env action space ", env.action_space.shape) 152 | am = ActorModel(env.observation_space.shape,env.action_space.shape,lr=0.008) 153 | cm = CriticModel(env.observation_space.shape,env.action_space.shape,lr=0.01) 154 | agent = DQNAgent( am , cm ) 155 | # import ipdb;ipdb.set_trace() 156 | 157 | if LOAD_MODEL: 158 | agent.actor_model = torch.load("A2C_actor"+MODEL_ID+MODEL_FILE_NAME) 159 | agent.critic_model = torch.load("A2C_critic"+MODEL_ID+MODEL_FILE_NAME) 160 | 161 | agent.actor_model.eval() 162 | agent.critic_model.eval() 163 | 164 | step_counter = 0 165 | last_step_count = 0 166 | 167 | 168 | action = [] 169 | for game in range (NUM_GAMES): 170 | episode_sars = [] 171 | score = 0 172 | for step in range (MAX_EPISODE_STEPS): 173 | if RENDER_ENV: 174 | env.render() 175 | 176 | if random()<-0.1: 177 | action = env.action_space.sample() 178 | else: 179 | # import ipdb; ipdb.set_trace() 180 | action = agent.get_actions(observation).cpu().detach().numpy() 181 | # print("action ", action) 182 | observation_next, reward, done, info = env.step(action) 183 | score += reward 184 | 185 | observation = observation_next 186 | step_counter+=1 187 | last_step_count = step 188 | if done: 189 | 190 | break 191 | 192 | observation = env.reset() 193 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 194 | if (game%PRINT_EVERY==0): 195 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 196 | avg_reward = [] 197 | # print("epsilon ", epsilon) 198 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Parameter_Noising/Readme.md: -------------------------------------------------------------------------------- 1 | Actor Critic implementation with Parameter noising 2 | 3 | Traing the agent with *python Advantage_Actor_Critic.py*. This saves the agent every 10 episodes. 4 | View the saved agent with *python Load_AC_model.py". 
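For reference, below is a minimal sketch of the kind of parameter perturbation this implementation relies on: zero-mean Gaussian noise added to a copy of the actor's weights before collecting a rollout (the PyTorch counterpart of the Keras *add_noise_to_model* helpers used elsewhere in this repo). The helper name *perturb_actor* and the sigma value are illustrative and are not the exact code in *Advantage_Actor_Critic.py*.

```python
# Illustrative sketch only -- not the exact implementation in Advantage_Actor_Critic.py.
import copy
import torch
import torch.nn as nn

NOISE_SIGMA = 0.01  # assumed value; tune per environment

def perturb_actor(actor_model: nn.Module, noise_sigma: float = NOISE_SIGMA) -> nn.Module:
    """Return a noisy copy of the actor; the original network is left untouched."""
    noisy_actor = copy.deepcopy(actor_model)
    with torch.no_grad():
        for param in noisy_actor.parameters():
            # add zero-mean Gaussian noise directly to the weights (exploration in parameter space)
            param.add_(torch.randn_like(param) * noise_sigma)
    return noisy_actor
```

Episodes collected with the noisy copy explore in parameter space, while the unperturbed actor remains the network that is trained and saved every 10 episodes.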
5 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Parameter_Noising/agent_and_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | 14 | @dataclass 15 | class sars: 16 | state: Any 17 | action: Any 18 | reward: float 19 | next_state: Any 20 | done: bool 21 | qval: float 22 | advantage: float = 0.0 23 | 24 | class DQNAgent: 25 | def __init__(self,actor_model,critic_model): 26 | self.actor_model = actor_model 27 | self.critic_model = critic_model 28 | 29 | def get_actions(self, observations): 30 | # import ipdb; ipdb.set_trace() 31 | guessed_actions = self.actor_model(torch.Tensor(observations).to(self.actor_model.device)) 32 | return guessed_actions 33 | 34 | def get_predicted_Q_values(self,observation_and_action): 35 | guessed_Qs = self.critic_model(torch.Tensor(observation_and_action)) 36 | return guessed_Qs(-1)[1] 37 | 38 | def update_target_model(self): 39 | self.targetModel.load_state_dict(self.model.state_dict()) 40 | 41 | class ActorModel(nn.Module): 42 | def __init__(self, obs_shape, action_shape,lr): 43 | super(ActorModel,self).__init__() 44 | assert len(obs_shape) ==1, "This network only works on flat observations" 45 | self.obs_shape = obs_shape 46 | self.action_shape = action_shape 47 | 48 | # import ipdb; ipdb.set_trace() 49 | self.net = torch.nn.Sequential( 50 | torch.nn.Linear(obs_shape[0],1024), 51 | torch.nn.ReLU(), 52 | torch.nn.Linear(1024,512), 53 | torch.nn.ReLU(), 54 | torch.nn.Linear(512,256), 55 | torch.nn.ReLU(), 56 | torch.nn.Linear(256,128), 57 | torch.nn.ReLU(), 58 | torch.nn.Linear(128,action_shape[0]) 59 | ) 60 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 61 | if torch.cuda.is_available(): 62 | print("Using CUDA") 63 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 64 | self.to(self.device) 65 | 66 | def forward(self, x): 67 | return self.net(x) 68 | 69 | 70 | class CriticModel(nn.Module): 71 | def __init__(self, obs_shape, action_shape,lr): 72 | super(CriticModel,self).__init__() 73 | assert len(obs_shape) ==1, "This network only works on flat observations" 74 | self.obs_shape = obs_shape 75 | self.action_shape = action_shape 76 | 77 | self.net = torch.nn.Sequential( 78 | torch.nn.Linear(obs_shape[0]+action_shape[0],1024), 79 | torch.nn.ReLU(), 80 | torch.nn.Linear(1024,512), 81 | torch.nn.ReLU(), 82 | torch.nn.Linear(512,256), 83 | torch.nn.ReLU(), 84 | torch.nn.Linear(256,128), 85 | torch.nn.ReLU(), 86 | torch.nn.Linear(128,1) # one out put because we are predicting Q values 87 | ) 88 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 89 | if torch.cuda.is_available(): 90 | print("Using CUDA") 91 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 92 | self.to(self.device) 93 | 94 | def forward(self, x): 95 | return self.net(x) 96 | 97 | class ReplayBuffer: 98 | def __init__(self, buffer_size = 1000): 99 | # self.buffer_size = buffer_size 100 | self.buffer_size = buffer_size 101 | self.buffer = np.empty((buffer_size),dtype=object) 102 | 103 | # self.buffer = [] 104 | self.index = 0 105 | 106 | def insert(self, sars): 107 | # self.buffer.append(sars) 108 | # 
print("inserting index ", self.index, "@",self.index%self.buffer_size) 109 | if(self.index == 10): 110 | print("first 10 ",self.buffer[0:10]) 111 | # import ipdb; ipdb.set_trace() 112 | 113 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 114 | # print("first 10 ",self.buffer[0:10]) 115 | # print("last 10 ",self.buffer[-10:]) 116 | # print("") 117 | # import ipdb; ipdb.set_trace() 118 | self.buffer[self.index%self.buffer_size] = sars 119 | self.index+=1 120 | # self.buffer.append(sars) 121 | # if(len(self.buffer)>self.buffer_size): 122 | # self.buffer = self.buffer[1:] 123 | # # print("Clipping Buffer at size", len(self.buffer)) 124 | 125 | def sample(self, num_samples,current_episode_steps): 126 | # assert num_samples < min(len(self.buffer),self.index) 127 | # if num_samples>self.index: 128 | # print("sampling n ",min(num_samples,self.index)) 129 | a = self.buffer[0:min(self.index,self.buffer_size)] 130 | if len(self.buffer) > 0: 131 | return np.random.choice(a, min(num_samples,self.index)) 132 | else: 133 | return [] 134 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/Readme.md: -------------------------------------------------------------------------------- 1 | Implementation of Actor Critic RL argorithm by. 2 | To Train run **python ActorCritic.py** 3 | This will save your trained model in a local directory every 50 game 4 | to view the model in action run **python load_AC_model.py**, note that if you run load_AC_model.py first, it will automatically load the saved agents. I have included a version of the saved actor network (optimal policy) in the project folder. 5 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/actor01TDQN_RL_MODEL.trl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/Pytorch/ActorCritic/actor01TDQN_RL_MODEL.trl -------------------------------------------------------------------------------- /Pytorch/ActorCritic/agent_and_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | 14 | @dataclass 15 | class sars: 16 | state: Any 17 | action: Any 18 | reward: float 19 | next_state: Any 20 | done: bool 21 | qval: float 22 | advantage: float = 0.0 23 | 24 | class DQNAgent: 25 | def __init__(self,actor_model,critic_model): 26 | self.actor_model = actor_model 27 | self.critic_model = critic_model 28 | 29 | def get_actions(self, observations): 30 | # import ipdb; ipdb.set_trace() 31 | guessed_actions = self.actor_model(torch.Tensor(observations).to(self.actor_model.device)) 32 | return guessed_actions 33 | 34 | def get_predicted_Q_values(self,observation_and_action): 35 | guessed_Qs = self.critic_model(torch.Tensor(observation_and_action)) 36 | return guessed_Qs(-1)[1] 37 | 38 | def update_target_model(self): 39 | self.targetModel.load_state_dict(self.model.state_dict()) 40 | 41 | class ActorModel(nn.Module): 42 | def __init__(self, obs_shape, action_shape,lr): 43 | super(ActorModel,self).__init__() 44 | assert len(obs_shape) ==1, "This network 
only works on flat observations" 45 | self.obs_shape = obs_shape 46 | self.action_shape = action_shape 47 | 48 | # import ipdb; ipdb.set_trace() 49 | self.net = torch.nn.Sequential( 50 | torch.nn.Linear(obs_shape[0],512), 51 | torch.nn.ReLU(), 52 | # torch.nn.Linear(1024,256), 53 | # torch.nn.ReLU(), 54 | torch.nn.Linear(512,action_shape[0]) 55 | ) 56 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 57 | if torch.cuda.is_available(): 58 | print("Using CUDA") 59 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 60 | self.to(self.device) 61 | 62 | def forward(self, x): 63 | return self.net(x) 64 | 65 | 66 | class CriticModel(nn.Module): 67 | def __init__(self, obs_shape, action_shape,lr): 68 | super(CriticModel,self).__init__() 69 | assert len(obs_shape) ==1, "This network only works on flat observations" 70 | self.obs_shape = obs_shape 71 | self.action_shape = action_shape 72 | 73 | self.net = torch.nn.Sequential( 74 | torch.nn.Linear(obs_shape[0]+action_shape[0],512), 75 | torch.nn.ReLU(), 76 | # torch.nn.Linear(2048,512), 77 | # torch.nn.ReLU(), 78 | torch.nn.Linear(512,1) # one out put because we are predicting Q values 79 | ) 80 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 81 | if torch.cuda.is_available(): 82 | print("Using CUDA") 83 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 84 | self.to(self.device) 85 | 86 | def forward(self, x): 87 | return self.net(x) 88 | 89 | class ReplayBuffer: 90 | def __init__(self, buffer_size = 1000): 91 | # self.buffer_size = buffer_size 92 | self.buffer_size = buffer_size 93 | self.buffer = np.empty((buffer_size),dtype=object) 94 | 95 | # self.buffer = [] 96 | self.index = 0 97 | 98 | def insert(self, sars): 99 | # self.buffer.append(sars) 100 | # print("inserting index ", self.index, "@",self.index%self.buffer_size) 101 | if(self.index == 10): 102 | print("first 10 ",self.buffer[0:10]) 103 | # import ipdb; ipdb.set_trace() 104 | 105 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 106 | # print("first 10 ",self.buffer[0:10]) 107 | # print("last 10 ",self.buffer[-10:]) 108 | # print("") 109 | # import ipdb; ipdb.set_trace() 110 | self.buffer[self.index%self.buffer_size] = sars 111 | self.index+=1 112 | # self.buffer.append(sars) 113 | # if(len(self.buffer)>self.buffer_size): 114 | # self.buffer = self.buffer[1:] 115 | # # print("Clipping Buffer at size", len(self.buffer)) 116 | 117 | def sample(self, num_samples,current_episode_steps): 118 | # assert num_samples < min(len(self.buffer),self.index) 119 | # if num_samples>self.index: 120 | # print("sampling n ",min(num_samples,self.index)) 121 | a = self.buffer[0:min(self.index,self.buffer_size)] 122 | if len(self.buffer) > 0: 123 | return np.random.choice(a, min(num_samples,self.index)) 124 | else: 125 | return [] 126 | -------------------------------------------------------------------------------- /Pytorch/ActorCritic/critic01TDQN_RL_MODEL.trl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/Pytorch/ActorCritic/critic01TDQN_RL_MODEL.trl -------------------------------------------------------------------------------- /Pytorch/DQN/DQN_Cartpol_old_1.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## https://www.youtube.com/watch?v=WHRQUZrxxGw 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import 
torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | @dataclass 14 | class sars: 15 | state: Any 16 | action: int 17 | reward: float 18 | next_state: Any 19 | done: bool 20 | qval: float 21 | 22 | class DQNAgent: 23 | def __init__(self,model,targetModel): 24 | self.model = model 25 | self.targetModel = targetModel 26 | 27 | def get_actions(self, observations): 28 | q_vals = self.model(torch.Tensor(observations).to(self.model.device)) 29 | 30 | 31 | return q_vals.max(-1)[1] 32 | 33 | def update_target_model(self): 34 | self.targetModel.load_state_dict(self.model.state_dict()) 35 | 36 | class Model(nn.Module): 37 | def __init__(self, obs_shape, num_actions,lr): 38 | super(Model,self).__init__() 39 | assert len(obs_shape) ==1, "This network only works on flat observations" 40 | self.obs_shape = obs_shape 41 | self.num_action = num_actions 42 | 43 | self.net = torch.nn.Sequential( 44 | torch.nn.Linear(obs_shape[0],32), 45 | torch.nn.ReLU(), 46 | torch.nn.Linear(32,num_actions) 47 | ) 48 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 49 | if torch.cuda.is_available(): 50 | print("Using CUDA") 51 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 52 | self.to(self.device) 53 | 54 | 55 | def forward(self, x): 56 | return self.net(x) 57 | 58 | 59 | 60 | class ReplayBuffer: 61 | def __init__(self, buffer_size = 1000): 62 | self.buffer_size = buffer_size 63 | # self.buffer = [None]*buffer_size 64 | self.buffer = [] 65 | self.index = 0 66 | 67 | def insert(self, sars): 68 | # self.buffer.append(sars) 69 | # print("inserting index ", self.index, "@",self.index%self.buffer_size) 70 | if(self.index == 10): 71 | print("first 10 ",self.buffer[0:10]) 72 | # import ipdb; ipdb.set_trace() 73 | 74 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 75 | # print("first 10 ",self.buffer[0:10]) 76 | # print("last 10 ",self.buffer[-10:]) 77 | # print("") 78 | # import ipdb; ipdb.set_trace() 79 | # self.buffer[self.index%self.buffer_size] = sars 80 | self.index+=1 81 | self.buffer.append(sars) 82 | if(len(self.buffer)>self.buffer_size): 83 | self.buffer = self.buffer[1:] 84 | # print("Clipping Buffer at size", len(self.buffer)) 85 | 86 | def sample(self, num_samples,current_episode_steps): 87 | # assert num_samples < min(len(self.buffer),self.index) 88 | # if num_samples>self.index: 89 | # print("sampling n ",min(num_samples,self.index)) 90 | # a = self.buffer[0:((self.index-current_episode_steps)%self.buffer_size)] 91 | if len(self.buffer) > 0: 92 | return np.random.choice(self.buffer, min(num_samples,self.index)) 93 | else: 94 | return [] 95 | 96 | 97 | 98 | def get_one_hot(action,n_dim): 99 | retval = np.zeros(n_dim) 100 | retval[action] = 1.0 101 | return retval 102 | 103 | 104 | def train_step(model, state_transitions, tgt, num_actions): 105 | if len(state_transitions) <=0: 106 | print("empty state transitions") 107 | return 108 | cur_states = torch.stack( ([torch.Tensor(s.state) for s in state_transitions]) ).to(model.device) 109 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(model.device) 110 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(model.device) 111 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(model.device) 112 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in 
state_transitions]) ).to(model.device) 113 | actions = [s.action for s in state_transitions] 114 | 115 | with torch.no_grad(): 116 | actual_Q_values = Qs 117 | # pred_qvals_next = tgt(next_states) 118 | # pred_qvals_next = pred_qvals_next.max(axis=1)[0] 119 | 120 | model.opt.zero_grad() 121 | pred_qvals = model(cur_states) 122 | one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 123 | 124 | 125 | # loss = (rewards + mask[:,0]*pred_qvals_next - torch.sum(pred_qvals*one_hot_actions,-1)).mean() 126 | # print("loss input", torch.sum(pred_qvals*one_hot_actions,-1)) 127 | # print("loss target", (rewards + 0.98*mask[:,0]*pred_qvals_next)) 128 | 129 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), (rewards + 0.98*mask[:,0]*pred_qvals_next)[0] ) 130 | loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values[0] ) 131 | 132 | loss.backward() 133 | model.opt.step() 134 | print("loss ", loss) 135 | return loss 136 | 137 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 138 | for i in range(episode_len): 139 | # if(step_counter > buffer_size): 140 | # import ipdb; ipdb.set_trace() 141 | index = episode_len-i 142 | next_index = index+1 143 | if i==0: 144 | replay_buffer[index].qval = replay_buffer[index].reward 145 | # print("i",i,"q ",replay_buffer[index].qval) 146 | else: 147 | replay_buffer[index].qval = replay_buffer[index].reward + 0.98 * replay_buffer[next_index].qval 148 | # print("i",i,"q ",replay_buffer[index].qval) 149 | return replay_buffer 150 | 151 | def train_step2(model,state_transitions,targetModel,num_actions): 152 | # print("state_transitions" , state_transitions) 153 | cur_states = torch.stack(([torch.Tensor(s.state) for s in state_transitions])) 154 | next_states = torch.stack(([torch.Tensor(s.next_state) for s in state_transitions])) 155 | 156 | rewards = torch.stack(([torch.Tensor([s.reward]) for s in state_transitions])) 157 | # act = torch.Tensor(np.zeros(num_actions)) 158 | actions = torch.stack([torch.Tensor(get_one_hot(action,num_actions)) for s in state_transitions]) 159 | 160 | mask = torch.stack([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions]) 161 | 162 | with torch.no_grad(): 163 | # qevals_next = targetModel(next_states).max(-1) 164 | qevals_next = targetModel(next_states) 165 | # print("qevals_next",qevals_next) 166 | qevals_next = qevals_next.max(axis=1)[0] 167 | # print("qevals_next . max",qevals_next) 168 | 169 | model.opt.zero_grad() 170 | qevals = model(cur_states) 171 | 172 | # print("rewards ",rewards.shape, rewards) 173 | # print("qevals ",qevals.shape,qevals) 174 | # # print("maks ",mask.shape,mask) 175 | # print("actions ",actions.shape,actions) 176 | print("qevals_next",qevals_next) 177 | # 178 | print("qeval*actions ", torch.sum(qevals*actions,axis=1) ) 179 | # print("qeval*actions . 
mean() ", torch.sum(qevals*actions,axis=1).mean() ) 180 | 181 | 182 | loss = ( (rewards + 0.98 * qevals_next*mask[:,0] ) - (torch.sum(qevals*actions,axis=1)) ).mean() 183 | # loss = ( (rewards + 0.98 * qevals_next*mask) - qevals*actions ).mean() 184 | loss.backward() 185 | model.opt.step() 186 | 187 | print("Loss ", loss) 188 | return loss 189 | 190 | 191 | if __name__=='__main__': 192 | NUM_GAMES = 50000 193 | MAX_EPISODE_STEPS = 600 194 | TARGET_MODEL_UPDATE_INTERVAL = 50 195 | EPSILON_MIN = 0.01 196 | EPSILON_START = 0.3 197 | EPSLILON_COUNT = 2000 #Games 198 | RANDOM_GAME_EVERY = 20 199 | TRAIN_EVERY_N_STEPS = 15 200 | PRINT_EVERY = 10 201 | 202 | epsilon = EPSILON_START 203 | # env = gym.make('LunarLander-v2') 204 | env = gym.make('CartPole-v1') 205 | 206 | observation = env.reset() 207 | # obs2 = np.random.random(4) 208 | # allObs = np.array([observation,obs2]) 209 | m = Model(env.observation_space.shape,env.action_space.n,lr=0.01) 210 | rb = ReplayBuffer(3000) 211 | agent = DQNAgent(m, Model(env.observation_space.shape,env.action_space.n,lr=0.01) ) 212 | step_counter = 0 213 | avg_reward = [] 214 | # qeval = m(torch.Tensor(allObs)) 215 | # # print("allObs ", allObs) 216 | # # print("qeval ",qeval) 217 | 218 | 219 | for game in range (NUM_GAMES): 220 | # if game == 8: 221 | # print("rb ",rb.buffer) 222 | episode_sars = [] 223 | # # if game%TARGET_MODEL_UPDATE_INTERVAL == 0 : 224 | # # # print("game", game," updating target model") 225 | # agent.update_target_model() 226 | for step in range (MAX_EPISODE_STEPS): 227 | env.render() 228 | # import ipdb; ipdb.set_trace() 229 | action = 0 230 | if step_counter<1000 or random() 3000 and step_counter%TRAIN_EVERY_N_STEPS==0: 247 | # print("rb sample", rb.sample(1)) 248 | train_step(agent.model,rb.sample(1,step),agent.targetModel,env.action_space.n) 249 | # print("training size ",rb.index%rb.buffer_size, " - ",rb.index , "") 250 | observation = observation_next 251 | step_counter+=1 252 | if done: 253 | 254 | # reward = -100 255 | # print("last reward ", reward) 256 | 257 | rb.episode_sars = update_Qs(episode_sars,step_counter,step,len(episode_sars)) 258 | for j in range(len(episode_sars)): 259 | rb.insert(episode_sars[j]) 260 | 261 | observation = env.reset() 262 | break 263 | 264 | 265 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 266 | if (game%PRINT_EVERY==0): 267 | print("episide ", game,"score", np.average( avg_reward), "epsilon",epsilon ) 268 | avg_reward = [] 269 | # print("epsilon ", epsilon) 270 | -------------------------------------------------------------------------------- /Pytorch/DQN/Load_Agent.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## https://www.youtube.com/watch?v=WHRQUZrxxGw 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | 13 | @dataclass 14 | class sars: 15 | state: Any 16 | action: int 17 | reward: float 18 | next_state: Any 19 | done: bool 20 | qval: float 21 | 22 | class DQNAgent: 23 | def __init__(self,model,targetModel): 24 | self.model = model 25 | self.targetModel = targetModel 26 | 27 | def get_actions(self, observations): 28 | q_vals = self.model(torch.Tensor(observations).to(self.model.device)) 29 | return q_vals.max(-1)[1] 30 | 31 | def update_target_model(self): 32 | 
self.targetModel.load_state_dict(self.model.state_dict()) 33 | 34 | class Model(nn.Module): 35 | def __init__(self, obs_shape, num_actions,lr): 36 | super(Model,self).__init__() 37 | assert len(obs_shape) ==1, "This network only works on flat observations" 38 | self.obs_shape = obs_shape 39 | self.num_action = num_actions 40 | 41 | self.net = torch.nn.Sequential( 42 | torch.nn.Linear(obs_shape[0],128), 43 | torch.nn.ReLU(), 44 | torch.nn.Linear(128,num_actions) 45 | ) 46 | self.opt = optim.Adam(self.net.parameters(),lr=lr) 47 | if torch.cuda.is_available(): 48 | print("Using CUDA") 49 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 50 | self.to(self.device) 51 | 52 | 53 | def forward(self, x): 54 | return self.net(x) 55 | 56 | 57 | 58 | class ReplayBuffer: 59 | def __init__(self, buffer_size = 1000): 60 | self.buffer_size = buffer_size 61 | # self.buffer = [None]*buffer_size 62 | self.buffer = [] 63 | self.index = 0 64 | 65 | def insert(self, sars): 66 | # self.buffer.append(sars) 67 | # print("inserting index ", self.index, "@",self.index%self.buffer_size) 68 | if(self.index == 10): 69 | print("first 10 ",self.buffer[0:10]) 70 | # import ipdb; ipdb.set_trace() 71 | 72 | # if(self.index > self.buffer_size and self.index%self.buffer_size==0): 73 | # print("first 10 ",self.buffer[0:10]) 74 | # print("last 10 ",self.buffer[-10:]) 75 | # print("") 76 | # import ipdb; ipdb.set_trace() 77 | # self.buffer[self.index%self.buffer_size] = sars 78 | self.index+=1 79 | self.buffer.append(sars) 80 | if(len(self.buffer)>self.buffer_size): 81 | self.buffer = self.buffer[1:] 82 | # print("Clipping Buffer at size", len(self.buffer)) 83 | 84 | def sample(self, num_samples,current_episode_steps): 85 | # assert num_samples < min(len(self.buffer),self.index) 86 | # if num_samples>self.index: 87 | # print("sampling n ",min(num_samples,self.index)) 88 | # a = self.buffer[0:((self.index-current_episode_steps)%self.buffer_size)] 89 | if len(self.buffer) > 0: 90 | return np.random.choice(self.buffer, min(num_samples,self.index)) 91 | else: 92 | return [] 93 | 94 | 95 | 96 | def get_one_hot(action,n_dim): 97 | retval = np.zeros(n_dim) 98 | retval[action] = 1.0 99 | return retval 100 | 101 | 102 | def train_step(model, state_transitions, tgt, num_actions): 103 | if len(state_transitions) <=0: 104 | print("empty state transitions") 105 | return 106 | cur_states = torch.stack( ([torch.Tensor(s.state) for s in state_transitions]) ).to(model.device) 107 | rewards = torch.stack( ([torch.Tensor([s.reward]) for s in state_transitions]) ).to(model.device) 108 | Qs = torch.stack( ([torch.Tensor([s.qval]) for s in state_transitions]) ).to(model.device) 109 | mask = torch.stack(([torch.Tensor([0]) if s.done else torch.Tensor([1]) for s in state_transitions])).to(model.device) 110 | next_states = torch.stack( ([torch.Tensor(s.next_state) for s in state_transitions]) ).to(model.device) 111 | actions = [s.action for s in state_transitions] 112 | # import ipdb; ipdb.set_trace() 113 | with torch.no_grad(): 114 | # actual_Q_values = Qs 115 | pred_qvals_next = model(next_states).max(-1)[0] 116 | model.opt.zero_grad() 117 | pred_qvals = model(cur_states) 118 | 119 | one_hot_actions = F.one_hot(torch.LongTensor(actions),num_actions).to(model.device) 120 | # loss = torch.mean(torch.sqrt((torch.sum(pred_qvals*one_hot_actions,-1) - actual_Q_values.view(-1) )**2)).to(model.device) 121 | # loss = F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), actual_Q_values.view(-1) ) 122 | loss = 
F.smooth_l1_loss(torch.sum(pred_qvals*one_hot_actions,-1), rewards.view(-1)+0.99*mask[:,0]*pred_qvals_next.view(-1) ).mean() 123 | loss.backward() 124 | model.opt.step() 125 | return loss 126 | 127 | 128 | 129 | 130 | 131 | def update_Qs(replay_buffer,step_counter,episode_len,buffer_size): 132 | for i in range(episode_len): 133 | # if(step_counter > buffer_size): 134 | # import ipdb; ipdb.set_trace() 135 | index = episode_len-i 136 | next_index = index+1 137 | if i==0: 138 | replay_buffer[index].qval = replay_buffer[index].reward 139 | if(step_counter%2000==0): 140 | print("i",i,"q ",replay_buffer[index].qval) 141 | else: 142 | replay_buffer[index].qval = replay_buffer[index].reward + 0.99 * replay_buffer[next_index].qval 143 | if(step_counter%2000==0): 144 | print("i",i,"q ",replay_buffer[index].qval) 145 | return replay_buffer 146 | 147 | 148 | if __name__=='__main__': 149 | DEBUGER_ON = True 150 | NUM_GAMES = 50000 151 | MAX_EPISODE_STEPS = 1490 152 | TARGET_MODEL_UPDATE_INTERVAL = 50 153 | EPSILON_MIN = 0.05 154 | EPSILON_START = 0.5 155 | EPSLILON_COUNT = 6000 #Games 156 | RANDOM_GAME_EVERY = 20 157 | TRAIN_EVERY_N_STEPS = 25 158 | TRAINING_SAMPLE_SIZE = 256 159 | TRAINING_ITTERATIONS = 1 160 | PRINT_EVERY = 1 161 | RENDER_ENV = True 162 | LOAD_MODEL = True 163 | SAVE_MODEL = False 164 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 165 | MODEL_ID = "01" 166 | SAVE_MODEL_EVERY = 25 167 | 168 | epsilon = EPSILON_START 169 | env = gym.make('LunarLander-v2') 170 | # env = gym.make('CartPole-v1') 171 | 172 | observation = env.reset() 173 | agent = DQNAgent(Model(env.observation_space.shape,env.action_space.n,lr=0.0001), Model(env.observation_space.shape,env.action_space.n,lr=0.0001) ) 174 | if LOAD_MODEL: 175 | print("Loading Model ", ""+MODEL_ID+MODEL_FILE_NAME) 176 | agent.model = torch.load(""+MODEL_ID+MODEL_FILE_NAME) 177 | agent.model.eval() 178 | step_counter = 0 179 | avg_reward = [] 180 | last_step_count = 0 181 | # qeval = m(torch.Tensor(allObs)) 182 | # # print("allObs ", allObs) 183 | # # print("qeval ",qeval) 184 | 185 | 186 | for game in range (NUM_GAMES): 187 | # if game == 8: 188 | # print("rb ",rb.buffer) 189 | score = 0 190 | 191 | for step in range (MAX_EPISODE_STEPS): 192 | if RENDER_ENV: 193 | env.render() 194 | # import ipdb; ipdb.set_trace() 195 | action = 0 196 | action = agent.get_actions(observation).item() 197 | 198 | 199 | observation_next, reward, done, info = env.step(action) 200 | 201 | score += reward 202 | 203 | observation = observation_next 204 | step_counter+=1 205 | last_step_count = step 206 | if done: 207 | 208 | observation = env.reset() 209 | break 210 | 211 | 212 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 213 | if (game%PRINT_EVERY==0): 214 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 215 | avg_reward = [] 216 | # print("epsilon ", epsilon) 217 | -------------------------------------------------------------------------------- /Pytorch/DQN/Readme.md: -------------------------------------------------------------------------------- 1 | Minimalist implementation of DQN with Pytorch. 2 | 3 | How to use? 4 | 5 | You need DQN_Lander.py to train the agent. 6 | **python DQN_lander.py** 7 | 8 | This will save the latest version of the model on a local file. 9 | You can Load the model and view it in action with 10 | **python load_DQN_Model.py** 11 | 12 | Hyper parameters are in the __main__ section. 
13 | Both agent and loader need to the same environment name. 14 | 15 | -------------------------------------------------------------------------------- /Pytorch/DQN_CNN/Load_ATARI_AGENT.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | import torch 3 | import gym 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | import numpy as np 8 | from dataclasses import dataclass 9 | from typing import Any 10 | from random import random 11 | from PIL import Image 12 | from agent_and_model import DQNAgent,sars, Model, ReplayBuffer 13 | import plotly.express as px 14 | 15 | def get_one_hot(action,n_dim): 16 | retval = np.zeros(n_dim) 17 | retval[action] = 1.0 18 | return retval 19 | 20 | 21 | def plot_score(all_scores): 22 | fig = px.line(x=np.arange(len(all_scores)),y=all_scores) 23 | fig.write_html('Play_DQN_CNN_Trend_figure.html') 24 | 25 | if __name__=='__main__': 26 | DEBUGER_ON = True 27 | NUM_GAMES = 50000 28 | MAX_EPISODE_STEPS = 10000 29 | TARGET_MODEL_UPDATE_INTERVAL = 50 30 | EPSILON_MIN = 0.05 31 | EPSILON_START = 0.3 32 | EPSLILON_COUNT = 4000 #Games 33 | RANDOM_GAME_EVERY = 10 34 | TRAIN_EVERY_N_STEPS = 10 35 | TRAINING_SAMPLE_SIZE = 1 36 | TRAINING_ITTERATIONS = 1 37 | PRINT_EVERY = 1 38 | RENDER_ENV = True 39 | LOAD_MODEL = True 40 | SAVE_MODEL = True 41 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 42 | MODEL_ID = "01" 43 | SAVE_MODEL_EVERY = 25 44 | 45 | epsilon = EPSILON_START 46 | env = gym.make('Pong-v0') 47 | # env = gym.make('CartPole-v1') 48 | 49 | agent = DQNAgent(Model(env.observation_space.shape,env.action_space.n,lr=0.0001), Model(env.observation_space.shape,env.action_space.n,lr=0.0001) ) 50 | 51 | observation = env.reset() 52 | frame1 = [] 53 | frame2 = [] 54 | frame3 = [] 55 | frame1 = agent.process_frame(observation) 56 | frame2 = agent.process_frame(observation) 57 | frame3 = agent.process_frame(observation) 58 | # import ipdb; ipdb.set_trace() 59 | observation = np.concatenate((frame1,frame2,frame3),axis=1) 60 | observation = observation.reshape((1,3,160,140*3)) 61 | 62 | if LOAD_MODEL: 63 | print("Loading Model ", ""+MODEL_ID+MODEL_FILE_NAME) 64 | agent.model = torch.load(""+MODEL_ID+MODEL_FILE_NAME) 65 | # agent.model.load_state_dict(torch.load(""+MODEL_ID+MODEL_FILE_NAME)) 66 | agent.model.eval() 67 | step_counter = 0 68 | avg_reward = [] 69 | rolling_average = 0 70 | 71 | 72 | for game in range (NUM_GAMES): 73 | episode_steps = 0 74 | score = 0.0 75 | all_scores = [] 76 | for step in range (MAX_EPISODE_STEPS): 77 | if RENDER_ENV: 78 | env.render() 79 | # import ipdb; ipdb.set_trace() 80 | action = 0 81 | 82 | action = agent.get_actions(observation).item() 83 | 84 | frame3 = frame2 85 | frame2 = frame1 86 | frame1, reward, done, info = env.step(action) 87 | 88 | 89 | score += reward 90 | # print("frame1", frame1.shape) 91 | frame1 = agent.process_frame(frame1) 92 | observation_next = np.concatenate((frame1,frame2,frame3),axis=1) 93 | 94 | # print("obs - concatenate", observation_next.shape) 95 | # if True or step%100==99: 96 | # img = Image.fromarray(observation_next, 'RGB') 97 | # img.save('my.png') 98 | # img.show() 99 | # if done and reward <=-100: 100 | # reward = -300 101 | observation_next = observation_next.reshape((1,3,160,140*3)) 102 | 103 | # reward *= 100 104 | avg_reward.append([reward]) 105 | 106 | observation = observation_next 107 | step_counter+=1 108 | episode_steps = step 109 | if done: 110 | observation = env.reset() 111 | frame1 = [] 112 | frame2 
= [] 113 | frame3 = [] 114 | frame1 = agent.process_frame(observation) 115 | frame2 = agent.process_frame(observation) 116 | frame3 = agent.process_frame(observation) 117 | # import ipdb; ipdb.set_trace() 118 | observation = np.concatenate((frame1,frame2,frame3),axis=1) 119 | observation = observation.reshape((1,3,160,140*3)) 120 | break 121 | 122 | rolling_average = 0.05*score + (1-0.05)*rolling_average 123 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 124 | all_scores.append(score) 125 | if (game%PRINT_EVERY==0): 126 | plot_score(all_scores) 127 | print("episide ", game,"last score",reward,"rolling score ", rolling_average ,"episode_len", episode_steps , "score", score, "epsilon",epsilon ) 128 | avg_reward = [] 129 | # print("epsilon ", epsilon) 130 | -------------------------------------------------------------------------------- /Pytorch/DQN_CNN/Readme.md: -------------------------------------------------------------------------------- 1 | DQN Algorithm that takes images as input. 2 | You can train the agent using **python ATARI_DQN_CNN.py** 3 | You can see the agent in action **using python Load_ATARI_AGENT.py** 4 | -------------------------------------------------------------------------------- /Pytorch/DQN_CNN/agent_and_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | import torch 4 | import gym 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | import numpy as np 9 | from dataclasses import dataclass 10 | from typing import Any 11 | from random import random 12 | from PIL import Image 13 | 14 | 15 | @dataclass 16 | class sars: 17 | state: Any 18 | action: int 19 | reward: float 20 | next_state: Any 21 | done: bool 22 | qval: float 23 | 24 | class DQNAgent: 25 | def __init__(self,model,targetModel): 26 | self.model = model 27 | self.targetModel = targetModel 28 | 29 | def get_actions(self, observations): 30 | q_vals = self.model(torch.Tensor(observations).to(self.model.device)) 31 | return q_vals.max(-1)[1] 32 | 33 | def update_target_model(self): 34 | self.targetModel.load_state_dict(self.model.state_dict()) 35 | 36 | def process_frame(self,frame): 37 | img = Image.fromarray(frame, 'RGB') 38 | width, height = img.size 39 | frame = img.crop((5,35,width-15,height-15)) 40 | return frame 41 | 42 | class Model(nn.Module): 43 | def __init__(self, obs_shape, num_actions,lr): 44 | super(Model,self).__init__() 45 | # assert len(obs_shape) ==1, "This network only works on flat observations" 46 | self.obs_shape = obs_shape 47 | self.num_action = num_actions 48 | # import ipdb; ipdb.set_trace() 49 | 50 | self.conv_net = torch.nn.Sequential( 51 | nn.BatchNorm2d(3), 52 | nn.Conv2d(3, 32, 8, 4), 53 | # nn.MaxPool2d(4), 54 | # nn.Dropout(0.2), 55 | nn.ReLU(), 56 | nn.Conv2d(32, 64, 4, 2), 57 | # nn.Dropout(0.2), 58 | nn.ReLU(), 59 | nn.Conv2d(64, 64, 3,1), 60 | # nn.MaxPool2d(4), 61 | # nn.Dropout(0.2), 62 | nn.ReLU() 63 | 64 | ) 65 | self.linear_layer = torch.nn.Sequential( 66 | torch.nn.Linear(50176,256), 67 | # nn.Dropout(0.6), 68 | # torch.nn.ReLU(), 69 | # torch.nn.Linear(128,256), 70 | # nn.Dropout(0.2), 71 | torch.nn.ReLU(), 72 | torch.nn.Linear(256,num_actions) 73 | ) 74 | self.opt = optim.Adam(self.conv_net.parameters(),lr=lr) 75 | self.opt2 = optim.Adam(self.linear_layer.parameters(),lr=lr) 76 | 77 | if torch.cuda.is_available(): 78 | print("Using CUDA") 79 | self.device = 
torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 80 | self.to(self.device) 81 | 82 | 83 | def forward(self, x): 84 | x = self.conv_net(x) 85 | x = x.view(x.size(0),-1) 86 | x = self.linear_layer(x) 87 | return x 88 | 89 | 90 | 91 | class ReplayBuffer: 92 | def __init__(self, buffer_size = 1000): 93 | # self.buffer_size = buffer_size 94 | self.buffer_size = buffer_size 95 | self.buffer = np.empty((buffer_size),dtype=object) 96 | 97 | # self.buffer = [] 98 | self.index = 0 99 | 100 | def insert(self, sars): 101 | 102 | # if self.index>1000: 103 | # # import ipdb; ipdb.set_trace() 104 | # Qs = np.array([s.qval for s in self.buffer[0:(min(self.index,self.buffer_size))]]) 105 | # Qs_threshold = Qs.mean() + Qs.var()/4 106 | # select_prob = 1 - ( ( sars.qval - Qs_threshold ) / Qs_threshold) 107 | # select_prob = max(0.15,select_prob) 108 | # if random()self.index: 119 | # print("sampling n ",min(num_samples,self.index)) 120 | a = self.buffer[0:min(self.index,self.buffer_size)] 121 | if len(self.buffer) > 0: 122 | return np.random.choice(a, min(num_samples,self.index)) 123 | else: 124 | return [] 125 | 126 | def sample_top(self, num_samples,current_episode_steps): 127 | import ipdb; ipdb.set_trace() 128 | Qs = np.array([s.qvals for s in self.buffer]) 129 | Qs_threshold = Qs.mean() + Qs.var()/3 130 | 131 | # if num_samples>self.index: 132 | # print("sampling n ",min(num_samples,self.index)) 133 | a = self.buffer[0:min(self.index,self.buffer_size)] 134 | if len(self.buffer) > 0: 135 | return np.random.choice(a, min(num_samples,self.index)) 136 | else: 137 | return [] 138 | -------------------------------------------------------------------------------- /Pytorch/PPO/PPO_LunarLander.py: -------------------------------------------------------------------------------- 1 | # Working PPO implmentation 2 | 3 | # Modifed from 4 | # https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/ 5 | 6 | import argparse 7 | import pickle 8 | from collections import namedtuple 9 | from itertools import count 10 | 11 | import os, time 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | import gym 16 | import torch 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | import torch.optim as optim 20 | from torch.distributions import Normal, Categorical 21 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 22 | # from tensorboardX import SummaryWriter 23 | 24 | # Parameters 25 | gamma = 0.99 26 | render = True 27 | seed = 1 28 | log_interval = 10 29 | 30 | # env = gym.make('CartPole-v0').unwrapped 31 | env = gym.make('LunarLander-v2').unwrapped 32 | num_state = env.observation_space.shape[0] 33 | num_action = env.action_space.n 34 | torch.manual_seed(seed) 35 | env.seed(seed) 36 | Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state']) 37 | 38 | class Actor(nn.Module): 39 | def __init__(self): 40 | super(Actor, self).__init__() 41 | self.fc1 = nn.Linear(num_state, 100) 42 | self.action_head = nn.Linear(100, num_action) 43 | 44 | def forward(self, x): 45 | x = F.relu(self.fc1(x)) 46 | action_prob = F.softmax(self.action_head(x), dim=1) 47 | return action_prob 48 | 49 | 50 | class Critic(nn.Module): 51 | def __init__(self): 52 | super(Critic, self).__init__() 53 | self.fc1 = nn.Linear(num_state, 100) 54 | self.state_value = nn.Linear(100, 1) 55 | 56 | def forward(self, x): 57 | x = F.relu(self.fc1(x)) 58 | value = self.state_value(x) 59 | return value 60 | 61 | 62 | class PPO(): 63 | clip_param = 
0.2 64 | max_grad_norm = 0.5 65 | ppo_update_time = 10 66 | buffer_capacity = 1000 67 | batch_size = 32 68 | 69 | def __init__(self): 70 | super(PPO, self).__init__() 71 | self.actor_net = Actor() 72 | self.critic_net = Critic() 73 | self.buffer = [] 74 | self.counter = 0 75 | self.training_step = 0 76 | # self.writer = SummaryWriter('../exp') 77 | 78 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 79 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3) 80 | if not os.path.exists('../param'): 81 | os.makedirs('../param/net_param') 82 | os.makedirs('../param/img') 83 | 84 | def select_action(self, state): 85 | state = torch.from_numpy(state).float().unsqueeze(0) 86 | with torch.no_grad(): 87 | action_prob = self.actor_net(state) 88 | c = Categorical(action_prob) 89 | action = c.sample() 90 | return action.item(), action_prob[:,action.item()].item() 91 | 92 | def get_value(self, state): 93 | state = torch.from_numpy(state) 94 | with torch.no_grad(): 95 | value = self.critic_net(state) 96 | return value.item() 97 | 98 | def save_param(self): 99 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net' + str(time.time())[:10], +'.pkl') 100 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net' + str(time.time())[:10], +'.pkl') 101 | 102 | def store_transition(self, transition): 103 | self.buffer.append(transition) 104 | self.counter += 1 105 | 106 | 107 | def update(self, i_ep): 108 | state = torch.tensor([t.state for t in self.buffer], dtype=torch.float) 109 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1) 110 | reward = [t.reward for t in self.buffer] 111 | # update: don't need next_state 112 | #reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 113 | #next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 114 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 115 | 116 | R = 0 117 | Gt = [] 118 | for r in reward[::-1]: 119 | R = r + gamma * R 120 | Gt.insert(0, R) 121 | Gt = torch.tensor(Gt, dtype=torch.float) 122 | #print("The agent is updateing....") 123 | for i in range(self.ppo_update_time): 124 | for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False): 125 | if self.training_step % 1000 ==0: 126 | print('I_ep {} ,train {} times'.format(i_ep,self.training_step)) 127 | #with torch.no_grad(): 128 | Gt_index = Gt[index].view(-1, 1) 129 | V = self.critic_net(state[index]) 130 | delta = Gt_index - V 131 | advantage = delta.detach() 132 | # epoch iteration, PPO core!!! 
133 | action_prob = self.actor_net(state[index]).gather(1, action[index]) # new policy 134 | 135 | ratio = (action_prob/old_action_log_prob[index]) 136 | surr1 = ratio * advantage 137 | surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage 138 | 139 | # update actor network 140 | action_loss = -torch.min(surr1, surr2).mean() # MAX->MIN desent 141 | # self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step) 142 | self.actor_optimizer.zero_grad() 143 | action_loss.backward() 144 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 145 | self.actor_optimizer.step() 146 | 147 | #update critic network 148 | value_loss = F.mse_loss(Gt_index, V) 149 | # self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step) 150 | self.critic_net_optimizer.zero_grad() 151 | value_loss.backward() 152 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 153 | self.critic_net_optimizer.step() 154 | self.training_step += 1 155 | 156 | del self.buffer[:] # clear experience 157 | 158 | 159 | def main(): 160 | agent = PPO() 161 | for i_epoch in range(1000): 162 | state = env.reset() 163 | if render: env.render() 164 | 165 | for t in count(): 166 | action, action_prob = agent.select_action(state) 167 | next_state, reward, done, _ = env.step(action) 168 | if t>800: 169 | done = True 170 | trans = Transition(state, action, action_prob, reward, next_state) 171 | if render: env.render() 172 | agent.store_transition(trans) 173 | state = next_state 174 | 175 | if done : 176 | if len(agent.buffer) >= agent.batch_size:agent.update(i_epoch) 177 | # agent.writer.add_scalar('liveTime/livestep', t, global_step=i_epoch) 178 | break 179 | 180 | 181 | if __name__ == '__main__': 182 | main() 183 | print("end") 184 | -------------------------------------------------------------------------------- /Pytorch/PPO/Readme.md: -------------------------------------------------------------------------------- 1 | Simple implementation PPO, that fixes a few errors. 
Now runs on pytorch 1.5.0 2 | 3 | 4 | Credit: Code modified from sweetice 's original version 5 | 6 | -------------------------------------------------------------------------------- /Pytorch/PolicyGradient/Load_model.py: -------------------------------------------------------------------------------- 1 | ## DQN Tutorial 2 | ## Implementation from https://github.com/FitMachineLearning 3 | 4 | import torch 5 | import gym 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | from torch.distributions import Categorical 10 | import numpy as np 11 | from dataclasses import dataclass 12 | from typing import Any 13 | from random import random 14 | 15 | from agent_and_model import Policy 16 | 17 | 18 | def select_action(state, policy): 19 | state = torch.from_numpy(state).float().unsqueeze(0) 20 | probs = policy(state.to(policy.device)) 21 | m = Categorical(probs) 22 | action = m.sample() 23 | policy.saved_log_probs.append(m.log_prob(action)) 24 | return action.item() 25 | 26 | 27 | 28 | 29 | if __name__=='__main__': 30 | DEBUGER_ON = True 31 | NUM_GAMES = 100 32 | MAX_EPISODE_STEPS = 10000 33 | TARGET_MODEL_UPDATE_INTERVAL = 50 34 | EPSILON_MIN = 0.05 35 | EPSILON_START = 0.5 36 | EPSLILON_COUNT = 6000 #Games 37 | INITIAL_RANDOM_STEPS = 5000 38 | RANDOM_GAME_EVERY = 20 39 | TRAIN_CRITIC_EVERY_N_STEP = 300 40 | CRITIC_TRAINING_SAMPLE_SIZE = 256 41 | TRAIN_ACTOR_EVERY_N_GAME = 1 42 | ACTOR_TRAINING_SAMPLE_SIZE = 8 43 | NUM_ACTOR_TRAINING_SAMPLES = 40 44 | TRAINING_ITTERATIONS = 1 45 | NUM_ACTOR_TRAINING_SAMPLES = 128 46 | PRINT_EVERY = 1 47 | RENDER_ENV = True 48 | LOAD_MODEL = True 49 | SAVE_MODEL = False 50 | MODEL_FILE_NAME = "TDQN_RL_MODEL.trl" 51 | MODEL_ID = "01" 52 | SAVE_MODEL_EVERY = 25 53 | 54 | epsilon = EPSILON_START 55 | env = gym.make('LunarLander-v2') 56 | # env = gym.make('BipedalWalker-v3') 57 | 58 | observation = env.reset() 59 | print("env action space ", env.action_space.shape) 60 | policy=Policy() 61 | 62 | # import ipdb;ipdb.set_trace() 63 | 64 | if LOAD_MODEL: 65 | policy = torch.load("pg_policy.trl") 66 | policy.eval() 67 | 68 | step_counter = 0 69 | last_step_count = 0 70 | 71 | 72 | action = [] 73 | for game in range (NUM_GAMES): 74 | episode_sars = [] 75 | score = 0 76 | for step in range (MAX_EPISODE_STEPS): 77 | if RENDER_ENV: 78 | env.render() 79 | 80 | if random()<-0.1: 81 | action = env.action_space.sample() 82 | else: 83 | # import ipdb; ipdb.set_trace() 84 | action = select_action(observation,policy) 85 | # print("action ", action) 86 | observation_next, reward, done, info = env.step(action) 87 | score += reward 88 | 89 | observation = observation_next 90 | step_counter+=1 91 | last_step_count = step 92 | if done: 93 | 94 | break 95 | 96 | observation = env.reset() 97 | epsilon = max(EPSILON_MIN, epsilon-((EPSILON_START-EPSILON_MIN)/EPSLILON_COUNT) ) 98 | if (game%PRINT_EVERY==0): 99 | print("episide ", game,"last score",reward, "game score ", score ,"episode_len",last_step_count, "epsilon",epsilon ) 100 | avg_reward = [] 101 | # print("epsilon ", epsilon) 102 | -------------------------------------------------------------------------------- /Pytorch/PolicyGradient/Readme.md: -------------------------------------------------------------------------------- 1 | # Implementation from FitMachineLeaning 2 | # Modified from the reinforce aglorimth 3 | @ https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py 4 | -------------------------------------------------------------------------------- 
/Pytorch/PolicyGradient/agent_and_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | from itertools import count 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torch.distributions import Categorical 11 | 12 | class Policy(nn.Module): 13 | def __init__(self): 14 | super(Policy, self).__init__() 15 | self.affine1 = nn.Linear(8, 64) 16 | self.dropout1 = nn.Dropout(p=0.1) 17 | self.affine2 = nn.Linear(64, 64) 18 | self.dropout2 = nn.Dropout(p=0.2) 19 | self.affine3 = nn.Linear(64, 4) 20 | 21 | self.saved_log_probs = [] 22 | self.rewards = [] 23 | self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cuda:1') 24 | self.to(self.device) 25 | 26 | 27 | def forward(self, x): 28 | x = self.affine1(x) 29 | # x = self.dropout1(x) 30 | x = self.affine2(x) 31 | x = self.dropout2(x) 32 | x = F.relu(x) 33 | action_scores = self.affine3(x) 34 | return F.softmax(action_scores, dim=1) 35 | -------------------------------------------------------------------------------- /Pytorch/PolicyGradient/policy_gradien_2.py: -------------------------------------------------------------------------------- 1 | # Implementation from FitMachineLeaning 2 | # Modified from the reinforce aglorimth 3 | # @ https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py 4 | import argparse 5 | import gym 6 | import numpy as np 7 | from itertools import count 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Categorical 14 | from agent_and_model import Policy 15 | 16 | 17 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example') 18 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G', 19 | help='discount factor (default: 0.99)') 20 | parser.add_argument('--seed', type=int, default=543, metavar='N', 21 | help='random seed (default: 543)') 22 | parser.add_argument('--render', action='store_true', 23 | help='render the environment') 24 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 25 | help='interval between training status logs (default: 10)') 26 | args = parser.parse_args() 27 | 28 | 29 | # env = gym.make('CartPole-v1') 30 | env = gym.make('LunarLander-v2') 31 | env.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | 34 | 35 | policy = Policy() 36 | optimizer = optim.Adam(policy.parameters(), lr=1e-3) 37 | eps = np.finfo(np.float32).eps.item() 38 | 39 | 40 | def select_action(state): 41 | state = torch.from_numpy(state).float().unsqueeze(0) 42 | probs = policy(state.to(policy.device)) 43 | m = Categorical(probs) 44 | action = m.sample() 45 | policy.saved_log_probs.append(m.log_prob(action)) 46 | return action.item() 47 | 48 | 49 | def finish_episode(): 50 | R = 0 51 | policy_loss = [] 52 | returns = [] 53 | for r in policy.rewards[::-1]: 54 | R = r + args.gamma * R 55 | returns.insert(0, R) 56 | returns = torch.tensor(returns).to(policy.device) 57 | returns = (returns - returns.mean()) / (returns.std() + eps) 58 | for log_prob, R in zip(policy.saved_log_probs, returns): 59 | policy_loss.append(-log_prob * R) 60 | optimizer.zero_grad() 61 | policy_loss = torch.cat(policy_loss).sum() 62 | policy_loss.backward() 63 | optimizer.step() 64 | del policy.rewards[:] 65 | del policy.saved_log_probs[:] 66 | 67 | 68 | def main(): 69 | running_reward = 10 70 | 
for i_episode in count(1): 71 | state, ep_reward = env.reset(), 0 72 | for t in range(1, 20000): # Don't infinite loop while learning 73 | action = select_action(state) 74 | state, reward, done, _ = env.step(action) 75 | # if True or args.render: 76 | # env.render() 77 | policy.rewards.append(reward) 78 | ep_reward += reward 79 | if done: 80 | break 81 | 82 | running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward 83 | finish_episode() 84 | # if i_episode % args.log_interval == 0: 85 | if i_episode % 1 == 0: 86 | print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( 87 | i_episode, ep_reward, running_reward)) 88 | 89 | if i_episode % 10 ==0: 90 | torch.save(policy,"pg_policy.trl") 91 | if running_reward > env.spec.reward_threshold: 92 | print("Solved! Running reward is now {} and " 93 | "the last episode runs to {} time steps!".format(running_reward, t)) 94 | break 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /QLearning/LunarLander_QL.py: -------------------------------------------------------------------------------- 1 | ''' 2 | LunarLander-v2 solution by Michel Aka 3 | https://github.com/FitMachineLearning/FitML/ 4 | https://www.youtube.com/channel/UCi7_WxajoowBl4_9P0DhzzA/featured 5 | Using Modified Q Learning, Bellman, Reinforcement Learning, RL memory 6 | 7 | ''' 8 | import numpy as np 9 | import keras 10 | import gym 11 | import os 12 | import h5py 13 | 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout 16 | from keras.layers import Embedding 17 | from keras import optimizers 18 | 19 | 20 | num_env_variables = 8 21 | num_env_actions = 4 22 | num_initial_observation = 0 23 | learning_rate = 0.001 24 | weigths_filename = "LL-QL-v2-weights.h5" 25 | 26 | b_discount = 0.98 27 | max_memory_len = 60000 28 | starting_explore_prob = 0.05 29 | training_epochs = 2 30 | load_previous_weights = True 31 | observe_and_train = True 32 | save_weights = True 33 | num_games_to_play = 50 34 | 35 | 36 | #One hot encoding array 37 | possible_actions = np.arange(0,num_env_actions) 38 | actions_1_hot = np.zeros((num_env_actions,num_env_actions)) 39 | actions_1_hot[np.arange(num_env_actions),possible_actions] = 1 40 | 41 | #Create testing enviroment 42 | env = gym.make('LunarLander-v2') 43 | env.reset() 44 | 45 | #initialize training matrix with random states and actions 46 | dataX = np.random.random(( 5,num_env_variables+num_env_actions )) 47 | #Only one output for the total score 48 | dataY = np.random.random((5,1)) 49 | 50 | 51 | 52 | #nitialize the Neural Network with random weights 53 | 54 | model = Sequential() 55 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 56 | model.add(Dense(512, activation='relu', input_dim=dataX.shape[1])) 57 | model.add(Dense(dataY.shape[1])) 58 | 59 | opt = optimizers.adam(lr=learning_rate) 60 | 61 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 62 | 63 | #load previous model weights if they exist 64 | if load_previous_weights: 65 | dir_path = os.path.realpath(".") 66 | fn = dir_path + "/"+weigths_filename 67 | print("filepath ", fn) 68 | if os.path.isfile(fn): 69 | print("loading weights") 70 | model.load_weights(weigths_filename) 71 | else: 72 | print("File ",weigths_filename," does not exis. Retraining... 
") 73 | 74 | #Initialize training data array 75 | total_steps = 0 76 | dataX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 77 | dataY = np.zeros(shape=(1,1)) 78 | 79 | #Initialize Memory Array data array 80 | memoryX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 81 | memoryY = np.zeros(shape=(1,1)) 82 | 83 | 84 | print("dataX shape", dataX.shape) 85 | print("dataY shape", dataY.shape) 86 | 87 | 88 | #This function predicts the reward that will result from taking an "action" at a state "qstate" 89 | def predictTotalRewards(qstate, action): 90 | qs_a = np.concatenate((qstate,actions_1_hot[action]), axis=0) 91 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 92 | predX[0] = qs_a 93 | 94 | #print("trying to predict reward at qs_a", predX[0]) 95 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 96 | remembered_total_reward = pred[0][0] 97 | return remembered_total_reward 98 | 99 | 100 | 101 | if observe_and_train: 102 | 103 | #Play the game a determine number of times 104 | for game in range(num_games_to_play): 105 | gameX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 106 | gameY = np.zeros(shape=(1,1)) 107 | #Get the initial Q state 108 | qs = env.reset() 109 | for step in range (40000): 110 | 111 | #Learn from observation and not playing 112 | if game < num_initial_observation: 113 | #take a radmon action 114 | a = env.action_space.sample() 115 | else: 116 | #Now playing and also learning from experience during play 117 | 118 | #Calculate probability to take deterministic action vs random action (epsilon) 119 | prob = np.random.rand(1) 120 | explore_prob = starting_explore_prob-(starting_explore_prob/num_games_to_play)*game 121 | 122 | #Chose between prediction and chance 123 | if prob < explore_prob: 124 | #take a random action 125 | a=env.action_space.sample() 126 | #print("taking random action",a, "at total_steps" , total_steps) 127 | #print("prob ", prob, "explore_prob", explore_prob) 128 | 129 | else: 130 | ##chose an action by estimating the function-estimator remembered consequences of all possible actions 131 | ## Bellman states that the best policy (i.e. 
action) is the one that maximizez expected rewards for future states 132 | ## to caculate rewards we compute the reward a this state t + the discounted (b_discount) reward at all possible state t+1 133 | ## all states t+1 are estimated by our function estimator (our Neural Network) 134 | 135 | 136 | utility_possible_actions = np.zeros(shape=(num_env_actions)) 137 | 138 | utility_possible_actions[0] = predictTotalRewards(qs,0) 139 | utility_possible_actions[1] = predictTotalRewards(qs,1) 140 | utility_possible_actions[2] = predictTotalRewards(qs,2) 141 | utility_possible_actions[3] = predictTotalRewards(qs,3) 142 | 143 | 144 | #chose argmax action of estimated anticipated rewards 145 | #print("utility_possible_actions ",utility_possible_actions) 146 | #print("argmax of utitity", np.argmax(utility_possible_actions)) 147 | a = np.argmax(utility_possible_actions) 148 | 149 | 150 | 151 | env.render() 152 | qs_a = np.concatenate((qs,actions_1_hot[a]), axis=0) 153 | 154 | #print("action",a," qs_a",qs_a) 155 | #Perform the optimal action and get the target state and reward 156 | s,r,done,info = env.step(a) 157 | 158 | 159 | #record information for training and memory 160 | if step ==0: 161 | gameX[0] = qs_a 162 | gameY[0] = np.array([r]) 163 | memoryX[0] = qs_a 164 | memoryY[0] = np.array([r]) 165 | 166 | gameX = np.vstack((gameX,qs_a)) 167 | gameY = np.vstack((gameY,np.array([r]))) 168 | 169 | 170 | if done : 171 | #GAME ENDED 172 | #Calculate Q values from end to start of game (From last step to first) 173 | for i in range(0,gameY.shape[0]): 174 | #print("Updating total_reward at game epoch ",(gameY.shape[0]-1) - i) 175 | if i==0: 176 | #print("reward at the last step ",gameY[(gameY.shape[0]-1)-i][0]) 177 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0] 178 | else: 179 | #print("local error before Bellman", gameY[(gameY.shape[0]-1)-i][0],"Next error ", gameY[(gameY.shape[0]-1)-i+1][0]) 180 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0]+b_discount*gameY[(gameY.shape[0]-1)-i+1][0] 181 | #print("reward at step",i,"away from the end is",gameY[(gameY.shape[0]-1)-i][0]) 182 | if i==gameY.shape[0]-1: 183 | print("Training Game #",game, " steps = ", step ,"last reward", r," finished with headscore ", gameY[(gameY.shape[0]-1)-i][0]) 184 | 185 | if memoryX.shape[0] ==1: 186 | memoryX = gameX 187 | memoryY = gameY 188 | else: 189 | #Add experience to memory 190 | memoryX = np.concatenate((memoryX,gameX),axis=0) 191 | memoryY = np.concatenate((memoryY,gameY),axis=0) 192 | 193 | #if memory is full remove first element 194 | if np.alen(memoryX) >= max_memory_len: 195 | #print("memory full. 
mem len ", np.alen(memoryX)) 196 | for l in range(np.alen(gameX)): 197 | memoryX = np.delete(memoryX, 0, axis=0) 198 | memoryY = np.delete(memoryY, 0, axis=0) 199 | 200 | #Update the states 201 | qs=s 202 | 203 | #Retrain every X game after num_initial_observation 204 | if done and game >= num_initial_observation: 205 | if game%10 == 0: 206 | print("Training game# ", game,"momory size", memoryX.shape[0]) 207 | model.fit(memoryX,memoryY, batch_size=32,epochs=training_epochs,verbose=2) 208 | 209 | if done: 210 | if r >= 0 and r <99: 211 | print("Game ",game," ended with positive reward ") 212 | if r > 50: 213 | print("Game ", game," WON *** " ) 214 | #Game ended - Break 215 | break 216 | 217 | 218 | 219 | 220 | 221 | if save_weights: 222 | #Save model 223 | print("Saving weights") 224 | model.save_weights(weigths_filename) 225 | -------------------------------------------------------------------------------- /QLearning/LunarLander_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | LunarLander-v2 solution by Michel Aka 3 | https://github.com/FitMachineLearning/FitML/ 4 | https://www.youtube.com/channel/UCi7_WxajoowBl4_9P0DhzzA/featured 5 | Using Modified Q Learning, Bellman, Reinforcement Learning, RL memory 6 | 7 | ''' 8 | import numpy as np 9 | import keras 10 | import gym 11 | import os 12 | import h5py 13 | 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout 16 | from keras.layers import Embedding 17 | from keras import optimizers 18 | 19 | 20 | num_env_variables = 8 21 | num_env_actions = 4 22 | num_initial_observation = 15 23 | learning_rate = 0.003 24 | weigths_filename = "LL-QL-v2-weights.h5" 25 | 26 | b_discount = 0.99 27 | max_memory_len = 60000 28 | starting_explore_prob = 0.05 29 | training_epochs = 3 30 | load_previous_weights = True 31 | observe_and_train = True 32 | save_weights = True 33 | num_games_to_play = 1000 34 | 35 | 36 | #One hot encoding array 37 | possible_actions = np.arange(0,num_env_actions) 38 | actions_1_hot = np.zeros((num_env_actions,num_env_actions)) 39 | actions_1_hot[np.arange(num_env_actions),possible_actions] = 1 40 | 41 | #Create testing enviroment 42 | env = gym.make('LunarLander-v2') 43 | env.reset() 44 | 45 | #initialize training matrix with random states and actions 46 | dataX = np.random.random(( 5,num_env_variables+num_env_actions )) 47 | #Only one output for the total score 48 | dataY = np.random.random((5,1)) 49 | 50 | 51 | 52 | #nitialize the Neural Network with random weights 53 | 54 | model = Sequential() 55 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 56 | model.add(Dense(512, activation='relu', input_dim=dataX.shape[1])) 57 | model.add(Dense(256, activation='relu' )) 58 | model.add(Dense(256, activation='relu')) 59 | model.add(Dense(dataY.shape[1])) 60 | 61 | opt = optimizers.adam(lr=learning_rate) 62 | 63 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 64 | 65 | #load previous model weights if they exist 66 | if load_previous_weights: 67 | dir_path = os.path.realpath(".") 68 | fn = dir_path + "/"+weigths_filename 69 | print("filepath ", fn) 70 | if os.path.isfile(fn): 71 | print("loading weights") 72 | model.load_weights(weigths_filename) 73 | else: 74 | print("File ",weigths_filename," does not exis. Retraining... 
") 75 | 76 | #Initialize training data array 77 | total_steps = 0 78 | dataX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 79 | dataY = np.zeros(shape=(1,1)) 80 | 81 | #Initialize Memory Array data array 82 | memoryX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 83 | memoryY = np.zeros(shape=(1,1)) 84 | 85 | 86 | print("dataX shape", dataX.shape) 87 | print("dataY shape", dataY.shape) 88 | 89 | 90 | #This function predicts the reward that will result from taking an "action" at a state "qstate" 91 | def predictTotalRewards(qstate, action): 92 | qs_a = np.concatenate((qstate,actions_1_hot[action]), axis=0) 93 | predX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 94 | predX[0] = qs_a 95 | 96 | #print("trying to predict reward at qs_a", predX[0]) 97 | pred = model.predict(predX[0].reshape(1,predX.shape[1])) 98 | remembered_total_reward = pred[0][0] 99 | return remembered_total_reward 100 | 101 | 102 | 103 | if observe_and_train: 104 | 105 | #Play the game a determine number of times 106 | for game in range(num_games_to_play): 107 | gameX = np.zeros(shape=(1,num_env_variables+num_env_actions)) 108 | gameY = np.zeros(shape=(1,1)) 109 | #Get the initial Q state 110 | qs = env.reset() 111 | for step in range (40000): 112 | 113 | #Learn from observation and not playing 114 | if game < num_initial_observation: 115 | #take a radmon action 116 | a = env.action_space.sample() 117 | else: 118 | #Now playing and also learning from experience during play 119 | 120 | #Calculate probability to take deterministic action vs random action (epsilon) 121 | prob = np.random.rand(1) 122 | explore_prob = starting_explore_prob-(starting_explore_prob/num_games_to_play)*game 123 | 124 | #Chose between prediction and chance 125 | if prob < explore_prob: 126 | #take a random action 127 | a=env.action_space.sample() 128 | #print("taking random action",a, "at total_steps" , total_steps) 129 | #print("prob ", prob, "explore_prob", explore_prob) 130 | 131 | else: 132 | ##chose an action by estimating the function-estimator remembered consequences of all possible actions 133 | ## Bellman states that the best policy (i.e. 
action) is the one that maximizez expected rewards for future states 134 | ## to caculate rewards we compute the reward a this state t + the discounted (b_discount) reward at all possible state t+1 135 | ## all states t+1 are estimated by our function estimator (our Neural Network) 136 | 137 | 138 | utility_possible_actions = np.zeros(shape=(num_env_actions)) 139 | 140 | utility_possible_actions[0] = predictTotalRewards(qs,0) 141 | utility_possible_actions[1] = predictTotalRewards(qs,1) 142 | utility_possible_actions[2] = predictTotalRewards(qs,2) 143 | utility_possible_actions[3] = predictTotalRewards(qs,3) 144 | 145 | 146 | #chose argmax action of estimated anticipated rewards 147 | #print("utility_possible_actions ",utility_possible_actions) 148 | #print("argmax of utitity", np.argmax(utility_possible_actions)) 149 | a = np.argmax(utility_possible_actions) 150 | 151 | 152 | 153 | env.render() 154 | qs_a = np.concatenate((qs,actions_1_hot[a]), axis=0) 155 | 156 | #print("action",a," qs_a",qs_a) 157 | #Perform the optimal action and get the target state and reward 158 | s,r,done,info = env.step(a) 159 | 160 | 161 | #record information for training and memory 162 | if step ==0: 163 | gameX[0] = qs_a 164 | gameY[0] = np.array([r]) 165 | memoryX[0] = qs_a 166 | memoryY[0] = np.array([r]) 167 | 168 | gameX = np.vstack((gameX,qs_a)) 169 | gameY = np.vstack((gameY,np.array([r]))) 170 | 171 | 172 | if done : 173 | #GAME ENDED 174 | #Calculate Q values from end to start of game (From last step to first) 175 | for i in range(0,gameY.shape[0]): 176 | #print("Updating total_reward at game epoch ",(gameY.shape[0]-1) - i) 177 | if i==0: 178 | #print("reward at the last step ",gameY[(gameY.shape[0]-1)-i][0]) 179 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0] 180 | else: 181 | #print("local error before Bellman", gameY[(gameY.shape[0]-1)-i][0],"Next error ", gameY[(gameY.shape[0]-1)-i+1][0]) 182 | gameY[(gameY.shape[0]-1)-i][0] = gameY[(gameY.shape[0]-1)-i][0]+b_discount*gameY[(gameY.shape[0]-1)-i+1][0] 183 | #print("reward at step",i,"away from the end is",gameY[(gameY.shape[0]-1)-i][0]) 184 | if i==gameY.shape[0]-1 and game%5==0: 185 | print("Training Game #",game, " steps = ", step ,"last reward", r," finished with headscore ", gameY[(gameY.shape[0]-1)-i][0]) 186 | 187 | if memoryX.shape[0] ==1: 188 | memoryX = gameX 189 | memoryY = gameY 190 | else: 191 | #Add experience to memory 192 | memoryX = np.concatenate((memoryX,gameX),axis=0) 193 | memoryY = np.concatenate((memoryY,gameY),axis=0) 194 | 195 | #if memory is full remove first element 196 | if np.alen(memoryX) >= max_memory_len: 197 | #print("memory full. 
mem len ", np.alen(memoryX)) 198 | for l in range(np.alen(gameX)): 199 | memoryX = np.delete(memoryX, 0, axis=0) 200 | memoryY = np.delete(memoryY, 0, axis=0) 201 | 202 | #Update the states 203 | qs=s 204 | 205 | #Retrain every X game after num_initial_observation 206 | if done and game >= num_initial_observation: 207 | if game%10 == 0: 208 | print("Training game# ", game,"momory size", memoryX.shape[0]) 209 | model.fit(memoryX,memoryY, batch_size=256,nb_epoch=training_epochs,verbose=0) 210 | 211 | if done: 212 | if r >= 0 and r <99: 213 | print("Game ",game," ended with positive reward ") 214 | if r > 50: 215 | print("Game ", game," WON *** " ) 216 | #Game ended - Break 217 | break 218 | 219 | 220 | 221 | 222 | 223 | if save_weights: 224 | #Save model 225 | print("Saving weights") 226 | model.save_weights(weigths_filename) 227 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FitML 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 14 | ### What is Fit ML 15 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 16 | 17 | ### Who is Michel Aka 18 | *Michel is an AI researcher and a graduate from University of Montreal who currently works in the Healthcare industry.* 19 | 20 | ### How to use for Reinforcement Learning Algorithm 21 | - (Optional) Clone the repo 22 | - Select the algorithm that you need (folders are named by RL algorithm): Policy Gradient / Parameter Noising / Actor Critic / Selective Memory 23 | - Get an instance of the algorithm with the environment you need. If the one you are looking for isn't there, get any environment.py file from the algorithm folder of choice and follow the steps below. 24 | - Install the dependencies 25 | - - Usually "pip install ". Example "pip install pygal" 26 | - Replace the name of the environment in line 81 of the code. 27 | ```Python 28 | env = gym.make('BipedalWalker-v2') 29 | # replace with 30 | env = gym.make('') 31 | ``` 32 | or set the ```ENVIRONMENT_NAME =``` to your environment name. Example ```ENVIRONMENT_NAME = "BipedalWalker-v2"```. 33 | 34 | - Set the environment's observation and action space variables. If you don't know them, run the script once and they will be printed in the first lines of your output. 35 | ```Python 36 | num_env_variables = 37 | num_env_actions = 38 | ``` 39 | - (Optional) You can check the results of your agent as it progresses with the .svg file in the same directory as your script. Any modern browser can view them. 40 | 41 | ### RL Approaches 42 | 43 | #### Optimal Policy Tree Search 44 | 45 | This is an RL technique characterized by computing the estimated value of the expected sum of rewards for n time steps ahead. This technique has the advantage of yielding a better estimate of the value of a specific policy; however, it is computationally expensive and memory inefficient. If one had a supercomputer and a very large amount of memory, this technique would do extremely well on discrete action space problems/environments. I believe AlphaGo uses a variant of this technique. 46 | 47 | See examples and find out more about Optimal Policy Tree Search here . 48 | 49 | #### Selective Memory 50 | 51 | As far as I know, I haven't seen anyone in the literature implement this technique before. 52 | 53 | The intuition behind Policy Gradient is that it optimizes the parameters of the network in the direction of higher expected sum of rewards. What if we could do the same in a computationally more effective way that also turns out to be more intuitive: enter what I am calling Selective Memory. 54 | 55 | We choose what to commit to memory based on the actual sum of rewards. 56 | 57 | Find out more here . 58 | 59 | 60 | #### Q-Learning 61 | 62 | Q-Learning is a well known Reinforcement Learning approach, popularized by Google DeepMind when they used it to master multiple early console era games. Q-Learning focuses on estimating the expected sum of rewards using the Bellman equation in order to determine which action to take. Q-Learning works especially well in discrete action spaces and on problems where the mapping *f(S)->Q* is differentiable, which is not always the case. 63 | 64 | Find out more about Q-Learning here .
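To make the Bellman idea concrete, here is a minimal sketch of the two pieces involved: the target for the expected sum of rewards, and the greedy action choice over a learned estimator. The `q_estimate` callable, `gamma` and `num_actions` names are illustrative placeholders, not the exact identifiers used in the scripts of this repository:

```python
import numpy as np

def bellman_target(reward, next_state, q_estimate, num_actions, gamma=0.98, done=False):
    # Q-Learning target: r + gamma * max_a' Q(s', a'), with no bootstrapping on terminal states.
    if done:
        return reward
    return reward + gamma * max(q_estimate(next_state, a) for a in range(num_actions))

def greedy_action(state, q_estimate, num_actions):
    # Act by picking the action with the highest estimated expected sum of rewards.
    utilities = [q_estimate(state, a) for a in range(num_actions)]
    return int(np.argmax(utilities))
```

In the scripts of this repository the estimator is typically a small neural network that takes the state concatenated with a one-hot encoded action and predicts the discounted return.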
65 | 66 | 67 | #### Actor-Critic Approaches 68 | 69 | Actor-Critic is an RL technique which combines the Policy Gradient approach with a Critic (a Q-value estimator). 70 | 71 | Find out more about Actor-Critic here . 72 | 73 | ### Recommended Progression for the Newcomer 74 | 75 | [coming soon] 76 | 77 | ### 78 | 79 | 80 | -------------------------------------------------------------------------------- /SelectiveMemory/README.md: -------------------------------------------------------------------------------- 1 | # FitML 2 | ```python 3 | model.fit(Machine_Learning, epochs=Inf) 4 | ``` 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | https://youtu.be/hKrFFeZqq3E 13 | 14 | #### How does Selective Memory work? 15 | 16 | The intuition behind Policy Gradient is that it optimizes the parameters of the network in the direction of higher expected sum of rewards. What if we could do the same in a computationally more effective way that also turns out to be more intuitive: enter what I am calling Selective Memory. 17 | 18 | 1) Our objective here is to ensure that the Policy function converges towards higher rewards. 19 | 20 | 2) We know that Neural Networks will converge towards the assigned labels of our data set and will also generalize (function approximation). 21 | 22 | 3) What if there was a way to select our training (reinforcement) data set so that it ensures that we converge towards our objective: higher expected rewards. 23 | 24 | Here we propose the approach of selectively remembering actions based on how high a reward was. In other words, the probability *P* of recording an action state into memory (or a rollout) is dependent on the actual sum of rewards yielded by this action trajectory. (Notice that we are not using the expected sum of rewards here but the actual computed value at the end of the rollout.) 25 | 26 | What does this look like in code? 27 | 28 | First we create our function approximator Neural Networks: 29 | ```python 30 | #initialize the Reward predictor model 31 | model = Sequential() 32 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 33 | model.add(Dense(1024, activation='relu', input_dim=dataX.shape[1])) 34 | model.add(Dense(256, activation='tanh')) 35 | model.add(Dense(dataY.shape[1])) 36 | opt = optimizers.adam(lr=learning_rate) 37 | model.compile(loss='mse', optimizer=opt, metrics=['accuracy']) 38 | 39 | 40 | #initialize the action predictor model 41 | action_predictor_model = Sequential() 42 | #model.add(Dense(num_env_variables+num_env_actions, activation='tanh', input_dim=dataX.shape[1])) 43 | action_predictor_model.add(Dense(1024, activation='relu', input_dim=apdataX.shape[1])) 44 | action_predictor_model.add(Dense(512, activation='relu')) 45 | action_predictor_model.add(Dense(apdataY.shape[1],activation='tanh')) 46 | ``` 47 | 48 | Then we calculate the discounted sum of rewards at the end of each rollout using the Bellman equation (a minimal sketch of this backward pass is shown below). 49 | 50 | Then we carefully select what we want to remember, i.e. store in memory. 51 | 52 | There are a number of approaches we have used to discriminate on the nature of the State-Actions or State-Action-Rewards that we will be keeping in memory to train our Actor. One discriminates on each individual action state, the other discriminates on an entire rollout batch. Regardless, the principle is the same. We determine how good an action is compared to the average remembered good actions.
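As a reference, here is a minimal sketch of the backward pass mentioned above. The names `gameR` (the per-step rewards collected during the rollout) and `b_discount` (the discount factor) mirror variables used in the scripts, but this snippet is illustrative rather than a copy of any one file:

```python
import numpy as np

def discounted_rollout_returns(gameR, b_discount=0.98):
    # Walk backwards from the last step so that each entry becomes
    # r[t] + b_discount * R[t+1], i.e. the actual discounted sum of rewards.
    returns = np.array(gameR, dtype=np.float64).flatten()
    for t in reversed(range(len(returns) - 1)):
        returns[t] = returns[t] + b_discount * returns[t + 1]
    return returns
```

The per-step selection function that decides whether a given step is committed to memory looks like this: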
53 | 54 | ```python 55 | def addToMemory(reward,averageReward): 56 | 57 | prob = 0.1 58 | if( reward > averageReward): 59 | prob = prob + 0.9 * math.tanh(reward - averageReward) 60 | else: 61 | prob = prob + 0.1 * math.tanh(reward - averageReward) 62 | 63 | if np.random.rand(1)<=prob : 64 | print("Adding reward",reward," based on prob ", prob) 65 | return True 66 | else: 67 | return False 68 | ``` 69 | 70 | ```python 71 | for i in range(0,gameR.shape[0]): 72 | if addToMemory(gameR[i][0],averageReward): 73 | tempGameSA = np.vstack((tempGameSA, gameSA[i])) 74 | tempGameA = np.vstack((tempGameA,gameA[i])) 75 | tempGameR = np.vstack((tempGameR,gameR[i])) 76 | tempGameS = np.vstack((tempGameS,gameS[i])) 77 | ``` 78 | 79 | Here gameSA, gameA, gameR and gameS represent the State-Action pairs, Actions, actual discounted sums of rewards and States respectively. 80 | 81 | When we get a new state we then act based on the optimal policy, which has been trained on memory primed with only the best reward-yielding actions. 82 | ```python 83 | #Get remembered optimal policy 84 | remembered_optimal_policy = GetRememberedOptimalPolicy(qs) 85 | a = remembered_optimal_policy 86 | ``` 87 | 88 | ### What type of results do we get? 89 | Our agent is able to crawl, stand up, walk, run and jump after 500 episodes in the famous OpenAI BipedalWalker test. After 3000 iterations, our agent is able to advance fast and be very stable on its feet. 90 | You can watch it in action here: https://youtu.be/hKrFFeZqq3E. 91 | 92 | 93 | ### What is Fit ML 94 | Fit Machine Learning (FitML) is a blog that houses a collection of Python Machine Learning articles and examples, often focusing on Reinforcement Learning. Here, you will find code related to Q-Learning, Actor-Critic, MDP, Bellman, OpenAI solutions and custom implemented approaches to solving some of the toughest and most interesting problems to date (Yes, I am "biased"). 95 | 96 | ### Who is Michel Aka 97 | *Michel is an AI researcher and a graduate from University of Montreal who currently works in the Healthcare industry.* 98 | -------------------------------------------------------------------------------- /SimpleNN/MinFinder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # # Annotations for the Sirajology Python NN Example 4 | # 5 | # This code comes from a demo NN program from the YouTube video https://youtu.be/h3l4qz76JhQ. The original demo creates a neural network that simulates the exclusive OR function with two inputs and one output; this version has been adapted to classify the sonar.csv data set (60 inputs, one binary output). 6 | # 7 | # 8 | 9 | # In[23]: 10 | 11 | import numpy as np # Note: there is a typo on this line in the video 12 | import pandas as pd 13 | import os 14 | 15 | 16 | # The following is a function definition of the sigmoid function, which is the type of non-linearity chosen for this neural net. It is not the only type of non-linearity that can be chosen, but it has nice analytical features and is easy to teach with. In practice, large-scale deep learning systems use piecewise-linear functions because they are much less expensive to evaluate. 17 | # 18 | # The implementation of this function does double duty. If the deriv=True flag is passed in, the function instead calculates the derivative of the function, which is used in the error backpropagation step.
19 | 20 | # In[24]: 21 | 22 | 23 | def nonlin(x, deriv=False): # Note: there is a typo on this line in the video 24 | if(deriv==True): 25 | return (x*(1-x)) 26 | 27 | return 1/(1+np.exp(-x)) # Note: there is a typo on this line in the video 28 | 29 | 30 | # The following code loads the input matrix from the sonar data set; the hard-coded XOR inputs from the original video are kept further below as comments. 31 | 32 | # In[25]: 33 | 34 | def read_inputs_and_outputs(): 35 | fullDataSet = pd.read_csv(dir_path + "/sonar.csv") 36 | #load input data 37 | X = fullDataSet[fullDataSet.columns[0:60]].values 38 | Y_ = fullDataSet[fullDataSet.columns[60]] 39 | Y = np.zeros((X.shape[0],1)) 40 | 41 | print("X.shape", X.shape) 42 | print("Y.shape", Y.shape) 43 | print(Y[0:2]) 44 | 45 | for i in range(X.shape[0]): 46 | if Y_[i] == 'R': 47 | Y[i,0] = 1 48 | else: 49 | Y[i,0] = 0 50 | 51 | return (X, Y) 52 | 53 | 54 | dir_path = os.path.dirname(os.path.realpath(__file__)) 55 | 56 | X, Y = read_inputs_and_outputs() 57 | #input data 58 | #X = np.array([[0,0,1], # Note: there is a typo on this line in the video 59 | # [0,1,1], 60 | # [1,0,1], 61 | # [1,1,1]]) 62 | 63 | 64 | # The hard-coded output of the original exclusive OR example is kept below as a comment. 65 | 66 | # In[26]: 67 | 68 | #output data 69 | #y = np.array([[0], 70 | # [1], 71 | # [1], 72 | # [0]]) 73 | 74 | 75 | # The seed for the random generator is set so that it will return the same random numbers each time, which is sometimes useful for debugging. 76 | 77 | # In[27]: 78 | 79 | np.random.seed(6) 80 | 81 | 82 | # Now we initialize the weights to random values. syn0 are the weights between the input layer and the first hidden layer (60x60 here), syn1 are the weights between the two hidden layers (60x60), and syn2 are the weights between the second hidden layer and the output (60x1). Note that there is no bias term feeding the output layer in this example. The weights are initially generated randomly because optimization tends not to work well when all the weights start at the same value. Note that neither of the neural networks shown in the video describes this example. 83 | 84 | #Declare constants 85 | 86 | num_nodes_input_layer = 60 87 | num_nodes_hl1 = 60 88 | num_nodes_hl2 = 60 89 | num_nodes_output_layer = 1 90 | learning_rate = 0.005 91 | 92 | 93 | # In[28]: 94 | 95 | #synapses 96 | syn0 = 2*np.random.random((num_nodes_input_layer,num_nodes_hl1)) - 1 # input layer x first hidden layer weights (60x60) 97 | syn1 = 2*np.random.random((num_nodes_hl1, num_nodes_hl2)) - 1 # first hidden layer x second hidden layer weights (60x60) 98 | syn2 = 2*np.random.random((num_nodes_hl2,num_nodes_output_layer)) -1 # second hidden layer x output weights (60x1) 99 | 100 | print("syn2 = ", syn2.shape) 101 | print(syn2[0:10]) 102 | 103 | 104 | # This is the main training loop. The output shows the evolution of the error between the model output and the desired output. The error steadily decreases. 105 | 106 | # In[29]: 107 | 108 | #training step 109 | # Python2 Note: In the following command, you may improve 110 | # performance by replacing 'range' with 'xrange'. 111 | for j in range(3000): 112 | 113 | # Calculate forward through the network. 114 | l0 = X 115 | l1 = nonlin(np.dot(l0, syn0)) 116 | l2 = nonlin(np.dot(l1, syn1)) 117 | l_out = nonlin(np.dot(l2, syn2)) 118 | 119 | 120 | 121 | # Back propagation of errors using the chain rule.
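# The next few lines spell that out: l_out_error is the gap between the labels Y and the
# network output, each *_delta is a layer's error scaled by the sigmoid derivative
# (nonlin(..., deriv=True)), and each error is pushed back through the transposed
# weight matrix of the layer above before the weights are updated.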
122 | l_out_error = Y - l_out 123 | cost = np.sum(l_out_error**2)/2 124 | 125 | if j == 0 : 126 | print("layer out output", l_out.shape) 127 | print(l_out) 128 | 129 | print("iteration ", j, " overall cost = ", cost) 130 | 131 | 132 | if j == 0 or j==1: # Only print the error on the first couple of iterations, to save time and limit the amount of output. 133 | print("layer error", l_out_error) 134 | print(l_out_error) 135 | 136 | 137 | 138 | if j == 0 or j==1: # Only print the error on the first couple of iterations, to save time and limit the amount of output. 139 | print("Error: " + str(np.mean(np.abs(l_out_error)))) 140 | 141 | l_out_delta = l_out_error*nonlin(l_out, deriv=True) 142 | if j == 0 or j==1 : 143 | print("layer 1 output", l1.shape) 144 | #print(l1) 145 | 146 | 147 | l2_error = l_out_delta.dot(syn2.T) 148 | l2_delta = l2_error * nonlin(l2,deriv=True) 149 | if j == 0 or j==1 : 150 | print("l2 ") 151 | #print(l2) 152 | 153 | 154 | 155 | l1_error = l2_delta.dot(syn1.T) 156 | 157 | l1_delta = l1_error * nonlin(l1,deriv=True) 158 | 159 | #update weights, scaled by the learning rate 160 | syn2 += learning_rate * l2.T.dot(l_out_delta) 161 | syn1 += learning_rate * l1.T.dot(l2_delta) 162 | syn0 += learning_rate * l0.T.dot(l1_delta) 163 | 164 | print("Output after training") 165 | print(l_out[80:120]) 166 | 167 | 168 | 169 | 170 | # See how the final output closely approximates the true labels. If you increase the number of iterations in the training loop (currently 3000), the final output will be even closer. 171 | 172 | # In[30]: 173 | 174 | #get_ipython().run_cell_magic(u'HTML', u'', u'#The following line is for embedding the YouTube video \n# in this Jupyter Notebook. You may remove it without peril. \n') 175 | 176 | 177 | # In[ ]: 178 | -------------------------------------------------------------------------------- /SimpleNN/data-01-test-score.csv: -------------------------------------------------------------------------------- 1 | 73,80,75,152 2 | 93,88,93,185 3 | 89,91,90,180 4 | 96,98,100,196 5 | 73,66,70,142 6 | 53,46,55,101 7 | 69,74,77,149 8 | 47,56,60,115 9 | 87,79,90,175 10 | 79,70,88,164 11 | 69,70,73,141 12 | 70,65,74,141 13 | 93,95,91,184 14 | 79,80,73,152 15 | 70,73,78,148 16 | 93,89,96,192 17 | 78,75,68,147 18 | 81,90,93,183 19 | 88,92,86,177 20 | 78,83,77,159 21 | 82,86,90,177 22 | 86,82,89,175 23 | 78,83,85,175 24 | 76,83,71,149 25 | 96,93,95,192 26 | -------------------------------------------------------------------------------- /Stable_baselines3/Readme.md: -------------------------------------------------------------------------------- 1 | Simple implementation of Stable Baselines3 algorithms.
2 | 3 | **To train:** 4 | 5 | *python ppo_main2.py* 6 | 7 | This will save the model in the local directory. 8 | 9 | **To view the agent:** 10 | 11 | *python ppo_load.py* 12 | -------------------------------------------------------------------------------- /Stable_baselines3/ppo_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pybullet, pybullet_envs 3 | import torch as th 4 | 5 | from stable_baselines3 import PPO 6 | from stable_baselines3.common.evaluation import evaluate_policy 7 | 8 | # Create environment 9 | # env = gym.make('LunarLanderContinuous-v2') 10 | 11 | env = gym.make('BipedalWalker-v3') 12 | env.render(mode="human") 13 | 14 | policy_kwargs = dict(activation_fn=th.nn.LeakyReLU, net_arch=[512, 512]) 15 | # Instantiate the agent 16 | model = PPO('MlpPolicy', env,learning_rate=0.0003,policy_kwargs=policy_kwargs, verbose=1) 17 | 18 | del model # delete trained model to demonstrate loading 19 | # Load the trained agent 20 | model = PPO.load("ppo_Ant") 21 | 22 | # # Evaluate the agent 23 | # mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10) 24 | 25 | # Enjoy trained agent 26 | obs = env.reset() 27 | for i in range(100): 28 | dones = False 29 | game_score = 0 30 | steps = 0 31 | while not dones: 32 | action, _states = model.predict(obs, deterministic=True) 33 | obs, rewards, dones, info = env.step(action) 34 | # import ipdb;ipdb.set_trace() 35 | game_score+=rewards 36 | steps+=1 37 | env.render() 38 | print("game ", i ," steps ",steps, " game score %.3f"%game_score) 39 | obs = env.reset() 40 | # break 41 | -------------------------------------------------------------------------------- /Stable_baselines3/ppo_main2.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pybullet, pybullet_envs 3 | import torch as th 4 | 5 | from stable_baselines3 import PPO 6 | from stable_baselines3.common.evaluation import evaluate_policy 7 | 8 | 9 | # Create environment 10 | # env = gym.make('LunarLanderContinuous-v2') 11 | 12 | env = gym.make('BipedalWalker-v3') 13 | # env.render(mode="human") 14 | 15 | policy_kwargs = dict(activation_fn=th.nn.LeakyReLU, net_arch=[512, 512]) 16 | # Instantiate the agent 17 | model = PPO('MlpPolicy', env,learning_rate=0.0003,policy_kwargs=policy_kwargs, verbose=1) 18 | # Train the agent 19 | for i in range(8000): 20 | print("Training iteration ",i) 21 | model.learn(total_timesteps=10000) 22 | # Save the agent 23 | model.save("ppo_Ant") 24 | mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=5) 25 | print("mean_reward ", mean_reward) 26 | if mean_reward >= 270: 27 | print("***Agent Trained with average reward ", mean_reward) 28 | break 29 | 30 | del model # delete trained model to demonstrate loading 31 | # Load the trained agent 32 | # model = PPO.load("ppo_Ant") 33 | 34 | # Evaluate the agent 35 | # mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10) 36 | 37 | # Enjoy trained agent 38 | # obs = env.reset() 39 | # for i in range(100): 40 | # action, _states = model.predict(obs, deterministic=True) 41 | # obs, rewards, dones, info = env.step(action) 42 | # env.render() 43 | -------------------------------------------------------------------------------- /Tensorforce/Readme.md: -------------------------------------------------------------------------------- 1 | Simple implementation of RL algorithms using the Tensorforce library.
2 | 3 | This will serve as a starting point for RL beginners. 4 | 5 | **To train the model:** 6 | *python tf_LunarLanderContinuous_ppo.py* 7 | 8 | **To load the saved model:** 9 | *python tf_loader.py* 10 | 11 | Make sure you do not run the trainer and the loader at the same time (especially on GPU); this will fail. 12 | -------------------------------------------------------------------------------- /Tensorforce/tf_LunarLanderContinuous_ppo.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | # Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | 11 | # environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 12 | environment = OpenAIGym('LunarLanderContinuous-v2', visualize=False, max_episode_steps=500) 13 | # environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500) 14 | 15 | 16 | agent = Agent.create( 17 | agent='ppo', environment=environment, batch_size=10, 18 | network=[ 19 | dict(type='dense', size=64), 20 | dict(type='dense', size=64) 21 | ], 22 | learning_rate=1e-3 23 | 24 | ) 25 | 26 | 27 | running_score = 0.0 28 | # Train for up to 50000 episodes 29 | for i_epoch in range(50000): 30 | game_score = 0.0 31 | # Initialize episode 32 | states = environment.reset() 33 | terminal = False 34 | 35 | while not terminal: 36 | # Episode timestep 37 | actions = agent.act(states=states) 38 | states, terminal, reward = environment.execute(actions=actions) 39 | game_score+=reward 40 | agent.observe(terminal=terminal, reward=reward) 41 | 42 | running_score = 0.95*running_score + 0.05*game_score 43 | if i_epoch%5==0: 44 | print("Game ", i_epoch, " game score %.2f"%game_score," running score %.2f"%running_score) 45 | 46 | if i_epoch%10==0 and i_epoch>20: 47 | agent.save() 48 | if running_score >= 250: 49 | agent.save() 50 | break 51 | 52 | agent.close() 53 | environment.close() 54 | -------------------------------------------------------------------------------- /Tensorforce/tf_LunarLander_ppo.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | # Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | # Network as list of layers 11 | network_spec = [ 12 | # dict(type='dense', size=32, activation='tanh'), 13 | dict(type='dense', size=128, activation='tanh') 14 | ] 15 | 16 | # environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 17 | environment = OpenAIGym('LunarLander-v2', visualize=True, max_episode_steps=500) 18 | 19 | 20 | # Instantiate a Tensorforce agent 21 | # agent = Agent.create( 22 | # agent='tensorforce', 23 | # environment=environment, # alternatively: states, actions, (max_episode_timesteps) 24 | # memory=10000, 25 | # update=dict(unit='timesteps', batch_size=64), 26 | # optimizer=dict(type='adam', learning_rate=3e-4), 27 | # policy=dict(network='auto'), 28 | # objective='policy_gradient', 29 | # reward_estimation=dict(horizon=20) 30 | # ) 31 | 32 | agent = Agent.create( 33 | agent='ppo', environment=environment, batch_size=10, 34 | learning_rate=1e-3, 35 |
36 | ) 37 | 38 | # agent = PPOAgent( 39 | # states_spec=environment.states, 40 | # actions_spec=environment.actions, 41 | # network_spec=network_spec, 42 | # batch_size=4096, 43 | # # BatchAgent 44 | # keep_last_timestep=True, 45 | # # PPOAgent 46 | # step_optimizer=dict( 47 | # type='adam', 48 | # learning_rate=1e-3 49 | # ), 50 | # optimization_steps=10, 51 | # # Model 52 | # scope='ppo', 53 | # discount=0.99, 54 | # # DistributionModel 55 | # distributions_spec=None, 56 | # entropy_regularization=0.01, 57 | # # PGModel 58 | # baseline_mode=None, 59 | # baseline=None, 60 | # baseline_optimizer=None, 61 | # gae_lambda=None, 62 | # # PGLRModel 63 | # likelihood_ratio_clipping=0.2, 64 | # # summary_spec=None, 65 | # # distributed_spec=None 66 | # ) 67 | running_score = 0.0 68 | # Train for 3000 episodes 69 | for i_epoch in range(3000): 70 | game_score = 0.0 71 | # Initialize episode 72 | states = environment.reset() 73 | terminal = False 74 | 75 | while not terminal: 76 | # Episode timestep 77 | actions = agent.act(states=states) 78 | states, terminal, reward = environment.execute(actions=actions) 79 | game_score+=reward 80 | agent.observe(terminal=terminal, reward=reward) 81 | 82 | running_score = 0.95*running_score + 0.05*game_score 83 | if i_epoch%2==0: 84 | print("Game ", i_epoch, " game score %.2f"%game_score," running score %.2f"%running_score) 85 | 86 | 87 | agent.close() 88 | environment.close() 89 | -------------------------------------------------------------------------------- /Tensorforce/tf_loader.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | # Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | 11 | # environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 12 | environment = OpenAIGym('LunarLanderContinuous-v2', visualize=True, max_episode_steps=500) 13 | # environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500) 14 | 15 | 16 | agent = Agent.create( 17 | agent='ppo', environment=environment, batch_size=10, 18 | network=[ 19 | dict(type='dense', size=64), 20 | dict(type='dense', size=64) 21 | ], 22 | learning_rate=1e-3, 23 | name='agent_loader' 24 | 25 | ) 26 | # import ipdb;ipdb.set_trace() 27 | agent = agent.load() 28 | 29 | running_score = 0.0 30 | # Run the trained agent for up to 50000 episodes (no further training here) 31 | for i_epoch in range(50000): 32 | game_score = 0.0 33 | # Initialize episode 34 | states = environment.reset() 35 | terminal = False 36 | 37 | while not terminal: 38 | # Episode timestep 39 | actions = agent.act(states=states,evaluation=True) 40 | states, terminal, reward = environment.execute(actions=actions) 41 | game_score+=reward 42 | # agent.observe(terminal=terminal, reward=reward) 43 | 44 | running_score = 0.95*running_score + 0.05*game_score 45 | if i_epoch%5==0: 46 | print("Game ", i_epoch, " game score %.2f"%game_score," running score %.2f"%running_score) 47 | 48 | 49 | 50 | agent.close() 51 | environment.close() 52 | -------------------------------------------------------------------------------- /Tensorforce/tf_main.py: -------------------------------------------------------------------------------- 1 | from tensorforce import Agent, Environment 2 | from tensorforce.agents import PPOAgent 3 | from tensorforce.environments import OpenAIGym 4 | 5 | #
Pre-defined or custom environment 6 | # environment = Environment.create( 7 | # environment='gym', level='CartPole', max_episode_timesteps=500 8 | # ) 9 | 10 | # Network as list of layers 11 | network_spec = [ 12 | dict(type='dense', size=32, activation='tanh'), 13 | dict(type='dense', size=32, activation='tanh') 14 | ] 15 | 16 | environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500) 17 | 18 | 19 | # Instantiate a Tensorforce agent 20 | # agent = Agent.create( 21 | # agent='tensorforce', 22 | # environment=environment, # alternatively: states, actions, (max_episode_timesteps) 23 | # memory=10000, 24 | # update=dict(unit='timesteps', batch_size=64), 25 | # optimizer=dict(type='adam', learning_rate=3e-4), 26 | # policy=dict(network='auto'), 27 | # objective='policy_gradient', 28 | # reward_estimation=dict(horizon=20) 29 | # ) 30 | 31 | agent = Agent.create( 32 | agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3 33 | ) 34 | 35 | # agent = PPOAgent( 36 | # states_spec=environment.states, 37 | # actions_spec=environment.actions, 38 | # network_spec=network_spec, 39 | # batch_size=4096, 40 | # # BatchAgent 41 | # keep_last_timestep=True, 42 | # # PPOAgent 43 | # step_optimizer=dict( 44 | # type='adam', 45 | # learning_rate=1e-3 46 | # ), 47 | # optimization_steps=10, 48 | # # Model 49 | # scope='ppo', 50 | # discount=0.99, 51 | # # DistributionModel 52 | # distributions_spec=None, 53 | # entropy_regularization=0.01, 54 | # # PGModel 55 | # baseline_mode=None, 56 | # baseline=None, 57 | # baseline_optimizer=None, 58 | # gae_lambda=None, 59 | # # PGLRModel 60 | # likelihood_ratio_clipping=0.2, 61 | # # summary_spec=None, 62 | # # distributed_spec=None 63 | # ) 64 | 65 | # Train for 300 episodes 66 | for _ in range(300): 67 | 68 | # Initialize episode 69 | states = environment.reset() 70 | terminal = False 71 | 72 | while not terminal: 73 | # Episode timestep 74 | actions = agent.act(states=states) 75 | states, terminal, reward = environment.execute(actions=actions) 76 | agent.observe(terminal=terminal, reward=reward) 77 | 78 | agent.close() 79 | environment.close() 80 | -------------------------------------------------------------------------------- /img/DeepQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/DeepQN.png -------------------------------------------------------------------------------- /img/LunarLandQLearning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/LunarLandQLearning.png -------------------------------------------------------------------------------- /img/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /img/Screen Shot 2017-11-01 at 7.41.58 PM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/Screen Shot 2017-11-01 at 7.41.58 PM.png -------------------------------------------------------------------------------- /img/ScreenShot1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/ScreenShot1.jpg -------------------------------------------------------------------------------- /img/Walker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/Walker.png -------------------------------------------------------------------------------- /img/cCartPole.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/cCartPole.jpg -------------------------------------------------------------------------------- /img/cPong.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/cPong.jpg -------------------------------------------------------------------------------- /img/cWalker.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FitMachineLearning/FitML/1218e10c527e1ca117cc7548419904d7801433c4/img/cWalker.jpg --------------------------------------------------------------------------------