├── README.md
└── PPO
    ├── schedules.py
    ├── networks.py
    ├── arguments.py
    ├── main.py
    ├── rl_algorithms.py
    ├── agents.py
    ├── buffers.py
    └── env_wrappers.py

/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement-Learning-Pytorch
--------------------------------------------------------------------------------
/PPO/schedules.py:
--------------------------------------------------------------------------------
1 | 
2 | class LinearSchedule(object):
3 |     def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
4 |         """Linear interpolation between initial_p and final_p over
5 |         schedule_timesteps. After this many timesteps pass final_p is
6 |         returned.
7 | 
8 |         Parameters
9 |         ----------
10 |         schedule_timesteps: int
11 |             Number of timesteps for which to linearly anneal initial_p
12 |             to final_p
13 |         initial_p: float
14 |             initial output value
15 |         final_p: float
16 |             final output value
17 |         from baselines
18 |         """
19 |         self.schedule_timesteps = schedule_timesteps
20 |         self.final_p = final_p
21 |         self.initial_p = initial_p
22 | 
23 |     def value(self, t):
24 |         """See Schedule.value"""
25 |         fraction = min(float(t) / self.schedule_timesteps, 1.0)
26 |         return self.initial_p + fraction * (self.final_p - self.initial_p)
27 | 
28 | 
--------------------------------------------------------------------------------
/PPO/networks.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.distributions.categorical import Categorical
4 | import numpy as np
5 | 
6 | class nature_cnn(nn.Module):
7 |     def __init__(self,action_space,state_space):
8 |         """
9 |         CNN from Nature paper.
10 |         """
11 |         super(nature_cnn, self).__init__()
12 | 
13 |         self.conv1 = nn.Conv2d(in_channels=state_space[-1],out_channels=32,kernel_size=8,stride=4)
14 |         self.conv2 = nn.Conv2d(in_channels=32, out_channels=64,kernel_size=4,stride=2)
15 |         self.conv3 = nn.Conv2d(in_channels=64, out_channels=64,kernel_size=3,stride=1)
16 | 
17 |         self.fc = nn.Linear(3136,512)
18 |         self.logits = nn.Linear(512,action_space)
19 |         self.value = nn.Linear(512,1)
20 | 
21 |         # code level skill 3
22 | 
23 |         nn.init.xavier_normal_(self.conv1.weight.data,gain=np.sqrt(2.))
24 |         nn.init.xavier_normal_(self.conv2.weight.data,gain=np.sqrt(2.))
25 |         nn.init.xavier_normal_(self.conv3.weight.data,gain=np.sqrt(2.))
26 |         nn.init.constant_(self.conv1.bias.data,0.0)
27 |         nn.init.constant_(self.conv2.bias.data,0.0)
28 |         nn.init.constant_(self.conv3.bias.data,0.0)
29 | 
30 |         nn.init.xavier_normal_(self.fc.weight.data,gain=np.sqrt(2.))
31 |         nn.init.constant_(self.fc.bias.data,0.0)
32 |         nn.init.xavier_normal_(self.logits.weight.data,gain=np.sqrt(2.))
33 |         nn.init.constant_(self.logits.bias.data,0.0)
34 |         nn.init.xavier_normal_(self.value.weight.data)
35 |         nn.init.constant_(self.value.bias.data,0.0)
36 | 
37 |     def forward(self,unscaled_images):
38 |         s = unscaled_images / 255.
# scale 39 | s = torch.transpose(s,1,3) # NHWC -> NCHW 40 | 41 | s = nn.functional.relu(self.conv1(s)) 42 | s = nn.functional.relu(self.conv2(s)) 43 | s = nn.functional.relu(self.conv3(s)) 44 | 45 | s = s.view(-1,self.fc.in_features) 46 | 47 | s = nn.functional.relu(self.fc(s)) 48 | logits = self.logits(s) 49 | 50 | p = torch.nn.Softmax(dim=-1)(logits) + 1e-8 51 | # p = torch.nn.Softmax(dim=-1)(logits) 52 | policy_head = Categorical(probs=p) 53 | 54 | # print('logits:',logits) 55 | # policy_head = Categorical(logits=logits) 56 | # policy_head.probs# will change random number???? 57 | # print('policy sample:',policy_head.probs.shape) 58 | 59 | value = self.value(s) 60 | 61 | return policy_head,value 62 | -------------------------------------------------------------------------------- /PPO/arguments.py: -------------------------------------------------------------------------------- 1 | # cuda 2 | import numpy as np 3 | import pandas as pd 4 | CUDA_VISIBLE_DEVICES = 3 5 | 6 | 7 | ############################### atari game ######################### 8 | # learning rate 9 | learning_rate = 2.5e-4 10 | # learning_rate = 2.5e-4/2 11 | # learning_rate = 2.5e-4 12 | # learning_rate = 2.5e-4*2 13 | # learning_rate = 2.5e-4*4 14 | # learning_rate = 2.5e-4*8 15 | # learning_rate = 2.5e-4*16 16 | 17 | # learning_rate = 2.5e-4*32 18 | 19 | reward_scale = 1 20 | record_time = 0 21 | reward_num = pd.DataFrame(columns={'run', 'record_time', '0', '10', 'others'}) 22 | # SEED_LIST = [1,2,3] 23 | # 24 | #SEED_LIST = [4, 5, 6] 25 | #SEED_LIST = [7,8,9] 26 | #SEED_LIST = [10,11,12] 27 | # SEED_LIST = [13,14,15] 28 | # SEED_LIST = [16,17,18] 29 | # SEED_LIST = [19,20] 30 | SEED_LIST = [21, 22,23,24] 31 | SEED_LIST = [25,26,27] 32 | SEED_LIST = [28,29,30] 33 | 34 | #SEED_LIST = [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] 35 | # SEED_LIST = [1,2,3] 36 | # SEE_LIST 37 | zero_reward_num = 0 38 | ten_reward_num = 0 39 | other_reward = 0 40 | 41 | ATARI_NAME = 'AlienNoFrameskip-v4' 42 | # ATARI_NAME = 'BoxingNoFrameskip-v4' 43 | # ATARI_NAME = 'MsPacmanNoFrameskip-v4' 44 | # ATARI_NAME = 'StarGunnerNoFrameskip-v4' 45 | 46 | # ATARI_NAME = 'BreakoutNoFrameskip-v4' 47 | # ATARI_NAME = 'QbertNoFrameskip-v4' 48 | # ATARI_NAME = 'ZaxxonNoFrameskip-v4' 49 | 50 | # env 51 | # ATARI_NAME = 'BeamRiderNoFrameskip-v4' 52 | # ATARI_NAME = 'BowlingNoFrameskip-v4' 53 | # ATARI_NAME = 'BoxingNoFrameskip-v4' 54 | # ATARI_NAME = 'BreakoutNoFrameskip-v4' 55 | # ATARI_NAME = 'CentipedeNoFrameskip-v4' 56 | # ATARI_NAME = 'ChopperCommandNoFrameskip-v4' 57 | # ATARI_NAME = 'CrazyClimberNoFrameskip-v4' 58 | # ATARI_NAME = 'DemonAttackNoFrameskip-v4' 59 | # ATARI_NAME = 'DoubleDunkNoFrameskip-v4' 60 | # ATARI_NAME = 'EnduroNoFrameskip-v4' 61 | # ATARI_NAME = 'FishingDerbyNoFrameskip-v4' 62 | # ATARI_NAME = 'FreewayNoFrameskip-v4' 63 | # ATARI_NAME = 'FrostbiteNoFrameskip-v4' 64 | # ATARI_NAME = 'GopherNoFrameskip-v4' 65 | # ATARI_NAME = 'GravitarNoFrameskip-v4' 66 | # ATARI_NAME = 'IceHockeyNoFrameskip-v4' 67 | # ATARI_NAME = 'JamesbondNoFrameskip-v4' 68 | # ATARI_NAME = 'KangarooNoFrameskip-v4' 69 | # ATARI_NAME = 'KrullNoFrameskip-v4' 70 | # ATARI_NAME = 'KungFuMasterNoFrameskip-v4' 71 | # ATARI_NAME = 'MontezumaRevengeNoFrameskip-v4' 72 | # ATARI_NAME = 'MsPacmanNoFrameskip-v4' 73 | # ATARI_NAME = 'NameThisGameNoFrameskip-v4' 74 | # ATARI_NAME = 'PitfallNoFrameskip-v4' 75 | 76 | # ATARI_NAME = 'PongNoFrameskip-v4' 77 | # ATARI_NAME = 'PrivateEyeNoFrameskip-v4' 78 | # ATARI_NAME = 'QbertNoFrameskip-v4' 79 | 
# ATARI_NAME = 'RiverraidNoFrameskip-v4' 80 | # ATARI_NAME = 'RoadRunnerNoFrameskip-v4' 81 | # ATARI_NAME = 'RobotankNoFrameskip-v4' 82 | # ATARI_NAME = 'SeaquestNoFrameskip-v4' 83 | # ATARI_NAME = 'SpaceInvadersNoFrameskip-v4' 84 | # ATARI_NAME = 'StarGunnerNoFrameskip-v4' 85 | # ATARI_NAME = 'TennisNoFrameskip-v4' 86 | # ATARI_NAME = 'TimePilotNoFrameskip-v4' 87 | # ATARI_NAME = 'TutankhamNoFrameskip-v4' 88 | # ATARI_NAME = 'UpNDownNoFrameskip-v4' 89 | # ATARI_NAME = 'VentureNoFrameskip-v4' 90 | # ATARI_NAME = 'VideoPinballNoFrameskip-v4' 91 | # ATARI_NAME = 'WizardOfWorNoFrameskip-v4' 92 | # ATARI_NAME = 'ZaxxonNoFrameskip-v4' 93 | 94 | # ATARI_NAME = 'AlienNoFrameskip-v4' 95 | # ATARI_NAME = 'AmidarNoFrameskip-v4' 96 | # ATARI_NAME = 'AssaultNoFrameskip-v4' 97 | # ATARI_NAME = 'AsterixNoFrameskip-v4' 98 | # ATARI_NAME = 'AsteroidsNoFrameskip-v4' 99 | # ATARI_NAME = 'AtlantisNoFrameskip-v4' 100 | # ATARI_NAME = 'BankHeistNoFrameskip-v4' 101 | # ATARI_NAME = 'BattleZoneNoFrameskip-v4' 102 | 103 | # MAX_RUNS = 3 104 | NUMBER_ENV = 8 105 | FINAL_STEP = 1e7 106 | tempM = [] # temp. mean score 107 | Mean_Score = [] 108 | tempC = [] # temp. mean clip frac 109 | Clip_Fraction = [] 110 | clip_fraction = pd.DataFrame(columns=['run', 'update_time', 'clip_fraction']) 111 | tempMeanLength = [] 112 | Mean_Length = [] 113 | batch_env = None 114 | run = 0 115 | update_time = 0 116 | -------------------------------------------------------------------------------- /PPO/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from env_wrappers import * 3 | import arguments as args 4 | from agents import PPO_Agent 5 | from networks import nature_cnn 6 | import pandas as pd 7 | import torch 8 | import random 9 | import json 10 | def run_atari(train = True, render = False): 11 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.CUDA_VISIBLE_DEVICES) 12 | args.batch_env = Baselines_DummyVecEnv(env_id=args.ATARI_NAME,num_env=args.NUMBER_ENV) 13 | agent = PPO_Agent(args.batch_env.action_space,args.batch_env.observation_space,nature_cnn) 14 | 15 | states = args.batch_env.reset() 16 | rewards, dones, info = None,None,None 17 | current_step = 0 18 | tstart = time.time() 19 | while current_step < args.FINAL_STEP: 20 | actions = agent.act(states,rewards, dones, info, train=train,current_step=current_step) 21 | next_states, rewards, dones, info = args.batch_env.step(actions) 22 | if render: 23 | args.batch_env.render() 24 | states = next_states 25 | current_step += args.batch_env.get_num_of_envs() 26 | if current_step % 10000 == 0: 27 | tnow = time.time() 28 | fps = current_step / (tnow - tstart) 29 | print('game: {}, run: {}, lr:{}, reward_scale:{}, current_step: {:.2e}, time: {:.2f}, fps: {:.2f}, mean reward: {}, mean length: {}'.format( 30 | args.ATARI_NAME, [args.run,args.SEED_LIST],args.learning_rate,args.reward_scale, current_step, tnow - tstart, fps,args.batch_env.get_episode_rewmean(),args.batch_env.get_episode_lenmean())) 31 | 32 | args.Mean_Score.append(args.tempM) 33 | args.Clip_Fraction.append(args.tempC) 34 | args.Mean_Length.append(args.tempMeanLength) 35 | 36 | args.tempC = [] 37 | args.tempM = [] 38 | args.tempMeanLength = [] 39 | 40 | def setup_seed(seed): 41 | torch.manual_seed(seed) 42 | torch.cuda.manual_seed(seed) 43 | # torch.cuda.manual_seed_all(seed) 44 | np.random.seed(seed) 45 | random.seed(seed) 46 | torch.backends.cudnn.benchmark = False 47 | torch.backends.cudnn.deterministic = True 48 | 49 | 50 | if __name__ == '__main__': 51 | 
print('Starting...') 52 | print('lr: ', args.learning_rate) 53 | # while args.run < args.MAX_RUNS: 54 | # seed = 1000*(args.run+1) 55 | # setup_seed(seed) 56 | # 57 | # run_atari() 58 | # args.run += 1 59 | # args.update_time = 0 60 | print(torch.cuda.is_available()) 61 | 62 | print(args.SEED_LIST) 63 | for item in args.SEED_LIST: 64 | seed = 1000 * item 65 | setup_seed(seed) 66 | args.other_reward, args.zero_reward_num, args.ten_reward_num = 0, 0, 0 67 | run_atari() 68 | 69 | args.run += 1 70 | args.update_time = 0 71 | args.record_time = 0 72 | 73 | # path = "loss_data" + str(args.run) +'-'+str(args.learning_rate[args.lr])+'-'+str(args.reward_scale)+ ".json" 74 | # with open(path, mode='w') as loss_data_file: 75 | # json.dump(args.returns, loss_data_file) 76 | # args.returns = {} 77 | 78 | 79 | # print(len(args.Mean_Score[0])) 80 | 81 | 82 | print('Saving data...') 83 | mean_score = pd.DataFrame(args.Mean_Score).melt(var_name='iteration', value_name='mean_score') # thiskind of dict is easy to draw line plots with std 84 | clip_fraction = pd.DataFrame(args.Clip_Fraction).melt(var_name='update_time', value_name='clip_fraction') 85 | mean_length = pd.DataFrame(args.Mean_Length).melt(var_name='iteration', value_name='mean_length') 86 | 87 | 88 | args.reward_num.to_csv(args.ATARI_NAME + '_' + str(args.learning_rate)+'_' + str(args.SEED_LIST)+'_' +str(args.reward_scale) + "_reward_num.csv", index=False) 89 | print("Save reward_num successfully.") 90 | # save mean score 91 | mean_score.to_csv(args.ATARI_NAME+'_'+str(args.learning_rate)+'_' + str(args.SEED_LIST)+'_'+str(args.reward_scale) +"_mean_score_std.csv", index=False) 92 | print("Save mean_score successfully.") 93 | 94 | # save mean length 95 | mean_length.to_csv(args.ATARI_NAME+'_'+str(args.learning_rate)+'_' + str(args.SEED_LIST)+'_'+str(args.reward_scale) +"_mean_length_std.csv", index=False) 96 | print("Save mean_length successfully.") 97 | 98 | # save clip fraction 99 | clip_fraction.to_csv(args.ATARI_NAME+'_'+str(args.learning_rate)+'_' + str(args.SEED_LIST)+'_'+str(args.reward_scale) +"_clip_fraction_std.csv", index=False) 100 | 101 | args.clip_fraction.to_csv(args.ATARI_NAME+'_'+str(args.learning_rate)+'_' + str(args.SEED_LIST)+'_'+str(args.reward_scale)+"_clip_fraction.csv", index=False) # easy to distinguish runs 102 | print("Save clip_fraction successfully.") 103 | print('Done.') -------------------------------------------------------------------------------- /PPO/rl_algorithms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import arguments as args 3 | import numpy as np 4 | import pandas as pd 5 | class PPO_clip(): 6 | def __init__(self, net, decay,device): 7 | self.net = net 8 | self.decay = decay 9 | self.device = device 10 | # parameters 11 | self.value_factor = 1. 
# paper value
12 |         self.entropy_factor = 0.01
13 |         self.clip_epsilon = 0.1
14 |         self.learning_rate = args.learning_rate
15 |         # self.adjustlr = args.adjustlr
16 |         self.training_epoch = 3
17 |         self.time_horizon = 128 # ENV_NUMBER x TIME_HORIZON should be larger than BATCH_SIZE
18 |         self.batch_size = 256 # can be bigger
19 | 
20 |         self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.learning_rate)
21 |         # print('self.net.parameters():',self.net.parameters())
22 | 
23 |     def learn(self,current_step,state_batch,action_batch,return_batch,old_value_batch,old_log_prob_batch,adv_batch):
24 |         state_batch = torch.from_numpy(state_batch).to(self.device)
25 |         action_batch = torch.from_numpy(action_batch).to(self.device)
26 |         return_batch = torch.from_numpy(return_batch).to(self.device)
27 | 
28 |         old_value_batch = torch.from_numpy(old_value_batch).to(self.device)
29 |         old_log_prob_batch = torch.from_numpy(old_log_prob_batch).to(self.device)
30 |         adv_batch = torch.from_numpy(adv_batch).to(self.device)
31 |         # print('state_batch:',state_batch)
32 | 
33 |         self.alpha = self.decay.value(current_step)
34 |         lr = self.learning_rate * self.alpha
35 | 
36 |         for param_group in self.optimizer.param_groups:
37 |             param_group['lr'] = lr
38 | 
39 |         policy_head, value_batch = self.net(state_batch)
40 |         # print('value_batch:',value_batch)
41 |         log_prob_batch = policy_head.log_prob(action_batch)
42 |         # print('log_prob_batch : ',log_prob_batch.shape)
43 | 
44 |         self.v_loss,self.v_others = self.value_loss_clip(value_batch, return_batch,old_value_batch) # todo: not mentioned in paper, but used in openai baselines
45 |         self.v_loss_no_clip,_ = self.value_loss(value_batch, return_batch)
46 |         self.pi_loss,self.pi_others = self.policy_loss(log_prob_batch,old_log_prob_batch,adv_batch)
47 |         # print('entropy :',policy_head.entropy())
48 |         self.entropy = torch.mean(policy_head.entropy())
49 | 
50 |         loss = self.v_loss_no_clip * self.value_factor - self.pi_loss - self.entropy * self.entropy_factor
51 | 
52 |         self.optimizer.zero_grad()
53 |         loss.backward()
54 |         torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
55 |         self.optimizer.step()
56 | 
57 |         return self.v_loss.cpu().item(), self.v_loss_no_clip.cpu().item()
58 | 
59 | 
60 |     def value_loss(self,value_batch,return_batch):
61 |         value_loss = torch.mean((value_batch - return_batch) ** 2)
62 |         # others = {'r_square':r_square}
63 |         others = None
64 |         return value_loss,others
65 | 
66 |     def value_loss_clip(self,value_batch, return_batch,old_value_batch): # value clip, code level skill 1
67 |         value_clipped = old_value_batch + torch.clamp(value_batch-old_value_batch,-self.clip_epsilon, self.clip_epsilon)
68 |         value_loss_1 = (value_batch - return_batch) ** 2
69 |         value_loss_2 = (return_batch - value_clipped) ** 2
70 |         # print('value loss: ',value_loss_1.shape)
71 |         # print('max: ',torch.max(value_loss_1, value_loss_2).shape)
72 |         value_loss = .5 * torch.mean(torch.max(value_loss_1, value_loss_2))
73 | 
74 |         # others = {'r_square':r_square}
75 |         others = None
76 |         return value_loss, others
77 | 
78 |     def policy_loss(self,log_prob_batch,old_log_prob_batch,adv_batch):
79 |         ratio = torch.exp(log_prob_batch - old_log_prob_batch)
80 |         ratio_average = ratio.cpu().detach().numpy().mean()
81 |         # args.Ratio['run_time'].append(args.index)
82 |         # args.Ratio['ratio'].append(ratio_average)
83 | 
84 |         ratio = ratio.view(-1,1) # take care of the dimension here!!!
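        # Note (added): the block below implements the PPO clipped surrogate,
        # L_CLIP = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)], with r_t = pi / pi_old.
        # Unlike the original paper, this implementation also anneals the clip range
        # (self.clip_epsilon * self.alpha) together with the learning rate via LinearSchedule,
        # so both the step size and the trust region shrink linearly over training.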
85 | 86 | # print('log_prob_batch:',log_prob_batch.shape) 87 | # print('old_log_prob_batch:',old_log_prob_batch.shape) 88 | # print('adv_batch:',adv_batch.shape) 89 | # print('ratio:',ratio.shape) 90 | 91 | surrogate_1 = ratio * adv_batch 92 | surrogate_2 = torch.clamp(ratio, 1 - self.clip_epsilon*self.alpha, 1 + self.clip_epsilon*self.alpha) * adv_batch 93 | surrogate = torch.min(surrogate_1, surrogate_2) 94 | policy_loss = torch.mean(surrogate) 95 | 96 | approxkl = .5 * torch.mean((old_log_prob_batch - log_prob_batch)**2) 97 | # print('ratio : ', torch.gt(torch.abs(ratio-1.),self.clip_epsilon*self.alpha).float()) 98 | clipfrac = torch.mean(torch.gt(torch.abs(ratio-1.),self.clip_epsilon*self.alpha).float()) 99 | # print('clipfrac :',clipfrac) 100 | 101 | args.clip_fraction = args.clip_fraction.append(pd.DataFrame({'run': [args.run], 'update_time':[args.update_time], 'clip_fraction':clipfrac.item()}),ignore_index=True) 102 | 103 | args.tempC.append(clipfrac.item()) 104 | #print(args.clip_fraction) 105 | args.update_time +=1 106 | others = {'approxkl':approxkl,'clipfrac':clipfrac} 107 | return policy_loss, others 108 | -------------------------------------------------------------------------------- /PPO/agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from rl_algorithms import PPO_clip 4 | from schedules import LinearSchedule 5 | import arguments as args 6 | from buffers import BatchBuffer 7 | import seaborn as sns 8 | import matplotlib.pyplot as plt 9 | import arguments as args 10 | import json 11 | import pandas as pd 12 | class PPO_Agent(): 13 | def __init__(self,action_space, state_space,net): 14 | self.action_space = action_space 15 | self.state_space = state_space 16 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | # self.device = "cpu" 18 | 19 | self.net = net(self.action_space,self.state_space).to(self.device) 20 | self.decay = LinearSchedule(schedule_timesteps=args.FINAL_STEP, final_p=0.) 
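        # Note (added): self.decay anneals linearly from 1.0 to 0.0 over args.FINAL_STEP steps;
        # PPO_clip.learn() uses it to scale both the Adam learning rate and, in policy_loss,
        # the clip range, so updates become progressively more conservative late in training.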
21 | self.update = PPO_clip(self.net, self.decay,self.device) 22 | self.batch_buffer = BatchBuffer(buffer_num=args.NUMBER_ENV,gamma=0.99,lam=0.95) 23 | 24 | def act(self,states,rewards,dones,info,train,current_step): 25 | states = torch.from_numpy(np.array(states)).to(self.device).float() 26 | policy_head, values = self.net(states) 27 | # print('policy head: ',policy_head) 28 | # print('values: ',values) 29 | actions = policy_head.sample() 30 | log_probs = policy_head.log_prob(actions) 31 | 32 | if train: 33 | self.train(states.detach().cpu().numpy(), 34 | actions.detach().cpu().numpy(), 35 | rewards, 36 | dones, 37 | values.detach().cpu().numpy(), 38 | log_probs.detach().cpu().numpy(), 39 | current_step) 40 | 41 | return actions.detach().cpu().numpy() 42 | 43 | def train(self,states,actions,rewards,dones,values,log_probs,current_step): 44 | values = np.reshape(values, [-1]) 45 | if rewards is None and dones is None: 46 | for i in range(self.batch_buffer.buffer_num): 47 | self.batch_buffer.buffer_list[i].add_data( 48 | state_t=states[i], 49 | action_t=actions[i], 50 | value_t=values[i], 51 | log_prob_t=log_probs[i]) 52 | else: 53 | for i in range(self.batch_buffer.buffer_num): 54 | self.batch_buffer.buffer_list[i].add_data( 55 | state_t=states[i], 56 | action_t=actions[i], 57 | reward_t=rewards[i], 58 | terminal_t=dones[i], 59 | value_t=values[i], 60 | log_prob_t=log_probs[i]) 61 | 62 | if current_step > 0 and current_step / self.batch_buffer.buffer_num % self.update.time_horizon == 0: 63 | #print(np.shape(self.batch_buffer.buffer_list)) 64 | 65 | args.tempM.append(args.batch_env.get_episode_rewmean()) 66 | args.tempMeanLength.append(args.batch_env.get_episode_lenmean()) 67 | args.reward_num = args.reward_num.append(pd.DataFrame({'run': [args.run], 'record_time': [args.record_time], '0': [args.zero_reward_num], '10': [args.ten_reward_num], 'others': [args.other_reward]})) 68 | 69 | s, a, ret, v, logp, adv = self.batch_buffer.get_data() 70 | args.other_reward, args.ten_reward_num, args.zero_reward_num = 0, 0, 0 71 | 72 | # miu = np.mean(ret, axis=1).reshape(-1, 1) 73 | # std = np.std(ret, axis=1).reshape(-1, 1) 74 | # # print(miu.shape, std.shape) 75 | # ret = (ret - miu) / (std + 1e-8) 76 | for epoch in range(self.update.training_epoch): 77 | s, a, ret, v, logp, adv = self.batch_buffer.shuffle_data(s, self.state_space, a, ret, v, logp, adv) 78 | 79 | num_batch = self.update.time_horizon*self.batch_buffer.buffer_num // self.update.batch_size 80 | for i in range(num_batch): 81 | 82 | batch_s, batch_a, batch_ret, batch_v, batch_logp, batch_adv = self.batch_buffer.get_minibatch(i*self.update.batch_size, 83 | self.update.batch_size, 84 | s, a, 85 | ret, 86 | v, 87 | logp, 88 | adv) 89 | # print('minibatch data shape:', s.shape, a.shape, ret.shape, v.shape,logp.shape, adv.shape) 90 | # print('minibatch data shape:', batch_s.shape, batch_a.shape, batch_ret.shape, batch_v.shape,batch_logp.shape, batch_adv.shape) 91 | 92 | _, _ = self.update.learn(current_step, batch_s, batch_a, batch_ret, batch_v, batch_logp, batch_adv) 93 | 94 | args.record_time += 1 95 | #reward 96 | self.batch_buffer.initialize_buffer_list() 97 | 98 | states = torch.from_numpy(states).to(self.device) 99 | actions = torch.from_numpy(actions).to(self.device) 100 | policy_head, values = self.net(states) 101 | log_probs = policy_head.log_prob(actions) 102 | log_probs = log_probs.detach().cpu().numpy() 103 | states = states.detach().cpu().numpy() 104 | actions = actions.detach().cpu().numpy() 105 | values = 
values.detach().cpu().numpy() 106 | values = np.reshape(values, [-1]) 107 | 108 | for i in range(self.batch_buffer.buffer_num): 109 | self.batch_buffer.buffer_list[i].add_data( 110 | state_t=states[i], 111 | action_t=actions[i], 112 | value_t=values[i], 113 | log_prob_t=log_probs[i]) # here may be the problem 114 | -------------------------------------------------------------------------------- /PPO/buffers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # need to speed up, such as put them on numpy directly or on Tensor 4 | class Buffer(): 5 | def __init__(self): 6 | self.initialize_buffer() 7 | 8 | def initialize_buffer(self): 9 | self.state_list = [] 10 | self.action_list = [] 11 | self.reward_list = [] 12 | self.terminal_list = [] 13 | self.value_list = [] 14 | self.log_prob_list = [] 15 | 16 | def add_data(self,state_t=None,action_t=None,reward_t=None,terminal_t=None,value_t=None,log_prob_t=None): 17 | if state_t is not None: 18 | self.state_list.append(state_t) 19 | if action_t is not None: 20 | self.action_list.append(action_t) 21 | if reward_t is not None: 22 | self.reward_list.append(reward_t) 23 | if terminal_t is not None: 24 | self.terminal_list.append(terminal_t) 25 | if value_t is not None: 26 | self.value_list.append(value_t) 27 | if log_prob_t is not None: 28 | self.log_prob_list.append(log_prob_t) 29 | 30 | class BatchBuffer(): 31 | def __init__(self,buffer_num,gamma,lam): 32 | self.buffer_num = buffer_num 33 | self.buffer_list = [Buffer() for _ in range(self.buffer_num)] 34 | self.gamma = gamma 35 | self.lam = lam 36 | def initialize_buffer_list(self): 37 | for buffer in self.buffer_list: 38 | buffer.initialize_buffer() 39 | 40 | def add_batch_data(self,states_t=None,actions_t=None,rewards_t=None,terminals_t=None,values_t=None,log_probs_t=None): 41 | for i in range(self.buffer_num): 42 | self.buffer_list[i].add_data(states_t[i],actions_t[i],rewards_t[i],terminals_t[i],values_t[i],log_probs_t[i]) 43 | 44 | def buffer_list_to_array(self): 45 | states = [] 46 | actions = [] 47 | rewards = [] 48 | terminals = [] 49 | values = [] 50 | log_probs = [] 51 | for buffer in self.buffer_list: 52 | states.append(buffer.state_list) 53 | actions.append(buffer.action_list) 54 | rewards.append(buffer.reward_list) 55 | terminals.append(buffer.terminal_list) 56 | values.append(buffer.value_list) 57 | log_probs.append(buffer.log_prob_list) 58 | 59 | states = np.array(states) 60 | actions = np.array(actions) 61 | rewards = np.array(rewards) 62 | terminals = np.array(terminals) 63 | values = np.array(values) 64 | log_probs = np.array(log_probs) 65 | 66 | return states,actions,rewards,terminals,values,log_probs 67 | 68 | def compute_reward_to_go_returns(self,rewards,values,terminals): 69 | ''' 70 | the env will reset directly once it ends and return a new state 71 | st is only one more than at and rt at the end of the episode 72 | state: s1 s2 s3 ... st-1 - 73 | action: a1 a2 a3 ... at-1 - 74 | reward: r1 r2 r3 ... rt-1 - 75 | terminal: t1 t2 t3 ... tt-1 - 76 | value: v1 v2 v3 ... vt-1 vt 77 | ''' 78 | # (N,T) -> (T,N) N:n_envs T:traj_length 79 | rewards = np.transpose(rewards,[1,0]) 80 | values = np.transpose(values, [1, 0]) 81 | terminals = np.transpose(terminals,[1,0]) 82 | R = values[-1] 83 | returns = [] 84 | 85 | for i in reversed(range(rewards.shape[0])): 86 | R = rewards[i] + (1. 
- terminals[i]) * self.gamma * R 87 | returns.append(R) 88 | returns = list(reversed(returns)) 89 | # (T,N) -> (N,T) 90 | returns = np.transpose(returns,[1,0]) 91 | return returns 92 | 93 | def compute_GAE(self,rewards,values,terminals): 94 | # (N,T) -> (T,N) 95 | rewards = np.transpose(rewards,[1,0]) 96 | values = np.transpose(values,[1,0]) 97 | terminals = np.transpose(terminals,[1,0]) 98 | length = rewards.shape[0] 99 | # print('reward:{},value:{},terminal{}'.format(rewards.shape,values.shape,terminals.shape)) 100 | deltas = [] 101 | for i in reversed(range(length)): 102 | v = rewards[i] + (1. - terminals[i]) * self.gamma * values[i+1] 103 | delta = v - values[i] 104 | deltas.append(delta) 105 | deltas = np.array(list(reversed(deltas))) 106 | 107 | A = deltas[-1,:] 108 | advantages = [A] 109 | for i in reversed(range(length-1)): 110 | A = deltas[i] + (1. - terminals[i]) * self.gamma * self.lam * A 111 | advantages.append(A) 112 | advantages = reversed(advantages) 113 | # (T,N) -> (N,T) 114 | advantages = np.transpose(list(advantages),[1,0]) 115 | # print(advantages) 116 | return advantages 117 | 118 | def get_data(self): 119 | states, actions, rewards, terminals, values, log_probs = self.buffer_list_to_array() 120 | advs = self.compute_GAE(rewards,values,terminals) 121 | advs = (advs - np.mean(advs)) / (np.std(advs) + 1e-8) 122 | returns = self.compute_reward_to_go_returns(rewards,values,terminals) 123 | 124 | return states[:,:len(advs[0])],actions[:,:len(advs[0])],returns,values[:,:len(advs[0])],log_probs[:,:len(advs[0])],advs 125 | 126 | def shuffle_data(self,states, state_space, actions,returns,values,log_probs,advs): 127 | states = np.reshape(states, [-1] + state_space) 128 | actions = np.reshape(actions, [-1]) 129 | returns = np.reshape(returns, [-1, 1]) 130 | values = np.reshape(values, [-1, 1]) 131 | log_probs = np.reshape(log_probs, [-1]) 132 | advs = np.reshape(advs, [-1, 1]) 133 | 134 | indices = np.random.permutation(range(len(advs))).tolist() 135 | 136 | states = states[indices] 137 | actions = actions[indices] 138 | returns = returns[indices] 139 | values = values[indices] 140 | log_probs = log_probs[indices] 141 | advs = advs[indices] 142 | 143 | return states,actions,returns,values,log_probs,advs 144 | 145 | def get_minibatch(self,startingIndex, batch_size,states,actions,returns,values,log_probs,advs): 146 | batch_states = states[startingIndex : startingIndex+batch_size] 147 | batch_actions = actions[startingIndex : startingIndex+batch_size] 148 | 149 | batch_returns = returns[startingIndex : startingIndex+batch_size] 150 | batch_values = values[startingIndex : startingIndex + batch_size] 151 | batch_log_probs = log_probs[startingIndex : startingIndex + batch_size] 152 | batch_advs = advs[startingIndex : startingIndex + batch_size] 153 | 154 | return batch_states, batch_actions, batch_returns, batch_values, batch_log_probs, batch_advs -------------------------------------------------------------------------------- /PPO/env_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | import gym 4 | from gym import spaces 5 | import time 6 | import cv2 7 | import arguments as args 8 | import numpy as np 9 | from collections import deque 10 | import gym 11 | from gym import spaces 12 | import time 13 | import cv2 14 | import arguments as args 15 | cv2.ocl.setUseOpenCL(False) 16 | 17 | # https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 18 | 19 | 
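# Note (added): Baselines_DummyVecEnv (bottom of this file) composes these wrappers per env as
# NoopResetEnv -> MaxAndSkipEnv -> EpisodicLifeEnv_withInfos -> FireResetEnv (when the game has
# a FIRE action) -> WarpFrame -> ClipRewardEnv -> VecFrameStack(nstack=4) -> PreprocessWrapper,
# so the agent receives float32 observations of shape (84, 84, 4), matching nature_cnn's input.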
class VecFrameStack(gym.Wrapper): 20 | def __init__(self, env, nstack): 21 | gym.Wrapper.__init__(self, env) 22 | self.nstack = nstack 23 | wos = env.observation_space # wrapped ob space 24 | low = np.repeat(wos.low, self.nstack, axis=-1) 25 | high = np.repeat(wos.high, self.nstack, axis=-1) 26 | self.stackedobs = np.zeros(low.shape, low.dtype) 27 | self.observation_space = spaces.Box(low=low, high=high, dtype=env.observation_space.dtype) 28 | 29 | def step(self,action): 30 | obs, rew, new, info = self.env.step(action) 31 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 32 | self.stackedobs[..., -obs.shape[-1]:] = obs 33 | return self.stackedobs, rew, new, info 34 | 35 | def reset(self): 36 | obs = self.env.reset() 37 | self.stackedobs[...] = 0 38 | self.stackedobs[..., -obs.shape[-1]:] = obs 39 | return self.stackedobs 40 | 41 | class NoopResetEnv(gym.Wrapper): 42 | def __init__(self, env, noop_max=30): 43 | """Sample initial states by taking random number of no-ops on reset. 44 | No-op is assumed to be action 0. 45 | """ 46 | gym.Wrapper.__init__(self, env) 47 | self.noop_max = noop_max 48 | self.override_num_noops = None # set a fixed number 49 | self.noop_action = 0 50 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 51 | 52 | def reset(self, **kwargs): 53 | """ Do no-op action for a number of steps in [1, noop_max].""" 54 | self.env.reset(**kwargs) 55 | if self.override_num_noops is not None: 56 | noops = self.override_num_noops 57 | else: 58 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 59 | assert noops > 0 60 | obs = None 61 | for _ in range(noops): 62 | obs, _, done, _ = self.env.step(self.noop_action) 63 | if done: 64 | obs = self.env.reset(**kwargs) 65 | return obs 66 | 67 | def step(self, ac): 68 | return self.env.step(ac) 69 | 70 | class FireResetEnv(gym.Wrapper): 71 | def __init__(self, env): 72 | """Take action on reset for environments that are fixed until firing.""" 73 | gym.Wrapper.__init__(self, env) 74 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 75 | assert len(env.unwrapped.get_action_meanings()) >= 3 76 | 77 | def reset(self, **kwargs): 78 | self.env.reset(**kwargs) 79 | obs, _, done, _ = self.env.step(1) 80 | if done: 81 | self.env.reset(**kwargs) 82 | obs, _, done, _ = self.env.step(2) 83 | if done: 84 | self.env.reset(**kwargs) 85 | return obs 86 | 87 | def step(self, ac): 88 | return self.env.step(ac) 89 | 90 | class EpisodicLifeEnv_withInfos(gym.Wrapper): 91 | def __init__(self, env): 92 | """Make end-of-life == end-of-episode, but only reset on true game over. 93 | Done by DeepMind for the DQN and co. since it helps value_head estimation. 
94 | """ 95 | gym.Wrapper.__init__(self, env) 96 | self.lives = 0 97 | self.was_real_done = True 98 | self.rewards = [] 99 | self.tstart = time.time() 100 | 101 | def update(self,rew,info): 102 | # add a flag implying EpisodicLife is used : info['EpisodicLife'] == True 103 | info['EpisodicLife'] = True 104 | self.rewards.append(rew) 105 | if self.was_real_done: 106 | eprew = sum(self.rewards) 107 | eplen = len(self.rewards) 108 | epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} 109 | assert isinstance(info,dict) 110 | if isinstance(info,dict): 111 | info['episode'] = epinfo 112 | self.rewards = [] 113 | 114 | def step(self, action): 115 | obs, reward, done, info = self.env.step(action) 116 | self.was_real_done = done 117 | # check current lives, make loss of life terminal, 118 | # then update lives to handle bonus lives 119 | lives = self.env.unwrapped.ale.lives() 120 | if lives < self.lives and lives > 0: 121 | # for Qbert sometimes we stay in lives == 0 condition for a few frames 122 | # so it's important to keep lives > 0, so that we only reset once 123 | # the environment advertises done. 124 | done = True 125 | self.lives = lives 126 | self.update(reward,info) 127 | return obs, reward, done, info 128 | 129 | def reset(self, **kwargs): 130 | """Reset only when lives are exhausted. 131 | This way all states are still reachable even though lives are episodic, 132 | and the learner need not know about any of this behind-the-scenes. 133 | """ 134 | if self.was_real_done: 135 | obs = self.env.reset(**kwargs) 136 | else: 137 | # no-op step to advance from terminal/lost life state 138 | obs, _, _, _ = self.env.step(0) 139 | self.lives = self.env.unwrapped.ale.lives() 140 | return obs 141 | 142 | 143 | class MaxAndSkipEnv(gym.Wrapper): 144 | def __init__(self, env, skip=4): 145 | """Return only every `skip`-th frame""" 146 | gym.Wrapper.__init__(self, env) 147 | # most recent raw observations (for max pooling across time steps) 148 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 149 | self._skip = skip 150 | 151 | def step(self, action): 152 | """Repeat action, sum reward, and max over last observations.""" 153 | total_reward = 0.0 154 | done = None 155 | for i in range(self._skip): 156 | # only save the last 2 obs and do a max operation 157 | # other skip-2 obs will be omitted 158 | obs, reward, done, info = self.env.step(action) 159 | if i == self._skip - 2: self._obs_buffer[0] = obs 160 | if i == self._skip - 1: self._obs_buffer[1] = obs 161 | total_reward += reward 162 | if done: 163 | break 164 | # Note that the observation on the done=True frame 165 | # doesn't matter 166 | max_frame = self._obs_buffer.max(axis=0) 167 | 168 | return max_frame, total_reward, done, info 169 | 170 | def reset(self, **kwargs): 171 | return self.env.reset(**kwargs) 172 | 173 | class ClipRewardEnv(gym.RewardWrapper): 174 | def __init__(self, env): 175 | gym.RewardWrapper.__init__(self, env) 176 | 177 | def reward(self, reward): 178 | """Bin reward to {+1, 0, -1} by its sign.""" 179 | if reward == 0: 180 | args.zero_reward_num += 1 181 | elif reward == 10: 182 | args.ten_reward_num += 1 183 | else: 184 | args.other_reward += 1 185 | # if reward!=0: 186 | # print(reward) 187 | return np.sign(reward)*args.reward_scale 188 | 189 | 190 | class WarpFrame(gym.ObservationWrapper): 191 | def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None): 192 | """ 193 | Warp frames to 84x84 as done in the Nature paper and later work. 
194 | 195 | If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which 196 | observation should be warped. 197 | """ 198 | super().__init__(env) 199 | self._width = width 200 | self._height = height 201 | self._grayscale = grayscale 202 | self._key = dict_space_key 203 | if self._grayscale: 204 | num_colors = 1 205 | else: 206 | num_colors = 3 207 | 208 | new_space = gym.spaces.Box( 209 | low=0, 210 | high=255, 211 | shape=(self._height, self._width, num_colors), 212 | dtype=np.uint8, 213 | ) 214 | if self._key is None: 215 | original_space = self.observation_space 216 | self.observation_space = new_space 217 | else: 218 | original_space = self.observation_space.spaces[self._key] 219 | self.observation_space.spaces[self._key] = new_space 220 | assert original_space.dtype == np.uint8 and len(original_space.shape) == 3 221 | 222 | def observation(self, obs): 223 | if self._key is None: 224 | frame = obs 225 | else: 226 | frame = obs[self._key] 227 | 228 | if self._grayscale: 229 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 230 | frame = cv2.resize( 231 | frame, (self._width, self._height), interpolation=cv2.INTER_AREA 232 | ) 233 | if self._grayscale: 234 | frame = np.expand_dims(frame, -1) 235 | 236 | if self._key is None: 237 | obs = frame 238 | else: 239 | obs = obs.copy() 240 | obs[self._key] = frame 241 | return obs 242 | 243 | 244 | class FrameStack(gym.Wrapper): 245 | def __init__(self, env, k): 246 | """Stack k last frames. 247 | Returns lazy array, which is much more memory efficient. 248 | See Also 249 | -------- 250 | baselines.common.atari_wrappers.LazyFrames 251 | 252 | Not convenient to check the shape and value_head of a state 253 | """ 254 | gym.Wrapper.__init__(self, env) 255 | self.k = k 256 | self.frames = deque([], maxlen=k) 257 | shp = env.observation_space.shape 258 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) 259 | 260 | def reset(self): 261 | ob = self.env.reset() 262 | for _ in range(self.k): 263 | self.frames.append(ob) 264 | return self._get_ob() 265 | 266 | def step(self, action): 267 | ob, reward, done, info = self.env.step(action) 268 | self.frames.append(ob) 269 | return self._get_ob(), reward, done, info 270 | 271 | def _get_ob(self): 272 | assert len(self.frames) == self.k 273 | return LazyFrames(list(self.frames)) 274 | 275 | class ScaledFloatFrame(gym.ObservationWrapper): 276 | def __init__(self, env): 277 | gym.ObservationWrapper.__init__(self, env) 278 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 279 | 280 | def observation(self, observation): 281 | # careful! This undoes the memory optimization, use 282 | # with smaller replay buffers only. 283 | return np.array(observation).astype(np.float32) / 255.0 284 | 285 | class LazyFrames(object): 286 | def __init__(self, frames): 287 | """This object ensures that common frames between the observations are only stored once. 288 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 289 | buffers. 290 | 291 | This object should only be converted to numpy array before being passed to the model. 
292 | 293 | You'd not believe how complex the previous solution was.""" 294 | self._frames = frames 295 | self._out = None 296 | 297 | def _force(self): 298 | if self._out is None: 299 | self._out = np.concatenate(self._frames, axis=-1) 300 | self._frames = None 301 | return self._out 302 | 303 | def __array__(self, dtype=None): 304 | out = self._force() 305 | if dtype is not None: 306 | out = out.astype(dtype) 307 | return out 308 | 309 | def __len__(self): 310 | return len(self._force()) 311 | 312 | def __getitem__(self, i): 313 | return self._force()[i] 314 | 315 | def count(self): 316 | frames = self._force() 317 | return frames.shape[frames.ndim - 1] 318 | 319 | def frame(self, i): 320 | return self._force()[..., i] 321 | 322 | class TimeLimit(gym.Wrapper): 323 | def __init__(self, env, max_episode_steps=None): 324 | super(TimeLimit, self).__init__(env) 325 | self._max_episode_steps = max_episode_steps 326 | self._elapsed_steps = 0 327 | 328 | def step(self, ac): 329 | observation, reward, done, info = self.env.step(ac) 330 | self._elapsed_steps += 1 331 | if self._elapsed_steps >= self._max_episode_steps: 332 | done = True 333 | info['TimeLimit.truncated'] = True 334 | return observation, reward, done, info 335 | 336 | def reset(self, **kwargs): 337 | self._elapsed_steps = 0 338 | return self.env.reset(**kwargs) 339 | 340 | class ClipActionsWrapper(gym.Wrapper): 341 | def step(self, action): 342 | import numpy as np 343 | action = np.nan_to_num(action) 344 | action = np.clip(action, self.action_space.low, self.action_space.high) 345 | return self.env.step(action) 346 | 347 | def reset(self, **kwargs): 348 | return self.env.reset(**kwargs) 349 | 350 | class PreprocessWrapper(gym.Wrapper): 351 | def __init__(self, env, r_preprocess=None, s_preprocess=None): 352 | ''' 353 | reward & state preprocess 354 | record info like real reward, episode length, etc 355 | Be careful: when an episode is done: check info['episode'] for information 356 | ''' 357 | gym.Wrapper.__init__(self, env) 358 | self.env = env 359 | self.observation_space = env.observation_space 360 | self.action_space = env.action_space 361 | self.r_preprocess = r_preprocess 362 | self.s_preprocess = s_preprocess 363 | self.rewards = [] 364 | 365 | def step(self, action): 366 | state, reward, done, info = self.env.step(action) 367 | state = state.astype('float32') # todo: can change to int8 on atari 368 | self.rewards.append(reward) 369 | if done: 370 | # if no EpisodicLifeEnv_withInfos wrapper, update info here 371 | if not info.get('EpisodicLife'): 372 | # return None if there is no EpisodicLife 373 | eprew = sum(self.rewards) 374 | eplen = len(self.rewards) 375 | epinfo = {"r": round(eprew, 6), "l": eplen} 376 | assert isinstance(info,dict) 377 | if isinstance(info,dict): 378 | info['episode'] = epinfo 379 | self.rewards = [] 380 | # preprocess reward 381 | if self.r_preprocess is not None: 382 | reward = self.r_preprocess(reward) 383 | # preprocess state 384 | if self.s_preprocess is not None: 385 | state = self.s_preprocess(state) 386 | return state, reward, done, info 387 | 388 | def reset(self): 389 | state = self.env.reset() 390 | state = state.astype('float32') # todo: can change to int8 on atari 391 | # preprocess state 392 | if self.s_preprocess is not None: 393 | state = self.s_preprocess(state) 394 | return state 395 | 396 | def render(self, mode='human'): 397 | return self.env.render(mode=mode) 398 | 399 | class BatchEnvWrapper: 400 | def __init__(self, envs): 401 | self.envs = envs 402 | self.observation_space = 
list(envs[0].observation_space.shape) 403 | # self.observation_space = [84,84,1] 404 | self.action_space = envs[0].action_space.n 405 | self.epinfobuf = deque(maxlen=100) 406 | 407 | def step(self, actions): 408 | states = [] 409 | rewards = [] 410 | dones = [] 411 | infos = [] 412 | for i, env in enumerate(self.envs): 413 | state, reward, done, info = env.step(actions[i]) 414 | if done: 415 | state = env.reset() 416 | states.append(state) 417 | rewards.append(reward) 418 | dones.append(done) 419 | infos.append(info) 420 | maybeepinfo = info.get('episode') 421 | 422 | if maybeepinfo: 423 | self.epinfobuf.append(maybeepinfo) 424 | 425 | 426 | 427 | # print(infos) 428 | return states, rewards, dones, infos 429 | 430 | def reset(self): 431 | return [self.envs[i].reset() for i in range(self.get_num_of_envs())] 432 | 433 | def render(self, mode='human'): 434 | return self.envs[0].render(mode=mode) 435 | 436 | def get_num_of_envs(self): 437 | return len(self.envs) 438 | 439 | def get_episode_rewmean(self): 440 | #print([epinfo['r'] for epinfo in self.epinfobuf]) 441 | #input() 442 | return round(self.safemean([epinfo['r'] for epinfo in self.epinfobuf]),2) 443 | 444 | def get_list_of_episode(self): 445 | return [epinfo['r'] for epinfo in self.epinfobuf] 446 | 447 | def get_episode_lenmean(self): 448 | return round(self.safemean([epinfo['l'] for epinfo in self.epinfobuf]),2) 449 | 450 | def safemean(self,xs): 451 | return np.nan if len(xs) == 0 else np.mean(xs) 452 | 453 | 454 | def make_atari(env_id, max_episode_steps=None): 455 | # NoopResetEnv,MaxAndSkipEnv 456 | env = gym.make(env_id) 457 | assert 'NoFrameskip' in env.spec.id 458 | env = NoopResetEnv(env, noop_max=30) 459 | env = MaxAndSkipEnv(env, skip=4) 460 | if max_episode_steps is not None: 461 | env = TimeLimit(env, max_episode_steps=max_episode_steps) 462 | return env 463 | 464 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 465 | """Configure environment for DeepMind-style Atari. 466 | """ 467 | if episode_life: 468 | env = EpisodicLifeEnv_withInfos(env) 469 | if 'FIRE' in env.unwrapped.get_action_meanings(): 470 | env = FireResetEnv(env) 471 | # https://github.com/openai/baselines/issues/240#issuecomment-375245009 472 | env = WarpFrame(env) 473 | if scale: 474 | env = ScaledFloatFrame(env) 475 | if clip_rewards: 476 | env = ClipRewardEnv(env) 477 | if frame_stack: 478 | env = FrameStack(env, 4) 479 | return env 480 | 481 | def Baselines_DummyVecEnv(env_id,num_env): 482 | envs = [] 483 | for i in range(num_env): 484 | env = make_atari(env_id) 485 | # todo 486 | env.seed(i*1000) # random seed 487 | env = wrap_deepmind(env) # used in baselines 488 | env = VecFrameStack(env,nstack=4) 489 | env = PreprocessWrapper(env) 490 | envs.append(env) 491 | batch_env = BatchEnvWrapper(envs) 492 | return batch_env --------------------------------------------------------------------------------
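
A note on the advantage computation: BatchBuffer.compute_GAE in PPO/buffers.py is the least obvious piece of the pipeline. The standalone sketch below is added purely for illustration — it is not part of the repository, the variable names are made up, and it assumes only numpy, a single environment, and a rollout with no episode boundary. It checks the same backward GAE recursion against the direct discounted sum of TD residuals.

import numpy as np

gamma, lam = 0.99, 0.95
rewards   = np.array([1.0, 0.0, 2.0])        # r_1 .. r_T (toy numbers)
values    = np.array([0.5, 0.4, 0.3, 0.2])   # v_1 .. v_{T+1}: one extra bootstrap value, as in the buffers.py docstring
terminals = np.array([0.0, 0.0, 0.0])        # no episode ends in this toy rollout

# TD residuals: delta_t = r_t + gamma * (1 - done_t) * v_{t+1} - v_t
deltas = rewards + gamma * (1.0 - terminals) * values[1:] - values[:-1]

# backward recursion used in compute_GAE: A_t = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
# (starting from A = 0 is equivalent to the repo's initialization A = deltas[-1],
# since the last step has no future term)
adv = np.zeros_like(rewards)
A = 0.0
for t in reversed(range(len(rewards))):
    A = deltas[t] + gamma * lam * (1.0 - terminals[t]) * A
    adv[t] = A

# direct definition (valid when no terminal occurs): A_t = sum_k (gamma * lam)^k * delta_{t+k}
direct = np.array([sum((gamma * lam) ** k * deltas[t + k] for k in range(len(deltas) - t))
                   for t in range(len(deltas))])

assert np.allclose(adv, direct)
print(adv)

Note that get_data() normalizes these advantages to zero mean and unit variance before training, while the value targets come from compute_reward_to_go_returns (bootstrapped discounted rewards), not from adv + values.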