├── Ptan
│   ├── blank
│   ├── regions.dat
│   ├── funky_kong_img3.jpg
│   ├── Region.py
│   ├── 01_dqn_basic.py
│   ├── 02_dqn_n_steps.py
│   ├── 06_dqn_dueling.py
│   ├── dqn_model.py
│   ├── 04_dqn_noisy_net.py
│   ├── 03_dqn_double.py
│   ├── 05_dqn_prio_replay.py
│   ├── 08_dqn_rainbow.py
│   ├── RainbowMarioKart.py
│   ├── common.py
│   └── 07_dqn_distrib.py
├── EffRainbow
│   ├── readme
│   ├── kill_dolphins.py
│   ├── plot.py
│   ├── ptan_actions.py
│   ├── simple_buffer_bench.py
│   ├── OutputViewer.py
│   ├── dqn_model.py
│   ├── prio_buffer_bench.py
│   ├── Wrappers.py
│   ├── 08_dqn_rainbow.py
│   ├── ptan_agent.py
│   ├── common.py
│   ├── networks.py
│   └── ptan_utils.py
├── Results1.npy
├── regions.dat
├── results.xlsx
├── blank_regions.jpg
├── regions_luigi.dat
├── funky_kong_img2.png
├── funky_kong_img3.jpg
├── holy_agent_graph.png
├── map_region_luigi.jpg
├── current_model98852.331
├── current_model151406.779
├── FelkFork
│   ├── readme.txt
│   ├── clearMem.py
│   ├── dolphinBootTest.py
│   ├── dolphinScriptTest.py
│   ├── LeakTest.py
│   ├── newDolphinTest.py
│   ├── Wrappers.py
│   ├── DolphinEnv.py
│   └── DolphinSideScriptTanks.py
├── readme.md
├── result_reader.py
├── Region.py
├── ER.py
├── MarioKartMain.py
├── LunarLanderMain.py
├── dist_test.py
├── ButtonLib.py
├── PER_old.py
├── PER.py
└── MarioKartEnvBackup.py

--------------------------------------------------------------------------------
/Ptan/blank:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/EffRainbow/readme:
--------------------------------------------------------------------------------
1 | Here's the new stuff
2 |
--------------------------------------------------------------------------------
/Results1.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/Results1.npy
--------------------------------------------------------------------------------
/regions.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/regions.dat
--------------------------------------------------------------------------------
/results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/results.xlsx
--------------------------------------------------------------------------------
/Ptan/regions.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/Ptan/regions.dat
--------------------------------------------------------------------------------
/blank_regions.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/blank_regions.jpg
--------------------------------------------------------------------------------
/regions_luigi.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/regions_luigi.dat
--------------------------------------------------------------------------------
/funky_kong_img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/funky_kong_img2.png
--------------------------------------------------------------------------------
/funky_kong_img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/funky_kong_img3.jpg
--------------------------------------------------------------------------------
/holy_agent_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/holy_agent_graph.png
--------------------------------------------------------------------------------
/map_region_luigi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/map_region_luigi.jpg
--------------------------------------------------------------------------------
/current_model98852.331:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/current_model98852.331
--------------------------------------------------------------------------------
/Ptan/funky_kong_img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/Ptan/funky_kong_img3.jpg
--------------------------------------------------------------------------------
/current_model151406.779:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/current_model151406.779
--------------------------------------------------------------------------------
/FelkFork/readme.txt:
--------------------------------------------------------------------------------
1 | This folder is the newest work at time of writing! It no longer uses screen capture; instead it uses felk's fork to access screen frames.
2 |
--------------------------------------------------------------------------------
/EffRainbow/kill_dolphins.py:
--------------------------------------------------------------------------------
1 | import psutil
2 |
3 | PROCNAME = "Dolphin.exe"
4 |
5 | for proc in psutil.process_iter():
6 |     # check whether the process name matches
7 |     if proc.name() == PROCNAME:
8 |         proc.kill()
--------------------------------------------------------------------------------
/FelkFork/clearMem.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.shared_memory import SharedMemory
2 |
3 | name = 'p1'  # replace this with the name of your lingering shared memory
4 |
5 | shm = SharedMemory(name, create=False)
6 |
7 | shm.unlink()  # this closes all attachments to the memory and destroys it
8 | print("Cleared memory successfully")
--------------------------------------------------------------------------------
/FelkFork/dolphinBootTest.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | cmd = 'cd ~/Documents/dolphin/build/Binaries && ./dolphin-emu --no-python-subinterpreters\
4 |     --script /home/tyler/Documents/WiiRL/FelkFork/dolphinScriptTest.py\
5 |     --exec="/home/tyler/Documents/GameCollection/Wii Play (Europe) (En,Fr,De,Es,It).nkit.gcz"'
6 |
7 | subprocess.call(cmd, shell=True)
8 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | This is a very old version of the code seen on my YouTube channel, and does not represent what is currently being done in any way. You can attempt to use this code if you like, but do so at your own risk since there is minimal documentation, and this is definitely not the best way to do things.
2 | 3 | I may release the source code for the actual version at some point, but I do not yet know when that'll be. 4 | -------------------------------------------------------------------------------- /FelkFork/dolphinScriptTest.py: -------------------------------------------------------------------------------- 1 | from dolphin import event, gui 2 | 3 | red = 0xffff0000 4 | frame_counter = 0 5 | while True: 6 | await event.frameadvance() 7 | frame_counter += 1 8 | # draw on screen 9 | gui.draw_text((10, 10), red, f"Frame: {frame_counter}") 10 | # print to console 11 | if frame_counter % 60 == 0: 12 | print(f"The frame count has reached {frame_counter}") 13 | -------------------------------------------------------------------------------- /result_reader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | filename = "results.npy" 5 | 6 | arr = np.load(filename) 7 | #print(arr) 8 | print(arr[-1]) 9 | 10 | scores = [] 11 | timesteps = [] 12 | 13 | for i in range(len(arr) - 100): 14 | scores.append(np.average(arr[i:i+100,0])) 15 | timesteps.append(arr[i + 100][3] / 3600) 16 | 17 | plt.plot(timesteps,scores) 18 | plt.ylabel('Average over last 100 games') 19 | plt.xlabel('Wall Time (Hours)') 20 | plt.show() 21 | -------------------------------------------------------------------------------- /Region.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Region(): 4 | def __init__(self,x,y,bot_x,bot_y,dir_x,dir_y, 5 | in_bounds,is_chkp,chkp_num = -1): 6 | 7 | self.x = x 8 | self.y = y 9 | 10 | self.bot_x = bot_x 11 | self.bot_y = bot_y 12 | 13 | #dir will be -1,0 or 1. This gives reward 14 | self.dir_x = dir_x 15 | self.dir_y = dir_y 16 | 17 | self.in_bounds = in_bounds 18 | 19 | self.is_chkp = is_chkp 20 | self.chkp_num = chkp_num 21 | 22 | def in_region(self,x,y): 23 | #pass in the midpoint of funky kong 24 | 25 | if x >= self.x and y >= self.y: 26 | if x <= self.bot_x and y <= self.bot_y: 27 | return True 28 | 29 | return False 30 | 31 | -------------------------------------------------------------------------------- /Ptan/Region.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Region(): 4 | def __init__(self,x,y,bot_x,bot_y, 5 | is_chkp,chkp_num = -1,focal = False): 6 | 7 | self.x = x 8 | self.y = y 9 | self.focal = focal 10 | 11 | self.bot_x = bot_x 12 | self.bot_y = bot_y 13 | 14 | #dir will be -1,0 or 1. 
This gives reward 15 | #self.dir_x = dir_x 16 | #self.dir_y = dir_y 17 | 18 | #self.in_bounds = in_bounds 19 | 20 | self.is_chkp = is_chkp 21 | self.chkp_num = chkp_num 22 | 23 | def in_region(self,x,y): 24 | #pass in the midpoint of funky kong 25 | 26 | if x >= self.x and y >= self.y: 27 | if x <= self.bot_x and y <= self.bot_y: 28 | return True 29 | 30 | return False 31 | 32 | -------------------------------------------------------------------------------- /EffRainbow/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | runs = 1 4 | 5 | data_files = ["ResultsGhost"] 6 | 7 | expers = [] 8 | 9 | for exper in data_files: 10 | temp = [] 11 | for i in range(runs): 12 | temp.append(np.load(exper + '.npy')) 13 | expers.append(temp[:]) 14 | 15 | # Example 2D list 16 | data = list(expers[0][0]) 17 | print(len(data)) 18 | 19 | # Number of data entries to average 20 | average_size = 1000 21 | 22 | # Extracting scores and steps from the data 23 | scores = [row[0] for row in data] 24 | steps = [row[2] for row in data] 25 | 26 | 27 | myInt = 3600 28 | steps = [x / myInt for x in steps] 29 | 30 | 31 | # Averaging scores over a given number of data entries 32 | averaged_scores = [] 33 | for i in range(len(scores) - average_size + 1): 34 | average = sum(scores[i:i+average_size]) / average_size 35 | averaged_scores.append(average) 36 | 37 | # Creating the plot 38 | plt.plot(steps[:len(averaged_scores)], averaged_scores) 39 | plt.xlabel('Hours') 40 | plt.ylabel('Average Reward') 41 | plt.title('Mario Kart - GhostValley2') 42 | plt.grid(True) 43 | 44 | # Displaying the plot 45 | plt.show() 46 | print("done?") -------------------------------------------------------------------------------- /FelkFork/LeakTest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("C:\\Users\\TYLER\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages") 3 | with open('leak.txt', 'w') as f: 4 | f.write("got path123") 5 | from PIL import Image 6 | 7 | from dolphin import event,gui 8 | 9 | from copy import deepcopy 10 | 11 | import numpy as np 12 | import time 13 | with open('leak.txt', 'a') as f: 14 | f.write("\nlibraries") 15 | 16 | white = 0xffffffff 17 | 18 | def show_screenshot(width: int, height: int, data: bytes): 19 | global allow 20 | allow = True 21 | #gui.draw_text((10, 10), white, "Hi") 22 | #print(f"received {width}x{height} image of length {len(data)}") 23 | #image = Image.frombytes('RGBA', (width,height), data, 'raw') 24 | #image.show() 25 | 26 | steps = 1 27 | start = time.time() 28 | allow = False 29 | while True: 30 | await event.frameadvance() 31 | steps += 1 32 | fps = round(steps / (time.time() - start)) 33 | gui.draw_text((10, 10), white, "FPS: " + str(fps)) 34 | 35 | 36 | """event.on_framedrawn(show_screenshot) 37 | with open('leak.txt', 'a') as f: 38 | f.write("\nonframeadvance") 39 | while True: 40 | 41 | while not allow: 42 | await event.frameadvance() 43 | 44 | with open('leak.txt', 'a') as f: 45 | f.write("\nallowed") 46 | 47 | (width,height,data) = await event.framedrawn() 48 | with open('leak.txt', 'a') as f: 49 | f.write("\ndrawn") 50 | allow = False 51 | #Adding the two lines below fixes the problem?!? 
52 | 53 | #with open('leak.txt', 'a') as f: 54 | #pass 55 | 56 | #img = Image.frombytes('RGBA', (width,height), data, 'raw')""" 57 | 58 | 59 | -------------------------------------------------------------------------------- /ER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayMemory: 4 | def __init__(self, input_dims, max_mem, batch_size): 5 | self.mem_size = max_mem 6 | self.batch_size = batch_size 7 | self.mem_cntr = 0 8 | self.state_memory = np.zeros((self.mem_size, *input_dims), 9 | dtype=np.float32) 10 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 11 | dtype=np.float32) 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, terminal): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.action_memory[index] = action 20 | self.reward_memory[index] = reward 21 | self.new_state_memory[index] = state_ 22 | self.terminal_memory[index] = terminal 23 | 24 | self.mem_cntr += 1 25 | 26 | def sample_memory(self): 27 | offset = 0 28 | max_mem = min(self.mem_cntr, self.mem_size) 29 | batch = np.random.choice(max_mem, self.batch_size, 30 | replace=False) 31 | states = self.state_memory[batch] 32 | new_states = self.new_state_memory[batch] 33 | actions = self.action_memory[batch] 34 | rewards = self.reward_memory[batch] 35 | terminals = self.terminal_memory[batch] 36 | 37 | 38 | 39 | return states, actions, rewards, new_states, terminals 40 | 41 | def is_sufficient(self): 42 | return self.mem_cntr > self.batch_size 43 | -------------------------------------------------------------------------------- /MarioKartMain.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.wrappers import FrameStack 3 | import numpy as np 4 | from DDDQN import Agent 5 | #from utils import plotLearning 6 | import time 7 | from MarioKartEnv import MarioKartEnv 8 | 9 | if __name__ == '__main__': 10 | env = MarioKartEnv() 11 | env = FrameStack(env,4) 12 | save_interval = 400 13 | load_checkpoint = False 14 | 15 | agent = Agent(gamma=0.99, epsilon=1, batch_size=32, n_actions=7, 16 | eps_end=0.1, input_dims=[4,52,96], lr=1e-4, 17 | max_mem_size=50000,memory = "PER",image = True, 18 | learning_starts=32,replace=16000,preprocess = True, 19 | n_step = 4,noisy = True,action_repeat=1) 20 | 21 | 22 | #learning starts to 50k 23 | if load_checkpoint: 24 | agent.load_models() 25 | 26 | scores = [] 27 | steps = 0 28 | start = time.time() 29 | i = -1 30 | arr = [] 31 | 32 | act_time = 0 33 | step_time = 0 34 | learn_time = 0 35 | 36 | while True: 37 | done = False 38 | observation = env.reset() 39 | observation = np.stack( observation, axis=0) 40 | 41 | score = 0 42 | i += 1 43 | 44 | while not done: 45 | steps += 1 46 | action = agent.choose_action(observation) 47 | 48 | observation_, reward, done, info = env.step(action) 49 | 50 | observation_ = np.stack( observation_, axis=0) 51 | score += reward 52 | 53 | agent.store_transition(observation, action, 54 | reward, observation_, int(done)) 55 | 56 | agent.learn() 57 | 58 | observation = observation_ 59 | 60 | 61 | arr.append([score,i,steps,round(time.time() - start,4),agent.epsilon]) 62 | if i % save_interval == save_interval - 1: 63 | np.save("Results.npy",np.array(arr)) 64 | 
agent.save_models() 65 | 66 | #eps_history.append(agent.epsilon) 67 | 68 | 69 | -------------------------------------------------------------------------------- /Ptan/01_dqn_basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from tensorboardX import SummaryWriter 10 | 11 | from lib import dqn_model, common 12 | 13 | 14 | if __name__ == "__main__": 15 | params = common.HYPERPARAMS['pong'] 16 | # params['epsilon_frames'] = 200000 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 19 | args = parser.parse_args() 20 | device = torch.device("cuda" if args.cuda else "cpu") 21 | 22 | env = gym.make(params['env_name']) 23 | env = ptan.common.wrappers.wrap_dqn(env) 24 | 25 | writer = SummaryWriter(comment="-" + params['run_name'] + "-basic") 26 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 27 | 28 | tgt_net = ptan.agent.TargetNet(net) 29 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 30 | epsilon_tracker = common.EpsilonTracker(selector, params) 31 | agent = ptan.agent.DQNAgent(net, selector, device=device) 32 | 33 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 34 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 35 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 36 | 37 | frame_idx = 0 38 | 39 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 40 | while True: 41 | frame_idx += 1 42 | buffer.populate(1) 43 | epsilon_tracker.frame(frame_idx) 44 | 45 | new_rewards = exp_source.pop_total_rewards() 46 | if new_rewards: 47 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 48 | break 49 | 50 | if len(buffer) < params['replay_initial']: 51 | continue 52 | 53 | optimizer.zero_grad() 54 | batch = buffer.sample(params['batch_size']) 55 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device) 56 | loss_v.backward() 57 | optimizer.step() 58 | 59 | if frame_idx % params['target_net_sync'] == 0: 60 | tgt_net.sync() 61 | -------------------------------------------------------------------------------- /LunarLanderMain.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from DDDQN import Agent 4 | import argparse 5 | #from utils import plotLearning 6 | import time 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(description='') 10 | parser.add_argument('-srun', type=int, default=0) 11 | 12 | args = parser.parse_args() 13 | srun = args.srun 14 | 15 | env = gym.make('LunarLander-v2') 16 | num_frames = 180000 17 | load_checkpoint = False 18 | 19 | agent = Agent(gamma=0.99, epsilon=0.1, batch_size=64, n_actions=4, 20 | eps_end=0.1, input_dims=[8], lr=0.001, 21 | max_mem_size=1000000,memory = "PER",image = False, 22 | learning_starts=64,replace=100,n_step = 4,noisy=False) 23 | 24 | if load_checkpoint: 25 | agent.load_models() 26 | 27 | filename = 'LunarLander-Dueling-DDQN-512-Adam-lr0005-replace100.png' 28 | scores = [] 29 | eps_history = [] 30 | n_steps = 0 31 | start = time.time() 32 | i = -1 33 | while n_steps < num_frames: 34 | i += 1 35 | done = False 36 | 
observation = env.reset() 37 | score = 0 38 | 39 | while not done: 40 | n_steps += 1 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | #env.render() 44 | score += reward 45 | agent.store_transition(observation, action, 46 | reward, observation_, int(done)) 47 | agent.learn() 48 | 49 | observation = observation_ 50 | 51 | scores.append(score) 52 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 53 | if i % 10 == 0: 54 | print('episode: ', i,'score %.1f ' % score, 55 | ' average score %.1f' % avg_score, 56 | 'epsilon %.2f' % agent.epsilon) 57 | 58 | eps_history.append(agent.epsilon) 59 | 60 | #x = [i+1 for i in range(num_games)] 61 | #plotLearning(x, scores, eps_history, filename) 62 | #print("Total Wall Time: " + str(time.time() - start)) 63 | #print(avg_score) 64 | save_stuff = [time.time() - start,avg_score] 65 | save_stuff = np.array(save_stuff,dtype = float) 66 | np.save("results_er" + str(srun) + ".npy", save_stuff) 67 | -------------------------------------------------------------------------------- /Ptan/02_dqn_n_steps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from tensorboardX import SummaryWriter 10 | 11 | from lib import dqn_model, common 12 | 13 | REWARD_STEPS_DEFAULT = 2 14 | 15 | 16 | if __name__ == "__main__": 17 | params = common.HYPERPARAMS['pong'] 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 20 | parser.add_argument("-n", default=REWARD_STEPS_DEFAULT, type=int, help="Count of steps to unroll Bellman") 21 | args = parser.parse_args() 22 | device = torch.device("cuda" if args.cuda else "cpu") 23 | 24 | env = gym.make(params['env_name']) 25 | env = ptan.common.wrappers.wrap_dqn(env) 26 | 27 | writer = SummaryWriter(comment="-" + params['run_name'] + "-%d-step" % args.n) 28 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 29 | 30 | tgt_net = ptan.agent.TargetNet(net) 31 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 32 | epsilon_tracker = common.EpsilonTracker(selector, params) 33 | agent = ptan.agent.DQNAgent(net, selector, device=device) 34 | 35 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=args.n) 36 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 37 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 38 | 39 | frame_idx = 0 40 | 41 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 42 | while True: 43 | frame_idx += 1 44 | buffer.populate(1) 45 | epsilon_tracker.frame(frame_idx) 46 | 47 | new_rewards = exp_source.pop_total_rewards() 48 | if new_rewards: 49 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 50 | break 51 | 52 | if len(buffer) < params['replay_initial']: 53 | continue 54 | 55 | optimizer.zero_grad() 56 | batch = buffer.sample(params['batch_size']) 57 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, 58 | gamma=params['gamma']**args.n, device=device) 59 | loss_v.backward() 60 | optimizer.step() 61 | 62 | if frame_idx % params['target_net_sync'] == 0: 63 | tgt_net.sync() 64 | -------------------------------------------------------------------------------- 
/FelkFork/newDolphinTest.py: -------------------------------------------------------------------------------- 1 | from dolphin import event, gui 2 | import sys 3 | sys.path.append("C:\\Users\\TYLER\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages") 4 | #import numpy as np 5 | from PIL import Image 6 | #import cv2 7 | import random 8 | import time 9 | 10 | def show_screenshot(width: int, height: int, data: bytes): 11 | #print(f"received {width}x{height} image of length {len(data)}") 12 | # data is RGBA, so its size is width*height*4 13 | gui.draw_text((10, 50), red, f"Hi") 14 | 15 | red = 0xffff0000 16 | frame_counter = 0 17 | start = time.time() 18 | count = 0 19 | while True: 20 | (width,height,data) = await event.framedrawn() 21 | gui.draw_text((10, 50), red, f"Hi") 22 | 23 | if random.random() > 0.995: 24 | image = Image.frombytes('RGBA', (width,height), data, 'raw') 25 | image.show() 26 | """advance = False 27 | while True: 28 | await event.on_framedrawn(show_screenshot) 29 | count += 1 30 | gui.draw_text((10, 50), red, f"Count: {count}")""" 31 | 32 | #image = Image.frombytes('RGBA', (width,height), data, 'raw') 33 | #gui.draw_text((10, 50), red, f"Hi") 34 | 35 | 36 | 37 | 38 | """while True: 39 | #while not advance: 40 | #await event.frameadvance() 41 | 42 | advance = False 43 | 44 | await event.on_framedrawn(show_screenshot) 45 | 46 | 47 | #with open('loggTest.txt', 'a') as f: 48 | #f.write("Run framedrawn statement") 49 | 50 | frame_counter += 1 51 | fps = frame_counter / (time.time() - start) 52 | # draw on screen 53 | counts = count / (time.time() - start) 54 | gui.draw_text((10, 10), red, f"FPS: {fps}") 55 | gui.draw_text((10, 50), red, f"Count: {counts}") 56 | #gui.draw_text((10, 50), red, f"frames: {img.dtype}") 57 | # print to console 58 | if frame_counter % 60 == 0: 59 | print(f"The frame count has reached {frame_counter}")""" 60 | 61 | 62 | """global count 63 | global advance 64 | global img 65 | 66 | gui.draw_text((10, 50), red, f"Boo") 67 | 68 | if count % 4 == 3: 69 | image = Image.frombytes('RGBA', (width,height), data, 'raw') 70 | image = image.resize((94,78)) 71 | image = image.convert("RGB") 72 | 73 | img1 = np.asarray(image) 74 | img1 = img1[...,::-1] 75 | img1 = np.dot(img1[...,:3], [0.2989, 0.5870, 0.1140]) 76 | 77 | #if random.random() > 0.99: 78 | #cv2.imwrite("filename.png", img) 79 | 80 | img = img1.astype(np.float32) 81 | 82 | advance = True 83 | 84 | count += 1""" 85 | -------------------------------------------------------------------------------- /EffRainbow/ptan_actions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ActionSelector: 5 | """ 6 | Abstract class which converts scores to the actions 7 | """ 8 | def __call__(self, scores): 9 | raise NotImplementedError 10 | 11 | 12 | class ArgmaxActionSelector(ActionSelector): 13 | """ 14 | Selects actions using argmax 15 | """ 16 | def __call__(self, scores): 17 | assert isinstance(scores, np.ndarray) 18 | return np.argmax(scores, axis=1) 19 | 20 | 21 | class EpsilonGreedyActionSelector(ActionSelector): 22 | def __init__(self, epsilon=1.0,eps_dec = 1e-6,eps_min = 0.05, selector=None): 23 | self.epsilon = epsilon 24 | self.eps_dec = eps_dec 25 | self.eps_min = eps_min 26 | self.selector = selector if selector is not None else ArgmaxActionSelector() 27 | 28 | def __call__(self, scores): 29 | assert isinstance(scores, np.ndarray) 30 | batch_size, n_actions = scores.shape 31 | actions = self.selector(scores) 32 | mask = 
np.random.random(size=batch_size) < self.epsilon 33 | rand_actions = np.random.choice(n_actions, sum(mask)) 34 | actions[mask] = rand_actions 35 | self.epsilon -= self.eps_dec 36 | if self.epsilon < self.eps_min: 37 | self.epsilon = self.eps_min 38 | return actions 39 | 40 | class StickyEpsilonGreedyActionSelector(ActionSelector): 41 | def __init__(self, epsilon=1.0,eps_dec = 1e-6,eps_min = 0.05, selector=None): 42 | self.epsilon = epsilon 43 | self.eps_dec = eps_dec 44 | self.eps_min = eps_min 45 | self.selector = selector if selector is not None else ArgmaxActionSelector() 46 | self.repeat_probs = 0.3 47 | self.prev_actions = None 48 | 49 | def __call__(self, scores): 50 | assert isinstance(scores, np.ndarray) 51 | batch_size, n_actions = scores.shape 52 | 53 | if np.random.random() < self.repeat_probs: 54 | if self.prev_actions is not None and batch_size == len(self.prev_actions): 55 | return self.prev_actions 56 | 57 | batch_size, n_actions = scores.shape 58 | actions = self.selector(scores) 59 | mask = np.random.random(size=batch_size) < self.epsilon 60 | rand_actions = np.random.choice(n_actions, sum(mask)) 61 | actions[mask] = rand_actions 62 | self.epsilon -= self.eps_dec 63 | if self.epsilon < self.eps_min: 64 | self.epsilon = self.eps_min 65 | 66 | self.prev_actions = actions[:] 67 | return actions 68 | 69 | 70 | class ProbabilityActionSelector(ActionSelector): 71 | """ 72 | Converts probabilities of actions into action by sampling them 73 | """ 74 | def __call__(self, probs): 75 | assert isinstance(probs, np.ndarray) 76 | actions = [] 77 | for prob in probs: 78 | actions.append(np.random.choice(len(prob), p=prob)) 79 | return np.array(actions) 80 | -------------------------------------------------------------------------------- /EffRainbow/simple_buffer_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Benchmark various Replay Buffer variants 4 | """ 5 | import timeit 6 | import numpy as np 7 | import collections 8 | 9 | 10 | SIZES = [10**n for n in (3, 4, 5)] 11 | DATA_SHAPE = (84, 84, 4) 12 | REPEAT_NUMBER = 10 13 | 14 | 15 | class ExperienceBufferDeque: 16 | def __init__(self, capacity): 17 | self.buffer = collections.deque(maxlen=capacity) 18 | 19 | def __len__(self): 20 | return len(self.buffer) 21 | 22 | def append(self, experience): 23 | self.buffer.append(experience) 24 | 25 | def sample(self, batch_size): 26 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 27 | return [self.buffer[idx] for idx in indices] 28 | 29 | 30 | class ExperienceBufferCircularList: 31 | def __init__(self, capacity): 32 | self.buffer = list() 33 | self.capacity = capacity 34 | self.pos = 0 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | def append(self, experience): 40 | if len(self.buffer) < self.capacity: 41 | self.buffer.append(experience) 42 | else: 43 | self.buffer[self.pos] = experience 44 | self.pos = (self.pos + 1) % self.capacity 45 | 46 | def sample(self, batch_size): 47 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 48 | return [self.buffer[idx] for idx in indices] 49 | 50 | 51 | 52 | def fill_buf(buf, size): 53 | for _ in range(size): 54 | buf.append(np.zeros(DATA_SHAPE, dtype=np.uint8)) 55 | 56 | 57 | def bench_buffer(buf_class): 58 | print("Benchmarking %s" % buf_class.__name__) 59 | 60 | for size in SIZES: 61 | print(" Test size %d" % size) 62 | ns = globals() 63 | ns.update(locals()) 64 | t = timeit.timeit('fill_buf(buf, size)', 
setup='buf = buf_class(size)', number=REPEAT_NUMBER, globals=ns) 65 | print(" * Initial fill:\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 66 | buf = buf_class(size) 67 | fill_buf(buf, size) 68 | ns.update(locals()) 69 | t = timeit.timeit('fill_buf(buf, size)', number=REPEAT_NUMBER, globals=ns) 70 | print(" * Append:\t\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 71 | t = timeit.timeit('buf.sample(4)', number=REPEAT_NUMBER*100, globals=ns) 72 | print(" * Sample 4:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 73 | t = timeit.timeit('buf.sample(8)', number=REPEAT_NUMBER*100, globals=ns) 74 | print(" * Sample 8:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 75 | t = timeit.timeit('buf.sample(16)', number=REPEAT_NUMBER*100, globals=ns) 76 | print(" * Sample 16:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 77 | t = timeit.timeit('buf.sample(32)', number=REPEAT_NUMBER*100, globals=ns) 78 | print(" * Sample 32:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 79 | 80 | 81 | 82 | if __name__ == "__main__": 83 | bench_buffer(ExperienceBufferCircularList) 84 | bench_buffer(ExperienceBufferDeque) 85 | pass 86 | -------------------------------------------------------------------------------- /EffRainbow/OutputViewer.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import numpy as np 3 | from copy import copy 4 | import sys 5 | import time 6 | 7 | class OutputViewer(): 8 | def __init__(self,tags): 9 | 10 | self.tags = tags 11 | 12 | self.width = 600 13 | self.height = 400 14 | 15 | self.text_height = 30 16 | 17 | self.bar_heights = self.height - (self.text_height * 2) 18 | 19 | self.rangeMin = -1 20 | self.rangeMax = 9 21 | self.remapping = [2, 0, 1, 3, 4] 22 | 23 | self.mult = self.height / self.rangeMax 24 | 25 | pygame.init() 26 | 27 | all_fonts = pygame.font.get_fonts() 28 | self.font = pygame.font.SysFont(all_fonts[7], 18) 29 | 30 | self.clock = pygame.time.Clock() 31 | self.screen = pygame.display.set_mode((self.width, self.height)) 32 | 33 | self.color = (0, 0, 255) 34 | 35 | def update(self,ovals): 36 | time.sleep(0.01) 37 | 38 | vals = [] 39 | for i in range(len(ovals)): 40 | vals.append(-1) 41 | 42 | for i in range(len(ovals)): 43 | vals[self.remapping[i]] = ovals[i] 44 | 45 | self.screen.fill((0,0,0)) 46 | 47 | bar_width = (self.width - 80) / len(vals) 48 | spacing = bar_width / 10 49 | 50 | ma = np.argmax(vals) 51 | 52 | for i in range(len(vals)): 53 | if i == ma: 54 | color = (255,215,0) 55 | elif vals[i] < 0: 56 | color = (255,0,0) 57 | else: 58 | color = (0,0,255) 59 | 60 | vals[i] -= self.rangeMin 61 | 62 | pygame.draw.rect(self.screen, color, pygame.Rect(10 + spacing * i + bar_width * i,\ 63 | self.bar_heights - int(vals[i] * self.mult), bar_width - spacing * 2, int(vals[i] * self.mult))) 64 | 65 | text = self.font.render(self.tags[i],1,(255,255,255))#creates the text 66 | self.screen.blit(text,(10 + spacing * i + bar_width * i,self.bar_heights + 10)) 67 | 68 | text = self.font.render(str(round(vals[i],2)),1,(255,255,255))#creates the text 69 | self.screen.blit(text,(10 + spacing * i + bar_width * i,self.bar_heights + 30)) 70 | 71 | 72 | self.mouse_up = False 73 | #allow shutdown window 74 | self.mouse_pos = pygame.mouse.get_pos() 75 | for event in pygame.event.get(): 76 | if event.type == pygame.QUIT: 77 | pygame.quit() 78 | sys.exit() 79 | if event.type == pygame.MOUSEBUTTONUP and event.button == 1: 80 | self.mouse_up = True 81 | 82 | self.clock.tick(60) 83 | 84 | pygame.display.flip() 85 | 86 | 87 | if __name__ == "__main__": 88 | out = 
OutputViewer(["hLeft","sLeft","wLeft","Forward","wRight","sRight","hRight"]) 89 | outputs = [1,2,3,4,5,6,12] 90 | while True: 91 | 92 | out.update(copy(outputs)) 93 | for i in range(len(outputs)): 94 | outputs[i] += np.random.random() - 0.5 95 | if outputs[i] < -1: 96 | outputs[i] = -1 97 | elif outputs[i] > 12: 98 | outputs[i] = 12 99 | 100 | -------------------------------------------------------------------------------- /EffRainbow/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | class NoisyLinear(nn.Linear): 9 | def __init__(self, in_features, out_features, sigma_init=0.1, bias=True): 10 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 11 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 12 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 13 | if bias: 14 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 15 | self.register_buffer("epsilon_bias", torch.zeros(out_features)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | std = math.sqrt(3 / self.in_features) 20 | self.weight.data.uniform_(-std, std) 21 | self.bias.data.uniform_(-std, std) 22 | 23 | def forward(self, input): 24 | self.epsilon_weight.normal_() 25 | bias = self.bias 26 | if bias is not None: 27 | self.epsilon_bias.normal_() 28 | bias = bias + self.sigma_bias * self.epsilon_bias.data 29 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 30 | 31 | 32 | class NoisyFactorizedLinear(nn.Linear): 33 | """ 34 | NoisyNet layer with factorized gaussian noise 35 | 36 | N.B. 
nn.Linear already initializes weight and bias to 37 | """ 38 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 39 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 40 | sigma_init = sigma_zero / math.sqrt(in_features) 41 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 42 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 43 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 44 | if bias: 45 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 46 | 47 | def forward(self, input): 48 | self.epsilon_input.normal_() 49 | self.epsilon_output.normal_() 50 | 51 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 52 | eps_in = func(self.epsilon_input.data) 53 | eps_out = func(self.epsilon_output.data) 54 | 55 | bias = self.bias 56 | if bias is not None: 57 | bias = bias + self.sigma_bias * eps_out.t() 58 | noise_v = torch.mul(eps_in, eps_out) 59 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 60 | 61 | 62 | class DQN(nn.Module): 63 | def __init__(self, input_shape, n_actions): 64 | super(DQN, self).__init__() 65 | 66 | self.conv = nn.Sequential( 67 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 68 | nn.ReLU(), 69 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 70 | nn.ReLU(), 71 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 72 | nn.ReLU() 73 | ) 74 | 75 | conv_out_size = self._get_conv_out(input_shape) 76 | self.fc = nn.Sequential( 77 | nn.Linear(conv_out_size, 512), 78 | nn.ReLU(), 79 | nn.Linear(512, n_actions) 80 | ) 81 | 82 | def _get_conv_out(self, shape): 83 | o = self.conv(torch.zeros(1, *shape)) 84 | return int(np.prod(o.size())) 85 | 86 | def forward(self, x): 87 | fx = x.float() / 256 88 | conv_out = self.conv(fx).view(fx.size()[0], -1) 89 | return self.fc(conv_out) 90 | -------------------------------------------------------------------------------- /Ptan/06_dqn_dueling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | 11 | from tensorboardX import SummaryWriter 12 | 13 | from lib import common 14 | 15 | 16 | class DuelingDQN(nn.Module): 17 | def __init__(self, input_shape, n_actions): 18 | super(DuelingDQN, self).__init__() 19 | 20 | self.conv = nn.Sequential( 21 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 22 | nn.ReLU(), 23 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 24 | nn.ReLU(), 25 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 26 | nn.ReLU() 27 | ) 28 | 29 | conv_out_size = self._get_conv_out(input_shape) 30 | self.fc_adv = nn.Sequential( 31 | nn.Linear(conv_out_size, 256), 32 | nn.ReLU(), 33 | nn.Linear(256, n_actions) 34 | ) 35 | self.fc_val = nn.Sequential( 36 | nn.Linear(conv_out_size, 256), 37 | nn.ReLU(), 38 | nn.Linear(256, 1) 39 | ) 40 | 41 | def _get_conv_out(self, shape): 42 | o = self.conv(torch.zeros(1, *shape)) 43 | return int(np.prod(o.size())) 44 | 45 | def forward(self, x): 46 | fx = x.float() / 256 47 | conv_out = self.conv(fx).view(fx.size()[0], -1) 48 | val = self.fc_val(conv_out) 49 | adv = self.fc_adv(conv_out) 50 | return val + (adv - adv.mean(dim=1, keepdim=True)) 51 | 52 | 53 | if __name__ == "__main__": 54 | params = common.HYPERPARAMS['pong'] 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("--cuda", 
default=False, action="store_true", help="Enable cuda") 57 | args = parser.parse_args() 58 | device = torch.device("cuda" if args.cuda else "cpu") 59 | 60 | env = gym.make(params['env_name']) 61 | env = ptan.common.wrappers.wrap_dqn(env) 62 | 63 | writer = SummaryWriter(comment="-" + params['run_name'] + "-dueling") 64 | net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device) 65 | tgt_net = ptan.agent.TargetNet(net) 66 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 67 | epsilon_tracker = common.EpsilonTracker(selector, params) 68 | agent = ptan.agent.DQNAgent(net, selector, device=device) 69 | 70 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 71 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 72 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 73 | 74 | frame_idx = 0 75 | 76 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 77 | while True: 78 | frame_idx += 1 79 | buffer.populate(1) 80 | epsilon_tracker.frame(frame_idx) 81 | 82 | new_rewards = exp_source.pop_total_rewards() 83 | if new_rewards: 84 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 85 | break 86 | 87 | if len(buffer) < params['replay_initial']: 88 | continue 89 | 90 | optimizer.zero_grad() 91 | batch = buffer.sample(params['batch_size']) 92 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device) 93 | loss_v.backward() 94 | optimizer.step() 95 | 96 | if frame_idx % params['target_net_sync'] == 0: 97 | tgt_net.sync() 98 | -------------------------------------------------------------------------------- /Ptan/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | 9 | class NoisyLinear(nn.Linear): 10 | def __init__(self, in_features, out_features, sigma_init=0.15, bias=True): #0.017 11 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 12 | #print(sigma_init) 13 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 14 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 15 | if bias: 16 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 17 | self.register_buffer("epsilon_bias", torch.zeros(out_features)) 18 | self.reset_parameters() 19 | 20 | def reset_parameters(self): 21 | std = math.sqrt(3 / self.in_features) 22 | self.weight.data.uniform_(-std, std) 23 | self.bias.data.uniform_(-std, std) 24 | 25 | def forward(self, input): 26 | self.epsilon_weight.normal_() 27 | bias = self.bias 28 | if bias is not None: 29 | self.epsilon_bias.normal_() 30 | bias = bias + self.sigma_bias * self.epsilon_bias.data 31 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 32 | 33 | 34 | class NoisyFactorizedLinear(nn.Linear): 35 | """ 36 | NoisyNet layer with factorized gaussian noise 37 | 38 | N.B. 
nn.Linear already initializes weight and bias to 39 | """ 40 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 41 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 42 | sigma_init = sigma_zero / math.sqrt(in_features) 43 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 44 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 45 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 46 | if bias: 47 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 48 | 49 | def forward(self, input): 50 | self.epsilon_input.normal_() 51 | self.epsilon_output.normal_() 52 | 53 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 54 | eps_in = func(self.epsilon_input.data) 55 | eps_out = func(self.epsilon_output.data) 56 | 57 | bias = self.bias 58 | if bias is not None: 59 | bias = bias + self.sigma_bias * eps_out.t() 60 | noise_v = torch.mul(eps_in, eps_out) 61 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 62 | 63 | 64 | class DQN(nn.Module): 65 | def __init__(self, input_shape, n_actions): 66 | super(DQN, self).__init__() 67 | 68 | self.conv = nn.Sequential( 69 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 70 | nn.ReLU(), 71 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 72 | nn.ReLU(), 73 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 74 | nn.ReLU() 75 | ) 76 | 77 | conv_out_size = self._get_conv_out(input_shape) 78 | self.fc = nn.Sequential( 79 | nn.Linear(conv_out_size, 512), 80 | nn.ReLU(), 81 | nn.Linear(512, n_actions) 82 | ) 83 | 84 | def _get_conv_out(self, shape): 85 | o = self.conv(torch.zeros(1, *shape)) 86 | return int(np.prod(o.size())) 87 | 88 | def forward(self, x): 89 | fx = x.float() / 256 90 | conv_out = self.conv(fx).view(fx.size()[0], -1) 91 | return self.fc(conv_out) 92 | -------------------------------------------------------------------------------- /Ptan/04_dqn_noisy_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.autograd import Variable 11 | 12 | from tensorboardX import SummaryWriter 13 | 14 | from lib import dqn_model, common 15 | 16 | 17 | class NoisyDQN(nn.Module): 18 | def __init__(self, input_shape, n_actions): 19 | super(NoisyDQN, self).__init__() 20 | 21 | self.conv = nn.Sequential( 22 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 23 | nn.ReLU(), 24 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 25 | nn.ReLU(), 26 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 27 | nn.ReLU() 28 | ) 29 | 30 | conv_out_size = self._get_conv_out(input_shape) 31 | self.noisy_layers = [ 32 | dqn_model.NoisyLinear(conv_out_size, 512), 33 | dqn_model.NoisyLinear(512, n_actions) 34 | ] 35 | self.fc = nn.Sequential( 36 | self.noisy_layers[0], 37 | nn.ReLU(), 38 | self.noisy_layers[1] 39 | ) 40 | 41 | def _get_conv_out(self, shape): 42 | o = self.conv(torch.zeros(1, *shape)) 43 | return int(np.prod(o.size())) 44 | 45 | def forward(self, x): 46 | fx = x.float() / 256 47 | conv_out = self.conv(fx).view(fx.size()[0], -1) 48 | return self.fc(conv_out) 49 | 50 | def noisy_layers_sigma_snr(self): 51 | return [ 52 | ((layer.weight ** 2).mean().sqrt() / (layer.sigma_weight ** 2).mean().sqrt()).item() 53 | for layer in self.noisy_layers 54 | ] 55 | 56 | 57 | if __name__ 
== "__main__": 58 | params = common.HYPERPARAMS['pong'] 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 61 | args = parser.parse_args() 62 | device = torch.device("cuda" if args.cuda else "cpu") 63 | 64 | env = gym.make(params['env_name']) 65 | env = ptan.common.wrappers.wrap_dqn(env) 66 | 67 | writer = SummaryWriter(comment="-" + params['run_name'] + "-noisy-net") 68 | net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device) 69 | tgt_net = ptan.agent.TargetNet(net) 70 | agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(), device=device) 71 | 72 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 73 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 74 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 75 | 76 | frame_idx = 0 77 | 78 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 79 | while True: 80 | frame_idx += 1 81 | buffer.populate(1) 82 | 83 | new_rewards = exp_source.pop_total_rewards() 84 | if new_rewards: 85 | if reward_tracker.reward(new_rewards[0], frame_idx): 86 | break 87 | 88 | if len(buffer) < params['replay_initial']: 89 | continue 90 | 91 | optimizer.zero_grad() 92 | batch = buffer.sample(params['batch_size']) 93 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device) 94 | loss_v.backward() 95 | optimizer.step() 96 | 97 | if frame_idx % params['target_net_sync'] == 0: 98 | tgt_net.sync() 99 | 100 | if frame_idx % 500 == 0: 101 | for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()): 102 | writer.add_scalar("sigma_snr_layer_%d" % (layer_idx+1), 103 | sigma_l2, frame_idx) 104 | -------------------------------------------------------------------------------- /Ptan/03_dqn_double.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | 11 | from tensorboardX import SummaryWriter 12 | 13 | from lib import dqn_model, common 14 | 15 | STATES_TO_EVALUATE = 1000 16 | EVAL_EVERY_FRAME = 100 17 | 18 | 19 | def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True): 20 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 21 | 22 | states_v = torch.tensor(states).to(device) 23 | next_states_v = torch.tensor(next_states).to(device) 24 | actions_v = torch.tensor(actions).to(device) 25 | rewards_v = torch.tensor(rewards).to(device) 26 | done_mask = torch.ByteTensor(dones).to(device) 27 | 28 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 29 | if double: 30 | next_state_actions = net(next_states_v).max(1)[1] 31 | next_state_values = tgt_net(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1) 32 | else: 33 | next_state_values = tgt_net(next_states_v).max(1)[0] 34 | next_state_values[done_mask] = 0.0 35 | 36 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 37 | return nn.MSELoss()(state_action_values, expected_state_action_values) 38 | 39 | 40 | def calc_values_of_states(states, net, device="cpu"): 41 | mean_vals = [] 42 | for batch in np.array_split(states, 64): 43 | states_v = torch.tensor(batch).to(device) 44 | action_values_v = 
net(states_v) 45 | best_action_values_v = action_values_v.max(1)[0] 46 | mean_vals.append(best_action_values_v.mean().item()) 47 | return np.mean(mean_vals) 48 | 49 | 50 | if __name__ == "__main__": 51 | params = common.HYPERPARAMS['pong'] 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 54 | parser.add_argument("--double", default=False, action="store_true", help="Enable double DQN") 55 | args = parser.parse_args() 56 | device = torch.device("cuda" if args.cuda else "cpu") 57 | 58 | env = gym.make(params['env_name']) 59 | env = ptan.common.wrappers.wrap_dqn(env) 60 | 61 | writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double)) 62 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 63 | 64 | tgt_net = ptan.agent.TargetNet(net) 65 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 66 | epsilon_tracker = common.EpsilonTracker(selector, params) 67 | agent = ptan.agent.DQNAgent(net, selector, device=device) 68 | 69 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 70 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 71 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 72 | 73 | frame_idx = 0 74 | eval_states = None 75 | 76 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 77 | while True: 78 | frame_idx += 1 79 | buffer.populate(1) 80 | epsilon_tracker.frame(frame_idx) 81 | 82 | new_rewards = exp_source.pop_total_rewards() 83 | if new_rewards: 84 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 85 | break 86 | 87 | if len(buffer) < params['replay_initial']: 88 | continue 89 | if eval_states is None: 90 | eval_states = buffer.sample(STATES_TO_EVALUATE) 91 | eval_states = [np.array(transition.state, copy=False) for transition in eval_states] 92 | eval_states = np.array(eval_states, copy=False) 93 | 94 | optimizer.zero_grad() 95 | batch = buffer.sample(params['batch_size']) 96 | loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device, 97 | double=args.double) 98 | loss_v.backward() 99 | optimizer.step() 100 | 101 | if frame_idx % params['target_net_sync'] == 0: 102 | tgt_net.sync() 103 | if frame_idx % EVAL_EVERY_FRAME == 0: 104 | mean_val = calc_values_of_states(eval_states, net, device=device) 105 | writer.add_scalar("values_mean", mean_val, frame_idx) 106 | 107 | -------------------------------------------------------------------------------- /EffRainbow/prio_buffer_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Benchmark various Priority Replay Buffer variants 4 | """ 5 | import timeit 6 | import numpy as np 7 | import collections 8 | 9 | SIZES = [10**n for n in (3, 4, 5)] 10 | DATA_SHAPE = (84, 84, 4) 11 | REPEAT_NUMBER = 10 12 | 13 | 14 | class PrioReplayBufferDeque: 15 | def __init__(self, buf_size, prob_alpha=0.6): 16 | self.prob_alpha = prob_alpha 17 | self.buffer = collections.deque(maxlen=buf_size) 18 | self.priorities = collections.deque(maxlen=buf_size) 19 | 20 | def __len__(self): 21 | return len(self.buffer) 22 | 23 | def append(self, sample): 24 | max_prio = max(self.priorities) if self.priorities else 1.0 25 | self.buffer.append(sample) 26 | self.priorities.append(max_prio) 27 | 28 | def sample(self, batch_size, 
beta=0.4): 29 | probs = np.array(self.priorities, dtype=np.float32) ** self.prob_alpha 30 | probs /= probs.sum() 31 | indices = np.random.choice(len(self.buffer), batch_size, p=probs, replace=True) 32 | samples = [self.buffer[idx] for idx in indices] 33 | total = len(self.buffer) 34 | weights = (total * probs[indices]) ** (-beta) 35 | weights /= weights.max() 36 | return samples, indices, weights 37 | 38 | def update_priorities(self, batch_indices, batch_priorities): 39 | for idx, prio in zip(batch_indices, batch_priorities): 40 | self.priorities[idx] = prio 41 | 42 | 43 | class PrioReplayBufferList: 44 | def __init__(self, buf_size, prob_alpha=0.6): 45 | self.prob_alpha = prob_alpha 46 | self.capacity = buf_size 47 | self.pos = 0 48 | self.buffer = [] 49 | self.priorities = np.zeros((buf_size, ), dtype=np.float32) 50 | 51 | def __len__(self): 52 | return len(self.buffer) 53 | 54 | def append(self, sample): 55 | max_prio = self.priorities.max() if self.buffer else 1.0 56 | if len(self.buffer) < self.capacity: 57 | self.buffer.append(sample) 58 | else: 59 | self.buffer[self.pos] = sample 60 | self.priorities[self.pos] = max_prio 61 | self.pos = (self.pos + 1) % self.capacity 62 | 63 | def sample(self, batch_size, beta=0.4): 64 | if len(self.buffer) == self.capacity: 65 | prios = self.priorities 66 | else: 67 | prios = self.priorities[:self.pos] 68 | probs = np.array(prios, dtype=np.float32) ** self.prob_alpha 69 | 70 | probs /= probs.sum() 71 | indices = np.random.choice(len(self.buffer), batch_size, p=probs, replace=True) 72 | samples = [self.buffer[idx] for idx in indices] 73 | total = len(self.buffer) 74 | weights = (total * probs[indices]) ** (-beta) 75 | weights /= weights.max() 76 | return samples, indices, weights 77 | 78 | def update_priorities(self, batch_indices, batch_priorities): 79 | for idx, prio in zip(batch_indices, batch_priorities): 80 | self.priorities[idx] = prio 81 | 82 | 83 | def fill_buf(buf, size): 84 | for _ in range(size): 85 | buf.append(np.zeros(DATA_SHAPE, dtype=np.uint8)) 86 | 87 | 88 | def bench_buffer(buf_class): 89 | print("Benchmarking %s" % buf_class.__name__) 90 | 91 | for size in SIZES: 92 | print(" Test size %d" % size) 93 | ns = globals() 94 | ns.update(locals()) 95 | t = timeit.timeit('fill_buf(buf, size)', setup='buf = buf_class(size)', number=REPEAT_NUMBER, globals=ns) 96 | print(" * Initial fill:\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 97 | buf = buf_class(size) 98 | fill_buf(buf, size) 99 | ns.update(locals()) 100 | t = timeit.timeit('fill_buf(buf, size)', number=REPEAT_NUMBER, globals=ns) 101 | print(" * Append:\t\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 102 | t = timeit.timeit('buf.sample(4)', number=REPEAT_NUMBER*100, globals=ns) 103 | print(" * Sample 4:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 104 | t = timeit.timeit('buf.sample(8)', number=REPEAT_NUMBER*100, globals=ns) 105 | print(" * Sample 8:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 106 | t = timeit.timeit('buf.sample(16)', number=REPEAT_NUMBER*100, globals=ns) 107 | print(" * Sample 16:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 108 | t = timeit.timeit('buf.sample(32)', number=REPEAT_NUMBER*100, globals=ns) 109 | print(" * Sample 32:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 110 | 111 | 112 | 113 | if __name__ == "__main__": 114 | bench_buffer(PrioReplayBufferList) 115 | bench_buffer(PrioReplayBufferDeque) 116 | pass 117 | -------------------------------------------------------------------------------- /EffRainbow/Wrappers.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | from collections import deque 5 | import cv2 6 | import random 7 | 8 | class ImageToPyTorch(gym.ObservationWrapper): 9 | """ 10 | Change image shape to CWH 11 | """ 12 | def __init__(self, env): 13 | super(ImageToPyTorch, self).__init__(env) 14 | old_shape = self.observation_space.shape 15 | print("Converting to torch...") 16 | print(old_shape) 17 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), 18 | dtype=np.float32) 19 | print(self.observation_space) 20 | 21 | def observation(self, observation): 22 | print("Pytorch processing") 23 | print(observation.shape) 24 | return np.swapaxes(observation, 2, 0) 25 | 26 | class ScaledFloatFrame(gym.ObservationWrapper): 27 | def observation(self, obs): 28 | # careful! This undoes the memory optimization, use 29 | # with smaller replay buffers only. 30 | return np.array(obs).astype(np.float32) / 255.0 31 | 32 | class LazyFrames(object): 33 | def __init__(self, frames): 34 | """This object ensures that common frames between the observations are only stored once. 35 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 36 | buffers. 37 | This object should only be converted to numpy array before being passed to the model. 38 | You'd not belive how complex the previous solution was.""" 39 | self._frames = frames 40 | 41 | def __array__(self, dtype=None): 42 | out = np.concatenate(self._frames, axis=0) 43 | if dtype is not None: 44 | out = out.astype(dtype) 45 | return out 46 | 47 | class ProcessFrame84(gym.ObservationWrapper): 48 | def __init__(self, env=None): 49 | super(ProcessFrame84, self).__init__(env) 50 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 51 | 52 | def observation(self, obs): 53 | return ProcessFrame84.process(obs) 54 | 55 | @staticmethod 56 | def process(frame): 57 | if frame.size == 210 * 160 * 3: 58 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 59 | elif frame.size == 250 * 160 * 3: 60 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 61 | else: 62 | assert False, "Unknown resolution." 63 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 64 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 65 | x_t = resized_screen[18:102, :] 66 | x_t = np.reshape(x_t, [84, 84]) 67 | return x_t.astype(np.uint8) 68 | 69 | class ProcessFrameUint(gym.ObservationWrapper): 70 | def __init__(self, env=None): 71 | super(ProcessFrameUint, self).__init__(env) 72 | self.observation_space = spaces.Box(low=0, high=255, shape=(60, 112, 1), dtype=np.uint8) 73 | 74 | def observation(self, obs): 75 | return ProcessFrameUint.process(obs) 76 | 77 | @staticmethod 78 | def process(im): 79 | print("Uint") 80 | print(im.shape) 81 | 82 | im = np.reshape(im, [60, 112, 1])#.astype(np.float32) 83 | #x_t = cv2.resize(im, (100, 54), interpolation=cv2.INTER_AREA) 84 | #x_t = np.reshape(x_t, [54, 100,1]) 85 | print(im.shape) 86 | 87 | return im.astype(np.uint8) 88 | 89 | class FrameStack(gym.Wrapper): 90 | def __init__(self, env, k): 91 | """Stack k last frames. 92 | Returns lazy array, which is much more memory efficient. 
93 | See Also 94 | -------- 95 | baselines.common.atari_wrappers.LazyFrames 96 | """ 97 | gym.Wrapper.__init__(self, env) 98 | self.k = k 99 | self.frames = deque([], maxlen=k) 100 | shp = env.observation_space.shape 101 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0]*k, shp[1], shp[2]), dtype=np.uint8) 102 | 103 | def reset(self): 104 | ob = self.env.reset() 105 | for _ in range(self.k): 106 | self.frames.append(ob) 107 | return self._get_ob() 108 | 109 | def step(self, action): 110 | ob, reward, done, info = self.env.step(action) 111 | self.frames.append(ob) 112 | return self._get_ob(), reward, done, info 113 | 114 | def _get_ob(self): 115 | assert len(self.frames) == self.k 116 | return LazyFrames(list(self.frames)) 117 | 118 | def wrap_env(env, stack_frames=4): 119 | 120 | env = ProcessFrameUint(env) 121 | env = ImageToPyTorch(env) 122 | env = FrameStack(env, stack_frames) 123 | 124 | return env 125 | 126 | def wrap_env_vec(env, stack_frames=4): 127 | 128 | env = ProcessFrameUint(env) 129 | env = ImageToPyTorch(env) 130 | env = FrameStack(env, stack_frames) 131 | 132 | return env 133 | -------------------------------------------------------------------------------- /FelkFork/Wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | from collections import deque 5 | import cv2 6 | 7 | class ImageToPyTorch(gym.ObservationWrapper): 8 | """ 9 | Change image shape to CWH 10 | """ 11 | def __init__(self, env): 12 | super(ImageToPyTorch, self).__init__(env) 13 | old_shape = self.observation_space.shape 14 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), 15 | dtype=np.float32) 16 | 17 | def observation(self, observation): 18 | return np.swapaxes(observation, 2, 0) 19 | 20 | class ScaledFloatFrame(gym.ObservationWrapper): 21 | def observation(self, obs): 22 | # careful! This undoes the memory optimization, use 23 | # with smaller replay buffers only. 24 | return np.array(obs).astype(np.float32) / 255.0 25 | 26 | class LazyFrames(object): 27 | def __init__(self, frames): 28 | """This object ensures that common frames between the observations are only stored once. 29 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 30 | buffers. 31 | This object should only be converted to numpy array before being passed to the model. 32 | You'd not belive how complex the previous solution was.""" 33 | self._frames = frames 34 | 35 | def __array__(self, dtype=None): 36 | out = np.concatenate(self._frames, axis=0) 37 | if dtype is not None: 38 | out = out.astype(dtype) 39 | return out 40 | 41 | class ProcessFrame84(gym.ObservationWrapper): 42 | def __init__(self, env=None): 43 | super(ProcessFrame84, self).__init__(env) 44 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 45 | 46 | def observation(self, obs): 47 | return ProcessFrame84.process(obs) 48 | 49 | @staticmethod 50 | def process(frame): 51 | if frame.size == 210 * 160 * 3: 52 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 53 | elif frame.size == 250 * 160 * 3: 54 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 55 | else: 56 | assert False, "Unknown resolution." 
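# The next few lines implement the standard Atari-style frame preprocessing:
# convert RGB to grayscale using the ITU-R BT.601 luma weights (0.299, 0.587, 0.114),
# shrink the frame to 110x84 with area interpolation, keep the 84-row crop (rows 18-101)
# so the result is a square 84x84 image, and cast back to uint8 so stored frames stay compact.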
57 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 58 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 59 | x_t = resized_screen[18:102, :] 60 | x_t = np.reshape(x_t, [84, 84]) 61 | return x_t.astype(np.uint8) 62 | 63 | class ProcessFrame10054_2(gym.ObservationWrapper): 64 | def __init__(self, env=None): 65 | super(ProcessFrame10054_2, self).__init__(env) 66 | self.observation_space = spaces.Box(low=0, high=255, shape=(54, 100, 1), dtype=np.uint8) 67 | 68 | def observation(self, obs): 69 | return ProcessFrame10054_2.process(obs) 70 | 71 | @staticmethod 72 | def process(im): 73 | 74 | im = np.reshape(im, [108, 200, 1]).astype(np.float32) 75 | x_t = cv2.resize(im, (100, 54), interpolation=cv2.INTER_AREA) 76 | x_t = np.reshape(x_t, [54, 100,1]) 77 | 78 | return x_t.astype(np.uint8) 79 | 80 | class ProcessFrameUint(gym.ObservationWrapper): 81 | def __init__(self, env=None): 82 | super(ProcessFrameUint, self).__init__(env) 83 | self.observation_space = spaces.Box(low=0, high=255, shape=(78, 94, 1), dtype=np.uint8) 84 | 85 | def observation(self, obs): 86 | return ProcessFrameUint.process(obs) 87 | 88 | @staticmethod 89 | def process(im): 90 | 91 | im = np.reshape(im, [78, 94, 1])#.astype(np.float32) 92 | #x_t = cv2.resize(im, (100, 54), interpolation=cv2.INTER_AREA) 93 | #x_t = np.reshape(x_t, [54, 100,1]) 94 | 95 | return im.astype(np.uint8) 96 | 97 | class FrameStack(gym.Wrapper): 98 | def __init__(self, env, k): 99 | """Stack k last frames. 100 | Returns lazy array, which is much more memory efficient. 101 | See Also 102 | -------- 103 | baselines.common.atari_wrappers.LazyFrames 104 | """ 105 | gym.Wrapper.__init__(self, env) 106 | self.k = k 107 | self.frames = deque([], maxlen=k) 108 | shp = env.observation_space.shape 109 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0]*k, shp[1], shp[2]), dtype=np.uint8) 110 | 111 | def reset(self): 112 | ob = self.env.reset() 113 | for _ in range(self.k): 114 | self.frames.append(ob) 115 | return self._get_ob() 116 | 117 | def step(self, action): 118 | ob, reward, done, info = self.env.step(action) 119 | self.frames.append(ob) 120 | return self._get_ob(), reward, done, info 121 | 122 | def _get_ob(self): 123 | assert len(self.frames) == self.k 124 | return LazyFrames(list(self.frames)) 125 | 126 | 127 | 128 | 129 | def wrap_env(env, stack_frames=4): 130 | 131 | env = ProcessFrameUint(env) 132 | env = ImageToPyTorch(env) 133 | env = FrameStack(env, stack_frames) 134 | 135 | return env 136 | -------------------------------------------------------------------------------- /dist_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | sys.path.append("./") 4 | 5 | #from lib import common 6 | 7 | import matplotlib as mpl 8 | mpl.use("Agg") 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | Vmax = 10 13 | Vmin = -10 14 | N_ATOMS = 51 15 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 16 | 17 | 18 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 19 | """ 20 | Perform distribution projection aka Catergorical Algorithm from the 21 | "A Distributional Perspective on RL" paper 22 | """ 23 | batch_size = len(rewards) 24 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 25 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 26 | for atom in range(n_atoms): 27 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 28 | b_j = (tz_j - Vmin) / delta_z 29 | l = 
np.floor(b_j).astype(np.int64) 30 | u = np.ceil(b_j).astype(np.int64) 31 | eq_mask = u == l 32 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 33 | ne_mask = u != l 34 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 35 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 36 | if dones.any(): 37 | proj_distr[dones] = 0.0 38 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 39 | b_j = (tz_j - Vmin) / delta_z 40 | l = np.floor(b_j).astype(np.int64) 41 | u = np.ceil(b_j).astype(np.int64) 42 | eq_mask = u == l 43 | eq_dones = dones.copy() 44 | eq_dones[dones] = eq_mask 45 | if eq_dones.any(): 46 | proj_distr[eq_dones, l[eq_mask]] = 1.0 47 | ne_mask = u != l 48 | ne_dones = dones.copy() 49 | ne_dones[dones] = ne_mask 50 | if ne_dones.any(): 51 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 52 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 53 | return proj_distr 54 | 55 | def save_distr(src, proj, name): 56 | plt.clf() 57 | p = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 58 | plt.subplot(2, 1, 1) 59 | plt.bar(p, src, width=0.5) 60 | plt.title("Source") 61 | plt.subplot(2, 1, 2) 62 | plt.bar(p, proj, width=0.5) 63 | plt.title("Projected") 64 | plt.savefig(name + ".png") 65 | 66 | 67 | if __name__ == "__main__": 68 | np.random.seed(123) 69 | atoms = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 70 | 71 | # single peak distribution 72 | src_hist = np.zeros(shape=(1, N_ATOMS), dtype=np.float32) 73 | src_hist[0, N_ATOMS//2+1] = 1.0 74 | proj_hist = distr_projection(src_hist, np.array([2], dtype=np.float32), np.array([False]), 75 | Vmin, Vmax, N_ATOMS, gamma=0.9) 76 | 77 | save_distr(src_hist[0], proj_hist[0], "peak-r=2") 78 | 79 | 80 | # normal distribution 81 | data = np.random.normal(size=1000, scale=3) 82 | hist = np.histogram(data, normed=True, bins=np.arange(Vmin - DELTA_Z/2, Vmax + DELTA_Z*3/2, DELTA_Z)) 83 | 84 | src_hist = hist[0] 85 | proj_hist = distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([False]), 86 | Vmin, Vmax, N_ATOMS, gamma=0.9) 87 | save_distr(hist[0], proj_hist[0], "normal-r=2") 88 | #raise Exception("stop in the name of plod") 89 | 90 | # normal distribution, but done episode 91 | proj_hist = distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([True]), 92 | Vmin, Vmax, N_ATOMS, gamma=0.9) 93 | save_distr(hist[0], proj_hist[0], "normal-done-r=2") 94 | 95 | # clipping for out-of-range distribution 96 | proj_dist = distr_projection(np.array([src_hist]), np.array([10], dtype=np.float32), np.array([False]), 97 | Vmin, Vmax, N_ATOMS, gamma=0.9) 98 | save_distr(hist[0], proj_dist[0], "normal-r=10") 99 | 100 | # test both done and not done, unclipped 101 | proj_hist = distr_projection(np.array([src_hist, src_hist]), np.array([2, 2], dtype=np.float32), 102 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 103 | save_distr(src_hist, proj_hist[0], "both_not_clip-01-incomplete") 104 | save_distr(src_hist, proj_hist[1], "both_not_clip-02-complete") 105 | 106 | # test both done and not done, clipped right 107 | proj_hist = distr_projection(np.array([src_hist, src_hist]), np.array([10, 10], dtype=np.float32), 108 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 109 | save_distr(src_hist, proj_hist[0], "both_clip-right-01-incomplete") 110 | save_distr(src_hist, proj_hist[1], "both_clip-right-02-complete") 111 | 112 | # test both done and not done, clipped left 113 | proj_hist = distr_projection(np.array([src_hist, 
src_hist]), np.array([-10, -10], dtype=np.float32), 114 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 115 | save_distr(src_hist, proj_hist[0], "both_clip-left-01-incomplete") 116 | save_distr(src_hist, proj_hist[1], "both_clip-left-02-complete") 117 | 118 | pass 119 | -------------------------------------------------------------------------------- /Ptan/05_dqn_prio_replay.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import numpy as np 5 | import argparse 6 | 7 | import torch 8 | import torch.optim as optim 9 | 10 | from tensorboardX import SummaryWriter 11 | 12 | from lib import dqn_model, common 13 | 14 | PRIO_REPLAY_ALPHA = 0.6 15 | BETA_START = 0.4 16 | BETA_FRAMES = 100000 17 | 18 | 19 | class PrioReplayBuffer: 20 | def __init__(self, exp_source, buf_size, prob_alpha=0.6): 21 | self.exp_source_iter = iter(exp_source) 22 | self.prob_alpha = prob_alpha 23 | self.capacity = buf_size 24 | self.pos = 0 25 | self.buffer = [] 26 | self.priorities = np.zeros((buf_size, ), dtype=np.float32) 27 | 28 | def __len__(self): 29 | return len(self.buffer) 30 | 31 | def populate(self, count): 32 | max_prio = self.priorities.max() if self.buffer else 1.0 33 | for _ in range(count): 34 | sample = next(self.exp_source_iter) 35 | if len(self.buffer) < self.capacity: 36 | self.buffer.append(sample) 37 | else: 38 | self.buffer[self.pos] = sample 39 | self.priorities[self.pos] = max_prio 40 | self.pos = (self.pos + 1) % self.capacity 41 | 42 | def sample(self, batch_size, beta=0.4): 43 | if len(self.buffer) == self.capacity: 44 | prios = self.priorities 45 | else: 46 | prios = self.priorities[:self.pos] 47 | probs = prios ** self.prob_alpha 48 | 49 | probs /= probs.sum() 50 | indices = np.random.choice(len(self.buffer), batch_size, p=probs) 51 | samples = [self.buffer[idx] for idx in indices] 52 | total = len(self.buffer) 53 | weights = (total * probs[indices]) ** (-beta) 54 | weights /= weights.max() 55 | return samples, indices, np.array(weights, dtype=np.float32) 56 | 57 | def update_priorities(self, batch_indices, batch_priorities): 58 | for idx, prio in zip(batch_indices, batch_priorities): 59 | self.priorities[idx] = prio 60 | 61 | 62 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 63 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 64 | 65 | states_v = torch.tensor(states).to(device) 66 | next_states_v = torch.tensor(next_states).to(device) 67 | actions_v = torch.tensor(actions).to(device) 68 | rewards_v = torch.tensor(rewards).to(device) 69 | done_mask = torch.ByteTensor(dones).to(device) 70 | batch_weights_v = torch.tensor(batch_weights).to(device) 71 | 72 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 73 | next_state_values = tgt_net(next_states_v).max(1)[0] 74 | next_state_values[done_mask] = 0.0 75 | 76 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 77 | losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2 78 | return losses_v.mean(), losses_v + 1e-5 79 | 80 | 81 | if __name__ == "__main__": 82 | params = common.HYPERPARAMS['pong'] 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 85 | args = parser.parse_args() 86 | device = torch.device("cuda" if args.cuda else "cpu") 87 | 88 | env = gym.make(params['env_name']) 89 | env = 
ptan.common.wrappers.wrap_dqn(env) 90 | 91 | writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay") 92 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 93 | tgt_net = ptan.agent.TargetNet(net) 94 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 95 | epsilon_tracker = common.EpsilonTracker(selector, params) 96 | agent = ptan.agent.DQNAgent(net, selector, device=device) 97 | 98 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 99 | buffer = PrioReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 100 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 101 | 102 | frame_idx = 0 103 | beta = BETA_START 104 | 105 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 106 | while True: 107 | frame_idx += 1 108 | buffer.populate(1) 109 | epsilon_tracker.frame(frame_idx) 110 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 111 | 112 | new_rewards = exp_source.pop_total_rewards() 113 | if new_rewards: 114 | writer.add_scalar("beta", beta, frame_idx) 115 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 116 | break 117 | 118 | if len(buffer) < params['replay_initial']: 119 | continue 120 | 121 | optimizer.zero_grad() 122 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 123 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 124 | params['gamma'], device=device) 125 | loss_v.backward() 126 | optimizer.step() 127 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 128 | 129 | if frame_idx % params['target_net_sync'] == 0: 130 | tgt_net.sync() 131 | -------------------------------------------------------------------------------- /ButtonLib.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | class Button(): 3 | def __init__(self,x,y,width,height,colour,colourH,border=False,borderC=(0,0,0), 4 | text="",font=0,size=30,textColour = (0,0,0),stripe = None,stripeH = None,stripeThickness = None):#Initialise all of the variables 5 | fonts = pygame.font.get_fonts() 6 | 7 | self.x = x #x coordinate of top left corner 8 | self.y = y #y coordinate of top left corner 9 | self.width = width #width of the button 10 | self.height = height #height of the button 11 | self.colour = colour #colour of button when not hovered over 12 | self.colourC = colour #current colour of button 13 | self.colourNH = colour 14 | self.colourH = colourH #colour of button when hovered over 15 | self.borderC=borderC #colour of the border of the button, default is black 16 | self.border=False #Boolean to check if the button has a border 17 | if border: 18 | self.border=True 19 | self.text=text #Text of the button 20 | self.font=pygame.font.SysFont(fonts[font],size) #font of the button 21 | self.textColour = textColour 22 | self.stripeC = stripe 23 | self.stripe = stripe 24 | self.stripeH = stripeH 25 | self.stripeThickness = stripeThickness 26 | 27 | 28 | def click(self,pos,mouseUp):#method to check if button is pressed 29 | if self.hovering(pos):#Checks if mouse if over the button 30 | if mouseUp:#checks if left mouse button was pressed 31 | return True 32 | else: 33 | return False 34 | else: #Returns false if not over button or not clicked 35 | return False 36 | 37 | def hovering(self,pos):#Method to check if mouse is over the button 38 | if 
(pos[0]>self.x and pos[0]<(self.width+self.x) and pos[1]>self.y and pos[1]<(self.height+self.y)):#checks if the mouse is over the button 39 | self.colourC=self.colourH#changes colour to hover colour 40 | if self.stripe != None: 41 | self.stripeC = self.stripeH 42 | return True 43 | else: 44 | self.colourC=self.colour#changes colour back to base colour 45 | self.stripeC = self.stripe 46 | return False 47 | 48 | def create(self,screen):#Method to blit the button onto the screen 49 | if self.border:#Checks if the button has a border 50 | pygame.draw.rect(screen,self.borderC,(self.x-3,self.y-3,self.width+6,self.height+6),0)#creates a rectangle larger than the button 51 | pygame.draw.rect(screen,self.colourC,(self.x,self.y,self.width,self.height),0)#draws the button 52 | 53 | if self.stripe != None: 54 | pygame.draw.rect(screen,self.stripeC, 55 | (self.x,self.y + (self.height / 2) - int(self.stripeThickness / 2),self.width,self.stripeThickness),0) 56 | 57 | if self.text != "":#checks if there is text for the button 58 | text= self.font.render(self.text,1,self.textColour)#creates the text 59 | screen.blit(text,(self.x + (self.width/2 - text.get_width()/2), self.y + (self.height/2 - text.get_height()/2)))#draws the text in the centre of the button 60 | 61 | def getX(self): 62 | return self.x 63 | 64 | def getY(self): 65 | return self.y 66 | 67 | def getCorners(self):#Returns the coordinates of the four corners of the button 68 | return self.x,self.y,self.x+self.width,self.y+self.height 69 | 70 | def setCorners(self,co1,co2,co3,co4=(0,0)):#takes four coordinates to set the values of the button 71 | if co2[0]>co1[0] and co2[1]co3[0] and co3[0]==co1[0] and co3[1]>co1[1]:#checks if the coordinates create a valid rectangle 72 | self.x=co1[0] 73 | self.y=co1[1]#sets the values to create the button with the coordinates given 74 | self.width=co2[0]-co1[0] 75 | self.height=co3[1]-co1[1] 76 | 77 | def getWidth(self):#returns the value of the width 78 | return self.width 79 | 80 | def getHeight(self):#returns the value of the height 81 | return self.height 82 | 83 | def setWidth(self,width):#Sets the width of the button to the input 84 | self.width=width 85 | 86 | def setHeight(self,height):#Sets the height of the button to the input 87 | self.height=height 88 | 89 | def getColour(self):#returns the value of the current colour of the button 90 | return self.colourC 91 | 92 | def getColourH(self):#returns the value of the current colour of the button 93 | return self.colourH 94 | 95 | def getColourNH(self):#returns the value of the current colour of the button 96 | return self.colourNH 97 | 98 | def setColour(self,colour):#Sets the base colour of the button to the input 99 | self.colour=colour 100 | 101 | def setColourH(self,colour):#Sets the hover colour of the button to the input 102 | self.colourH=colour 103 | 104 | def setBorderColour(self,colour):#Sets the border colour of the button to the input 105 | self.borderC=colour 106 | 107 | def getText(self):#returns the value of the text for the button 108 | return self.text 109 | 110 | def setText(self,text):#Sets the text of the button to the input 111 | self.text=text 112 | 113 | def setFont(self,font,size=-1):#Sets the font of the button to the input 114 | if size>-1: 115 | self.font=pygame.font.SysFont(fonts[font],size) 116 | else: 117 | self.font=pygame.font.SysFont(fonts[font],26) 118 | -------------------------------------------------------------------------------- /EffRainbow/08_dqn_rainbow.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan_actions 4 | import ptan_agent 5 | import ptan_experience 6 | import argparse 7 | import numpy as np 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import time 13 | import torch.optim as optim 14 | 15 | from tensorboardX import SummaryWriter 16 | 17 | import networks 18 | from lib import dqn_model, common 19 | from DolphinEnvVec import DolphinEnvVec 20 | import keyboard 21 | # n-step 22 | REWARD_STEPS = 3 23 | 24 | # priority replay 25 | PRIO_REPLAY_ALPHA = 0.5 26 | BETA_START = 0.4 27 | BETA_FRAMES = 20000000 28 | 29 | # C51 30 | Vmax = 6 31 | Vmin = -2 32 | N_ATOMS = 51 33 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 34 | 35 | 36 | class RainbowDQN(nn.Module): 37 | def __init__(self, input_shape, n_actions): 38 | super(RainbowDQN, self).__init__() 39 | 40 | self.conv = nn.Sequential( 41 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 42 | nn.ReLU(), 43 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 44 | nn.ReLU(), 45 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 46 | nn.ReLU() 47 | ) 48 | 49 | conv_out_size = self._get_conv_out(input_shape) 50 | self.fc_adv = nn.Sequential( 51 | dqn_model.NoisyLinear(conv_out_size, 512), 52 | nn.ReLU(), 53 | dqn_model.NoisyLinear(512, n_actions) 54 | ) 55 | self.fc_val = nn.Sequential( 56 | dqn_model.NoisyLinear(conv_out_size, 512), 57 | nn.ReLU(), 58 | dqn_model.NoisyLinear(512, 1) 59 | ) 60 | 61 | def _get_conv_out(self, shape): 62 | o = self.conv(torch.zeros(1, *shape)) 63 | return int(np.prod(o.size())) 64 | 65 | def forward(self, x): 66 | fx = x.float() / 256 67 | conv_out = self.conv(fx).view(fx.size()[0], -1) 68 | val = self.fc_val(conv_out) 69 | adv = self.fc_adv(conv_out) 70 | return val + (adv - adv.mean(dim=1, keepdim=True)) 71 | 72 | if __name__ == "__main__": 73 | params = common.HYPERPARAMS['MarioBros'] 74 | #params['epsilon_frames'] *= 2 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 77 | 78 | args = parser.parse_args() 79 | #print(args.cuda) 80 | device = torch.device("cuda")# if args.cuda else "cpu" 81 | 82 | with open('pid_num.txt', 'w') as f: 83 | f.write(str(0)) 84 | 85 | #env = DolphinEnv() 86 | #env = wrap_env(env) 87 | 88 | vec_envs = DolphinEnvVec(4) 89 | 90 | #print("Vector Obs Shape: " + str(vec_envs.observation_space)) 91 | 92 | #envs.observation_space = gym.spaces.Box( 93 | #low=0, high=1, shape=(3,78, 94), dtype=np.uint8) 94 | 95 | #env = DolphinEnv(pid = 0) #gym.make(params['env_name']) 96 | #env = wrap_env(env,3) 97 | 98 | print(vec_envs.action_space) 99 | print(vec_envs.observation_space.shape) 100 | #env = ptan.common.wrappers.wrap_dqn_custom(env) 101 | 102 | #Test this code: 103 | #check to see if observations are uints or floats 104 | 105 | #raise Exception("stop") 106 | 107 | #need to copy and reshape network 108 | 109 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 110 | #net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 111 | #net = networks.ImpalaCNNSmall(env.observation_space.shape[0], env.action_space.n).to(device) 112 | net = networks.ImpalaCNNLarge(vec_envs.observation_space.shape[0],vec_envs.action_space.n).to(device) 113 | #net.load_checkpoint() 114 | 115 | tgt_net = ptan_agent.TargetNet(net) 116 | selector = 
ptan_actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'],eps_dec=params['epsilon_dec'],eps_min=params['epsilon_final']) 117 | #ptan_actions.StickyEpsilonGreedyActionSelector() 118 | agent = ptan_agent.DQNAgent(net, selector, device=device) 119 | 120 | exp_source = ptan_experience.ExperienceSourceFirstLast(vec_envs, agent, gamma=params['gamma'], steps_count=REWARD_STEPS,vectorized=True)# 121 | buffer = ptan_experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 122 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'], eps=0.0025/params['batch_size']) 123 | 124 | frame_idx = 0 125 | beta = BETA_START 126 | 127 | save_interval = 320000 128 | start_timer = time.time() 129 | 130 | scores = [] 131 | run_name = "ResultsItems.npy" 132 | 133 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 134 | while True: 135 | 136 | frame_idx += 8 137 | buffer.populate(8) 138 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 139 | 140 | new_rewards = exp_source.pop_total_rewards() 141 | if new_rewards: 142 | if reward_tracker.reward(new_rewards[0], frame_idx): 143 | break 144 | 145 | if frame_idx % 1600 == 0: 146 | print("Total FPS: " + str(round(frame_idx / (time.time() - start_timer),2))) 147 | 148 | if len(buffer) < params['replay_initial']: 149 | continue 150 | 151 | optimizer.zero_grad() 152 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 153 | 154 | loss_v, sample_prios_v = common.calc_loss_dqn(batch, batch_weights, net, tgt_net.target_model, 155 | params['gamma'] ** REWARD_STEPS, device=device) 156 | 157 | """loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 158 | params['gamma'] ** REWARD_STEPS, device=device)""" 159 | 160 | loss_v.backward() 161 | optimizer.step() 162 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 163 | 164 | if frame_idx % save_interval == 0: 165 | net.save_checkpoint() 166 | 167 | if frame_idx % params['target_net_sync'] == 0: 168 | tgt_net.sync() 169 | np.save(run_name, reward_tracker.get_scores()) 170 | 171 | 172 | -------------------------------------------------------------------------------- /EffRainbow/ptan_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent is something which converts states into actions and has state 3 | """ 4 | import copy 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | import ptan_actions as actions 10 | #from OutputViewer import OutputViewer 11 | 12 | 13 | class BaseAgent: 14 | """ 15 | Abstract Agent interface 16 | """ 17 | def initial_state(self): 18 | """ 19 | Should create initial empty state for the agent. It will be called for the start of the episode 20 | :return: Anything agent want to remember 21 | """ 22 | return None 23 | 24 | def __call__(self, states, agent_states): 25 | """ 26 | Convert observations and states into actions to take 27 | :param states: list of environment states to process 28 | :param agent_states: list of states with the same length as observations 29 | :return: tuple of actions, states 30 | """ 31 | assert isinstance(states, list) 32 | assert isinstance(agent_states, list) 33 | assert len(agent_states) == len(states) 34 | 35 | raise NotImplementedError 36 | 37 | 38 | def default_states_preprocessor(states): 39 | """ 40 | Convert list of states into the form suitable for model. 
By default the states are stacked into a numpy array and wrapped into a torch tensor 41 | :param states: list of numpy arrays with states 42 | :return: torch tensor 43 | """ 44 | if len(states) == 1: 45 | np_states = np.expand_dims(states[0], 0) 46 | else: 47 | np_states = np.array([np.array(s, copy=False) for s in states], copy=False) 48 | return torch.tensor(np_states) 49 | 50 | 51 | def float32_preprocessor(states): 52 | np_states = np.array(states, dtype=np.float32) 53 | return torch.tensor(np_states) 54 | 55 | 56 | class DQNAgent(BaseAgent): 57 | """ 58 | DQNAgent is a memoryless DQN agent which calculates Q values 59 | from the observations and converts them into the actions using action_selector 60 | """ 61 | def __init__(self, dqn_model, action_selector, device="cpu", preprocessor=default_states_preprocessor): 62 | self.dqn_model = dqn_model 63 | self.action_selector = action_selector 64 | self.preprocessor = preprocessor 65 | self.device = device 66 | #self.viewer = OutputViewer(["hLeft", "sLeft", "Wheel", "sRight", "hRight"]) 67 | 68 | def __call__(self, states, agent_states=None): 69 | if agent_states is None: 70 | agent_states = [None] * len(states) 71 | if self.preprocessor is not None: 72 | states = self.preprocessor(states) 73 | if torch.is_tensor(states): 74 | states = states.to(self.device) 75 | q_v = self.dqn_model(states) 76 | q = q_v.data.cpu().numpy() 77 | #self.viewer.update(list(q[0])) 78 | actions = self.action_selector(q) 79 | return actions, agent_states 80 | 81 | 82 | class TargetNet: 83 | """ 84 | Wrapper around a model which provides a copy of it instead of the live trained weights 85 | """ 86 | def __init__(self, model): 87 | self.model = model 88 | self.target_model = copy.deepcopy(model) 89 | 90 | def sync(self): 91 | self.target_model.load_state_dict(self.model.state_dict()) 92 | 93 | def alpha_sync(self, alpha): 94 | """ 95 | Blend params of target net with params from the model 96 | :param alpha: blending coefficient in (0, 1] 97 | """ 98 | assert isinstance(alpha, float) 99 | assert 0.0 < alpha <= 1.0 100 | state = self.model.state_dict() 101 | tgt_state = self.target_model.state_dict() 102 | for k, v in state.items(): 103 | tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v 104 | self.target_model.load_state_dict(tgt_state) 105 | 106 | 107 | class PolicyAgent(BaseAgent): 108 | """ 109 | Policy agent gets action probabilities from the model and samples actions from it 110 | """ 111 | # TODO: unify code with DQNAgent, as only the action selector differs.
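# Unlike DQNAgent, the model output here is interpreted as action probabilities
# (optionally passed through a softmax below) and an action is sampled from them
# via the probability-based selector, rather than chosen greedily over Q-values.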
112 | def __init__(self, model, action_selector=actions.ProbabilityActionSelector(), device="cpu", 113 | apply_softmax=False, preprocessor=default_states_preprocessor): 114 | self.model = model 115 | self.action_selector = action_selector 116 | self.device = device 117 | self.apply_softmax = apply_softmax 118 | self.preprocessor = preprocessor 119 | 120 | def __call__(self, states, agent_states=None): 121 | """ 122 | Return actions from given list of states 123 | :param states: list of states 124 | :return: list of actions 125 | """ 126 | if agent_states is None: 127 | agent_states = [None] * len(states) 128 | if self.preprocessor is not None: 129 | states = self.preprocessor(states) 130 | if torch.is_tensor(states): 131 | states = states.to(self.device) 132 | probs_v = self.model(states) 133 | if self.apply_softmax: 134 | probs_v = F.softmax(probs_v, dim=1) 135 | probs = probs_v.data.cpu().numpy() 136 | actions = self.action_selector(probs) 137 | return np.array(actions), agent_states 138 | 139 | 140 | class ActorCriticAgent(BaseAgent): 141 | """ 142 | Policy agent which returns policy and value tensors from observations. Value are stored in agent's state 143 | and could be reused for rollouts calculations by ExperienceSource. 144 | """ 145 | def __init__(self, model, action_selector=actions.ProbabilityActionSelector(), device="cpu", 146 | apply_softmax=False, preprocessor=default_states_preprocessor): 147 | self.model = model 148 | self.action_selector = action_selector 149 | self.device = device 150 | self.apply_softmax = apply_softmax 151 | self.preprocessor = preprocessor 152 | 153 | def __call__(self, states, agent_states=None): 154 | """ 155 | Return actions from given list of states 156 | :param states: list of states 157 | :return: list of actions 158 | """ 159 | if self.preprocessor is not None: 160 | states = self.preprocessor(states) 161 | if torch.is_tensor(states): 162 | states = states.to(self.device) 163 | probs_v, values_v = self.model(states) 164 | if self.apply_softmax: 165 | probs_v = F.softmax(probs_v, dim=1) 166 | probs = probs_v.data.cpu().numpy() 167 | actions = self.action_selector(probs) 168 | agent_states = values_v.data.squeeze().cpu().numpy().tolist() 169 | return np.array(actions), agent_states 170 | -------------------------------------------------------------------------------- /Ptan/08_dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | from lib import dqn_model, common 16 | 17 | # n-step 18 | REWARD_STEPS = 3 19 | 20 | # priority replay 21 | PRIO_REPLAY_ALPHA = 0.6 22 | BETA_START = 0.4 23 | BETA_FRAMES = 1000000 24 | 25 | # C51 26 | Vmax = 10 27 | Vmin = -10 28 | N_ATOMS = 51 29 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 30 | 31 | 32 | class RainbowDQN(nn.Module): 33 | def __init__(self, input_shape, n_actions): 34 | super(RainbowDQN, self).__init__() 35 | 36 | self.conv = nn.Sequential( 37 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 38 | nn.ReLU(), 39 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 40 | nn.ReLU(), 41 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 42 | nn.ReLU() 43 | ) 44 | 45 | conv_out_size = self._get_conv_out(input_shape) 46 | self.fc_val = nn.Sequential( 47 | dqn_model.NoisyLinear(conv_out_size, 256), 48 
| nn.ReLU(), 49 | dqn_model.NoisyLinear(256, N_ATOMS) 50 | ) 51 | 52 | self.fc_adv = nn.Sequential( 53 | dqn_model.NoisyLinear(conv_out_size, 256), 54 | nn.ReLU(), 55 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 56 | ) 57 | 58 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 59 | self.softmax = nn.Softmax(dim=1) 60 | 61 | def _get_conv_out(self, shape): 62 | o = self.conv(torch.zeros(1, *shape)) 63 | return int(np.prod(o.size())) 64 | 65 | def forward(self, x): 66 | batch_size = x.size()[0] 67 | fx = x.float() / 256 68 | conv_out = self.conv(fx).view(batch_size, -1) 69 | val_out = self.fc_val(conv_out).view(batch_size, 1, N_ATOMS) 70 | adv_out = self.fc_adv(conv_out).view(batch_size, -1, N_ATOMS) 71 | adv_mean = adv_out.mean(dim=1, keepdim=True) 72 | return val_out + (adv_out - adv_mean) 73 | 74 | def both(self, x): 75 | cat_out = self(x) 76 | probs = self.apply_softmax(cat_out) 77 | weights = probs * self.supports 78 | res = weights.sum(dim=2) 79 | return cat_out, res 80 | 81 | def qvals(self, x): 82 | return self.both(x)[1] 83 | 84 | def apply_softmax(self, t): 85 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 86 | 87 | 88 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 89 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 90 | batch_size = len(batch) 91 | 92 | states_v = torch.tensor(states).to(device) 93 | actions_v = torch.tensor(actions).to(device) 94 | next_states_v = torch.tensor(next_states).to(device) 95 | batch_weights_v = torch.tensor(batch_weights).to(device) 96 | 97 | # next state distribution 98 | # dueling arch -- actions from main net, distr from tgt_net 99 | 100 | # calc at once both next and cur states 101 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 102 | next_qvals_v = qvals_v[batch_size:] 103 | distr_v = distr_v[:batch_size] 104 | 105 | next_actions_v = next_qvals_v.max(1)[1] 106 | next_distr_v = tgt_net(next_states_v) 107 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 108 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 109 | next_best_distr = next_best_distr_v.data.cpu().numpy() 110 | 111 | dones = dones.astype(bool) 112 | 113 | # project our distribution using Bellman update 114 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 115 | 116 | # calculate net output 117 | state_action_values = distr_v[range(batch_size), actions_v.data] 118 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 119 | proj_distr_v = torch.tensor(proj_distr).to(device) 120 | 121 | loss_v = -state_log_sm_v * proj_distr_v 122 | loss_v = batch_weights_v * loss_v.sum(dim=1) 123 | return loss_v.mean(), loss_v + 1e-5 124 | 125 | 126 | if __name__ == "__main__": 127 | params = common.HYPERPARAMS['invaders'] 128 | params['epsilon_frames'] *= 2 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 131 | 132 | args = parser.parse_args() 133 | #print(args.cuda) 134 | device = torch.device("cuda")# if args.cuda else "cpu" 135 | 136 | env = gym.make(params['env_name']) 137 | print(env.action_space) 138 | print(env.observation_space.shape) 139 | env = ptan.common.wrappers.wrap_dqn(env) 140 | print(env.observation_space.shape) 141 | #env = ptan.common.wrappers.wrap_dqn_custom(env) 142 | 143 | #Test this code: 144 | #check to see if observations are uints or floats 145 | #print(env.action_space) 146 | 
#print(env.observation_space.shape) 147 | raise Exception("stop") 148 | 149 | #need to copy and reshape network 150 | 151 | 152 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 153 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 154 | tgt_net = ptan.agent.TargetNet(net) 155 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 156 | 157 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 158 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 159 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 160 | 161 | frame_idx = 0 162 | beta = BETA_START 163 | 164 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 165 | while True: 166 | frame_idx += 1 167 | buffer.populate(1) 168 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 169 | 170 | new_rewards = exp_source.pop_total_rewards() 171 | if new_rewards: 172 | if reward_tracker.reward(new_rewards[0], frame_idx): 173 | break 174 | 175 | if len(buffer) < params['replay_initial']: 176 | continue 177 | 178 | optimizer.zero_grad() 179 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 180 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 181 | params['gamma'] ** REWARD_STEPS, device=device) 182 | loss_v.backward() 183 | optimizer.step() 184 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 185 | 186 | if frame_idx % params['target_net_sync'] == 0: 187 | tgt_net.sync() 188 | -------------------------------------------------------------------------------- /Ptan/RainbowMarioKart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import time 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | from lib import dqn_model, common 16 | from MarioKartEnvPtan import MarioKartEnv 17 | 18 | # n-step 19 | REWARD_STEPS = 3 20 | 21 | # priority replay 22 | PRIO_REPLAY_ALPHA = 0.6 23 | BETA_START = 0.4 24 | BETA_FRAMES = 3000000 25 | 26 | # C51 27 | Vmax = 16 28 | Vmin = -2 29 | N_ATOMS = 51 30 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 31 | 32 | 33 | class RainbowDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(RainbowDQN, self).__init__() 36 | 37 | self.start = time.time() 38 | 39 | self.conv = nn.Sequential( 40 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 41 | nn.ReLU(), 42 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 43 | nn.ReLU(), 44 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 45 | nn.ReLU() 46 | ) 47 | 48 | #print(input_shape) 49 | conv_out_size = self._get_conv_out(input_shape) 50 | self.fc_val = nn.Sequential( 51 | dqn_model.NoisyLinear(conv_out_size, 256), 52 | nn.ReLU(), 53 | dqn_model.NoisyLinear(256, N_ATOMS) 54 | ) 55 | 56 | self.fc_adv = nn.Sequential( 57 | dqn_model.NoisyLinear(conv_out_size, 256), 58 | nn.ReLU(), 59 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 60 | ) 61 | 62 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 63 | self.softmax = nn.Softmax(dim=1) 64 | 65 | def _get_conv_out(self, shape): 66 | o = 
self.conv(torch.zeros(1, *shape)) 67 | return int(np.prod(o.size())) 68 | 69 | def forward(self, x): 70 | batch_size = x.size()[0] 71 | fx = x.float() / 256 72 | conv_out = self.conv(fx).view(batch_size, -1) 73 | val_out = self.fc_val(conv_out).view(batch_size, 1, N_ATOMS) 74 | adv_out = self.fc_adv(conv_out).view(batch_size, -1, N_ATOMS) 75 | adv_mean = adv_out.mean(dim=1, keepdim=True) 76 | return val_out + (adv_out - adv_mean) 77 | 78 | def both(self, x): 79 | cat_out = self(x) 80 | probs = self.apply_softmax(cat_out) 81 | weights = probs * self.supports 82 | res = weights.sum(dim=2) 83 | return cat_out, res 84 | 85 | def qvals(self, x): 86 | return self.both(x)[1] 87 | 88 | def apply_softmax(self, t): 89 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 90 | 91 | def save_checkpoint(self): 92 | #print('... saving checkpoint ...') 93 | torch.save(self.state_dict(), "current_model" + str(int(time.time() - self.start))) 94 | 95 | def load_checkpoint(self): 96 | #print('... loading checkpoint ...') 97 | self.load_state_dict(torch.load("current_model235392")) 98 | 99 | 100 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 101 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 102 | batch_size = len(batch) 103 | 104 | states_v = torch.tensor(states).to(device) 105 | actions_v = torch.tensor(actions).to(device) 106 | next_states_v = torch.tensor(next_states).to(device) 107 | batch_weights_v = torch.tensor(batch_weights).to(device) 108 | 109 | # next state distribution 110 | # dueling arch -- actions from main net, distr from tgt_net 111 | 112 | # calc at once both next and cur states 113 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 114 | next_qvals_v = qvals_v[batch_size:] 115 | distr_v = distr_v[:batch_size] 116 | 117 | next_actions_v = next_qvals_v.max(1)[1] 118 | next_distr_v = tgt_net(next_states_v) 119 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 120 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 121 | next_best_distr = next_best_distr_v.data.cpu().numpy() 122 | 123 | dones = dones.astype(bool) 124 | 125 | # project our distribution using Bellman update 126 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 127 | 128 | # calculate net output 129 | state_action_values = distr_v[range(batch_size), actions_v.data] 130 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 131 | proj_distr_v = torch.tensor(proj_distr).to(device) 132 | 133 | loss_v = -state_log_sm_v * proj_distr_v 134 | loss_v = batch_weights_v * loss_v.sum(dim=1) 135 | return loss_v.mean(), loss_v + 1e-5 136 | 137 | 138 | if __name__ == "__main__": 139 | params = common.HYPERPARAMS['MarioKart'] 140 | params['epsilon_frames'] *= 2 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 143 | 144 | args = parser.parse_args() 145 | #print(args.cuda) 146 | device = torch.device("cuda")# if args.cuda else "cpu" 147 | 148 | env = MarioKartEnv() #gym.make(params['env_name']) 149 | print(env.action_space) 150 | print(env.observation_space.shape) 151 | env = ptan.common.wrappers.wrap_dqn_custom(env) 152 | print(env.observation_space.shape) 153 | #env = ptan.common.wrappers.wrap_dqn_custom(env) 154 | 155 | #Test this code: 156 | #check to see if observations are uints or floats 157 | 158 | #raise Exception("stop") 159 | 160 | #need to copy and reshape network 161 | 162 | writer = 
SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 163 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 164 | #net.load_checkpoint() 165 | 166 | tgt_net = ptan.agent.TargetNet(net) 167 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.EpsilonGreedyActionSelector(), device=device)#ptan.actions.ArgmaxActionSelector() 168 | 169 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 170 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 171 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 172 | 173 | frame_idx = 0 174 | beta = BETA_START 175 | 176 | save_interval = 30000 177 | 178 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 179 | while True: 180 | frame_idx += 1 181 | buffer.populate(1) 182 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 183 | 184 | new_rewards = exp_source.pop_total_rewards() 185 | if new_rewards: 186 | if reward_tracker.reward(new_rewards[0], frame_idx): 187 | break 188 | 189 | if len(buffer) < params['replay_initial']: 190 | continue 191 | 192 | optimizer.zero_grad() 193 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 194 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 195 | params['gamma'] ** REWARD_STEPS, device=device) 196 | loss_v.backward() 197 | optimizer.step() 198 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 199 | 200 | if frame_idx % save_interval == 0: 201 | net.save_checkpoint() 202 | 203 | if frame_idx % params['target_net_sync'] == 0: 204 | tgt_net.sync() 205 | -------------------------------------------------------------------------------- /Ptan/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | HYPERPARAMS = { 9 | 'pong': { 10 | 'env_name': "PongNoFrameskip-v4", 11 | 'stop_reward': 18.0, 12 | 'run_name': 'pong', 13 | 'replay_size': 250000, 14 | 'replay_initial': 10000, 15 | 'target_net_sync': 32000, 16 | 'epsilon_frames': 10**5, 17 | 'epsilon_start': 1.0, 18 | 'epsilon_final': 0.02, 19 | 'learning_rate': 6.25e-5, 20 | 'gamma': 0.99, 21 | 'batch_size': 32 22 | }, 23 | 'breakout-small': { 24 | 'env_name': "BreakoutNoFrameskip-v4", 25 | 'stop_reward': 500.0, 26 | 'run_name': 'breakout-small', 27 | 'replay_size': 3*10 ** 5, 28 | 'replay_initial': 20000, 29 | 'target_net_sync': 1000, 30 | 'epsilon_frames': 10 ** 6, 31 | 'epsilon_start': 1.0, 32 | 'epsilon_final': 0.1, 33 | 'learning_rate': 0.0001, 34 | 'gamma': 0.99, 35 | 'batch_size': 64 36 | }, 37 | 'breakout': { 38 | 'env_name': "BreakoutNoFrameskip-v4", 39 | 'stop_reward': 500.0, 40 | 'run_name': 'breakout', 41 | 'replay_size': 10 ** 6, 42 | 'replay_initial': 50000, 43 | 'target_net_sync': 10000, 44 | 'epsilon_frames': 10 ** 6, 45 | 'epsilon_start': 1.0, 46 | 'epsilon_final': 0.1, 47 | 'learning_rate': 0.00025, 48 | 'gamma': 0.99, 49 | 'batch_size': 32 50 | }, 51 | 'invaders': { 52 | 'env_name': "SpaceInvadersNoFrameskip-v4", 53 | 'stop_reward': 50000.0, 54 | 'run_name': 'breakout', 55 | 'replay_size': 1000000, 56 | 'replay_initial': 80000, 57 | 'target_net_sync': 32000, 58 | 'epsilon_frames': 10 ** 5, 59 | 'epsilon_start': 1.0, 60 | 'epsilon_final': 0.1, 61 | 'learning_rate': 6.25e-5, 62 | 'gamma': 0.99, 
63 | 'batch_size': 32 64 | }, 65 | 'MarioKart': { 66 | 'env_name': "MarioKartEnv", 67 | 'stop_reward': 5000000.0, 68 | 'run_name': 'MarioKart1', 69 | 'replay_size': 1000000, 70 | 'replay_initial': 30000, 71 | 'target_net_sync': 32000, 72 | 'epsilon_frames': 10 ** 5, 73 | 'epsilon_start': 1.0, 74 | 'epsilon_final': 0.1, 75 | 'learning_rate': 0.0001,#6.25e-5 76 | 'gamma': 0.99, 77 | 'batch_size': 32 78 | }, 79 | } 80 | 81 | 82 | def unpack_batch(batch): 83 | states, actions, rewards, dones, last_states = [], [], [], [], [] 84 | for exp in batch: 85 | state = np.array(exp.state, copy=False) 86 | states.append(state) 87 | actions.append(exp.action) 88 | rewards.append(exp.reward) 89 | dones.append(exp.last_state is None) 90 | if exp.last_state is None: 91 | last_states.append(state) # the result will be masked anyway 92 | else: 93 | last_states.append(np.array(exp.last_state, copy=False)) 94 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 95 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 96 | 97 | 98 | def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"): 99 | states, actions, rewards, dones, next_states = unpack_batch(batch) 100 | 101 | states_v = torch.tensor(states).to(device) 102 | next_states_v = torch.tensor(next_states).to(device) 103 | actions_v = torch.tensor(actions).to(device) 104 | rewards_v = torch.tensor(rewards).to(device) 105 | done_mask = torch.ByteTensor(dones).to(device) 106 | 107 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 108 | next_state_values = tgt_net(next_states_v).max(1)[0] 109 | next_state_values[done_mask] = 0.0 110 | 111 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 112 | return nn.MSELoss()(state_action_values, expected_state_action_values) 113 | 114 | 115 | class RewardTracker: 116 | def __init__(self, writer, stop_reward): 117 | self.writer = writer 118 | self.stop_reward = stop_reward 119 | 120 | def __enter__(self): 121 | self.ts = time.time() 122 | self.ts_frame = 0 123 | self.total_rewards = [] 124 | return self 125 | 126 | def __exit__(self, *args): 127 | self.writer.close() 128 | 129 | def reward(self, reward, frame, epsilon=None): 130 | self.total_rewards.append(reward) 131 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 132 | self.ts_frame = frame 133 | self.ts = time.time() 134 | mean_reward = np.mean(self.total_rewards[-100:]) 135 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 136 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 137 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 138 | )) 139 | sys.stdout.flush() 140 | if epsilon is not None: 141 | self.writer.add_scalar("epsilon", epsilon, frame) 142 | self.writer.add_scalar("speed", speed, frame) 143 | self.writer.add_scalar("reward_100", mean_reward, frame) 144 | self.writer.add_scalar("reward", reward, frame) 145 | if mean_reward > self.stop_reward: 146 | print("Solved in %d frames!" 
% frame) 147 | return True 148 | return False 149 | 150 | 151 | class EpsilonTracker: 152 | def __init__(self, epsilon_greedy_selector, params): 153 | self.epsilon_greedy_selector = epsilon_greedy_selector 154 | self.epsilon_start = params['epsilon_start'] 155 | self.epsilon_final = params['epsilon_final'] 156 | self.epsilon_frames = params['epsilon_frames'] 157 | self.frame(0) 158 | 159 | def frame(self, frame): 160 | self.epsilon_greedy_selector.epsilon = \ 161 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 162 | 163 | 164 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 165 | """ 166 | Perform distribution projection aka Catergorical Algorithm from the 167 | "A Distributional Perspective on RL" paper 168 | """ 169 | batch_size = len(rewards) 170 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 171 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 172 | for atom in range(n_atoms): 173 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 174 | b_j = (tz_j - Vmin) / delta_z 175 | l = np.floor(b_j).astype(np.int64) 176 | u = np.ceil(b_j).astype(np.int64) 177 | eq_mask = u == l 178 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 179 | ne_mask = u != l 180 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 181 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 182 | if dones.any(): 183 | proj_distr[dones] = 0.0 184 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 185 | b_j = (tz_j - Vmin) / delta_z 186 | l = np.floor(b_j).astype(np.int64) 187 | u = np.ceil(b_j).astype(np.int64) 188 | eq_mask = u == l 189 | eq_dones = dones.copy() 190 | eq_dones[dones] = eq_mask 191 | if eq_dones.any(): 192 | proj_distr[eq_dones, l[eq_mask]] = 1.0 193 | ne_mask = u != l 194 | ne_dones = dones.copy() 195 | ne_dones[dones] = ne_mask 196 | if ne_dones.any(): 197 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 198 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 199 | return proj_distr 200 | -------------------------------------------------------------------------------- /PER_old.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class SumTree(object): 5 | data_pointer = 0 6 | 7 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0 8 | def __init__(self, capacity): 9 | # Number of leaf nodes (final nodes) that contains experiences 10 | self.capacity = capacity 11 | 12 | # Generate the tree with all nodes values = 0 13 | # To understand this calculation (2 * capacity - 1) look at the schema below 14 | # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node) 15 | # Parent nodes = capacity - 1 16 | # Leaf nodes = capacity 17 | self.tree = np.zeros(2 * capacity - 1) 18 | 19 | # Contains the experiences (so the size of data is capacity) 20 | self.data = np.zeros(capacity, dtype=object) 21 | 22 | # Here we define function that will add our priority score in the sumtree leaf and add the experience in data: 23 | def add(self, priority, data): 24 | 25 | # Look at what index we want to put the experience 26 | tree_index = self.data_pointer + self.capacity - 1 27 | 28 | # Update data frame 29 | self.data[self.data_pointer] = data 30 | 31 | # Update the leaf 32 | self.update (tree_index, priority) 33 | 34 | # Add 1 to data_pointer 35 | self.data_pointer += 1 
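# Worked example of the index mapping above: with capacity = 4 the tree has
# 2 * 4 - 1 = 7 nodes, parents occupy indices 0-2 and leaves occupy 3-6, so the
# sample written while data_pointer was 0 lives at tree_index 0 + 4 - 1 = 3.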
36 | 37 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite) 38 | self.data_pointer = 0 39 | 40 | # Update the leaf priority score and propagate the change through tree 41 | def update(self, tree_index, priority): 42 | # Change = new priority score - former priority score 43 | change = priority - self.tree[tree_index] 44 | self.tree[tree_index] = priority 45 | 46 | # then propagate the change through tree 47 | # this method is faster than the recursive loop in the reference code 48 | while tree_index != 0: 49 | tree_index = (tree_index - 1) // 2 50 | self.tree[tree_index] += change 51 | 52 | # Here build a function to get a leaf from our tree. So we'll build a function to get the leaf_index, priority value of that leaf and experience associated with that leaf index: 53 | def get_leaf(self, v): 54 | parent_index = 0 55 | 56 | # the while loop is faster than the method in the reference code 57 | while True: 58 | left_child_index = 2 * parent_index + 1 59 | right_child_index = left_child_index + 1 60 | 61 | # If we reach bottom, end the search 62 | if left_child_index >= len(self.tree): 63 | leaf_index = parent_index 64 | break 65 | else: # downward search, always search for a higher priority node 66 | if v <= self.tree[left_child_index]: 67 | parent_index = left_child_index 68 | else: 69 | v -= self.tree[left_child_index] 70 | parent_index = right_child_index 71 | 72 | data_index = leaf_index - self.capacity + 1 73 | 74 | return leaf_index, self.tree[leaf_index], self.data[data_index] 75 | 76 | @property 77 | def total_priority(self): 78 | return self.tree[0] # Returns the root node 79 | 80 | # Now we finished constructing our SumTree object, next we'll build a memory object. 81 | class ReplayMemory(object): # stored as ( state, action, reward, next_state ) in SumTree 82 | PER_e = 0.01 # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken 83 | PER_a = 0.6 # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly 84 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1 85 | 86 | PER_b_increment_per_sampling = 0.001 87 | 88 | absolute_error_upper = 1. # clipped abs error 89 | 90 | def __init__(self, shape,capacity,batch_size): 91 | # Making the tree 92 | self.tree = SumTree(capacity) 93 | self.mem_cntr = 0 94 | self.shape = shape 95 | self.batch_size = batch_size 96 | self.capacity = capacity 97 | 98 | def is_sufficient(self): 99 | return self.mem_cntr > self.batch_size 100 | 101 | # Next, we define a function to store a new experience in our tree. 102 | # Each new experience will have a score of max_prority (it will be then improved when we use this exp to train our DDQN). 103 | def store_transition(self, state, action, reward, state_, terminal): 104 | 105 | experience = [state, action, reward, state_, terminal] 106 | 107 | # Find the max priority 108 | self.mem_cntr += 1 109 | max_priority = np.max(self.tree.tree[-self.tree.capacity:]) 110 | 111 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected 112 | # So we use a minimum priority 113 | if max_priority == 0: 114 | max_priority = self.absolute_error_upper 115 | 116 | self.tree.add(max_priority, experience) # set the max priority for new priority 117 | 118 | # Now we create sample function, which will be used to pick batch from our tree memory, which will be used to train our model. 
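    # (Concretely, with total_priority = 12 and batch_size = 4, the priority ranges
    # described below are [0, 3), [3, 6), [6, 9) and [9, 12), and one value is
    # drawn uniformly from each of them.)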
119 | # - First, we sample a minibatch of n size, the range [0, priority_total] into priority ranges. 120 | # - Then a value is uniformly sampled from each range. 121 | # - Then we search in the sumtree, for the experience where priority score correspond to sample values are retrieved from. 122 | def sample_memory(self): 123 | # Create a miself.batch_minibatch array that will contains the minibatch 124 | states = np.zeros((self.batch_size, *self.shape), 125 | dtype=np.float32) 126 | actions = np.zeros(self.batch_size, dtype=np.int64) 127 | rewards = np.zeros(self.batch_size, dtype=np.float32) 128 | states_ = np.zeros((self.batch_size, *self.shape), 129 | dtype=np.float32) 130 | terminal = np.zeros(self.batch_size, dtype=np.bool) 131 | 132 | b_idx = np.empty((self.batch_size,), dtype=np.int32) 133 | 134 | # Calculate the priority segment 135 | # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges 136 | priority_segment = self.tree.total_priority / self.batch_size # priority segment 137 | 138 | for i in range(self.batch_size): 139 | # A value is uniformly sample from each range 140 | a, b = priority_segment * i, priority_segment * (i + 1) 141 | data = 0 142 | count = 0 143 | while data == 0: 144 | count += 1 145 | value = np.random.uniform(a, b) 146 | 147 | # Experience that correspond to each value is retrieved 148 | index, priority, data = self.tree.get_leaf(value) 149 | 150 | if count > 50: 151 | raise Exception("Couldnt get non 0 value from tree") 152 | 153 | b_idx[i]= index 154 | 155 | states[i] = data[0] 156 | actions[i] = data[1] 157 | rewards[i] = data[2] 158 | states_[i] = data[3] 159 | terminal[i] = data[4] 160 | 161 | #minibatch.append([data[0],data[1],data[2],data[3],data[4]]) 162 | 163 | return b_idx, states,actions,rewards,states_,terminal 164 | 165 | # Update the priorities on the tree 166 | def batch_update(self, tree_idx, abs_errors): 167 | abs_errors += self.PER_e # convert to abs and avoid 0 168 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper) 169 | ps = np.power(clipped_errors, self.PER_a) 170 | 171 | for ti, p in zip(tree_idx, ps): 172 | self.tree.update(ti, p) 173 | -------------------------------------------------------------------------------- /PER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Node: 4 | def __init__(self, left, right, is_leaf: bool = False, idx = None): 5 | self.left = left 6 | self.right = right 7 | self.is_leaf = is_leaf 8 | if not self.is_leaf: 9 | self.value = self.left.value + self.right.value 10 | self.parent = None 11 | self.idx = idx # this value is only set for leaf nodes 12 | if left is not None: 13 | left.parent = self 14 | if right is not None: 15 | right.parent = self 16 | @classmethod 17 | def create_leaf(cls, value, idx): 18 | leaf = cls(None, None, is_leaf=True, idx=idx) 19 | leaf.value = value 20 | return leaf 21 | 22 | def create_tree(input: list): 23 | nodes = [Node.create_leaf(v, i) for i, v in enumerate(input)] 24 | leaf_nodes = nodes 25 | while len(nodes) > 1: 26 | inodes = iter(nodes) 27 | nodes = [Node(*pair) for pair in zip(inodes, inodes)] 28 | return nodes[0], leaf_nodes 29 | 30 | def retrieve(value: float, node: Node): 31 | if node.is_leaf: 32 | return node 33 | if node.left.value >= value: 34 | return retrieve(value, node.left) 35 | else: 36 | return retrieve(value - node.left.value, node.right) 37 | 38 | def update(node: Node, new_value: float): 39 | change = new_value - node.value 40 | node.value = 
new_value 41 | propagate_changes(change, node.parent) 42 | 43 | def propagate_changes(change: float, node: Node): 44 | node.value += change 45 | if node.parent is not None: 46 | propagate_changes(change, node.parent) 47 | 48 | class ReplayMemory: 49 | def __init__(self, input_dims, max_mem, batch_size): 50 | 51 | self.alpha = 0.6 52 | self.beta = 0.4 53 | self.beta_steps = 180000 54 | self.beta_inc = (1 - self.beta) / self.beta_steps 55 | self.eps = 0.01 56 | 57 | self.mem_size = max_mem 58 | self.batch_size = batch_size 59 | self.mem_cntr = 0 60 | 61 | self.state_memory = np.zeros((self.mem_size, *input_dims), 62 | dtype=np.float32) 63 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 64 | dtype=np.float32) 65 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 66 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 67 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 68 | 69 | priorities = np.zeros((self.mem_size, ), dtype=np.float32) 70 | 71 | self.root_node, self.leaf_nodes = create_tree(priorities) 72 | self.max_priority = -1.0 73 | self.absolute_error_upper = 1.0 74 | 75 | def store_transition(self, state, action, reward, state_, terminal): 76 | self.beta = min(self.beta + self.beta_inc,1) 77 | 78 | index = self.mem_cntr % self.mem_size 79 | self.state_memory[index] = state 80 | self.action_memory[index] = action 81 | self.reward_memory[index] = reward 82 | self.new_state_memory[index] = state_ 83 | self.terminal_memory[index] = terminal 84 | 85 | update(self.leaf_nodes[index], abs(self.max_priority)) 86 | 87 | self.mem_cntr += 1 88 | 89 | def sample_memory(self): 90 | max_mem = min(self.mem_cntr, self.mem_size) 91 | """ 92 | 93 | if self.mem_cntr > self.mem_size: 94 | prios = self.priorities 95 | else: 96 | prios = self.priorities[:self.mem_cntr]""" 97 | tree_total = self.root_node.value 98 | indices = [] 99 | probs = [] 100 | for i in range(self.batch_size): 101 | rand_val = np.random.uniform(0, tree_total) 102 | 103 | leaf = retrieve(rand_val, self.root_node) 104 | 105 | indices.append(leaf.idx) 106 | 107 | ###remove here IS 108 | #probs.append(leaf.value / tree_total) 109 | 110 | #and here IS 111 | #probs = np.array(probs,dtype=np.float32) 112 | 113 | states = self.state_memory[indices] 114 | actions = self.action_memory[indices] 115 | rewards = self.reward_memory[indices] 116 | new_states = self.new_state_memory[indices] 117 | terminals = self.terminal_memory[indices] 118 | 119 | #Both of these IS 120 | #weights = (max_mem * probs) ** (-self.beta) 121 | #weights /= weights.max() 122 | 123 | #last bit here 124 | return states, actions, rewards, new_states, terminals, indices#, np.array(weights, dtype=np.float32) 125 | 126 | def batch_update(self,batch_indices, batch_priorities): 127 | #print(batch_priorities.type) 128 | 129 | #batch_priorities += self.eps 130 | #batch_priorities = np.minimum(batch_priorities, self.absolute_error_upper) 131 | 132 | #batch_priorities = np.power(batch_priorities, self.alpha) 133 | 134 | self.max_priority = max(self.max_priority, max(batch_priorities)) 135 | 136 | for idx, prio in zip(batch_indices, batch_priorities): 137 | update(self.leaf_nodes[idx], prio) 138 | 139 | def is_sufficient(self): 140 | return self.mem_cntr > self.batch_size 141 | 142 | 143 | ################################################ the below implementation doesnt use tree 144 | class ReplayMemoryBuffer: 145 | def __init__(self, input_dims, max_mem, batch_size): 146 | 147 | self.alpha = 0.6 148 | self.beta = 0.4 149 | 
self.beta_steps = 50000 150 | self.beta_inc = (1 - self.beta) / self.beta_steps 151 | self.eps = 1e-5 152 | 153 | self.mem_size = max_mem 154 | self.batch_size = batch_size 155 | self.mem_cntr = 0 156 | 157 | self.state_memory = np.zeros((self.mem_size, *input_dims), 158 | dtype=np.float32) 159 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 160 | dtype=np.float32) 161 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 162 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 163 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 164 | 165 | self.priorities = np.zeros((self.mem_size, ), dtype=np.float32) 166 | 167 | 168 | def store_transition(self, state, action, reward, state_, terminal): 169 | self.beta = min(self.beta + self.beta_inc,1) 170 | 171 | max_prio = self.priorities.max() if self.mem_cntr > 0 else 1.0 172 | 173 | index = self.mem_cntr % self.mem_size 174 | self.state_memory[index] = state 175 | self.action_memory[index] = action 176 | self.reward_memory[index] = reward 177 | self.new_state_memory[index] = state_ 178 | self.terminal_memory[index] = terminal 179 | 180 | self.priorities[index] = max_prio 181 | 182 | self.mem_cntr += 1 183 | 184 | def sample_memory(self): 185 | 186 | max_mem = min(self.mem_cntr, self.mem_size) 187 | 188 | if self.mem_cntr > self.mem_size: 189 | prios = self.priorities 190 | else: 191 | prios = self.priorities[:self.mem_cntr] 192 | 193 | probs = prios ** self.alpha 194 | probs /= probs.sum() 195 | indices = np.random.choice(max_mem, self.batch_size, p=probs) 196 | 197 | states = self.state_memory[indices] 198 | actions = self.action_memory[indices] 199 | rewards = self.reward_memory[indices] 200 | new_states = self.new_state_memory[indices] 201 | terminals = self.terminal_memory[indices] 202 | 203 | weights = (max_mem * probs[indices]) ** (-self.beta) 204 | weights /= weights.max() 205 | 206 | return states, actions, rewards, new_states, terminals, indices, np.array(weights, dtype=np.float32) 207 | 208 | def batch_update(self,batch_indices, batch_priorities): 209 | for idx, prio in zip(batch_indices, batch_priorities): 210 | self.priorities[idx] = prio 211 | 212 | def is_sufficient(self): 213 | return self.mem_cntr > self.batch_size 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /Ptan/07_dqn_distrib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import numpy as np 5 | import argparse 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | 12 | from tensorboardX import SummaryWriter 13 | 14 | from lib import common 15 | 16 | SAVE_STATES_IMG = False 17 | SAVE_TRANSITIONS_IMG = False 18 | 19 | if SAVE_STATES_IMG or SAVE_TRANSITIONS_IMG: 20 | import matplotlib as mpl 21 | mpl.use("Agg") 22 | import matplotlib.pylab as plt 23 | 24 | Vmax = 10 25 | Vmin = -10 26 | N_ATOMS = 51 27 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 28 | 29 | STATES_TO_EVALUATE = 1000 30 | EVAL_EVERY_FRAME = 100 31 | 32 | 33 | class DistributionalDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(DistributionalDQN, self).__init__() 36 | 37 | self.conv = nn.Sequential( 38 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 39 | nn.ReLU(), 40 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 41 | nn.ReLU(), 42 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 43 | nn.ReLU() 44 | ) 45 | 46 | 
conv_out_size = self._get_conv_out(input_shape) 47 | self.fc = nn.Sequential( 48 | nn.Linear(conv_out_size, 512), 49 | nn.ReLU(), 50 | nn.Linear(512, n_actions * N_ATOMS) 51 | ) 52 | 53 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 54 | self.softmax = nn.Softmax(dim=1) 55 | 56 | def _get_conv_out(self, shape): 57 | o = self.conv(torch.zeros(1, *shape)) 58 | return int(np.prod(o.size())) 59 | 60 | def forward(self, x): 61 | batch_size = x.size()[0] 62 | fx = x.float() / 256 63 | conv_out = self.conv(fx).view(batch_size, -1) 64 | fc_out = self.fc(conv_out) 65 | return fc_out.view(batch_size, -1, N_ATOMS) 66 | 67 | def both(self, x): 68 | cat_out = self(x) 69 | probs = self.apply_softmax(cat_out) 70 | weights = probs * self.supports 71 | res = weights.sum(dim=2) 72 | return cat_out, res 73 | 74 | def qvals(self, x): 75 | return self.both(x)[1] 76 | 77 | def apply_softmax(self, t): 78 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 79 | 80 | 81 | def calc_values_of_states(states, net, device="cpu"): 82 | mean_vals = [] 83 | for batch in np.array_split(states, 64): 84 | states_v = torch.tensor(batch).to(device) 85 | action_values_v = net.qvals(states_v) 86 | best_action_values_v = action_values_v.max(1)[0] 87 | mean_vals.append(best_action_values_v.mean().item()) 88 | return np.mean(mean_vals) 89 | 90 | 91 | def save_state_images(frame_idx, states, net, device="cpu", max_states=200): 92 | ofs = 0 93 | p = np.arange(Vmin, Vmax + DELTA_Z, DELTA_Z) 94 | for batch in np.array_split(states, 64): 95 | states_v = torch.tensor(batch).to(device) 96 | action_prob = net.apply_softmax(net(states_v)).data.cpu().numpy() 97 | batch_size, num_actions, _ = action_prob.shape 98 | for batch_idx in range(batch_size): 99 | plt.clf() 100 | for action_idx in range(num_actions): 101 | plt.subplot(num_actions, 1, action_idx+1) 102 | plt.bar(p, action_prob[batch_idx, action_idx], width=0.5) 103 | plt.savefig("states/%05d_%08d.png" % (ofs + batch_idx, frame_idx)) 104 | ofs += batch_size 105 | if ofs >= max_states: 106 | break 107 | 108 | 109 | def save_transition_images(batch_size, predicted, projected, next_distr, dones, rewards, save_prefix): 110 | for batch_idx in range(batch_size): 111 | is_done = dones[batch_idx] 112 | reward = rewards[batch_idx] 113 | plt.clf() 114 | p = np.arange(Vmin, Vmax + DELTA_Z, DELTA_Z) 115 | plt.subplot(3, 1, 1) 116 | plt.bar(p, predicted[batch_idx], width=0.5) 117 | plt.title("Predicted") 118 | plt.subplot(3, 1, 2) 119 | plt.bar(p, projected[batch_idx], width=0.5) 120 | plt.title("Projected") 121 | plt.subplot(3, 1, 3) 122 | plt.bar(p, next_distr[batch_idx], width=0.5) 123 | plt.title("Next state") 124 | suffix = "" 125 | if reward != 0.0: 126 | suffix = suffix + "_%.0f" % reward 127 | if is_done: 128 | suffix = suffix + "_done" 129 | plt.savefig("%s_%02d%s.png" % (save_prefix, batch_idx, suffix)) 130 | 131 | 132 | def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None): 133 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 134 | batch_size = len(batch) 135 | 136 | states_v = torch.tensor(states).to(device) 137 | actions_v = torch.tensor(actions).to(device) 138 | next_states_v = torch.tensor(next_states).to(device) 139 | 140 | # next state distribution 141 | next_distr_v, next_qvals_v = tgt_net.both(next_states_v) 142 | next_actions = next_qvals_v.max(1)[1].data.cpu().numpy() 143 | next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy() 144 | 145 | next_best_distr = 
next_distr[range(batch_size), next_actions] 146 | dones = dones.astype(np.bool) 147 | 148 | # project our distribution using Bellman update 149 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 150 | 151 | # calculate net output 152 | distr_v = net(states_v) 153 | state_action_values = distr_v[range(batch_size), actions_v.data] 154 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 155 | proj_distr_v = torch.tensor(proj_distr).to(device) 156 | 157 | if save_prefix is not None: 158 | pred = F.softmax(state_action_values, dim=1).data.cpu().numpy() 159 | save_transition_images(batch_size, pred, proj_distr, next_best_distr, dones, rewards, save_prefix) 160 | 161 | loss_v = -state_log_sm_v * proj_distr_v 162 | return loss_v.sum(dim=1).mean() 163 | 164 | 165 | if __name__ == "__main__": 166 | params = common.HYPERPARAMS['pong'] 167 | # params['epsilon_frames'] *= 2 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 170 | args = parser.parse_args() 171 | device = torch.device("cuda" if args.cuda else "cpu") 172 | 173 | env = gym.make(params['env_name']) 174 | env = ptan.common.wrappers.wrap_dqn(env) 175 | 176 | writer = SummaryWriter(comment="-" + params['run_name'] + "-distrib") 177 | net = DistributionalDQN(env.observation_space.shape, env.action_space.n).to(device) 178 | 179 | tgt_net = ptan.agent.TargetNet(net) 180 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 181 | epsilon_tracker = common.EpsilonTracker(selector, params) 182 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), selector, device=device) 183 | 184 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 185 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 186 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 187 | 188 | frame_idx = 0 189 | eval_states = None 190 | prev_save = 0 191 | save_prefix = None 192 | 193 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 194 | while True: 195 | frame_idx += 1 196 | buffer.populate(1) 197 | epsilon_tracker.frame(frame_idx) 198 | 199 | new_rewards = exp_source.pop_total_rewards() 200 | if new_rewards: 201 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 202 | break 203 | 204 | if len(buffer) < params['replay_initial']: 205 | continue 206 | 207 | if eval_states is None: 208 | eval_states = buffer.sample(STATES_TO_EVALUATE) 209 | eval_states = [np.array(transition.state, copy=False) for transition in eval_states] 210 | eval_states = np.array(eval_states, copy=False) 211 | 212 | optimizer.zero_grad() 213 | batch = buffer.sample(params['batch_size']) 214 | 215 | save_prefix = None 216 | if SAVE_TRANSITIONS_IMG: 217 | interesting = any(map(lambda s: s.last_state is None or s.reward != 0.0, batch)) 218 | if interesting and frame_idx // 30000 > prev_save: 219 | save_prefix = "images/img_%08d" % frame_idx 220 | prev_save = frame_idx // 30000 221 | 222 | loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], 223 | device=device, save_prefix=save_prefix) 224 | loss_v.backward() 225 | optimizer.step() 226 | 227 | if frame_idx % params['target_net_sync'] == 0: 228 | tgt_net.sync() 229 | 230 | if frame_idx % EVAL_EVERY_FRAME == 0: 231 | mean_val = calc_values_of_states(eval_states, net, device=device) 232 | 
writer.add_scalar("values_mean", mean_val, frame_idx) 233 | 234 | if SAVE_STATES_IMG and frame_idx % 10000 == 0: 235 | save_state_images(frame_idx, eval_states, net, device=device) 236 | -------------------------------------------------------------------------------- /EffRainbow/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import kornia 7 | from torchvision.utils import save_image 8 | 9 | HYPERPARAMS = { 10 | 'pong': { 11 | 'env_name': "PongNoFrameskip-v4", 12 | 'stop_reward': 18.0, 13 | 'run_name': 'pong', 14 | 'replay_size': 100000, 15 | 'replay_initial': 10000, 16 | 'target_net_sync': 1000, 17 | 'epsilon_frames': 10**5, 18 | 'epsilon_start': 1.0, 19 | 'epsilon_final': 0.02, 20 | 'learning_rate': 0.0001, 21 | 'gamma': 0.99, 22 | 'batch_size': 32 23 | }, 24 | 'breakout-small': { 25 | 'env_name': "BreakoutNoFrameskip-v4", 26 | 'stop_reward': 500.0, 27 | 'run_name': 'breakout-small', 28 | 'replay_size': 3*10 ** 5, 29 | 'replay_initial': 20000, 30 | 'target_net_sync': 1000, 31 | 'epsilon_frames': 10 ** 6, 32 | 'epsilon_start': 1.0, 33 | 'epsilon_final': 0.1, 34 | 'learning_rate': 0.0001, 35 | 'gamma': 0.99, 36 | 'batch_size': 64 37 | }, 38 | 'breakout': { 39 | 'env_name': "BreakoutNoFrameskip-v4", 40 | 'stop_reward': 500.0, 41 | 'run_name': 'breakout', 42 | 'replay_size': 10 ** 6, 43 | 'replay_initial': 50000, 44 | 'target_net_sync': 10000, 45 | 'epsilon_frames': 10 ** 6, 46 | 'epsilon_start': 1.0, 47 | 'epsilon_final': 0.1, 48 | 'learning_rate': 0.00025, 49 | 'gamma': 0.99, 50 | 'batch_size': 32 51 | }, 52 | 'invaders': { 53 | 'env_name': "SpaceInvadersNoFrameskip-v4", 54 | 'stop_reward': 500.0, 55 | 'run_name': 'breakout', 56 | 'replay_size': 10 ** 6, 57 | 'replay_initial': 50000, 58 | 'target_net_sync': 10000, 59 | 'epsilon_frames': 10 ** 6, 60 | 'epsilon_start': 1.0, 61 | 'epsilon_final': 0.1, 62 | 'learning_rate': 0.00025, 63 | 'gamma': 0.99, 64 | 'batch_size': 32 65 | }, 66 | 67 | 'MarioBros': { 68 | 'env_name': "MarioBrosEnv", 69 | 'stop_reward': 5000000.0, 70 | 'run_name': 'MarioKart1', 71 | 'replay_size': 870000, 72 | 'replay_initial': 80000, #80k 73 | 'target_net_sync': 32000, 74 | 'epsilon_dec': 1.98e-6, 75 | 'epsilon_start': 1.0,#1.0 76 | 'epsilon_final': 0.01,#0.01 77 | 'learning_rate': 0.00025,# 78 | 'gamma': 0.99, 79 | 'batch_size': 32 #used to be 256 80 | }, 81 | } 82 | 83 | def unpack_batch(batch): 84 | states, actions, rewards, dones, last_states = [], [], [], [], [] 85 | for exp in batch: 86 | state = np.array(exp.state, copy=False) 87 | states.append(state) 88 | actions.append(exp.action) 89 | rewards.append(exp.reward) 90 | dones.append(exp.last_state is None) 91 | if exp.last_state is None: 92 | last_states.append(state) # the result will be masked anyway 93 | else: 94 | last_states.append(np.array(exp.last_state, copy=False)) 95 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 96 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 97 | 98 | 99 | class Intensity(nn.Module): 100 | def __init__(self, scale): 101 | super().__init__() 102 | self.scale = scale 103 | 104 | def forward(self, x): 105 | r = torch.randn((x.size(0), 1, 1, 1), device=x.device) 106 | noise = 1.0 + (self.scale * r.clamp(-2.0, 2.0)) 107 | return x * noise 108 | 109 | 110 | AUG = nn.Sequential(nn.ReplicationPad2d(4), 111 | kornia.augmentation.RandomCrop((140, 75)), 112 | Intensity(scale=0.1)) 
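# Minimal usage sketch for the augmentation pipeline defined above. The helper name
# below is illustrative only and is not referenced elsewhere; it assumes a float image
# batch shaped (N, C, H, W) that is large enough, after the 4-pixel replication
# padding, for a (140, 75) crop, as in DrQ-style augmented DQN updates.
def _augment_batch_example(states_v: torch.Tensor) -> torch.Tensor:
    # ReplicationPad2d(4) pads each border by 4 px, RandomCrop takes a random
    # (height=140, width=75) window, and Intensity jitters pixel values by a
    # clipped Gaussian scale factor.
    return AUG(states_v.float())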
113 | 114 | def calc_loss_dqn(batch, batch_weights,net, tgt_net, gamma, device="cpu"): 115 | states, actions, rewards, dones, next_states = unpack_batch(batch) 116 | 117 | states_v = torch.tensor(states).to(device) 118 | next_states_v = torch.tensor(next_states).to(device) 119 | actions_v = torch.tensor(actions).to(device) 120 | rewards_v = torch.tensor(rewards).to(device) 121 | done_mask = torch.ByteTensor(dones).to(device) 122 | batch_weights_v = torch.tensor(batch_weights).to(device) 123 | 124 | """ #This code very much needs checking to see if images are broken 125 | save_image(states_v[0].float(), 'img_size_test.png') 126 | 127 | states_v = aug_trans(states_v) 128 | next_states_v = aug_trans(next_states_v) 129 | next_states_policy = aug_trans(next_states_v) 130 | 131 | save_image(states_v[0], 'img1_aug.png') 132 | raise Exception("stop") 133 | """ 134 | 135 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 136 | next_state_values = tgt_net(next_states_v).max(1)[0] 137 | next_state_values[done_mask] = 0.0 138 | 139 | """ 140 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 141 | next_state_values = tgt_net(next_states_v).max(1)[0]""" 142 | 143 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 144 | losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2 145 | 146 | return losses_v.mean(), losses_v + 1e-5 147 | 148 | class RewardTracker: 149 | def __init__(self, writer, stop_reward): 150 | self.writer = writer 151 | self.stop_reward = stop_reward 152 | self.time = time.time() 153 | self.scores = [] 154 | 155 | def __enter__(self): 156 | self.ts = time.time() 157 | self.ts_frame = 0 158 | self.total_rewards = [] 159 | return self 160 | 161 | def __exit__(self, *args): 162 | self.writer.close() 163 | 164 | def reward(self, reward, frame, epsilon=None): 165 | self.total_rewards.append(reward) 166 | speed = (frame - self.ts_frame) / (time.time() - self.ts + 0.000001) 167 | self.ts_frame = frame 168 | self.ts = time.time() 169 | mean_reward = np.mean(self.total_rewards[-100:]) 170 | print("%d frames: Played %d games, Avg reward %.3f, Time: %.3f Hours" % ( 171 | frame, len(self.total_rewards), mean_reward, (self.ts - self.time) / 3600 172 | )) 173 | 174 | self.scores.append([reward,self.ts_frame,time.time() - self.time]) 175 | 176 | sys.stdout.flush() 177 | if epsilon is not None: 178 | self.writer.add_scalar("epsilon", epsilon, frame) 179 | self.writer.add_scalar("speed", speed, frame) 180 | self.writer.add_scalar("reward_100", mean_reward, frame) 181 | self.writer.add_scalar("reward", reward, frame) 182 | if mean_reward > self.stop_reward: 183 | print("Solved in %d frames!" 
% frame) 184 | return True 185 | return False 186 | 187 | def get_scores(self): 188 | return np.array(self.scores) 189 | 190 | 191 | class EpsilonTracker: 192 | def __init__(self, epsilon_greedy_selector, params): 193 | self.epsilon_greedy_selector = epsilon_greedy_selector 194 | self.epsilon_start = params['epsilon_start'] 195 | self.epsilon_final = params['epsilon_final'] 196 | self.epsilon_frames = params['epsilon_frames'] 197 | self.frame(0) 198 | 199 | def frame(self, frame): 200 | self.epsilon_greedy_selector.epsilon = \ 201 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 202 | 203 | 204 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 205 | """ 206 | Perform distribution projection aka Catergorical Algorithm from the 207 | "A Distributional Perspective on RL" paper 208 | """ 209 | batch_size = len(rewards) 210 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 211 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 212 | for atom in range(n_atoms): 213 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 214 | b_j = (tz_j - Vmin) / delta_z 215 | l = np.floor(b_j).astype(np.int64) 216 | u = np.ceil(b_j).astype(np.int64) 217 | eq_mask = u == l 218 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 219 | ne_mask = u != l 220 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 221 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 222 | if dones.any(): 223 | proj_distr[dones] = 0.0 224 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 225 | b_j = (tz_j - Vmin) / delta_z 226 | l = np.floor(b_j).astype(np.int64) 227 | u = np.ceil(b_j).astype(np.int64) 228 | eq_mask = u == l 229 | eq_dones = dones.copy() 230 | eq_dones[dones] = eq_mask 231 | if eq_dones.any(): 232 | proj_distr[eq_dones, l[eq_mask]] = 1.0 233 | ne_mask = u != l 234 | ne_dones = dones.copy() 235 | ne_dones[dones] = ne_mask 236 | if ne_dones.any(): 237 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 238 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 239 | return proj_distr 240 | -------------------------------------------------------------------------------- /FelkFork/DolphinEnv.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import shared_memory 2 | import numpy as np 3 | import time 4 | import os 5 | from PIL import Image 6 | import gym 7 | import random 8 | from copy import deepcopy 9 | import cv2 10 | import warnings 11 | from Wrappers import wrap_env 12 | import subprocess 13 | os.chdir('/home/tyler/Documents/EfficientZero') 14 | 15 | warnings.filterwarnings("ignore") 16 | 17 | """ 18 | This program implements the standard gym MDP 19 | 20 | However, it will use shared memory to access 21 | data from DolphinSideScript: 22 | Rewards 23 | terminals 24 | states 25 | 26 | It will also need to send to DolphinSideScript: 27 | actions 28 | 29 | """ 30 | #Ymem = 108 31 | #Xmem = 200 32 | Ymem = 78 33 | Xmem = 94 34 | 35 | class DolphinEnv(gym.Env): 36 | def __init__(self): 37 | 38 | """ 39 | shared mem is in following format: 40 | 41 | This needs to be changed to this format: 42 | 43 | arr = np.zeros((101,60),dtype=np.float32) 44 | 45 | arr[0][0] = Dtimestep 46 | arr[0][1] = Etimestep 47 | arr[0][2] = action 48 | arr[0][3] = reward 49 | arr[0][4] = terminal 50 | 51 | arr[1:] = state 52 | 53 | """ 54 | 55 | self.observation_space = gym.spaces.Box( 56 | low=0, high=1, 
shape=(Ymem, Xmem), dtype=np.uint8) 57 | self.action_space = gym.spaces.Discrete(9) 58 | self.action_space.n = 9 59 | 60 | self.reward_range = (-100, 100) 61 | self.metadata = None 62 | self.initialised = False 63 | 64 | def real_init(self): 65 | 66 | with open('pid_num.txt') as f: 67 | pid = int(f.readlines()[0]) + 1 68 | 69 | self.pid = pid 70 | self.offset = 0 71 | 72 | 73 | #write to file with pid number 74 | 75 | with open('pid_num.txt', 'w') as f: 76 | f.write(str(self.pid)) 77 | 78 | print("My PID: " + str(self.pid)) 79 | 80 | self.timestep = 0. 81 | self.init = True 82 | 83 | self.data = np.zeros((Ymem + 1,Xmem),dtype=np.float32) 84 | print("Data Array") 85 | print(self.data) 86 | 87 | self.shm = shared_memory.SharedMemory(create=True,size=self.data.nbytes,name = 'p' + str(pid)) 88 | 89 | print("Saving to shared mem") 90 | self.shm_array = np.ndarray(self.data.shape, dtype=self.data.dtype, buffer=self.shm.buf) 91 | self.shm_array[:] = self.data[:] 92 | 93 | print("Launching Dolphin") 94 | 95 | """cmd1 = 'cmd /c C:\\Users\\TYLER\\Downloads\\RLJourney\\DolphinNew\\dolphin' 96 | cmd2 = '\\Binary\\x64\\Dolphin.exe --no-python-subinterpreters --script C:/Users/TYLER/Downloads/RLJourney/DolphinNew/DolphinSideScript.py \\b --exec="C:\\Users\\TYLER\\Downloads\\GameCollection\\' 97 | cmd3 = 'SuperSmashBros.Brawl(Europe)(En,Fr,De,Es,It).nkit.gcz"' 98 | #cmd3 = '\\games\\Mario Kart Wii (USA) (En,Fr,Es).nkit.iso"' 99 | #cmd /c C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\Dolphin.exe --script C:/Users/TYLER/Downloads/DolphinRevamp/DolphinSideScript.py \\b --exec="C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\games\\NewSuperMarioBros.Wii(Europe)(En,Fr,De,Es,It)(Rev 1).nkit.gcz" 100 | 101 | #launch dolphin 102 | os.popen(cmd1 + str(pid) + cmd2 + cmd3)""" 103 | 104 | cmd = 'cd ~/Documents/dolphin' + str(pid) + '/build/Binaries && ./dolphin-emu --no-python-subinterpreters\ 105 | --script /home/tyler/Documents/WiiRL/FelkFork/DolphinSideScriptTanks.py\ 106 | --exec="/home/tyler/Documents/GameCollection/Wii Play (Europe) (En,Fr,De,Es,It).nkit.gcz"' 107 | 108 | subprocess.Popen(cmd, shell=True) 109 | time.sleep(4) 110 | 111 | print("Dolphin Launched Successfully") 112 | 113 | def get_max_episode_steps(self): 114 | return 1000 115 | 116 | def restart(self): 117 | with open('pid_num.txt', 'w') as f: 118 | f.write(str(self.pid)) 119 | 120 | self.timestep = 0. 
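        # Unlike real_init(), which creates the shared-memory block named
        # 'p' + str(pid) with create=True, restart() re-attaches to the block that
        # already exists (create=False below) and relaunches Dolphin after a crash,
        # reusing the same PID slot.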
121 | self.init = True 122 | 123 | self.data = np.zeros((Ymem + 1,Xmem),dtype=np.float32) 124 | print("Data Array") 125 | print(self.data) 126 | 127 | self.shm = shared_memory.SharedMemory(create=False,size=self.data.nbytes,name = 'p' + str(self.pid)) 128 | 129 | print("Saving to shared mem") 130 | self.shm_array = np.ndarray(self.data.shape, dtype=self.data.dtype, buffer=self.shm.buf) 131 | self.shm_array[:] = self.data[:] 132 | 133 | print("Launching Dolphin After Crash...") 134 | 135 | cmd1 = 'cmd /c C:\\Users\\TYLER\\Downloads\\RLJourney\\DolphinNew\\dolphin' 136 | cmd2 = '\\Binary\\x64\\Dolphin.exe --no-python-subinterpreters --script C:/Users/TYLER/Downloads/RLJourney/DolphinNew/DolphinSideScriptTanks.py \\b --exec="C:\\Users\\TYLER\\Downloads\\GameCollection\\' 137 | cmd3 = 'SuperSmashBros.Brawl(Europe)(En,Fr,De,Es,It).nkit.gcz"' 138 | #cmd3 = '\\games\\Mario Kart Wii (USA) (En,Fr,Es).nkit.iso"' 139 | #cmd /c C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\Dolphin.exe --script C:/Users/TYLER/Downloads/DolphinRevamp/DolphinSideScript.py \\b --exec="C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\games\\NewSuperMarioBros.Wii(Europe)(En,Fr,De,Es,It)(Rev 1).nkit.gcz" 140 | 141 | #launch dolphin 142 | os.popen(cmd1 + str(self.pid) + cmd2 + cmd3) 143 | 144 | time.sleep(4) 145 | 146 | def reset(self): 147 | 148 | if not self.initialised: 149 | self.real_init() 150 | self.initialised = True 151 | #sync 152 | #print("Resestting...") 153 | print("Reset Called, PID: " + str(self.pid)) 154 | self.shm_array[0][2 + self.offset] = 0 155 | 156 | if not self.init: 157 | self.timestep += 1 158 | else: 159 | self.init = False 160 | 161 | self.shm_array[0][1 + self.offset] = self.timestep 162 | 163 | timer = time.time() 164 | while True: 165 | #print(str(self.shm_array[0]) + " " + str(self.shm_array[1])+ " " + str(self.timestep)) 166 | if self.shm_array[0][0 + self.offset] == self.timestep + 1: 167 | break 168 | 169 | else: 170 | if time.time() - timer > 10: 171 | print("Waiting 10+ seconds at reset! PID: " + str(self.pid)) 172 | print("Resestting to t0...") 173 | #self.timestep = 0 174 | #self.shm_array[0][1 + self.offset] = self.timestep 175 | timer = time.time() 176 | 177 | return self.shm_array[1:][:].astype(np.uint8) 178 | 179 | def step(self,action): 180 | 181 | #write timestep and action 182 | 183 | self.shm_array[0][2 + self.offset] = action 184 | 185 | self.timestep += 1 186 | self.shm_array[0][1 + self.offset] = self.timestep 187 | 188 | print("Step Called, PID: " + str(self.pid)) 189 | #wait for new state,reward,terminal 190 | #sync 191 | timer = time.time() 192 | while True: 193 | time.sleep(0.001) 194 | #print(str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 195 | if self.shm_array[0][0 + self.offset] == self.timestep + 1: 196 | break 197 | else: 198 | if time.time() - timer > 10: 199 | time.sleep(10) 200 | print("Dolphin Has likely crashed! 
No response in 10+ seconds, PID: " + str(self.pid)) 201 | print(str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 202 | print("") 203 | """os.system("taskkill /f /im Dolphin.exe") 204 | time.sleep(5) 205 | state = self.shm_array[1:][:].astype(np.uint8) 206 | 207 | self.restart() 208 | 209 | return state,0,True,{}""" 210 | 211 | return self.shm_array[1:][:].astype(np.uint8),self.shm_array[0][3 + self.offset],self.shm_array[0][4 + self.offset],{} 212 | 213 | def on_press(key): 214 | global action 215 | try: 216 | if key.char == 'q': #### 217 | action = 0 218 | elif key.char == 'w':#### 219 | action = 1 220 | elif key.char == 'e': 221 | action = 2 222 | elif key.char == 'r': 223 | action = 3 224 | elif key.char == 't': 225 | action = 4 226 | elif key.char == 'y':### 227 | action = 5 228 | elif key.char == 'u': 229 | action = 6 230 | elif key.char == 'i': 231 | action = 7 232 | elif key.char == 'o': 233 | action = 8 234 | print(action) 235 | except:pass 236 | 237 | def on_release(key): 238 | global action 239 | action = 9 240 | 241 | 242 | if __name__ == "__main__": 243 | with open('pid_num.txt', 'w') as f: 244 | f.write(str(-1)) 245 | from pynput import keyboard 246 | start = time.time() 247 | steps = 1 248 | 249 | envs = [] 250 | 251 | for i in range(1): 252 | env = DolphinEnv() 253 | #env = gym.wrappers.ResizeObservation(env,(54,100)) 254 | #env = gym.wrappers.FrameStack(env, 3) 255 | envs.append(env) 256 | #env = wrap_env(env,4) 257 | #envs.append(env) 258 | 259 | 260 | print(env.observation_space) 261 | print(env.action_space) 262 | action = 1 263 | steps = 1 264 | 265 | 266 | """while True: 267 | 268 | for env in envs: 269 | state = env.reset() 270 | 271 | terminal = False 272 | 273 | 274 | while True: 275 | for env in envs: 276 | steps += 1 277 | state,reward,terminal,_ = env.step(action) 278 | if terminal: 279 | env.reset() 280 | print("Fps: " + str(steps / (time.time() - start)))""" 281 | 282 | listener = keyboard.Listener( 283 | on_press=on_press, 284 | on_release=on_release) 285 | listener.start() 286 | 287 | tot_reward = 0 288 | avg_reward = 0 289 | max_avg_reward = 0 290 | steps = 1 291 | while True: 292 | 293 | state = env.reset() 294 | print("Reset Environment") 295 | 296 | print("Fps: " + str(steps / (time.time() - start))) 297 | 298 | print("Total Reward: " + str(tot_reward)) 299 | print("AVG Reward: " + str(tot_reward / steps)) 300 | print("MAX AVG Reward: " + str(max_avg_reward)) 301 | 302 | terminal = False 303 | trun = False 304 | tot_reward = 0 305 | 306 | avg_reward = 0 307 | max_avg_reward = 0 308 | 309 | while not terminal and not trun: 310 | steps += 1 311 | 312 | state,reward,terminal,_ = env.step(action) 313 | tot_reward += reward 314 | if reward != 0: 315 | print(reward) 316 | 317 | if tot_reward / steps > max_avg_reward: 318 | max_avg_reward = tot_reward / steps 319 | 320 | 321 | 322 | 323 | 324 | -------------------------------------------------------------------------------- /EffRainbow/networks.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines all the neural network architectures available to use. 
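Included are the Nature CNN, a dueling variant, and the small and large IMPALA CNNs,
along with the Dueling head and FactorizedNoisyLinear building blocks they can use.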
3 | """ 4 | from functools import partial 5 | from math import sqrt 6 | 7 | import torch 8 | from torch import nn as nn, Tensor 9 | from torch.nn import init 10 | import torch.nn.functional as F 11 | import numpy as np 12 | from lib import dqn_model 13 | import time 14 | 15 | #import kornia 16 | from torchvision.utils import save_image 17 | 18 | class FactorizedNoisyLinear(nn.Module): 19 | """ The factorized Gaussian noise layer for noisy-nets dqn. """ 20 | def __init__(self, in_features: int, out_features: int, sigma_0: float) -> None: 21 | super().__init__() 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.sigma_0 = sigma_0 25 | 26 | # weight: w = \mu^w + \sigma^w . \epsilon^w 27 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 28 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 29 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 30 | 31 | # bias: b = \mu^b + \sigma^b . \epsilon^b 32 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 33 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 34 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 35 | 36 | self.reset_parameters() 37 | self.reset_noise() 38 | 39 | @torch.no_grad() 40 | def reset_parameters(self) -> None: 41 | # initialization is similar to Kaiming uniform (He. initialization) with fan_mode=fan_in 42 | scale = 1 / sqrt(self.in_features) 43 | 44 | init.uniform_(self.weight_mu, -scale, scale) 45 | init.uniform_(self.bias_mu, -scale, scale) 46 | 47 | init.constant_(self.weight_sigma, self.sigma_0 * scale) 48 | init.constant_(self.bias_sigma, self.sigma_0 * scale) 49 | 50 | @torch.no_grad() 51 | def _get_noise(self, size: int) -> Tensor: 52 | noise = torch.randn(size, device=self.weight_mu.device) 53 | # f(x) = sgn(x)sqrt(|x|) 54 | return noise.sign().mul_(noise.abs().sqrt_()) 55 | 56 | @torch.no_grad() 57 | def reset_noise(self) -> None: 58 | # like in eq 10 and 11 of the paper 59 | epsilon_in = self._get_noise(self.in_features) 60 | epsilon_out = self._get_noise(self.out_features) 61 | self.weight_epsilon.copy_(epsilon_out.outer(epsilon_in)) 62 | self.bias_epsilon.copy_(epsilon_out) 63 | 64 | @torch.no_grad() 65 | def disable_noise(self) -> None: 66 | self.weight_epsilon[:] = 0 67 | self.bias_epsilon[:] = 0 68 | 69 | def forward(self, input: Tensor) -> Tensor: 70 | # y = wx + d, where 71 | # w = \mu^w + \sigma^w * \epsilon^w 72 | # b = \mu^b + \sigma^b * \epsilon^b 73 | return F.linear(input, 74 | self.weight_mu + self.weight_sigma*self.weight_epsilon, 75 | self.bias_mu + self.bias_sigma*self.bias_epsilon) 76 | 77 | class Dueling(nn.Module): 78 | """ The dueling branch used in all nets that use dueling-dqn. """ 79 | def __init__(self, value_branch, advantage_branch): 80 | super().__init__() 81 | self.flatten = nn.Flatten() 82 | self.value_branch = value_branch 83 | self.advantage_branch = advantage_branch 84 | 85 | #@torch.autocast('cuda') 86 | def forward(self, x, advantages_only=False): 87 | x = self.flatten(x) 88 | advantages = self.advantage_branch(x) 89 | if advantages_only: 90 | return advantages 91 | 92 | value = self.value_branch(x) 93 | return value + (advantages - torch.mean(advantages, dim=1, keepdim=True)) 94 | 95 | 96 | class DuelingAlt(nn.Module): 97 | """ The dueling branch used in all nets that use dueling-dqn. 
""" 98 | def __init__(self, l1, l2): 99 | super().__init__() 100 | self.main = nn.Sequential( 101 | nn.Flatten(), 102 | l1, 103 | nn.ReLU(), 104 | l2 105 | ) 106 | 107 | def forward(self, x, advantages_only=False): 108 | res = self.main(x) 109 | advantages = res[:, 1:] 110 | value = res[:, 0:1] 111 | return value + (advantages - torch.mean(advantages, dim=1, keepdim=True)) 112 | 113 | class NatureCNN(nn.Module): 114 | """ 115 | This is the CNN that was introduced in Mnih et al. (2013) and then used in a lot of later work such as 116 | Mnih et al. (2015) and the Rainbow paper. This implementation only works with a frame resolution of 84x84. 117 | """ 118 | def __init__(self, depth, actions, linear_layer): 119 | super().__init__() 120 | 121 | self.main = nn.Sequential( 122 | nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4), 123 | nn.ReLU(), 124 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 125 | nn.ReLU(), 126 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 127 | nn.ReLU(), 128 | nn.Flatten(), 129 | linear_layer(3136, 512), 130 | nn.ReLU(), 131 | linear_layer(512, actions), 132 | ) 133 | 134 | def forward(self, x, advantages_only=None): 135 | return self.main(x) 136 | 137 | 138 | class DuelingNatureCNN(nn.Module): 139 | """ 140 | Implementation of the dueling architecture introduced in Wang et al. (2015). 141 | This implementation only works with a frame resolution of 84x84. 142 | """ 143 | def __init__(self, depth, actions, linear_layer): 144 | super().__init__() 145 | 146 | self.main = nn.Sequential( 147 | nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4), 148 | nn.ReLU(), 149 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 150 | nn.ReLU(), 151 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 152 | nn.ReLU(), 153 | ) 154 | 155 | self.dueling = Dueling( 156 | nn.Sequential(linear_layer(3136, 512), 157 | nn.ReLU(), 158 | linear_layer(512, 1)), 159 | nn.Sequential(linear_layer(3136, 512), 160 | nn.ReLU(), 161 | linear_layer(512, actions)) 162 | ) 163 | 164 | def forward(self, x, advantages_only=False): 165 | f = self.main(x) 166 | return self.dueling(f, advantages_only=advantages_only) 167 | 168 | 169 | class ImpalaCNNSmall(nn.Module): 170 | """ 171 | Implementation of the small variant of the IMPALA CNN introduced in Espeholt et al. (2018). 172 | """ 173 | def __init__(self, depth, actions): 174 | super().__init__() 175 | 176 | self.main = nn.Sequential( 177 | nn.Conv2d(in_channels=depth, out_channels=16, kernel_size=8, stride=4), 178 | nn.ReLU(), 179 | nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2), 180 | nn.ReLU(), 181 | ) 182 | 183 | self.pool = torch.nn.AdaptiveMaxPool2d((6, 6)) 184 | 185 | self.dueling = Dueling( 186 | nn.Sequential(dqn_model.NoisyLinear(1152, 256), 187 | nn.ReLU(), 188 | dqn_model.NoisyLinear(256, 1)), 189 | nn.Sequential(dqn_model.NoisyLinear(1152, 256), 190 | nn.ReLU(), 191 | dqn_model.NoisyLinear(256, actions)) 192 | ) 193 | 194 | def _get_conv_out(self, shape): 195 | o = self.main(torch.zeros(1, *shape)) 196 | return int(np.prod(o.size())) 197 | 198 | def forward(self, x, advantages_only=False): 199 | x = x.float() / 256 200 | f = self.main(x) 201 | f = self.pool(f) 202 | return self.dueling(f, advantages_only=advantages_only) 203 | 204 | 205 | class ImpalaCNNResidual(nn.Module): 206 | """ 207 | Simple residual block used in the large IMPALA CNN. 
208 | """ 209 | def __init__(self, depth, norm_func): 210 | super().__init__() 211 | 212 | self.relu = nn.ReLU() 213 | self.conv_0 = norm_func(nn.Conv2d(in_channels=depth, out_channels=depth, kernel_size=3, stride=1, padding=1)) 214 | self.conv_1 = norm_func(nn.Conv2d(in_channels=depth, out_channels=depth, kernel_size=3, stride=1, padding=1)) 215 | 216 | #@torch.autocast('cuda') 217 | def forward(self, x): 218 | x_ = self.conv_0(self.relu(x)) 219 | x_ = self.conv_1(self.relu(x_)) 220 | return x+x_ 221 | 222 | class ImpalaCNNBlock(nn.Module): 223 | """ 224 | Three of these blocks are used in the large IMPALA CNN. 225 | """ 226 | def __init__(self, depth_in, depth_out, norm_func): 227 | super().__init__() 228 | 229 | self.conv = nn.Conv2d(in_channels=depth_in, out_channels=depth_out, kernel_size=3, stride=1, padding=1) 230 | self.max_pool = nn.MaxPool2d(3, 2, padding=1) 231 | self.residual_0 = ImpalaCNNResidual(depth_out, norm_func=norm_func) 232 | self.residual_1 = ImpalaCNNResidual(depth_out, norm_func=norm_func) 233 | 234 | #@torch.autocast('cuda') 235 | def forward(self, x): 236 | x = self.conv(x) 237 | x = self.max_pool(x) 238 | x = self.residual_0(x) 239 | x = self.residual_1(x) 240 | return x 241 | 242 | 243 | class ImpalaCNNLarge(nn.Module): 244 | """ 245 | Implementation of the large variant of the IMPALA CNN introduced in Espeholt et al. (2018). 246 | """ 247 | def __init__(self, in_depth, actions, model_size=4, spectral_norm='all'): 248 | super().__init__() 249 | 250 | self.start = time.time() 251 | self.model_size = model_size 252 | self.actions = actions 253 | 254 | def identity(p): return p 255 | 256 | norm_func = torch.nn.utils.spectral_norm if (spectral_norm == 'all') else identity 257 | norm_func_last = torch.nn.utils.spectral_norm if (spectral_norm == 'last' or spectral_norm == 'all') else identity 258 | 259 | self.main = nn.Sequential( 260 | ImpalaCNNBlock(in_depth, 16*model_size, norm_func=norm_func), 261 | ImpalaCNNBlock(16*model_size, 32*model_size, norm_func=norm_func), 262 | ImpalaCNNBlock(32*model_size, 32*model_size, norm_func=norm_func_last), 263 | nn.ReLU() 264 | ) 265 | 266 | self.pool = torch.nn.AdaptiveMaxPool2d((8, 8)) 267 | 268 | self.dueling = Dueling( 269 | nn.Sequential(nn.Linear(2048*model_size, 256), 270 | nn.ReLU(), 271 | nn.Linear(256, 1)), 272 | nn.Sequential(nn.Linear(2048*model_size, 256), 273 | nn.ReLU(), 274 | nn.Linear(256, actions)) 275 | ) 276 | 277 | def reset_mlps(self): 278 | self.dueling = Dueling( 279 | nn.Sequential(nn.Linear(2048*self.model_size, 256), 280 | nn.ReLU(), 281 | nn.Linear(256, 1)), 282 | nn.Sequential(nn.Linear(2048*self.model_size, 256), 283 | nn.ReLU(), 284 | nn.Linear(256, self.actions)) 285 | ) 286 | 287 | def _get_conv_out(self, shape): 288 | o = self.main(torch.zeros(1, *shape)) 289 | return int(np.prod(o.size())) 290 | 291 | def forward(self, x, advantages_only=False): 292 | x = x.float() / 256 293 | """if test: 294 | save_image(x[0], 'img1.png') 295 | save_image(x[1], 'img2.png') 296 | save_image(x[2], 'img3.png') 297 | 298 | raise Exception("stop")""" 299 | 300 | f = self.main(x) 301 | f = self.pool(f) 302 | return self.dueling(f, advantages_only=advantages_only) 303 | 304 | def save_checkpoint(self): 305 | #print('... saving checkpoint ...') 306 | torch.save(self.state_dict(), "current_model" + str(int(time.time() - self.start))) 307 | 308 | def load_checkpoint(self): 309 | #print('... 
loading checkpoint ...') 310 | self.load_state_dict(torch.load("current_model260538")) 311 | 312 | 313 | def get_model(model_str, spectral_norm): 314 | if model_str == 'nature': return NatureCNN 315 | elif model_str == 'dueling': return DuelingNatureCNN 316 | elif model_str == 'impala_small': return ImpalaCNNSmall 317 | elif model_str.startswith('impala_large:'): 318 | return partial(ImpalaCNNLarge, model_size=int(model_str[13:]), spectral_norm=spectral_norm) 319 | -------------------------------------------------------------------------------- /EffRainbow/ptan_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import operator 4 | from datetime import timedelta 5 | import numpy as np 6 | import collections 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | class SMAQueue: 13 | """ 14 | Queue of fixed size with mean, max, min operations 15 | """ 16 | def __init__(self, size): 17 | self.queue = collections.deque() 18 | self.size = size 19 | 20 | def __iadd__(self, other): 21 | if isinstance(other, (list, tuple)): 22 | self.queue.extend(other) 23 | else: 24 | self.queue.append(other) 25 | while len(self.queue) > self.size: 26 | self.queue.popleft() 27 | return self 28 | 29 | def __len__(self): 30 | return len(self.queue) 31 | 32 | def __repr__(self): 33 | return "SMAQueue(size=%d)" % self.size 34 | 35 | def __str__(self): 36 | return "SMAQueue(size=%d, len=%d)" % (self.size, len(self.queue)) 37 | 38 | def min(self): 39 | if not self.queue: 40 | return None 41 | return np.min(self.queue) 42 | 43 | def mean(self): 44 | if not self.queue: 45 | return None 46 | return np.mean(self.queue) 47 | 48 | def max(self): 49 | if not self.queue: 50 | return None 51 | return np.max(self.queue) 52 | 53 | 54 | class SpeedMonitor: 55 | def __init__(self, batch_size, autostart=True): 56 | self.batch_size = batch_size 57 | self.start_ts = None 58 | self.batches = None 59 | if autostart: 60 | self.reset() 61 | 62 | def epoch(self): 63 | if self.epoches is not None: 64 | self.epoches += 1 65 | 66 | def batch(self): 67 | if self.batches is not None: 68 | self.batches += 1 69 | 70 | def reset(self): 71 | self.start_ts = time.time() 72 | self.batches = 0 73 | self.epoches = 0 74 | 75 | def seconds(self): 76 | """ 77 | Seconds since last reset 78 | :return: 79 | """ 80 | return time.time() - self.start_ts 81 | 82 | def samples_per_sec(self): 83 | """ 84 | Calculate samples per second since last reset() call 85 | :return: float count samples per second or None if not started 86 | """ 87 | if self.start_ts is None: 88 | return None 89 | secs = self.seconds() 90 | if abs(secs) < 1e-5: 91 | return 0.0 92 | return (self.batches + 1) * self.batch_size / secs 93 | 94 | def epoch_time(self): 95 | """ 96 | Calculate average epoch time 97 | :return: timedelta object 98 | """ 99 | if self.start_ts is None: 100 | return None 101 | s = self.seconds() 102 | if self.epoches > 0: 103 | s /= self.epoches + 1 104 | return timedelta(seconds=s) 105 | 106 | def batch_time(self): 107 | """ 108 | Calculate average batch time 109 | :return: timedelta object 110 | """ 111 | if self.start_ts is None: 112 | return None 113 | s = self.seconds() 114 | if self.batches > 0: 115 | s /= self.batches + 1 116 | return timedelta(seconds=s) 117 | 118 | 119 | class WeightedMSELoss(nn.Module): 120 | def __init__(self, size_average=True): 121 | super(WeightedMSELoss, self).__init__() 122 | self.size_average = size_average 123 | 124 | def forward(self, input, target, 
weights=None): 125 | if weights is None: 126 | return nn.MSELoss(self.size_average)(input, target) 127 | 128 | loss_rows = (input - target) ** 2 129 | if len(loss_rows.size()) != 1: 130 | loss_rows = torch.sum(loss_rows, dim=1) 131 | res = (weights * loss_rows).sum() 132 | if self.size_average: 133 | res /= len(weights) 134 | return res 135 | 136 | 137 | class SegmentTree(object): 138 | def __init__(self, capacity, operation, neutral_element): 139 | """Build a Segment Tree data structure. 140 | 141 | https://en.wikipedia.org/wiki/Segment_tree 142 | 143 | Can be used as regular array, but with two 144 | important differences: 145 | 146 | a) setting item's value is slightly slower. 147 | It is O(lg capacity) instead of O(1). 148 | b) user has access to an efficient `reduce` 149 | operation which reduces `operation` over 150 | a contiguous subsequence of items in the 151 | array. 152 | 153 | Paramters 154 | --------- 155 | capacity: int 156 | Total size of the array - must be a power of two. 157 | operation: lambda obj, obj -> obj 158 | and operation for combining elements (eg. sum, max) 159 | must for a mathematical group together with the set of 160 | possible values for array elements. 161 | neutral_element: obj 162 | neutral element for the operation above. eg. float('-inf') 163 | for max and 0 for sum. 164 | """ 165 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 166 | self._capacity = capacity 167 | self._value = [neutral_element for _ in range(2 * capacity)] 168 | self._operation = operation 169 | 170 | def _reduce_helper(self, start, end, node, node_start, node_end): 171 | if start == node_start and end == node_end: 172 | return self._value[node] 173 | mid = (node_start + node_end) // 2 174 | if end <= mid: 175 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 176 | else: 177 | if mid + 1 <= start: 178 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 179 | else: 180 | return self._operation( 181 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 182 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 183 | ) 184 | 185 | def reduce(self, start=0, end=None): 186 | """Returns result of applying `self.operation` 187 | to a contiguous subsequence of the array. 188 | 189 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 190 | 191 | Parameters 192 | ---------- 193 | start: int 194 | beginning of the subsequence 195 | end: int 196 | end of the subsequences 197 | 198 | Returns 199 | ------- 200 | reduced: obj 201 | result of reducing self.operation over the specified range of array elements. 
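        Note: `end` is treated as exclusive, so reduce(start, end) combines
        arr[start] ... arr[end - 1]; with end=None the reduction covers the
        whole backing array.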
202 | """ 203 | if end is None: 204 | end = self._capacity 205 | if end < 0: 206 | end += self._capacity 207 | end -= 1 208 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 209 | 210 | def __setitem__(self, idx, val): 211 | # index of the leaf 212 | idx += self._capacity 213 | self._value[idx] = val 214 | idx //= 2 215 | while idx >= 1: 216 | self._value[idx] = self._operation( 217 | self._value[2 * idx], 218 | self._value[2 * idx + 1] 219 | ) 220 | idx //= 2 221 | 222 | def __getitem__(self, idx): 223 | assert 0 <= idx < self._capacity 224 | return self._value[self._capacity + idx] 225 | 226 | 227 | class SumSegmentTree(SegmentTree): 228 | def __init__(self, capacity): 229 | super(SumSegmentTree, self).__init__( 230 | capacity=capacity, 231 | operation=operator.add, 232 | neutral_element=0.0 233 | ) 234 | 235 | def sum(self, start=0, end=None): 236 | """Returns arr[start] + ... + arr[end]""" 237 | return super(SumSegmentTree, self).reduce(start, end) 238 | 239 | def find_prefixsum_idx(self, prefixsum): 240 | """Find the highest index `i` in the array such that 241 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 242 | 243 | if array values are probabilities, this function 244 | allows to sample indexes according to the discrete 245 | probability efficiently. 246 | 247 | Parameters 248 | ---------- 249 | perfixsum: float 250 | upperbound on the sum of array prefix 251 | 252 | Returns 253 | ------- 254 | idx: int 255 | highest index satisfying the prefixsum constraint 256 | """ 257 | assert 0 <= prefixsum <= self.sum() + 1e-5 258 | idx = 1 259 | while idx < self._capacity: # while non-leaf 260 | if self._value[2 * idx] > prefixsum: 261 | idx = 2 * idx 262 | else: 263 | prefixsum -= self._value[2 * idx] 264 | idx = 2 * idx + 1 265 | return idx - self._capacity 266 | 267 | 268 | class MinSegmentTree(SegmentTree): 269 | def __init__(self, capacity): 270 | super(MinSegmentTree, self).__init__( 271 | capacity=capacity, 272 | operation=min, 273 | neutral_element=float('inf') 274 | ) 275 | 276 | def min(self, start=0, end=None): 277 | """Returns min(arr[start], ..., arr[end])""" 278 | 279 | return super(MinSegmentTree, self).reduce(start, end) 280 | 281 | 282 | class TBMeanTracker: 283 | """ 284 | TensorBoard value tracker: allows to batch fixed amount of historical values and write their mean into TB 285 | 286 | Designed and tested with pytorch-tensorboard in mind 287 | """ 288 | def __init__(self, writer, batch_size): 289 | """ 290 | :param writer: writer with close() and add_scalar() methods 291 | :param batch_size: integer size of batch to track 292 | """ 293 | assert isinstance(batch_size, int) 294 | assert writer is not None 295 | self.writer = writer 296 | self.batch_size = batch_size 297 | 298 | def __enter__(self): 299 | self._batches = collections.defaultdict(list) 300 | return self 301 | 302 | def __exit__(self, exc_type, exc_val, exc_tb): 303 | self.writer.close() 304 | 305 | @staticmethod 306 | def _as_float(value): 307 | assert isinstance(value, (float, int, np.ndarray, np.generic, torch.autograd.Variable)) or torch.is_tensor(value) 308 | tensor_val = None 309 | if isinstance(value, torch.autograd.Variable): 310 | tensor_val = value.data 311 | elif torch.is_tensor(value): 312 | tensor_val = value 313 | 314 | if tensor_val is not None: 315 | return tensor_val.float().mean().item() 316 | elif isinstance(value, np.ndarray): 317 | return float(np.mean(value)) 318 | else: 319 | return float(value) 320 | 321 | def track(self, param_name, value, iter_index): 322 | 
assert isinstance(param_name, str) 323 | assert isinstance(iter_index, int) 324 | 325 | data = self._batches[param_name] 326 | data.append(self._as_float(value)) 327 | 328 | if len(data) >= self.batch_size: 329 | self.writer.add_scalar(param_name, np.mean(data), iter_index) 330 | data.clear() 331 | 332 | 333 | class RewardTracker: 334 | def __init__(self, writer, min_ts_diff=1.0): 335 | """ 336 | Constructs RewardTracker 337 | :param writer: writer to use for writing stats 338 | :param min_ts_diff: minimal time difference to track speed 339 | """ 340 | self.writer = writer 341 | self.min_ts_diff = min_ts_diff 342 | 343 | def __enter__(self): 344 | self.ts = time.time() 345 | self.ts_frame = 0 346 | self.total_rewards = [] 347 | return self 348 | 349 | def __exit__(self, *args): 350 | self.writer.close() 351 | 352 | def reward(self, reward, frame, epsilon=None): 353 | self.total_rewards.append(reward) 354 | mean_reward = np.mean(self.total_rewards[-100:]) 355 | ts_diff = time.time() - self.ts 356 | if ts_diff > self.min_ts_diff: 357 | speed = (frame - self.ts_frame) / ts_diff 358 | self.ts_frame = frame 359 | self.ts = time.time() 360 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 361 | print("%d: done %d episodes, mean reward %.3f, speed %.2f f/s%s" % ( 362 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 363 | )) 364 | sys.stdout.flush() 365 | self.writer.add_scalar("speed", speed, frame) 366 | if epsilon is not None: 367 | self.writer.add_scalar("epsilon", epsilon, frame) 368 | self.writer.add_scalar("reward_100", mean_reward, frame) 369 | self.writer.add_scalar("reward", reward, frame) 370 | return mean_reward if len(self.total_rewards) > 30 else None 371 | -------------------------------------------------------------------------------- /FelkFork/DolphinSideScriptTanks.py: -------------------------------------------------------------------------------- 1 | 2 | #Window is 500x270 when captured 3 | 4 | try: 5 | import sys 6 | #sys.path.append("C:\\Users\\TYLER\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages") 7 | sys.path.append("/home/tyler/anaconda3/envs/effzero/lib/python3.8/site-packages") 8 | except Exception as e: 9 | with open('logg.txt', 'a') as f: 10 | f.write(str(e)) 11 | raise Exception("stop") 12 | 13 | import os 14 | cwd = str(os.getcwd()) 15 | cwd = cwd.split("dolphin",1)[1][0] 16 | os.chdir('/home/tyler/Documents/EfficientZero') 17 | 18 | with open('logg.txt', 'a') as f: 19 | f.write('Path Changed... 
Again') 20 | 21 | with open('logg.txt', 'a') as f: 22 | f.write("PID:" + str(cwd)) 23 | 24 | pid = int(cwd) 25 | 26 | from multiprocessing import shared_memory,Lock 27 | import numpy as np 28 | 29 | with open('logg.txt', 'a') as f: 30 | f.write('half libraries installed\n') 31 | 32 | from PIL import Image 33 | import math 34 | import time 35 | import random 36 | 37 | with open('logg.txt', 'a') as f: 38 | f.write('Imported Some\n') 39 | 40 | with open('logg.txt', 'a') as f: 41 | f.write('Got Pid ' + str(pid) + '\n') 42 | #Ymem = 270 43 | #Xmem = 500 44 | #Ymem = 108 45 | #Xmem = 200 46 | 47 | Ymem = 78 48 | Xmem = 94 49 | 50 | ##78x94 51 | 52 | #div by 6.8 53 | 54 | try: 55 | data = np.zeros((Ymem + 1,Xmem),dtype=np.float32) 56 | shm = shared_memory.SharedMemory(name='p' + str(pid)) 57 | with open('logg.txt', 'a') as f: 58 | f.write('Joined Shared Memory') 59 | 60 | except Exception as e: 61 | with open('logg.txt', 'a') as f: 62 | f.write(str(e)) 63 | 64 | with open('logg.txt', 'a') as f: 65 | f.write(' Failed to create Shared Memory') 66 | 67 | raise Exception("Stop - failed to create shared mem") 68 | 69 | #import mss 70 | #import dxcam 71 | 72 | from dolphin import event, gui,savestate,memory,controller 73 | with open('logg.txt', 'a') as f: 74 | f.write('\nImported FelkLibs') 75 | 76 | class DolphinSideEnv(): 77 | def __init__(self,pid=0,offset = 0): 78 | 79 | """ 80 | shared mem is in following format: 81 | 82 | This needs to be changed to this format: 83 | 84 | arr = np.zeros((101,60),dtype=np.float32) 85 | 86 | arr[0][0] = Dtimestep 87 | arr[0][1] = Etimestep 88 | arr[0][2] = action 89 | arr[0][3] = reward 90 | arr[0][4] = terminal 91 | 92 | arr[1:] = state 93 | 94 | """ 95 | 96 | #about 78fs with mss 97 | 98 | #about 60 with dxcam (the weird method) 99 | 100 | ########### Game Code 101 | 102 | self.last_action = 0 103 | 104 | ##################### End Game Code 105 | 106 | self.offset = offset 107 | 108 | self.window_header = 40 109 | self.window_width = 100 110 | self.window_height = 60 111 | 112 | pidx = pid % 5 113 | pidy = math.floor(pid / 5) 114 | 115 | #self.monitor = {"top": 32 + (270 + 32) * pidy, "left": pidx*500, "width": 500, "height": 270} 116 | #self.monitor = {"top": 32 + (Ymem + 32) * pidy, "left": pidx*(Xmem + 1), "width": Xmem, "height": Ymem} 117 | 118 | self.frameskip = 4 119 | 120 | self.timestep = 0. 
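        # (Added note) Shared-memory layout used below: row 0 is a control row --
        # [0][0] Dtimestep (written by this script), [0][1] Etimestep (written by
        # the other process), [0][2] action, [0][3] reward, [0][4] terminal, each
        # shifted by self.offset -- and rows [1:] hold the 78x94 greyscale frame.
        # This script bumps [0][0] after publishing a frame and spin-waits until
        # the other side echoes the same value into [0][1] before reading the
        # action from [0][2].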
121 | 122 | self.current_step = 0 123 | 124 | with open('logg.txt', 'a') as f: 125 | f.write('About to make data array\n') 126 | 127 | self.data = np.zeros((Ymem + 1,Xmem),dtype=np.float32)#np.zeros(self.dims,dtype=np.float32) 128 | 129 | self.shm_array = np.ndarray(self.data.shape, dtype=self.data.dtype, buffer=shm.buf) 130 | 131 | with open('logg.txt', 'a') as f: 132 | f.write('shared mem\n') 133 | 134 | self.reset() 135 | 136 | with open('logg.txt', 'a') as f: 137 | f.write('Init Reset Successful\n') 138 | 139 | def reset(self): 140 | 141 | self.current_step = 0 142 | 143 | ########### Game Code 144 | self.movement_inc = 0.015 145 | x = np.random.random() 146 | global change 147 | change = False 148 | savestate.load_from_slot(1)#random.randint(1,8) 149 | 150 | """if x < 0.5: 151 | savestate.load_from_slot(1) 152 | elif x < 0.6: 153 | savestate.load_from_slot(5) 154 | elif x < 0.75: 155 | savestate.load_from_slot(4) 156 | elif x < 0.95: 157 | savestate.load_from_slot(2) 158 | else: 159 | savestate.load_from_slot(3)""" 160 | #else: 161 | #savestate.load_from_slot(4) 162 | 163 | change = True 164 | 165 | self.numEnemies = memory.read_u32(0x91CFA9E8) 166 | self.numLives = memory.read_u32(0x91D27ED0) 167 | self.x = 0 168 | self.y = 0 169 | 170 | ##################### End Game Code 171 | start = time.time() 172 | while True: 173 | time.sleep(0.5) 174 | 175 | if time.time() - start > 10: 176 | time.sleep(10) 177 | with open('logg.txt', 'a') as f: 178 | f.write("Waiting 10+ seconds! PID: " + str(pid)) 179 | f.write('\nWaiting for Reset... ' + str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 180 | f.write("\n\n") 181 | #self.timestep = 0 182 | #self.shm_array[0][0 + self.offset] = self.timestep 183 | start = time.time() 184 | 185 | if self.shm_array[0][1 + self.offset] == self.timestep: 186 | break 187 | 188 | #write state 189 | self.shm_array[0][4 + self.offset] = 0. 190 | self.shm_array[0][3 + self.offset] = 0. 191 | 192 | self.shm_array[1:] = np.zeros((Ymem,Xmem),dtype=np.float32)#self.get_state() 193 | 194 | self.timestep += 1 195 | self.shm_array[0][0 + self.offset] = self.timestep 196 | 197 | self.dic = {"Left":False,"Right":False,"Down":False,"Up":False, \ 198 | "Plus":False,"Minus":False,"One":False,"Two":False, \ 199 | "A":False,"B":False,"Home":False} 200 | 201 | def get_state(self): 202 | 203 | #event.on_framedrawn(show_screenshot) 204 | 205 | 206 | return img[:] 207 | 208 | def get_state_old(self): 209 | 210 | with mss.mss() as sct: 211 | 212 | # Part of the screen to capture 213 | #im = 0.07 * im[:,:,2] + 0.72 * im[:,:,1] + 0.21 * im[:,:,0] 214 | # Get raw pixels from the screen, save it to a Numpy array 215 | im = np.array(sct.grab(self.monitor)) 216 | 217 | #im = 0.0002745098 * im[:,:,2] + 0.00282352941 * im[:,:,1] + 0.00082352941 * im[:,:,0] 218 | im = 0.07 * im[:,:,2] + 0.72 * im[:,:,1] + 0.21 * im[:,:,0] 219 | #im = im.astype(np.float32) 220 | 221 | return im 222 | 223 | def get_state_dx(self): 224 | 225 | im = self.camera.get_latest_frame() 226 | im = np.squeeze(im) 227 | im = np.true_divide(im,255,dtype=np.float32) 228 | #im = cv2.resize(im, dsize=(54, 100), interpolation=cv2.INTER_CUBIC) 229 | #im = np.swapaxes(im,0,1) 230 | 231 | return im 232 | 233 | def get_reward_terminal(self): 234 | # Returns reward,terminal,trun 235 | controller.set_wii_buttons(0,self.dic) 236 | self.current_step += 1 237 | 238 | ########### Game Code 239 | terminal = False 240 | reward = 0. 
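        # (Added note) Reward and terminal come straight from emulated memory:
        # 0x91CFA9E8 holds the remaining enemy count and 0x91D27ED0 the remaining
        # lives. Losing a life ends the episode with reward -1; the enemy count
        # jumping back up (a new round loading) ends it with +1; otherwise the
        # reward is the number of enemies destroyed since the previous step.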
241 | 242 | numEnemies = memory.read_u32(0x91CFA9E8) 243 | numLives = memory.read_u32(0x91D27ED0) 244 | 245 | #check if we died 246 | if numLives < self.numLives: 247 | return -1., True, False 248 | 249 | 250 | #check if the round ended 251 | if numEnemies > self.numEnemies: 252 | return 1., True, False 253 | 254 | #get kills 255 | reward = self.numEnemies - numEnemies 256 | 257 | self.numEnemies = numEnemies 258 | self.numLives = numLives 259 | 260 | ##################### End Game Code 261 | 262 | #remove this 263 | if random.randint(1,60) == 25: 264 | terminal = True 265 | 266 | return reward,terminal,False 267 | 268 | def apply_action(self,action): 269 | 270 | """ 271 | self.dic = {"Left":False,"Right":False,"Down":False,"Up":False, \ 272 | "Plus":False,"Minus":False,"One":False,"Two":False, \ 273 | "A":False,"B":False,"Home":False} 274 | 275 | """ 276 | self.last_action = action 277 | self.dic = {"Left":False,"Right":False,"Down":False,"Up":False, \ 278 | "Plus":False,"Minus":False,"One":False,"Two":False, \ 279 | "A":False,"B":False,"Home":False} 280 | 281 | #REMOVE THIS LINE 282 | action = random.randint(0,8) 283 | 284 | if action == 0: 285 | self.dic["Left"] = True 286 | elif action == 1: 287 | self.dic["Right"] = True 288 | elif action == 2: 289 | self.dic["Up"] = True 290 | elif action == 3: 291 | self.dic["Down"] = True 292 | elif action == 4: 293 | self.x += self.movement_inc 294 | elif action == 5: 295 | self.x -= self.movement_inc 296 | elif action == 6: 297 | self.y += self.movement_inc 298 | elif action == 7: 299 | self.y -= self.movement_inc 300 | elif action == 8: 301 | self.dic["B"] = True 302 | 303 | self.x = max(-0.32,min(self.x,0.32)) 304 | self.y = max(-0.16, min(self.y, 0.08)) 305 | 306 | controller.set_wii_ircamera_transform(0,self.x,self.y,-2,0,0,0) 307 | controller.set_wii_buttons(0,self.dic) 308 | 309 | def step(self): 310 | 311 | #get action 312 | #sync 313 | while True: 314 | start = time.time() 315 | time.sleep(0.001) 316 | """with open('logg.txt', 'a') as f: 317 | f.write('\nWaiting for ETimestep... ' + str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep))""" 318 | 319 | if self.shm_array[0][1 + self.offset] == self.timestep: 320 | break 321 | 322 | if time.time() - start > 10: 323 | with open('logg.txt', 'a') as f: 324 | f.write('\nDolphin has been awaiting respose for 10+ seconds! Pid: ' + str(pid)) 325 | f.write('\nWaiting for Step... ' + str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 326 | f.write("\n\n") 327 | 328 | try: 329 | self.apply_action(self.shm_array[0][2 + self.offset]) 330 | except: 331 | print("Error at apply action! 
PID: " + str(pid)) 332 | time.sleep(1) 333 | 334 | 335 | def step2(self,reward,terminal,trun,image): 336 | 337 | #with open('logg.txt', 'a') as f: 338 | #f.write('\nWriting timestep: ' + str(self.timestep)) 339 | 340 | #image = cv2.resize(image,(78,94), interpolation=cv2.INTER_AREA) 341 | 342 | #send back data 343 | 344 | image = image.resize((94,78)) 345 | image = image.convert("RGB") 346 | 347 | img1 = np.asarray(image) 348 | img1 = img1[...,::-1] 349 | image = np.dot(img1[...,:3], [0.2989, 0.5870, 0.1140]) 350 | 351 | #if random.random() > 0.99: 352 | #cv2.imwrite("filename.png", img) 353 | 354 | image = image.astype(np.float32) 355 | 356 | self.shm_array[0][4 + self.offset] = float(terminal) 357 | self.shm_array[0][3 + self.offset] = reward 358 | self.shm_array[1:] = image#self.get_state()np.zeros((Ymem,Xmem),dtype=np.float32)# 359 | 360 | self.timestep += 1 361 | self.shm_array[0][0 + self.offset] = self.timestep 362 | 363 | if terminal or trun: 364 | self.reset() 365 | 366 | """def show_screenshot(width: int, height: int, data: bytes): 367 | #print(f"received {width}x{height} image of length {len(data)}") 368 | # data is RGBA, so its size is width*height*4 369 | 370 | if change: 371 | global img 372 | img = deepcopy(Image.frombytes('RGBA', (width,height), data, 'raw')) 373 | 374 | 375 | return""" 376 | #img = np.zeros((528,640),dtype=np.float32) 377 | 378 | #img = np.zeros((94,78),dtype=np.uint8) 379 | change = True 380 | #event.on_framedrawn(show_screenshot) 381 | 382 | for i in range(4): 383 | await event.frameadvance() 384 | 385 | env = DolphinSideEnv(pid=pid) 386 | 387 | for i in range(env.frameskip): 388 | await event.frameadvance() 389 | 390 | reward = 0 391 | terminal = False 392 | trun = False 393 | red = 0xffff0000 394 | 395 | with open('logg.txt', 'a') as f: 396 | f.write('\nEntering Main While loop') 397 | 398 | while True: 399 | env.step() 400 | 401 | for i in range(env.frameskip): 402 | (width,height,data) = await event.framedrawn() 403 | 404 | rewardN,terminalN,trunN = env.get_reward_terminal() 405 | #env.apply_action(env.last_action) 406 | gui.draw_text((10, 90), red, str(env.last_action)) 407 | 408 | #with open('logg.txt', 'a') as f: 409 | #f.write('\nAfter reward terminal') 410 | 411 | terminal = terminal or terminalN 412 | trun = trun or trunN 413 | reward += rewardN 414 | if terminal or trun: 415 | for i in range(4): 416 | await event.frameadvance() 417 | break 418 | 419 | #with open('logg.txt', 'a') as f: 420 | #f.write(str(deepcopy(img))) 421 | img = Image.frombytes('RGBA', (width,height), data, 'raw') 422 | 423 | env.step2(reward,terminal,trun,img) 424 | 425 | gui.draw_text((10, 10), red, f"HI") 426 | 427 | reward = 0 428 | terminal = False 429 | trun = False 430 | 431 | 432 | -------------------------------------------------------------------------------- /MarioKartEnvBackup.py: -------------------------------------------------------------------------------- 1 | import win32gui 2 | import win32ui 3 | from ctypes import windll 4 | from PIL import Image 5 | import PIL 6 | from pywinauto import Desktop 7 | import cv2 8 | import numpy as np 9 | import ctypes, time 10 | from copy import copy,deepcopy 11 | import gym 12 | from Region import Region 13 | import pickle 14 | import math 15 | import keyboard 16 | # Bunch of stuff so that the script can send keystrokes to game # 17 | 18 | SendInput = ctypes.windll.user32.SendInput 19 | 20 | # C struct redefinitions 21 | PUL = ctypes.POINTER(ctypes.c_ulong) 22 | class KeyBdInput(ctypes.Structure): 23 | _fields_ = [("wVk", 
ctypes.c_ushort), 24 | ("wScan", ctypes.c_ushort), 25 | ("dwFlags", ctypes.c_ulong), 26 | ("time", ctypes.c_ulong), 27 | ("dwExtraInfo", PUL)] 28 | 29 | class HardwareInput(ctypes.Structure): 30 | _fields_ = [("uMsg", ctypes.c_ulong), 31 | ("wParamL", ctypes.c_short), 32 | ("wParamH", ctypes.c_ushort)] 33 | 34 | class MouseInput(ctypes.Structure): 35 | _fields_ = [("dx", ctypes.c_long), 36 | ("dy", ctypes.c_long), 37 | ("mouseData", ctypes.c_ulong), 38 | ("dwFlags", ctypes.c_ulong), 39 | ("time",ctypes.c_ulong), 40 | ("dwExtraInfo", PUL)] 41 | 42 | class Input_I(ctypes.Union): 43 | _fields_ = [("ki", KeyBdInput), 44 | ("mi", MouseInput), 45 | ("hi", HardwareInput)] 46 | 47 | class Input(ctypes.Structure): 48 | _fields_ = [("type", ctypes.c_ulong), 49 | ("ii", Input_I)] 50 | 51 | # Actuals Functions 52 | 53 | def PressKey(hexKeyCode): 54 | extra = ctypes.c_ulong(0) 55 | ii_ = Input_I() 56 | ii_.ki = KeyBdInput( 0, hexKeyCode, 0x0008, 0, ctypes.pointer(extra) ) 57 | x = Input( ctypes.c_ulong(1), ii_ ) 58 | ctypes.windll.user32.SendInput(1, ctypes.pointer(x), ctypes.sizeof(x)) 59 | 60 | def ReleaseKey(hexKeyCode): 61 | extra = ctypes.c_ulong(0) 62 | ii_ = Input_I() 63 | ii_.ki = KeyBdInput( 0, hexKeyCode, 0x0008 | 0x0002, 0, ctypes.pointer(extra) ) 64 | x = Input( ctypes.c_ulong(1), ii_ ) 65 | ctypes.windll.user32.SendInput(1, ctypes.pointer(x), ctypes.sizeof(x)) 66 | 67 | def KeyPress(key): 68 | PressKey(keys[key]) # press Q 69 | time.sleep(.05) 70 | ReleaseKey(keys[key]) #release Q 71 | 72 | def release_keys(): 73 | for key in keys: 74 | ReleaseKey(keys[key]) 75 | 76 | def push(key): 77 | PressKey(keys[key]) 78 | 79 | def release(key): 80 | ReleaseKey(keys[key]) 81 | 82 | keys = { 83 | "a": 0x1E, 84 | "b": 0x30, 85 | "w": 0x11, 86 | "n": 0x31, 87 | "m": 0x32, 88 | "`": 0x29, 89 | "\\": 0x2B, 90 | "p": 0x19, 91 | "e": 0x12, 92 | "z": 0x2C, 93 | "c": 0x2E, 94 | "d": 0x20 95 | } 96 | 97 | #32400 frames/hour! 
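# (Added note) PressKey/ReleaseKey above emit DirectInput scan codes rather than
# virtual-key codes: dwFlags 0x0008 is KEYEVENTF_SCANCODE, and 0x0008 | 0x0002 adds
# KEYEVENTF_KEYUP on release, which is typically what lets windows that ignore plain
# virtual-key events (such as the emulator) still see the presses.
# Illustrative tap, mirroring what KeyPress() does:
#     push("w"); time.sleep(0.05); release("w")   # hold accelerate for ~50 ms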
98 | #ray did 28700 99 | class MarioKartEnv(): 100 | def __init__(self,config=None): 101 | 102 | windows = Desktop(backend="uia").windows() 103 | for i in windows: 104 | if i.window_text()[:19] == "Dolphin 5.0-16101 |": 105 | window_name = i.window_text() 106 | 107 | self.hwnd = win32gui.FindWindow(None, window_name) 108 | left, top, right, bot = win32gui.GetWindowRect(self.hwnd) 109 | self.w = right - left 110 | self.h = bot - top 111 | 112 | self.template = cv2.imread('C:/Users/TYLER/Downloads/dolphin_ai_tests/env/funky_kong_img2.png') 113 | self.template = cv2.cvtColor(self.template, cv2.COLOR_RGB2GRAY) 114 | self.tem_w = 69 115 | self.tem_h = 132#100,141 116 | 117 | self.action_space = gym.spaces.Discrete(4) 118 | """ 119 | 120 | 1 - accel 121 | 2 - accel+wheely 122 | 3 - accel+drift_hold_right 123 | 4 - accel+drift_hold_left 124 | 125 | 126 | item has been removed 127 | 5 - accel + right 128 | 6 - accel + left 129 | 7 - accel + item 130 | 0 - null 131 | """ 132 | #yx 133 | #self.observation_space = gym.spaces.Box( 134 | #low=0, high=255, shape=(64, 32), dtype=np.uint8) 135 | 136 | save_name = "regions.dat" 137 | 138 | self.image_x = 950 139 | self.image_y = 1220 140 | self.grid_size = 10 141 | self.grid_x = int(self.image_x / self.grid_size) 142 | self.grid_y = int(self.image_y / self.grid_size) 143 | 144 | self.time_till_checkpoint = 4 145 | self.checkpoint_timer = time.time() 146 | 147 | self.method = eval('cv2.TM_CCOEFF') 148 | self.num_chkps = 22 149 | with open(save_name, "rb") as f: 150 | self.regions = pickle.load(f) 151 | 152 | self.reset() 153 | 154 | 155 | def reset(self): 156 | self.dist = 0 157 | self.first = True 158 | release_keys() 159 | self.held_keys = [] 160 | KeyPress("m") 161 | time.sleep(0.25) 162 | self.timer = time.time() 163 | self.prev_action = 0 164 | self.out_frames = 0 165 | self.current_chkp = -1 166 | 167 | return self.get_state()[0] 168 | 169 | def template_match(self,img): 170 | terminal = False 171 | #crop image so avoid issues -- #og image 2098, 3868 172 | img = img[680:1900, 2600: 3550] 173 | #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 174 | 175 | 176 | # Apply template Matching 177 | res = cv2.matchTemplate(img,self.template,self.method) 178 | min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res) 179 | # If the method is TM_SQDIFF or TM_SQDIFF_NORMED, take minimum 180 | 181 | if not self.first: 182 | 183 | self.prev_top_left = copy(self.top_left) 184 | 185 | self.top_left = max_loc 186 | 187 | if self.first: 188 | self.prev_top_left = copy(self.top_left) 189 | self.first = False 190 | #cv2.imwrite("bug_test" + str(time.time()) + ".jpg", img) 191 | return 0,False 192 | 193 | else: 194 | 195 | y_dif = list(self.prev_top_left)[1] - list(self.top_left)[1] 196 | x_dif = list(self.top_left)[0] - list(self.prev_top_left)[0] 197 | 198 | if True:#time.time() - self.timer > 5.4 199 | 200 | self.dist = x_dif**2 + y_dif**2 201 | 202 | #exception for broken template matching 203 | if self.dist > 950: 204 | 205 | #need to allow it to refind template next frame 206 | self.first = True 207 | reward = 0.9 208 | bottom_right2 = (self.prev_top_left[0] + self.tem_w, self.prev_top_left[1] + self.tem_h) 209 | cv2.rectangle(img,self.prev_top_left, bottom_right2, 255, 2) 210 | 211 | bottom_right = (self.top_left[0] + self.tem_w, self.top_left[1] + self.tem_h) 212 | cv2.rectangle(img,self.top_left, bottom_right, 255, 2) 213 | 214 | cv2.imwrite("wrong_pattern12" + str(round(time.time(),4)) + ".jpg", img) 215 | 216 | self.dist = 0 217 | else: 218 | 219 | #region code - 
#cropped image 1220, 880 (y,x) 220 | reward = self.get_reward(x_dif,y_dif) 221 | if self.out_frames > 3: 222 | terminal = True 223 | reward -= 150 224 | 225 | 226 | """bottom_right = (self.top_left[0] + self.tem_w, self.top_left[1] + self.tem_h) 227 | cv2.rectangle(img,self.top_left, bottom_right, 128, 2) 228 | cv2.imwrite("bug_test" + str(time.time()) + ".jpg", img) 229 | 230 | raise Exception("stop")""" 231 | 232 | else: 233 | return 0,terminal 234 | 235 | reward = reward / 30 236 | reward -= 0.03 237 | 238 | return reward,terminal 239 | 240 | def get_reward(self,x_dif,y_dif): 241 | reward = 0 242 | reset_frames = True 243 | 244 | #this are based off funky's face 245 | add_x = int(self.tem_w / 2) 246 | add_y = int(self.tem_h / 2) 247 | 248 | #center location 249 | x = self.top_left[0] + add_x 250 | y = self.top_left[1] + add_y 251 | 252 | x = math.floor(x / self.grid_size) 253 | y = math.floor(y / self.grid_size) 254 | 255 | #get which grid cell 256 | num = self.convert_xy_to_num(x,y) 257 | 258 | #check out of bounds 259 | if not self.regions[num].in_bounds: 260 | self.out_frames += 1 261 | reset_frames = False 262 | 263 | #check dir_x 264 | reward += x_dif * self.regions[num].dir_x 265 | 266 | #check dir_y 267 | reward -= y_dif * self.regions[num].dir_y 268 | 269 | #checkpoints 270 | if self.regions[num].is_chkp: 271 | if self.regions[num].chkp_num > self.current_chkp or (self.regions[num].chkp_num == 0 and self.current_chkp == self.num_chkps): 272 | reward += 65 273 | self.checkpoint_timer = time.time() 274 | #print("checkpoint: " + str(self.regions[num].chkp_num)) 275 | self.current_chkp = self.regions[num].chkp_num 276 | elif self.regions[num].chkp_num < self.current_chkp or \ 277 | (self.regions[num].chkp_num == self.num_chkps and (self.current_chkp == 0 or self.current_chkp == -1)): 278 | 279 | self.out_frames += 1 280 | reset_frames = False 281 | 282 | if reset_frames: 283 | self.out_frames = 0 284 | 285 | #timer for reaching checkpoints 286 | if time.time() - self.checkpoint_timer > self.time_till_checkpoint: 287 | self.out_frames = 10 288 | 289 | return reward 290 | 291 | def convert_xy_to_num(self,x,y): 292 | return x + y * self.grid_x 293 | 294 | def is_inside(self,point,reg_point,reg_end_point): 295 | #loop over xy 296 | for i in range(2): 297 | if not (point[i] >= reg_point[i] and point[i] <= reg_end_point[i]): 298 | return False 299 | return True 300 | 301 | 302 | def get_state(self): 303 | hwndDC = win32gui.GetWindowDC(self.hwnd) 304 | mfcDC = win32ui.CreateDCFromHandle(hwndDC) 305 | saveDC = mfcDC.CreateCompatibleDC() 306 | 307 | saveBitMap = win32ui.CreateBitmap() 308 | saveBitMap.CreateCompatibleBitmap(mfcDC, self.w, self.h) 309 | 310 | saveDC.SelectObject(saveBitMap) 311 | 312 | # Change the line below depending on whether you want the whole window 313 | # or just the client area. 
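        # (Added note) PrintWindow's last argument selects the capture: 1
        # (PW_CLIENTONLY) grabs just the client area, while 0, as used below,
        # grabs the whole window including its title bar and borders.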
314 | #result = windll.user32.PrintWindow(hwnd, saveDC.GetSafeHdc(), 1) 315 | result = windll.user32.PrintWindow(self.hwnd, saveDC.GetSafeHdc(), 0) 316 | 317 | bmpinfo = saveBitMap.GetInfo() 318 | bmpstr = saveBitMap.GetBitmapBits(True) 319 | 320 | im = Image.frombuffer( 321 | 'RGB', 322 | (bmpinfo['bmWidth'], bmpinfo['bmHeight']), 323 | bmpstr, 'raw', 'BGRX', 0, 1)# 324 | 325 | #og image 2098, 3868 326 | im = np.array(im) 327 | 328 | #gets the top_left var 329 | im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) 330 | reward,terminal = self.template_match(im) 331 | 332 | 333 | im = im[80:, 85: 3868 - 85] 334 | 335 | im = cv2.resize(im, (64,32), interpolation = cv2.INTER_AREA) 336 | #cv2.imwrite("bug_test_ai" + str(time.time()) + ".jpg", im) 337 | #raise Exception("stop") 338 | 339 | win32gui.DeleteObject(saveBitMap.GetHandle()) 340 | saveDC.DeleteDC() 341 | mfcDC.DeleteDC() 342 | win32gui.ReleaseDC(self.hwnd, hwndDC) 343 | 344 | return im,reward,terminal 345 | 346 | def step(self,action=0): 347 | #time.sleep(0.005) 348 | terminal = False 349 | 350 | self.apply_action(action) 351 | #press some key 352 | state,reward,terminal = self.get_state() 353 | 354 | if time.time() - self.timer > 80: 355 | terminal = True 356 | 357 | #print(reward) 358 | info = {} 359 | 360 | return state,reward,terminal,info 361 | 362 | 363 | 364 | def apply_action(self,action): 365 | """ 366 | 367 | 1 - accel 368 | 2 - accel+wheely 369 | 3 - accel+drift_hold_right 370 | 4 - accel+drift_hold_left 371 | 372 | 0 - null 373 | 5 - accel + right 374 | 6 - accel + left 375 | 7 - accel + item 376 | """ 377 | self.prev_held = copy(self.held_keys) 378 | 379 | #null action removed 380 | action += 1 381 | 382 | """if action == 0: 383 | self.held_keys = []""" 384 | if action == 1: 385 | self.held_keys = ["w"] 386 | elif action == 2: 387 | self.held_keys = ["w","e"] 388 | elif action == 3: 389 | self.held_keys = ["w","c","d"] 390 | elif action == 4: 391 | self.held_keys = ["w","c","a"] 392 | """elif action == 5: 393 | self.held_keys = ["w","d"] 394 | elif action == 6: 395 | self.held_keys = ["w","a"] 396 | elif action == 7: 397 | self.held_keys = ["w","z"]""" 398 | 399 | for i in self.held_keys: 400 | if i not in self.prev_held: 401 | push(i) 402 | 403 | for i in self.prev_held: 404 | if i not in self.held_keys: 405 | release(i) 406 | 407 | #print() 408 | 409 | if __name__ == "__main__": 410 | time.sleep(5) 411 | env = MarioKartEnv() 412 | state = env.reset() 413 | score = 0 414 | action = 0 415 | 416 | while True: 417 | 418 | if keyboard.is_pressed('u'): 419 | action = 0 420 | elif keyboard.is_pressed('h'): 421 | action = 3 422 | elif keyboard.is_pressed('k'): 423 | action = 2 424 | elif keyboard.is_pressed('i'): 425 | action = 1 426 | else: 427 | action = 0 428 | 429 | 430 | state,reward,terminal,info = env.step(action) 431 | score += reward 432 | print(reward) 433 | if terminal: 434 | print("Total Reward: " + str(score)) 435 | score = 0 436 | env.reset() 437 | --------------------------------------------------------------------------------
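For anyone inspecting regions.dat: the sketch below is an illustrative companion snippet, not a file from this repo. It mirrors how MarioKartEnvBackup.get_reward maps a template-match position to a Region cell, assuming regions.dat was drawn with the same 950x1220 crop and 10-pixel grid used in that file; region_for_pixel is a made-up helper name.

import math
import pickle

from Region import Region  # the pickled objects are instances of the repo's Region class

GRID_SIZE = 10                  # pixels per cell (MarioKartEnv.grid_size)
IMAGE_X, IMAGE_Y = 950, 1220    # cropped minimap size (MarioKartEnv.image_x / image_y)
GRID_X = IMAGE_X // GRID_SIZE   # cells per row

def region_for_pixel(regions, x, y):
    # Same mapping as MarioKartEnv.get_reward + convert_xy_to_num: floor the pixel
    # position into grid coordinates, then flatten row-major into the regions list.
    cell_x = math.floor(x / GRID_SIZE)
    cell_y = math.floor(y / GRID_SIZE)
    return regions[cell_x + cell_y * GRID_X]

if __name__ == "__main__":
    with open("regions.dat", "rb") as f:
        regions = pickle.load(f)
    cell = region_for_pixel(regions, x=400, y=600)
    print(cell.in_bounds, cell.dir_x, cell.dir_y, cell.is_chkp)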