├── Ptan
│   ├── blank
│   ├── regions.dat
│   ├── funky_kong_img3.jpg
│   ├── Region.py
│   ├── 01_dqn_basic.py
│   ├── 02_dqn_n_steps.py
│   ├── 06_dqn_dueling.py
│   ├── dqn_model.py
│   ├── 04_dqn_noisy_net.py
│   ├── 03_dqn_double.py
│   ├── 05_dqn_prio_replay.py
│   ├── 08_dqn_rainbow.py
│   ├── RainbowMarioKart.py
│   ├── common.py
│   └── 07_dqn_distrib.py
├── EffRainbow
│   ├── readme
│   ├── kill_dolphins.py
│   ├── plot.py
│   ├── ptan_actions.py
│   ├── simple_buffer_bench.py
│   ├── OutputViewer.py
│   ├── dqn_model.py
│   ├── prio_buffer_bench.py
│   ├── Wrappers.py
│   ├── 08_dqn_rainbow.py
│   ├── ptan_agent.py
│   ├── common.py
│   ├── networks.py
│   └── ptan_utils.py
├── Results1.npy
├── regions.dat
├── results.xlsx
├── blank_regions.jpg
├── regions_luigi.dat
├── funky_kong_img2.png
├── funky_kong_img3.jpg
├── holy_agent_graph.png
├── map_region_luigi.jpg
├── current_model98852.331
├── current_model151406.779
├── FelkFork
│   ├── readme.txt
│   ├── clearMem.py
│   ├── dolphinBootTest.py
│   ├── dolphinScriptTest.py
│   ├── LeakTest.py
│   ├── newDolphinTest.py
│   ├── Wrappers.py
│   ├── DolphinEnv.py
│   └── DolphinSideScriptTanks.py
├── readme.md
├── result_reader.py
├── Region.py
├── ER.py
├── MarioKartMain.py
├── LunarLanderMain.py
├── dist_test.py
├── ButtonLib.py
├── PER_old.py
├── PER.py
└── MarioKartEnvBackup.py

--------------------------------------------------------------------------------
/Ptan/blank:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/EffRainbow/readme:
--------------------------------------------------------------------------------
1 | Here's the new stuff
2 |
--------------------------------------------------------------------------------
/Results1.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/Results1.npy
--------------------------------------------------------------------------------
/regions.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/regions.dat
--------------------------------------------------------------------------------
/results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/results.xlsx
--------------------------------------------------------------------------------
/Ptan/regions.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/Ptan/regions.dat
--------------------------------------------------------------------------------
/blank_regions.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/blank_regions.jpg
--------------------------------------------------------------------------------
/regions_luigi.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/regions_luigi.dat
--------------------------------------------------------------------------------
/funky_kong_img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/funky_kong_img2.png
--------------------------------------------------------------------------------
/funky_kong_img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/funky_kong_img3.jpg
--------------------------------------------------------------------------------
/holy_agent_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/holy_agent_graph.png
--------------------------------------------------------------------------------
/map_region_luigi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/map_region_luigi.jpg
--------------------------------------------------------------------------------
/current_model98852.331:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/current_model98852.331
--------------------------------------------------------------------------------
/Ptan/funky_kong_img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/Ptan/funky_kong_img3.jpg
--------------------------------------------------------------------------------
/current_model151406.779:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VIPTankz/OldWiiRL/HEAD/current_model151406.779
--------------------------------------------------------------------------------
/FelkFork/readme.txt:
--------------------------------------------------------------------------------
1 | This folder is the newest work at time of writing! It no longer uses screen capture; instead it uses felk's fork to access screen frames.
2 |
--------------------------------------------------------------------------------
/EffRainbow/kill_dolphins.py:
--------------------------------------------------------------------------------
1 | import psutil
2 |
3 | PROCNAME = "Dolphin.exe"
4 |
5 | for proc in psutil.process_iter():
6 |     # check whether the process name matches
7 |     if proc.name() == PROCNAME:
8 |         proc.kill()
--------------------------------------------------------------------------------
/FelkFork/clearMem.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.shared_memory import SharedMemory
2 |
3 | name = 'p1'  # replace this with the name of your lingering shared memory
4 |
5 | shm = SharedMemory(name, create=False)
6 |
7 | shm.unlink()  # this closes all attachments to the memory and destroys it
8 | print("Cleared memory successfully")
--------------------------------------------------------------------------------
/FelkFork/dolphinBootTest.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | cmd = 'cd ~/Documents/dolphin/build/Binaries && ./dolphin-emu --no-python-subinterpreters\
4 |     --script /home/tyler/Documents/WiiRL/FelkFork/dolphinScriptTest.py\
5 |     --exec="/home/tyler/Documents/GameCollection/Wii Play (Europe) (En,Fr,De,Es,It).nkit.gcz"'
6 |
7 | subprocess.call(cmd, shell=True)
8 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | This is a very old version of the code seen on my YouTube channel, and does not represent what is currently being done in any way. You can attempt to use this code if you like, but do so at your own risk since there is minimal documentation, and this is definitely not the best way to do things.
2 | 3 | I may release the source code for the actual version at some point, but I do not yet know when that'll be. 4 | -------------------------------------------------------------------------------- /FelkFork/dolphinScriptTest.py: -------------------------------------------------------------------------------- 1 | from dolphin import event, gui 2 | 3 | red = 0xffff0000 4 | frame_counter = 0 5 | while True: 6 | await event.frameadvance() 7 | frame_counter += 1 8 | # draw on screen 9 | gui.draw_text((10, 10), red, f"Frame: {frame_counter}") 10 | # print to console 11 | if frame_counter % 60 == 0: 12 | print(f"The frame count has reached {frame_counter}") 13 | -------------------------------------------------------------------------------- /result_reader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | filename = "results.npy" 5 | 6 | arr = np.load(filename) 7 | #print(arr) 8 | print(arr[-1]) 9 | 10 | scores = [] 11 | timesteps = [] 12 | 13 | for i in range(len(arr) - 100): 14 | scores.append(np.average(arr[i:i+100,0])) 15 | timesteps.append(arr[i + 100][3] / 3600) 16 | 17 | plt.plot(timesteps,scores) 18 | plt.ylabel('Average over last 100 games') 19 | plt.xlabel('Wall Time (Hours)') 20 | plt.show() 21 | -------------------------------------------------------------------------------- /Region.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Region(): 4 | def __init__(self,x,y,bot_x,bot_y,dir_x,dir_y, 5 | in_bounds,is_chkp,chkp_num = -1): 6 | 7 | self.x = x 8 | self.y = y 9 | 10 | self.bot_x = bot_x 11 | self.bot_y = bot_y 12 | 13 | #dir will be -1,0 or 1. This gives reward 14 | self.dir_x = dir_x 15 | self.dir_y = dir_y 16 | 17 | self.in_bounds = in_bounds 18 | 19 | self.is_chkp = is_chkp 20 | self.chkp_num = chkp_num 21 | 22 | def in_region(self,x,y): 23 | #pass in the midpoint of funky kong 24 | 25 | if x >= self.x and y >= self.y: 26 | if x <= self.bot_x and y <= self.bot_y: 27 | return True 28 | 29 | return False 30 | 31 | -------------------------------------------------------------------------------- /Ptan/Region.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Region(): 4 | def __init__(self,x,y,bot_x,bot_y, 5 | is_chkp,chkp_num = -1,focal = False): 6 | 7 | self.x = x 8 | self.y = y 9 | self.focal = focal 10 | 11 | self.bot_x = bot_x 12 | self.bot_y = bot_y 13 | 14 | #dir will be -1,0 or 1. 
This gives reward 15 | #self.dir_x = dir_x 16 | #self.dir_y = dir_y 17 | 18 | #self.in_bounds = in_bounds 19 | 20 | self.is_chkp = is_chkp 21 | self.chkp_num = chkp_num 22 | 23 | def in_region(self,x,y): 24 | #pass in the midpoint of funky kong 25 | 26 | if x >= self.x and y >= self.y: 27 | if x <= self.bot_x and y <= self.bot_y: 28 | return True 29 | 30 | return False 31 | 32 | -------------------------------------------------------------------------------- /EffRainbow/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | runs = 1 4 | 5 | data_files = ["ResultsGhost"] 6 | 7 | expers = [] 8 | 9 | for exper in data_files: 10 | temp = [] 11 | for i in range(runs): 12 | temp.append(np.load(exper + '.npy')) 13 | expers.append(temp[:]) 14 | 15 | # Example 2D list 16 | data = list(expers[0][0]) 17 | print(len(data)) 18 | 19 | # Number of data entries to average 20 | average_size = 1000 21 | 22 | # Extracting scores and steps from the data 23 | scores = [row[0] for row in data] 24 | steps = [row[2] for row in data] 25 | 26 | 27 | myInt = 3600 28 | steps = [x / myInt for x in steps] 29 | 30 | 31 | # Averaging scores over a given number of data entries 32 | averaged_scores = [] 33 | for i in range(len(scores) - average_size + 1): 34 | average = sum(scores[i:i+average_size]) / average_size 35 | averaged_scores.append(average) 36 | 37 | # Creating the plot 38 | plt.plot(steps[:len(averaged_scores)], averaged_scores) 39 | plt.xlabel('Hours') 40 | plt.ylabel('Average Reward') 41 | plt.title('Mario Kart - GhostValley2') 42 | plt.grid(True) 43 | 44 | # Displaying the plot 45 | plt.show() 46 | print("done?") -------------------------------------------------------------------------------- /FelkFork/LeakTest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("C:\\Users\\TYLER\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages") 3 | with open('leak.txt', 'w') as f: 4 | f.write("got path123") 5 | from PIL import Image 6 | 7 | from dolphin import event,gui 8 | 9 | from copy import deepcopy 10 | 11 | import numpy as np 12 | import time 13 | with open('leak.txt', 'a') as f: 14 | f.write("\nlibraries") 15 | 16 | white = 0xffffffff 17 | 18 | def show_screenshot(width: int, height: int, data: bytes): 19 | global allow 20 | allow = True 21 | #gui.draw_text((10, 10), white, "Hi") 22 | #print(f"received {width}x{height} image of length {len(data)}") 23 | #image = Image.frombytes('RGBA', (width,height), data, 'raw') 24 | #image.show() 25 | 26 | steps = 1 27 | start = time.time() 28 | allow = False 29 | while True: 30 | await event.frameadvance() 31 | steps += 1 32 | fps = round(steps / (time.time() - start)) 33 | gui.draw_text((10, 10), white, "FPS: " + str(fps)) 34 | 35 | 36 | """event.on_framedrawn(show_screenshot) 37 | with open('leak.txt', 'a') as f: 38 | f.write("\nonframeadvance") 39 | while True: 40 | 41 | while not allow: 42 | await event.frameadvance() 43 | 44 | with open('leak.txt', 'a') as f: 45 | f.write("\nallowed") 46 | 47 | (width,height,data) = await event.framedrawn() 48 | with open('leak.txt', 'a') as f: 49 | f.write("\ndrawn") 50 | allow = False 51 | #Adding the two lines below fixes the problem?!? 
52 | 53 | #with open('leak.txt', 'a') as f: 54 | #pass 55 | 56 | #img = Image.frombytes('RGBA', (width,height), data, 'raw')""" 57 | 58 | 59 | -------------------------------------------------------------------------------- /ER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayMemory: 4 | def __init__(self, input_dims, max_mem, batch_size): 5 | self.mem_size = max_mem 6 | self.batch_size = batch_size 7 | self.mem_cntr = 0 8 | self.state_memory = np.zeros((self.mem_size, *input_dims), 9 | dtype=np.float32) 10 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 11 | dtype=np.float32) 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, terminal): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.action_memory[index] = action 20 | self.reward_memory[index] = reward 21 | self.new_state_memory[index] = state_ 22 | self.terminal_memory[index] = terminal 23 | 24 | self.mem_cntr += 1 25 | 26 | def sample_memory(self): 27 | offset = 0 28 | max_mem = min(self.mem_cntr, self.mem_size) 29 | batch = np.random.choice(max_mem, self.batch_size, 30 | replace=False) 31 | states = self.state_memory[batch] 32 | new_states = self.new_state_memory[batch] 33 | actions = self.action_memory[batch] 34 | rewards = self.reward_memory[batch] 35 | terminals = self.terminal_memory[batch] 36 | 37 | 38 | 39 | return states, actions, rewards, new_states, terminals 40 | 41 | def is_sufficient(self): 42 | return self.mem_cntr > self.batch_size 43 | -------------------------------------------------------------------------------- /MarioKartMain.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.wrappers import FrameStack 3 | import numpy as np 4 | from DDDQN import Agent 5 | #from utils import plotLearning 6 | import time 7 | from MarioKartEnv import MarioKartEnv 8 | 9 | if __name__ == '__main__': 10 | env = MarioKartEnv() 11 | env = FrameStack(env,4) 12 | save_interval = 400 13 | load_checkpoint = False 14 | 15 | agent = Agent(gamma=0.99, epsilon=1, batch_size=32, n_actions=7, 16 | eps_end=0.1, input_dims=[4,52,96], lr=1e-4, 17 | max_mem_size=50000,memory = "PER",image = True, 18 | learning_starts=32,replace=16000,preprocess = True, 19 | n_step = 4,noisy = True,action_repeat=1) 20 | 21 | 22 | #learning starts to 50k 23 | if load_checkpoint: 24 | agent.load_models() 25 | 26 | scores = [] 27 | steps = 0 28 | start = time.time() 29 | i = -1 30 | arr = [] 31 | 32 | act_time = 0 33 | step_time = 0 34 | learn_time = 0 35 | 36 | while True: 37 | done = False 38 | observation = env.reset() 39 | observation = np.stack( observation, axis=0) 40 | 41 | score = 0 42 | i += 1 43 | 44 | while not done: 45 | steps += 1 46 | action = agent.choose_action(observation) 47 | 48 | observation_, reward, done, info = env.step(action) 49 | 50 | observation_ = np.stack( observation_, axis=0) 51 | score += reward 52 | 53 | agent.store_transition(observation, action, 54 | reward, observation_, int(done)) 55 | 56 | agent.learn() 57 | 58 | observation = observation_ 59 | 60 | 61 | arr.append([score,i,steps,round(time.time() - start,4),agent.epsilon]) 62 | if i % save_interval == save_interval - 1: 63 | np.save("Results.npy",np.array(arr)) 64 | 
agent.save_models() 65 | 66 | #eps_history.append(agent.epsilon) 67 | 68 | 69 | -------------------------------------------------------------------------------- /Ptan/01_dqn_basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from tensorboardX import SummaryWriter 10 | 11 | from lib import dqn_model, common 12 | 13 | 14 | if __name__ == "__main__": 15 | params = common.HYPERPARAMS['pong'] 16 | # params['epsilon_frames'] = 200000 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 19 | args = parser.parse_args() 20 | device = torch.device("cuda" if args.cuda else "cpu") 21 | 22 | env = gym.make(params['env_name']) 23 | env = ptan.common.wrappers.wrap_dqn(env) 24 | 25 | writer = SummaryWriter(comment="-" + params['run_name'] + "-basic") 26 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 27 | 28 | tgt_net = ptan.agent.TargetNet(net) 29 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 30 | epsilon_tracker = common.EpsilonTracker(selector, params) 31 | agent = ptan.agent.DQNAgent(net, selector, device=device) 32 | 33 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 34 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 35 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 36 | 37 | frame_idx = 0 38 | 39 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 40 | while True: 41 | frame_idx += 1 42 | buffer.populate(1) 43 | epsilon_tracker.frame(frame_idx) 44 | 45 | new_rewards = exp_source.pop_total_rewards() 46 | if new_rewards: 47 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 48 | break 49 | 50 | if len(buffer) < params['replay_initial']: 51 | continue 52 | 53 | optimizer.zero_grad() 54 | batch = buffer.sample(params['batch_size']) 55 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device) 56 | loss_v.backward() 57 | optimizer.step() 58 | 59 | if frame_idx % params['target_net_sync'] == 0: 60 | tgt_net.sync() 61 | -------------------------------------------------------------------------------- /LunarLanderMain.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from DDDQN import Agent 4 | import argparse 5 | #from utils import plotLearning 6 | import time 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(description='') 10 | parser.add_argument('-srun', type=int, default=0) 11 | 12 | args = parser.parse_args() 13 | srun = args.srun 14 | 15 | env = gym.make('LunarLander-v2') 16 | num_frames = 180000 17 | load_checkpoint = False 18 | 19 | agent = Agent(gamma=0.99, epsilon=0.1, batch_size=64, n_actions=4, 20 | eps_end=0.1, input_dims=[8], lr=0.001, 21 | max_mem_size=1000000,memory = "PER",image = False, 22 | learning_starts=64,replace=100,n_step = 4,noisy=False) 23 | 24 | if load_checkpoint: 25 | agent.load_models() 26 | 27 | filename = 'LunarLander-Dueling-DDQN-512-Adam-lr0005-replace100.png' 28 | scores = [] 29 | eps_history = [] 30 | n_steps = 0 31 | start = time.time() 32 | i = -1 33 | while n_steps < num_frames: 34 | i += 1 35 | done = False 36 | 
observation = env.reset() 37 | score = 0 38 | 39 | while not done: 40 | n_steps += 1 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | #env.render() 44 | score += reward 45 | agent.store_transition(observation, action, 46 | reward, observation_, int(done)) 47 | agent.learn() 48 | 49 | observation = observation_ 50 | 51 | scores.append(score) 52 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 53 | if i % 10 == 0: 54 | print('episode: ', i,'score %.1f ' % score, 55 | ' average score %.1f' % avg_score, 56 | 'epsilon %.2f' % agent.epsilon) 57 | 58 | eps_history.append(agent.epsilon) 59 | 60 | #x = [i+1 for i in range(num_games)] 61 | #plotLearning(x, scores, eps_history, filename) 62 | #print("Total Wall Time: " + str(time.time() - start)) 63 | #print(avg_score) 64 | save_stuff = [time.time() - start,avg_score] 65 | save_stuff = np.array(save_stuff,dtype = float) 66 | np.save("results_er" + str(srun) + ".npy", save_stuff) 67 | -------------------------------------------------------------------------------- /Ptan/02_dqn_n_steps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from tensorboardX import SummaryWriter 10 | 11 | from lib import dqn_model, common 12 | 13 | REWARD_STEPS_DEFAULT = 2 14 | 15 | 16 | if __name__ == "__main__": 17 | params = common.HYPERPARAMS['pong'] 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 20 | parser.add_argument("-n", default=REWARD_STEPS_DEFAULT, type=int, help="Count of steps to unroll Bellman") 21 | args = parser.parse_args() 22 | device = torch.device("cuda" if args.cuda else "cpu") 23 | 24 | env = gym.make(params['env_name']) 25 | env = ptan.common.wrappers.wrap_dqn(env) 26 | 27 | writer = SummaryWriter(comment="-" + params['run_name'] + "-%d-step" % args.n) 28 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 29 | 30 | tgt_net = ptan.agent.TargetNet(net) 31 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 32 | epsilon_tracker = common.EpsilonTracker(selector, params) 33 | agent = ptan.agent.DQNAgent(net, selector, device=device) 34 | 35 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=args.n) 36 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 37 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 38 | 39 | frame_idx = 0 40 | 41 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 42 | while True: 43 | frame_idx += 1 44 | buffer.populate(1) 45 | epsilon_tracker.frame(frame_idx) 46 | 47 | new_rewards = exp_source.pop_total_rewards() 48 | if new_rewards: 49 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 50 | break 51 | 52 | if len(buffer) < params['replay_initial']: 53 | continue 54 | 55 | optimizer.zero_grad() 56 | batch = buffer.sample(params['batch_size']) 57 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, 58 | gamma=params['gamma']**args.n, device=device) 59 | loss_v.backward() 60 | optimizer.step() 61 | 62 | if frame_idx % params['target_net_sync'] == 0: 63 | tgt_net.sync() 64 | -------------------------------------------------------------------------------- 
/FelkFork/newDolphinTest.py: -------------------------------------------------------------------------------- 1 | from dolphin import event, gui 2 | import sys 3 | sys.path.append("C:\\Users\\TYLER\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages") 4 | #import numpy as np 5 | from PIL import Image 6 | #import cv2 7 | import random 8 | import time 9 | 10 | def show_screenshot(width: int, height: int, data: bytes): 11 | #print(f"received {width}x{height} image of length {len(data)}") 12 | # data is RGBA, so its size is width*height*4 13 | gui.draw_text((10, 50), red, f"Hi") 14 | 15 | red = 0xffff0000 16 | frame_counter = 0 17 | start = time.time() 18 | count = 0 19 | while True: 20 | (width,height,data) = await event.framedrawn() 21 | gui.draw_text((10, 50), red, f"Hi") 22 | 23 | if random.random() > 0.995: 24 | image = Image.frombytes('RGBA', (width,height), data, 'raw') 25 | image.show() 26 | """advance = False 27 | while True: 28 | await event.on_framedrawn(show_screenshot) 29 | count += 1 30 | gui.draw_text((10, 50), red, f"Count: {count}")""" 31 | 32 | #image = Image.frombytes('RGBA', (width,height), data, 'raw') 33 | #gui.draw_text((10, 50), red, f"Hi") 34 | 35 | 36 | 37 | 38 | """while True: 39 | #while not advance: 40 | #await event.frameadvance() 41 | 42 | advance = False 43 | 44 | await event.on_framedrawn(show_screenshot) 45 | 46 | 47 | #with open('loggTest.txt', 'a') as f: 48 | #f.write("Run framedrawn statement") 49 | 50 | frame_counter += 1 51 | fps = frame_counter / (time.time() - start) 52 | # draw on screen 53 | counts = count / (time.time() - start) 54 | gui.draw_text((10, 10), red, f"FPS: {fps}") 55 | gui.draw_text((10, 50), red, f"Count: {counts}") 56 | #gui.draw_text((10, 50), red, f"frames: {img.dtype}") 57 | # print to console 58 | if frame_counter % 60 == 0: 59 | print(f"The frame count has reached {frame_counter}")""" 60 | 61 | 62 | """global count 63 | global advance 64 | global img 65 | 66 | gui.draw_text((10, 50), red, f"Boo") 67 | 68 | if count % 4 == 3: 69 | image = Image.frombytes('RGBA', (width,height), data, 'raw') 70 | image = image.resize((94,78)) 71 | image = image.convert("RGB") 72 | 73 | img1 = np.asarray(image) 74 | img1 = img1[...,::-1] 75 | img1 = np.dot(img1[...,:3], [0.2989, 0.5870, 0.1140]) 76 | 77 | #if random.random() > 0.99: 78 | #cv2.imwrite("filename.png", img) 79 | 80 | img = img1.astype(np.float32) 81 | 82 | advance = True 83 | 84 | count += 1""" 85 | -------------------------------------------------------------------------------- /EffRainbow/ptan_actions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ActionSelector: 5 | """ 6 | Abstract class which converts scores to the actions 7 | """ 8 | def __call__(self, scores): 9 | raise NotImplementedError 10 | 11 | 12 | class ArgmaxActionSelector(ActionSelector): 13 | """ 14 | Selects actions using argmax 15 | """ 16 | def __call__(self, scores): 17 | assert isinstance(scores, np.ndarray) 18 | return np.argmax(scores, axis=1) 19 | 20 | 21 | class EpsilonGreedyActionSelector(ActionSelector): 22 | def __init__(self, epsilon=1.0,eps_dec = 1e-6,eps_min = 0.05, selector=None): 23 | self.epsilon = epsilon 24 | self.eps_dec = eps_dec 25 | self.eps_min = eps_min 26 | self.selector = selector if selector is not None else ArgmaxActionSelector() 27 | 28 | def __call__(self, scores): 29 | assert isinstance(scores, np.ndarray) 30 | batch_size, n_actions = scores.shape 31 | actions = self.selector(scores) 32 | mask = 
np.random.random(size=batch_size) < self.epsilon 33 | rand_actions = np.random.choice(n_actions, sum(mask)) 34 | actions[mask] = rand_actions 35 | self.epsilon -= self.eps_dec 36 | if self.epsilon < self.eps_min: 37 | self.epsilon = self.eps_min 38 | return actions 39 | 40 | class StickyEpsilonGreedyActionSelector(ActionSelector): 41 | def __init__(self, epsilon=1.0,eps_dec = 1e-6,eps_min = 0.05, selector=None): 42 | self.epsilon = epsilon 43 | self.eps_dec = eps_dec 44 | self.eps_min = eps_min 45 | self.selector = selector if selector is not None else ArgmaxActionSelector() 46 | self.repeat_probs = 0.3 47 | self.prev_actions = None 48 | 49 | def __call__(self, scores): 50 | assert isinstance(scores, np.ndarray) 51 | batch_size, n_actions = scores.shape 52 | 53 | if np.random.random() < self.repeat_probs: 54 | if self.prev_actions is not None and batch_size == len(self.prev_actions): 55 | return self.prev_actions 56 | 57 | batch_size, n_actions = scores.shape 58 | actions = self.selector(scores) 59 | mask = np.random.random(size=batch_size) < self.epsilon 60 | rand_actions = np.random.choice(n_actions, sum(mask)) 61 | actions[mask] = rand_actions 62 | self.epsilon -= self.eps_dec 63 | if self.epsilon < self.eps_min: 64 | self.epsilon = self.eps_min 65 | 66 | self.prev_actions = actions[:] 67 | return actions 68 | 69 | 70 | class ProbabilityActionSelector(ActionSelector): 71 | """ 72 | Converts probabilities of actions into action by sampling them 73 | """ 74 | def __call__(self, probs): 75 | assert isinstance(probs, np.ndarray) 76 | actions = [] 77 | for prob in probs: 78 | actions.append(np.random.choice(len(prob), p=prob)) 79 | return np.array(actions) 80 | -------------------------------------------------------------------------------- /EffRainbow/simple_buffer_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Benchmark various Replay Buffer variants 4 | """ 5 | import timeit 6 | import numpy as np 7 | import collections 8 | 9 | 10 | SIZES = [10**n for n in (3, 4, 5)] 11 | DATA_SHAPE = (84, 84, 4) 12 | REPEAT_NUMBER = 10 13 | 14 | 15 | class ExperienceBufferDeque: 16 | def __init__(self, capacity): 17 | self.buffer = collections.deque(maxlen=capacity) 18 | 19 | def __len__(self): 20 | return len(self.buffer) 21 | 22 | def append(self, experience): 23 | self.buffer.append(experience) 24 | 25 | def sample(self, batch_size): 26 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 27 | return [self.buffer[idx] for idx in indices] 28 | 29 | 30 | class ExperienceBufferCircularList: 31 | def __init__(self, capacity): 32 | self.buffer = list() 33 | self.capacity = capacity 34 | self.pos = 0 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | def append(self, experience): 40 | if len(self.buffer) < self.capacity: 41 | self.buffer.append(experience) 42 | else: 43 | self.buffer[self.pos] = experience 44 | self.pos = (self.pos + 1) % self.capacity 45 | 46 | def sample(self, batch_size): 47 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 48 | return [self.buffer[idx] for idx in indices] 49 | 50 | 51 | 52 | def fill_buf(buf, size): 53 | for _ in range(size): 54 | buf.append(np.zeros(DATA_SHAPE, dtype=np.uint8)) 55 | 56 | 57 | def bench_buffer(buf_class): 58 | print("Benchmarking %s" % buf_class.__name__) 59 | 60 | for size in SIZES: 61 | print(" Test size %d" % size) 62 | ns = globals() 63 | ns.update(locals()) 64 | t = timeit.timeit('fill_buf(buf, size)', 
setup='buf = buf_class(size)', number=REPEAT_NUMBER, globals=ns) 65 | print(" * Initial fill:\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 66 | buf = buf_class(size) 67 | fill_buf(buf, size) 68 | ns.update(locals()) 69 | t = timeit.timeit('fill_buf(buf, size)', number=REPEAT_NUMBER, globals=ns) 70 | print(" * Append:\t\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 71 | t = timeit.timeit('buf.sample(4)', number=REPEAT_NUMBER*100, globals=ns) 72 | print(" * Sample 4:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 73 | t = timeit.timeit('buf.sample(8)', number=REPEAT_NUMBER*100, globals=ns) 74 | print(" * Sample 8:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 75 | t = timeit.timeit('buf.sample(16)', number=REPEAT_NUMBER*100, globals=ns) 76 | print(" * Sample 16:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 77 | t = timeit.timeit('buf.sample(32)', number=REPEAT_NUMBER*100, globals=ns) 78 | print(" * Sample 32:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 79 | 80 | 81 | 82 | if __name__ == "__main__": 83 | bench_buffer(ExperienceBufferCircularList) 84 | bench_buffer(ExperienceBufferDeque) 85 | pass 86 | -------------------------------------------------------------------------------- /EffRainbow/OutputViewer.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import numpy as np 3 | from copy import copy 4 | import sys 5 | import time 6 | 7 | class OutputViewer(): 8 | def __init__(self,tags): 9 | 10 | self.tags = tags 11 | 12 | self.width = 600 13 | self.height = 400 14 | 15 | self.text_height = 30 16 | 17 | self.bar_heights = self.height - (self.text_height * 2) 18 | 19 | self.rangeMin = -1 20 | self.rangeMax = 9 21 | self.remapping = [2, 0, 1, 3, 4] 22 | 23 | self.mult = self.height / self.rangeMax 24 | 25 | pygame.init() 26 | 27 | all_fonts = pygame.font.get_fonts() 28 | self.font = pygame.font.SysFont(all_fonts[7], 18) 29 | 30 | self.clock = pygame.time.Clock() 31 | self.screen = pygame.display.set_mode((self.width, self.height)) 32 | 33 | self.color = (0, 0, 255) 34 | 35 | def update(self,ovals): 36 | time.sleep(0.01) 37 | 38 | vals = [] 39 | for i in range(len(ovals)): 40 | vals.append(-1) 41 | 42 | for i in range(len(ovals)): 43 | vals[self.remapping[i]] = ovals[i] 44 | 45 | self.screen.fill((0,0,0)) 46 | 47 | bar_width = (self.width - 80) / len(vals) 48 | spacing = bar_width / 10 49 | 50 | ma = np.argmax(vals) 51 | 52 | for i in range(len(vals)): 53 | if i == ma: 54 | color = (255,215,0) 55 | elif vals[i] < 0: 56 | color = (255,0,0) 57 | else: 58 | color = (0,0,255) 59 | 60 | vals[i] -= self.rangeMin 61 | 62 | pygame.draw.rect(self.screen, color, pygame.Rect(10 + spacing * i + bar_width * i,\ 63 | self.bar_heights - int(vals[i] * self.mult), bar_width - spacing * 2, int(vals[i] * self.mult))) 64 | 65 | text = self.font.render(self.tags[i],1,(255,255,255))#creates the text 66 | self.screen.blit(text,(10 + spacing * i + bar_width * i,self.bar_heights + 10)) 67 | 68 | text = self.font.render(str(round(vals[i],2)),1,(255,255,255))#creates the text 69 | self.screen.blit(text,(10 + spacing * i + bar_width * i,self.bar_heights + 30)) 70 | 71 | 72 | self.mouse_up = False 73 | #allow shutdown window 74 | self.mouse_pos = pygame.mouse.get_pos() 75 | for event in pygame.event.get(): 76 | if event.type == pygame.QUIT: 77 | pygame.quit() 78 | sys.exit() 79 | if event.type == pygame.MOUSEBUTTONUP and event.button == 1: 80 | self.mouse_up = True 81 | 82 | self.clock.tick(60) 83 | 84 | pygame.display.flip() 85 | 86 | 87 | if __name__ == "__main__": 88 | out = 
OutputViewer(["hLeft","sLeft","wLeft","Forward","wRight","sRight","hRight"]) 89 | outputs = [1,2,3,4,5,6,12] 90 | while True: 91 | 92 | out.update(copy(outputs)) 93 | for i in range(len(outputs)): 94 | outputs[i] += np.random.random() - 0.5 95 | if outputs[i] < -1: 96 | outputs[i] = -1 97 | elif outputs[i] > 12: 98 | outputs[i] = 12 99 | 100 | -------------------------------------------------------------------------------- /EffRainbow/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | class NoisyLinear(nn.Linear): 9 | def __init__(self, in_features, out_features, sigma_init=0.1, bias=True): 10 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 11 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 12 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 13 | if bias: 14 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 15 | self.register_buffer("epsilon_bias", torch.zeros(out_features)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | std = math.sqrt(3 / self.in_features) 20 | self.weight.data.uniform_(-std, std) 21 | self.bias.data.uniform_(-std, std) 22 | 23 | def forward(self, input): 24 | self.epsilon_weight.normal_() 25 | bias = self.bias 26 | if bias is not None: 27 | self.epsilon_bias.normal_() 28 | bias = bias + self.sigma_bias * self.epsilon_bias.data 29 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 30 | 31 | 32 | class NoisyFactorizedLinear(nn.Linear): 33 | """ 34 | NoisyNet layer with factorized gaussian noise 35 | 36 | N.B. 
nn.Linear already initializes weight and bias to 37 | """ 38 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 39 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 40 | sigma_init = sigma_zero / math.sqrt(in_features) 41 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 42 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 43 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 44 | if bias: 45 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 46 | 47 | def forward(self, input): 48 | self.epsilon_input.normal_() 49 | self.epsilon_output.normal_() 50 | 51 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 52 | eps_in = func(self.epsilon_input.data) 53 | eps_out = func(self.epsilon_output.data) 54 | 55 | bias = self.bias 56 | if bias is not None: 57 | bias = bias + self.sigma_bias * eps_out.t() 58 | noise_v = torch.mul(eps_in, eps_out) 59 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 60 | 61 | 62 | class DQN(nn.Module): 63 | def __init__(self, input_shape, n_actions): 64 | super(DQN, self).__init__() 65 | 66 | self.conv = nn.Sequential( 67 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 68 | nn.ReLU(), 69 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 70 | nn.ReLU(), 71 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 72 | nn.ReLU() 73 | ) 74 | 75 | conv_out_size = self._get_conv_out(input_shape) 76 | self.fc = nn.Sequential( 77 | nn.Linear(conv_out_size, 512), 78 | nn.ReLU(), 79 | nn.Linear(512, n_actions) 80 | ) 81 | 82 | def _get_conv_out(self, shape): 83 | o = self.conv(torch.zeros(1, *shape)) 84 | return int(np.prod(o.size())) 85 | 86 | def forward(self, x): 87 | fx = x.float() / 256 88 | conv_out = self.conv(fx).view(fx.size()[0], -1) 89 | return self.fc(conv_out) 90 | -------------------------------------------------------------------------------- /Ptan/06_dqn_dueling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | 11 | from tensorboardX import SummaryWriter 12 | 13 | from lib import common 14 | 15 | 16 | class DuelingDQN(nn.Module): 17 | def __init__(self, input_shape, n_actions): 18 | super(DuelingDQN, self).__init__() 19 | 20 | self.conv = nn.Sequential( 21 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 22 | nn.ReLU(), 23 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 24 | nn.ReLU(), 25 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 26 | nn.ReLU() 27 | ) 28 | 29 | conv_out_size = self._get_conv_out(input_shape) 30 | self.fc_adv = nn.Sequential( 31 | nn.Linear(conv_out_size, 256), 32 | nn.ReLU(), 33 | nn.Linear(256, n_actions) 34 | ) 35 | self.fc_val = nn.Sequential( 36 | nn.Linear(conv_out_size, 256), 37 | nn.ReLU(), 38 | nn.Linear(256, 1) 39 | ) 40 | 41 | def _get_conv_out(self, shape): 42 | o = self.conv(torch.zeros(1, *shape)) 43 | return int(np.prod(o.size())) 44 | 45 | def forward(self, x): 46 | fx = x.float() / 256 47 | conv_out = self.conv(fx).view(fx.size()[0], -1) 48 | val = self.fc_val(conv_out) 49 | adv = self.fc_adv(conv_out) 50 | return val + (adv - adv.mean(dim=1, keepdim=True)) 51 | 52 | 53 | if __name__ == "__main__": 54 | params = common.HYPERPARAMS['pong'] 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("--cuda", 
default=False, action="store_true", help="Enable cuda") 57 | args = parser.parse_args() 58 | device = torch.device("cuda" if args.cuda else "cpu") 59 | 60 | env = gym.make(params['env_name']) 61 | env = ptan.common.wrappers.wrap_dqn(env) 62 | 63 | writer = SummaryWriter(comment="-" + params['run_name'] + "-dueling") 64 | net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device) 65 | tgt_net = ptan.agent.TargetNet(net) 66 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 67 | epsilon_tracker = common.EpsilonTracker(selector, params) 68 | agent = ptan.agent.DQNAgent(net, selector, device=device) 69 | 70 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 71 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 72 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 73 | 74 | frame_idx = 0 75 | 76 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 77 | while True: 78 | frame_idx += 1 79 | buffer.populate(1) 80 | epsilon_tracker.frame(frame_idx) 81 | 82 | new_rewards = exp_source.pop_total_rewards() 83 | if new_rewards: 84 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 85 | break 86 | 87 | if len(buffer) < params['replay_initial']: 88 | continue 89 | 90 | optimizer.zero_grad() 91 | batch = buffer.sample(params['batch_size']) 92 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device) 93 | loss_v.backward() 94 | optimizer.step() 95 | 96 | if frame_idx % params['target_net_sync'] == 0: 97 | tgt_net.sync() 98 | -------------------------------------------------------------------------------- /Ptan/dqn_model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | import numpy as np 7 | 8 | 9 | class NoisyLinear(nn.Linear): 10 | def __init__(self, in_features, out_features, sigma_init=0.15, bias=True): #0.017 11 | super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) 12 | #print(sigma_init) 13 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 14 | self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features)) 15 | if bias: 16 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 17 | self.register_buffer("epsilon_bias", torch.zeros(out_features)) 18 | self.reset_parameters() 19 | 20 | def reset_parameters(self): 21 | std = math.sqrt(3 / self.in_features) 22 | self.weight.data.uniform_(-std, std) 23 | self.bias.data.uniform_(-std, std) 24 | 25 | def forward(self, input): 26 | self.epsilon_weight.normal_() 27 | bias = self.bias 28 | if bias is not None: 29 | self.epsilon_bias.normal_() 30 | bias = bias + self.sigma_bias * self.epsilon_bias.data 31 | return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight.data, bias) 32 | 33 | 34 | class NoisyFactorizedLinear(nn.Linear): 35 | """ 36 | NoisyNet layer with factorized gaussian noise 37 | 38 | N.B. 
nn.Linear already initializes weight and bias to 39 | """ 40 | def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True): 41 | super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias) 42 | sigma_init = sigma_zero / math.sqrt(in_features) 43 | self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init)) 44 | self.register_buffer("epsilon_input", torch.zeros(1, in_features)) 45 | self.register_buffer("epsilon_output", torch.zeros(out_features, 1)) 46 | if bias: 47 | self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init)) 48 | 49 | def forward(self, input): 50 | self.epsilon_input.normal_() 51 | self.epsilon_output.normal_() 52 | 53 | func = lambda x: torch.sign(x) * torch.sqrt(torch.abs(x)) 54 | eps_in = func(self.epsilon_input.data) 55 | eps_out = func(self.epsilon_output.data) 56 | 57 | bias = self.bias 58 | if bias is not None: 59 | bias = bias + self.sigma_bias * eps_out.t() 60 | noise_v = torch.mul(eps_in, eps_out) 61 | return F.linear(input, self.weight + self.sigma_weight * noise_v, bias) 62 | 63 | 64 | class DQN(nn.Module): 65 | def __init__(self, input_shape, n_actions): 66 | super(DQN, self).__init__() 67 | 68 | self.conv = nn.Sequential( 69 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 70 | nn.ReLU(), 71 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 72 | nn.ReLU(), 73 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 74 | nn.ReLU() 75 | ) 76 | 77 | conv_out_size = self._get_conv_out(input_shape) 78 | self.fc = nn.Sequential( 79 | nn.Linear(conv_out_size, 512), 80 | nn.ReLU(), 81 | nn.Linear(512, n_actions) 82 | ) 83 | 84 | def _get_conv_out(self, shape): 85 | o = self.conv(torch.zeros(1, *shape)) 86 | return int(np.prod(o.size())) 87 | 88 | def forward(self, x): 89 | fx = x.float() / 256 90 | conv_out = self.conv(fx).view(fx.size()[0], -1) 91 | return self.fc(conv_out) 92 | -------------------------------------------------------------------------------- /Ptan/04_dqn_noisy_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from torch.autograd import Variable 11 | 12 | from tensorboardX import SummaryWriter 13 | 14 | from lib import dqn_model, common 15 | 16 | 17 | class NoisyDQN(nn.Module): 18 | def __init__(self, input_shape, n_actions): 19 | super(NoisyDQN, self).__init__() 20 | 21 | self.conv = nn.Sequential( 22 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 23 | nn.ReLU(), 24 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 25 | nn.ReLU(), 26 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 27 | nn.ReLU() 28 | ) 29 | 30 | conv_out_size = self._get_conv_out(input_shape) 31 | self.noisy_layers = [ 32 | dqn_model.NoisyLinear(conv_out_size, 512), 33 | dqn_model.NoisyLinear(512, n_actions) 34 | ] 35 | self.fc = nn.Sequential( 36 | self.noisy_layers[0], 37 | nn.ReLU(), 38 | self.noisy_layers[1] 39 | ) 40 | 41 | def _get_conv_out(self, shape): 42 | o = self.conv(torch.zeros(1, *shape)) 43 | return int(np.prod(o.size())) 44 | 45 | def forward(self, x): 46 | fx = x.float() / 256 47 | conv_out = self.conv(fx).view(fx.size()[0], -1) 48 | return self.fc(conv_out) 49 | 50 | def noisy_layers_sigma_snr(self): 51 | return [ 52 | ((layer.weight ** 2).mean().sqrt() / (layer.sigma_weight ** 2).mean().sqrt()).item() 53 | for layer in self.noisy_layers 54 | ] 55 | 56 | 57 | if __name__ 
== "__main__": 58 | params = common.HYPERPARAMS['pong'] 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 61 | args = parser.parse_args() 62 | device = torch.device("cuda" if args.cuda else "cpu") 63 | 64 | env = gym.make(params['env_name']) 65 | env = ptan.common.wrappers.wrap_dqn(env) 66 | 67 | writer = SummaryWriter(comment="-" + params['run_name'] + "-noisy-net") 68 | net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device) 69 | tgt_net = ptan.agent.TargetNet(net) 70 | agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(), device=device) 71 | 72 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 73 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 74 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 75 | 76 | frame_idx = 0 77 | 78 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 79 | while True: 80 | frame_idx += 1 81 | buffer.populate(1) 82 | 83 | new_rewards = exp_source.pop_total_rewards() 84 | if new_rewards: 85 | if reward_tracker.reward(new_rewards[0], frame_idx): 86 | break 87 | 88 | if len(buffer) < params['replay_initial']: 89 | continue 90 | 91 | optimizer.zero_grad() 92 | batch = buffer.sample(params['batch_size']) 93 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device) 94 | loss_v.backward() 95 | optimizer.step() 96 | 97 | if frame_idx % params['target_net_sync'] == 0: 98 | tgt_net.sync() 99 | 100 | if frame_idx % 500 == 0: 101 | for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()): 102 | writer.add_scalar("sigma_snr_layer_%d" % (layer_idx+1), 103 | sigma_l2, frame_idx) 104 | -------------------------------------------------------------------------------- /Ptan/03_dqn_double.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | 11 | from tensorboardX import SummaryWriter 12 | 13 | from lib import dqn_model, common 14 | 15 | STATES_TO_EVALUATE = 1000 16 | EVAL_EVERY_FRAME = 100 17 | 18 | 19 | def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True): 20 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 21 | 22 | states_v = torch.tensor(states).to(device) 23 | next_states_v = torch.tensor(next_states).to(device) 24 | actions_v = torch.tensor(actions).to(device) 25 | rewards_v = torch.tensor(rewards).to(device) 26 | done_mask = torch.ByteTensor(dones).to(device) 27 | 28 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 29 | if double: 30 | next_state_actions = net(next_states_v).max(1)[1] 31 | next_state_values = tgt_net(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1) 32 | else: 33 | next_state_values = tgt_net(next_states_v).max(1)[0] 34 | next_state_values[done_mask] = 0.0 35 | 36 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 37 | return nn.MSELoss()(state_action_values, expected_state_action_values) 38 | 39 | 40 | def calc_values_of_states(states, net, device="cpu"): 41 | mean_vals = [] 42 | for batch in np.array_split(states, 64): 43 | states_v = torch.tensor(batch).to(device) 44 | action_values_v = 
net(states_v) 45 | best_action_values_v = action_values_v.max(1)[0] 46 | mean_vals.append(best_action_values_v.mean().item()) 47 | return np.mean(mean_vals) 48 | 49 | 50 | if __name__ == "__main__": 51 | params = common.HYPERPARAMS['pong'] 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 54 | parser.add_argument("--double", default=False, action="store_true", help="Enable double DQN") 55 | args = parser.parse_args() 56 | device = torch.device("cuda" if args.cuda else "cpu") 57 | 58 | env = gym.make(params['env_name']) 59 | env = ptan.common.wrappers.wrap_dqn(env) 60 | 61 | writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double)) 62 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 63 | 64 | tgt_net = ptan.agent.TargetNet(net) 65 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 66 | epsilon_tracker = common.EpsilonTracker(selector, params) 67 | agent = ptan.agent.DQNAgent(net, selector, device=device) 68 | 69 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 70 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 71 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 72 | 73 | frame_idx = 0 74 | eval_states = None 75 | 76 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 77 | while True: 78 | frame_idx += 1 79 | buffer.populate(1) 80 | epsilon_tracker.frame(frame_idx) 81 | 82 | new_rewards = exp_source.pop_total_rewards() 83 | if new_rewards: 84 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 85 | break 86 | 87 | if len(buffer) < params['replay_initial']: 88 | continue 89 | if eval_states is None: 90 | eval_states = buffer.sample(STATES_TO_EVALUATE) 91 | eval_states = [np.array(transition.state, copy=False) for transition in eval_states] 92 | eval_states = np.array(eval_states, copy=False) 93 | 94 | optimizer.zero_grad() 95 | batch = buffer.sample(params['batch_size']) 96 | loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], device=device, 97 | double=args.double) 98 | loss_v.backward() 99 | optimizer.step() 100 | 101 | if frame_idx % params['target_net_sync'] == 0: 102 | tgt_net.sync() 103 | if frame_idx % EVAL_EVERY_FRAME == 0: 104 | mean_val = calc_values_of_states(eval_states, net, device=device) 105 | writer.add_scalar("values_mean", mean_val, frame_idx) 106 | 107 | -------------------------------------------------------------------------------- /EffRainbow/prio_buffer_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Benchmark various Priority Replay Buffer variants 4 | """ 5 | import timeit 6 | import numpy as np 7 | import collections 8 | 9 | SIZES = [10**n for n in (3, 4, 5)] 10 | DATA_SHAPE = (84, 84, 4) 11 | REPEAT_NUMBER = 10 12 | 13 | 14 | class PrioReplayBufferDeque: 15 | def __init__(self, buf_size, prob_alpha=0.6): 16 | self.prob_alpha = prob_alpha 17 | self.buffer = collections.deque(maxlen=buf_size) 18 | self.priorities = collections.deque(maxlen=buf_size) 19 | 20 | def __len__(self): 21 | return len(self.buffer) 22 | 23 | def append(self, sample): 24 | max_prio = max(self.priorities) if self.priorities else 1.0 25 | self.buffer.append(sample) 26 | self.priorities.append(max_prio) 27 | 28 | def sample(self, batch_size, 
beta=0.4): 29 | probs = np.array(self.priorities, dtype=np.float32) ** self.prob_alpha 30 | probs /= probs.sum() 31 | indices = np.random.choice(len(self.buffer), batch_size, p=probs, replace=True) 32 | samples = [self.buffer[idx] for idx in indices] 33 | total = len(self.buffer) 34 | weights = (total * probs[indices]) ** (-beta) 35 | weights /= weights.max() 36 | return samples, indices, weights 37 | 38 | def update_priorities(self, batch_indices, batch_priorities): 39 | for idx, prio in zip(batch_indices, batch_priorities): 40 | self.priorities[idx] = prio 41 | 42 | 43 | class PrioReplayBufferList: 44 | def __init__(self, buf_size, prob_alpha=0.6): 45 | self.prob_alpha = prob_alpha 46 | self.capacity = buf_size 47 | self.pos = 0 48 | self.buffer = [] 49 | self.priorities = np.zeros((buf_size, ), dtype=np.float32) 50 | 51 | def __len__(self): 52 | return len(self.buffer) 53 | 54 | def append(self, sample): 55 | max_prio = self.priorities.max() if self.buffer else 1.0 56 | if len(self.buffer) < self.capacity: 57 | self.buffer.append(sample) 58 | else: 59 | self.buffer[self.pos] = sample 60 | self.priorities[self.pos] = max_prio 61 | self.pos = (self.pos + 1) % self.capacity 62 | 63 | def sample(self, batch_size, beta=0.4): 64 | if len(self.buffer) == self.capacity: 65 | prios = self.priorities 66 | else: 67 | prios = self.priorities[:self.pos] 68 | probs = np.array(prios, dtype=np.float32) ** self.prob_alpha 69 | 70 | probs /= probs.sum() 71 | indices = np.random.choice(len(self.buffer), batch_size, p=probs, replace=True) 72 | samples = [self.buffer[idx] for idx in indices] 73 | total = len(self.buffer) 74 | weights = (total * probs[indices]) ** (-beta) 75 | weights /= weights.max() 76 | return samples, indices, weights 77 | 78 | def update_priorities(self, batch_indices, batch_priorities): 79 | for idx, prio in zip(batch_indices, batch_priorities): 80 | self.priorities[idx] = prio 81 | 82 | 83 | def fill_buf(buf, size): 84 | for _ in range(size): 85 | buf.append(np.zeros(DATA_SHAPE, dtype=np.uint8)) 86 | 87 | 88 | def bench_buffer(buf_class): 89 | print("Benchmarking %s" % buf_class.__name__) 90 | 91 | for size in SIZES: 92 | print(" Test size %d" % size) 93 | ns = globals() 94 | ns.update(locals()) 95 | t = timeit.timeit('fill_buf(buf, size)', setup='buf = buf_class(size)', number=REPEAT_NUMBER, globals=ns) 96 | print(" * Initial fill:\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 97 | buf = buf_class(size) 98 | fill_buf(buf, size) 99 | ns.update(locals()) 100 | t = timeit.timeit('fill_buf(buf, size)', number=REPEAT_NUMBER, globals=ns) 101 | print(" * Append:\t\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 102 | t = timeit.timeit('buf.sample(4)', number=REPEAT_NUMBER*100, globals=ns) 103 | print(" * Sample 4:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 104 | t = timeit.timeit('buf.sample(8)', number=REPEAT_NUMBER*100, globals=ns) 105 | print(" * Sample 8:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 106 | t = timeit.timeit('buf.sample(16)', number=REPEAT_NUMBER*100, globals=ns) 107 | print(" * Sample 16:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 108 | t = timeit.timeit('buf.sample(32)', number=REPEAT_NUMBER*100, globals=ns) 109 | print(" * Sample 32:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 110 | 111 | 112 | 113 | if __name__ == "__main__": 114 | bench_buffer(PrioReplayBufferList) 115 | bench_buffer(PrioReplayBufferDeque) 116 | pass 117 | -------------------------------------------------------------------------------- /EffRainbow/Wrappers.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | from collections import deque 5 | import cv2 6 | import random 7 | 8 | class ImageToPyTorch(gym.ObservationWrapper): 9 | """ 10 | Change image shape to CWH 11 | """ 12 | def __init__(self, env): 13 | super(ImageToPyTorch, self).__init__(env) 14 | old_shape = self.observation_space.shape 15 | print("Converting to torch...") 16 | print(old_shape) 17 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), 18 | dtype=np.float32) 19 | print(self.observation_space) 20 | 21 | def observation(self, observation): 22 | print("Pytorch processing") 23 | print(observation.shape) 24 | return np.swapaxes(observation, 2, 0) 25 | 26 | class ScaledFloatFrame(gym.ObservationWrapper): 27 | def observation(self, obs): 28 | # careful! This undoes the memory optimization, use 29 | # with smaller replay buffers only. 30 | return np.array(obs).astype(np.float32) / 255.0 31 | 32 | class LazyFrames(object): 33 | def __init__(self, frames): 34 | """This object ensures that common frames between the observations are only stored once. 35 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 36 | buffers. 37 | This object should only be converted to numpy array before being passed to the model. 38 | You'd not belive how complex the previous solution was.""" 39 | self._frames = frames 40 | 41 | def __array__(self, dtype=None): 42 | out = np.concatenate(self._frames, axis=0) 43 | if dtype is not None: 44 | out = out.astype(dtype) 45 | return out 46 | 47 | class ProcessFrame84(gym.ObservationWrapper): 48 | def __init__(self, env=None): 49 | super(ProcessFrame84, self).__init__(env) 50 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 51 | 52 | def observation(self, obs): 53 | return ProcessFrame84.process(obs) 54 | 55 | @staticmethod 56 | def process(frame): 57 | if frame.size == 210 * 160 * 3: 58 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 59 | elif frame.size == 250 * 160 * 3: 60 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 61 | else: 62 | assert False, "Unknown resolution." 63 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 64 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 65 | x_t = resized_screen[18:102, :] 66 | x_t = np.reshape(x_t, [84, 84]) 67 | return x_t.astype(np.uint8) 68 | 69 | class ProcessFrameUint(gym.ObservationWrapper): 70 | def __init__(self, env=None): 71 | super(ProcessFrameUint, self).__init__(env) 72 | self.observation_space = spaces.Box(low=0, high=255, shape=(60, 112, 1), dtype=np.uint8) 73 | 74 | def observation(self, obs): 75 | return ProcessFrameUint.process(obs) 76 | 77 | @staticmethod 78 | def process(im): 79 | print("Uint") 80 | print(im.shape) 81 | 82 | im = np.reshape(im, [60, 112, 1])#.astype(np.float32) 83 | #x_t = cv2.resize(im, (100, 54), interpolation=cv2.INTER_AREA) 84 | #x_t = np.reshape(x_t, [54, 100,1]) 85 | print(im.shape) 86 | 87 | return im.astype(np.uint8) 88 | 89 | class FrameStack(gym.Wrapper): 90 | def __init__(self, env, k): 91 | """Stack k last frames. 92 | Returns lazy array, which is much more memory efficient. 
93 | See Also 94 | -------- 95 | baselines.common.atari_wrappers.LazyFrames 96 | """ 97 | gym.Wrapper.__init__(self, env) 98 | self.k = k 99 | self.frames = deque([], maxlen=k) 100 | shp = env.observation_space.shape 101 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0]*k, shp[1], shp[2]), dtype=np.uint8) 102 | 103 | def reset(self): 104 | ob = self.env.reset() 105 | for _ in range(self.k): 106 | self.frames.append(ob) 107 | return self._get_ob() 108 | 109 | def step(self, action): 110 | ob, reward, done, info = self.env.step(action) 111 | self.frames.append(ob) 112 | return self._get_ob(), reward, done, info 113 | 114 | def _get_ob(self): 115 | assert len(self.frames) == self.k 116 | return LazyFrames(list(self.frames)) 117 | 118 | def wrap_env(env, stack_frames=4): 119 | 120 | env = ProcessFrameUint(env) 121 | env = ImageToPyTorch(env) 122 | env = FrameStack(env, stack_frames) 123 | 124 | return env 125 | 126 | def wrap_env_vec(env, stack_frames=4): 127 | 128 | env = ProcessFrameUint(env) 129 | env = ImageToPyTorch(env) 130 | env = FrameStack(env, stack_frames) 131 | 132 | return env 133 | -------------------------------------------------------------------------------- /FelkFork/Wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | from collections import deque 5 | import cv2 6 | 7 | class ImageToPyTorch(gym.ObservationWrapper): 8 | """ 9 | Change image shape to CWH 10 | """ 11 | def __init__(self, env): 12 | super(ImageToPyTorch, self).__init__(env) 13 | old_shape = self.observation_space.shape 14 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), 15 | dtype=np.float32) 16 | 17 | def observation(self, observation): 18 | return np.swapaxes(observation, 2, 0) 19 | 20 | class ScaledFloatFrame(gym.ObservationWrapper): 21 | def observation(self, obs): 22 | # careful! This undoes the memory optimization, use 23 | # with smaller replay buffers only. 24 | return np.array(obs).astype(np.float32) / 255.0 25 | 26 | class LazyFrames(object): 27 | def __init__(self, frames): 28 | """This object ensures that common frames between the observations are only stored once. 29 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 30 | buffers. 31 | This object should only be converted to numpy array before being passed to the model. 32 | You'd not belive how complex the previous solution was.""" 33 | self._frames = frames 34 | 35 | def __array__(self, dtype=None): 36 | out = np.concatenate(self._frames, axis=0) 37 | if dtype is not None: 38 | out = out.astype(dtype) 39 | return out 40 | 41 | class ProcessFrame84(gym.ObservationWrapper): 42 | def __init__(self, env=None): 43 | super(ProcessFrame84, self).__init__(env) 44 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) 45 | 46 | def observation(self, obs): 47 | return ProcessFrame84.process(obs) 48 | 49 | @staticmethod 50 | def process(frame): 51 | if frame.size == 210 * 160 * 3: 52 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 53 | elif frame.size == 250 * 160 * 3: 54 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 55 | else: 56 | assert False, "Unknown resolution." 
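# The next few lines implement the standard Atari-style frame preprocessing:
# convert RGB to grayscale using the ITU-R BT.601 luma weights (0.299, 0.587, 0.114),
# shrink the frame to 110x84 with area interpolation, keep the 84-row crop (rows 18-101)
# so the result is a square 84x84 image, and cast back to uint8 so stored frames stay compact.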
57 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 58 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 59 | x_t = resized_screen[18:102, :] 60 | x_t = np.reshape(x_t, [84, 84]) 61 | return x_t.astype(np.uint8) 62 | 63 | class ProcessFrame10054_2(gym.ObservationWrapper): 64 | def __init__(self, env=None): 65 | super(ProcessFrame10054_2, self).__init__(env) 66 | self.observation_space = spaces.Box(low=0, high=255, shape=(54, 100, 1), dtype=np.uint8) 67 | 68 | def observation(self, obs): 69 | return ProcessFrame10054_2.process(obs) 70 | 71 | @staticmethod 72 | def process(im): 73 | 74 | im = np.reshape(im, [108, 200, 1]).astype(np.float32) 75 | x_t = cv2.resize(im, (100, 54), interpolation=cv2.INTER_AREA) 76 | x_t = np.reshape(x_t, [54, 100,1]) 77 | 78 | return x_t.astype(np.uint8) 79 | 80 | class ProcessFrameUint(gym.ObservationWrapper): 81 | def __init__(self, env=None): 82 | super(ProcessFrameUint, self).__init__(env) 83 | self.observation_space = spaces.Box(low=0, high=255, shape=(78, 94, 1), dtype=np.uint8) 84 | 85 | def observation(self, obs): 86 | return ProcessFrameUint.process(obs) 87 | 88 | @staticmethod 89 | def process(im): 90 | 91 | im = np.reshape(im, [78, 94, 1])#.astype(np.float32) 92 | #x_t = cv2.resize(im, (100, 54), interpolation=cv2.INTER_AREA) 93 | #x_t = np.reshape(x_t, [54, 100,1]) 94 | 95 | return im.astype(np.uint8) 96 | 97 | class FrameStack(gym.Wrapper): 98 | def __init__(self, env, k): 99 | """Stack k last frames. 100 | Returns lazy array, which is much more memory efficient. 101 | See Also 102 | -------- 103 | baselines.common.atari_wrappers.LazyFrames 104 | """ 105 | gym.Wrapper.__init__(self, env) 106 | self.k = k 107 | self.frames = deque([], maxlen=k) 108 | shp = env.observation_space.shape 109 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0]*k, shp[1], shp[2]), dtype=np.uint8) 110 | 111 | def reset(self): 112 | ob = self.env.reset() 113 | for _ in range(self.k): 114 | self.frames.append(ob) 115 | return self._get_ob() 116 | 117 | def step(self, action): 118 | ob, reward, done, info = self.env.step(action) 119 | self.frames.append(ob) 120 | return self._get_ob(), reward, done, info 121 | 122 | def _get_ob(self): 123 | assert len(self.frames) == self.k 124 | return LazyFrames(list(self.frames)) 125 | 126 | 127 | 128 | 129 | def wrap_env(env, stack_frames=4): 130 | 131 | env = ProcessFrameUint(env) 132 | env = ImageToPyTorch(env) 133 | env = FrameStack(env, stack_frames) 134 | 135 | return env 136 | -------------------------------------------------------------------------------- /dist_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | sys.path.append("./") 4 | 5 | #from lib import common 6 | 7 | import matplotlib as mpl 8 | mpl.use("Agg") 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | Vmax = 10 13 | Vmin = -10 14 | N_ATOMS = 51 15 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 16 | 17 | 18 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 19 | """ 20 | Perform distribution projection aka Catergorical Algorithm from the 21 | "A Distributional Perspective on RL" paper 22 | """ 23 | batch_size = len(rewards) 24 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 25 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 26 | for atom in range(n_atoms): 27 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 28 | b_j = (tz_j - Vmin) / delta_z 29 | l = 
np.floor(b_j).astype(np.int64) 30 | u = np.ceil(b_j).astype(np.int64) 31 | eq_mask = u == l 32 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 33 | ne_mask = u != l 34 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 35 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 36 | if dones.any(): 37 | proj_distr[dones] = 0.0 38 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 39 | b_j = (tz_j - Vmin) / delta_z 40 | l = np.floor(b_j).astype(np.int64) 41 | u = np.ceil(b_j).astype(np.int64) 42 | eq_mask = u == l 43 | eq_dones = dones.copy() 44 | eq_dones[dones] = eq_mask 45 | if eq_dones.any(): 46 | proj_distr[eq_dones, l[eq_mask]] = 1.0 47 | ne_mask = u != l 48 | ne_dones = dones.copy() 49 | ne_dones[dones] = ne_mask 50 | if ne_dones.any(): 51 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 52 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 53 | return proj_distr 54 | 55 | def save_distr(src, proj, name): 56 | plt.clf() 57 | p = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 58 | plt.subplot(2, 1, 1) 59 | plt.bar(p, src, width=0.5) 60 | plt.title("Source") 61 | plt.subplot(2, 1, 2) 62 | plt.bar(p, proj, width=0.5) 63 | plt.title("Projected") 64 | plt.savefig(name + ".png") 65 | 66 | 67 | if __name__ == "__main__": 68 | np.random.seed(123) 69 | atoms = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 70 | 71 | # single peak distribution 72 | src_hist = np.zeros(shape=(1, N_ATOMS), dtype=np.float32) 73 | src_hist[0, N_ATOMS//2+1] = 1.0 74 | proj_hist = distr_projection(src_hist, np.array([2], dtype=np.float32), np.array([False]), 75 | Vmin, Vmax, N_ATOMS, gamma=0.9) 76 | 77 | save_distr(src_hist[0], proj_hist[0], "peak-r=2") 78 | 79 | 80 | # normal distribution 81 | data = np.random.normal(size=1000, scale=3) 82 | hist = np.histogram(data, normed=True, bins=np.arange(Vmin - DELTA_Z/2, Vmax + DELTA_Z*3/2, DELTA_Z)) 83 | 84 | src_hist = hist[0] 85 | proj_hist = distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([False]), 86 | Vmin, Vmax, N_ATOMS, gamma=0.9) 87 | save_distr(hist[0], proj_hist[0], "normal-r=2") 88 | #raise Exception("stop in the name of plod") 89 | 90 | # normal distribution, but done episode 91 | proj_hist = distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([True]), 92 | Vmin, Vmax, N_ATOMS, gamma=0.9) 93 | save_distr(hist[0], proj_hist[0], "normal-done-r=2") 94 | 95 | # clipping for out-of-range distribution 96 | proj_dist = distr_projection(np.array([src_hist]), np.array([10], dtype=np.float32), np.array([False]), 97 | Vmin, Vmax, N_ATOMS, gamma=0.9) 98 | save_distr(hist[0], proj_dist[0], "normal-r=10") 99 | 100 | # test both done and not done, unclipped 101 | proj_hist = distr_projection(np.array([src_hist, src_hist]), np.array([2, 2], dtype=np.float32), 102 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 103 | save_distr(src_hist, proj_hist[0], "both_not_clip-01-incomplete") 104 | save_distr(src_hist, proj_hist[1], "both_not_clip-02-complete") 105 | 106 | # test both done and not done, clipped right 107 | proj_hist = distr_projection(np.array([src_hist, src_hist]), np.array([10, 10], dtype=np.float32), 108 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 109 | save_distr(src_hist, proj_hist[0], "both_clip-right-01-incomplete") 110 | save_distr(src_hist, proj_hist[1], "both_clip-right-02-complete") 111 | 112 | # test both done and not done, clipped left 113 | proj_hist = distr_projection(np.array([src_hist, 
src_hist]), np.array([-10, -10], dtype=np.float32), 114 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 115 | save_distr(src_hist, proj_hist[0], "both_clip-left-01-incomplete") 116 | save_distr(src_hist, proj_hist[1], "both_clip-left-02-complete") 117 | 118 | pass 119 | -------------------------------------------------------------------------------- /Ptan/05_dqn_prio_replay.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import numpy as np 5 | import argparse 6 | 7 | import torch 8 | import torch.optim as optim 9 | 10 | from tensorboardX import SummaryWriter 11 | 12 | from lib import dqn_model, common 13 | 14 | PRIO_REPLAY_ALPHA = 0.6 15 | BETA_START = 0.4 16 | BETA_FRAMES = 100000 17 | 18 | 19 | class PrioReplayBuffer: 20 | def __init__(self, exp_source, buf_size, prob_alpha=0.6): 21 | self.exp_source_iter = iter(exp_source) 22 | self.prob_alpha = prob_alpha 23 | self.capacity = buf_size 24 | self.pos = 0 25 | self.buffer = [] 26 | self.priorities = np.zeros((buf_size, ), dtype=np.float32) 27 | 28 | def __len__(self): 29 | return len(self.buffer) 30 | 31 | def populate(self, count): 32 | max_prio = self.priorities.max() if self.buffer else 1.0 33 | for _ in range(count): 34 | sample = next(self.exp_source_iter) 35 | if len(self.buffer) < self.capacity: 36 | self.buffer.append(sample) 37 | else: 38 | self.buffer[self.pos] = sample 39 | self.priorities[self.pos] = max_prio 40 | self.pos = (self.pos + 1) % self.capacity 41 | 42 | def sample(self, batch_size, beta=0.4): 43 | if len(self.buffer) == self.capacity: 44 | prios = self.priorities 45 | else: 46 | prios = self.priorities[:self.pos] 47 | probs = prios ** self.prob_alpha 48 | 49 | probs /= probs.sum() 50 | indices = np.random.choice(len(self.buffer), batch_size, p=probs) 51 | samples = [self.buffer[idx] for idx in indices] 52 | total = len(self.buffer) 53 | weights = (total * probs[indices]) ** (-beta) 54 | weights /= weights.max() 55 | return samples, indices, np.array(weights, dtype=np.float32) 56 | 57 | def update_priorities(self, batch_indices, batch_priorities): 58 | for idx, prio in zip(batch_indices, batch_priorities): 59 | self.priorities[idx] = prio 60 | 61 | 62 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 63 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 64 | 65 | states_v = torch.tensor(states).to(device) 66 | next_states_v = torch.tensor(next_states).to(device) 67 | actions_v = torch.tensor(actions).to(device) 68 | rewards_v = torch.tensor(rewards).to(device) 69 | done_mask = torch.ByteTensor(dones).to(device) 70 | batch_weights_v = torch.tensor(batch_weights).to(device) 71 | 72 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 73 | next_state_values = tgt_net(next_states_v).max(1)[0] 74 | next_state_values[done_mask] = 0.0 75 | 76 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 77 | losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2 78 | return losses_v.mean(), losses_v + 1e-5 79 | 80 | 81 | if __name__ == "__main__": 82 | params = common.HYPERPARAMS['pong'] 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 85 | args = parser.parse_args() 86 | device = torch.device("cuda" if args.cuda else "cpu") 87 | 88 | env = gym.make(params['env_name']) 89 | env = 
ptan.common.wrappers.wrap_dqn(env) 90 | 91 | writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay") 92 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 93 | tgt_net = ptan.agent.TargetNet(net) 94 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 95 | epsilon_tracker = common.EpsilonTracker(selector, params) 96 | agent = ptan.agent.DQNAgent(net, selector, device=device) 97 | 98 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 99 | buffer = PrioReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 100 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 101 | 102 | frame_idx = 0 103 | beta = BETA_START 104 | 105 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 106 | while True: 107 | frame_idx += 1 108 | buffer.populate(1) 109 | epsilon_tracker.frame(frame_idx) 110 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 111 | 112 | new_rewards = exp_source.pop_total_rewards() 113 | if new_rewards: 114 | writer.add_scalar("beta", beta, frame_idx) 115 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 116 | break 117 | 118 | if len(buffer) < params['replay_initial']: 119 | continue 120 | 121 | optimizer.zero_grad() 122 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 123 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 124 | params['gamma'], device=device) 125 | loss_v.backward() 126 | optimizer.step() 127 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 128 | 129 | if frame_idx % params['target_net_sync'] == 0: 130 | tgt_net.sync() 131 | -------------------------------------------------------------------------------- /ButtonLib.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | class Button(): 3 | def __init__(self,x,y,width,height,colour,colourH,border=False,borderC=(0,0,0), 4 | text="",font=0,size=30,textColour = (0,0,0),stripe = None,stripeH = None,stripeThickness = None):#Initialise all of the variables 5 | fonts = pygame.font.get_fonts() 6 | 7 | self.x = x #x coordinate of top left corner 8 | self.y = y #y coordinate of top left corner 9 | self.width = width #width of the button 10 | self.height = height #height of the button 11 | self.colour = colour #colour of button when not hovered over 12 | self.colourC = colour #current colour of button 13 | self.colourNH = colour 14 | self.colourH = colourH #colour of button when hovered over 15 | self.borderC=borderC #colour of the border of the button, default is black 16 | self.border=False #Boolean to check if the button has a border 17 | if border: 18 | self.border=True 19 | self.text=text #Text of the button 20 | self.font=pygame.font.SysFont(fonts[font],size) #font of the button 21 | self.textColour = textColour 22 | self.stripeC = stripe 23 | self.stripe = stripe 24 | self.stripeH = stripeH 25 | self.stripeThickness = stripeThickness 26 | 27 | 28 | def click(self,pos,mouseUp):#method to check if button is pressed 29 | if self.hovering(pos):#Checks if mouse if over the button 30 | if mouseUp:#checks if left mouse button was pressed 31 | return True 32 | else: 33 | return False 34 | else: #Returns false if not over button or not clicked 35 | return False 36 | 37 | def hovering(self,pos):#Method to check if mouse is over the button 38 | if 
(pos[0]>self.x and pos[0]<(self.width+self.x) and pos[1]>self.y and pos[1]<(self.height+self.y)):#checks if the mouse is over the button 39 | self.colourC=self.colourH#changes colour to hover colour 40 | if self.stripe != None: 41 | self.stripeC = self.stripeH 42 | return True 43 | else: 44 | self.colourC=self.colour#changes colour back to base colour 45 | self.stripeC = self.stripe 46 | return False 47 | 48 | def create(self,screen):#Method to blit the button onto the screen 49 | if self.border:#Checks if the button has a border 50 | pygame.draw.rect(screen,self.borderC,(self.x-3,self.y-3,self.width+6,self.height+6),0)#creates a rectangle larger than the button 51 | pygame.draw.rect(screen,self.colourC,(self.x,self.y,self.width,self.height),0)#draws the button 52 | 53 | if self.stripe != None: 54 | pygame.draw.rect(screen,self.stripeC, 55 | (self.x,self.y + (self.height / 2) - int(self.stripeThickness / 2),self.width,self.stripeThickness),0) 56 | 57 | if self.text != "":#checks if there is text for the button 58 | text= self.font.render(self.text,1,self.textColour)#creates the text 59 | screen.blit(text,(self.x + (self.width/2 - text.get_width()/2), self.y + (self.height/2 - text.get_height()/2)))#draws the text in the centre of the button 60 | 61 | def getX(self): 62 | return self.x 63 | 64 | def getY(self): 65 | return self.y 66 | 67 | def getCorners(self):#Returns the coordinates of the four corners of the button 68 | return self.x,self.y,self.x+self.width,self.y+self.height 69 | 70 | def setCorners(self,co1,co2,co3,co4=(0,0)):#takes four coordinates to set the values of the button 71 | if co2[0]>co1[0] and co2[1]co3[0] and co3[0]==co1[0] and co3[1]>co1[1]:#checks if the coordinates create a valid rectangle 72 | self.x=co1[0] 73 | self.y=co1[1]#sets the values to create the button with the coordinates given 74 | self.width=co2[0]-co1[0] 75 | self.height=co3[1]-co1[1] 76 | 77 | def getWidth(self):#returns the value of the width 78 | return self.width 79 | 80 | def getHeight(self):#returns the value of the height 81 | return self.height 82 | 83 | def setWidth(self,width):#Sets the width of the button to the input 84 | self.width=width 85 | 86 | def setHeight(self,height):#Sets the height of the button to the input 87 | self.height=height 88 | 89 | def getColour(self):#returns the value of the current colour of the button 90 | return self.colourC 91 | 92 | def getColourH(self):#returns the value of the current colour of the button 93 | return self.colourH 94 | 95 | def getColourNH(self):#returns the value of the current colour of the button 96 | return self.colourNH 97 | 98 | def setColour(self,colour):#Sets the base colour of the button to the input 99 | self.colour=colour 100 | 101 | def setColourH(self,colour):#Sets the hover colour of the button to the input 102 | self.colourH=colour 103 | 104 | def setBorderColour(self,colour):#Sets the border colour of the button to the input 105 | self.borderC=colour 106 | 107 | def getText(self):#returns the value of the text for the button 108 | return self.text 109 | 110 | def setText(self,text):#Sets the text of the button to the input 111 | self.text=text 112 | 113 | def setFont(self,font,size=-1):#Sets the font of the button to the input 114 | if size>-1: 115 | self.font=pygame.font.SysFont(fonts[font],size) 116 | else: 117 | self.font=pygame.font.SysFont(fonts[font],26) 118 | -------------------------------------------------------------------------------- /EffRainbow/08_dqn_rainbow.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan_actions 4 | import ptan_agent 5 | import ptan_experience 6 | import argparse 7 | import numpy as np 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import time 13 | import torch.optim as optim 14 | 15 | from tensorboardX import SummaryWriter 16 | 17 | import networks 18 | from lib import dqn_model, common 19 | from DolphinEnvVec import DolphinEnvVec 20 | import keyboard 21 | # n-step 22 | REWARD_STEPS = 3 23 | 24 | # priority replay 25 | PRIO_REPLAY_ALPHA = 0.5 26 | BETA_START = 0.4 27 | BETA_FRAMES = 20000000 28 | 29 | # C51 30 | Vmax = 6 31 | Vmin = -2 32 | N_ATOMS = 51 33 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 34 | 35 | 36 | class RainbowDQN(nn.Module): 37 | def __init__(self, input_shape, n_actions): 38 | super(RainbowDQN, self).__init__() 39 | 40 | self.conv = nn.Sequential( 41 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 42 | nn.ReLU(), 43 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 44 | nn.ReLU(), 45 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 46 | nn.ReLU() 47 | ) 48 | 49 | conv_out_size = self._get_conv_out(input_shape) 50 | self.fc_adv = nn.Sequential( 51 | dqn_model.NoisyLinear(conv_out_size, 512), 52 | nn.ReLU(), 53 | dqn_model.NoisyLinear(512, n_actions) 54 | ) 55 | self.fc_val = nn.Sequential( 56 | dqn_model.NoisyLinear(conv_out_size, 512), 57 | nn.ReLU(), 58 | dqn_model.NoisyLinear(512, 1) 59 | ) 60 | 61 | def _get_conv_out(self, shape): 62 | o = self.conv(torch.zeros(1, *shape)) 63 | return int(np.prod(o.size())) 64 | 65 | def forward(self, x): 66 | fx = x.float() / 256 67 | conv_out = self.conv(fx).view(fx.size()[0], -1) 68 | val = self.fc_val(conv_out) 69 | adv = self.fc_adv(conv_out) 70 | return val + (adv - adv.mean(dim=1, keepdim=True)) 71 | 72 | if __name__ == "__main__": 73 | params = common.HYPERPARAMS['MarioBros'] 74 | #params['epsilon_frames'] *= 2 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 77 | 78 | args = parser.parse_args() 79 | #print(args.cuda) 80 | device = torch.device("cuda")# if args.cuda else "cpu" 81 | 82 | with open('pid_num.txt', 'w') as f: 83 | f.write(str(0)) 84 | 85 | #env = DolphinEnv() 86 | #env = wrap_env(env) 87 | 88 | vec_envs = DolphinEnvVec(4) 89 | 90 | #print("Vector Obs Shape: " + str(vec_envs.observation_space)) 91 | 92 | #envs.observation_space = gym.spaces.Box( 93 | #low=0, high=1, shape=(3,78, 94), dtype=np.uint8) 94 | 95 | #env = DolphinEnv(pid = 0) #gym.make(params['env_name']) 96 | #env = wrap_env(env,3) 97 | 98 | print(vec_envs.action_space) 99 | print(vec_envs.observation_space.shape) 100 | #env = ptan.common.wrappers.wrap_dqn_custom(env) 101 | 102 | #Test this code: 103 | #check to see if observations are uints or floats 104 | 105 | #raise Exception("stop") 106 | 107 | #need to copy and reshape network 108 | 109 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 110 | #net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 111 | #net = networks.ImpalaCNNSmall(env.observation_space.shape[0], env.action_space.n).to(device) 112 | net = networks.ImpalaCNNLarge(vec_envs.observation_space.shape[0],vec_envs.action_space.n).to(device) 113 | #net.load_checkpoint() 114 | 115 | tgt_net = ptan_agent.TargetNet(net) 116 | selector = 
ptan_actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'],eps_dec=params['epsilon_dec'],eps_min=params['epsilon_final']) 117 | #ptan_actions.StickyEpsilonGreedyActionSelector() 118 | agent = ptan_agent.DQNAgent(net, selector, device=device) 119 | 120 | exp_source = ptan_experience.ExperienceSourceFirstLast(vec_envs, agent, gamma=params['gamma'], steps_count=REWARD_STEPS,vectorized=True)# 121 | buffer = ptan_experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 122 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'], eps=0.0025/params['batch_size']) 123 | 124 | frame_idx = 0 125 | beta = BETA_START 126 | 127 | save_interval = 320000 128 | start_timer = time.time() 129 | 130 | scores = [] 131 | run_name = "ResultsItems.npy" 132 | 133 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 134 | while True: 135 | 136 | frame_idx += 8 137 | buffer.populate(8) 138 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 139 | 140 | new_rewards = exp_source.pop_total_rewards() 141 | if new_rewards: 142 | if reward_tracker.reward(new_rewards[0], frame_idx): 143 | break 144 | 145 | if frame_idx % 1600 == 0: 146 | print("Total FPS: " + str(round(frame_idx / (time.time() - start_timer),2))) 147 | 148 | if len(buffer) < params['replay_initial']: 149 | continue 150 | 151 | optimizer.zero_grad() 152 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 153 | 154 | loss_v, sample_prios_v = common.calc_loss_dqn(batch, batch_weights, net, tgt_net.target_model, 155 | params['gamma'] ** REWARD_STEPS, device=device) 156 | 157 | """loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 158 | params['gamma'] ** REWARD_STEPS, device=device)""" 159 | 160 | loss_v.backward() 161 | optimizer.step() 162 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 163 | 164 | if frame_idx % save_interval == 0: 165 | net.save_checkpoint() 166 | 167 | if frame_idx % params['target_net_sync'] == 0: 168 | tgt_net.sync() 169 | np.save(run_name, reward_tracker.get_scores()) 170 | 171 | 172 | -------------------------------------------------------------------------------- /EffRainbow/ptan_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent is something which converts states into actions and has state 3 | """ 4 | import copy 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | import ptan_actions as actions 10 | #from OutputViewer import OutputViewer 11 | 12 | 13 | class BaseAgent: 14 | """ 15 | Abstract Agent interface 16 | """ 17 | def initial_state(self): 18 | """ 19 | Should create initial empty state for the agent. It will be called for the start of the episode 20 | :return: Anything agent want to remember 21 | """ 22 | return None 23 | 24 | def __call__(self, states, agent_states): 25 | """ 26 | Convert observations and states into actions to take 27 | :param states: list of environment states to process 28 | :param agent_states: list of states with the same length as observations 29 | :return: tuple of actions, states 30 | """ 31 | assert isinstance(states, list) 32 | assert isinstance(agent_states, list) 33 | assert len(agent_states) == len(states) 34 | 35 | raise NotImplementedError 36 | 37 | 38 | def default_states_preprocessor(states): 39 | """ 40 | Convert list of states into the form suitable for model. 
By default the states are stacked into a numpy array and wrapped into a torch tensor 41 | :param states: list of numpy arrays with states 42 | :return: torch tensor 43 | """ 44 | if len(states) == 1: 45 | np_states = np.expand_dims(states[0], 0) 46 | else: 47 | np_states = np.array([np.array(s, copy=False) for s in states], copy=False) 48 | return torch.tensor(np_states) 49 | 50 | 51 | def float32_preprocessor(states): 52 | np_states = np.array(states, dtype=np.float32) 53 | return torch.tensor(np_states) 54 | 55 | 56 | class DQNAgent(BaseAgent): 57 | """ 58 | DQNAgent is a memoryless DQN agent which calculates Q values 59 | from the observations and converts them into the actions using action_selector 60 | """ 61 | def __init__(self, dqn_model, action_selector, device="cpu", preprocessor=default_states_preprocessor): 62 | self.dqn_model = dqn_model 63 | self.action_selector = action_selector 64 | self.preprocessor = preprocessor 65 | self.device = device 66 | #self.viewer = OutputViewer(["hLeft", "sLeft", "Wheel", "sRight", "hRight"]) 67 | 68 | def __call__(self, states, agent_states=None): 69 | if agent_states is None: 70 | agent_states = [None] * len(states) 71 | if self.preprocessor is not None: 72 | states = self.preprocessor(states) 73 | if torch.is_tensor(states): 74 | states = states.to(self.device) 75 | q_v = self.dqn_model(states) 76 | q = q_v.data.cpu().numpy() 77 | #self.viewer.update(list(q[0])) 78 | actions = self.action_selector(q) 79 | return actions, agent_states 80 | 81 | 82 | class TargetNet: 83 | """ 84 | Wrapper around a model which provides a copy of it instead of the live trained weights 85 | """ 86 | def __init__(self, model): 87 | self.model = model 88 | self.target_model = copy.deepcopy(model) 89 | 90 | def sync(self): 91 | self.target_model.load_state_dict(self.model.state_dict()) 92 | 93 | def alpha_sync(self, alpha): 94 | """ 95 | Blend params of target net with params from the model 96 | :param alpha: blending coefficient in (0, 1] 97 | """ 98 | assert isinstance(alpha, float) 99 | assert 0.0 < alpha <= 1.0 100 | state = self.model.state_dict() 101 | tgt_state = self.target_model.state_dict() 102 | for k, v in state.items(): 103 | tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v 104 | self.target_model.load_state_dict(tgt_state) 105 | 106 | 107 | class PolicyAgent(BaseAgent): 108 | """ 109 | Policy agent gets action probabilities from the model and samples actions from it 110 | """ 111 | # TODO: unify code with DQNAgent, as only the action selector differs.
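# Unlike DQNAgent, the model output here is interpreted as action probabilities
# (optionally passed through a softmax below) and an action is sampled from them
# via the probability-based selector, rather than chosen greedily over Q-values.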
112 | def __init__(self, model, action_selector=actions.ProbabilityActionSelector(), device="cpu", 113 | apply_softmax=False, preprocessor=default_states_preprocessor): 114 | self.model = model 115 | self.action_selector = action_selector 116 | self.device = device 117 | self.apply_softmax = apply_softmax 118 | self.preprocessor = preprocessor 119 | 120 | def __call__(self, states, agent_states=None): 121 | """ 122 | Return actions from given list of states 123 | :param states: list of states 124 | :return: list of actions 125 | """ 126 | if agent_states is None: 127 | agent_states = [None] * len(states) 128 | if self.preprocessor is not None: 129 | states = self.preprocessor(states) 130 | if torch.is_tensor(states): 131 | states = states.to(self.device) 132 | probs_v = self.model(states) 133 | if self.apply_softmax: 134 | probs_v = F.softmax(probs_v, dim=1) 135 | probs = probs_v.data.cpu().numpy() 136 | actions = self.action_selector(probs) 137 | return np.array(actions), agent_states 138 | 139 | 140 | class ActorCriticAgent(BaseAgent): 141 | """ 142 | Policy agent which returns policy and value tensors from observations. Value are stored in agent's state 143 | and could be reused for rollouts calculations by ExperienceSource. 144 | """ 145 | def __init__(self, model, action_selector=actions.ProbabilityActionSelector(), device="cpu", 146 | apply_softmax=False, preprocessor=default_states_preprocessor): 147 | self.model = model 148 | self.action_selector = action_selector 149 | self.device = device 150 | self.apply_softmax = apply_softmax 151 | self.preprocessor = preprocessor 152 | 153 | def __call__(self, states, agent_states=None): 154 | """ 155 | Return actions from given list of states 156 | :param states: list of states 157 | :return: list of actions 158 | """ 159 | if self.preprocessor is not None: 160 | states = self.preprocessor(states) 161 | if torch.is_tensor(states): 162 | states = states.to(self.device) 163 | probs_v, values_v = self.model(states) 164 | if self.apply_softmax: 165 | probs_v = F.softmax(probs_v, dim=1) 166 | probs = probs_v.data.cpu().numpy() 167 | actions = self.action_selector(probs) 168 | agent_states = values_v.data.squeeze().cpu().numpy().tolist() 169 | return np.array(actions), agent_states 170 | -------------------------------------------------------------------------------- /Ptan/08_dqn_rainbow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | from lib import dqn_model, common 16 | 17 | # n-step 18 | REWARD_STEPS = 3 19 | 20 | # priority replay 21 | PRIO_REPLAY_ALPHA = 0.6 22 | BETA_START = 0.4 23 | BETA_FRAMES = 1000000 24 | 25 | # C51 26 | Vmax = 10 27 | Vmin = -10 28 | N_ATOMS = 51 29 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 30 | 31 | 32 | class RainbowDQN(nn.Module): 33 | def __init__(self, input_shape, n_actions): 34 | super(RainbowDQN, self).__init__() 35 | 36 | self.conv = nn.Sequential( 37 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 38 | nn.ReLU(), 39 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 40 | nn.ReLU(), 41 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 42 | nn.ReLU() 43 | ) 44 | 45 | conv_out_size = self._get_conv_out(input_shape) 46 | self.fc_val = nn.Sequential( 47 | dqn_model.NoisyLinear(conv_out_size, 256), 48 
| nn.ReLU(), 49 | dqn_model.NoisyLinear(256, N_ATOMS) 50 | ) 51 | 52 | self.fc_adv = nn.Sequential( 53 | dqn_model.NoisyLinear(conv_out_size, 256), 54 | nn.ReLU(), 55 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 56 | ) 57 | 58 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 59 | self.softmax = nn.Softmax(dim=1) 60 | 61 | def _get_conv_out(self, shape): 62 | o = self.conv(torch.zeros(1, *shape)) 63 | return int(np.prod(o.size())) 64 | 65 | def forward(self, x): 66 | batch_size = x.size()[0] 67 | fx = x.float() / 256 68 | conv_out = self.conv(fx).view(batch_size, -1) 69 | val_out = self.fc_val(conv_out).view(batch_size, 1, N_ATOMS) 70 | adv_out = self.fc_adv(conv_out).view(batch_size, -1, N_ATOMS) 71 | adv_mean = adv_out.mean(dim=1, keepdim=True) 72 | return val_out + (adv_out - adv_mean) 73 | 74 | def both(self, x): 75 | cat_out = self(x) 76 | probs = self.apply_softmax(cat_out) 77 | weights = probs * self.supports 78 | res = weights.sum(dim=2) 79 | return cat_out, res 80 | 81 | def qvals(self, x): 82 | return self.both(x)[1] 83 | 84 | def apply_softmax(self, t): 85 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 86 | 87 | 88 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 89 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 90 | batch_size = len(batch) 91 | 92 | states_v = torch.tensor(states).to(device) 93 | actions_v = torch.tensor(actions).to(device) 94 | next_states_v = torch.tensor(next_states).to(device) 95 | batch_weights_v = torch.tensor(batch_weights).to(device) 96 | 97 | # next state distribution 98 | # dueling arch -- actions from main net, distr from tgt_net 99 | 100 | # calc at once both next and cur states 101 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 102 | next_qvals_v = qvals_v[batch_size:] 103 | distr_v = distr_v[:batch_size] 104 | 105 | next_actions_v = next_qvals_v.max(1)[1] 106 | next_distr_v = tgt_net(next_states_v) 107 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 108 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 109 | next_best_distr = next_best_distr_v.data.cpu().numpy() 110 | 111 | dones = dones.astype(bool) 112 | 113 | # project our distribution using Bellman update 114 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 115 | 116 | # calculate net output 117 | state_action_values = distr_v[range(batch_size), actions_v.data] 118 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 119 | proj_distr_v = torch.tensor(proj_distr).to(device) 120 | 121 | loss_v = -state_log_sm_v * proj_distr_v 122 | loss_v = batch_weights_v * loss_v.sum(dim=1) 123 | return loss_v.mean(), loss_v + 1e-5 124 | 125 | 126 | if __name__ == "__main__": 127 | params = common.HYPERPARAMS['invaders'] 128 | params['epsilon_frames'] *= 2 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 131 | 132 | args = parser.parse_args() 133 | #print(args.cuda) 134 | device = torch.device("cuda")# if args.cuda else "cpu" 135 | 136 | env = gym.make(params['env_name']) 137 | print(env.action_space) 138 | print(env.observation_space.shape) 139 | env = ptan.common.wrappers.wrap_dqn(env) 140 | print(env.observation_space.shape) 141 | #env = ptan.common.wrappers.wrap_dqn_custom(env) 142 | 143 | #Test this code: 144 | #check to see if observations are uints or floats 145 | #print(env.action_space) 146 | 
#print(env.observation_space.shape) 147 | raise Exception("stop") 148 | 149 | #need to copy and reshape network 150 | 151 | 152 | writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 153 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 154 | tgt_net = ptan.agent.TargetNet(net) 155 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device) 156 | 157 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 158 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 159 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 160 | 161 | frame_idx = 0 162 | beta = BETA_START 163 | 164 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 165 | while True: 166 | frame_idx += 1 167 | buffer.populate(1) 168 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 169 | 170 | new_rewards = exp_source.pop_total_rewards() 171 | if new_rewards: 172 | if reward_tracker.reward(new_rewards[0], frame_idx): 173 | break 174 | 175 | if len(buffer) < params['replay_initial']: 176 | continue 177 | 178 | optimizer.zero_grad() 179 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 180 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 181 | params['gamma'] ** REWARD_STEPS, device=device) 182 | loss_v.backward() 183 | optimizer.step() 184 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 185 | 186 | if frame_idx % params['target_net_sync'] == 0: 187 | tgt_net.sync() 188 | -------------------------------------------------------------------------------- /Ptan/RainbowMarioKart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import time 11 | import torch.optim as optim 12 | 13 | from tensorboardX import SummaryWriter 14 | 15 | from lib import dqn_model, common 16 | from MarioKartEnvPtan import MarioKartEnv 17 | 18 | # n-step 19 | REWARD_STEPS = 3 20 | 21 | # priority replay 22 | PRIO_REPLAY_ALPHA = 0.6 23 | BETA_START = 0.4 24 | BETA_FRAMES = 3000000 25 | 26 | # C51 27 | Vmax = 16 28 | Vmin = -2 29 | N_ATOMS = 51 30 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 31 | 32 | 33 | class RainbowDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(RainbowDQN, self).__init__() 36 | 37 | self.start = time.time() 38 | 39 | self.conv = nn.Sequential( 40 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 41 | nn.ReLU(), 42 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 43 | nn.ReLU(), 44 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 45 | nn.ReLU() 46 | ) 47 | 48 | #print(input_shape) 49 | conv_out_size = self._get_conv_out(input_shape) 50 | self.fc_val = nn.Sequential( 51 | dqn_model.NoisyLinear(conv_out_size, 256), 52 | nn.ReLU(), 53 | dqn_model.NoisyLinear(256, N_ATOMS) 54 | ) 55 | 56 | self.fc_adv = nn.Sequential( 57 | dqn_model.NoisyLinear(conv_out_size, 256), 58 | nn.ReLU(), 59 | dqn_model.NoisyLinear(256, n_actions * N_ATOMS) 60 | ) 61 | 62 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 63 | self.softmax = nn.Softmax(dim=1) 64 | 65 | def _get_conv_out(self, shape): 66 | o = 
self.conv(torch.zeros(1, *shape)) 67 | return int(np.prod(o.size())) 68 | 69 | def forward(self, x): 70 | batch_size = x.size()[0] 71 | fx = x.float() / 256 72 | conv_out = self.conv(fx).view(batch_size, -1) 73 | val_out = self.fc_val(conv_out).view(batch_size, 1, N_ATOMS) 74 | adv_out = self.fc_adv(conv_out).view(batch_size, -1, N_ATOMS) 75 | adv_mean = adv_out.mean(dim=1, keepdim=True) 76 | return val_out + (adv_out - adv_mean) 77 | 78 | def both(self, x): 79 | cat_out = self(x) 80 | probs = self.apply_softmax(cat_out) 81 | weights = probs * self.supports 82 | res = weights.sum(dim=2) 83 | return cat_out, res 84 | 85 | def qvals(self, x): 86 | return self.both(x)[1] 87 | 88 | def apply_softmax(self, t): 89 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 90 | 91 | def save_checkpoint(self): 92 | #print('... saving checkpoint ...') 93 | torch.save(self.state_dict(), "current_model" + str(int(time.time() - self.start))) 94 | 95 | def load_checkpoint(self): 96 | #print('... loading checkpoint ...') 97 | self.load_state_dict(torch.load("current_model235392")) 98 | 99 | 100 | def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): 101 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 102 | batch_size = len(batch) 103 | 104 | states_v = torch.tensor(states).to(device) 105 | actions_v = torch.tensor(actions).to(device) 106 | next_states_v = torch.tensor(next_states).to(device) 107 | batch_weights_v = torch.tensor(batch_weights).to(device) 108 | 109 | # next state distribution 110 | # dueling arch -- actions from main net, distr from tgt_net 111 | 112 | # calc at once both next and cur states 113 | distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) 114 | next_qvals_v = qvals_v[batch_size:] 115 | distr_v = distr_v[:batch_size] 116 | 117 | next_actions_v = next_qvals_v.max(1)[1] 118 | next_distr_v = tgt_net(next_states_v) 119 | next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] 120 | next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) 121 | next_best_distr = next_best_distr_v.data.cpu().numpy() 122 | 123 | dones = dones.astype(bool) 124 | 125 | # project our distribution using Bellman update 126 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 127 | 128 | # calculate net output 129 | state_action_values = distr_v[range(batch_size), actions_v.data] 130 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 131 | proj_distr_v = torch.tensor(proj_distr).to(device) 132 | 133 | loss_v = -state_log_sm_v * proj_distr_v 134 | loss_v = batch_weights_v * loss_v.sum(dim=1) 135 | return loss_v.mean(), loss_v + 1e-5 136 | 137 | 138 | if __name__ == "__main__": 139 | params = common.HYPERPARAMS['MarioKart'] 140 | params['epsilon_frames'] *= 2 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 143 | 144 | args = parser.parse_args() 145 | #print(args.cuda) 146 | device = torch.device("cuda")# if args.cuda else "cpu" 147 | 148 | env = MarioKartEnv() #gym.make(params['env_name']) 149 | print(env.action_space) 150 | print(env.observation_space.shape) 151 | env = ptan.common.wrappers.wrap_dqn_custom(env) 152 | print(env.observation_space.shape) 153 | #env = ptan.common.wrappers.wrap_dqn_custom(env) 154 | 155 | #Test this code: 156 | #check to see if observations are uints or floats 157 | 158 | #raise Exception("stop") 159 | 160 | #need to copy and reshape network 161 | 162 | writer = 
SummaryWriter(comment="-" + params['run_name'] + "-rainbow") 163 | net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device) 164 | #net.load_checkpoint() 165 | 166 | tgt_net = ptan.agent.TargetNet(net) 167 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.EpsilonGreedyActionSelector(), device=device)#ptan.actions.ArgmaxActionSelector() 168 | 169 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS) 170 | buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'], PRIO_REPLAY_ALPHA) 171 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 172 | 173 | frame_idx = 0 174 | beta = BETA_START 175 | 176 | save_interval = 30000 177 | 178 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 179 | while True: 180 | frame_idx += 1 181 | buffer.populate(1) 182 | beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES) 183 | 184 | new_rewards = exp_source.pop_total_rewards() 185 | if new_rewards: 186 | if reward_tracker.reward(new_rewards[0], frame_idx): 187 | break 188 | 189 | if len(buffer) < params['replay_initial']: 190 | continue 191 | 192 | optimizer.zero_grad() 193 | batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta) 194 | loss_v, sample_prios_v = calc_loss(batch, batch_weights, net, tgt_net.target_model, 195 | params['gamma'] ** REWARD_STEPS, device=device) 196 | loss_v.backward() 197 | optimizer.step() 198 | buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy()) 199 | 200 | if frame_idx % save_interval == 0: 201 | net.save_checkpoint() 202 | 203 | if frame_idx % params['target_net_sync'] == 0: 204 | tgt_net.sync() 205 | -------------------------------------------------------------------------------- /Ptan/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | HYPERPARAMS = { 9 | 'pong': { 10 | 'env_name': "PongNoFrameskip-v4", 11 | 'stop_reward': 18.0, 12 | 'run_name': 'pong', 13 | 'replay_size': 250000, 14 | 'replay_initial': 10000, 15 | 'target_net_sync': 32000, 16 | 'epsilon_frames': 10**5, 17 | 'epsilon_start': 1.0, 18 | 'epsilon_final': 0.02, 19 | 'learning_rate': 6.25e-5, 20 | 'gamma': 0.99, 21 | 'batch_size': 32 22 | }, 23 | 'breakout-small': { 24 | 'env_name': "BreakoutNoFrameskip-v4", 25 | 'stop_reward': 500.0, 26 | 'run_name': 'breakout-small', 27 | 'replay_size': 3*10 ** 5, 28 | 'replay_initial': 20000, 29 | 'target_net_sync': 1000, 30 | 'epsilon_frames': 10 ** 6, 31 | 'epsilon_start': 1.0, 32 | 'epsilon_final': 0.1, 33 | 'learning_rate': 0.0001, 34 | 'gamma': 0.99, 35 | 'batch_size': 64 36 | }, 37 | 'breakout': { 38 | 'env_name': "BreakoutNoFrameskip-v4", 39 | 'stop_reward': 500.0, 40 | 'run_name': 'breakout', 41 | 'replay_size': 10 ** 6, 42 | 'replay_initial': 50000, 43 | 'target_net_sync': 10000, 44 | 'epsilon_frames': 10 ** 6, 45 | 'epsilon_start': 1.0, 46 | 'epsilon_final': 0.1, 47 | 'learning_rate': 0.00025, 48 | 'gamma': 0.99, 49 | 'batch_size': 32 50 | }, 51 | 'invaders': { 52 | 'env_name': "SpaceInvadersNoFrameskip-v4", 53 | 'stop_reward': 50000.0, 54 | 'run_name': 'breakout', 55 | 'replay_size': 1000000, 56 | 'replay_initial': 80000, 57 | 'target_net_sync': 32000, 58 | 'epsilon_frames': 10 ** 5, 59 | 'epsilon_start': 1.0, 60 | 'epsilon_final': 0.1, 61 | 'learning_rate': 6.25e-5, 62 | 'gamma': 0.99, 
63 | 'batch_size': 32 64 | }, 65 | 'MarioKart': { 66 | 'env_name': "MarioKartEnv", 67 | 'stop_reward': 5000000.0, 68 | 'run_name': 'MarioKart1', 69 | 'replay_size': 1000000, 70 | 'replay_initial': 30000, 71 | 'target_net_sync': 32000, 72 | 'epsilon_frames': 10 ** 5, 73 | 'epsilon_start': 1.0, 74 | 'epsilon_final': 0.1, 75 | 'learning_rate': 0.0001,#6.25e-5 76 | 'gamma': 0.99, 77 | 'batch_size': 32 78 | }, 79 | } 80 | 81 | 82 | def unpack_batch(batch): 83 | states, actions, rewards, dones, last_states = [], [], [], [], [] 84 | for exp in batch: 85 | state = np.array(exp.state, copy=False) 86 | states.append(state) 87 | actions.append(exp.action) 88 | rewards.append(exp.reward) 89 | dones.append(exp.last_state is None) 90 | if exp.last_state is None: 91 | last_states.append(state) # the result will be masked anyway 92 | else: 93 | last_states.append(np.array(exp.last_state, copy=False)) 94 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 95 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 96 | 97 | 98 | def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"): 99 | states, actions, rewards, dones, next_states = unpack_batch(batch) 100 | 101 | states_v = torch.tensor(states).to(device) 102 | next_states_v = torch.tensor(next_states).to(device) 103 | actions_v = torch.tensor(actions).to(device) 104 | rewards_v = torch.tensor(rewards).to(device) 105 | done_mask = torch.ByteTensor(dones).to(device) 106 | 107 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 108 | next_state_values = tgt_net(next_states_v).max(1)[0] 109 | next_state_values[done_mask] = 0.0 110 | 111 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 112 | return nn.MSELoss()(state_action_values, expected_state_action_values) 113 | 114 | 115 | class RewardTracker: 116 | def __init__(self, writer, stop_reward): 117 | self.writer = writer 118 | self.stop_reward = stop_reward 119 | 120 | def __enter__(self): 121 | self.ts = time.time() 122 | self.ts_frame = 0 123 | self.total_rewards = [] 124 | return self 125 | 126 | def __exit__(self, *args): 127 | self.writer.close() 128 | 129 | def reward(self, reward, frame, epsilon=None): 130 | self.total_rewards.append(reward) 131 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 132 | self.ts_frame = frame 133 | self.ts = time.time() 134 | mean_reward = np.mean(self.total_rewards[-100:]) 135 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 136 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 137 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 138 | )) 139 | sys.stdout.flush() 140 | if epsilon is not None: 141 | self.writer.add_scalar("epsilon", epsilon, frame) 142 | self.writer.add_scalar("speed", speed, frame) 143 | self.writer.add_scalar("reward_100", mean_reward, frame) 144 | self.writer.add_scalar("reward", reward, frame) 145 | if mean_reward > self.stop_reward: 146 | print("Solved in %d frames!" 
% frame) 147 | return True 148 | return False 149 | 150 | 151 | class EpsilonTracker: 152 | def __init__(self, epsilon_greedy_selector, params): 153 | self.epsilon_greedy_selector = epsilon_greedy_selector 154 | self.epsilon_start = params['epsilon_start'] 155 | self.epsilon_final = params['epsilon_final'] 156 | self.epsilon_frames = params['epsilon_frames'] 157 | self.frame(0) 158 | 159 | def frame(self, frame): 160 | self.epsilon_greedy_selector.epsilon = \ 161 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 162 | 163 | 164 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 165 | """ 166 | Perform distribution projection aka Catergorical Algorithm from the 167 | "A Distributional Perspective on RL" paper 168 | """ 169 | batch_size = len(rewards) 170 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 171 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 172 | for atom in range(n_atoms): 173 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 174 | b_j = (tz_j - Vmin) / delta_z 175 | l = np.floor(b_j).astype(np.int64) 176 | u = np.ceil(b_j).astype(np.int64) 177 | eq_mask = u == l 178 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 179 | ne_mask = u != l 180 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 181 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 182 | if dones.any(): 183 | proj_distr[dones] = 0.0 184 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 185 | b_j = (tz_j - Vmin) / delta_z 186 | l = np.floor(b_j).astype(np.int64) 187 | u = np.ceil(b_j).astype(np.int64) 188 | eq_mask = u == l 189 | eq_dones = dones.copy() 190 | eq_dones[dones] = eq_mask 191 | if eq_dones.any(): 192 | proj_distr[eq_dones, l[eq_mask]] = 1.0 193 | ne_mask = u != l 194 | ne_dones = dones.copy() 195 | ne_dones[dones] = ne_mask 196 | if ne_dones.any(): 197 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 198 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 199 | return proj_distr 200 | -------------------------------------------------------------------------------- /PER_old.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class SumTree(object): 5 | data_pointer = 0 6 | 7 | # Here we initialize the tree with all nodes = 0, and initialize the data with all values = 0 8 | def __init__(self, capacity): 9 | # Number of leaf nodes (final nodes) that contains experiences 10 | self.capacity = capacity 11 | 12 | # Generate the tree with all nodes values = 0 13 | # To understand this calculation (2 * capacity - 1) look at the schema below 14 | # Remember we are in a binary node (each node has max 2 children) so 2x size of leaf (capacity) - 1 (root node) 15 | # Parent nodes = capacity - 1 16 | # Leaf nodes = capacity 17 | self.tree = np.zeros(2 * capacity - 1) 18 | 19 | # Contains the experiences (so the size of data is capacity) 20 | self.data = np.zeros(capacity, dtype=object) 21 | 22 | # Here we define function that will add our priority score in the sumtree leaf and add the experience in data: 23 | def add(self, priority, data): 24 | 25 | # Look at what index we want to put the experience 26 | tree_index = self.data_pointer + self.capacity - 1 27 | 28 | # Update data frame 29 | self.data[self.data_pointer] = data 30 | 31 | # Update the leaf 32 | self.update (tree_index, priority) 33 | 34 | # Add 1 to data_pointer 35 | self.data_pointer += 1 
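# Worked example of the index mapping above: with capacity = 4 the tree has
# 2 * 4 - 1 = 7 nodes, parents occupy indices 0-2 and leaves occupy 3-6, so the
# sample written while data_pointer was 0 lives at tree_index 0 + 4 - 1 = 3.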
36 | 37 | if self.data_pointer >= self.capacity: # If we're above the capacity, we go back to first index (we overwrite) 38 | self.data_pointer = 0 39 | 40 | # Update the leaf priority score and propagate the change through tree 41 | def update(self, tree_index, priority): 42 | # Change = new priority score - former priority score 43 | change = priority - self.tree[tree_index] 44 | self.tree[tree_index] = priority 45 | 46 | # then propagate the change through tree 47 | # this method is faster than the recursive loop in the reference code 48 | while tree_index != 0: 49 | tree_index = (tree_index - 1) // 2 50 | self.tree[tree_index] += change 51 | 52 | # Here build a function to get a leaf from our tree. So we'll build a function to get the leaf_index, priority value of that leaf and experience associated with that leaf index: 53 | def get_leaf(self, v): 54 | parent_index = 0 55 | 56 | # the while loop is faster than the method in the reference code 57 | while True: 58 | left_child_index = 2 * parent_index + 1 59 | right_child_index = left_child_index + 1 60 | 61 | # If we reach bottom, end the search 62 | if left_child_index >= len(self.tree): 63 | leaf_index = parent_index 64 | break 65 | else: # downward search, always search for a higher priority node 66 | if v <= self.tree[left_child_index]: 67 | parent_index = left_child_index 68 | else: 69 | v -= self.tree[left_child_index] 70 | parent_index = right_child_index 71 | 72 | data_index = leaf_index - self.capacity + 1 73 | 74 | return leaf_index, self.tree[leaf_index], self.data[data_index] 75 | 76 | @property 77 | def total_priority(self): 78 | return self.tree[0] # Returns the root node 79 | 80 | # Now we finished constructing our SumTree object, next we'll build a memory object. 81 | class ReplayMemory(object): # stored as ( state, action, reward, next_state ) in SumTree 82 | PER_e = 0.01 # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken 83 | PER_a = 0.6 # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly 84 | PER_b = 0.4 # importance-sampling, from initial value increasing to 1 85 | 86 | PER_b_increment_per_sampling = 0.001 87 | 88 | absolute_error_upper = 1. # clipped abs error 89 | 90 | def __init__(self, shape,capacity,batch_size): 91 | # Making the tree 92 | self.tree = SumTree(capacity) 93 | self.mem_cntr = 0 94 | self.shape = shape 95 | self.batch_size = batch_size 96 | self.capacity = capacity 97 | 98 | def is_sufficient(self): 99 | return self.mem_cntr > self.batch_size 100 | 101 | # Next, we define a function to store a new experience in our tree. 102 | # Each new experience will have a score of max_prority (it will be then improved when we use this exp to train our DDQN). 103 | def store_transition(self, state, action, reward, state_, terminal): 104 | 105 | experience = [state, action, reward, state_, terminal] 106 | 107 | # Find the max priority 108 | self.mem_cntr += 1 109 | max_priority = np.max(self.tree.tree[-self.tree.capacity:]) 110 | 111 | # If the max priority = 0 we can't put priority = 0 since this experience will never have a chance to be selected 112 | # So we use a minimum priority 113 | if max_priority == 0: 114 | max_priority = self.absolute_error_upper 115 | 116 | self.tree.add(max_priority, experience) # set the max priority for new priority 117 | 118 | # Now we create sample function, which will be used to pick batch from our tree memory, which will be used to train our model. 
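    # (Concretely, with total_priority = 12 and batch_size = 4, the priority ranges
    # described below are [0, 3), [3, 6), [6, 9) and [9, 12), and one value is
    # drawn uniformly from each of them.)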
119 | # - First, we sample a minibatch of n size, the range [0, priority_total] into priority ranges. 120 | # - Then a value is uniformly sampled from each range. 121 | # - Then we search in the sumtree, for the experience where priority score correspond to sample values are retrieved from. 122 | def sample_memory(self): 123 | # Create a miself.batch_minibatch array that will contains the minibatch 124 | states = np.zeros((self.batch_size, *self.shape), 125 | dtype=np.float32) 126 | actions = np.zeros(self.batch_size, dtype=np.int64) 127 | rewards = np.zeros(self.batch_size, dtype=np.float32) 128 | states_ = np.zeros((self.batch_size, *self.shape), 129 | dtype=np.float32) 130 | terminal = np.zeros(self.batch_size, dtype=np.bool) 131 | 132 | b_idx = np.empty((self.batch_size,), dtype=np.int32) 133 | 134 | # Calculate the priority segment 135 | # Here, as explained in the paper, we divide the Range[0, ptotal] into n ranges 136 | priority_segment = self.tree.total_priority / self.batch_size # priority segment 137 | 138 | for i in range(self.batch_size): 139 | # A value is uniformly sample from each range 140 | a, b = priority_segment * i, priority_segment * (i + 1) 141 | data = 0 142 | count = 0 143 | while data == 0: 144 | count += 1 145 | value = np.random.uniform(a, b) 146 | 147 | # Experience that correspond to each value is retrieved 148 | index, priority, data = self.tree.get_leaf(value) 149 | 150 | if count > 50: 151 | raise Exception("Couldnt get non 0 value from tree") 152 | 153 | b_idx[i]= index 154 | 155 | states[i] = data[0] 156 | actions[i] = data[1] 157 | rewards[i] = data[2] 158 | states_[i] = data[3] 159 | terminal[i] = data[4] 160 | 161 | #minibatch.append([data[0],data[1],data[2],data[3],data[4]]) 162 | 163 | return b_idx, states,actions,rewards,states_,terminal 164 | 165 | # Update the priorities on the tree 166 | def batch_update(self, tree_idx, abs_errors): 167 | abs_errors += self.PER_e # convert to abs and avoid 0 168 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper) 169 | ps = np.power(clipped_errors, self.PER_a) 170 | 171 | for ti, p in zip(tree_idx, ps): 172 | self.tree.update(ti, p) 173 | -------------------------------------------------------------------------------- /PER.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Node: 4 | def __init__(self, left, right, is_leaf: bool = False, idx = None): 5 | self.left = left 6 | self.right = right 7 | self.is_leaf = is_leaf 8 | if not self.is_leaf: 9 | self.value = self.left.value + self.right.value 10 | self.parent = None 11 | self.idx = idx # this value is only set for leaf nodes 12 | if left is not None: 13 | left.parent = self 14 | if right is not None: 15 | right.parent = self 16 | @classmethod 17 | def create_leaf(cls, value, idx): 18 | leaf = cls(None, None, is_leaf=True, idx=idx) 19 | leaf.value = value 20 | return leaf 21 | 22 | def create_tree(input: list): 23 | nodes = [Node.create_leaf(v, i) for i, v in enumerate(input)] 24 | leaf_nodes = nodes 25 | while len(nodes) > 1: 26 | inodes = iter(nodes) 27 | nodes = [Node(*pair) for pair in zip(inodes, inodes)] 28 | return nodes[0], leaf_nodes 29 | 30 | def retrieve(value: float, node: Node): 31 | if node.is_leaf: 32 | return node 33 | if node.left.value >= value: 34 | return retrieve(value, node.left) 35 | else: 36 | return retrieve(value - node.left.value, node.right) 37 | 38 | def update(node: Node, new_value: float): 39 | change = new_value - node.value 40 | node.value = 
new_value 41 | propagate_changes(change, node.parent) 42 | 43 | def propagate_changes(change: float, node: Node): 44 | node.value += change 45 | if node.parent is not None: 46 | propagate_changes(change, node.parent) 47 | 48 | class ReplayMemory: 49 | def __init__(self, input_dims, max_mem, batch_size): 50 | 51 | self.alpha = 0.6 52 | self.beta = 0.4 53 | self.beta_steps = 180000 54 | self.beta_inc = (1 - self.beta) / self.beta_steps 55 | self.eps = 0.01 56 | 57 | self.mem_size = max_mem 58 | self.batch_size = batch_size 59 | self.mem_cntr = 0 60 | 61 | self.state_memory = np.zeros((self.mem_size, *input_dims), 62 | dtype=np.float32) 63 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 64 | dtype=np.float32) 65 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 66 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 67 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 68 | 69 | priorities = np.zeros((self.mem_size, ), dtype=np.float32) 70 | 71 | self.root_node, self.leaf_nodes = create_tree(priorities) 72 | self.max_priority = -1.0 73 | self.absolute_error_upper = 1.0 74 | 75 | def store_transition(self, state, action, reward, state_, terminal): 76 | self.beta = min(self.beta + self.beta_inc,1) 77 | 78 | index = self.mem_cntr % self.mem_size 79 | self.state_memory[index] = state 80 | self.action_memory[index] = action 81 | self.reward_memory[index] = reward 82 | self.new_state_memory[index] = state_ 83 | self.terminal_memory[index] = terminal 84 | 85 | update(self.leaf_nodes[index], abs(self.max_priority)) 86 | 87 | self.mem_cntr += 1 88 | 89 | def sample_memory(self): 90 | max_mem = min(self.mem_cntr, self.mem_size) 91 | """ 92 | 93 | if self.mem_cntr > self.mem_size: 94 | prios = self.priorities 95 | else: 96 | prios = self.priorities[:self.mem_cntr]""" 97 | tree_total = self.root_node.value 98 | indices = [] 99 | probs = [] 100 | for i in range(self.batch_size): 101 | rand_val = np.random.uniform(0, tree_total) 102 | 103 | leaf = retrieve(rand_val, self.root_node) 104 | 105 | indices.append(leaf.idx) 106 | 107 | ###remove here IS 108 | #probs.append(leaf.value / tree_total) 109 | 110 | #and here IS 111 | #probs = np.array(probs,dtype=np.float32) 112 | 113 | states = self.state_memory[indices] 114 | actions = self.action_memory[indices] 115 | rewards = self.reward_memory[indices] 116 | new_states = self.new_state_memory[indices] 117 | terminals = self.terminal_memory[indices] 118 | 119 | #Both of these IS 120 | #weights = (max_mem * probs) ** (-self.beta) 121 | #weights /= weights.max() 122 | 123 | #last bit here 124 | return states, actions, rewards, new_states, terminals, indices#, np.array(weights, dtype=np.float32) 125 | 126 | def batch_update(self,batch_indices, batch_priorities): 127 | #print(batch_priorities.type) 128 | 129 | #batch_priorities += self.eps 130 | #batch_priorities = np.minimum(batch_priorities, self.absolute_error_upper) 131 | 132 | #batch_priorities = np.power(batch_priorities, self.alpha) 133 | 134 | self.max_priority = max(self.max_priority, max(batch_priorities)) 135 | 136 | for idx, prio in zip(batch_indices, batch_priorities): 137 | update(self.leaf_nodes[idx], prio) 138 | 139 | def is_sufficient(self): 140 | return self.mem_cntr > self.batch_size 141 | 142 | 143 | ################################################ the below implementation doesnt use tree 144 | class ReplayMemoryBuffer: 145 | def __init__(self, input_dims, max_mem, batch_size): 146 | 147 | self.alpha = 0.6 148 | self.beta = 0.4 149 | 
self.beta_steps = 50000 150 | self.beta_inc = (1 - self.beta) / self.beta_steps 151 | self.eps = 1e-5 152 | 153 | self.mem_size = max_mem 154 | self.batch_size = batch_size 155 | self.mem_cntr = 0 156 | 157 | self.state_memory = np.zeros((self.mem_size, *input_dims), 158 | dtype=np.float32) 159 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 160 | dtype=np.float32) 161 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 162 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 163 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 164 | 165 | self.priorities = np.zeros((self.mem_size, ), dtype=np.float32) 166 | 167 | 168 | def store_transition(self, state, action, reward, state_, terminal): 169 | self.beta = min(self.beta + self.beta_inc,1) 170 | 171 | max_prio = self.priorities.max() if self.mem_cntr > 0 else 1.0 172 | 173 | index = self.mem_cntr % self.mem_size 174 | self.state_memory[index] = state 175 | self.action_memory[index] = action 176 | self.reward_memory[index] = reward 177 | self.new_state_memory[index] = state_ 178 | self.terminal_memory[index] = terminal 179 | 180 | self.priorities[index] = max_prio 181 | 182 | self.mem_cntr += 1 183 | 184 | def sample_memory(self): 185 | 186 | max_mem = min(self.mem_cntr, self.mem_size) 187 | 188 | if self.mem_cntr > self.mem_size: 189 | prios = self.priorities 190 | else: 191 | prios = self.priorities[:self.mem_cntr] 192 | 193 | probs = prios ** self.alpha 194 | probs /= probs.sum() 195 | indices = np.random.choice(max_mem, self.batch_size, p=probs) 196 | 197 | states = self.state_memory[indices] 198 | actions = self.action_memory[indices] 199 | rewards = self.reward_memory[indices] 200 | new_states = self.new_state_memory[indices] 201 | terminals = self.terminal_memory[indices] 202 | 203 | weights = (max_mem * probs[indices]) ** (-self.beta) 204 | weights /= weights.max() 205 | 206 | return states, actions, rewards, new_states, terminals, indices, np.array(weights, dtype=np.float32) 207 | 208 | def batch_update(self,batch_indices, batch_priorities): 209 | for idx, prio in zip(batch_indices, batch_priorities): 210 | self.priorities[idx] = prio 211 | 212 | def is_sufficient(self): 213 | return self.mem_cntr > self.batch_size 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /Ptan/07_dqn_distrib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import numpy as np 5 | import argparse 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | 12 | from tensorboardX import SummaryWriter 13 | 14 | from lib import common 15 | 16 | SAVE_STATES_IMG = False 17 | SAVE_TRANSITIONS_IMG = False 18 | 19 | if SAVE_STATES_IMG or SAVE_TRANSITIONS_IMG: 20 | import matplotlib as mpl 21 | mpl.use("Agg") 22 | import matplotlib.pylab as plt 23 | 24 | Vmax = 10 25 | Vmin = -10 26 | N_ATOMS = 51 27 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 28 | 29 | STATES_TO_EVALUATE = 1000 30 | EVAL_EVERY_FRAME = 100 31 | 32 | 33 | class DistributionalDQN(nn.Module): 34 | def __init__(self, input_shape, n_actions): 35 | super(DistributionalDQN, self).__init__() 36 | 37 | self.conv = nn.Sequential( 38 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 39 | nn.ReLU(), 40 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 41 | nn.ReLU(), 42 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 43 | nn.ReLU() 44 | ) 45 | 46 | 
conv_out_size = self._get_conv_out(input_shape) 47 | self.fc = nn.Sequential( 48 | nn.Linear(conv_out_size, 512), 49 | nn.ReLU(), 50 | nn.Linear(512, n_actions * N_ATOMS) 51 | ) 52 | 53 | self.register_buffer("supports", torch.arange(Vmin, Vmax+DELTA_Z, DELTA_Z)) 54 | self.softmax = nn.Softmax(dim=1) 55 | 56 | def _get_conv_out(self, shape): 57 | o = self.conv(torch.zeros(1, *shape)) 58 | return int(np.prod(o.size())) 59 | 60 | def forward(self, x): 61 | batch_size = x.size()[0] 62 | fx = x.float() / 256 63 | conv_out = self.conv(fx).view(batch_size, -1) 64 | fc_out = self.fc(conv_out) 65 | return fc_out.view(batch_size, -1, N_ATOMS) 66 | 67 | def both(self, x): 68 | cat_out = self(x) 69 | probs = self.apply_softmax(cat_out) 70 | weights = probs * self.supports 71 | res = weights.sum(dim=2) 72 | return cat_out, res 73 | 74 | def qvals(self, x): 75 | return self.both(x)[1] 76 | 77 | def apply_softmax(self, t): 78 | return self.softmax(t.view(-1, N_ATOMS)).view(t.size()) 79 | 80 | 81 | def calc_values_of_states(states, net, device="cpu"): 82 | mean_vals = [] 83 | for batch in np.array_split(states, 64): 84 | states_v = torch.tensor(batch).to(device) 85 | action_values_v = net.qvals(states_v) 86 | best_action_values_v = action_values_v.max(1)[0] 87 | mean_vals.append(best_action_values_v.mean().item()) 88 | return np.mean(mean_vals) 89 | 90 | 91 | def save_state_images(frame_idx, states, net, device="cpu", max_states=200): 92 | ofs = 0 93 | p = np.arange(Vmin, Vmax + DELTA_Z, DELTA_Z) 94 | for batch in np.array_split(states, 64): 95 | states_v = torch.tensor(batch).to(device) 96 | action_prob = net.apply_softmax(net(states_v)).data.cpu().numpy() 97 | batch_size, num_actions, _ = action_prob.shape 98 | for batch_idx in range(batch_size): 99 | plt.clf() 100 | for action_idx in range(num_actions): 101 | plt.subplot(num_actions, 1, action_idx+1) 102 | plt.bar(p, action_prob[batch_idx, action_idx], width=0.5) 103 | plt.savefig("states/%05d_%08d.png" % (ofs + batch_idx, frame_idx)) 104 | ofs += batch_size 105 | if ofs >= max_states: 106 | break 107 | 108 | 109 | def save_transition_images(batch_size, predicted, projected, next_distr, dones, rewards, save_prefix): 110 | for batch_idx in range(batch_size): 111 | is_done = dones[batch_idx] 112 | reward = rewards[batch_idx] 113 | plt.clf() 114 | p = np.arange(Vmin, Vmax + DELTA_Z, DELTA_Z) 115 | plt.subplot(3, 1, 1) 116 | plt.bar(p, predicted[batch_idx], width=0.5) 117 | plt.title("Predicted") 118 | plt.subplot(3, 1, 2) 119 | plt.bar(p, projected[batch_idx], width=0.5) 120 | plt.title("Projected") 121 | plt.subplot(3, 1, 3) 122 | plt.bar(p, next_distr[batch_idx], width=0.5) 123 | plt.title("Next state") 124 | suffix = "" 125 | if reward != 0.0: 126 | suffix = suffix + "_%.0f" % reward 127 | if is_done: 128 | suffix = suffix + "_done" 129 | plt.savefig("%s_%02d%s.png" % (save_prefix, batch_idx, suffix)) 130 | 131 | 132 | def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None): 133 | states, actions, rewards, dones, next_states = common.unpack_batch(batch) 134 | batch_size = len(batch) 135 | 136 | states_v = torch.tensor(states).to(device) 137 | actions_v = torch.tensor(actions).to(device) 138 | next_states_v = torch.tensor(next_states).to(device) 139 | 140 | # next state distribution 141 | next_distr_v, next_qvals_v = tgt_net.both(next_states_v) 142 | next_actions = next_qvals_v.max(1)[1].data.cpu().numpy() 143 | next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy() 144 | 145 | next_best_distr = 
next_distr[range(batch_size), next_actions] 146 | dones = dones.astype(np.bool) 147 | 148 | # project our distribution using Bellman update 149 | proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma) 150 | 151 | # calculate net output 152 | distr_v = net(states_v) 153 | state_action_values = distr_v[range(batch_size), actions_v.data] 154 | state_log_sm_v = F.log_softmax(state_action_values, dim=1) 155 | proj_distr_v = torch.tensor(proj_distr).to(device) 156 | 157 | if save_prefix is not None: 158 | pred = F.softmax(state_action_values, dim=1).data.cpu().numpy() 159 | save_transition_images(batch_size, pred, proj_distr, next_best_distr, dones, rewards, save_prefix) 160 | 161 | loss_v = -state_log_sm_v * proj_distr_v 162 | return loss_v.sum(dim=1).mean() 163 | 164 | 165 | if __name__ == "__main__": 166 | params = common.HYPERPARAMS['pong'] 167 | # params['epsilon_frames'] *= 2 168 | parser = argparse.ArgumentParser() 169 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 170 | args = parser.parse_args() 171 | device = torch.device("cuda" if args.cuda else "cpu") 172 | 173 | env = gym.make(params['env_name']) 174 | env = ptan.common.wrappers.wrap_dqn(env) 175 | 176 | writer = SummaryWriter(comment="-" + params['run_name'] + "-distrib") 177 | net = DistributionalDQN(env.observation_space.shape, env.action_space.n).to(device) 178 | 179 | tgt_net = ptan.agent.TargetNet(net) 180 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start']) 181 | epsilon_tracker = common.EpsilonTracker(selector, params) 182 | agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), selector, device=device) 183 | 184 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=1) 185 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size']) 186 | optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) 187 | 188 | frame_idx = 0 189 | eval_states = None 190 | prev_save = 0 191 | save_prefix = None 192 | 193 | with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: 194 | while True: 195 | frame_idx += 1 196 | buffer.populate(1) 197 | epsilon_tracker.frame(frame_idx) 198 | 199 | new_rewards = exp_source.pop_total_rewards() 200 | if new_rewards: 201 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 202 | break 203 | 204 | if len(buffer) < params['replay_initial']: 205 | continue 206 | 207 | if eval_states is None: 208 | eval_states = buffer.sample(STATES_TO_EVALUATE) 209 | eval_states = [np.array(transition.state, copy=False) for transition in eval_states] 210 | eval_states = np.array(eval_states, copy=False) 211 | 212 | optimizer.zero_grad() 213 | batch = buffer.sample(params['batch_size']) 214 | 215 | save_prefix = None 216 | if SAVE_TRANSITIONS_IMG: 217 | interesting = any(map(lambda s: s.last_state is None or s.reward != 0.0, batch)) 218 | if interesting and frame_idx // 30000 > prev_save: 219 | save_prefix = "images/img_%08d" % frame_idx 220 | prev_save = frame_idx // 30000 221 | 222 | loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'], 223 | device=device, save_prefix=save_prefix) 224 | loss_v.backward() 225 | optimizer.step() 226 | 227 | if frame_idx % params['target_net_sync'] == 0: 228 | tgt_net.sync() 229 | 230 | if frame_idx % EVAL_EVERY_FRAME == 0: 231 | mean_val = calc_values_of_states(eval_states, net, device=device) 232 | 
writer.add_scalar("values_mean", mean_val, frame_idx) 233 | 234 | if SAVE_STATES_IMG and frame_idx % 10000 == 0: 235 | save_state_images(frame_idx, eval_states, net, device=device) 236 | -------------------------------------------------------------------------------- /EffRainbow/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import kornia 7 | from torchvision.utils import save_image 8 | 9 | HYPERPARAMS = { 10 | 'pong': { 11 | 'env_name': "PongNoFrameskip-v4", 12 | 'stop_reward': 18.0, 13 | 'run_name': 'pong', 14 | 'replay_size': 100000, 15 | 'replay_initial': 10000, 16 | 'target_net_sync': 1000, 17 | 'epsilon_frames': 10**5, 18 | 'epsilon_start': 1.0, 19 | 'epsilon_final': 0.02, 20 | 'learning_rate': 0.0001, 21 | 'gamma': 0.99, 22 | 'batch_size': 32 23 | }, 24 | 'breakout-small': { 25 | 'env_name': "BreakoutNoFrameskip-v4", 26 | 'stop_reward': 500.0, 27 | 'run_name': 'breakout-small', 28 | 'replay_size': 3*10 ** 5, 29 | 'replay_initial': 20000, 30 | 'target_net_sync': 1000, 31 | 'epsilon_frames': 10 ** 6, 32 | 'epsilon_start': 1.0, 33 | 'epsilon_final': 0.1, 34 | 'learning_rate': 0.0001, 35 | 'gamma': 0.99, 36 | 'batch_size': 64 37 | }, 38 | 'breakout': { 39 | 'env_name': "BreakoutNoFrameskip-v4", 40 | 'stop_reward': 500.0, 41 | 'run_name': 'breakout', 42 | 'replay_size': 10 ** 6, 43 | 'replay_initial': 50000, 44 | 'target_net_sync': 10000, 45 | 'epsilon_frames': 10 ** 6, 46 | 'epsilon_start': 1.0, 47 | 'epsilon_final': 0.1, 48 | 'learning_rate': 0.00025, 49 | 'gamma': 0.99, 50 | 'batch_size': 32 51 | }, 52 | 'invaders': { 53 | 'env_name': "SpaceInvadersNoFrameskip-v4", 54 | 'stop_reward': 500.0, 55 | 'run_name': 'breakout', 56 | 'replay_size': 10 ** 6, 57 | 'replay_initial': 50000, 58 | 'target_net_sync': 10000, 59 | 'epsilon_frames': 10 ** 6, 60 | 'epsilon_start': 1.0, 61 | 'epsilon_final': 0.1, 62 | 'learning_rate': 0.00025, 63 | 'gamma': 0.99, 64 | 'batch_size': 32 65 | }, 66 | 67 | 'MarioBros': { 68 | 'env_name': "MarioBrosEnv", 69 | 'stop_reward': 5000000.0, 70 | 'run_name': 'MarioKart1', 71 | 'replay_size': 870000, 72 | 'replay_initial': 80000, #80k 73 | 'target_net_sync': 32000, 74 | 'epsilon_dec': 1.98e-6, 75 | 'epsilon_start': 1.0,#1.0 76 | 'epsilon_final': 0.01,#0.01 77 | 'learning_rate': 0.00025,# 78 | 'gamma': 0.99, 79 | 'batch_size': 32 #used to be 256 80 | }, 81 | } 82 | 83 | def unpack_batch(batch): 84 | states, actions, rewards, dones, last_states = [], [], [], [], [] 85 | for exp in batch: 86 | state = np.array(exp.state, copy=False) 87 | states.append(state) 88 | actions.append(exp.action) 89 | rewards.append(exp.reward) 90 | dones.append(exp.last_state is None) 91 | if exp.last_state is None: 92 | last_states.append(state) # the result will be masked anyway 93 | else: 94 | last_states.append(np.array(exp.last_state, copy=False)) 95 | return np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32), \ 96 | np.array(dones, dtype=np.uint8), np.array(last_states, copy=False) 97 | 98 | 99 | class Intensity(nn.Module): 100 | def __init__(self, scale): 101 | super().__init__() 102 | self.scale = scale 103 | 104 | def forward(self, x): 105 | r = torch.randn((x.size(0), 1, 1, 1), device=x.device) 106 | noise = 1.0 + (self.scale * r.clamp(-2.0, 2.0)) 107 | return x * noise 108 | 109 | 110 | AUG = nn.Sequential(nn.ReplicationPad2d(4), 111 | kornia.augmentation.RandomCrop((140, 75)), 112 | Intensity(scale=0.1)) 
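# Minimal usage sketch for the augmentation pipeline defined above. The helper name
# below is illustrative only and is not referenced elsewhere; it assumes a float image
# batch shaped (N, C, H, W) that is large enough, after the 4-pixel replication
# padding, for a (140, 75) crop, as in DrQ-style augmented DQN updates.
def _augment_batch_example(states_v: torch.Tensor) -> torch.Tensor:
    # ReplicationPad2d(4) pads each border by 4 px, RandomCrop takes a random
    # (height=140, width=75) window, and Intensity jitters pixel values by a
    # clipped Gaussian scale factor.
    return AUG(states_v.float())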
113 | 114 | def calc_loss_dqn(batch, batch_weights,net, tgt_net, gamma, device="cpu"): 115 | states, actions, rewards, dones, next_states = unpack_batch(batch) 116 | 117 | states_v = torch.tensor(states).to(device) 118 | next_states_v = torch.tensor(next_states).to(device) 119 | actions_v = torch.tensor(actions).to(device) 120 | rewards_v = torch.tensor(rewards).to(device) 121 | done_mask = torch.ByteTensor(dones).to(device) 122 | batch_weights_v = torch.tensor(batch_weights).to(device) 123 | 124 | """ #This code very much needs checking to see if images are broken 125 | save_image(states_v[0].float(), 'img_size_test.png') 126 | 127 | states_v = aug_trans(states_v) 128 | next_states_v = aug_trans(next_states_v) 129 | next_states_policy = aug_trans(next_states_v) 130 | 131 | save_image(states_v[0], 'img1_aug.png') 132 | raise Exception("stop") 133 | """ 134 | 135 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 136 | next_state_values = tgt_net(next_states_v).max(1)[0] 137 | next_state_values[done_mask] = 0.0 138 | 139 | """ 140 | state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 141 | next_state_values = tgt_net(next_states_v).max(1)[0]""" 142 | 143 | expected_state_action_values = next_state_values.detach() * gamma + rewards_v 144 | losses_v = batch_weights_v * (state_action_values - expected_state_action_values) ** 2 145 | 146 | return losses_v.mean(), losses_v + 1e-5 147 | 148 | class RewardTracker: 149 | def __init__(self, writer, stop_reward): 150 | self.writer = writer 151 | self.stop_reward = stop_reward 152 | self.time = time.time() 153 | self.scores = [] 154 | 155 | def __enter__(self): 156 | self.ts = time.time() 157 | self.ts_frame = 0 158 | self.total_rewards = [] 159 | return self 160 | 161 | def __exit__(self, *args): 162 | self.writer.close() 163 | 164 | def reward(self, reward, frame, epsilon=None): 165 | self.total_rewards.append(reward) 166 | speed = (frame - self.ts_frame) / (time.time() - self.ts + 0.000001) 167 | self.ts_frame = frame 168 | self.ts = time.time() 169 | mean_reward = np.mean(self.total_rewards[-100:]) 170 | print("%d frames: Played %d games, Avg reward %.3f, Time: %.3f Hours" % ( 171 | frame, len(self.total_rewards), mean_reward, (self.ts - self.time) / 3600 172 | )) 173 | 174 | self.scores.append([reward,self.ts_frame,time.time() - self.time]) 175 | 176 | sys.stdout.flush() 177 | if epsilon is not None: 178 | self.writer.add_scalar("epsilon", epsilon, frame) 179 | self.writer.add_scalar("speed", speed, frame) 180 | self.writer.add_scalar("reward_100", mean_reward, frame) 181 | self.writer.add_scalar("reward", reward, frame) 182 | if mean_reward > self.stop_reward: 183 | print("Solved in %d frames!" 
% frame) 184 | return True 185 | return False 186 | 187 | def get_scores(self): 188 | return np.array(self.scores) 189 | 190 | 191 | class EpsilonTracker: 192 | def __init__(self, epsilon_greedy_selector, params): 193 | self.epsilon_greedy_selector = epsilon_greedy_selector 194 | self.epsilon_start = params['epsilon_start'] 195 | self.epsilon_final = params['epsilon_final'] 196 | self.epsilon_frames = params['epsilon_frames'] 197 | self.frame(0) 198 | 199 | def frame(self, frame): 200 | self.epsilon_greedy_selector.epsilon = \ 201 | max(self.epsilon_final, self.epsilon_start - frame / self.epsilon_frames) 202 | 203 | 204 | def distr_projection(next_distr, rewards, dones, Vmin, Vmax, n_atoms, gamma): 205 | """ 206 | Perform distribution projection aka Catergorical Algorithm from the 207 | "A Distributional Perspective on RL" paper 208 | """ 209 | batch_size = len(rewards) 210 | proj_distr = np.zeros((batch_size, n_atoms), dtype=np.float32) 211 | delta_z = (Vmax - Vmin) / (n_atoms - 1) 212 | for atom in range(n_atoms): 213 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma)) 214 | b_j = (tz_j - Vmin) / delta_z 215 | l = np.floor(b_j).astype(np.int64) 216 | u = np.ceil(b_j).astype(np.int64) 217 | eq_mask = u == l 218 | proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom] 219 | ne_mask = u != l 220 | proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask] 221 | proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask] 222 | if dones.any(): 223 | proj_distr[dones] = 0.0 224 | tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones])) 225 | b_j = (tz_j - Vmin) / delta_z 226 | l = np.floor(b_j).astype(np.int64) 227 | u = np.ceil(b_j).astype(np.int64) 228 | eq_mask = u == l 229 | eq_dones = dones.copy() 230 | eq_dones[dones] = eq_mask 231 | if eq_dones.any(): 232 | proj_distr[eq_dones, l[eq_mask]] = 1.0 233 | ne_mask = u != l 234 | ne_dones = dones.copy() 235 | ne_dones[dones] = ne_mask 236 | if ne_dones.any(): 237 | proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask] 238 | proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask] 239 | return proj_distr 240 | -------------------------------------------------------------------------------- /FelkFork/DolphinEnv.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import shared_memory 2 | import numpy as np 3 | import time 4 | import os 5 | from PIL import Image 6 | import gym 7 | import random 8 | from copy import deepcopy 9 | import cv2 10 | import warnings 11 | from Wrappers import wrap_env 12 | import subprocess 13 | os.chdir('/home/tyler/Documents/EfficientZero') 14 | 15 | warnings.filterwarnings("ignore") 16 | 17 | """ 18 | This program implements the standard gym MDP 19 | 20 | However, it will use shared memory to access 21 | data from DolphinSideScript: 22 | Rewards 23 | terminals 24 | states 25 | 26 | It will also need to send to DolphinSideScript: 27 | actions 28 | 29 | """ 30 | #Ymem = 108 31 | #Xmem = 200 32 | Ymem = 78 33 | Xmem = 94 34 | 35 | class DolphinEnv(gym.Env): 36 | def __init__(self): 37 | 38 | """ 39 | shared mem is in following format: 40 | 41 | This needs to be changed to this format: 42 | 43 | arr = np.zeros((101,60),dtype=np.float32) 44 | 45 | arr[0][0] = Dtimestep 46 | arr[0][1] = Etimestep 47 | arr[0][2] = action 48 | arr[0][3] = reward 49 | arr[0][4] = terminal 50 | 51 | arr[1:] = state 52 | 53 | """ 54 | 55 | self.observation_space = gym.spaces.Box( 56 | low=0, high=1, 
shape=(Ymem, Xmem), dtype=np.uint8) 57 | self.action_space = gym.spaces.Discrete(9) 58 | self.action_space.n = 9 59 | 60 | self.reward_range = (-100, 100) 61 | self.metadata = None 62 | self.initialised = False 63 | 64 | def real_init(self): 65 | 66 | with open('pid_num.txt') as f: 67 | pid = int(f.readlines()[0]) + 1 68 | 69 | self.pid = pid 70 | self.offset = 0 71 | 72 | 73 | #write to file with pid number 74 | 75 | with open('pid_num.txt', 'w') as f: 76 | f.write(str(self.pid)) 77 | 78 | print("My PID: " + str(self.pid)) 79 | 80 | self.timestep = 0. 81 | self.init = True 82 | 83 | self.data = np.zeros((Ymem + 1,Xmem),dtype=np.float32) 84 | print("Data Array") 85 | print(self.data) 86 | 87 | self.shm = shared_memory.SharedMemory(create=True,size=self.data.nbytes,name = 'p' + str(pid)) 88 | 89 | print("Saving to shared mem") 90 | self.shm_array = np.ndarray(self.data.shape, dtype=self.data.dtype, buffer=self.shm.buf) 91 | self.shm_array[:] = self.data[:] 92 | 93 | print("Launching Dolphin") 94 | 95 | """cmd1 = 'cmd /c C:\\Users\\TYLER\\Downloads\\RLJourney\\DolphinNew\\dolphin' 96 | cmd2 = '\\Binary\\x64\\Dolphin.exe --no-python-subinterpreters --script C:/Users/TYLER/Downloads/RLJourney/DolphinNew/DolphinSideScript.py \\b --exec="C:\\Users\\TYLER\\Downloads\\GameCollection\\' 97 | cmd3 = 'SuperSmashBros.Brawl(Europe)(En,Fr,De,Es,It).nkit.gcz"' 98 | #cmd3 = '\\games\\Mario Kart Wii (USA) (En,Fr,Es).nkit.iso"' 99 | #cmd /c C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\Dolphin.exe --script C:/Users/TYLER/Downloads/DolphinRevamp/DolphinSideScript.py \\b --exec="C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\games\\NewSuperMarioBros.Wii(Europe)(En,Fr,De,Es,It)(Rev 1).nkit.gcz" 100 | 101 | #launch dolphin 102 | os.popen(cmd1 + str(pid) + cmd2 + cmd3)""" 103 | 104 | cmd = 'cd ~/Documents/dolphin' + str(pid) + '/build/Binaries && ./dolphin-emu --no-python-subinterpreters\ 105 | --script /home/tyler/Documents/WiiRL/FelkFork/DolphinSideScriptTanks.py\ 106 | --exec="/home/tyler/Documents/GameCollection/Wii Play (Europe) (En,Fr,De,Es,It).nkit.gcz"' 107 | 108 | subprocess.Popen(cmd, shell=True) 109 | time.sleep(4) 110 | 111 | print("Dolphin Launched Successfully") 112 | 113 | def get_max_episode_steps(self): 114 | return 1000 115 | 116 | def restart(self): 117 | with open('pid_num.txt', 'w') as f: 118 | f.write(str(self.pid)) 119 | 120 | self.timestep = 0. 
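        # Unlike real_init(), which creates the shared-memory block named
        # 'p' + str(pid) with create=True, restart() re-attaches to the block that
        # already exists (create=False below) and relaunches Dolphin after a crash,
        # reusing the same PID slot.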
121 | self.init = True 122 | 123 | self.data = np.zeros((Ymem + 1,Xmem),dtype=np.float32) 124 | print("Data Array") 125 | print(self.data) 126 | 127 | self.shm = shared_memory.SharedMemory(create=False,size=self.data.nbytes,name = 'p' + str(self.pid)) 128 | 129 | print("Saving to shared mem") 130 | self.shm_array = np.ndarray(self.data.shape, dtype=self.data.dtype, buffer=self.shm.buf) 131 | self.shm_array[:] = self.data[:] 132 | 133 | print("Launching Dolphin After Crash...") 134 | 135 | cmd1 = 'cmd /c C:\\Users\\TYLER\\Downloads\\RLJourney\\DolphinNew\\dolphin' 136 | cmd2 = '\\Binary\\x64\\Dolphin.exe --no-python-subinterpreters --script C:/Users/TYLER/Downloads/RLJourney/DolphinNew/DolphinSideScriptTanks.py \\b --exec="C:\\Users\\TYLER\\Downloads\\GameCollection\\' 137 | cmd3 = 'SuperSmashBros.Brawl(Europe)(En,Fr,De,Es,It).nkit.gcz"' 138 | #cmd3 = '\\games\\Mario Kart Wii (USA) (En,Fr,Es).nkit.iso"' 139 | #cmd /c C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\Dolphin.exe --script C:/Users/TYLER/Downloads/DolphinRevamp/DolphinSideScript.py \\b --exec="C:\\Users\\TYLER\\Downloads\\DolphinRevamp\\dolphinScript0\\games\\NewSuperMarioBros.Wii(Europe)(En,Fr,De,Es,It)(Rev 1).nkit.gcz" 140 | 141 | #launch dolphin 142 | os.popen(cmd1 + str(self.pid) + cmd2 + cmd3) 143 | 144 | time.sleep(4) 145 | 146 | def reset(self): 147 | 148 | if not self.initialised: 149 | self.real_init() 150 | self.initialised = True 151 | #sync 152 | #print("Resestting...") 153 | print("Reset Called, PID: " + str(self.pid)) 154 | self.shm_array[0][2 + self.offset] = 0 155 | 156 | if not self.init: 157 | self.timestep += 1 158 | else: 159 | self.init = False 160 | 161 | self.shm_array[0][1 + self.offset] = self.timestep 162 | 163 | timer = time.time() 164 | while True: 165 | #print(str(self.shm_array[0]) + " " + str(self.shm_array[1])+ " " + str(self.timestep)) 166 | if self.shm_array[0][0 + self.offset] == self.timestep + 1: 167 | break 168 | 169 | else: 170 | if time.time() - timer > 10: 171 | print("Waiting 10+ seconds at reset! PID: " + str(self.pid)) 172 | print("Resestting to t0...") 173 | #self.timestep = 0 174 | #self.shm_array[0][1 + self.offset] = self.timestep 175 | timer = time.time() 176 | 177 | return self.shm_array[1:][:].astype(np.uint8) 178 | 179 | def step(self,action): 180 | 181 | #write timestep and action 182 | 183 | self.shm_array[0][2 + self.offset] = action 184 | 185 | self.timestep += 1 186 | self.shm_array[0][1 + self.offset] = self.timestep 187 | 188 | print("Step Called, PID: " + str(self.pid)) 189 | #wait for new state,reward,terminal 190 | #sync 191 | timer = time.time() 192 | while True: 193 | time.sleep(0.001) 194 | #print(str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 195 | if self.shm_array[0][0 + self.offset] == self.timestep + 1: 196 | break 197 | else: 198 | if time.time() - timer > 10: 199 | time.sleep(10) 200 | print("Dolphin Has likely crashed! 
No response in 10+ seconds, PID: " + str(self.pid)) 201 | print(str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 202 | print("") 203 | """os.system("taskkill /f /im Dolphin.exe") 204 | time.sleep(5) 205 | state = self.shm_array[1:][:].astype(np.uint8) 206 | 207 | self.restart() 208 | 209 | return state,0,True,{}""" 210 | 211 | return self.shm_array[1:][:].astype(np.uint8),self.shm_array[0][3 + self.offset],self.shm_array[0][4 + self.offset],{} 212 | 213 | def on_press(key): 214 | global action 215 | try: 216 | if key.char == 'q': #### 217 | action = 0 218 | elif key.char == 'w':#### 219 | action = 1 220 | elif key.char == 'e': 221 | action = 2 222 | elif key.char == 'r': 223 | action = 3 224 | elif key.char == 't': 225 | action = 4 226 | elif key.char == 'y':### 227 | action = 5 228 | elif key.char == 'u': 229 | action = 6 230 | elif key.char == 'i': 231 | action = 7 232 | elif key.char == 'o': 233 | action = 8 234 | print(action) 235 | except:pass 236 | 237 | def on_release(key): 238 | global action 239 | action = 9 240 | 241 | 242 | if __name__ == "__main__": 243 | with open('pid_num.txt', 'w') as f: 244 | f.write(str(-1)) 245 | from pynput import keyboard 246 | start = time.time() 247 | steps = 1 248 | 249 | envs = [] 250 | 251 | for i in range(1): 252 | env = DolphinEnv() 253 | #env = gym.wrappers.ResizeObservation(env,(54,100)) 254 | #env = gym.wrappers.FrameStack(env, 3) 255 | envs.append(env) 256 | #env = wrap_env(env,4) 257 | #envs.append(env) 258 | 259 | 260 | print(env.observation_space) 261 | print(env.action_space) 262 | action = 1 263 | steps = 1 264 | 265 | 266 | """while True: 267 | 268 | for env in envs: 269 | state = env.reset() 270 | 271 | terminal = False 272 | 273 | 274 | while True: 275 | for env in envs: 276 | steps += 1 277 | state,reward,terminal,_ = env.step(action) 278 | if terminal: 279 | env.reset() 280 | print("Fps: " + str(steps / (time.time() - start)))""" 281 | 282 | listener = keyboard.Listener( 283 | on_press=on_press, 284 | on_release=on_release) 285 | listener.start() 286 | 287 | tot_reward = 0 288 | avg_reward = 0 289 | max_avg_reward = 0 290 | steps = 1 291 | while True: 292 | 293 | state = env.reset() 294 | print("Reset Environment") 295 | 296 | print("Fps: " + str(steps / (time.time() - start))) 297 | 298 | print("Total Reward: " + str(tot_reward)) 299 | print("AVG Reward: " + str(tot_reward / steps)) 300 | print("MAX AVG Reward: " + str(max_avg_reward)) 301 | 302 | terminal = False 303 | trun = False 304 | tot_reward = 0 305 | 306 | avg_reward = 0 307 | max_avg_reward = 0 308 | 309 | while not terminal and not trun: 310 | steps += 1 311 | 312 | state,reward,terminal,_ = env.step(action) 313 | tot_reward += reward 314 | if reward != 0: 315 | print(reward) 316 | 317 | if tot_reward / steps > max_avg_reward: 318 | max_avg_reward = tot_reward / steps 319 | 320 | 321 | 322 | 323 | 324 | -------------------------------------------------------------------------------- /EffRainbow/networks.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file defines all the neural network architectures available to use. 
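Included are the Nature CNN, a dueling variant, and the small and large IMPALA CNNs,
along with the Dueling head and FactorizedNoisyLinear building blocks they can use.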
3 | """ 4 | from functools import partial 5 | from math import sqrt 6 | 7 | import torch 8 | from torch import nn as nn, Tensor 9 | from torch.nn import init 10 | import torch.nn.functional as F 11 | import numpy as np 12 | from lib import dqn_model 13 | import time 14 | 15 | #import kornia 16 | from torchvision.utils import save_image 17 | 18 | class FactorizedNoisyLinear(nn.Module): 19 | """ The factorized Gaussian noise layer for noisy-nets dqn. """ 20 | def __init__(self, in_features: int, out_features: int, sigma_0: float) -> None: 21 | super().__init__() 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.sigma_0 = sigma_0 25 | 26 | # weight: w = \mu^w + \sigma^w . \epsilon^w 27 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 28 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 29 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 30 | 31 | # bias: b = \mu^b + \sigma^b . \epsilon^b 32 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 33 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 34 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 35 | 36 | self.reset_parameters() 37 | self.reset_noise() 38 | 39 | @torch.no_grad() 40 | def reset_parameters(self) -> None: 41 | # initialization is similar to Kaiming uniform (He. initialization) with fan_mode=fan_in 42 | scale = 1 / sqrt(self.in_features) 43 | 44 | init.uniform_(self.weight_mu, -scale, scale) 45 | init.uniform_(self.bias_mu, -scale, scale) 46 | 47 | init.constant_(self.weight_sigma, self.sigma_0 * scale) 48 | init.constant_(self.bias_sigma, self.sigma_0 * scale) 49 | 50 | @torch.no_grad() 51 | def _get_noise(self, size: int) -> Tensor: 52 | noise = torch.randn(size, device=self.weight_mu.device) 53 | # f(x) = sgn(x)sqrt(|x|) 54 | return noise.sign().mul_(noise.abs().sqrt_()) 55 | 56 | @torch.no_grad() 57 | def reset_noise(self) -> None: 58 | # like in eq 10 and 11 of the paper 59 | epsilon_in = self._get_noise(self.in_features) 60 | epsilon_out = self._get_noise(self.out_features) 61 | self.weight_epsilon.copy_(epsilon_out.outer(epsilon_in)) 62 | self.bias_epsilon.copy_(epsilon_out) 63 | 64 | @torch.no_grad() 65 | def disable_noise(self) -> None: 66 | self.weight_epsilon[:] = 0 67 | self.bias_epsilon[:] = 0 68 | 69 | def forward(self, input: Tensor) -> Tensor: 70 | # y = wx + d, where 71 | # w = \mu^w + \sigma^w * \epsilon^w 72 | # b = \mu^b + \sigma^b * \epsilon^b 73 | return F.linear(input, 74 | self.weight_mu + self.weight_sigma*self.weight_epsilon, 75 | self.bias_mu + self.bias_sigma*self.bias_epsilon) 76 | 77 | class Dueling(nn.Module): 78 | """ The dueling branch used in all nets that use dueling-dqn. """ 79 | def __init__(self, value_branch, advantage_branch): 80 | super().__init__() 81 | self.flatten = nn.Flatten() 82 | self.value_branch = value_branch 83 | self.advantage_branch = advantage_branch 84 | 85 | #@torch.autocast('cuda') 86 | def forward(self, x, advantages_only=False): 87 | x = self.flatten(x) 88 | advantages = self.advantage_branch(x) 89 | if advantages_only: 90 | return advantages 91 | 92 | value = self.value_branch(x) 93 | return value + (advantages - torch.mean(advantages, dim=1, keepdim=True)) 94 | 95 | 96 | class DuelingAlt(nn.Module): 97 | """ The dueling branch used in all nets that use dueling-dqn. 
""" 98 | def __init__(self, l1, l2): 99 | super().__init__() 100 | self.main = nn.Sequential( 101 | nn.Flatten(), 102 | l1, 103 | nn.ReLU(), 104 | l2 105 | ) 106 | 107 | def forward(self, x, advantages_only=False): 108 | res = self.main(x) 109 | advantages = res[:, 1:] 110 | value = res[:, 0:1] 111 | return value + (advantages - torch.mean(advantages, dim=1, keepdim=True)) 112 | 113 | class NatureCNN(nn.Module): 114 | """ 115 | This is the CNN that was introduced in Mnih et al. (2013) and then used in a lot of later work such as 116 | Mnih et al. (2015) and the Rainbow paper. This implementation only works with a frame resolution of 84x84. 117 | """ 118 | def __init__(self, depth, actions, linear_layer): 119 | super().__init__() 120 | 121 | self.main = nn.Sequential( 122 | nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4), 123 | nn.ReLU(), 124 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 125 | nn.ReLU(), 126 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 127 | nn.ReLU(), 128 | nn.Flatten(), 129 | linear_layer(3136, 512), 130 | nn.ReLU(), 131 | linear_layer(512, actions), 132 | ) 133 | 134 | def forward(self, x, advantages_only=None): 135 | return self.main(x) 136 | 137 | 138 | class DuelingNatureCNN(nn.Module): 139 | """ 140 | Implementation of the dueling architecture introduced in Wang et al. (2015). 141 | This implementation only works with a frame resolution of 84x84. 142 | """ 143 | def __init__(self, depth, actions, linear_layer): 144 | super().__init__() 145 | 146 | self.main = nn.Sequential( 147 | nn.Conv2d(in_channels=depth, out_channels=32, kernel_size=8, stride=4), 148 | nn.ReLU(), 149 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 150 | nn.ReLU(), 151 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 152 | nn.ReLU(), 153 | ) 154 | 155 | self.dueling = Dueling( 156 | nn.Sequential(linear_layer(3136, 512), 157 | nn.ReLU(), 158 | linear_layer(512, 1)), 159 | nn.Sequential(linear_layer(3136, 512), 160 | nn.ReLU(), 161 | linear_layer(512, actions)) 162 | ) 163 | 164 | def forward(self, x, advantages_only=False): 165 | f = self.main(x) 166 | return self.dueling(f, advantages_only=advantages_only) 167 | 168 | 169 | class ImpalaCNNSmall(nn.Module): 170 | """ 171 | Implementation of the small variant of the IMPALA CNN introduced in Espeholt et al. (2018). 172 | """ 173 | def __init__(self, depth, actions): 174 | super().__init__() 175 | 176 | self.main = nn.Sequential( 177 | nn.Conv2d(in_channels=depth, out_channels=16, kernel_size=8, stride=4), 178 | nn.ReLU(), 179 | nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2), 180 | nn.ReLU(), 181 | ) 182 | 183 | self.pool = torch.nn.AdaptiveMaxPool2d((6, 6)) 184 | 185 | self.dueling = Dueling( 186 | nn.Sequential(dqn_model.NoisyLinear(1152, 256), 187 | nn.ReLU(), 188 | dqn_model.NoisyLinear(256, 1)), 189 | nn.Sequential(dqn_model.NoisyLinear(1152, 256), 190 | nn.ReLU(), 191 | dqn_model.NoisyLinear(256, actions)) 192 | ) 193 | 194 | def _get_conv_out(self, shape): 195 | o = self.main(torch.zeros(1, *shape)) 196 | return int(np.prod(o.size())) 197 | 198 | def forward(self, x, advantages_only=False): 199 | x = x.float() / 256 200 | f = self.main(x) 201 | f = self.pool(f) 202 | return self.dueling(f, advantages_only=advantages_only) 203 | 204 | 205 | class ImpalaCNNResidual(nn.Module): 206 | """ 207 | Simple residual block used in the large IMPALA CNN. 
208 | """ 209 | def __init__(self, depth, norm_func): 210 | super().__init__() 211 | 212 | self.relu = nn.ReLU() 213 | self.conv_0 = norm_func(nn.Conv2d(in_channels=depth, out_channels=depth, kernel_size=3, stride=1, padding=1)) 214 | self.conv_1 = norm_func(nn.Conv2d(in_channels=depth, out_channels=depth, kernel_size=3, stride=1, padding=1)) 215 | 216 | #@torch.autocast('cuda') 217 | def forward(self, x): 218 | x_ = self.conv_0(self.relu(x)) 219 | x_ = self.conv_1(self.relu(x_)) 220 | return x+x_ 221 | 222 | class ImpalaCNNBlock(nn.Module): 223 | """ 224 | Three of these blocks are used in the large IMPALA CNN. 225 | """ 226 | def __init__(self, depth_in, depth_out, norm_func): 227 | super().__init__() 228 | 229 | self.conv = nn.Conv2d(in_channels=depth_in, out_channels=depth_out, kernel_size=3, stride=1, padding=1) 230 | self.max_pool = nn.MaxPool2d(3, 2, padding=1) 231 | self.residual_0 = ImpalaCNNResidual(depth_out, norm_func=norm_func) 232 | self.residual_1 = ImpalaCNNResidual(depth_out, norm_func=norm_func) 233 | 234 | #@torch.autocast('cuda') 235 | def forward(self, x): 236 | x = self.conv(x) 237 | x = self.max_pool(x) 238 | x = self.residual_0(x) 239 | x = self.residual_1(x) 240 | return x 241 | 242 | 243 | class ImpalaCNNLarge(nn.Module): 244 | """ 245 | Implementation of the large variant of the IMPALA CNN introduced in Espeholt et al. (2018). 246 | """ 247 | def __init__(self, in_depth, actions, model_size=4, spectral_norm='all'): 248 | super().__init__() 249 | 250 | self.start = time.time() 251 | self.model_size = model_size 252 | self.actions = actions 253 | 254 | def identity(p): return p 255 | 256 | norm_func = torch.nn.utils.spectral_norm if (spectral_norm == 'all') else identity 257 | norm_func_last = torch.nn.utils.spectral_norm if (spectral_norm == 'last' or spectral_norm == 'all') else identity 258 | 259 | self.main = nn.Sequential( 260 | ImpalaCNNBlock(in_depth, 16*model_size, norm_func=norm_func), 261 | ImpalaCNNBlock(16*model_size, 32*model_size, norm_func=norm_func), 262 | ImpalaCNNBlock(32*model_size, 32*model_size, norm_func=norm_func_last), 263 | nn.ReLU() 264 | ) 265 | 266 | self.pool = torch.nn.AdaptiveMaxPool2d((8, 8)) 267 | 268 | self.dueling = Dueling( 269 | nn.Sequential(nn.Linear(2048*model_size, 256), 270 | nn.ReLU(), 271 | nn.Linear(256, 1)), 272 | nn.Sequential(nn.Linear(2048*model_size, 256), 273 | nn.ReLU(), 274 | nn.Linear(256, actions)) 275 | ) 276 | 277 | def reset_mlps(self): 278 | self.dueling = Dueling( 279 | nn.Sequential(nn.Linear(2048*self.model_size, 256), 280 | nn.ReLU(), 281 | nn.Linear(256, 1)), 282 | nn.Sequential(nn.Linear(2048*self.model_size, 256), 283 | nn.ReLU(), 284 | nn.Linear(256, self.actions)) 285 | ) 286 | 287 | def _get_conv_out(self, shape): 288 | o = self.main(torch.zeros(1, *shape)) 289 | return int(np.prod(o.size())) 290 | 291 | def forward(self, x, advantages_only=False): 292 | x = x.float() / 256 293 | """if test: 294 | save_image(x[0], 'img1.png') 295 | save_image(x[1], 'img2.png') 296 | save_image(x[2], 'img3.png') 297 | 298 | raise Exception("stop")""" 299 | 300 | f = self.main(x) 301 | f = self.pool(f) 302 | return self.dueling(f, advantages_only=advantages_only) 303 | 304 | def save_checkpoint(self): 305 | #print('... saving checkpoint ...') 306 | torch.save(self.state_dict(), "current_model" + str(int(time.time() - self.start))) 307 | 308 | def load_checkpoint(self): 309 | #print('... 
loading checkpoint ...') 310 | self.load_state_dict(torch.load("current_model260538")) 311 | 312 | 313 | def get_model(model_str, spectral_norm): 314 | if model_str == 'nature': return NatureCNN 315 | elif model_str == 'dueling': return DuelingNatureCNN 316 | elif model_str == 'impala_small': return ImpalaCNNSmall 317 | elif model_str.startswith('impala_large:'): 318 | return partial(ImpalaCNNLarge, model_size=int(model_str[13:]), spectral_norm=spectral_norm) 319 | -------------------------------------------------------------------------------- /EffRainbow/ptan_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import operator 4 | from datetime import timedelta 5 | import numpy as np 6 | import collections 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | class SMAQueue: 13 | """ 14 | Queue of fixed size with mean, max, min operations 15 | """ 16 | def __init__(self, size): 17 | self.queue = collections.deque() 18 | self.size = size 19 | 20 | def __iadd__(self, other): 21 | if isinstance(other, (list, tuple)): 22 | self.queue.extend(other) 23 | else: 24 | self.queue.append(other) 25 | while len(self.queue) > self.size: 26 | self.queue.popleft() 27 | return self 28 | 29 | def __len__(self): 30 | return len(self.queue) 31 | 32 | def __repr__(self): 33 | return "SMAQueue(size=%d)" % self.size 34 | 35 | def __str__(self): 36 | return "SMAQueue(size=%d, len=%d)" % (self.size, len(self.queue)) 37 | 38 | def min(self): 39 | if not self.queue: 40 | return None 41 | return np.min(self.queue) 42 | 43 | def mean(self): 44 | if not self.queue: 45 | return None 46 | return np.mean(self.queue) 47 | 48 | def max(self): 49 | if not self.queue: 50 | return None 51 | return np.max(self.queue) 52 | 53 | 54 | class SpeedMonitor: 55 | def __init__(self, batch_size, autostart=True): 56 | self.batch_size = batch_size 57 | self.start_ts = None 58 | self.batches = None 59 | if autostart: 60 | self.reset() 61 | 62 | def epoch(self): 63 | if self.epoches is not None: 64 | self.epoches += 1 65 | 66 | def batch(self): 67 | if self.batches is not None: 68 | self.batches += 1 69 | 70 | def reset(self): 71 | self.start_ts = time.time() 72 | self.batches = 0 73 | self.epoches = 0 74 | 75 | def seconds(self): 76 | """ 77 | Seconds since last reset 78 | :return: 79 | """ 80 | return time.time() - self.start_ts 81 | 82 | def samples_per_sec(self): 83 | """ 84 | Calculate samples per second since last reset() call 85 | :return: float count samples per second or None if not started 86 | """ 87 | if self.start_ts is None: 88 | return None 89 | secs = self.seconds() 90 | if abs(secs) < 1e-5: 91 | return 0.0 92 | return (self.batches + 1) * self.batch_size / secs 93 | 94 | def epoch_time(self): 95 | """ 96 | Calculate average epoch time 97 | :return: timedelta object 98 | """ 99 | if self.start_ts is None: 100 | return None 101 | s = self.seconds() 102 | if self.epoches > 0: 103 | s /= self.epoches + 1 104 | return timedelta(seconds=s) 105 | 106 | def batch_time(self): 107 | """ 108 | Calculate average batch time 109 | :return: timedelta object 110 | """ 111 | if self.start_ts is None: 112 | return None 113 | s = self.seconds() 114 | if self.batches > 0: 115 | s /= self.batches + 1 116 | return timedelta(seconds=s) 117 | 118 | 119 | class WeightedMSELoss(nn.Module): 120 | def __init__(self, size_average=True): 121 | super(WeightedMSELoss, self).__init__() 122 | self.size_average = size_average 123 | 124 | def forward(self, input, target, 
weights=None): 125 | if weights is None: 126 | return nn.MSELoss(self.size_average)(input, target) 127 | 128 | loss_rows = (input - target) ** 2 129 | if len(loss_rows.size()) != 1: 130 | loss_rows = torch.sum(loss_rows, dim=1) 131 | res = (weights * loss_rows).sum() 132 | if self.size_average: 133 | res /= len(weights) 134 | return res 135 | 136 | 137 | class SegmentTree(object): 138 | def __init__(self, capacity, operation, neutral_element): 139 | """Build a Segment Tree data structure. 140 | 141 | https://en.wikipedia.org/wiki/Segment_tree 142 | 143 | Can be used as regular array, but with two 144 | important differences: 145 | 146 | a) setting item's value is slightly slower. 147 | It is O(lg capacity) instead of O(1). 148 | b) user has access to an efficient `reduce` 149 | operation which reduces `operation` over 150 | a contiguous subsequence of items in the 151 | array. 152 | 153 | Paramters 154 | --------- 155 | capacity: int 156 | Total size of the array - must be a power of two. 157 | operation: lambda obj, obj -> obj 158 | and operation for combining elements (eg. sum, max) 159 | must for a mathematical group together with the set of 160 | possible values for array elements. 161 | neutral_element: obj 162 | neutral element for the operation above. eg. float('-inf') 163 | for max and 0 for sum. 164 | """ 165 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 166 | self._capacity = capacity 167 | self._value = [neutral_element for _ in range(2 * capacity)] 168 | self._operation = operation 169 | 170 | def _reduce_helper(self, start, end, node, node_start, node_end): 171 | if start == node_start and end == node_end: 172 | return self._value[node] 173 | mid = (node_start + node_end) // 2 174 | if end <= mid: 175 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 176 | else: 177 | if mid + 1 <= start: 178 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 179 | else: 180 | return self._operation( 181 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 182 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 183 | ) 184 | 185 | def reduce(self, start=0, end=None): 186 | """Returns result of applying `self.operation` 187 | to a contiguous subsequence of the array. 188 | 189 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 190 | 191 | Parameters 192 | ---------- 193 | start: int 194 | beginning of the subsequence 195 | end: int 196 | end of the subsequences 197 | 198 | Returns 199 | ------- 200 | reduced: obj 201 | result of reducing self.operation over the specified range of array elements. 
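        Note: `end` is treated as exclusive, so reduce(start, end) combines
        arr[start] ... arr[end - 1]; with end=None the reduction covers the
        whole backing array.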
202 | """ 203 | if end is None: 204 | end = self._capacity 205 | if end < 0: 206 | end += self._capacity 207 | end -= 1 208 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 209 | 210 | def __setitem__(self, idx, val): 211 | # index of the leaf 212 | idx += self._capacity 213 | self._value[idx] = val 214 | idx //= 2 215 | while idx >= 1: 216 | self._value[idx] = self._operation( 217 | self._value[2 * idx], 218 | self._value[2 * idx + 1] 219 | ) 220 | idx //= 2 221 | 222 | def __getitem__(self, idx): 223 | assert 0 <= idx < self._capacity 224 | return self._value[self._capacity + idx] 225 | 226 | 227 | class SumSegmentTree(SegmentTree): 228 | def __init__(self, capacity): 229 | super(SumSegmentTree, self).__init__( 230 | capacity=capacity, 231 | operation=operator.add, 232 | neutral_element=0.0 233 | ) 234 | 235 | def sum(self, start=0, end=None): 236 | """Returns arr[start] + ... + arr[end]""" 237 | return super(SumSegmentTree, self).reduce(start, end) 238 | 239 | def find_prefixsum_idx(self, prefixsum): 240 | """Find the highest index `i` in the array such that 241 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 242 | 243 | if array values are probabilities, this function 244 | allows to sample indexes according to the discrete 245 | probability efficiently. 246 | 247 | Parameters 248 | ---------- 249 | perfixsum: float 250 | upperbound on the sum of array prefix 251 | 252 | Returns 253 | ------- 254 | idx: int 255 | highest index satisfying the prefixsum constraint 256 | """ 257 | assert 0 <= prefixsum <= self.sum() + 1e-5 258 | idx = 1 259 | while idx < self._capacity: # while non-leaf 260 | if self._value[2 * idx] > prefixsum: 261 | idx = 2 * idx 262 | else: 263 | prefixsum -= self._value[2 * idx] 264 | idx = 2 * idx + 1 265 | return idx - self._capacity 266 | 267 | 268 | class MinSegmentTree(SegmentTree): 269 | def __init__(self, capacity): 270 | super(MinSegmentTree, self).__init__( 271 | capacity=capacity, 272 | operation=min, 273 | neutral_element=float('inf') 274 | ) 275 | 276 | def min(self, start=0, end=None): 277 | """Returns min(arr[start], ..., arr[end])""" 278 | 279 | return super(MinSegmentTree, self).reduce(start, end) 280 | 281 | 282 | class TBMeanTracker: 283 | """ 284 | TensorBoard value tracker: allows to batch fixed amount of historical values and write their mean into TB 285 | 286 | Designed and tested with pytorch-tensorboard in mind 287 | """ 288 | def __init__(self, writer, batch_size): 289 | """ 290 | :param writer: writer with close() and add_scalar() methods 291 | :param batch_size: integer size of batch to track 292 | """ 293 | assert isinstance(batch_size, int) 294 | assert writer is not None 295 | self.writer = writer 296 | self.batch_size = batch_size 297 | 298 | def __enter__(self): 299 | self._batches = collections.defaultdict(list) 300 | return self 301 | 302 | def __exit__(self, exc_type, exc_val, exc_tb): 303 | self.writer.close() 304 | 305 | @staticmethod 306 | def _as_float(value): 307 | assert isinstance(value, (float, int, np.ndarray, np.generic, torch.autograd.Variable)) or torch.is_tensor(value) 308 | tensor_val = None 309 | if isinstance(value, torch.autograd.Variable): 310 | tensor_val = value.data 311 | elif torch.is_tensor(value): 312 | tensor_val = value 313 | 314 | if tensor_val is not None: 315 | return tensor_val.float().mean().item() 316 | elif isinstance(value, np.ndarray): 317 | return float(np.mean(value)) 318 | else: 319 | return float(value) 320 | 321 | def track(self, param_name, value, iter_index): 322 | 
assert isinstance(param_name, str) 323 | assert isinstance(iter_index, int) 324 | 325 | data = self._batches[param_name] 326 | data.append(self._as_float(value)) 327 | 328 | if len(data) >= self.batch_size: 329 | self.writer.add_scalar(param_name, np.mean(data), iter_index) 330 | data.clear() 331 | 332 | 333 | class RewardTracker: 334 | def __init__(self, writer, min_ts_diff=1.0): 335 | """ 336 | Constructs RewardTracker 337 | :param writer: writer to use for writing stats 338 | :param min_ts_diff: minimal time difference to track speed 339 | """ 340 | self.writer = writer 341 | self.min_ts_diff = min_ts_diff 342 | 343 | def __enter__(self): 344 | self.ts = time.time() 345 | self.ts_frame = 0 346 | self.total_rewards = [] 347 | return self 348 | 349 | def __exit__(self, *args): 350 | self.writer.close() 351 | 352 | def reward(self, reward, frame, epsilon=None): 353 | self.total_rewards.append(reward) 354 | mean_reward = np.mean(self.total_rewards[-100:]) 355 | ts_diff = time.time() - self.ts 356 | if ts_diff > self.min_ts_diff: 357 | speed = (frame - self.ts_frame) / ts_diff 358 | self.ts_frame = frame 359 | self.ts = time.time() 360 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 361 | print("%d: done %d episodes, mean reward %.3f, speed %.2f f/s%s" % ( 362 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 363 | )) 364 | sys.stdout.flush() 365 | self.writer.add_scalar("speed", speed, frame) 366 | if epsilon is not None: 367 | self.writer.add_scalar("epsilon", epsilon, frame) 368 | self.writer.add_scalar("reward_100", mean_reward, frame) 369 | self.writer.add_scalar("reward", reward, frame) 370 | return mean_reward if len(self.total_rewards) > 30 else None 371 | -------------------------------------------------------------------------------- /FelkFork/DolphinSideScriptTanks.py: -------------------------------------------------------------------------------- 1 | 2 | #Window is 500x270 when captured 3 | 4 | try: 5 | import sys 6 | #sys.path.append("C:\\Users\\TYLER\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages") 7 | sys.path.append("/home/tyler/anaconda3/envs/effzero/lib/python3.8/site-packages") 8 | except Exception as e: 9 | with open('logg.txt', 'a') as f: 10 | f.write(str(e)) 11 | raise Exception("stop") 12 | 13 | import os 14 | cwd = str(os.getcwd()) 15 | cwd = cwd.split("dolphin",1)[1][0] 16 | os.chdir('/home/tyler/Documents/EfficientZero') 17 | 18 | with open('logg.txt', 'a') as f: 19 | f.write('Path Changed... 
Again') 20 | 21 | with open('logg.txt', 'a') as f: 22 | f.write("PID:" + str(cwd)) 23 | 24 | pid = int(cwd) 25 | 26 | from multiprocessing import shared_memory,Lock 27 | import numpy as np 28 | 29 | with open('logg.txt', 'a') as f: 30 | f.write('half libraries installed\n') 31 | 32 | from PIL import Image 33 | import math 34 | import time 35 | import random 36 | 37 | with open('logg.txt', 'a') as f: 38 | f.write('Imported Some\n') 39 | 40 | with open('logg.txt', 'a') as f: 41 | f.write('Got Pid ' + str(pid) + '\n') 42 | #Ymem = 270 43 | #Xmem = 500 44 | #Ymem = 108 45 | #Xmem = 200 46 | 47 | Ymem = 78 48 | Xmem = 94 49 | 50 | ##78x94 51 | 52 | #div by 6.8 53 | 54 | try: 55 | data = np.zeros((Ymem + 1,Xmem),dtype=np.float32) 56 | shm = shared_memory.SharedMemory(name='p' + str(pid)) 57 | with open('logg.txt', 'a') as f: 58 | f.write('Joined Shared Memory') 59 | 60 | except Exception as e: 61 | with open('logg.txt', 'a') as f: 62 | f.write(str(e)) 63 | 64 | with open('logg.txt', 'a') as f: 65 | f.write(' Failed to create Shared Memory') 66 | 67 | raise Exception("Stop - failed to create shared mem") 68 | 69 | #import mss 70 | #import dxcam 71 | 72 | from dolphin import event, gui,savestate,memory,controller 73 | with open('logg.txt', 'a') as f: 74 | f.write('\nImported FelkLibs') 75 | 76 | class DolphinSideEnv(): 77 | def __init__(self,pid=0,offset = 0): 78 | 79 | """ 80 | shared mem is in following format: 81 | 82 | This needs to be changed to this format: 83 | 84 | arr = np.zeros((101,60),dtype=np.float32) 85 | 86 | arr[0][0] = Dtimestep 87 | arr[0][1] = Etimestep 88 | arr[0][2] = action 89 | arr[0][3] = reward 90 | arr[0][4] = terminal 91 | 92 | arr[1:] = state 93 | 94 | """ 95 | 96 | #about 78fs with mss 97 | 98 | #about 60 with dxcam (the weird method) 99 | 100 | ########### Game Code 101 | 102 | self.last_action = 0 103 | 104 | ##################### End Game Code 105 | 106 | self.offset = offset 107 | 108 | self.window_header = 40 109 | self.window_width = 100 110 | self.window_height = 60 111 | 112 | pidx = pid % 5 113 | pidy = math.floor(pid / 5) 114 | 115 | #self.monitor = {"top": 32 + (270 + 32) * pidy, "left": pidx*500, "width": 500, "height": 270} 116 | #self.monitor = {"top": 32 + (Ymem + 32) * pidy, "left": pidx*(Xmem + 1), "width": Xmem, "height": Ymem} 117 | 118 | self.frameskip = 4 119 | 120 | self.timestep = 0. 
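        # (Added note) Shared-memory layout used below: row 0 is a control row --
        # [0][0] Dtimestep (written by this script), [0][1] Etimestep (written by
        # the other process), [0][2] action, [0][3] reward, [0][4] terminal, each
        # shifted by self.offset -- and rows [1:] hold the 78x94 greyscale frame.
        # This script bumps [0][0] after publishing a frame and spin-waits until
        # the other side echoes the same value into [0][1] before reading the
        # action from [0][2].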
121 | 122 | self.current_step = 0 123 | 124 | with open('logg.txt', 'a') as f: 125 | f.write('About to make data array\n') 126 | 127 | self.data = np.zeros((Ymem + 1,Xmem),dtype=np.float32)#np.zeros(self.dims,dtype=np.float32) 128 | 129 | self.shm_array = np.ndarray(self.data.shape, dtype=self.data.dtype, buffer=shm.buf) 130 | 131 | with open('logg.txt', 'a') as f: 132 | f.write('shared mem\n') 133 | 134 | self.reset() 135 | 136 | with open('logg.txt', 'a') as f: 137 | f.write('Init Reset Successful\n') 138 | 139 | def reset(self): 140 | 141 | self.current_step = 0 142 | 143 | ########### Game Code 144 | self.movement_inc = 0.015 145 | x = np.random.random() 146 | global change 147 | change = False 148 | savestate.load_from_slot(1)#random.randint(1,8) 149 | 150 | """if x < 0.5: 151 | savestate.load_from_slot(1) 152 | elif x < 0.6: 153 | savestate.load_from_slot(5) 154 | elif x < 0.75: 155 | savestate.load_from_slot(4) 156 | elif x < 0.95: 157 | savestate.load_from_slot(2) 158 | else: 159 | savestate.load_from_slot(3)""" 160 | #else: 161 | #savestate.load_from_slot(4) 162 | 163 | change = True 164 | 165 | self.numEnemies = memory.read_u32(0x91CFA9E8) 166 | self.numLives = memory.read_u32(0x91D27ED0) 167 | self.x = 0 168 | self.y = 0 169 | 170 | ##################### End Game Code 171 | start = time.time() 172 | while True: 173 | time.sleep(0.5) 174 | 175 | if time.time() - start > 10: 176 | time.sleep(10) 177 | with open('logg.txt', 'a') as f: 178 | f.write("Waiting 10+ seconds! PID: " + str(pid)) 179 | f.write('\nWaiting for Reset... ' + str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 180 | f.write("\n\n") 181 | #self.timestep = 0 182 | #self.shm_array[0][0 + self.offset] = self.timestep 183 | start = time.time() 184 | 185 | if self.shm_array[0][1 + self.offset] == self.timestep: 186 | break 187 | 188 | #write state 189 | self.shm_array[0][4 + self.offset] = 0. 190 | self.shm_array[0][3 + self.offset] = 0. 191 | 192 | self.shm_array[1:] = np.zeros((Ymem,Xmem),dtype=np.float32)#self.get_state() 193 | 194 | self.timestep += 1 195 | self.shm_array[0][0 + self.offset] = self.timestep 196 | 197 | self.dic = {"Left":False,"Right":False,"Down":False,"Up":False, \ 198 | "Plus":False,"Minus":False,"One":False,"Two":False, \ 199 | "A":False,"B":False,"Home":False} 200 | 201 | def get_state(self): 202 | 203 | #event.on_framedrawn(show_screenshot) 204 | 205 | 206 | return img[:] 207 | 208 | def get_state_old(self): 209 | 210 | with mss.mss() as sct: 211 | 212 | # Part of the screen to capture 213 | #im = 0.07 * im[:,:,2] + 0.72 * im[:,:,1] + 0.21 * im[:,:,0] 214 | # Get raw pixels from the screen, save it to a Numpy array 215 | im = np.array(sct.grab(self.monitor)) 216 | 217 | #im = 0.0002745098 * im[:,:,2] + 0.00282352941 * im[:,:,1] + 0.00082352941 * im[:,:,0] 218 | im = 0.07 * im[:,:,2] + 0.72 * im[:,:,1] + 0.21 * im[:,:,0] 219 | #im = im.astype(np.float32) 220 | 221 | return im 222 | 223 | def get_state_dx(self): 224 | 225 | im = self.camera.get_latest_frame() 226 | im = np.squeeze(im) 227 | im = np.true_divide(im,255,dtype=np.float32) 228 | #im = cv2.resize(im, dsize=(54, 100), interpolation=cv2.INTER_CUBIC) 229 | #im = np.swapaxes(im,0,1) 230 | 231 | return im 232 | 233 | def get_reward_terminal(self): 234 | # Returns reward,terminal,trun 235 | controller.set_wii_buttons(0,self.dic) 236 | self.current_step += 1 237 | 238 | ########### Game Code 239 | terminal = False 240 | reward = 0. 
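        # (Added note) Reward and terminal come straight from emulated memory:
        # 0x91CFA9E8 holds the remaining enemy count and 0x91D27ED0 the remaining
        # lives. Losing a life ends the episode with reward -1; the enemy count
        # jumping back up (a new round loading) ends it with +1; otherwise the
        # reward is the number of enemies destroyed since the previous step.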
241 | 242 | numEnemies = memory.read_u32(0x91CFA9E8) 243 | numLives = memory.read_u32(0x91D27ED0) 244 | 245 | #check if we died 246 | if numLives < self.numLives: 247 | return -1., True, False 248 | 249 | 250 | #check if the round ended 251 | if numEnemies > self.numEnemies: 252 | return 1., True, False 253 | 254 | #get kills 255 | reward = self.numEnemies - numEnemies 256 | 257 | self.numEnemies = numEnemies 258 | self.numLives = numLives 259 | 260 | ##################### End Game Code 261 | 262 | #remove this 263 | if random.randint(1,60) == 25: 264 | terminal = True 265 | 266 | return reward,terminal,False 267 | 268 | def apply_action(self,action): 269 | 270 | """ 271 | self.dic = {"Left":False,"Right":False,"Down":False,"Up":False, \ 272 | "Plus":False,"Minus":False,"One":False,"Two":False, \ 273 | "A":False,"B":False,"Home":False} 274 | 275 | """ 276 | self.last_action = action 277 | self.dic = {"Left":False,"Right":False,"Down":False,"Up":False, \ 278 | "Plus":False,"Minus":False,"One":False,"Two":False, \ 279 | "A":False,"B":False,"Home":False} 280 | 281 | #REMOVE THIS LINE 282 | action = random.randint(0,8) 283 | 284 | if action == 0: 285 | self.dic["Left"] = True 286 | elif action == 1: 287 | self.dic["Right"] = True 288 | elif action == 2: 289 | self.dic["Up"] = True 290 | elif action == 3: 291 | self.dic["Down"] = True 292 | elif action == 4: 293 | self.x += self.movement_inc 294 | elif action == 5: 295 | self.x -= self.movement_inc 296 | elif action == 6: 297 | self.y += self.movement_inc 298 | elif action == 7: 299 | self.y -= self.movement_inc 300 | elif action == 8: 301 | self.dic["B"] = True 302 | 303 | self.x = max(-0.32,min(self.x,0.32)) 304 | self.y = max(-0.16, min(self.y, 0.08)) 305 | 306 | controller.set_wii_ircamera_transform(0,self.x,self.y,-2,0,0,0) 307 | controller.set_wii_buttons(0,self.dic) 308 | 309 | def step(self): 310 | 311 | #get action 312 | #sync 313 | while True: 314 | start = time.time() 315 | time.sleep(0.001) 316 | """with open('logg.txt', 'a') as f: 317 | f.write('\nWaiting for ETimestep... ' + str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep))""" 318 | 319 | if self.shm_array[0][1 + self.offset] == self.timestep: 320 | break 321 | 322 | if time.time() - start > 10: 323 | with open('logg.txt', 'a') as f: 324 | f.write('\nDolphin has been awaiting respose for 10+ seconds! Pid: ' + str(pid)) 325 | f.write('\nWaiting for Step... ' + str(self.shm_array[0]) + " " + str(self.shm_array[1]) + " " + str(self.timestep)) 326 | f.write("\n\n") 327 | 328 | try: 329 | self.apply_action(self.shm_array[0][2 + self.offset]) 330 | except: 331 | print("Error at apply action! 
PID: " + str(pid)) 332 | time.sleep(1) 333 | 334 | 335 | def step2(self,reward,terminal,trun,image): 336 | 337 | #with open('logg.txt', 'a') as f: 338 | #f.write('\nWriting timestep: ' + str(self.timestep)) 339 | 340 | #image = cv2.resize(image,(78,94), interpolation=cv2.INTER_AREA) 341 | 342 | #send back data 343 | 344 | image = image.resize((94,78)) 345 | image = image.convert("RGB") 346 | 347 | img1 = np.asarray(image) 348 | img1 = img1[...,::-1] 349 | image = np.dot(img1[...,:3], [0.2989, 0.5870, 0.1140]) 350 | 351 | #if random.random() > 0.99: 352 | #cv2.imwrite("filename.png", img) 353 | 354 | image = image.astype(np.float32) 355 | 356 | self.shm_array[0][4 + self.offset] = float(terminal) 357 | self.shm_array[0][3 + self.offset] = reward 358 | self.shm_array[1:] = image#self.get_state()np.zeros((Ymem,Xmem),dtype=np.float32)# 359 | 360 | self.timestep += 1 361 | self.shm_array[0][0 + self.offset] = self.timestep 362 | 363 | if terminal or trun: 364 | self.reset() 365 | 366 | """def show_screenshot(width: int, height: int, data: bytes): 367 | #print(f"received {width}x{height} image of length {len(data)}") 368 | # data is RGBA, so its size is width*height*4 369 | 370 | if change: 371 | global img 372 | img = deepcopy(Image.frombytes('RGBA', (width,height), data, 'raw')) 373 | 374 | 375 | return""" 376 | #img = np.zeros((528,640),dtype=np.float32) 377 | 378 | #img = np.zeros((94,78),dtype=np.uint8) 379 | change = True 380 | #event.on_framedrawn(show_screenshot) 381 | 382 | for i in range(4): 383 | await event.frameadvance() 384 | 385 | env = DolphinSideEnv(pid=pid) 386 | 387 | for i in range(env.frameskip): 388 | await event.frameadvance() 389 | 390 | reward = 0 391 | terminal = False 392 | trun = False 393 | red = 0xffff0000 394 | 395 | with open('logg.txt', 'a') as f: 396 | f.write('\nEntering Main While loop') 397 | 398 | while True: 399 | env.step() 400 | 401 | for i in range(env.frameskip): 402 | (width,height,data) = await event.framedrawn() 403 | 404 | rewardN,terminalN,trunN = env.get_reward_terminal() 405 | #env.apply_action(env.last_action) 406 | gui.draw_text((10, 90), red, str(env.last_action)) 407 | 408 | #with open('logg.txt', 'a') as f: 409 | #f.write('\nAfter reward terminal') 410 | 411 | terminal = terminal or terminalN 412 | trun = trun or trunN 413 | reward += rewardN 414 | if terminal or trun: 415 | for i in range(4): 416 | await event.frameadvance() 417 | break 418 | 419 | #with open('logg.txt', 'a') as f: 420 | #f.write(str(deepcopy(img))) 421 | img = Image.frombytes('RGBA', (width,height), data, 'raw') 422 | 423 | env.step2(reward,terminal,trun,img) 424 | 425 | gui.draw_text((10, 10), red, f"HI") 426 | 427 | reward = 0 428 | terminal = False 429 | trun = False 430 | 431 | 432 | -------------------------------------------------------------------------------- /MarioKartEnvBackup.py: -------------------------------------------------------------------------------- 1 | import win32gui 2 | import win32ui 3 | from ctypes import windll 4 | from PIL import Image 5 | import PIL 6 | from pywinauto import Desktop 7 | import cv2 8 | import numpy as np 9 | import ctypes, time 10 | from copy import copy,deepcopy 11 | import gym 12 | from Region import Region 13 | import pickle 14 | import math 15 | import keyboard 16 | # Bunch of stuff so that the script can send keystrokes to game # 17 | 18 | SendInput = ctypes.windll.user32.SendInput 19 | 20 | # C struct redefinitions 21 | PUL = ctypes.POINTER(ctypes.c_ulong) 22 | class KeyBdInput(ctypes.Structure): 23 | _fields_ = [("wVk", 
ctypes.c_ushort), 24 | ("wScan", ctypes.c_ushort), 25 | ("dwFlags", ctypes.c_ulong), 26 | ("time", ctypes.c_ulong), 27 | ("dwExtraInfo", PUL)] 28 | 29 | class HardwareInput(ctypes.Structure): 30 | _fields_ = [("uMsg", ctypes.c_ulong), 31 | ("wParamL", ctypes.c_short), 32 | ("wParamH", ctypes.c_ushort)] 33 | 34 | class MouseInput(ctypes.Structure): 35 | _fields_ = [("dx", ctypes.c_long), 36 | ("dy", ctypes.c_long), 37 | ("mouseData", ctypes.c_ulong), 38 | ("dwFlags", ctypes.c_ulong), 39 | ("time",ctypes.c_ulong), 40 | ("dwExtraInfo", PUL)] 41 | 42 | class Input_I(ctypes.Union): 43 | _fields_ = [("ki", KeyBdInput), 44 | ("mi", MouseInput), 45 | ("hi", HardwareInput)] 46 | 47 | class Input(ctypes.Structure): 48 | _fields_ = [("type", ctypes.c_ulong), 49 | ("ii", Input_I)] 50 | 51 | # Actuals Functions 52 | 53 | def PressKey(hexKeyCode): 54 | extra = ctypes.c_ulong(0) 55 | ii_ = Input_I() 56 | ii_.ki = KeyBdInput( 0, hexKeyCode, 0x0008, 0, ctypes.pointer(extra) ) 57 | x = Input( ctypes.c_ulong(1), ii_ ) 58 | ctypes.windll.user32.SendInput(1, ctypes.pointer(x), ctypes.sizeof(x)) 59 | 60 | def ReleaseKey(hexKeyCode): 61 | extra = ctypes.c_ulong(0) 62 | ii_ = Input_I() 63 | ii_.ki = KeyBdInput( 0, hexKeyCode, 0x0008 | 0x0002, 0, ctypes.pointer(extra) ) 64 | x = Input( ctypes.c_ulong(1), ii_ ) 65 | ctypes.windll.user32.SendInput(1, ctypes.pointer(x), ctypes.sizeof(x)) 66 | 67 | def KeyPress(key): 68 | PressKey(keys[key]) # press Q 69 | time.sleep(.05) 70 | ReleaseKey(keys[key]) #release Q 71 | 72 | def release_keys(): 73 | for key in keys: 74 | ReleaseKey(keys[key]) 75 | 76 | def push(key): 77 | PressKey(keys[key]) 78 | 79 | def release(key): 80 | ReleaseKey(keys[key]) 81 | 82 | keys = { 83 | "a": 0x1E, 84 | "b": 0x30, 85 | "w": 0x11, 86 | "n": 0x31, 87 | "m": 0x32, 88 | "`": 0x29, 89 | "\\": 0x2B, 90 | "p": 0x19, 91 | "e": 0x12, 92 | "z": 0x2C, 93 | "c": 0x2E, 94 | "d": 0x20 95 | } 96 | 97 | #32400 frames/hour! 
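# (Added note) PressKey/ReleaseKey above emit DirectInput scan codes rather than
# virtual-key codes: dwFlags 0x0008 is KEYEVENTF_SCANCODE, and 0x0008 | 0x0002 adds
# KEYEVENTF_KEYUP on release, which is typically what lets windows that ignore plain
# virtual-key events (such as the emulator) still see the presses.
# Illustrative tap, mirroring what KeyPress() does:
#     push("w"); time.sleep(0.05); release("w")   # hold accelerate for ~50 ms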
98 | #ray did 28700 99 | class MarioKartEnv(): 100 | def __init__(self,config=None): 101 | 102 | windows = Desktop(backend="uia").windows() 103 | for i in windows: 104 | if i.window_text()[:19] == "Dolphin 5.0-16101 |": 105 | window_name = i.window_text() 106 | 107 | self.hwnd = win32gui.FindWindow(None, window_name) 108 | left, top, right, bot = win32gui.GetWindowRect(self.hwnd) 109 | self.w = right - left 110 | self.h = bot - top 111 | 112 | self.template = cv2.imread('C:/Users/TYLER/Downloads/dolphin_ai_tests/env/funky_kong_img2.png') 113 | self.template = cv2.cvtColor(self.template, cv2.COLOR_RGB2GRAY) 114 | self.tem_w = 69 115 | self.tem_h = 132#100,141 116 | 117 | self.action_space = gym.spaces.Discrete(4) 118 | """ 119 | 120 | 1 - accel 121 | 2 - accel+wheely 122 | 3 - accel+drift_hold_right 123 | 4 - accel+drift_hold_left 124 | 125 | 126 | item has been removed 127 | 5 - accel + right 128 | 6 - accel + left 129 | 7 - accel + item 130 | 0 - null 131 | """ 132 | #yx 133 | #self.observation_space = gym.spaces.Box( 134 | #low=0, high=255, shape=(64, 32), dtype=np.uint8) 135 | 136 | save_name = "regions.dat" 137 | 138 | self.image_x = 950 139 | self.image_y = 1220 140 | self.grid_size = 10 141 | self.grid_x = int(self.image_x / self.grid_size) 142 | self.grid_y = int(self.image_y / self.grid_size) 143 | 144 | self.time_till_checkpoint = 4 145 | self.checkpoint_timer = time.time() 146 | 147 | self.method = eval('cv2.TM_CCOEFF') 148 | self.num_chkps = 22 149 | with open(save_name, "rb") as f: 150 | self.regions = pickle.load(f) 151 | 152 | self.reset() 153 | 154 | 155 | def reset(self): 156 | self.dist = 0 157 | self.first = True 158 | release_keys() 159 | self.held_keys = [] 160 | KeyPress("m") 161 | time.sleep(0.25) 162 | self.timer = time.time() 163 | self.prev_action = 0 164 | self.out_frames = 0 165 | self.current_chkp = -1 166 | 167 | return self.get_state()[0] 168 | 169 | def template_match(self,img): 170 | terminal = False 171 | #crop image so avoid issues -- #og image 2098, 3868 172 | img = img[680:1900, 2600: 3550] 173 | #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 174 | 175 | 176 | # Apply template Matching 177 | res = cv2.matchTemplate(img,self.template,self.method) 178 | min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res) 179 | # If the method is TM_SQDIFF or TM_SQDIFF_NORMED, take minimum 180 | 181 | if not self.first: 182 | 183 | self.prev_top_left = copy(self.top_left) 184 | 185 | self.top_left = max_loc 186 | 187 | if self.first: 188 | self.prev_top_left = copy(self.top_left) 189 | self.first = False 190 | #cv2.imwrite("bug_test" + str(time.time()) + ".jpg", img) 191 | return 0,False 192 | 193 | else: 194 | 195 | y_dif = list(self.prev_top_left)[1] - list(self.top_left)[1] 196 | x_dif = list(self.top_left)[0] - list(self.prev_top_left)[0] 197 | 198 | if True:#time.time() - self.timer > 5.4 199 | 200 | self.dist = x_dif**2 + y_dif**2 201 | 202 | #exception for broken template matching 203 | if self.dist > 950: 204 | 205 | #need to allow it to refind template next frame 206 | self.first = True 207 | reward = 0.9 208 | bottom_right2 = (self.prev_top_left[0] + self.tem_w, self.prev_top_left[1] + self.tem_h) 209 | cv2.rectangle(img,self.prev_top_left, bottom_right2, 255, 2) 210 | 211 | bottom_right = (self.top_left[0] + self.tem_w, self.top_left[1] + self.tem_h) 212 | cv2.rectangle(img,self.top_left, bottom_right, 255, 2) 213 | 214 | cv2.imwrite("wrong_pattern12" + str(round(time.time(),4)) + ".jpg", img) 215 | 216 | self.dist = 0 217 | else: 218 | 219 | #region code - 
#cropped image 1220, 880 (y,x) 220 | reward = self.get_reward(x_dif,y_dif) 221 | if self.out_frames > 3: 222 | terminal = True 223 | reward -= 150 224 | 225 | 226 | """bottom_right = (self.top_left[0] + self.tem_w, self.top_left[1] + self.tem_h) 227 | cv2.rectangle(img,self.top_left, bottom_right, 128, 2) 228 | cv2.imwrite("bug_test" + str(time.time()) + ".jpg", img) 229 | 230 | raise Exception("stop")""" 231 | 232 | else: 233 | return 0,terminal 234 | 235 | reward = reward / 30 236 | reward -= 0.03 237 | 238 | return reward,terminal 239 | 240 | def get_reward(self,x_dif,y_dif): 241 | reward = 0 242 | reset_frames = True 243 | 244 | #this are based off funky's face 245 | add_x = int(self.tem_w / 2) 246 | add_y = int(self.tem_h / 2) 247 | 248 | #center location 249 | x = self.top_left[0] + add_x 250 | y = self.top_left[1] + add_y 251 | 252 | x = math.floor(x / self.grid_size) 253 | y = math.floor(y / self.grid_size) 254 | 255 | #get which grid cell 256 | num = self.convert_xy_to_num(x,y) 257 | 258 | #check out of bounds 259 | if not self.regions[num].in_bounds: 260 | self.out_frames += 1 261 | reset_frames = False 262 | 263 | #check dir_x 264 | reward += x_dif * self.regions[num].dir_x 265 | 266 | #check dir_y 267 | reward -= y_dif * self.regions[num].dir_y 268 | 269 | #checkpoints 270 | if self.regions[num].is_chkp: 271 | if self.regions[num].chkp_num > self.current_chkp or (self.regions[num].chkp_num == 0 and self.current_chkp == self.num_chkps): 272 | reward += 65 273 | self.checkpoint_timer = time.time() 274 | #print("checkpoint: " + str(self.regions[num].chkp_num)) 275 | self.current_chkp = self.regions[num].chkp_num 276 | elif self.regions[num].chkp_num < self.current_chkp or \ 277 | (self.regions[num].chkp_num == self.num_chkps and (self.current_chkp == 0 or self.current_chkp == -1)): 278 | 279 | self.out_frames += 1 280 | reset_frames = False 281 | 282 | if reset_frames: 283 | self.out_frames = 0 284 | 285 | #timer for reaching checkpoints 286 | if time.time() - self.checkpoint_timer > self.time_till_checkpoint: 287 | self.out_frames = 10 288 | 289 | return reward 290 | 291 | def convert_xy_to_num(self,x,y): 292 | return x + y * self.grid_x 293 | 294 | def is_inside(self,point,reg_point,reg_end_point): 295 | #loop over xy 296 | for i in range(2): 297 | if not (point[i] >= reg_point[i] and point[i] <= reg_end_point[i]): 298 | return False 299 | return True 300 | 301 | 302 | def get_state(self): 303 | hwndDC = win32gui.GetWindowDC(self.hwnd) 304 | mfcDC = win32ui.CreateDCFromHandle(hwndDC) 305 | saveDC = mfcDC.CreateCompatibleDC() 306 | 307 | saveBitMap = win32ui.CreateBitmap() 308 | saveBitMap.CreateCompatibleBitmap(mfcDC, self.w, self.h) 309 | 310 | saveDC.SelectObject(saveBitMap) 311 | 312 | # Change the line below depending on whether you want the whole window 313 | # or just the client area. 
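        # (Added note) PrintWindow's last argument selects the capture: 1
        # (PW_CLIENTONLY) grabs just the client area, while 0, as used below,
        # grabs the whole window including its title bar and borders.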
314 | #result = windll.user32.PrintWindow(hwnd, saveDC.GetSafeHdc(), 1) 315 | result = windll.user32.PrintWindow(self.hwnd, saveDC.GetSafeHdc(), 0) 316 | 317 | bmpinfo = saveBitMap.GetInfo() 318 | bmpstr = saveBitMap.GetBitmapBits(True) 319 | 320 | im = Image.frombuffer( 321 | 'RGB', 322 | (bmpinfo['bmWidth'], bmpinfo['bmHeight']), 323 | bmpstr, 'raw', 'BGRX', 0, 1)# 324 | 325 | #og image 2098, 3868 326 | im = np.array(im) 327 | 328 | #gets the top_left var 329 | im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) 330 | reward,terminal = self.template_match(im) 331 | 332 | 333 | im = im[80:, 85: 3868 - 85] 334 | 335 | im = cv2.resize(im, (64,32), interpolation = cv2.INTER_AREA) 336 | #cv2.imwrite("bug_test_ai" + str(time.time()) + ".jpg", im) 337 | #raise Exception("stop") 338 | 339 | win32gui.DeleteObject(saveBitMap.GetHandle()) 340 | saveDC.DeleteDC() 341 | mfcDC.DeleteDC() 342 | win32gui.ReleaseDC(self.hwnd, hwndDC) 343 | 344 | return im,reward,terminal 345 | 346 | def step(self,action=0): 347 | #time.sleep(0.005) 348 | terminal = False 349 | 350 | self.apply_action(action) 351 | #press some key 352 | state,reward,terminal = self.get_state() 353 | 354 | if time.time() - self.timer > 80: 355 | terminal = True 356 | 357 | #print(reward) 358 | info = {} 359 | 360 | return state,reward,terminal,info 361 | 362 | 363 | 364 | def apply_action(self,action): 365 | """ 366 | 367 | 1 - accel 368 | 2 - accel+wheely 369 | 3 - accel+drift_hold_right 370 | 4 - accel+drift_hold_left 371 | 372 | 0 - null 373 | 5 - accel + right 374 | 6 - accel + left 375 | 7 - accel + item 376 | """ 377 | self.prev_held = copy(self.held_keys) 378 | 379 | #null action removed 380 | action += 1 381 | 382 | """if action == 0: 383 | self.held_keys = []""" 384 | if action == 1: 385 | self.held_keys = ["w"] 386 | elif action == 2: 387 | self.held_keys = ["w","e"] 388 | elif action == 3: 389 | self.held_keys = ["w","c","d"] 390 | elif action == 4: 391 | self.held_keys = ["w","c","a"] 392 | """elif action == 5: 393 | self.held_keys = ["w","d"] 394 | elif action == 6: 395 | self.held_keys = ["w","a"] 396 | elif action == 7: 397 | self.held_keys = ["w","z"]""" 398 | 399 | for i in self.held_keys: 400 | if i not in self.prev_held: 401 | push(i) 402 | 403 | for i in self.prev_held: 404 | if i not in self.held_keys: 405 | release(i) 406 | 407 | #print() 408 | 409 | if __name__ == "__main__": 410 | time.sleep(5) 411 | env = MarioKartEnv() 412 | state = env.reset() 413 | score = 0 414 | action = 0 415 | 416 | while True: 417 | 418 | if keyboard.is_pressed('u'): 419 | action = 0 420 | elif keyboard.is_pressed('h'): 421 | action = 3 422 | elif keyboard.is_pressed('k'): 423 | action = 2 424 | elif keyboard.is_pressed('i'): 425 | action = 1 426 | else: 427 | action = 0 428 | 429 | 430 | state,reward,terminal,info = env.step(action) 431 | score += reward 432 | print(reward) 433 | if terminal: 434 | print("Total Reward: " + str(score)) 435 | score = 0 436 | env.reset() 437 | --------------------------------------------------------------------------------
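For anyone inspecting regions.dat: the sketch below is an illustrative companion snippet, not a file from this repo. It mirrors how MarioKartEnvBackup.get_reward maps a template-match position to a Region cell, assuming regions.dat was drawn with the same 950x1220 crop and 10-pixel grid used in that file; region_for_pixel is a made-up helper name.

import math
import pickle

from Region import Region  # the pickled objects are instances of the repo's Region class

GRID_SIZE = 10                  # pixels per cell (MarioKartEnv.grid_size)
IMAGE_X, IMAGE_Y = 950, 1220    # cropped minimap size (MarioKartEnv.image_x / image_y)
GRID_X = IMAGE_X // GRID_SIZE   # cells per row

def region_for_pixel(regions, x, y):
    # Same mapping as MarioKartEnv.get_reward + convert_xy_to_num: floor the pixel
    # position into grid coordinates, then flatten row-major into the regions list.
    cell_x = math.floor(x / GRID_SIZE)
    cell_y = math.floor(y / GRID_SIZE)
    return regions[cell_x + cell_y * GRID_X]

if __name__ == "__main__":
    with open("regions.dat", "rb") as f:
        regions = pickle.load(f)
    cell = region_for_pixel(regions, x=400, y=600)
    print(cell.in_bounds, cell.dir_x, cell.dir_y, cell.is_chkp)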