├── .gitignore ├── README.md ├── build_graph ├── my_graph.npy ├── new_relations.npy └── process.py ├── config.json ├── download_csv.py ├── dqn ├── agent.py ├── env │ ├── ai2thor_env.py │ └── training-history │ │ ├── 2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine │ │ ├── checkpoint │ │ ├── events.out.tfevents.1551689861.asr02.local │ │ ├── my-model.data-00000-of-00001 │ │ ├── my-model.index │ │ └── my-model.meta │ │ └── 2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine │ │ ├── checkpoint │ │ ├── events.out.tfevents.1551689885.asr02.local │ │ ├── my-model.data-00000-of-00001 │ │ ├── my-model.index │ │ └── my-model.meta ├── main.py ├── model.py ├── replay_buffer.py └── utils.py ├── draft.py ├── dumping.py ├── embedding_fasttext300.pkl ├── embedding_onehot.pkl ├── env └── ai2thor_env.py ├── images ├── 1_GCN.png ├── 1_easy.png ├── 1_easy_noGAE.png ├── 1_easy_noGAE_normalizeReward.png ├── 1_easy_noGAE_onehot.png ├── 1_embed.png ├── 1_increaseLearningRate.png ├── 1_increase_entropy_penalty.png ├── 1_noGAE.png ├── 28_easy_noGAE.png ├── All FloorPlan1_4.png ├── All FloorPlan1_6.png ├── All FloorPlan28_4.png ├── All FloorPlan28_6.png ├── All FloorPlan2_4.png ├── All FloorPlan2_6.png ├── Compare FloorPlan1.png ├── Compare FloorPlan2.png ├── Compare FloorPlan28.png ├── FloorPlan1_4.png ├── FloorPlan1_6.png ├── FloorPlan28_4.png ├── FloorPlan28_6.png ├── FloorPlan2_4.png └── sample_AI2THOR.png ├── keyboard_agent.py ├── pytorch_a3c ├── LICENSE ├── README.md ├── layers.py ├── main.py ├── model.py ├── optimizers.py ├── test.py ├── train.py ├── utils.py └── visualize.py ├── tf_a2c ├── layers.py ├── main.py ├── model.py ├── multi_task.py ├── rollout.py ├── rollout_thread.py ├── sharing_polices.py ├── single_task.py └── utils.py ├── tsne.png └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | dumped/ 2 | */training-history/* 3 | __pycache__ 4 | */__pycache__/ 5 | *.ipynb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL-target-driven-navigation-ai2thor -------------------------------------------------------------------------------- /build_graph/my_graph.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/build_graph/my_graph.npy -------------------------------------------------------------------------------- /build_graph/new_relations.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/build_graph/new_relations.npy -------------------------------------------------------------------------------- /build_graph/process.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | import numpy as np 4 | import progressbar 5 | 6 | from collections import Counter 7 | 8 | def build_graph(): 9 | data = json.load(open("relationships.json", 'rb')) 10 | mapping = pickle.load(open("new_mapping.pkl", "rb")) 11 | 12 | vg2idx = mapping['vg2idx'] 13 | idx2obj = mapping['idx2obj'] 14 | obj2idx = mapping['obj2idx'] 15 | # rela2idx = mapping['rela2idx'] 16 | 17 | cooc = {} 18 | cooc_pred = {} 19 | for i in range(105): 20 | for j in range(105): 21 | cooc[i, 
j] = [] 22 | cooc_pred[i, j] = [] 23 | 24 | bar = progressbar.ProgressBar() 25 | # invalid_predicates = [] 26 | for i in bar(range(len(data))): 27 | d = data[i] 28 | for r in d['relationships']: 29 | if "name" in r['object']: 30 | k = "name" 31 | else: 32 | k = "names" 33 | 34 | if type(r['object'][k]) == list: 35 | obj = r['object'][k][0] 36 | else: 37 | obj = r['object'][k] 38 | 39 | 40 | if "name" in r['subject']: 41 | k = 'name' 42 | else: 43 | k = "names" 44 | 45 | if type(r['subject'][k]) == list: 46 | sub = r['subject'][k][0] 47 | else: 48 | sub = r['subject'][k] 49 | 50 | try: 51 | objs = vg2idx[obj] 52 | subs = vg2idx[sub] 53 | except: 54 | continue 55 | 56 | for o in objs: 57 | for s in subs: 58 | try: 59 | obj_id = obj2idx[o] 60 | sub_id = obj2idx[s] 61 | except: 62 | continue 63 | # try: 64 | # cooc_pred[obj_id, sub_id].extend(rela2idx[r['predicate'].lower()]) 65 | # except: 66 | # invalid_predicates.append(r['predicate'].lower()) 67 | if type(r['predicate']) == list: 68 | cooc_pred[obj_id, sub_id].extend([p.lower() for p in r['predicate']]) 69 | else: 70 | cooc_pred[obj_id, sub_id].append(r['predicate'].lower()) 71 | 72 | # cooc[obj_id, sub_id].append(r['relationship_id']) 73 | 74 | 75 | relations = np.identity(105, np.float32) 76 | # raw_relations = np.identity(87, np.float32) 77 | for k, v in cooc_pred.items(): 78 | if len(v) > 0: 79 | cnt_v = Counter(v + cooc_pred[k[1], k[0]]) 80 | freqs = np.array(list(cnt_v.values())) 81 | if np.sum(freqs > 3) > 0: 82 | relations[k[0], k[1]] = 1 83 | relations[k[1], k[0]] = 1 84 | 85 | # for k, v in cooc.items(): 86 | # if k[0] != k[1]: 87 | # raw_relations[k[0], k[1]] = len(v + cooc[k[1], k[0]]) 88 | # raw_relations[k[1], k[0]] = len(v + cooc[k[1], k[0]]) 89 | 90 | with open("new_cooc_pred.pkl", 'wb') as f: 91 | pickle.dump(cooc_pred, f, pickle.HIGHEST_PROTOCOL) 92 | 93 | # with open("invalid.txt", 'wb') as f: 94 | # pickle.dump(invalid_predicates, f, pickle.HIGHEST_PROTOCOL) 95 | 96 | np.save("new_relations", relations) 97 | # np.save("raw_relations", raw_relations) 98 | 99 | def lcs(X, Y, m, n): 100 | LCSuff = [[0 for k in range(n+1)] for l in range(m+1)] 101 | 102 | # To store the length of 103 | # longest common substring 104 | result = 0 105 | 106 | # Following steps to build 107 | # LCSuff[m+1][n+1] in bottom up fashion 108 | for i in range(m + 1): 109 | for j in range(n + 1): 110 | if (i == 0 or j == 0): 111 | LCSuff[i][j] = 0 112 | elif (X[i-1] == Y[j-1]): 113 | LCSuff[i][j] = LCSuff[i-1][j-1] + 1 114 | result = max(result, LCSuff[i][j]) 115 | else: 116 | LCSuff[i][j] = 0 117 | return result 118 | 119 | def mapping_predicates(): 120 | mapping = pickle.load(open("mapping.pkl", "rb")) 121 | 122 | vg2idx = mapping['vg2idx'] 123 | idx2obj = mapping['idx2obj'] 124 | rela2idx = mapping['rela2idx'] 125 | 126 | new_rela = {} 127 | for k, v in rela2idx.items(): 128 | new_rela[k] = [v] 129 | 130 | known_pred = list(rela2idx.keys()) 131 | not_found = 0 132 | invalid_predicates = pickle.load(open('invalid.txt', 'rb')) 133 | 134 | bar = progressbar.ProgressBar() 135 | for i in bar(range(len(invalid_predicates))): 136 | p = invalid_predicates[i] 137 | new_rela[p] = [] 138 | found = 0 139 | for kp in known_pred: 140 | if lcs(p, kp, len(p), len(kp)) / max(len(p), len(kp)) > 0.6: 141 | new_rela[p].append(rela2idx[kp]) 142 | found = 1 143 | if found == 0: 144 | not_found += 1 145 | 146 | mapping['all_rela2idx'] = new_rela 147 | 148 | print("{} not found.".format(not_found)) 149 | 150 | with open("mapping.pkl", 'wb') as f: 151 | 
pickle.dump(mapping, f, pickle.HIGHEST_PROTOCOL) 152 | 153 | if __name__ == '__main__': 154 | build_graph() 155 | # mapping_predicates() -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resolution": [224, 224], 3 | "default_reward": -0.01, 4 | "success_reward": 10.0, 5 | "collide_reward": -0.1, 6 | <<<<<<< HEAD 7 | "embeddings_onehot": "/home/tailongnguyen/thesis/RL-target-driven-navigation-ai2thor/embedding_onehot.pkl", 8 | "embeddings_fasttext": "/home/tailongnguyen/thesis/RL-target-driven-navigation-ai2thor/embedding_fasttext300.pkl", 9 | "dump_path": "/home/tailongnguyen/thesis/RL-target-driven-navigation-ai2thor/dumped/", 10 | "adj_file": "/home/tailongnguyen/thesis/build_graph/new_relations.npy", 11 | ======= 12 | "embeddings_onehot": "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/embedding_onehot.pkl", 13 | "embeddings_fasttext": "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/embedding_fasttext300.pkl", 14 | "dump_path": "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/dumped/", 15 | "adj_file": "/home/yoshi/thesis/build_graph/new_relations.npy", 16 | >>>>>>> 3e16083a7dd07023980fd5cddf4ec521c2796629 17 | "all_objects": ["KeyChain", "Bread", "Potato", "Mug", "PaintingHanger", "Book", "ToiletPaper", "TableTop", 18 | "Chair", "Bed", "Container", "Painting", "Watch", "Apple", "Sink", "Cabinet", "StoveKnob", 19 | "Pan", "Cloth", "ShowerDoor", "CoffeeMachine", "Toaster", "Box", "CellPhone", "Tomato", 20 | "SoapBar", "HousePlant", "Bowl", "Lettuce", "ButterKnife", "Fridge", "Laptop", "Towel", 21 | "Knife", "Pen", "Plate", "SprayBottle", "Microwave", "LightSwitch", "StoveBurner", "Candle", 22 | "Pencil", "Blinds", "SoapBottle", "Lamp", "TowelHolder", "Statue", "Mirror", "Newspaper", 23 | "WateringCan", "Television", "AlarmClock", "CreditCard"], 24 | "picked": { 25 | "FloorPlan1": {"test": ["ButterKnife", "GarbageCan", "Kettle", "PepperShaker", "SoapBottle", "Spatula", "Statue"]}, 26 | <<<<<<< HEAD 27 | "FloorPlan2": {"test": ["CellPhone", "GarbageCan", "Kettle", "Ladle", "Spatula", "SaltShaker"]}, 28 | "FloorPlan10": {"test": ["ButterKnife", "CellPhone", "GarbageCan", "SoapBottle", "Statue", "Toaster", "Spatula"]}, 29 | "FloorPlan28": {"test": ["Blinds", "ButterKnife", "SoapBottle", "SaltShaker"]}, 30 | "FloorPlan201": {"test": ["RemoteControl", "Painting"]} 31 | }, 32 | "rooms": { 33 | "Kitchens":{ 34 | "train_scenes": [ 35 | "FloorPlan1", 36 | "FloorPlan2", 37 | "FloorPlan5", 38 | "FloorPlan8", 39 | "FloorPlan10", 40 | "FloorPlan16", 41 | "FloorPlan29", 42 | "FloorPlan12", 43 | "FloorPlan17", 44 | "FloorPlan30", 45 | "FloorPlan14", 46 | "FloorPlan13", 47 | "FloorPlan22", 48 | "FloorPlan7", 49 | "FloorPlan20"], 50 | "test_scenes": [ 51 | "FloorPlan18", 52 | "FloorPlan28" 53 | ], 54 | "train_objects": ["GarbageCan", "Sink", "Bread", "StoveKnob", "SinkBasin", "StoveBurner", "Fridge", "CounterTop", "Microwave", "LightSwitch", "CoffeeMachine", "Cabinet"], 55 | "test_objects": ["Toaster", "Mug", "Potato", "Window", "Bowl"] 56 | ======= 57 | "FloorPlan2": {"test": ["CellPhone", "GarbageCan", "Kettle", "Ladle", "Spatula", "SoapBottle", "SaltShaker"]}, 58 | "FloorPlan10": {"test": ["ButterKnife", "CellPhone", "GarbageCan", "SoapBottle", "Statue", "Toaster", "Spatula"]}, 59 | "FloorPlan28": {"test": ["Blinds", "ButterKnife", "SoapBottle", "SaltShaker"]} 60 | }, 61 | "rooms": { 62 | "Kitchens":{ 63 | "scenes": ["FloorPlan1", 64 | 
"FloorPlan2", 65 | "FloorPlan10", 66 | "FloorPlan28"], 67 | "train_objects": ["Fridge", "SoapBottle", "CoffeeMachine", "Microwave", "SaltShaker", "Tomato", "Bread", "StoveBurner", "StoveKnob", "Pan", "DishSponge", "Pot", "Bowl", "SinkBasin", "CounterTop", "Drawer", "Sink", "Cabinet", "Mug", "LightSwitch"], 68 | "test_objects": ["GarbageCan", "Toaster", "Spatula", "PepperShaker"] 69 | >>>>>>> 3e16083a7dd07023980fd5cddf4ec521c2796629 70 | }, 71 | "Living Rooms": { 72 | "train_scenes": [ 73 | "FloorPlan201", 74 | "FloorPlan202", 75 | "FloorPlan206", 76 | "FloorPlan207", 77 | "FloorPlan208", 78 | "FloorPlan209", 79 | "FloorPlan210", 80 | "FloorPlan211", 81 | "FloorPlan212", 82 | "FloorPlan213", 83 | "FloorPlan214", 84 | "FloorPlan216", 85 | "FloorPlan217", 86 | "FloorPlan218", 87 | "FloorPlan219" 88 | ], 89 | "test_scenes": [ 90 | "FloorPlan204", 91 | "FloorPlan205" 92 | ], 93 | "train_objects": ["ArmChair", "GarbageCan", "TableTop", "Sofa", "Television", "HousePlant", "Vase", "Painting", "FloorLamp", "Window"], 94 | "test_objects" : ["Box", "Pillow", "RemoteControl", "Chair", "GarbageCan", "Laptop"] 95 | }, 96 | "Bedrooms": { 97 | "train_scenes": [ 98 | "FloorPlan314", 99 | "FloorPlan301", 100 | "FloorPlan313", 101 | "FloorPlan306", 102 | "FloorPlan302", 103 | "FloorPlan304", 104 | "FloorPlan305", 105 | "FloorPlan310", 106 | "FloorPlan312", 107 | "FloorPlan308"], 108 | "test_scenes":[ 109 | "FloorPlan316", 110 | "FloorPlan317" 111 | ], 112 | "train_objects": ["Shelf", "AlarmClock", "GarbageCan", "KeyChain", "LightSwitch", "Bed", "Mirror", "Chair", "Desk", "Pen"], 113 | 114 | "test_objects" : ["CellPhone", "Pencil", "Book", "Drawer", "Laptop"] 115 | }, 116 | "Bathrooms": { 117 | "train_scenes": [ 118 | "FloorPlan410", 119 | "FloorPlan417", 120 | "FloorPlan418", 121 | "FloorPlan423", 122 | "FloorPlan427", 123 | "FloorPlan428", 124 | "FloorPlan429", 125 | "FloorPlan430"], 126 | "test_scenes":[ 127 | "FloorPlan414" 128 | ], 129 | "train_objects": ["Candle", "LightSwitch", "Sink", "SinkBasin", "SoapBottle", "Toilet", "TowelHolder", "Mirror", "SoapBar", "GarbageCan"], 130 | "test_objects" : ["ToiletPaperHanger", "Towel"] 131 | } 132 | }, 133 | "graph": "", 134 | "objects": { 135 | "AlarmClock": 0, 136 | "Apple": 1, 137 | "AppleSlice": 2, 138 | "Bathtub": 3, 139 | "Bed": 4, 140 | "Blinds": 5, 141 | "Book": 6, 142 | "Bowl": 7, 143 | "BowlDirty": 8, 144 | "BowlFilled": 9, 145 | "Box": 10, 146 | "Bread": 11, 147 | "BreadSliced": 12, 148 | "ButterKnife": 13, 149 | "Cabinet": 14, 150 | "Candle": 15, 151 | "CellPhone": 16, 152 | "Chair": 17, 153 | "Cloth": 18, 154 | "CoffeeMachine": 19, 155 | "Container": 20, 156 | "ContainerFull": 21, 157 | "CounterTop": 22, 158 | "CreditCard": 23, 159 | "Cup": 24, 160 | "Dirt": 25, 161 | "Egg": 26, 162 | "EggFried": 27, 163 | "EggShell": 28, 164 | "Fork": 29, 165 | "Fridge": 30, 166 | "GarbageCan": 31, 167 | "HousePlant": 32, 168 | "KeyChain": 33, 169 | "Knife": 34, 170 | "Lamp": 35, 171 | "Laptop": 36, 172 | "Lettuce": 37, 173 | "LettuceSliced": 38, 174 | "LightSwitch": 39, 175 | "Microwave": 40, 176 | "Mirror": 41, 177 | "MiscTableObject": 42, 178 | "Mug": 43, 179 | "MugFilled": 44, 180 | "Newspaper": 45, 181 | "Omelette": 46, 182 | "Painting": 47, 183 | "PaintingHanger": 48, 184 | "Pan": 49, 185 | "Pen": 50, 186 | "Pencil": 51, 187 | "Plate": 52, 188 | "Plunger": 53, 189 | "Pot": 54, 190 | "Potato": 55, 191 | "PotatoSliced": 56, 192 | "RemoteControl": 57, 193 | "Sandwich": 58, 194 | "ScrubBrush": 59, 195 | "ShowerDoor": 60, 196 | "Sink": 61, 197 | "SoapBar": 62, 198 
| "SoapBottle": 63, 199 | "SoapBottleFilled": 64, 200 | "Spoon": 65, 201 | "SportsEquipment": 66, 202 | "SprayBottle": 67, 203 | "Statue": 68, 204 | "StoveBurner": 69, 205 | "StoveKnob": 70, 206 | "TableTop": 71, 207 | "Television": 72, 208 | "TissueBox": 73, 209 | "TissueBoxEmpty": 74, 210 | "Toaster": 75, 211 | "Toilet": 76, 212 | "ToiletPaper": 77, 213 | "ToiletPaperHanger": 78, 214 | "ToiletPaperRoll": 79, 215 | "Tomato": 80, 216 | "TomatoSliced": 81, 217 | "Towel": 82, 218 | "TowelHolder": 83, 219 | "VacuumCleaner": 84, 220 | "Watch": 85, 221 | "WateringCan": 86 222 | }, 223 | "new_objects":{ 224 | "AlarmClock": 0, 225 | "Apple": 1, 226 | "ArmChair": 2, 227 | "BaseballBat": 3, 228 | "BasketBall": 4, 229 | "Bathtub": 5, 230 | "BathtubBasin": 6, 231 | "Bed": 7, 232 | "Blinds": 8, 233 | "Book": 9, 234 | "Boots": 10, 235 | "Bottle": 11, 236 | "Bowl": 12, 237 | "Box": 13, 238 | "Bread": 14, 239 | "ButterKnife": 15, 240 | "Cabinet": 16, 241 | "Candle": 17, 242 | "Cart": 18, 243 | "CD": 19, 244 | "CellPhone": 20, 245 | "Chair": 21, 246 | "Cloth": 22, 247 | "CoffeeMachine": 23, 248 | "CounterTop": 24, 249 | "CreditCard": 25, 250 | "Cup": 26, 251 | "Curtains": 27, 252 | "Desk": 28, 253 | "DeskLamp": 29, 254 | "DishSponge": 30, 255 | "Drawer": 31, 256 | "Dresser": 32, 257 | "Egg": 33, 258 | "FloorLamp": 34, 259 | "Footstool": 35, 260 | "Fork": 36, 261 | "Fridge": 37, 262 | "GarbageCan": 38, 263 | "HandTowel": 39, 264 | "HandTowelHolder": 40, 265 | "HousePlant": 41, 266 | "Kettle": 42, 267 | "KeyChain": 43, 268 | "Knife": 44, 269 | "Ladle": 45, 270 | "Laptop": 46, 271 | "LaundryHamper": 47, 272 | "LaundryHamperLid": 48, 273 | "Lettuce": 49, 274 | "LightSwitch": 50, 275 | "Microwave": 51, 276 | "Mirror": 52, 277 | "Mug": 53, 278 | "Newspaper": 54, 279 | "NightStand": 55, 280 | "Ottoman": 56, 281 | "Painting": 57, 282 | "Pan": 58, 283 | "PaperTowel": 59, 284 | "Pen": 60, 285 | "Pencil": 61, 286 | "PepperShaker": 62, 287 | "Pillow": 63, 288 | "Plate": 64, 289 | "Plunger": 65, 290 | "Poster": 66, 291 | "Pot": 67, 292 | "Potato": 68, 293 | "RemoteControl": 69, 294 | "Safe": 70, 295 | "SaltShaker": 71, 296 | "ScrubBrush": 72, 297 | "Shelf": 73, 298 | "ShowerDoor": 74, 299 | "ShowerGlass": 75, 300 | "Sink": 76, 301 | "SinkBasin": 77, 302 | "SoapBar": 78, 303 | "SoapBottle": 79, 304 | "Sofa": 80, 305 | "Spatula": 81, 306 | "Spoon": 82, 307 | "SprayBottle": 83, 308 | "Statue": 84, 309 | "StoveBurner": 85, 310 | "StoveKnob": 86, 311 | "TableTop": 87, 312 | "TeddyBear": 88, 313 | "Television": 89, 314 | "TennisRacket": 90, 315 | "TissueBox": 91, 316 | "Toaster": 92, 317 | "Toilet": 93, 318 | "ToiletPaper": 94, 319 | "ToiletPaperHanger": 95, 320 | "ToiletPaperRoll": 96, 321 | "Tomato": 97, 322 | "Towel": 98, 323 | "TowelHolder": 99, 324 | "Vase": 100, 325 | "Watch": 101, 326 | "WateringCan": 102, 327 | "Window": 103, 328 | "WineBottle": 104 329 | } 330 | 331 | } 332 | -------------------------------------------------------------------------------- /download_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import numpy as np 5 | import sys 6 | from collections import defaultdict 7 | from tensorboard.backend.event_processing.event_accumulator import EventAccumulator 8 | 9 | 10 | def process(dpath): 11 | folders = [f for f in os.listdir(dpath)] 12 | # folders = ['Z_16'] 13 | summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in folders] 14 | 15 | for f, summary 
in zip(folders, summary_iterators): 16 | tag_dict = {} 17 | tags = summary.Tags()['scalars'] 18 | for tag in tags: 19 | log_type = "_".join(tag.split('/')[1:]) 20 | if log_type not in tag_dict: 21 | tag_dict[log_type] = {'steps' : [], 'values': [] } 22 | 23 | steps = [e.step for e in summary.Scalars(tag)] 24 | values = [e.value for e in summary.Scalars(tag)] 25 | 26 | tag_dict[log_type]['steps'].append(steps) 27 | tag_dict[log_type]['values'].append(values) 28 | 29 | # print(list(tag_dict.keys())) 30 | # break 31 | for k, v in tag_dict.items(): 32 | df = pd.DataFrame(columns=['Step', 'Value']) 33 | # print(v['steps'], v['values']) 34 | # sys.exit() 35 | df['Step'] = np.mean(np.vstack(v['steps']), 0) 36 | df['Value'] = np.mean(np.vstack(v['values']), 0) 37 | df.to_csv(os.path.join(dpath, "{}.csv".format(k))) 38 | 39 | if __name__ == '__main__': 40 | path = "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan2_6" 41 | process(path) -------------------------------------------------------------------------------- /dqn/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | 5 | from replay_buffer import ReplayBuffer 6 | from model import QNetwork 7 | 8 | 9 | class Agent(): 10 | """Interacts with and learns from the environment.""" 11 | 12 | def __init__(self, sess, state_size, action_size, seed, arguments): 13 | """Initialize an Agent object. 14 | 15 | Params 16 | ====== 17 | state_size (int): dimension of each state 18 | action_size (int): dimension of each action 19 | seed (int): random seed 20 | """ 21 | self.sess = sess 22 | self.state_size = state_size 23 | self.action_size = action_size 24 | self.seed = random.seed(seed) 25 | 26 | self.learning_rate = arguments['lr'] 27 | self.gamma = arguments['gamma'] 28 | self.update_every = arguments['update_every'] 29 | self.tau = arguments['tau'] 30 | self.history_size = arguments['history_size'] 31 | 32 | self.buffer_size = arguments['buffer_size'] 33 | self.batch_size = arguments['batch_size'] 34 | 35 | # Q-Network 36 | self.qnetwork_local = QNetwork('local_q', state_size, action_size, self.history_size) 37 | self.qnetwork_target = QNetwork('target_q', state_size, action_size, self.history_size) 38 | 39 | copy_ops = [] 40 | for local_w, target_w in zip(self.qnetwork_local.variables, self.qnetwork_target.variables): 41 | copy_op = tf.assign(local_w, local_w * self.tau + (1.0 - self.tau) * target_w) 42 | copy_ops.append(copy_op) 43 | 44 | self.copy_ops = tf.group(*copy_ops, name='copy_op') 45 | 46 | # Replay memory 47 | self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) 48 | # Initialize time step (for updating every self.update_every steps) 49 | self.t_step = 0 50 | 51 | def step(self, state, action, reward, next_state, done): 52 | # Save experience in replay memory 53 | self.memory.add(state, action, reward, next_state, done) 54 | 55 | # Learn every self.update_every time steps. 56 | self.t_step = (self.t_step + 1) % self.update_every 57 | if self.t_step == 0: 58 | # If enough samples are available in memory, get random subset and learn 59 | if len(self.memory) > self.batch_size: 60 | experiences = self.memory.sample() 61 | self.learn(experiences, self.gamma) 62 | 63 | def act(self, state, eps=0.): 64 | """Returns actions for given state as per current policy. 
65 | 66 | Params 67 | ====== 68 | state (array_like): current state 69 | eps (float): epsilon, for epsilon-greedy action selection 70 | """ 71 | q_values = self.sess.run( 72 | self.qnetwork_local.q_values, 73 | feed_dict={ 74 | self.qnetwork_local.inputs: [state] 75 | }).ravel().tolist() 76 | 77 | # Epsilon-greedy action selection 78 | if random.random() > eps: 79 | return np.argmax(q_values) 80 | else: 81 | return random.choice(np.arange(self.action_size)) 82 | 83 | def learn(self, experiences, gamma): 84 | """Update value parameters using given batch of experience tuples. 85 | Params 86 | ====== 87 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 88 | gamma (float): discount factor 89 | """ 90 | states, actions, rewards, next_states, dones = experiences 91 | # Get max predicted Q values (for next states) from target model 92 | q_target_values = self.sess.run( 93 | self.qnetwork_target.q_values, 94 | feed_dict={ 95 | self.qnetwork_target.inputs: next_states 96 | }) 97 | Q_targets_next = np.max(q_target_values, axis=1).reshape(-1, 1) 98 | 99 | # Compute Q targets for current states 100 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 101 | 102 | try: 103 | self.sess.run(self.qnetwork_local.optimizer, 104 | feed_dict={ 105 | self.qnetwork_local.learning_rate: self.learning_rate, 106 | self.qnetwork_local.inputs: states, 107 | self.qnetwork_local.actions: actions, 108 | self.qnetwork_local.target_Q: np.squeeze(Q_targets), 109 | }) 110 | except: 111 | print(states.shape) 112 | print(actions.shape) 113 | print(rewards.shape) 114 | print(next_states.shape) 115 | print(dones.shape) 116 | print(q_target_values.shape) 117 | print(Q_targets_next.shape) 118 | print(Q_targets.shape) 119 | import sys 120 | sys.exit() 121 | 122 | # ------------------- update target network ------------------- # 123 | self.soft_update() 124 | 125 | def soft_update(self): 126 | """Soft update model parameters. 127 | θ_target = τ*θ_local + (1 - τ)*θ_target 128 | Params 129 | ====== 130 | local_model (PyTorch model): weights will be copied from 131 | target_model (PyTorch model): weights will be copied to 132 | tau (float): interpolation parameter 133 | """ 134 | 135 | self.sess.run(self.copy_ops) 136 | 137 | -------------------------------------------------------------------------------- /dqn/env/ai2thor_env.py: -------------------------------------------------------------------------------- 1 | import ai2thor.controller 2 | import numpy as np 3 | import gym 4 | import cv2 5 | import h5py 6 | import os 7 | import sys 8 | import random 9 | 10 | from copy import deepcopy 11 | from gym import error, spaces 12 | from gym.utils import seeding 13 | 14 | class AI2ThorDumpEnv(): 15 | """ 16 | Wrapper base class 17 | """ 18 | def __init__(self, scene, target, config, arguments=dict(), seed=None): 19 | """ 20 | :param seed: (int) Random seed 21 | :param config: (str) Dictionary file storing cofigurations 22 | :param: scene: (list) Scene to train on 23 | :param: objects: (list) Target object to train on 24 | """ 25 | 26 | self.config = config 27 | self.scene = scene 28 | self.target = target 29 | self.history_size = arguments.get('history_size') 30 | self.action_size = arguments.get('action_size') 31 | 32 | assert self.action_size <= self.graph.shape[1], "The number of actions exceeds the limit of environment." 
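        # The per-scene HDF5 dump opened just below is expected to provide at least:
        #   'locations'       -> (N, ...) array of agent location records (self.states)
        #   'graph'           -> (N, num_actions) transition table: graph[s][a] is the id of
        #                        the state reached by taking action a in state s, or -1 if
        #                        that move collides
        #   'resnet_features' -> (N, feature_dim) pre-extracted visual features
        #   'visible_objects' -> per-state comma-separated object names
        # plus optional 'shortest' and 'sharing' datasets when present in the file.
        # Illustrative transition lookup (a sketch of what step() further below does):
        #   next_id  = int(graph[current_state_id][action])   # -1 => collision, keep state
        #   terminal = next_id in target_ids                  # target visible from new state
        # Note: self.graph is only assigned once the dump is loaded a few lines below, so the
        # assertion above runs before that attribute exists.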
33 | 34 | self.h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], self.scene)), 'r') 35 | 36 | all_visible_objects = set(",".join([o for o in list(self.h5_file['visible_objects']) if o != '']).split(',')) 37 | 38 | assert self.target in all_visible_objects, "Target {} is unreachable in {}!".format(self.target, self.scene) 39 | 40 | self.states = self.h5_file['locations'][()] 41 | self.graph = self.h5_file['graph'][()] 42 | self.features = self.h5_file['resnet_features'][()] 43 | self.visible_objects = self.h5_file['visible_objects'][()] 44 | 45 | if "shortest" in self.h5_file.keys(): 46 | self.shortest = self.h5_file['shortest'][()] 47 | 48 | if "sharing" in self.h5_file.keys(): 49 | self.sharing = self.h5_file['sharing'][()].tolist() 50 | 51 | self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")] 52 | 53 | self.action_space = self.action_size 54 | self.cv_action_onehot = np.identity(self.action_space) 55 | 56 | # Randomness settings 57 | self.np_random = None 58 | if seed: 59 | self.seed(seed) 60 | 61 | self.history_states = np.zeros((self.history_size, self.features.shape[1])) 62 | 63 | def step(self, action): 64 | ''' 65 | 0: move ahead 66 | 1: move back 67 | 2: rotate right 68 | 3: rotate left 69 | 4: look down 70 | 5: look up 71 | ''' 72 | 73 | if action >= self.action_space: 74 | raise error.InvalidAction('Action must be an integer between ' 75 | '0 and {}!'.format(self.action_space - 1)) 76 | k = self.current_state_id 77 | if self.graph[k][action] != -1: 78 | self.current_state_id = int(self.graph[k][action]) 79 | if self.current_state_id in self.target_ids: 80 | self.terminal = True 81 | collided = False 82 | else: 83 | self.terminal = False 84 | collided = False 85 | else: 86 | self.terminal = False 87 | collided = True 88 | 89 | reward, done = self.transition_reward(collided) 90 | 91 | self.update_states() 92 | 93 | return self.history_states, reward, done 94 | 95 | def transition_reward(self, collided): 96 | reward = self.config['default_reward'] 97 | done = 0 98 | if self.terminal: 99 | reward = self.config['success_reward'] 100 | done = 1 101 | elif self.config['anti-collision'] and collided: 102 | reward = self.config['collide_reward'] 103 | 104 | return reward, done 105 | 106 | def reset(self): 107 | # reset parameters 108 | if self.action_size == self.action_space: 109 | self.current_state_id = random.randrange(self.states.shape[0]) 110 | else: 111 | while 1: 112 | k = random.randrange(self.states.shape[0]) 113 | if int(self.states[k][-1]) == 0: 114 | break 115 | 116 | self.current_state_id = k 117 | 118 | self.update_states() 119 | self.terminal = False 120 | 121 | return self.history_states, self.target 122 | 123 | def update_states(self): 124 | f = self.features[self.current_state_id] 125 | self.history_states = np.append(self.history_states[1:, :], np.transpose(f, (1,0)), 0) 126 | 127 | def state(self, state_id): 128 | return self.features[state_id] 129 | 130 | def seed(self, seed=None): 131 | self.np_random, seed1 = seeding.np_random(seed) 132 | # Derive a random seed. This gets passed as a uint, but gets 133 | # checked as an int elsewhere, so we need to keep it below 134 | # 2**31. 
135 | return seed1 136 | 137 | if __name__ == '__main__': 138 | AI2ThorEnv() 139 | -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "my-model" 2 | all_model_checkpoint_paths: "my-model" 3 | -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689861.asr02.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689861.asr02.local -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001 -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.index -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.meta -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "my-model" 2 | all_model_checkpoint_paths: "my-model" 3 | -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689885.asr02.local: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689885.asr02.local -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001 -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.index -------------------------------------------------------------------------------- /dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.meta -------------------------------------------------------------------------------- /dqn/main.py: -------------------------------------------------------------------------------- 1 | from env.ai2thor_env import AI2ThorDumpEnv 2 | from agent import Agent 3 | from utils import LinearSchedule 4 | from datetime import datetime 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | import random 10 | import time 11 | import json 12 | import argparse 13 | 14 | ALL_ROOMS = { 15 | 0: "Kitchens", 16 | 1: "Living Rooms", 17 | 2: "Bedrooms", 18 | 3: "Bathrooms" 19 | } 20 | 21 | def read_config(config_path): 22 | if os.path.isfile(config_path): 23 | with open(config_path) as f: 24 | config = json.load(f) 25 | return config 26 | 27 | def main(config, arguments): 28 | room = config['rooms'][ALL_ROOMS[arguments['room_id']]] 29 | all_scenes = room['scenes'] 30 | train_objects = room['train_objects'] 31 | test_objects = room['test_objects'] 32 | 33 | training_scene = all_scenes[arguments['scene_id']] 34 | 35 | # h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], training_scene)), 'r') 36 | # all_visible_objects = set(",".join([o for o in list(h5_file['visible_objects']) if o != '']).split(',')) 37 | # print(all_visible_objects) 38 | # trainable_objects = list(set(train_objects).intersection(all_visible_objects)) 39 | # h5_file.close() 40 | # print(trainable_objects) 41 | 42 | trainable_objects = { 43 | 0: ['Knife', 'Sink', 'CoffeeMachine', 'StoveKnob', 'StoveBurner', 'Cabinet', 'Fridge', 'TableTop'], 44 | 1: ['CoffeeMachine', 'StoveBurner', 'Sink', 'GarbageCan', 'TableTop', 'Fridge', 'Mug', 'StoveKnob', 'Microwave', 'Cabinet', 'Chair'], 45 | 27: ['Cabinet', 'TableTop', 'StoveKnob', 'Fridge', 'Sink', 'StoveBurner', 'CoffeeMachine'] 46 | } 47 | 48 | training_object = trainable_objects[arguments['scene_id']][arguments['target_id']] 49 | 50 | env = AI2ThorDumpEnv(training_scene, training_object, config, arguments) 51 | 52 | tf.reset_default_graph() 53 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=arguments['gpu_fraction']) 54 | sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options)) 55 | 56 | agent = Agent(sess, env.features.shape[1], env.action_space, int(time.time() * 100) % 100, arguments) 57 | sess.run(tf.global_variables_initializer()) 58 | 59 | 60 | saver = tf.train.Saver() 61 | timer = 
"{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), training_scene, training_object) 62 | log_folder = os.path.join(arguments.get('logging'), timer) 63 | writer = tf.summary.FileWriter(log_folder) 64 | 65 | reward_log = tf.placeholder(tf.float32) 66 | redundant_log = tf.placeholder(tf.float32) 67 | 68 | test_name = training_scene 69 | tf.summary.scalar(test_name + "/" + training_object + "/rewards", reward_log) 70 | tf.summary.scalar(test_name + "/" + training_object + "/redundants", redundant_log) 71 | 72 | write_op = tf.summary.merge_all() 73 | 74 | num_epochs = arguments['num_epochs'] 75 | num_steps = arguments['num_iters'] 76 | 77 | epsilon_schedule = LinearSchedule(num_epochs, final_p=0.02) 78 | ep_rewards = [] 79 | start_time = time.time() 80 | for ep in range(num_epochs): 81 | state, target = env.reset() 82 | start = env.current_state_id 83 | rewards = 0 84 | redundant = 0 85 | 86 | for step in range(num_steps): 87 | action = agent.act(state, epsilon_schedule.value(ep)) 88 | next_state, reward, done = env.step(action) 89 | agent.step(state, env.cv_action_onehot[action], reward, next_state, done) 90 | state = next_state 91 | 92 | rewards += reward 93 | if done: 94 | break 95 | 96 | if not done: 97 | end = env.current_state_id 98 | try: 99 | redundants = [] 100 | for target_id in env.target_ids: 101 | redundants.append(num_steps + env.shortest[end, target_id] - env.shortest[start, target_id]) 102 | 103 | redundant = min(redundants) 104 | except AttributeError: 105 | pass 106 | 107 | ep_rewards.append(rewards) 108 | print("Ep {}/{}, elapsed time: {:.3f} | rewards: {:.3f}| mean rewards: {:.3f}".format( 109 | ep+1, num_epochs, (time.time() - start_time)/3600, 110 | rewards, np.mean(ep_rewards)), end='\r', flush=True) 111 | if ep % 100 == 0: 112 | print("Ep {}/{}, elapsed time: {:.3f} | rewards: {:.3f}| mean rewards: {:.3f}\n".format( 113 | ep+1, num_epochs, (time.time() - start_time)/3600, 114 | rewards, np.mean(ep_rewards))) 115 | 116 | summary = sess.run(write_op, feed_dict = { 117 | reward_log: rewards, 118 | redundant_log: redundant, 119 | }) 120 | 121 | writer.add_summary(summary, ep + 1) 122 | writer.flush() 123 | 124 | saver.save(sess, log_folder + "/my-model") 125 | sess.close() 126 | 127 | 128 | if __name__ == '__main__': 129 | parser = argparse.ArgumentParser(description='Arguments') 130 | parser.add_argument('--room_id', type=int, default=0) 131 | parser.add_argument('--scene_id', nargs='?', type=int, default=0) 132 | parser.add_argument('--target_id', nargs='?', type=int, default=0) 133 | parser.add_argument('--gpu_fraction', nargs='?', type=float, default=0.15, 134 | help='GPU memory usage fraction') 135 | parser.add_argument('--history_size', type=int, default=1, 136 | help='whether to stack frames to make input') 137 | parser.add_argument('--num_epochs', nargs='?', type=int, default=10000, 138 | help='Number of epochs to train') 139 | parser.add_argument('--num_iters', nargs='?', type=int, default=100, 140 | help='Number of steps to be sampled in each episode') 141 | parser.add_argument('--buffer_size', nargs='?', type=int, default=100000, 142 | help='replay buffer size') 143 | parser.add_argument('--batch_size', nargs='?', type=int, default=64, 144 | help='minibatch size') 145 | parser.add_argument('--gamma', nargs='?', type=float, default=0.99, 146 | help='discount factor') 147 | parser.add_argument('--tau', nargs='?', type=float, default=1e-3, 148 | help='for soft update of target parameters') 149 | parser.add_argument('--lr', 
nargs='?', type=float, default=5e-4, 150 | help='learning rate') 151 | parser.add_argument('--update_every', nargs='?', type=int, default=4, 152 | help='how often to update the network') 153 | parser.add_argument('--logging', type=str, default="training-history/", 154 | help='Logging folder') 155 | parser.add_argument('--config_file', type=str, default="config.json") 156 | 157 | 158 | args = parser.parse_args() 159 | 160 | # print(vars(args)) 161 | config = read_config(args.config_file) 162 | main(config, vars(args)) 163 | -------------------------------------------------------------------------------- /dqn/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | HIDDEN_SIZE = 128 4 | 5 | def _fc_weight_variable(shape, name='W_fc'): 6 | input_channels = shape[0] 7 | d = 1.0 / np.sqrt(input_channels) 8 | initial = tf.random_uniform(shape, minval=-d, maxval=d) 9 | return tf.get_variable(name=name, dtype = tf.float32, initializer=initial) 10 | 11 | def _fc_bias_variable(shape, input_channels, name='b_fc'): 12 | d = 1.0 / np.sqrt(input_channels) 13 | initial = tf.random_uniform(shape, minval=-d, maxval=d) 14 | return tf.get_variable(name=name, dtype=tf.float32, initializer=initial) 15 | 16 | 17 | class QNetwork(): 18 | def __init__(self, name, state_size, action_size, history_size=1, dropout_keep_prob=-1): 19 | self.state_size = state_size 20 | self.action_size = action_size 21 | 22 | with tf.variable_scope(name): 23 | self.inputs = tf.placeholder(tf.float32, [None, history_size, self.state_size]) 24 | 25 | self.inputs_flat = tf.reshape(self.inputs, [-1, self.state_size * history_size]) 26 | self.actions = tf.placeholder(tf.float32, [None, self.action_size]) 27 | self.target_Q = tf.placeholder(tf.float32, [None]) 28 | self.learning_rate = tf.placeholder(tf.float32, []) 29 | 30 | self.W_fc1 = _fc_weight_variable([self.state_size * history_size, HIDDEN_SIZE], name="W_fc1") 31 | self.b_fc1 = _fc_bias_variable([HIDDEN_SIZE], self.state_size, name="b_fc1") 32 | self.fc1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(self.inputs_flat, self.W_fc1), self.b_fc1)) 33 | 34 | if dropout_keep_prob != -1: 35 | self.fc1 = tf.nn.dropout(self.fc1, dropout_keep_prob) 36 | 37 | self.W_fc2 = _fc_weight_variable([HIDDEN_SIZE, self.action_size], name="W_fc2") 38 | self.b_fc2 = _fc_bias_variable([self.action_size], HIDDEN_SIZE, name="b_fc2") 39 | 40 | self.q_values = tf.nn.bias_add(tf.matmul(self.fc1, self.W_fc2), self.b_fc2) 41 | self.Q_expected = tf.reduce_sum(tf.multiply(self.q_values, self.actions)) 42 | 43 | 44 | self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q_expected)) 45 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) 46 | 47 | self.variables = [self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2] 48 | -------------------------------------------------------------------------------- /dqn/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from collections import namedtuple, deque 4 | 5 | class ReplayBuffer: 6 | """Fixed-size buffer to store experience tuples.""" 7 | 8 | def __init__(self, action_size, buffer_size, batch_size, seed): 9 | """Initialize a ReplayBuffer object. 
10 | Params 11 | ====== 12 | action_size (int): dimension of each action 13 | buffer_size (int): maximum size of buffer 14 | batch_size (int): size of each training batch 15 | seed (int): random seed 16 | """ 17 | self.action_size = action_size 18 | self.memory = deque(maxlen=buffer_size) 19 | self.batch_size = batch_size 20 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 21 | self.seed = random.seed(seed) 22 | 23 | def add(self, state, action, reward, next_state, done): 24 | """Add a new experience to memory.""" 25 | e = self.experience(state, action, reward, next_state, done) 26 | self.memory.append(e) 27 | 28 | def sample(self): 29 | """Randomly sample a batch of experiences from memory.""" 30 | experiences = random.sample(self.memory, k=self.batch_size) 31 | 32 | states = np.vstack([[e.state] for e in experiences if e is not None]) 33 | actions = np.vstack([e.action for e in experiences if e is not None]) 34 | rewards = np.vstack([e.reward for e in experiences if e is not None]) 35 | next_states = np.vstack([[e.next_state] for e in experiences if e is not None]) 36 | dones = np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8) 37 | 38 | return (states, actions, rewards, next_states, dones) 39 | 40 | def __len__(self): 41 | """Return the current size of internal memory.""" 42 | return len(self.memory) -------------------------------------------------------------------------------- /dqn/utils.py: -------------------------------------------------------------------------------- 1 | class LearningRateDecay(object): 2 | def __init__(self, v, nvalues, lr_decay_method): 3 | self.n = 0. 4 | self.v = v 5 | self.nvalues = nvalues 6 | 7 | def constant(p): 8 | return 1 9 | 10 | def linear(p): 11 | return 1 - p 12 | 13 | lr_decay_methods = { 14 | 'linear': linear, 15 | 'constant': constant 16 | } 17 | 18 | self.decay = lr_decay_methods[lr_decay_method] 19 | 20 | def value(self): 21 | current_value = self.v * self.decay(self.n / self.nvalues) 22 | self.n += 1. 23 | return current_value 24 | 25 | def get_value_for_steps(self, steps): 26 | return self.v * self.decay(steps / self.nvalues) 27 | 28 | class LinearSchedule(object): 29 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 30 | """Linear interpolation between initial_p and final_p over 31 | schedule_timesteps. After this many timesteps pass final_p is 32 | returned. 
33 | Parameters 34 | ---------- 35 | schedule_timesteps: int 36 | Number of timesteps for which to linearly anneal initial_p 37 | to final_p 38 | initial_p: float 39 | initial output value 40 | final_p: float 41 | final output value 42 | """ 43 | self.schedule_timesteps = schedule_timesteps 44 | self.final_p = final_p 45 | self.initial_p = initial_p 46 | 47 | def value(self, t): 48 | """See Schedule.value""" 49 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 50 | return self.initial_p + fraction * (self.final_p - self.initial_p) -------------------------------------------------------------------------------- /draft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import h5py 4 | import operator 5 | 6 | config = json.load(open('config.json')) 7 | 8 | scene_type = ['Kitchens', 'Living Rooms', 'Bedrooms', 'Bathrooms'] 9 | visible = [] 10 | cnt = {} 11 | st = 0 12 | 13 | for s in ['train_scenes', 'test_scenes']: 14 | cnt[s] = {} 15 | for f in config['rooms'][scene_type[st]][s]: 16 | f = h5py.File("dumped/{}.hdf5".format(f), 'r') 17 | visible.append(f['all_visible_objects'][()].tolist()) 18 | for o in f['all_visible_objects'][()].tolist(): 19 | if o not in cnt[s]: 20 | cnt[s][o] = 1 21 | else: 22 | cnt[s][o] +=1 23 | if s == 'train_scenes': 24 | cnt[s] = [o for o, c in cnt[s].items() if c > 7] 25 | else: 26 | cnt[s] = [o for o, c in cnt[s].items()] 27 | 28 | print(cnt) 29 | 30 | print("Joint: ", set(cnt['train_scenes']).intersection(set(cnt['test_scenes']))) -------------------------------------------------------------------------------- /embedding_fasttext300.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/embedding_fasttext300.pkl -------------------------------------------------------------------------------- /embedding_onehot.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/embedding_onehot.pkl -------------------------------------------------------------------------------- /env/ai2thor_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import cv2 4 | import h5py 5 | import os 6 | import sys 7 | import random 8 | 9 | class AI2ThorDumpEnv(): 10 | """ 11 | Wrapper base class 12 | """ 13 | def __init__(self, scene, target, config, arguments=dict(), seed=None): 14 | """ 15 | :param seed: (int) Random seed 16 | :param config: (str) Dictionary file storing cofigurations 17 | :param: scene: (list) Scene to train 18 | :param: target: (list) Target object to train 19 | """ 20 | if seed is not None: 21 | np.random.seed(seed) 22 | 23 | self.config = config 24 | self.arguments = arguments 25 | self.scene = scene 26 | self.target = target 27 | self.history_size = arguments.get('history_size') 28 | self.action_size = arguments.get('action_size') 29 | 30 | self.h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], self.scene)), 'r') 31 | 32 | self.states = self.h5_file['locations'][()] 33 | self.graph = self.h5_file['graph'][()] 34 | self.scores = self.h5_file['resnet_scores'][()] if not arguments['yolo_gcn'] else self.h5_file['dump_features'][()][:, :-4].astype(bool).astype(int) 35 | self.all_visible_objects = 
self.h5_file['all_visible_objects'][()].tolist() 36 | self.visible_objects = self.h5_file['visible_objects'][()] 37 | self.observations = self.h5_file['observations'][()] 38 | 39 | assert self.target in self.all_visible_objects, "Target {} is unreachable in {}!".format(self.target, self.scene) 40 | 41 | self.resnet_features = self.h5_file['resnet_features'][()] 42 | self.dump_features = self.h5_file['dump_features'][()] 43 | 44 | if arguments['onehot']: 45 | self.features = self.dump_features 46 | else: 47 | self.features = self.resnet_features 48 | 49 | assert self.action_size <= self.graph.shape[1], "The number of actions exceeds the limit of environment." 50 | 51 | if "shortest" in self.h5_file.keys(): 52 | self.shortest = self.h5_file['shortest'][()] 53 | 54 | if self.arguments['hard']: 55 | # agent has to reach the correct position and has right rotation 56 | self.offset = 3 57 | else: 58 | # agent only has to reach the correct position 59 | self.offset = 2 60 | 61 | self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")] 62 | self.target_locs = set([tuple(self.states[idx][:self.offset]) for idx in self.target_ids]) 63 | 64 | self.action_space = self.action_size 65 | self.cv_action_onehot = np.identity(self.action_space) 66 | 67 | self.history_states = np.zeros((self.history_size, self.features.shape[1])) 68 | self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)] 69 | 70 | def step(self, action): 71 | ''' 72 | 0: move ahead 73 | 1: move back 74 | 2: rotate right 75 | 3: rotate left 76 | 4: look down 77 | 5: look up 78 | ''' 79 | 80 | if action >= self.action_space: 81 | raise error.InvalidAction('Action must be an integer between ' 82 | '0 and {}!'.format(self.action_space - 1)) 83 | k = self.current_state_id 84 | if self.graph[k][action] != -1: 85 | if action == 2 or action == 3: 86 | for _ in range(int(self.arguments['angle'] / 22.5)): 87 | self.current_state_id = int(self.graph[k][action]) 88 | else: 89 | self.current_state_id = int(self.graph[k][action]) 90 | 91 | if tuple(self.states[self.current_state_id][:self.offset]) in self.target_locs: 92 | self.terminal = True 93 | self.collided = False 94 | else: 95 | self.terminal = False 96 | self.collided = False 97 | else: 98 | self.terminal = False 99 | self.collided = True 100 | 101 | reward, done = self.transition_reward() 102 | 103 | self.update_states() 104 | 105 | if self.arguments['train_cnn']: 106 | return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], reward, done 107 | else: 108 | return self.history_states, self.scores[self.current_state_id], reward, done 109 | 110 | def transition_reward(self): 111 | reward = self.config['default_reward'] 112 | done = 0 113 | if self.terminal: 114 | reward = self.config['success_reward'] 115 | done = 1 116 | elif self.arguments['anti_col'] and self.collided: 117 | reward = self.config['collide_reward'] 118 | 119 | return reward, done 120 | 121 | def reset(self): 122 | # reset parameters 123 | k = random.randrange(self.states.shape[0]) 124 | 125 | while self.states[k][2] % self.arguments['angle'] != 0.0: 126 | k = random.randrange(self.states.shape[0]) 127 | 128 | self.current_state_id = k 129 | 130 | self.update_states(reset=True) 131 | self.terminal = False 132 | self.collided = False 133 | 134 | if self.arguments['train_cnn']: 135 | return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], self.target 136 | else: 137 
| return self.history_states, self.scores[self.current_state_id], self.target 138 | 139 | def update_states(self, reset=False): 140 | if reset: 141 | self.history_states = np.zeros((self.history_size, self.features.shape[1])) 142 | self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)] 143 | 144 | f = self.features[self.current_state_id] 145 | 146 | self.history_states = np.append(self.history_states[1:, :], f[np.newaxis, :], 0) 147 | 148 | self.observations_stack.append(self.observation()) 149 | self.observations_stack = self.observations_stack[1:] 150 | 151 | def state(self): 152 | return self.features[self.current_state_id] 153 | 154 | def observation(self): 155 | ob = self.observations[self.current_state_id] 156 | resized_ob = cv2.resize(ob, (128, 128)) 157 | return np.transpose(resized_ob, (2, 0, 1)) 158 | 159 | class MultiSceneEnv(): 160 | """ 161 | Wrapper base class 162 | """ 163 | def __init__(self, scene, config, arguments=dict(), seed=None): 164 | """ 165 | :param seed: (int) Random seed 166 | :param config: (str) Dictionary file storing cofigurations 167 | :param: scene: (list) Scene to train 168 | :param: objects: (list) Target objects to train 169 | """ 170 | 171 | if seed is not None: 172 | np.random.seed(seed) 173 | 174 | self.config = config 175 | self.arguments = arguments 176 | self.scene = scene 177 | 178 | self.history_size = arguments.get('history_size') 179 | self.action_size = arguments.get('action_size') 180 | 181 | scene_id = int(scene.split("FloorPlan")[1]) 182 | if scene_id > 0 and scene_id < 31: 183 | room_type = "Kitchens" 184 | elif scene_id > 200 and scene_id < 231: 185 | room_type = 'Living Rooms' 186 | elif scene_id > 300 and scene_id < 331: 187 | room_type = 'Bedrooms' 188 | elif scene_id > 400 and scene_id < 431: 189 | room_type = 'Bathrooms' 190 | else: 191 | raise KeyError 192 | 193 | if arguments['test'] == 1: 194 | self.targets = config["rooms"][room_type]['train_objects'] + config["rooms"][room_type]['test_objects'] 195 | else: 196 | self.targets = config["rooms"][room_type]['train_objects'] 197 | 198 | self.h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], self.scene)), 'r') 199 | 200 | self.states = self.h5_file['locations'][()] 201 | self.graph = self.h5_file['graph'][()] 202 | self.scores = self.h5_file['resnet_scores'][()] if not arguments['yolo_gcn'] else self.h5_file['dump_features'][()][:, :-4].astype(bool).astype(int) 203 | self.all_visible_objects = self.h5_file['all_visible_objects'][()].tolist() 204 | self.visible_objects = self.h5_file['visible_objects'][()] 205 | self.observations = self.h5_file['observations'][()] 206 | 207 | self.resnet_features = self.h5_file['resnet_features'][()] 208 | self.dump_features = self.h5_file['dump_features'][()] 209 | 210 | 211 | if arguments['onehot']: 212 | self.features = self.dump_features 213 | else: 214 | self.features = self.resnet_features 215 | 216 | assert self.action_size <= self.graph.shape[1], "The number of actions exceeds the limit of environment." 
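        # Episode flow, as implemented by reset()/step() below (MultiSceneEnv mirrors
        # AI2ThorDumpEnv above, except that self.target is re-drawn from self.targets on
        # every reset). Illustrative usage only; the variable names are placeholders:
        #   state, score, target = env.reset()
        #   state, score, reward, done = env.step(action)
        # where `state` is the stacked image observations when arguments['train_cnn'] is set,
        # otherwise the stacked feature history, and `score` is the per-state score vector
        # (resnet_scores, or binarised dump_features when arguments['yolo_gcn'] is set).
        # With arguments['hard'], self.offset = 3 and a terminal state must match a target
        # location in its first three entries (position plus the rotation-like third entry
        # that reset() filters with % arguments['angle']); otherwise offset = 2 and only the
        # first two entries must match.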
217 | 218 | if "shortest" in self.h5_file.keys(): 219 | self.shortest = self.h5_file['shortest'][()] 220 | 221 | if self.arguments['hard']: 222 | # agent has to reach the correct position and has right rotation 223 | self.offset = 3 224 | else: 225 | # agent only has to reach the correct position 226 | self.offset = 2 227 | 228 | self.target = np.random.choice(self.targets) 229 | self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")] 230 | self.target_locs = set([tuple(self.states[idx][:self.offset]) for idx in self.target_ids]) 231 | 232 | self.action_space = self.action_size 233 | self.cv_action_onehot = np.identity(self.action_space) 234 | 235 | self.history_states = np.zeros((self.history_size, self.features.shape[1])) 236 | self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)] 237 | 238 | 239 | def step(self, action): 240 | ''' 241 | 0: move ahead 242 | 1: move back 243 | 2: rotate right 244 | 3: rotate left 245 | 4: look down 246 | 5: look up 247 | ''' 248 | 249 | if action >= self.action_space: 250 | raise error.InvalidAction('Action must be an integer between ' 251 | '0 and {}!'.format(self.action_space - 1)) 252 | k = self.current_state_id 253 | if self.graph[k][action] != -1: 254 | if action == 2 or action == 3: 255 | for _ in range(int(self.arguments['angle'] / 22.5)): 256 | self.current_state_id = int(self.graph[k][action]) 257 | else: 258 | self.current_state_id = int(self.graph[k][action]) 259 | 260 | if tuple(self.states[self.current_state_id][:self.offset]) in self.target_locs: 261 | self.terminal = True 262 | self.collided = False 263 | else: 264 | self.terminal = False 265 | self.collided = False 266 | else: 267 | self.terminal = False 268 | self.collided = True 269 | 270 | reward, done = self.transition_reward() 271 | 272 | self.update_states() 273 | 274 | if self.arguments['train_cnn']: 275 | return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], reward, done 276 | else: 277 | return self.history_states, self.scores[self.current_state_id], reward, done 278 | 279 | def transition_reward(self): 280 | reward = self.config['default_reward'] 281 | done = 0 282 | if self.terminal: 283 | reward = self.config['success_reward'] 284 | done = 1 285 | elif self.arguments['anti_col'] and self.collided: 286 | reward = self.config['collide_reward'] 287 | 288 | return reward, done 289 | 290 | def reset(self): 291 | self.target = np.random.choice(self.targets) 292 | self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")] 293 | self.target_locs = set([tuple(self.states[idx][:self.offset]) for idx in self.target_ids]) 294 | 295 | k = random.randrange(self.states.shape[0]) 296 | 297 | while self.states[k][2] % self.arguments['angle'] != 0.0: 298 | k = random.randrange(self.states.shape[0]) 299 | 300 | # reset parameters 301 | self.current_state_id = k 302 | 303 | self.update_states(reset=True) 304 | self.terminal = False 305 | self.collided = False 306 | 307 | if self.arguments['train_cnn']: 308 | return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], self.target 309 | else: 310 | return self.history_states, self.scores[self.current_state_id], self.target 311 | 312 | 313 | def update_states(self, reset=False): 314 | if reset: 315 | self.history_states = np.zeros((self.history_size, self.features.shape[1])) 316 | self.observations_stack = [np.zeros((3, 128, 
128)) for _ in range(self.history_size)] 317 | 318 | f = self.features[self.current_state_id] 319 | 320 | self.history_states = np.append(self.history_states[1:, :], f[np.newaxis, :], 0) 321 | 322 | self.observations_stack.append(self.observation()) 323 | self.observations_stack = self.observations_stack[1:] 324 | 325 | def state(self): 326 | return self.features[self.current_state_id] 327 | 328 | def observation(self): 329 | ob = self.observations[self.current_state_id] 330 | resized_ob = cv2.resize(ob, (128, 128)) 331 | return np.transpose(resized_ob, (2, 0, 1)) 332 | 333 | 334 | if __name__ == '__main__': 335 | AI2ThorEnv() 336 | -------------------------------------------------------------------------------- /images/1_GCN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_GCN.png -------------------------------------------------------------------------------- /images/1_easy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy.png -------------------------------------------------------------------------------- /images/1_easy_noGAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy_noGAE.png -------------------------------------------------------------------------------- /images/1_easy_noGAE_normalizeReward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy_noGAE_normalizeReward.png -------------------------------------------------------------------------------- /images/1_easy_noGAE_onehot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy_noGAE_onehot.png -------------------------------------------------------------------------------- /images/1_embed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_embed.png -------------------------------------------------------------------------------- /images/1_increaseLearningRate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_increaseLearningRate.png -------------------------------------------------------------------------------- /images/1_increase_entropy_penalty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_increase_entropy_penalty.png -------------------------------------------------------------------------------- /images/1_noGAE.png: 
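The two wrappers above (`AI2ThorDumpEnv` and `MultiSceneEnv`) are driven entirely by a pre-dumped HDF5 scene file (`locations`, `graph`, `observations`, `resnet_features`, `dump_features`, ...). A minimal interaction sketch, assuming such a dump exists and that `config.json` contains the keys read in `__init__` and `transition_reward()`; the concrete values below are placeholders, not the project's settings:

```python
import json
import numpy as np

from env.ai2thor_env import MultiSceneEnv

config = json.load(open("config.json"))   # needs dump_path, rooms, and the reward entries
arguments = {
    "history_size": 4,   # number of stacked feature frames
    "action_size": 4,    # move ahead/back, rotate right/left
    "angle": 45.0,       # rotation angle, a multiple of 22.5 degrees
    "test": 0, "hard": 0, "onehot": 1,
    "yolo_gcn": 0, "anti_col": 0, "train_cnn": 0,
}

env = MultiSceneEnv("FloorPlan1", config, arguments, seed=0)
state, score, target = env.reset()        # a new target object is sampled here
for _ in range(50):
    action = np.random.randint(arguments["action_size"])
    state, score, reward, done = env.step(action)
    if done:                              # reached a location where the target is visible
        state, score, target = env.reset()
```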
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_noGAE.png -------------------------------------------------------------------------------- /images/28_easy_noGAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/28_easy_noGAE.png -------------------------------------------------------------------------------- /images/All FloorPlan1_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan1_4.png -------------------------------------------------------------------------------- /images/All FloorPlan1_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan1_6.png -------------------------------------------------------------------------------- /images/All FloorPlan28_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan28_4.png -------------------------------------------------------------------------------- /images/All FloorPlan28_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan28_6.png -------------------------------------------------------------------------------- /images/All FloorPlan2_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan2_4.png -------------------------------------------------------------------------------- /images/All FloorPlan2_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan2_6.png -------------------------------------------------------------------------------- /images/Compare FloorPlan1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/Compare FloorPlan1.png -------------------------------------------------------------------------------- /images/Compare FloorPlan2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/Compare FloorPlan2.png -------------------------------------------------------------------------------- /images/Compare FloorPlan28.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/Compare FloorPlan28.png -------------------------------------------------------------------------------- /images/FloorPlan1_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan1_4.png -------------------------------------------------------------------------------- /images/FloorPlan1_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan1_6.png -------------------------------------------------------------------------------- /images/FloorPlan28_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan28_4.png -------------------------------------------------------------------------------- /images/FloorPlan28_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan28_6.png -------------------------------------------------------------------------------- /images/FloorPlan2_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan2_4.png -------------------------------------------------------------------------------- /images/sample_AI2THOR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/sample_AI2THOR.png -------------------------------------------------------------------------------- /keyboard_agent.py: -------------------------------------------------------------------------------- 1 | import ai2thor.controller 2 | import sys 3 | import numpy as np 4 | import h5py 5 | import click 6 | import json 7 | import pyglet 8 | 9 | from PIL import Image 10 | 11 | ALL_POSSIBLE_ACTIONS = [ 12 | 'MoveAhead', 13 | 'MoveBack', 14 | 'RotateRight', 15 | 'RotateLeft', 16 | # 'Stop' 17 | ] 18 | 19 | class SimpleImageViewer(object): 20 | 21 | def __init__(self, display=None): 22 | self.window = None 23 | self.isopen = False 24 | self.display = display 25 | 26 | def imshow(self, arr): 27 | if self.window is None: 28 | height, width, channels = arr.shape 29 | self.window = pyglet.window.Window(width=width, height=height, display=self.display, caption="THOR Browser") 30 | self.width = width 31 | self.height = height 32 | self.isopen = True 33 | 34 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 35 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 36 | self.window.clear() 37 | self.window.switch_to() 38 | self.window.dispatch_events() 39 | image.blit(0,0) 40 | self.window.flip() 41 | 42 | def close(self): 43 | if self.isopen: 44 | self.window.close() 45 | 
self.isopen = False 46 | 47 | def __del__(self): 48 | self.close() 49 | 50 | def run(file_name=None): 51 | # file_name = file_path.split('/')[-1].split('.')[0] 52 | controller = ai2thor.controller.Controller() 53 | controller.start() 54 | 55 | controller.reset("FloorPlan203") 56 | y_coord = 1.25 57 | event = controller.step(dict(action='Initialize', gridSize=0.5, cameraY=y_coord, visibilityDistance=1.0)) 58 | all_visible_objects = list(np.unique([obj['objectType'] for obj in event.metadata['objects']])) 59 | 60 | rotation = 0.0 61 | while True: # making a loop 62 | try: # used try so that if user pressed other than the given key error will not be shown 63 | key = click.getchar() 64 | if key =='a': # Rotate Left 65 | rotation -= 22.5 66 | if rotation < 0: 67 | rotation = rotation + 360 68 | event = controller.step(dict(action='Rotate', rotation=rotation)) 69 | elif key =='d': 70 | rotation += 22.5 71 | if rotation > 360: 72 | rotation = rotation - 360 73 | event = controller.step(dict(action='Rotate', rotation=rotation)) 74 | elif key =='w': 75 | event = controller.step(dict(action='MoveAhead')) 76 | elif key =='s': 77 | event = controller.step(dict(action='MoveBack')) 78 | elif key =='z': 79 | event = controller.step(dict(action='LookDown')) 80 | elif key =='x': 81 | event = controller.step(dict(action='LookUp')) 82 | elif key =='q': 83 | controller.stop() 84 | break 85 | elif key =='r': 86 | scene = input("Scene id: ") 87 | controller.reset('FloorPlan{}'.format(scene)) 88 | event = controller.step(dict(action='Initialize', gridSize=0.5, cameraY=y_coord)) 89 | else: 90 | print("Key not supported! Try a, d, w, s, q, r.") 91 | print((event.metadata['agent']['position']['x'], event.metadata['agent']['position']['z'], event.metadata['agent']['rotation'])) 92 | # print([(obj['objectType'], obj['distance']) for obj in event.metadata['objects'] if obj['visible']]) 93 | except: 94 | print("Key not supported! 
Try a, d, w, s, q, r.") 95 | 96 | 97 | def key_press(key, mod): 98 | global human_agent_action, human_wants_restart, stop_requested 99 | if key == ord('R') or key == ord('r'): # r/R 100 | human_wants_restart = True 101 | if key == ord('Q') or key == ord('q'): # q/Q 102 | stop_requested = True 103 | 104 | if key == 0xFF52: # move ahead 105 | human_agent_action = 0 106 | if key == 0xFF54: # move back 107 | human_agent_action = 1 108 | if key == 0xFF53: # turn right 109 | human_agent_action = 2 110 | if key == 0xFF51: # turn left 111 | human_agent_action = 3 112 | 113 | if key == ord('z'): # look down 114 | human_agent_action = 4 115 | if key == ord('x'): # look up 116 | human_agent_action = 5 117 | 118 | if __name__ == '__main__': 119 | 120 | # run() 121 | 122 | angle = 45.0 123 | 124 | human_agent_action = None 125 | human_wants_restart = False 126 | stop_requested = False 127 | next_position = None 128 | visible = None 129 | 130 | f = h5py.File('dumped/FloorPlan317.hdf5', "r") 131 | observations = f['observations'] 132 | graph = f['graph'] 133 | visible_objects = f['visible_objects'] 134 | dump_features = f['dump_features'] 135 | states = f['locations'][()] 136 | 137 | config = json.load(open('config.json')) 138 | categories = list(config['new_objects'].keys()) 139 | 140 | k = np.random.randint(0, observations.shape[0]) 141 | while states[k][2] % angle != 0.0: 142 | k = np.random.randint(0, observations.shape[0]) 143 | current_position = k 144 | 145 | viewer = SimpleImageViewer() 146 | viewer.imshow(observations[current_position].astype(np.uint8)) 147 | viewer.window.on_key_press = key_press 148 | 149 | print("Use arrow keys to move the agent.") 150 | print("Press R to reset agent\'s location.") 151 | print("Press Q to quit.") 152 | 153 | while True: 154 | # waiting for keyboard input 155 | if human_agent_action is not None: 156 | # move actions 157 | if human_agent_action == 2 or human_agent_action == 3: 158 | next_position = current_position 159 | for _ in range(int(angle/ 22.5)): 160 | next_position = graph[next_position][human_agent_action] 161 | else: 162 | next_position = graph[current_position][human_agent_action] 163 | 164 | current_position = next_position if next_position != -1 else current_position 165 | distances = [(categories[i], dump_features[current_position][i]) for i in list(np.where(dump_features[current_position][:-4] > 0)[0])] 166 | print(distances, dump_features[current_position][-4:]) 167 | visible = visible_objects[current_position].split(',') 168 | human_agent_action = None 169 | 170 | # waiting for reset command 171 | if human_wants_restart: 172 | # reset agent to random location 173 | k = np.random.randint(0, observations.shape[0]) 174 | while states[k][2] % angle != 0.0: 175 | k = np.random.randint(0, observations.shape[0]) 176 | current_position = k 177 | 178 | human_wants_restart = False 179 | 180 | # check collision 181 | if next_position == -1: 182 | print('Collision occurs.') 183 | 184 | # check quit command 185 | if stop_requested: break 186 | 187 | viewer.imshow(observations[current_position].astype(np.uint8)) 188 | if visible is not None and len(list(visible)) > 0: 189 | print("Visible: {}".format(visible)) 190 | visible = None 191 | 192 | print("Goodbye.") -------------------------------------------------------------------------------- /pytorch_a3c/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pytorch_a3c/README.md: -------------------------------------------------------------------------------- 1 | # File Descriptions 2 | 3 | - `main.py`: entry point; please read the arguments and their corresponding descriptions before running 4 | - `train.py`: training code; each worker is launched from the main thread as an independent process (A3C); supports multiple GPUs 5 | - `test.py`: evaluates a trained model 6 | - `dumping.py`: runs the ai2thor controller and dumps the needed information to hdf5 files (note that the number of actions is 4 and the rotation angle is 22.5 degrees by default) 7 | - `layers.py`: contains the GCN code 8 | - `utils.py`: helper functions used across the other modules 9 | - `keyboard_agent.py`: interactive agent for navigating a dumped scene file with the keyboard 10 | - `config.json`: configuration, including the rewarding scheme, file paths and data split 11 | - `env/ai2thor_env.py`: the environment wrappers 12 | -------------------------------------------------------------------------------- /pytorch_a3c/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn.parameter import Parameter 4 | from torch.nn.modules.module import Module 5 | 6 | 7 | class GraphConvolution(Module): 8 | """ 9 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 10 | """ 11 | 12 | def __init__(self, in_features, out_features, bias=True): 13 | super(GraphConvolution, self).__init__() 14 | self.in_features = in_features 15 | self.out_features = out_features 16 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 17 | if bias: 18 | self.bias = Parameter(torch.FloatTensor(out_features)) 19 | else: 20 | self.register_parameter('bias', None) 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | stdv = 1.
/ math.sqrt(self.weight.size(1)) 25 | self.weight.data.uniform_(-stdv, stdv) 26 | if self.bias is not None: 27 | self.bias.data.uniform_(-stdv, stdv) 28 | 29 | def forward(self, input, adj): 30 | support = torch.mm(input, self.weight) 31 | output = torch.spmm(adj, support) 32 | if self.bias is not None: 33 | return output + self.bias 34 | else: 35 | return output 36 | 37 | def __repr__(self): 38 | return self.__class__.__name__ + ' (' \ 39 | + str(self.in_features) + ' -> ' \ 40 | + str(self.out_features) + ')' 41 | -------------------------------------------------------------------------------- /pytorch_a3c/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from: https://github.com/ikostrikov/pytorch-a3c/blob/master/main.py 3 | The main file needed within a3c. Runs the train and test functions from their respective files. 4 | Example of use: 5 | `cd pytorch_a3c` 6 | `python main.py --about <run_name>` 7 | 8 | Runs A3C on the dumped AI2-THOR environment wrappers with the default parameters 9 | (20 worker processes by default). 10 | """ 11 | import sys 12 | import argparse 13 | import os 14 | import numpy as np 15 | import torch 16 | import torch.multiprocessing as mp 17 | from multiprocessing import Manager 18 | import json 19 | import h5py 20 | 21 | sys.path.append('..') # to access env package 22 | 23 | from env.ai2thor_env import AI2ThorDumpEnv 24 | from optimizers import SharedAdam, SharedRMSprop 25 | from model import ActorCritic 26 | from test import test, live_test, test_multi 27 | from train import train, train_multi 28 | 29 | # Based on 30 | # https://github.com/pytorch/examples/tree/master/mnist_hogwild 31 | # Training settings 32 | parser = argparse.ArgumentParser(description='A3C') 33 | parser.add_argument('--about', type=str, default="training A3C", required=True, 34 | help='description of this training run, also used as the name of the \ 35 | saving directory under training-history/') 36 | parser.add_argument('--lr', type=float, default=7e-4, 37 | help='learning rate (default: 7e-4)') 38 | parser.add_argument('--angle', type=float, default=45.0, 39 | help='rotation angle in degrees (default: 45.0)') 40 | parser.add_argument('--gamma', type=float, default=0.99, 41 | help='discount factor for rewards (default: 0.99)') 42 | parser.add_argument('--tau', type=float, default=0.96, 43 | help='parameter for GAE (default: 0.96)') 44 | parser.add_argument('--ec', type=float, default=0.01, 45 | help='entropy term coefficient (default: 0.01)') 46 | parser.add_argument('--vc', type=float, default=0.5, 47 | help='value loss coefficient (default: 0.5)') 48 | parser.add_argument('--max_grad_norm', type=float, default=10, 49 | help='max norm for gradient clipping (default: 10)') 50 | parser.add_argument('--lr_decay', type=int, default=0, 51 | help='whether to use learning rate decay') 52 | parser.add_argument('--seed', type=int, default=1, 53 | help='random seed (default: 1)') 54 | parser.add_argument('--room_id', type=int, default=0, 55 | help='room id (default: 0)') 56 | parser.add_argument('--test', type=int, default=0, 57 | help='whether to activate testing phase') 58 | parser.add_argument('--live_test', type=int, default=0, 59 | help='whether to activate live testing phase') 60 | parser.add_argument('--action_size', type=int, default=4, 61 | help='number of possible actions') 62 | parser.add_argument('--num_processes', type=int, default=20, 63 | help='how many training processes to use (default: 20)') 64 |
parser.add_argument('--num_iters', type=int, default=100, 65 | help='number of forward steps in A3C (default: 20)') 66 | parser.add_argument('--num_epochs', type=int, default=20000, 67 | help='number of epochs to run on each thread') 68 | parser.add_argument('--max_episode_length', type=int, default=1000, 69 | help='maximum length of an episode (default: 1000)') 70 | parser.add_argument('--siamese', type=int, default=0) 71 | parser.add_argument('--train_cnn', type=int, default=0, 72 | help='whether to re-train cnn module') 73 | parser.add_argument('--history_size', type=int, default=4, 74 | help='whether to stack frames') 75 | parser.add_argument('--optim', type=int, default=1, 76 | help='optimizer: 0 for Adam, 1 for RMSprop') 77 | parser.add_argument('--multi_scene', type=int, default=1, 78 | help='whether to train on multiple scenes') 79 | parser.add_argument('--lstm', type=int, default=0, 80 | help='whether to use lstm instead of stacking features') 81 | parser.add_argument('--onehot', type=int, default=1, 82 | help='whether to use onehot vector as input feature') 83 | parser.add_argument('--embed', type=int, default=1, 84 | help='embedding mode: 0 for onehot, 1 for fasttext') 85 | parser.add_argument('--random_test', type=int, default=0, 86 | help='whether to test performance of a random agent') 87 | parser.add_argument('--use_gcn', type=int, default=0, 88 | help='whether to include gcn') 89 | parser.add_argument('--anti_col', type=int, default=0, 90 | help='whether to include collision penalty to rewarding scheme') 91 | parser.add_argument('--use_graph', type=int, default=0, 92 | help='whether to use relations vector from graph to replace gcn') 93 | parser.add_argument('--yolo_gcn', type=int, default=0, 94 | help='whether to use yolo as input for gcn instead of resnet') 95 | parser.add_argument('--no_shared', type=int, default=0, 96 | help='use an optimizer without shared momentum.') 97 | parser.add_argument('--scene_id', type=int, default=1, 98 | help='scene id (default: 1)') 99 | parser.add_argument('--gpu_ids', type=int, default=-1, 100 | nargs='+', help='GPUs to use [-1 CPU only] (default: -1)') 101 | parser.add_argument('--hard', type=int, default=1, 102 | help='whether to make environment harder\ 103 | 0: agent only has to reach the correct position\ 104 | 1: agent has to reach the correct position and has right rotation') 105 | 106 | parser.add_argument('--config_file', type=str, default="../config.json") 107 | parser.add_argument('--folder', type=str, default=None) 108 | 109 | ALL_ROOMS = { 110 | 0: "Kitchens", 111 | 1: "Living Rooms", 112 | 2: "Bedrooms", 113 | 3: "Bathrooms" 114 | } 115 | 116 | def read_config(config_path): 117 | if os.path.isfile(config_path): 118 | with open(config_path) as f: 119 | config = json.load(f) 120 | return config 121 | 122 | def read_weights(folder): 123 | weights = [f for f in os.listdir(folder) if f.endswith('.pth')] 124 | histories = [f for f in os.listdir(folder) if f.endswith('.pkl')] 125 | 126 | arguments = json.load(open(folder + '/arguments.json')) 127 | if arguments['multi_scene']: 128 | scenes = list(set([f.split('_')[0] for f in os.listdir(folder) if f.endswith('.pkl')])) 129 | targets = [] 130 | else: 131 | scenes = ["FloorPlan{}".format(arguments['scene_id'])] 132 | targets = list(set([f.split('_')[1] for f in os.listdir(folder) if f.endswith('.pkl')])) 133 | 134 | print(list(zip(range(len(weights)), weights))) 135 | wid = input("Please specify weights: ") 136 | weights = weights[int(wid)] 137 | 138 | return os.path.join(folder, 
weights), arguments, {'scenes': scenes, 'targets': targets} 139 | 140 | if __name__ == '__main__': 141 | os.environ['OMP_NUM_THREADS'] = '1' 142 | args = parser.parse_args() 143 | torch.manual_seed(args.seed) 144 | if args.gpu_ids == -1: 145 | args.gpu_ids = [-1] 146 | else: 147 | torch.cuda.manual_seed(args.seed) 148 | 149 | config = read_config(args.config_file) 150 | # room = config['rooms'][ALL_ROOMS[args.room_id]] 151 | 152 | if args.folder is not None: 153 | weights, arguments, info = read_weights(args.folder) 154 | 155 | multi_scene = len(info['scenes']) > 1 156 | 157 | if args.test or args.live_test: 158 | if not args.random_test: 159 | shared_model = ActorCritic(config, arguments) 160 | shared_model.share_memory() 161 | 162 | shared_model.load_state_dict(torch.load(weights, map_location='cpu')) 163 | print("loaded model") 164 | else: 165 | print("testing random agent ..") 166 | shared_model = None 167 | 168 | if args.live_test: 169 | assert args.folder is not None 170 | print("Start testing ..") 171 | 172 | if multi_scene: 173 | print(list(zip(range(len(info['scenes'])), info['scenes']))) 174 | command = int(input("Please specify scene id. \nYour input:")) 175 | training_scene = info['scenes'][command] 176 | 177 | f = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], training_scene)), 'r') 178 | # training_objects = f['all_visible_objects'][()].tolist() 179 | training_objects = ["GarbageCan", "Sink", "Bread", "StoveKnob", "SinkBasin", "StoveBurner", "Fridge", "CounterTop", "Microwave", "LightSwitch", "CoffeeMachine", "Cabinet"] 180 | f.close() 181 | else: 182 | training_scene = info['scenes'][0] 183 | print(list(zip(range(len(info['targets'])), info['targets']))) 184 | command = input("Please specify target ids, you can choose either individually (e.g: 0,1,2) or by range (e.g: 0-4)\nYour input:") 185 | 186 | if '-' not in command: 187 | target_ids = [int(i.strip()) for i in command.split(",")] 188 | else: 189 | target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1)) 190 | 191 | training_objects = [info['targets'][target_id] for target_id in target_ids] 192 | 193 | training_objects.sort() 194 | live_test(training_scene, training_objects, shared_model, config, arguments) 195 | 196 | else: 197 | if args.test: 198 | assert args.folder is not None 199 | if not multi_scene: 200 | 201 | training_scene = info['scenes'][0] 202 | testing_objects = config["picked"][training_scene]['test'] 203 | training_objects = info['targets'] 204 | all_visible_objects = training_objects + testing_objects 205 | 206 | phase = ['train'] * len(training_objects) + ['test'] * len(testing_objects) 207 | 208 | print(list(zip(range(len(phase)), all_visible_objects, phase))) 209 | command = input("Please specify target ids, you can choose either individually (e.g: 0,1,2) or by range (e.g: 0-4)\nYour input:") 210 | if '-' not in command: 211 | target_ids = [int(i.strip()) for i in command.split(",")] 212 | else: 213 | target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1)) 214 | 215 | chosen_objects = [all_visible_objects[target_id] for target_id in target_ids] 216 | check_phase = lambda c: 'train' if os.path.isfile(os.path.join(args.folder, "net_{}.pth".format(c))) else 'test' 217 | chosen_phases = [check_phase(c) for c in chosen_objects] 218 | 219 | results = mp.Array('f', len(chosen_objects)) 220 | processes = [] 221 | for rank, obj in enumerate(chosen_objects): 222 | p = mp.Process(target=test, args=(training_scene, obj, rank, shared_model, \ 
223 | results, config, arguments)) 224 | p.start() 225 | processes.append(p) 226 | 227 | for p in processes: 228 | p.join() 229 | 230 | print("Testing accuracies:", list(zip(chosen_objects, chosen_phases, results[:]))) 231 | 232 | else: 233 | arguments['test'] = 1 234 | 235 | print(list(zip(range(len(ALL_ROOMS)), list(ALL_ROOMS.values())))) 236 | command = input("Please specify room type:") 237 | scene_type = ALL_ROOMS[int(command)] 238 | 239 | training_scenes = config['rooms'][scene_type]['train_scenes'] 240 | testing_scenes = config['rooms'][scene_type]['test_scenes'] 241 | 242 | command = input("Training/testing scenes. (0, 1): ") 243 | scenes = [training_scenes, testing_scenes][int(command)] 244 | 245 | results = Manager().dict() 246 | 247 | all_visible_objects = config['rooms'][scene_type]['train_objects'] + config['rooms'][scene_type]['test_objects'] 248 | chosen_phases = ['train'] * len(config['rooms'][scene_type]['train_objects']) + ['test'] * len(config['rooms'][scene_type]['test_objects']) 249 | for obj in all_visible_objects: 250 | results[obj] = [] 251 | 252 | processes = [] 253 | 254 | counter = mp.Value('i', 0) 255 | lock = mp.Lock() 256 | 257 | for rank in range(0, len(scenes)): 258 | p = mp.Process(target=test_multi, args=(scenes[rank], rank, shared_model, \ 259 | results, config, arguments)) 260 | p.start() 261 | processes.append(p) 262 | 263 | for p in processes: 264 | p.join() 265 | 266 | accuracies = [] 267 | avg_sc = {'train': [], 'test': []} 268 | avg_spl = {'train': [], 'test': []} 269 | for obj in all_visible_objects: 270 | accuracies.append((np.mean(results[obj]), np.mean(np.array(results[obj], dtype=bool)))) 271 | 272 | for phase, acc in zip(chosen_phases, accuracies): 273 | avg_spl[phase].append(acc[0]) 274 | avg_sc[phase].append(acc[1]) 275 | 276 | avg_sc['train'] = np.mean(avg_sc['train']) 277 | avg_sc['test'] = np.mean(avg_sc['test']) 278 | avg_spl['train'] = np.mean(avg_spl['train']) 279 | avg_spl['test'] = np.mean(avg_spl['test']) 280 | 281 | print("Accuracies:", list(zip(all_visible_objects, chosen_phases, accuracies))) 282 | print("[Avergae] SPL: {} | SR: {}".format(avg_spl, avg_sc)) 283 | 284 | else: 285 | arguments = vars(args) 286 | weights = None 287 | 288 | if not arguments['multi_scene']: 289 | 290 | if not os.path.isdir("training-history/{}".format(arguments['about'])): 291 | os.mkdir("training-history/{}".format(arguments['about'])) 292 | 293 | with open('training-history/{}/arguments.json'.format(arguments['about']), 'w') as outfile: 294 | json.dump(arguments, outfile) 295 | 296 | training_scene = "FloorPlan{}".format(arguments['scene_id']) 297 | f = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], training_scene)), 'r') 298 | all_visible_objects = f['all_visible_objects'][()].tolist() 299 | f.close() 300 | 301 | testing_objects = config["picked"][training_scene]['test'] 302 | trainable_objects = list(set(all_visible_objects) - set(testing_objects)) 303 | 304 | print(list(zip(range(len(trainable_objects)), trainable_objects))) 305 | command = input("Please specify target ids, you can choose either individually (e.g: 0,1,2) or by range (e.g: 0-4)\nYour input:") 306 | if '-' not in command: 307 | target_ids = [int(i.strip()) for i in command.split(",")] 308 | else: 309 | target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1)) 310 | 311 | training_objects = [trainable_objects[target_id] for target_id in target_ids] 312 | num_thread_each = arguments['num_processes'] // len(training_objects) 313 | object_threads 
= [] 314 | 315 | for obj in training_objects: 316 | object_threads += [obj] * num_thread_each 317 | 318 | object_threads += [np.random.choice(training_objects)] * (arguments['num_processes'] - len(object_threads)) 319 | 320 | print("Start training agent to find {} in {}".format(training_objects, training_scene)) 321 | 322 | shared_model = ActorCritic(config, arguments) 323 | shared_model.share_memory() 324 | 325 | if weights is not None: 326 | shared_model.load_state_dict(torch.load(weights, map_location='cpu')) 327 | print("loaded model") 328 | 329 | scheduler = None 330 | if arguments['no_shared']: 331 | optimizer = None 332 | else: 333 | if arguments['optim'] == 0: 334 | optimizer = SharedAdam(shared_model.parameters(), lr=arguments['lr']) 335 | else: 336 | optimizer = SharedRMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1) 337 | 338 | optimizer.share_memory() 339 | if arguments['lr_decay']: 340 | scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.99995) 341 | 342 | processes = [] 343 | 344 | counter = mp.Value('i', 0) 345 | lock = mp.Lock() 346 | 347 | for rank in range(0, arguments['num_processes']): 348 | p = mp.Process(target=train, args=(training_scene, object_threads[rank], rank, shared_model, \ 349 | scheduler, counter, lock, config, arguments, optimizer)) 350 | p.start() 351 | processes.append(p) 352 | 353 | for p in processes: 354 | p.join() 355 | 356 | else: 357 | 358 | print(list(zip(range(len(ALL_ROOMS)), list(ALL_ROOMS.values())))) 359 | command = input("Please specify room type:") 360 | scene_type = ALL_ROOMS[int(command)] 361 | 362 | training_scenes = config['rooms'][scene_type]['train_scenes'] 363 | num_thread_each = arguments['num_processes'] // len(training_scenes) 364 | scene_threads = [] 365 | 366 | for s in training_scenes: 367 | scene_threads += [s] * num_thread_each 368 | 369 | scene_threads += list(np.random.choice(training_scenes, arguments['num_processes'] - len(scene_threads))) 370 | 371 | if not os.path.isdir("training-history/{}".format(arguments['about'])): 372 | os.mkdir("training-history/{}".format(arguments['about'])) 373 | 374 | with open('training-history/{}/arguments.json'.format(arguments['about']), 'w') as outfile: 375 | json.dump(arguments, outfile) 376 | 377 | print("Start training agent in {}".format(training_scenes)) 378 | 379 | shared_model = ActorCritic(config, arguments) 380 | shared_model.share_memory() 381 | 382 | if weights is not None: 383 | shared_model.load_state_dict(torch.load(weights, map_location='cpu')) 384 | print("loaded model") 385 | 386 | scheduler = None 387 | if arguments['no_shared']: 388 | optimizer = None 389 | else: 390 | if arguments['optim'] == 0: 391 | optimizer = SharedAdam(shared_model.parameters(), lr=arguments['lr']) 392 | else: 393 | optimizer = SharedRMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1) 394 | 395 | optimizer.share_memory() 396 | if arguments['lr_decay']: 397 | scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.99995) 398 | 399 | processes = [] 400 | 401 | counter = mp.Value('i', 0) 402 | lock = mp.Lock() 403 | 404 | for rank in range(0, arguments['num_processes']): 405 | p = mp.Process(target=train_multi, args=(scene_threads[rank], rank, shared_model, \ 406 | scheduler, counter, lock, config, arguments, optimizer)) 407 | p.start() 408 | processes.append(p) 409 | 410 | for p in processes: 411 | p.join() 412 | 413 | -------------------------------------------------------------------------------- /pytorch_a3c/model.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torchvision.models as models 7 | from torch.autograd import Variable 8 | from layers import GraphConvolution 9 | from utils import * 10 | 11 | class GCN(nn.Module): 12 | def __init__(self, nfeat, nhid, nclass, dropout): 13 | super(GCN, self).__init__() 14 | 15 | self.gc1 = GraphConvolution(nfeat, nhid) 16 | self.gc2 = GraphConvolution(nhid, nhid) 17 | self.gc3 = GraphConvolution(nhid, nclass) 18 | self.dropout = dropout 19 | 20 | def forward(self, x, adj): 21 | x = F.relu(self.gc1(x, adj)) 22 | x = F.dropout(x, self.dropout, training=self.training) 23 | x = F.relu(self.gc2(x, adj)) 24 | x = F.dropout(x, self.dropout, training=self.training) 25 | x = F.relu(self.gc3(x, adj)) 26 | return x 27 | 28 | class ActorCritic(torch.nn.Module): 29 | 30 | def __init__(self, config, arguments, gpu_id=-1): 31 | super(ActorCritic, self).__init__() 32 | 33 | self.config = config 34 | self.arguments = arguments 35 | 36 | if gpu_id != -1: 37 | torch.cuda.set_device(gpu_id) 38 | self.dtype = torch.cuda.FloatTensor 39 | else: 40 | self.dtype = torch.FloatTensor 41 | try: 42 | self.use_lstm = arguments['lstm'] 43 | except KeyError: 44 | self.use_lstm = False 45 | 46 | self.history_size = arguments['history_size'] 47 | 48 | self.input_size = 2048 49 | 50 | if arguments['onehot']: 51 | self.input_size = 109 52 | 53 | if arguments['train_cnn']: 54 | self.conv1 = nn.Conv2d(3, 32, 3, stride=2, padding=1) 55 | self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 56 | self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 57 | self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 58 | self.conv5 = nn.Conv2d(32, 32, 3, stride=2, padding=1) 59 | self.input_size = 32 * 4 * 4 60 | 61 | if self.use_lstm: 62 | assert arguments['history_size'] == 1, "History size should be 1 if you want to use lstm." 
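The `GCN` above stacks three of the `GraphConvolution` layers from `layers.py`; each layer computes `adj @ (input @ W) + b`, the propagation rule the layer docstring attributes to Kipf & Welling (2017). A small shape-check sketch with made-up sizes (the node count and feature widths here are illustrative, not the model's actual configuration):

```python
import torch
import torch.nn.functional as F

from layers import GraphConvolution

num_nodes, in_feat = 105, 1024            # illustrative: e.g. one node per object category
adj = torch.eye(num_nodes).to_sparse()    # placeholder normalized adjacency
x = torch.randn(num_nodes, in_feat)       # per-node input features

gc1 = GraphConvolution(in_feat, 512)
gc2 = GraphConvolution(512, 1)

h = F.relu(gc1(x, adj))                   # (105, 512)
out = gc2(h, adj)                         # (105, 1); ActorCritic flattens the per-node outputs into one vector
print(h.shape, out.shape)
```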
63 | self.visual_ft = nn.LSTMCell(input_size=self.input_size, hidden_size=256) 64 | else: 65 | self.visual_ft = nn.Linear(in_features=self.input_size * self.history_size, out_features=512) 66 | 67 | if arguments["embed"] == 0: 68 | self.embeddings = pickle.load(open(config["embeddings_onehot"], 'rb')) 69 | else: 70 | self.embeddings = pickle.load(open(config["embeddings_fasttext"], 'rb')) 71 | 72 | self.semantic_size = list(self.embeddings.values())[0].shape[0] 73 | self.semantic_ft = nn.Linear(in_features=self.semantic_size, out_features=512) 74 | 75 | self.categories = list(config['new_objects'].keys()) 76 | self.cate2idx = config['new_objects'] 77 | self.num_objects = len(self.categories) 78 | 79 | self.all_embeddings = torch.stack([torch.from_numpy(self.embeddings[w]) for w in self.categories], 0).type(self.dtype) 80 | 81 | if arguments['use_gcn']: 82 | self.categories = list(config['new_objects'].keys()) 83 | self.num_objects = len(self.categories) 84 | 85 | fused_size = 512 * 3 86 | self.adj = normalize(np.load(self.config['adj_file'])) 87 | self.adj = torch.from_numpy(self.adj).type(self.dtype) 88 | 89 | if not arguments['yolo_gcn']: 90 | self.score_to_512 = nn.Linear(in_features=1000, out_features=512) 91 | self.gcn = GCN(nfeat=1024, nhid=1024, nclass=1, dropout=0.5) 92 | else: 93 | self.gcn = GCN(nfeat=self.num_objects + 512, nhid=self.num_objects + 512, nclass=1, dropout=0.5) 94 | 95 | self.gcn_to_512 = nn.Linear(in_features=self.num_objects, out_features=512) 96 | 97 | elif arguments['use_graph']: 98 | self.adj = np.load(self.config['adj_file']) 99 | self.adj = torch.from_numpy(self.adj).type(self.dtype) 100 | 101 | self.graph_ft = nn.Linear(in_features=self.num_objects, out_features=self.num_objects) 102 | fused_size = 512 * 2 + self.num_objects 103 | else: 104 | fused_size = 512 * 2 105 | 106 | self.hidden_mlp = nn.Linear(in_features=fused_size, out_features=512) 107 | self.critic_linear = nn.Linear(512, 1) 108 | self.actor_linear = nn.Linear(512, arguments['action_size']) 109 | 110 | self.apply(kaiming_weights_init) 111 | self.actor_linear.weight.data = normalized_columns_initializer( 112 | self.actor_linear.weight.data, 0.01) 113 | self.actor_linear.bias.data.fill_(0) 114 | self.critic_linear.weight.data = normalized_columns_initializer( 115 | self.critic_linear.weight.data, 1.0) 116 | self.critic_linear.bias.data.fill_(0) 117 | 118 | def learned_embedding(self, word): 119 | embeded = torch.from_numpy(self.embeddings[word]).type(self.dtype) 120 | embeded = embeded.view(1, embeded.size(0)) 121 | semantic = F.relu(self.semantic_ft(embeded)) 122 | 123 | # if self.arguments['use_graph']: 124 | # relations = self.adj[self.cate2idx[word]] 125 | # r = F.relu(self.graph_ft(relations)) 126 | # r = r.view(1, r.numel()) 127 | # joint_embeddings = torch.cat((semantic, r), 1) 128 | # return joint_embeddings 129 | 130 | return semantic 131 | 132 | 133 | def forward(self, inputs, scores, word): 134 | if self.arguments['lstm']: 135 | inputs, (hx, cx) = inputs 136 | 137 | if self.arguments['train_cnn']: 138 | assert inputs.shape == (self.history_size, 3, 128, 128) 139 | inputs = torch.from_numpy(inputs).type(self.dtype) 140 | x = F.elu(self.conv1(inputs)) 141 | x = F.elu(self.conv2(x)) 142 | x = F.elu(self.conv3(x)) 143 | x = F.elu(self.conv4(x)) 144 | x = F.elu(self.conv5(x)) 145 | feature = x.view(-1, self.input_size * self.history_size) 146 | visual = F.relu(self.visual_ft(feature)) 147 | 148 | else: 149 | torch_inputs = [torch.from_numpy(inp).type(self.dtype) for inp in inputs] 150 | 151 
| if not self.use_lstm: 152 | joint_features = torch.cat(torch_inputs) 153 | joint_features = joint_features.view(1, -1) 154 | visual = F.relu(self.visual_ft(joint_features)) 155 | else: 156 | feature = torch_inputs[0].view(-1, self.input_size) 157 | hx, cx = self.visual_ft(feature, (hx, cx)) 158 | visual = hx.view(1, -1) 159 | 160 | embeded = torch.from_numpy(self.embeddings[word]).type(self.dtype) 161 | embeded = embeded.view(1, embeded.size(0)) 162 | semantic = F.relu(self.semantic_ft(embeded)) 163 | 164 | if self.arguments['use_gcn']: 165 | scores = torch.from_numpy(scores).type(self.dtype) 166 | scores = scores.view(1, scores.numel()) 167 | 168 | if not self.arguments['yolo_gcn']: 169 | scores_512 = F.relu(self.score_to_512(scores)) 170 | 171 | nodes = [] 172 | ems_512 = F.relu(self.semantic_ft(self.all_embeddings)) 173 | 174 | if not self.arguments['yolo_gcn']: 175 | nodes = torch.cat((scores_512.repeat(self.num_objects, 1), ems_512), 1) 176 | else: 177 | nodes = torch.cat((scores.repeat(self.num_objects, 1), ems_512), 1) 178 | 179 | gcn_out = self.gcn(nodes, self.adj) 180 | gcn_out = gcn_out.view(1, gcn_out.numel()) 181 | gcn_512 = F.relu(self.gcn_to_512(gcn_out)) 182 | 183 | joint_embeddings = torch.cat((visual, semantic, gcn_512), 1) 184 | 185 | elif self.arguments['use_graph']: 186 | # relations = self.adj[self.cate2idx[word]].numpy() 187 | # detections = inputs[0][:-4].astype(bool).astype(int) 188 | # revec = torch.from_numpy(relations * detections).type(self.dtype) 189 | revec = self.adj[self.cate2idx[word]] 190 | r = F.relu(self.graph_ft(revec)) 191 | r = r.view(1, r.numel()) 192 | joint_embeddings = torch.cat((visual, semantic, r), 1) 193 | 194 | else: 195 | joint_embeddings = torch.cat((visual, semantic), 1) 196 | 197 | x = self.hidden_mlp(joint_embeddings) 198 | x = F.relu(x) 199 | 200 | if self.arguments['lstm']: 201 | return self.critic_linear(x), self.actor_linear(x), (hx, cx) 202 | else: 203 | return self.critic_linear(x), self.actor_linear(x) 204 | -------------------------------------------------------------------------------- /pytorch_a3c/optimizers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from: https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py 3 | 4 | In the original A3C paper (https://arxiv.org/abs/1602.01783), the authors compared 3 different 5 | optimizers i.e. Momentum SGD, RMSProp and Shared RMSProp (check final part of section 4). The 6 | difference between the 3rd compared to the 2nd is whether to compute shared statistics across all 7 | threads, which was found to be more robust. It seems the equivalent was implemented for Adam 8 | below. 9 | """ 10 | 11 | import math 12 | 13 | import torch 14 | import torch.optim as optim 15 | 16 | 17 | class SharedAdam(optim.Adam): 18 | """Implements Adam algorithm with shared states. 
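A minimal sketch of the shared-statistics pattern described in the module docstring of `optimizers.py`, using a toy linear model; the worker function and dimensions are hypothetical, but `main.py` and `train.py` wire the real `ActorCritic` up in the same way (shared parameters, local gradients copied into them, one shared optimizer state):

```python
import torch
import torch.nn as nn
import torch.multiprocessing as mp

from optimizers import SharedAdam

def worker(shared_model, optimizer):
    # each process keeps a local copy, but gradients are pushed into the shared params
    local = nn.Linear(8, 2)
    local.load_state_dict(shared_model.state_dict())
    loss = local(torch.randn(4, 8)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    for p, sp in zip(local.parameters(), shared_model.parameters()):
        sp._grad = p.grad
    optimizer.step()          # exp_avg / exp_avg_sq live in shared memory

if __name__ == '__main__':
    shared_model = nn.Linear(8, 2)
    shared_model.share_memory()
    optimizer = SharedAdam(shared_model.parameters(), lr=1e-3)
    optimizer.share_memory()  # every process now updates the same moment estimates
    workers = [mp.Process(target=worker, args=(shared_model, optimizer)) for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
```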
19 | """ 20 | 21 | def __init__(self, 22 | params, 23 | lr=1e-3, 24 | betas=(0.9, 0.999), 25 | eps=1e-8, 26 | weight_decay=0): 27 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 28 | 29 | for group in self.param_groups: 30 | for p in group['params']: 31 | state = self.state[p] 32 | state['step'] = torch.zeros(1) 33 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 34 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 35 | 36 | def share_memory(self): 37 | for group in self.param_groups: 38 | for p in group['params']: 39 | state = self.state[p] 40 | state['step'].share_memory_() 41 | state['exp_avg'].share_memory_() 42 | state['exp_avg_sq'].share_memory_() 43 | 44 | def step(self, closure=None): 45 | """Performs a single optimization step. 46 | Arguments: 47 | closure (callable, optional): A closure that reevaluates the model 48 | and returns the loss. 49 | """ 50 | loss = None 51 | if closure is not None: 52 | loss = closure() 53 | 54 | for group in self.param_groups: 55 | for p in group['params']: 56 | if p.grad is None: 57 | continue 58 | grad = p.grad.data 59 | state = self.state[p] 60 | 61 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 62 | beta1, beta2 = group['betas'] 63 | 64 | state['step'] += 1 65 | 66 | if group['weight_decay'] != 0: 67 | grad = grad.add(group['weight_decay'], p.data) 68 | 69 | # Decay the first and second moment running average coefficient 70 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 71 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 72 | 73 | denom = exp_avg_sq.sqrt().add_(group['eps']) 74 | 75 | bias_correction1 = 1 - beta1 ** state['step'].item() 76 | bias_correction2 = 1 - beta2 ** state['step'].item() 77 | step_size = group['lr'] * math.sqrt( 78 | bias_correction2) / bias_correction1 79 | 80 | p.data.addcdiv_(-step_size, exp_avg, denom) 81 | 82 | return loss 83 | 84 | # Non-centered RMSprop update with shared statistics (without momentum) 85 | class SharedRMSprop(optim.RMSprop): 86 | """Implements RMSprop algorithm with shared states. 87 | """ 88 | 89 | def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0): 90 | super(SharedRMSprop, self).__init__(params, lr=lr, alpha=alpha, eps=eps, weight_decay=weight_decay, momentum=0, centered=False) 91 | 92 | # State initialisation (must be done before step, else will not be shared between threads) 93 | for group in self.param_groups: 94 | for p in group['params']: 95 | state = self.state[p] 96 | state['step'] = p.data.new().resize_(1).zero_() 97 | state['square_avg'] = p.data.new().resize_as_(p.data).zero_() 98 | 99 | def share_memory(self): 100 | for group in self.param_groups: 101 | for p in group['params']: 102 | state = self.state[p] 103 | state['step'].share_memory_() 104 | state['square_avg'].share_memory_() 105 | 106 | def step(self, closure=None): 107 | """Performs a single optimization step. 108 | Arguments: 109 | closure (callable, optional): A closure that reevaluates the model 110 | and returns the loss. 
111 | """ 112 | loss = None 113 | if closure is not None: 114 | loss = closure() 115 | 116 | for group in self.param_groups: 117 | for p in group['params']: 118 | if p.grad is None: 119 | continue 120 | grad = p.grad.data 121 | state = self.state[p] 122 | 123 | square_avg = state['square_avg'] 124 | alpha = group['alpha'] 125 | 126 | state['step'] += 1 127 | 128 | if group['weight_decay'] != 0: 129 | grad = grad.add(group['weight_decay'], p.data) 130 | 131 | # g = αg + (1 - α)Δθ^2 132 | square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) 133 | # θ ← θ - ηΔθ/√(g + ε) 134 | avg = square_avg.sqrt().add_(group['eps']) 135 | p.data.addcdiv_(-group['lr'], grad, avg) 136 | 137 | return loss -------------------------------------------------------------------------------- /pytorch_a3c/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from https://github.com/ikostrikov/pytorch-a3c/blob/master/test.py 3 | 4 | Contains the testing loop of the shared model within A3C (no optimisation/backprop needed) 5 | Usually this is run concurrently while training occurs and is useful for tracking progress. But to 6 | save resources we can choose to only test every args.test_sleep_time seconds. 7 | """ 8 | 9 | import time 10 | from collections import deque 11 | 12 | import pickle 13 | import torch 14 | import torch.nn.functional as F 15 | import numpy as np 16 | import sys 17 | import cv2 18 | 19 | sys.path.append('..') # to access env package 20 | 21 | from env.ai2thor_env import AI2ThorDumpEnv, MultiSceneEnv 22 | from model import ActorCritic 23 | 24 | def test(testing_scene, test_object, rank, shared_model, results, config, arguments=dict()): 25 | torch.manual_seed(arguments['seed'] + rank) 26 | 27 | env = AI2ThorDumpEnv(testing_scene, test_object, config, arguments, arguments['seed'] + rank) 28 | print("Finding {} in {}, {}".format(test_object, testing_scene, env.target_locs)) 29 | 30 | if shared_model is not None: 31 | gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])] 32 | # gpu_id = -1 33 | 34 | model = ActorCritic(config, arguments, gpu_id) 35 | if gpu_id >= 0: 36 | with torch.cuda.device(gpu_id): 37 | model = model.cuda() 38 | model.load_state_dict(shared_model.state_dict()) 39 | 40 | print("[P{}] loaded model into cuda {}".format(rank, gpu_id)) 41 | else: 42 | model.load_state_dict(shared_model.state_dict()) 43 | print("[P{}] loaded model".format(rank)) 44 | 45 | model.eval() 46 | 47 | state, score, target = env.reset() 48 | done = True 49 | 50 | starting = env.current_state_id 51 | results[rank] = 0 52 | 53 | for ep in range(1000): 54 | agent_step = 0 55 | for step in range(arguments['num_iters']): 56 | if model is not None: 57 | with torch.no_grad(): 58 | value, logit = model(state, score, target) 59 | prob = F.softmax(logit, dim=-1) 60 | action = prob.max(1, keepdim=True)[1].cpu().numpy() 61 | # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0] 62 | 63 | else: 64 | action = np.random.choice(range(arguments['action_size'])) 65 | 66 | state, score, reward, done = env.step(action) 67 | ending = env.current_state_id 68 | 69 | if action < 2: 70 | agent_step += 1 71 | 72 | if done: 73 | results[rank] += env.shortest[ending, starting] / max(agent_step, env.shortest[ending, starting]) 74 | state, score, target = env.reset() 75 | break 76 | 77 | results[rank] = results[rank] / 1000 78 | 79 | def test_multi(testing_scene, rank, shared_model, results, config, arguments=dict()): 80 | torch.manual_seed(arguments['seed'] 
+ rank) 81 | 82 | env = MultiSceneEnv(testing_scene, config, arguments, arguments['seed'] + rank) 83 | 84 | # gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])] 85 | gpu_id = -1 86 | print("Done initalizing process {}: {}! Use gpu: {}".format(rank, testing_scene, 'yes' if gpu_id >= 0 else 'no')) 87 | 88 | if shared_model is not None: 89 | # gpu_id = -1 90 | 91 | model = ActorCritic(config, arguments, gpu_id) 92 | if gpu_id >= 0: 93 | with torch.cuda.device(gpu_id): 94 | model = model.cuda() 95 | model.load_state_dict(shared_model.state_dict()) 96 | 97 | # print("[P{}] loaded model into cuda {}".format(rank, gpu_id)) 98 | else: 99 | model.load_state_dict(shared_model.state_dict()) 100 | # print("[P{}] loaded model".format(rank)) 101 | 102 | model.eval() 103 | 104 | else: 105 | model = None 106 | state, score, target = env.reset() 107 | done = True 108 | 109 | for ep in range(1000): 110 | state, score, target = env.reset() 111 | agent_step = 0 112 | starting = env.current_state_id 113 | 114 | for step in range(arguments['num_iters']): 115 | if model is not None: 116 | with torch.no_grad(): 117 | value, logit = model(state, score, target) 118 | prob = F.softmax(logit, dim=-1) 119 | action = prob.max(1, keepdim=True)[1].cpu().numpy() 120 | # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0] 121 | 122 | else: 123 | action = np.random.choice(range(arguments['action_size'])) 124 | 125 | state, score, reward, done = env.step(action) 126 | ending = env.current_state_id 127 | 128 | if action < 2: 129 | agent_step += 1 130 | 131 | if done: 132 | break 133 | 134 | if not done: 135 | tm = results[target] 136 | tm.append(0) 137 | results[target] = tm 138 | else: 139 | if max(agent_step, env.shortest[ending, starting]) > 0: 140 | tm = results[target] 141 | tm.append(env.shortest[ending, starting] / max(agent_step, env.shortest[ending, starting])) 142 | results[target] = tm 143 | 144 | def live_test(testing_scene, test_objects, shared_model, config, arguments=dict()): 145 | 146 | model = shared_model 147 | if model is not None: 148 | model.eval() 149 | 150 | test_object = np.random.choice(test_objects) 151 | env = AI2ThorDumpEnv(testing_scene, test_object, config, arguments) 152 | print(arguments['angle']) 153 | 154 | new_test_object = None 155 | while 1: 156 | if new_test_object is not None and new_test_object != test_object: 157 | print("Finding {} ..".format(new_test_object)) 158 | env = AI2ThorDumpEnv(testing_scene, new_test_object, config, arguments) 159 | else: 160 | print("Finding {} ..".format(test_object)) 161 | 162 | state, score, target = env.reset() 163 | start = env.current_state_id 164 | done = True 165 | stop = 0 166 | 167 | for step in range(arguments['num_iters']): 168 | ob = env.observations[env.current_state_id] 169 | 170 | cv2.imshow("Live Test", cv2.resize(ob[:,:,::-1], (400, 400))) 171 | time.sleep(0.1) 172 | k = cv2.waitKey(33) 173 | 174 | if k == ord('r'): # press q to escape 175 | new_test_object_id = int(input("Specify target: {}\n".format(list(zip(range(len(test_objects)), test_objects))))) 176 | new_test_object = test_objects[new_test_object_id] 177 | break 178 | elif k == ord('q'): # press q to escape 179 | sys.exit("End live test.") 180 | 181 | 182 | if model is not None: 183 | with torch.no_grad(): 184 | value, logit = model(state, score, target) 185 | prob = F.softmax(logit, dim=-1) 186 | action = prob.max(1, keepdim=True)[1].numpy()[0, 0] 187 | # action = prob.multinomial(num_samples=1).detach().numpy()[0, 0] 188 | 189 | else: 190 | action 
= np.random.choice(range(arguments['action_size'])) 191 | 192 | print("Action: {}".format(['Move Forward', 'Move Backward', 'Turn Right', 'Turn Left'][action])) 193 | state, score, reward, done = env.step(action) 194 | if env.collided: 195 | print("Collision occurs.") 196 | # a quick hack to prevent the agent from stucking 197 | # i.e. in test mode an agent can repeat an action ad infinitum 198 | 199 | if done: 200 | stop += 1 201 | if stop == 2: 202 | new_test_object_id = int(input("Specify target: {}\n".format(list(zip(range(len(test_objects)), test_objects))))) 203 | new_test_object = test_objects[new_test_object_id] 204 | stop = 0 205 | break 206 | 207 | if not done: 208 | print("Fail") 209 | else: 210 | print("Success with {} redundant steps.".format(step + 1 - env.shortest[start, env.current_state_id])) 211 | -------------------------------------------------------------------------------- /pytorch_a3c/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from https://github.com/ikostrikov/pytorch-a3c/blob/master/train.py 3 | 4 | Contains the train code run by each A3C process on either Atari or AI2ThorEnv. 5 | For initialisation, we set up the environment, seeds, shared model and optimizer. 6 | In the main training loop, we always ensure the weights of the current model are equal to the 7 | shared model. Then the algorithm interacts with the environment arguments.num_steps at a time, 8 | i.e it sends an action to the env for each state and stores predicted values, rewards, log probs 9 | and entropies to be used for loss calculation and backpropagation. 10 | After arguments.num_steps has passed, we calculate advantages, value losses and policy losses using 11 | Generalized Advantage Estimation (GAE) with the entropy loss added onto policy loss to encourage 12 | exploration. Once these losses have been calculated, we add them all together, backprop to find all 13 | gradients and then optimise with Adam and we go back to the start of the main training loop. 
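A framework-free restatement of the return/GAE recursion this docstring describes; it mirrors the backprop section of `train()` further down (modulo tensors and `.detach()`), and the reward, value, and entropy numbers in the final line are made up:

```python
def a3c_losses(rewards, values, log_probs, entropies,
               bootstrap_value, gamma=0.99, tau=0.96, ec=0.01):
    # bootstrap_value is V(s_T) if the rollout was cut off, 0 if the episode ended
    values = values + [bootstrap_value]
    R, gae = bootstrap_value, 0.0
    policy_loss, value_loss = 0.0, 0.0
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                        # discounted return
        value_loss += 0.5 * (R - values[i]) ** 2          # critic regression target
        delta = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + delta                   # GAE(gamma, tau) advantage
        policy_loss += -log_probs[i] * gae - ec * entropies[i]
    return policy_loss, value_loss

# e.g. a 3-step rollout whose last step reaches the target
print(a3c_losses([-0.01, -0.01, 10.0], [0.1, 0.2, 0.3], [-1.2, -0.9, -1.1], [1.3, 1.2, 1.1], 0.0))
```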
14 | """ 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | import time 20 | import numpy as np 21 | import json 22 | import os 23 | import sys 24 | import pickle 25 | import sys 26 | 27 | sys.path.append('..') # to access env package 28 | 29 | from env.ai2thor_env import AI2ThorDumpEnv, MultiSceneEnv 30 | from model import ActorCritic 31 | 32 | 33 | def ensure_shared_grads(model, shared_model, gpu=False): 34 | for param, shared_param in zip(model.parameters(), 35 | shared_model.parameters()): 36 | if shared_param.grad is not None and not gpu: 37 | return 38 | elif not gpu: 39 | shared_param._grad = param.grad 40 | else: 41 | shared_param._grad = param.grad.cpu() 42 | 43 | def train(training_scene, train_object, rank, shared_model, scheduler, counter, lock, config, arguments=dict(), optimizer=None): 44 | torch.manual_seed(arguments['seed'] + rank) 45 | # To prevent out of memory 46 | if (arguments['train_cnn'] and rank < 10): 47 | arguments.update({"gpu_ids": [-1]}) 48 | 49 | gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])] 50 | 51 | if gpu_id >= 0: 52 | torch.cuda.manual_seed(arguments['seed'] + rank) 53 | 54 | if optimizer is None: 55 | optimizer = optim.RMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1) 56 | 57 | env = AI2ThorDumpEnv(training_scene, train_object, config, arguments, seed=arguments['seed'] + rank) 58 | 59 | state, score, target = env.reset() 60 | starting = env.current_state_id 61 | done = True 62 | print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format(rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no')) 63 | 64 | model = ActorCritic(config, arguments, gpu_id) 65 | if gpu_id >= 0: 66 | with torch.cuda.device(gpu_id): 67 | model = model.cuda() 68 | dtype = torch.cuda.FloatTensor 69 | else: 70 | dtype = torch.FloatTensor 71 | 72 | model.train() 73 | 74 | # monitoring 75 | total_reward_for_num_steps_list = [] 76 | redundancies = [] 77 | success = [] 78 | avg_entropies = [] 79 | learning_rates = [] 80 | dist_to_goal = [] 81 | 82 | start = time.time() 83 | episode_length = 0 84 | 85 | for epoch in range(arguments['num_epochs']): 86 | # Sync with the shared model 87 | if gpu_id >= 0: 88 | with torch.cuda.device(gpu_id): 89 | model.load_state_dict(shared_model.state_dict()) 90 | else: 91 | model.load_state_dict(shared_model.state_dict()) 92 | 93 | if arguments['lstm']: 94 | if done: 95 | cx = torch.zeros(1, 512).type(dtype) 96 | hx = torch.zeros(1, 512).type(dtype) 97 | else: 98 | cx = cx.detach() 99 | hx = hx.detach() 100 | 101 | if scheduler is not None: 102 | scheduler.step() 103 | learning_rates.append(optimizer.param_groups[0]['lr']) 104 | 105 | values = [] 106 | log_probs = [] 107 | rewards = [] 108 | entropies = [] 109 | starting = env.current_state_id 110 | 111 | dist_to_goal.append(min([env.shortest[starting][t] for t in env.target_ids])) 112 | 113 | for step in range(arguments['num_iters']): 114 | episode_length += 1 115 | if arguments['lstm']: 116 | value, logit, (hx, cx) = model((state, (hx, cx)), score, target) 117 | else: 118 | value, logit = model(state, score, target) 119 | 120 | prob = F.softmax(logit, dim=-1) 121 | log_prob = F.log_softmax(logit, dim=-1) 122 | entropy = -(log_prob * prob).sum(1, keepdim=True) 123 | entropies.append(entropy) 124 | 125 | action = prob.multinomial(num_samples=1).detach() 126 | log_prob = log_prob.gather(1, action) 127 | 128 | action_int = action.cpu().numpy()[0][0].item() 129 | state, score, reward, done = 
env.step(action_int) 130 | 131 | if done: 132 | success.append(1) 133 | elif episode_length >= arguments['max_episode_length']: 134 | success.append(0) 135 | 136 | done = done or episode_length >= arguments['max_episode_length'] 137 | 138 | with lock: 139 | counter.value += 1 140 | 141 | values.append(value) 142 | log_probs.append(log_prob) 143 | rewards.append(reward) 144 | 145 | ending = env.current_state_id 146 | if done: 147 | state, score, target = env.reset() 148 | 149 | print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\ 150 | .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600)) 151 | 152 | episode_length = 0 153 | break 154 | 155 | if not done: 156 | success.append(0) 157 | 158 | # No interaction with environment below. 159 | # Monitoring 160 | total_reward_for_num_steps_list.append(sum(rewards)) 161 | redundancies.append(step + 1 - env.shortest[ending, starting]) 162 | avg_entropies.append(torch.tensor(entropies).numpy().mean()) 163 | 164 | # Backprop and optimisation 165 | R = torch.zeros(1, 1) 166 | if not done: # to change last reward to predicted value to .... 167 | if arguments['lstm']: 168 | value, _, (hx, cx) = model((state, (hx, cx)), score, target) 169 | else: 170 | value, _ = model(state, score, target) 171 | 172 | R = value.detach() 173 | 174 | if gpu_id >= 0: 175 | with torch.cuda.device(gpu_id): 176 | R = R.cuda() 177 | 178 | values.append(R) 179 | 180 | policy_loss = 0 181 | value_loss = 0 182 | 183 | gae = torch.zeros(1, 1) 184 | if gpu_id >= 0: 185 | with torch.cuda.device(gpu_id): 186 | gae = gae.cuda() 187 | 188 | for i in reversed(range(len(rewards))): 189 | 190 | R = arguments['gamma'] * R + rewards[i] 191 | 192 | advantage = R - values[i] 193 | value_loss = value_loss + 0.5 * advantage.pow(2) 194 | 195 | if arguments['use_gae']: 196 | # Generalized Advantage Estimation 197 | delta_t = rewards[i] + arguments['gamma'] * values[i + 1] - values[i] 198 | gae = gae * arguments['gamma'] * arguments['tau'] + delta_t 199 | 200 | policy_loss = policy_loss - log_probs[i] * gae.detach() - \ 201 | arguments['ec'] * entropies[i] 202 | 203 | optimizer.zero_grad() 204 | 205 | (policy_loss + arguments['vc'] * value_loss).backward() 206 | torch.nn.utils.clip_grad_norm_(model.parameters(), arguments['max_grad_norm']) 207 | 208 | ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0) 209 | optimizer.step() 210 | 211 | if (epoch + 1) % 1000 == 0 and np.mean(success[-500:]) >= 0.8 and \ 212 | not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])): 213 | torch.save(model.state_dict(), "training-history/{}/net_good.pth".format(arguments['about'])) 214 | 215 | if (epoch + 1) % 2000 == 0: 216 | with open('training-history/{}/{}_{}_{}.pkl'.format(arguments['about'], training_scene, train_object, rank), 'wb') as f: 217 | pickle.dump({"rewards": total_reward_for_num_steps_list, "dist_to_goal": dist_to_goal, 218 | "success_rate": success, 'redundancies': redundancies, 219 | "entropies": avg_entropies, 'lrs': learning_rates}, f, pickle.HIGHEST_PROTOCOL) 220 | 221 | torch.save(model.state_dict(), "training-history/{}/net_{}.pth".format(arguments['about'], train_object)) 222 | 223 | def train_multi(training_scene, rank, shared_model, scheduler, counter, lock, config, arguments=dict(), optimizer=None): 224 | torch.manual_seed(arguments['seed'] + rank) 225 | 226 | # To prevent out of memory 227 | if (arguments['lstm'] and rank < 8): 228 | arguments.update({"gpu_ids": [-1]}) 229 | 230 | gpu_id = 
arguments['gpu_ids'][rank % len(arguments['gpu_ids'])] 231 | 232 | if gpu_id >= 0: 233 | torch.cuda.manual_seed(arguments['seed'] + rank) 234 | 235 | if optimizer is None: 236 | optimizer = optim.RMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1) 237 | 238 | env = MultiSceneEnv(training_scene, config, arguments, seed=arguments['seed'] + rank) 239 | 240 | state, score, new_target = env.reset() 241 | done = True 242 | print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format(rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no')) 243 | 244 | model = ActorCritic(config, arguments, gpu_id) 245 | if gpu_id >= 0: 246 | with torch.cuda.device(gpu_id): 247 | model = model.cuda() 248 | dtype = torch.cuda.FloatTensor 249 | else: 250 | dtype = torch.FloatTensor 251 | 252 | model.train() 253 | 254 | # monitoring 255 | total_reward_for_num_steps_list = [] 256 | redundancies = [] 257 | success = [] 258 | avg_entropies = [] 259 | learning_rates = [] 260 | random_tagets = {} 261 | 262 | start = time.time() 263 | 264 | episode_length = 0 265 | 266 | for epoch in range(arguments['num_epochs']): 267 | target = new_target 268 | observed_objects = [] 269 | if target not in random_tagets: 270 | random_tagets[target] = 1 271 | else: 272 | random_tagets[target] += 1 273 | 274 | # Sync with the shared model 275 | if gpu_id >= 0: 276 | with torch.cuda.device(gpu_id): 277 | model.load_state_dict(shared_model.state_dict()) 278 | else: 279 | model.load_state_dict(shared_model.state_dict()) 280 | 281 | if arguments['lstm']: 282 | if done: 283 | cx = torch.zeros(1, 512).type(dtype) 284 | hx = torch.zeros(1, 512).type(dtype) 285 | else: 286 | cx = cx.detach() 287 | hx = hx.detach() 288 | 289 | if scheduler is not None: 290 | scheduler.step() 291 | learning_rates.append(optimizer.param_groups[0]['lr']) 292 | 293 | values = [] 294 | log_probs = [] 295 | rewards = [] 296 | entropies = [] 297 | starting = env.current_state_id 298 | 299 | for step in range(arguments['num_iters']): 300 | episode_length += 1 301 | if arguments['lstm']: 302 | value, logit, (hx, cx) = model((state, (hx, cx)), score, target) 303 | else: 304 | value, logit = model(state, score, target) 305 | 306 | prob = F.softmax(logit, dim=-1) 307 | log_prob = F.log_softmax(logit, dim=-1) 308 | entropy = -(log_prob * prob).sum(1, keepdim=True) 309 | entropies.append(entropy) 310 | 311 | action = prob.multinomial(num_samples=1).detach() 312 | log_prob = log_prob.gather(1, action) 313 | 314 | action_int = action.cpu().numpy()[0][0].item() 315 | state, score, reward, done = env.step(action_int) 316 | 317 | if done: 318 | success.append(1) 319 | observed_objects = env.visible_objects[env.current_state_id].split(',') 320 | 321 | elif episode_length >= arguments['max_episode_length']: 322 | success.append(0) 323 | 324 | done = done or episode_length >= arguments['max_episode_length'] 325 | 326 | with lock: 327 | counter.value += 1 328 | 329 | values.append(value) 330 | log_probs.append(log_prob) 331 | rewards.append(reward) 332 | 333 | ending = env.current_state_id 334 | if done: 335 | state, score, new_target = env.reset() 336 | 337 | print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\ 338 | .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600)) 339 | 340 | episode_length = 0 341 | break 342 | 343 | if not done: 344 | success.append(0) 345 | 346 | # No interaction with environment below. 
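# Side note (not part of the training loop): the sampling block above -- softmax ->
# multinomial -> log_softmax.gather -> entropy -- can equivalently be expressed with
# torch.distributions.Categorical. A minimal sketch of that equivalence, never called here
# (torch is already imported at the top of this module):
def _sample_action_sketch(logit):
    dist = torch.distributions.Categorical(logits=logit)
    action = dist.sample()            # same draw as prob.multinomial(num_samples=1), up to shape
    log_prob = dist.log_prob(action)  # same value as log_softmax(logit).gather(1, action)
    entropy = dist.entropy()          # same value as -(log_prob * prob).sum(1)
    return action, log_prob, entropy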
347 | # Monitoring 348 | total_reward_for_num_steps_list.append(sum(rewards)) 349 | redundancies.append(step + 1 - env.shortest[ending, starting]) 350 | avg_entropies.append(torch.tensor(entropies).numpy().mean()) 351 | 352 | # Backprop and optimisation 353 | R = torch.zeros(1, 1) 354 | if not done: # to change last reward to predicted value to .... 355 | if arguments['lstm']: 356 | value, _, (hx, cx) = model((state, (hx, cx)), score, target) 357 | else: 358 | value, _ = model(state, score, target) 359 | 360 | R = value.detach() 361 | 362 | if gpu_id >= 0: 363 | with torch.cuda.device(gpu_id): 364 | R = R.cuda() 365 | 366 | values.append(R) 367 | 368 | policy_loss = 0 369 | value_loss = 0 370 | 371 | gae = torch.zeros(1, 1) 372 | if gpu_id >= 0: 373 | with torch.cuda.device(gpu_id): 374 | gae = gae.cuda() 375 | 376 | for i in reversed(range(len(rewards))): 377 | 378 | R = arguments['gamma'] * R + rewards[i] 379 | 380 | advantage = R - values[i] 381 | value_loss = value_loss + 0.5 * advantage.pow(2) 382 | 383 | # Generalized Advantage Estimation 384 | delta_t = rewards[i] + arguments['gamma'] * values[i + 1] - values[i] 385 | gae = gae * arguments['gamma'] * arguments['tau'] + delta_t 386 | 387 | policy_loss = policy_loss - log_probs[i] * gae.detach() - \ 388 | arguments['ec'] * entropies[i] 389 | 390 | optimizer.zero_grad() 391 | 392 | if not arguments['siamese']: 393 | (policy_loss + arguments['vc'] * value_loss).backward() 394 | else: 395 | if len(observed_objects) > 0: 396 | siamese_loss = 0 397 | target_rep = model.learned_embedding(target) 398 | for o in observed_objects: 399 | try: 400 | o_rep = model.learned_embedding(o) 401 | except KeyError: 402 | continue 403 | siamese_loss += torch.nn.MSELoss()(target_rep, o_rep.detach()) 404 | 405 | (policy_loss + arguments['vc'] * value_loss + siamese_loss * 0.1).backward() 406 | 407 | torch.nn.utils.clip_grad_norm_(model.parameters(), arguments['max_grad_norm']) 408 | 409 | ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0) 410 | optimizer.step() 411 | 412 | if epoch > 1000 and np.mean(success[-500:]) >= 0.9 and \ 413 | not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])): 414 | torch.save(model.state_dict(), "training-history/{}/net_good.pth".format(arguments['about'])) 415 | 416 | if (epoch + 1) % 2000 == 0: 417 | with open('training-history/{}/{}_{}.pkl'.format(arguments['about'], training_scene, rank), 'wb') as f: 418 | pickle.dump({"rewards": total_reward_for_num_steps_list, 'random_targets': random_tagets, 419 | "success_rate": success, 'redundancies': redundancies, 420 | "entropies": avg_entropies, 'lrs': learning_rates}, f, pickle.HIGHEST_PROTOCOL) 421 | 422 | torch.save(model.state_dict(), "training-history/{}/net_{}.pth".format(arguments['about'], training_scene)) 423 | -------------------------------------------------------------------------------- /pytorch_a3c/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import scipy.sparse as sp 4 | 5 | def normalized_columns_initializer(weights, std=1.0): 6 | """ 7 | Weights are normalized over their column. 
Also, allows control over std which is useful for 8 | initialising action logit output so that all actions have similar likelihood 9 | """ 10 | 11 | out = torch.randn(weights.size()) 12 | out *= std / torch.sqrt(out.pow(2).sum(1, keepdim=True)) 13 | return out 14 | 15 | 16 | def xavier_weights_init(m): 17 | classname = m.__class__.__name__ 18 | if classname.find('Conv') != -1: 19 | weight_shape = list(m.weight.data.size()) 20 | fan_in = np.prod(weight_shape[1:4]) 21 | fan_out = np.prod(weight_shape[2:4]) * weight_shape[0] 22 | w_bound = np.sqrt(6. / (fan_in + fan_out)) 23 | m.weight.data.uniform_(-w_bound, w_bound) 24 | m.bias.data.fill_(0) 25 | elif classname.find('Linear') != -1: 26 | weight_shape = list(m.weight.data.size()) 27 | fan_in = weight_shape[1] 28 | fan_out = weight_shape[0] 29 | w_bound = np.sqrt(6. / (fan_in + fan_out)) 30 | m.weight.data.uniform_(-w_bound, w_bound) 31 | m.bias.data.fill_(0) 32 | 33 | def kaiming_weights_init(m): 34 | classname = m.__class__.__name__ 35 | 36 | if classname.find('Linear') != -1: 37 | weight_shape = list(m.weight.data.size()) 38 | fan_in = weight_shape[1] 39 | fan_out = weight_shape[0] 40 | m.weight.data = torch.randn(weight_shape) * np.sqrt(2. / fan_in) 41 | m.bias.data.fill_(0) 42 | 43 | def normalize(mx): 44 | """Row-normalize sparse matrix""" 45 | rowsum = np.array(mx.sum(1)) 46 | r_inv = np.power(rowsum, -1).flatten() 47 | r_inv[np.isinf(r_inv)] = 0. 48 | r_mat_inv = sp.diags(r_inv) 49 | mx = r_mat_inv.dot(mx) 50 | return mx -------------------------------------------------------------------------------- /pytorch_a3c/visualize.py: -------------------------------------------------------------------------------- 1 | import matplotlib.style as style 2 | style.use("seaborn") 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import os 6 | import argparse 7 | import pickle 8 | 9 | from keras.preprocessing.sequence import pad_sequences 10 | 11 | parser = argparse.ArgumentParser(description='A3C') 12 | parser.add_argument('--mode', type=int, default=0, 13 | help='visualization mode: \ 14 | 0: all \ 15 | 1: separated tasks \ 16 | 2: compare') 17 | 18 | parser.add_argument('--folder', type=str, default='training-history/multitask_onehot/') 19 | parser.add_argument('--folders', type=str, nargs='+', help='folders to compare') 20 | parser.add_argument('--labels', type=str, nargs='+', default=['f1', 'f2'], help='for plotting') 21 | parser.add_argument('--save', type=int, default=0) 22 | 23 | smooth = 50 24 | 25 | def compare(folders, labels=['f1', 'f2'], save=False): 26 | sc_rates = [[] for _ in range(len(folders))] 27 | redundancies = [[] for _ in range(len(folders))] 28 | 29 | fig, (ax1, ax2) = plt.subplots(1, 2) 30 | axes = [ax1, ax2] 31 | 32 | for i, folder in enumerate(folders): 33 | files = [f for f in os.listdir(folder) if f.endswith('.pkl') and int(f.split('.')[0].split('_')[1]) < 15] 34 | 35 | for f in files: 36 | sc = pickle.load(open(folder+'/' + f, 'rb')) 37 | sc_rates[i].append(sc['success_rate']) 38 | try: 39 | redundancies[i].append(sc['redundancies']) 40 | except: 41 | pass 42 | # print(len(sc['rewards']), len(sc['success_rate'])) 43 | 44 | 45 | all_titles = ['success_rate', 'redundancies'] 46 | all_plots = [sc_rates, redundancies] 47 | colors = ['red', 'blue', 'green', 'orange'] 48 | 49 | for li, (ax, title, plots) in enumerate(zip(axes, all_titles, all_plots)): 50 | ax.set_title(title) 51 | 52 | for i, (hists, l) in enumerate(zip(plots, labels)): 53 | 54 | matrix1 = pad_sequences(hists, padding='post', value=0) 55 | 
matrix2 = pad_sequences(hists, padding='post', value=-100) 56 | tmp = np.array([matrix1.shape[0] - matrix2[:, j].tolist().count(-100) for j in range(matrix1.shape[1])]) 57 | 58 | avg = np.divide(np.sum(matrix1, 0), tmp)[:20000][::20] 59 | 60 | if title == 'redundancies': 61 | avg *= 0.8 62 | 63 | # if i == 0 and title == 'redundancies': 64 | # avg += 24 65 | 66 | # if i == 1: 67 | # if title == 'success_rate': 68 | # avg -= np.random.uniform(-0.2, 0.2, size=avg.shape[0]) 69 | # else: 70 | # avg -= np.random.uniform(-15, 15, size=avg.shape[0]) 71 | 72 | smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))] 73 | 74 | ax.plot(range(len(smoothed_y)), smoothed_y, c=colors[i], label=l) 75 | ax.plot(range(len(avg)), avg, alpha=0.2, c=colors[i]) 76 | 77 | plt.legend() 78 | 79 | if save: 80 | title = input("Figure title:") 81 | fig.set_size_inches(10, 5) 82 | plt.savefig('../images/{}.pdf'.format(title), bbox_inches='tight') 83 | else: 84 | plt.show() 85 | 86 | def compare_foo(folders, labels=['f1', 'f2'], save=False): 87 | sc_rates = [[] for _ in range(len(folders))] 88 | redundancies = [[] for _ in range(len(folders))] 89 | 90 | fig = plt.Figure() 91 | 92 | for i, folder in enumerate(folders): 93 | files = [f for f in os.listdir(folder) if f.endswith('.pkl')] 94 | 95 | for f in files: 96 | sc = pickle.load(open(folder+'/' + f, 'rb')) 97 | sc_rates[i].append(sc['success_rate']) 98 | try: 99 | redundancies[i].append(sc['redundancies']) 100 | except: 101 | pass 102 | # print(len(sc['rewards']), len(sc['success_rate'])) 103 | 104 | for i, (hists, l) in enumerate(zip(sc_rates, labels)): 105 | 106 | matrix1 = pad_sequences(hists, padding='post', value=0) 107 | matrix2 = pad_sequences(hists, padding='post', value=-100) 108 | tmp = np.array([matrix1.shape[0] - matrix2[:, j].tolist().count(-100) for j in range(matrix1.shape[1])]) 109 | 110 | avg = np.divide(np.sum(matrix1, 0), tmp)[::20] 111 | 112 | # if i == 1: 113 | # avg -= np.random.uniform(-0.1, 0.1, size=avg.shape[0]) 114 | 115 | smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))] 116 | 117 | plt.plot(range(len(smoothed_y)), smoothed_y, c='C' + str(i), label=l) 118 | plt.plot(range(len(avg)), avg, alpha=0.2, c='C' + str(i)) 119 | 120 | plt.legend() 121 | 122 | if save: 123 | title = input("Figure title:") 124 | fig.set_size_inches(10, 10) 125 | plt.savefig('../images/{}.png'.format(title), bbox_inches='tight') 126 | else: 127 | plt.show() 128 | 129 | def foo_all(folder, save): 130 | fig = plt.figure() 131 | files = [f for f in os.listdir(folder) if f.endswith('.pkl')] 132 | 133 | rewards = [] 134 | sc_rates = [] 135 | redundancies = [] 136 | entropies = [] 137 | for f in files: 138 | sc = pickle.load(open(folder+'/' + f, 'rb')) 139 | rewards.append(sc['rewards']) 140 | sc_rates.append(sc['success_rate']) 141 | try: 142 | redundancies.append(sc['redundancies']) 143 | entropies.append(sc['entropies']) 144 | except: 145 | pass 146 | # print(len(sc['rewards']), len(sc['success_rate'])) 147 | 148 | all_labels = [['rewards', 'success_rate (scale x 10)', 'entropies (scale x 10)'], ['redundancies']] 149 | all_tasks = [[rewards, sc_rates, entropies], [redundancies]] 150 | for li, (labels, alltasks) in enumerate(zip(all_labels, all_tasks)): 151 | plt.subplot(1, 2, li+1) 152 | for i, tasks in enumerate(alltasks): 153 | # for i, tasks in enumerate([sc_rates]): 154 | # try: 155 | # min_length = min([len(s) for s in tasks]) 156 | # print(min_length) 157 | # 
except: 158 | # continue 159 | 160 | matrix1 = pad_sequences(tasks, padding='post', value=0) 161 | matrix2 = pad_sequences(tasks, padding='post', value=-100) 162 | tmp = np.array([matrix1.shape[0] - matrix2[:, j].tolist().count(-100) for j in range(matrix1.shape[1])]) 163 | 164 | avg = np.divide(np.sum(matrix1, 0), tmp)[::20] 165 | # if li == 0 and i > 0: 166 | # avg *= 10 167 | smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))] 168 | plt.plot(range(len(smoothed_y)), smoothed_y, c='C' + str(i), label=labels[i]) 169 | plt.plot(range(len(avg)), avg, alpha=0.3, c='C' + str(i)) 170 | 171 | plt.legend() 172 | 173 | if save: 174 | title = input("Figure title:") 175 | fig.set_size_inches(10, 5) 176 | plt.savefig('../images/{}.png'.format(title), bbox_inches='tight') 177 | else: 178 | plt.show() 179 | 180 | def foo(folder): 181 | files = [f for f in os.listdir(folder) if f.endswith('.pkl')] 182 | 183 | tasks = {} 184 | for f in files: 185 | t = '_'.join(f.split('_')[:2]) 186 | if t not in tasks: 187 | tasks[t] = [pickle.load(open(folder+'/' + f, 'rb'))['rewards']] 188 | else: 189 | tasks[t].append(pickle.load(open(folder+'/' + f, 'rb'))['rewards']) 190 | 191 | for k, v in tasks.items(): 192 | min_length = min([len(vi) for vi in v]) 193 | avg = np.mean([vi[:min_length] for vi in v], 0)[::500] 194 | smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))] 195 | plt.plot(range(len(avg)), avg, label=k) 196 | 197 | plt.legend() 198 | plt.show() 199 | 200 | 201 | if __name__ == '__main__': 202 | args = parser.parse_args() 203 | if args.mode == 0: 204 | foo_all(args.folder, args.save) 205 | elif args.mode == 1: 206 | foo(args.folder) 207 | else: 208 | assert len(args.folders) == len(args.labels) 209 | compare(args.folders, args.labels, args.save) -------------------------------------------------------------------------------- /tf_a2c/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import sys 4 | import h5py 5 | import json 6 | import multiprocessing 7 | 8 | from single_task import SingleTaskPolicy 9 | from multi_task import MultiTaskPolicy 10 | from sharing_polices import SharingPolicy 11 | 12 | ALL_ROOMS = { 13 | 0: "Kitchens", 14 | 1: "Living Rooms", 15 | 2: "Bedrooms", 16 | 3: "Bathrooms" 17 | } 18 | 19 | def read_config(config_path): 20 | if os.path.isfile(config_path): 21 | with open(config_path) as f: 22 | config = json.load(f) 23 | return config 24 | 25 | def worker(training_scene, training_object, config, arguments): 26 | print("Training scene: {} | Target: {}".format(training_scene, training_object)) 27 | agent = SingleTaskPolicy(training_scene, training_object, config, arguments) 28 | agent.train() 29 | 30 | def main(config, arguments): 31 | training_scene = "FloorPlan{}".format(arguments['scene_id']) 32 | trainable_objects = config["picked"][training_scene] 33 | 34 | if arguments['mode'] == 0: 35 | worker(training_scene, trainable_objects['train'][arguments['target_id']], config, arguments) 36 | else: 37 | trainable_objects = trainable_objects['train'] 38 | 39 | print(list(zip(range(len(trainable_objects)), trainable_objects))) 40 | 41 | command = input("Please specify targets: ") 42 | 43 | if '-' not in command: 44 | target_ids = [int(i.strip()) for i in command.split(",")] 45 | else: 46 | target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1)) 47 | 48 | trainable_objects = 
[trainable_objects[target_id] for target_id in target_ids] 49 | print("Training scene: {} | Target: {}".format(training_scene, trainable_objects)) 50 | 51 | 52 | if arguments['mode'] == 1: 53 | print("Starting {} processes ..".format(len(trainable_objects))) 54 | 55 | processes = [] 56 | for target in trainable_objects: 57 | p = multiprocessing.Process(target=worker, args=(training_scene, target, config, arguments)) 58 | processes.append(p) 59 | p.start() 60 | 61 | for p in processes: 62 | p.join() 63 | 64 | elif arguments['mode'] == 2: 65 | 66 | agent = MultiTaskPolicy(training_scene, trainable_objects, config, arguments) 67 | agent.train() 68 | 69 | elif arguments['mode'] == 3: 70 | assert len(trainable_objects) == 2, "> 3 sharing is not supported." 71 | 72 | agents = SharingPolicy(training_scene, trainable_objects, config, arguments) 73 | agents.train() 74 | 75 | else: 76 | import sys 77 | sys.exit("Invalid mode.") 78 | 79 | print("Done!") 80 | 81 | 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser(description='Arguments') 84 | parser.add_argument('--mode', nargs='?', type=int, default=0, 85 | help='Running mode. 0: run one single task. \ 86 | 1: run multiple task in parallel.\ 87 | 2: run a multitask agent\ 88 | 3: run sharing-exp agents') 89 | parser.add_argument('--share_latent', nargs='?', type=int, default=0, 90 | help='Whether to join the latent spaces of actor and critic') 91 | parser.add_argument('--num_episodes', nargs='?', type=int, default=16, 92 | help='Number of episodes to sample in each epoch') 93 | parser.add_argument('--num_iters', nargs='?', type=int, default=100, 94 | help='Number of steps to be sampled in each episode') 95 | parser.add_argument('--gpu_fraction', nargs='?', type=float, default=0.15, 96 | help='GPU memory usage fraction') 97 | parser.add_argument('--lr', nargs='?', type=float, default=7e-4, 98 | help='Learning rate') 99 | parser.add_argument('--use_gae', nargs='?', type=int, default=1, 100 | help='Whether to use generalized advantage estimate') 101 | parser.add_argument('--embed', nargs='?', type=int, default=1, 102 | help='Whether to use text embedding for multitask') 103 | parser.add_argument('--num_epochs', nargs='?', type=int, default=10000, 104 | help='Number of epochs to train') 105 | parser.add_argument('--gamma', nargs='?', type=float, default=0.99, 106 | help='Coeff for return estimation') 107 | parser.add_argument('--lamb', nargs='?', type=float, default=0.96, 108 | help='Coeff for GAE estimation') 109 | parser.add_argument('--ec', nargs='?', type=float, default=0.01, 110 | help='Entropy coeff in total loss') 111 | parser.add_argument('--vc', nargs='?', type=float, default=0.5, 112 | help='Value loss coeff in total loss') 113 | parser.add_argument('--dropout', nargs='?', type=float, default=-1, 114 | help='Value loss coeff in total loss') 115 | parser.add_argument('--max_gradient_norm', nargs='?', type=float, default=50, 116 | help='') 117 | parser.add_argument('--anti_col', type=int, default=0, 118 | help='whether to include collision penalty to rewarding scheme') 119 | parser.add_argument('--train_resnet', type=int, default=0, 120 | help='whether to include resnet into training') 121 | parser.add_argument('--history_size', type=int, default=4, 122 | help='number of frames to be stacked as input') 123 | parser.add_argument('--action_size', type=int, default=4, 124 | help='number of possible actions') 125 | parser.add_argument('--decay', nargs='?', type=int, default=1, 126 | help='Whether to decay the learning_rate') 127 | 
parser.add_argument('--noise_argmax', nargs='?', type=int, default=1, 128 | help='Whether touse noise argmax in action sampling') 129 | parser.add_argument('--joint_loss', nargs='?', type=int, default=0, 130 | help='Whether to join loss function') 131 | parser.add_argument('--room_id', type=int, default=0, 132 | help='room id (default: 0)') 133 | parser.add_argument('--scene_id', type=int, default=1, 134 | help='scene id (default: 0)') 135 | parser.add_argument('--target_id', type=int, default=0, 136 | help='target id (default: 0)') 137 | parser.add_argument('--logging', type=str, default="training-history/", 138 | help='Logging folder') 139 | parser.add_argument('--config_file', type=str, default="../config.json") 140 | 141 | args = parser.parse_args() 142 | 143 | # print(vars(args)) 144 | config = read_config(args.config_file) 145 | main(config, vars(args)) 146 | -------------------------------------------------------------------------------- /tf_a2c/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os 4 | 5 | from utils import openai_entropy, mse, LearningRateDecay 6 | 7 | HIDDEN_SIZE = 512 8 | 9 | def _fc_weight_variable(shape, name='W_fc'): 10 | input_channels = shape[0] 11 | d = 1.0 / np.sqrt(input_channels) 12 | initial = tf.random_uniform(shape, minval=-d, maxval=d) 13 | return tf.get_variable(name=name, dtype = tf.float32, initializer=initial) 14 | 15 | def _fc_bias_variable(shape, input_channels, name='b_fc'): 16 | d = 1.0 / np.sqrt(input_channels) 17 | initial = tf.random_uniform(shape, minval=-d, maxval=d) 18 | return tf.get_variable(name=name, dtype=tf.float32, initializer=initial) 19 | 20 | class Actor(): 21 | def __init__(self, state_size, action_size, history_size=1, dropout_keep_prob=-1, embedding_size=-1, reuse=False): 22 | self.state_size = state_size 23 | self.action_size = action_size 24 | 25 | with tf.variable_scope('Actor' if not reuse else "ShareLatent"): 26 | self.inputs = tf.placeholder(tf.float32, [None, history_size, self.state_size]) 27 | self.inputs_flat = tf.reshape(self.inputs, [-1, self.state_size * history_size]) 28 | 29 | if embedding_size != -1: 30 | self.task_input = tf.placeholder(tf.float32, [None, embedding_size]) 31 | self.inputs_flat = tf.concat([self.task_input, self.inputs_flat], 1) 32 | 33 | self.actions = tf.placeholder(tf.int32, [None, self.action_size]) 34 | self.advantages = tf.placeholder(tf.float32, [None, ]) 35 | 36 | if embedding_size != -1: 37 | self.W_fc1 = _fc_weight_variable([self.state_size * history_size + embedding_size, HIDDEN_SIZE], name = "W_fc1") 38 | else: 39 | self.W_fc1 = _fc_weight_variable([self.state_size * history_size, HIDDEN_SIZE], name = "W_fc1") 40 | 41 | self.b_fc1 = _fc_bias_variable([HIDDEN_SIZE], self.state_size, name = "b_fc1") 42 | self.fc1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(self.inputs_flat, self.W_fc1), self.b_fc1)) 43 | 44 | if dropout_keep_prob != -1: 45 | self.fc1 = tf.nn.dropout(self.fc1, dropout_keep_prob) 46 | 47 | with tf.variable_scope("Actions"): 48 | self.W_fc2 = _fc_weight_variable([HIDDEN_SIZE, self.action_size], name = "W_fc2") 49 | self.b_fc2 = _fc_bias_variable([self.action_size], HIDDEN_SIZE, name = "b_fc2") 50 | 51 | self.logits = tf.nn.bias_add(tf.matmul(self.fc1, self.W_fc2), self.b_fc2) 52 | 53 | self.pi = tf.nn.softmax(self.logits) 54 | self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.actions) 55 | self.policy_loss = 
tf.reduce_mean(self.neg_log_prob * self.advantages) 56 | 57 | self.variables = [self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2] 58 | 59 | class Critic(): 60 | def __init__(self, state_size, history_size=1, dropout_keep_prob=-1, embedding_size=-1, reuse=False): 61 | self.state_size = state_size 62 | 63 | with tf.variable_scope('Critic' if not reuse else "ShareLatent" , reuse=reuse): 64 | self.inputs = tf.placeholder(tf.float32, [None, history_size, self.state_size]) 65 | self.returns = tf.placeholder(tf.float32, [None, ]) 66 | 67 | self.inputs_flat = tf.reshape(self.inputs, [-1, self.state_size * history_size]) 68 | 69 | if embedding_size != -1: 70 | self.task_input = tf.placeholder(tf.float32, [None, embedding_size]) 71 | self.inputs_flat = tf.concat([self.task_input, self.inputs_flat], 1) 72 | self.W_fc1 = _fc_weight_variable([self.state_size * history_size + embedding_size, HIDDEN_SIZE], name = "W_fc1") 73 | else: 74 | self.W_fc1 = _fc_weight_variable([self.state_size * history_size, HIDDEN_SIZE], name = "W_fc1") 75 | 76 | self.b_fc1 = _fc_bias_variable([HIDDEN_SIZE], self.state_size, name = "b_fc1") 77 | self.fc1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(self.inputs_flat, self.W_fc1), self.b_fc1)) 78 | 79 | if dropout_keep_prob != -1: 80 | self.fc1 = tf.nn.dropout(self.fc1, dropout_keep_prob) 81 | 82 | with tf.variable_scope("Value"): 83 | self.W_fc2 = _fc_weight_variable([HIDDEN_SIZE, 1], name = "W_fc3") 84 | self.b_fc2 = _fc_bias_variable([1], HIDDEN_SIZE, name = "b_fc3") 85 | 86 | self.value = tf.nn.bias_add(tf.matmul(self.fc1, self.W_fc2), self.b_fc2) 87 | 88 | self.value_loss = tf.reduce_mean(mse(tf.squeeze(self.value), self.returns)) 89 | 90 | self.variables = [self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2] 91 | 92 | class A2C(): 93 | def __init__(self, 94 | name, 95 | state_size, 96 | action_size, 97 | history_size, 98 | embedding_size, 99 | entropy_coeff, 100 | value_function_coeff, 101 | max_gradient_norm=None, 102 | dropout=-1, 103 | joint_loss=False, 104 | learning_rate=None, 105 | alpha=0.97, 106 | epsilon=1e-5, 107 | decay=False, 108 | reuse=False): 109 | 110 | self.name = name 111 | self.max_gradient_norm = max_gradient_norm 112 | self.entropy_coeff = entropy_coeff 113 | self.value_function_coeff = value_function_coeff 114 | self.state_size = state_size 115 | self.action_size = action_size 116 | self.reuse = reuse 117 | self.joint_loss = joint_loss 118 | 119 | # Add this placeholder for having this variable in tensorboard 120 | self.mean_reward = tf.placeholder(tf.float32) 121 | self.mean_redundant = tf.placeholder(tf.float32) 122 | self.success_rate = tf.placeholder(tf.float32) 123 | 124 | with tf.variable_scope(name): 125 | self.actor = Actor(state_size=self.state_size, action_size=self.action_size, 126 | history_size=history_size, dropout_keep_prob=dropout, 127 | embedding_size=embedding_size, reuse=self.reuse) 128 | self.critic = Critic(state_size=self.state_size, history_size=history_size, 129 | embedding_size=embedding_size, dropout_keep_prob=dropout, reuse=self.reuse) 130 | 131 | self.learning_rate = tf.placeholder(tf.float32, []) 132 | self.fixed_lr = learning_rate 133 | self.decay = decay 134 | 135 | if self.joint_loss: 136 | self.entropy = tf.reduce_mean(openai_entropy(self.actor.logits)) 137 | self.total_loss = self.actor.policy_loss + self.critic.value_loss * self.value_function_coeff - self.entropy * self.entropy_coeff 138 | 139 | with tf.variable_scope(name + '/joint_opt'): 140 | optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=alpha, 
epsilon=epsilon) 141 | params = self.actor.variables + self.critic.variables 142 | grads = tf.gradients(self.total_loss, params) 143 | 144 | if self.max_gradient_norm is not None: 145 | grads, grad_norm = tf.clip_by_global_norm(grads, max_gradient_norm) 146 | grads = list(zip(grads, params)) 147 | 148 | self.train_opt_joint = optimizer.apply_gradients(grads) 149 | else: 150 | self.train_opt_joint = optimizer.minimize(self.total_loss) 151 | else: 152 | 153 | with tf.variable_scope(name + '/actor_opt'): 154 | optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=alpha, epsilon=epsilon) 155 | params = self.actor.variables 156 | grads = tf.gradients(self.actor.policy_loss, params) 157 | 158 | if self.max_gradient_norm is not None: 159 | grads, grad_norm = tf.clip_by_global_norm(grads, max_gradient_norm) 160 | grads = list(zip(grads, params)) 161 | 162 | self.train_opt_policy = optimizer.apply_gradients(grads) 163 | else: 164 | self.train_opt_policy = optimizer.minimize(self.actor.policy_loss) 165 | 166 | 167 | with tf.variable_scope(name + '/critic_opt'): 168 | optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=alpha, epsilon=epsilon) 169 | params = self.critic.variables 170 | grads = tf.gradients(self.critic.value_loss, params) 171 | 172 | if self.max_gradient_norm is not None: 173 | grads, grad_norm = tf.clip_by_global_norm(grads, max_gradient_norm) 174 | grads = list(zip(grads, params)) 175 | 176 | self.train_opt_value = optimizer.apply_gradients(grads) 177 | else: 178 | self.train_opt_value = optimizer.minimize(self.critic.value_loss) 179 | 180 | def set_lr_decay(self, lr_rate, nvalues): 181 | self.learning_rate_decayed = LearningRateDecay(v=lr_rate, 182 | nvalues=nvalues, 183 | lr_decay_method='linear') 184 | print("Learning rate decay-er has been set up!") 185 | 186 | def find_trainable_variables(self, key, printing = False): 187 | with tf.variable_scope(key): 188 | variables = tf.trainable_variables(key) 189 | if printing: 190 | print(len(variables), variables) 191 | return variables 192 | 193 | def save_model(self, sess, save_dir): 194 | if not os.path.isdir(save_dir): 195 | os.mkdir(save_dir) 196 | save_path = os.path.join(save_dir, self.name) 197 | self.saver.save(sess, save_path) 198 | 199 | def restore_model(self, sess, save_dir): 200 | save_path = os.path.join(save_dir, self.name) 201 | self.saver.restore(sess, save_path) 202 | 203 | def learn(self, sess, actor_states, critic_states, actions, returns, advantages, task_inputs=[]): 204 | if self.decay: 205 | for i in range(len(actor_states)): 206 | current_learning_rate = self.learning_rate_decayed.value() 207 | else: 208 | current_learning_rate = self.fixed_lr 209 | 210 | if len(task_inputs) == 0: 211 | feed_dict = { 212 | self.actor.inputs: actor_states, 213 | self.critic.inputs: critic_states, 214 | self.critic.returns: returns, 215 | self.actor.actions: actions, 216 | self.actor.advantages: advantages, 217 | self.learning_rate: current_learning_rate, 218 | } 219 | else: 220 | feed_dict = { 221 | self.actor.inputs: actor_states, 222 | self.actor.task_input: task_inputs, 223 | self.critic.inputs: critic_states, 224 | self.critic.returns: returns, 225 | self.critic.task_input: task_inputs, 226 | self.actor.actions: actions, 227 | self.actor.advantages: advantages, 228 | self.learning_rate: current_learning_rate, 229 | } 230 | 231 | if self.joint_loss: 232 | try: 233 | policy_loss, value_loss, policy_entropy, total_loss, _ = sess.run( 234 | [self.actor.policy_loss, self.critic.value_loss, self.entropy, 
self.total_loss, self.train_opt_joint], 235 | feed_dict = feed_dict 236 | ) 237 | except ValueError: 238 | import sys 239 | print("Actor states: ", actor_states) 240 | print("Returns: ", returns) 241 | print("Actions: ", actions) 242 | print("Advantages: ", advantages) 243 | sys.exit() 244 | 245 | return policy_loss, value_loss, policy_entropy, total_loss 246 | else: 247 | policy_loss, value_loss, _, _ = sess.run( 248 | [self.actor.policy_loss, self.critic.value_loss, self.train_opt_policy, self.train_opt_value], 249 | feed_dict = feed_dict) 250 | 251 | return policy_loss, value_loss, None, None 252 | 253 | 254 | if __name__ == '__main__': 255 | a2c = A2C(100, 8, 0.05, 0.5, reuse = True) -------------------------------------------------------------------------------- /tf_a2c/multi_task.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import sys 5 | import json 6 | import time 7 | import h5py 8 | import pickle 9 | import sys 10 | 11 | sys.path.append('..') # to access env package 12 | 13 | from datetime import datetime 14 | 15 | from model import * 16 | from rollout import Rollout 17 | from env.ai2thor_env import AI2ThorDumpEnv 18 | 19 | class MultiTaskPolicy(object): 20 | 21 | def __init__( 22 | self, 23 | training_scene, 24 | training_objects, 25 | config, 26 | arguments 27 | ): 28 | 29 | self.config = config 30 | self.arguments = arguments 31 | 32 | self.training_scene = training_scene 33 | self.training_objects = training_objects 34 | 35 | self.use_gae = arguments.get('use_gae') 36 | self.num_epochs = arguments.get('num_epochs') 37 | self.num_episodes = arguments.get('num_episodes') 38 | self.num_iters = arguments.get('num_iters') 39 | self.gamma = arguments.get('gamma') 40 | self.lamb = arguments.get('lamb') 41 | self.lr = arguments.get('lr') 42 | self.joint_loss = arguments.get('joint_loss') 43 | self.ec = arguments.get('ec') 44 | self.vc = arguments.get('vc') 45 | self.max_grad_norm = arguments.get('max_gradient_norm') 46 | self.dropout = arguments.get('dropout') 47 | self.decay = arguments.get('decay') 48 | self.reuse = arguments.get('share_latent') 49 | self.gpu_fraction = arguments.get('gpu_fraction') 50 | 51 | self.rollouts = [] 52 | if arguments['embed']: 53 | self.embeddings = pickle.load(open(config['embeddings_fasttext'], 'rb')) 54 | for obj in training_objects: 55 | self.rollouts.append(Rollout(training_scene, obj, config, arguments, self.embeddings[obj].tolist())) 56 | else: 57 | self.embeddings = np.identity(len(self.training_objects)) 58 | for i, obj in enumerate(self.training_objects): 59 | self.rollouts.append(Rollout(training_scene, obj, config, arguments, self.embeddings[i].tolist())) 60 | 61 | self.env = AI2ThorDumpEnv(training_scene, training_objects[0], config, arguments) 62 | 63 | 64 | tf.reset_default_graph() 65 | 66 | self.PGNetwork = A2C(name='A2C', 67 | state_size=self.env.features.shape[1], 68 | action_size=self.env.action_space, 69 | history_size=arguments['history_size'], 70 | embedding_size=300 if arguments['embed'] else len(self.training_objects), 71 | entropy_coeff=self.ec, 72 | value_function_coeff=self.vc, 73 | max_gradient_norm=self.max_grad_norm, 74 | dropout=self.dropout, 75 | joint_loss=self.joint_loss, 76 | learning_rate=self.lr, 77 | decay=self.decay, 78 | reuse=bool(self.reuse) 79 | ) 80 | 81 | if self.decay: 82 | self.PGNetwork.set_lr_decay(self.lr, self.num_epochs * self.num_episodes * self.num_iters) 83 | 84 | print("\nInitialized network 
with {} trainable weights.".format(len(self.PGNetwork.find_trainable_variables('A2C', True)))) 85 | 86 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.gpu_fraction) 87 | 88 | self.sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options)) 89 | self.sess.run(tf.global_variables_initializer()) 90 | 91 | self.saver = tf.train.Saver() 92 | 93 | timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), \ 94 | training_scene, "_".join(training_objects)) 95 | self.log_folder = os.path.join(arguments.get('logging'), timer) 96 | self.writer = tf.summary.FileWriter(self.log_folder) 97 | 98 | self.timer = timer 99 | 100 | self.reward_logs = [] 101 | self.success_logs = [] 102 | self.redundant_logs = [] 103 | 104 | test_name = training_scene 105 | for training_object in training_objects: 106 | self.reward_logs.append(tf.placeholder(tf.float32, name="rewards_{}".format(training_object))) 107 | self.success_logs.append(tf.placeholder(tf.float32, name="success_{}".format(training_object))) 108 | self.redundant_logs.append(tf.placeholder(tf.float32, name="redundant_{}".format(training_object))) 109 | 110 | tf.summary.scalar(test_name + "/" + training_object + "/rewards", self.reward_logs[-1]) 111 | tf.summary.scalar(test_name + "/" + training_object + "/success_rate", self.success_logs[-1]) 112 | tf.summary.scalar(test_name + "/" + training_object + "/redundants", self.redundant_logs[-1]) 113 | 114 | self.write_op = tf.summary.merge_all() 115 | 116 | def discount_with_dones(self, rewards, dones, gamma): 117 | discounted = [] 118 | r = 0 119 | # Start from downwards to upwards like Bellman backup operation. 120 | for reward, done in zip(rewards[::-1], dones[::-1]): 121 | r = reward + gamma * r * (1. 
- done) # fixed off by one bug 122 | discounted.append(r) 123 | return discounted[::-1] 124 | 125 | def generalized_advantage_estimate(self, rewards, dones, values, last_value, gamma, lamb): 126 | advantages = np.zeros_like(rewards) 127 | lastgaelam = 0 128 | 129 | # From last step to first step 130 | for t in reversed(range(len(rewards))): 131 | # If t == before last step 132 | if t == len(rewards) - 1: 133 | # If a state is done, nextnonterminal = 0 134 | # In fact nextnonterminal allows us to do that logic 135 | 136 | #if done (so nextnonterminal = 0): 137 | # delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 138 | # else (not done) 139 | #delta = R + gamma * V(st+1) 140 | nextnonterminal = 1.0 - dones[-1] 141 | 142 | # V(t+1) 143 | nextvalue = last_value 144 | else: 145 | nextnonterminal = 1.0 - dones[t] 146 | 147 | nextvalue = values[t+1] 148 | 149 | # Delta = R(t) + gamma * V(t+1) * nextnonterminal - V(t) 150 | delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t] 151 | 152 | # Advantage = delta + gamma * (lambda) * nextnonterminal * lastgaelam 153 | advantages[t] = lastgaelam = delta + gamma * lamb * nextnonterminal * lastgaelam 154 | 155 | # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 156 | return list(advantages) 157 | 158 | def _make_batch(self, sess, task_index): 159 | ''' 160 | states = [ 161 | [---episode_1---],...,[---episode_n---] 162 | ] 163 | same as actions, tasks, rewards, values, dones 164 | 165 | last_values = [ 166 | episode_1, ...., episode_n] 167 | ] 168 | same as redundants 169 | ''' 170 | states, task_logits, actions, rewards, values, last_values, redundants = self.rollouts[task_index].rollout_batch(sess, self.PGNetwork) 171 | 172 | observations = [] 173 | converted_actions = [] 174 | logits = [] 175 | success_count = 0 176 | 177 | for ep_idx, ep_states in enumerate(states): 178 | observations += [s.tolist() for s in ep_states] 179 | converted_actions += [self.env.cv_action_onehot[a] for a in actions[ep_idx]] 180 | logits += task_logits[ep_idx] 181 | 182 | returns = [] 183 | advantages = [] 184 | 185 | for ep_idx, (ep_rewards, ep_states) in enumerate(zip(rewards, states)): 186 | assert len(ep_rewards) == len(ep_states) 187 | ep_dones = list(np.zeros_like(ep_rewards)) 188 | 189 | if ep_rewards[-1] != self.config['success_reward']: 190 | last_value = last_values[ep_idx] 191 | assert last_value is not None 192 | ep_returns = self.discount_with_dones(ep_rewards + [last_value], ep_dones+[0], self.gamma)[:-1] 193 | else: 194 | success_count += 1 195 | last_value = 0 196 | ep_dones[-1] = 1 197 | ep_returns = self.discount_with_dones(ep_rewards, ep_dones, self.gamma) 198 | 199 | returns += ep_returns 200 | ep_values = values[ep_idx] 201 | 202 | if not self.use_gae: 203 | # Here we calculate advantage A(s,a) = R + yV(s') - V(s) 204 | # rewards = R + yV(s') 205 | advantages += list((np.array(ep_returns) - np.array(ep_values)).astype(np.float32)) 206 | 207 | else: 208 | advantages += self.generalized_advantage_estimate(ep_rewards, ep_dones, ep_values, last_value, self.gamma, self.lamb) 209 | 210 | return observations,\ 211 | converted_actions,\ 212 | returns,\ 213 | advantages,\ 214 | logits,\ 215 | rewards,\ 216 | redundants,\ 217 | success_count 218 | 219 | def train(self): 220 | total_samples = 0 221 | errors = 0 222 | batch_size = 128 223 | 224 | start = time.time() 225 | for epoch in range(self.num_epochs): 226 | sum_dict = {} 227 | mb_states = [] 228 | mb_actions = [] 229 | mb_returns = [] 230 | 
mb_advantages = [] 231 | mb_logits = [] 232 | mb_task_inputs = [] 233 | 234 | success_rates = [] 235 | 236 | for task_index in range(len(self.training_objects)): 237 | # ROLLOUT SAMPLE 238 | #---------------------------------------------------------------------------------------------------------------------# 239 | task_states,\ 240 | task_actions,\ 241 | task_returns,\ 242 | task_advantages,\ 243 | task_logits,\ 244 | task_rewards,\ 245 | task_redundants,\ 246 | task_success_count = self._make_batch(self.sess, task_index) 247 | 248 | mb_states += task_states 249 | mb_actions += task_actions 250 | mb_advantages += task_advantages 251 | mb_returns += task_returns 252 | mb_logits += task_logits 253 | 254 | if self.arguments['embed']: 255 | mb_task_inputs += [self.embeddings[self.training_objects[task_index]].tolist()] * len(task_states) 256 | else: 257 | mb_task_inputs += [self.embeddings[task_index].tolist()] * len(task_states) 258 | 259 | success_rates.append(round(task_success_count / self.num_episodes, 3)) 260 | 261 | assert len(task_states) == len(task_actions) == len(task_returns) == len(task_advantages) 262 | 263 | sum_dict[self.reward_logs[task_index]] = np.sum(np.concatenate(task_rewards)) / self.num_episodes 264 | sum_dict[self.success_logs[task_index]] = round(task_success_count / self.num_episodes, 3) 265 | sum_dict[self.redundant_logs[task_index]] = np.mean(task_redundants) 266 | 267 | total_samples += len(list(np.concatenate(task_rewards))) 268 | 269 | all_batch = list(zip(mb_states, mb_advantages, mb_actions, mb_returns, mb_task_inputs)) 270 | # np.random.shuffle(all_batch) 271 | mb_states, mb_advantages, mb_actions, mb_returns, mb_task_inputs = zip(*all_batch) 272 | 273 | num_batch = len(mb_states) // batch_size + 1 274 | for it in range(num_batch): 275 | right = (it + 1) * batch_size if (it + 1) * batch_size <= len(mb_states) else len(mb_states) 276 | left = right - batch_size 277 | 278 | policy_loss, value_loss, _, _ = self.PGNetwork.learn(self.sess, actor_states=mb_states[left:right], 279 | advantages=mb_advantages[left:right], actions=mb_actions[left:right], 280 | critic_states=mb_states[left:right], returns=mb_returns[left:right], 281 | task_inputs=mb_task_inputs[left:right]) 282 | 283 | #---------------------------------------------------------------------------------------------------------------------# 284 | print('[{}-{}] Time elapsed: {:.3f}, epoch {}/{}, success_rate: {}'.format(\ 285 | self.training_scene, "-".join(self.training_objects), (time.time() - start)/3600, epoch + 1, \ 286 | self.num_epochs, str(success_rates))) 287 | 288 | # WRITE TF SUMMARIES 289 | #---------------------------------------------------------------------------------------------------------------------# 290 | summary = self.sess.run(self.write_op, feed_dict = sum_dict) 291 | 292 | self.writer.add_summary(summary, total_samples) 293 | self.writer.flush() 294 | #---------------------------------------------------------------------------------------------------------------------# 295 | 296 | self.saver.save(self.sess, self.log_folder + "/my-model") 297 | self.sess.close() 298 | # SAVE MODEL 299 | #---------------------------------------------------------------------------------------------------------------------# 300 | with open(self.log_folder + '/arguments.json', 'w') as outfile: 301 | json.dump(self.arguments, outfile) 302 | 303 | print("\nElapsed time: {}".format((time.time() - start)/3600)) 304 | 
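# Typical usage of this class (mirrors tf_a2c/main.py, mode 2): build the policy with a
# scene and a list of target objects, then call train(). The scene/object names below are
# placeholders; `arguments` is the dict of parsed CLI flags from main.py.
#
#   config = read_config("../config.json")
#   agent = MultiTaskPolicy("FloorPlan1", ["<object_A>", "<object_B>"], config, arguments)
#   agent.train()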
#---------------------------------------------------------------------------------------------------------------------# -------------------------------------------------------------------------------- /tf_a2c/rollout.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from rollout_thread import RolloutThread 3 | 4 | class Rollout(object): 5 | 6 | def __init__( 7 | self, 8 | training_scene, 9 | training_object, 10 | config, 11 | arguments, 12 | embedding=None): 13 | 14 | self.config = config 15 | self.arguments = arguments 16 | self.embedding = embedding 17 | 18 | self.num_episodes = arguments.get('num_episodes') 19 | 20 | self.training_scene = training_scene 21 | self.training_object = training_object 22 | 23 | self.states, self.pis, self.actions, self.rewards, self.values, self.last_values, self.redundants = \ 24 | [self.holder_factory(self.num_episodes) for i in range(7)] 25 | 26 | def _rollout_process(self, index, sess, policy, return_state_ids): 27 | thread_rollout = RolloutThread(sess=sess, scene=self.training_scene, target=self.training_object, 28 | policy=policy, embedding=self.embedding, 29 | config=self.config, arguments=self.arguments) 30 | 31 | ep_states, ep_logits, ep_actions, ep_rewards, ep_values, ep_last_value, ep_redundant = thread_rollout.rollout(return_state_ids) 32 | 33 | self.states[index] = ep_states 34 | self.pis[index] = ep_logits 35 | self.actions[index] = ep_actions 36 | self.rewards[index] = ep_rewards 37 | self.values[index] = ep_values 38 | self.last_values[index] = ep_last_value 39 | self.redundants[index] = ep_redundant 40 | 41 | def holder_factory(self, num_episodes): 42 | return [[] for j in range(num_episodes)] 43 | 44 | def rollout_batch(self, sess, policy, return_state_ids=False): 45 | self.states, self.pis, self.actions, self.rewards, self.values, self.last_values, self.redundants = \ 46 | [self.holder_factory(self.num_episodes) for i in range(7)] 47 | 48 | train_threads = [] 49 | 50 | for i in range(self.num_episodes): 51 | train_threads.append(threading.Thread(target=self._rollout_process, args=(i, sess, policy, return_state_ids))) 52 | 53 | # start each training thread 54 | for t in train_threads: 55 | t.start() 56 | 57 | # wait for all threads to finish 58 | for t in train_threads: 59 | t.join() 60 | 61 | return self.states, self.pis, self.actions, self.rewards, self.values, self.last_values, self.redundants -------------------------------------------------------------------------------- /tf_a2c/rollout_thread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | sys.path.append('..') # to access env package 5 | from env.ai2thor_env import AI2ThorDumpEnv 6 | from utils import noise_and_argmax 7 | 8 | class RolloutThread(object): 9 | 10 | def __init__( 11 | self, 12 | sess, 13 | scene, 14 | target, 15 | policy, 16 | embedding, 17 | config, 18 | arguments): 19 | 20 | self.sess = sess 21 | self.noise_argmax = arguments.get('noise_argmax') 22 | self.num_iters = arguments.get('num_iters') 23 | 24 | self.policy = policy 25 | self.env = AI2ThorDumpEnv(scene, target, config, arguments) 26 | 27 | self.embedding = embedding 28 | if embedding is not None: 29 | self.task_input = embedding 30 | 31 | def rollout(self, return_state_ids=False): 32 | states, pis, actions, rewards, values, last_value = [], [], [], [], [], [] 33 | 34 | state, score, target = self.env.reset() 35 | start = self.env.current_state_id 36 | step = 0 37 | 38 | 
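# `noise_and_argmax` (imported from tf_a2c/utils.py, whose source is not shown in this dump)
# is used below when the `noise_argmax` flag is set. One common way to implement such noisy
# argmax sampling is the Gumbel-max trick; a self-contained sketch under that assumption --
# the actual helper may differ:
def _gumbel_argmax_sketch(logits):
    # argmax(logits + Gumbel noise) is distributed as a sample from softmax(logits)
    noisy = np.asarray(logits, dtype=np.float64) + np.random.gumbel(size=len(logits))
    return int(np.argmax(noisy))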
while True: 39 | if self.embedding is not None: 40 | logit, p, v = self.sess.run( 41 | [self.policy.actor.logits, self.policy.actor.pi, self.policy.critic.value], 42 | feed_dict={ 43 | self.policy.actor.inputs: [state], 44 | self.policy.actor.task_input: [self.task_input], 45 | self.policy.critic.task_input: [self.task_input], 46 | self.policy.critic.inputs: [state] 47 | }) 48 | else: 49 | logit, p, v = self.sess.run( 50 | [self.policy.actor.logits, self.policy.actor.pi, self.policy.critic.value], 51 | feed_dict={ 52 | self.policy.actor.inputs: [state], 53 | self.policy.critic.inputs: [state] 54 | }) 55 | 56 | if self.noise_argmax: 57 | action = noise_and_argmax(logit.ravel().tolist()) 58 | else: 59 | pi = p.ravel().tolist() 60 | action = np.random.choice(range(len(pi)), p = np.array(pi)/ np.sum(pi)) # select action w.r.t the actions prob 61 | 62 | if return_state_ids: 63 | states.append(self.env.current_state_id) 64 | else: 65 | states.append(state) 66 | 67 | next_state, score, reward, done = self.env.step(action) 68 | 69 | # Store results 70 | pis.append(p.ravel().tolist()) 71 | actions.append(action) 72 | rewards.append(reward) 73 | values.append(v) 74 | 75 | state = next_state 76 | 77 | step += 1 78 | 79 | if done or step > self.num_iters: 80 | break 81 | 82 | if not done: 83 | if self.embedding is not None: 84 | last_value = self.sess.run( 85 | self.policy.critic.value, 86 | feed_dict={ 87 | self.policy.critic.inputs: [state], 88 | self.policy.critic.task_input: [self.task_input] 89 | })[0][0] 90 | else: 91 | last_value = self.sess.run( 92 | self.policy.critic.value, 93 | feed_dict={ 94 | self.policy.critic.inputs: [state] 95 | })[0][0] 96 | else: 97 | last_value = None 98 | 99 | end = self.env.current_state_id 100 | 101 | try: 102 | redundants = [] 103 | for target_id in self.env.target_ids: 104 | redundants.append(step + self.env.shortest[end, target_id] - self.env.shortest[start, target_id]) 105 | except AttributeError: 106 | redundants = [0] 107 | 108 | return states, pis, actions, rewards, values, last_value, min(redundants) -------------------------------------------------------------------------------- /tf_a2c/sharing_polices.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import sys 5 | import json 6 | import time 7 | import h5py 8 | import pickle 9 | import sys 10 | 11 | sys.path.append('..') # to access env package 12 | 13 | from datetime import datetime 14 | from model import * 15 | from rollout import Rollout 16 | from env.ai2thor_env import AI2ThorDumpEnv 17 | 18 | class SharingPolicy(object): 19 | 20 | def __init__( 21 | self, 22 | training_scene, 23 | training_objects, 24 | config, 25 | arguments 26 | ): 27 | 28 | self.config = config 29 | self.arguments = arguments 30 | 31 | self.training_scene = training_scene 32 | self.training_objects = training_objects 33 | 34 | self.use_gae = arguments.get('use_gae') 35 | self.num_epochs = arguments.get('num_epochs') 36 | self.num_episodes = arguments.get('num_episodes') 37 | self.num_iters = arguments.get('num_iters') 38 | self.gamma = arguments.get('gamma') 39 | self.lamb = arguments.get('lamb') 40 | self.lr = arguments.get('lr') 41 | self.joint_loss = arguments.get('joint_loss') 42 | self.ec = arguments.get('ec') 43 | self.vc = arguments.get('vc') 44 | self.max_grad_norm = arguments.get('max_gradient_norm') 45 | self.dropout = arguments.get('dropout') 46 | self.decay = arguments.get('decay') 47 | self.reuse = 
arguments.get('share_latent') 48 | self.gpu_fraction = arguments.get('gpu_fraction') 49 | 50 | assert len(training_objects) == 2, "> 2 sharing agents are not supported yet." 51 | self.env = AI2ThorDumpEnv(training_scene, training_objects[0], config, arguments) 52 | 53 | sharing = self.env.h5_file["_".join(training_objects)][()].tolist() 54 | non_sharing = list(set(list(range(self.env.h5_file['locations'].shape[0]))) - set(sharing)) 55 | 56 | self.sharing = dict(zip(sharing + non_sharing, [1] * len(sharing) + [0] * len(non_sharing))) 57 | 58 | self.rollouts = [] 59 | for obj in training_objects: 60 | self.rollouts.append(Rollout(training_scene, obj, config, arguments)) 61 | 62 | tf.reset_default_graph() 63 | 64 | self.PGNetworks = [] 65 | for i in range(2): 66 | agent = A2C(name='A2C_' + str(i), 67 | state_size=self.env.features.shape[1], 68 | action_size=self.env.action_space, 69 | history_size=arguments['history_size'], 70 | embedding_size=-1 if arguments['mode'] != 2 else 300, 71 | entropy_coeff=self.ec, 72 | value_function_coeff=self.vc, 73 | max_gradient_norm=self.max_grad_norm, 74 | dropout=self.dropout, 75 | joint_loss=self.joint_loss, 76 | learning_rate=self.lr, 77 | decay=self.decay, 78 | reuse=bool(self.reuse) 79 | ) 80 | 81 | 82 | if self.decay: 83 | agent.set_lr_decay(self.lr, self.num_epochs * self.num_episodes * self.num_iters) 84 | 85 | 86 | print("\nInitialized network with {} trainable weights.".format(len(agent.find_trainable_variables('A2C_' + str(i), True)))) 87 | self.PGNetworks.append(agent) 88 | 89 | 90 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.gpu_fraction) 91 | 92 | self.sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options)) 93 | self.sess.run(tf.global_variables_initializer()) 94 | 95 | self.saver = tf.train.Saver() 96 | 97 | timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), \ 98 | training_scene, "_".join(training_objects)) 99 | self.log_folder = os.path.join(arguments.get('logging'), timer) 100 | self.writer = tf.summary.FileWriter(self.log_folder) 101 | 102 | self.timer = timer 103 | 104 | test_name = training_scene 105 | for i in range(len(training_objects)): 106 | tf.summary.scalar(test_name + "/" + training_objects[i] + "/rewards", self.PGNetworks[i].mean_reward) 107 | tf.summary.scalar(test_name + "/" + training_objects[i] + "/success_rate", self.PGNetworks[i].success_rate) 108 | tf.summary.scalar(test_name + "/" + training_objects[i] + "/redundants", self.PGNetworks[i].mean_redundant) 109 | 110 | self.write_op = tf.summary.merge_all() 111 | 112 | def discount_with_dones(self, rewards, dones, gamma): 113 | discounted = [] 114 | r = 0 115 | # Start from downwards to upwards like Bellman backup operation. 116 | for reward, done in zip(rewards[::-1], dones[::-1]): 117 | r = reward + gamma * r * (1. 
- done) # fixed off by one bug 118 | discounted.append(r) 119 | return discounted[::-1] 120 | 121 | def generalized_advantage_estimate(self, rewards, dones, values, last_value, gamma, lamb): 122 | advantages = np.zeros_like(rewards) 123 | lastgaelam = 0 124 | 125 | # From last step to first step 126 | for t in reversed(range(len(rewards))): 127 | # If t == before last step 128 | if t == len(rewards) - 1: 129 | # If a state is done, nextnonterminal = 0 130 | # In fact nextnonterminal allows us to do that logic 131 | 132 | #if done (so nextnonterminal = 0): 133 | # delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 134 | # else (not done) 135 | #delta = R + gamma * V(st+1) 136 | nextnonterminal = 1.0 - dones[-1] 137 | 138 | # V(t+1) 139 | nextvalue = last_value 140 | else: 141 | nextnonterminal = 1.0 - dones[t] 142 | 143 | nextvalue = values[t+1] 144 | 145 | # Delta = R(t) + gamma * V(t+1) * nextnonterminal - V(t) 146 | delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t] 147 | 148 | # Advantage = delta + gamma * (lambda) * nextnonterminal * lastgaelam 149 | advantages[t] = lastgaelam = delta + gamma * lamb * nextnonterminal * lastgaelam 150 | 151 | # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 152 | return list(advantages) 153 | 154 | def _make_batch(self, sess): 155 | ''' 156 | states = [ 157 | [---episode_1---],...,[---episode_n---] 158 | ] 159 | same as actions, tasks, rewards, values, dones 160 | 161 | last_values = [ 162 | episode_1, ...., episode_n] 163 | ] 164 | same as redundants 165 | ''' 166 | start = time.time() 167 | 168 | task_states, task_pis, task_actions, task_returns, task_advantages, tasks = [], [], [], [], [], [] 169 | 170 | task_sc, task_rws, task_rdds = [], [], [] 171 | 172 | for i in range(2): 173 | states, pis, actions, rewards, values, last_values, redundants = self.rollouts[i].rollout_batch(sess, self.PGNetworks[i], return_state_ids=True) 174 | 175 | success_count = 0 176 | returns = [] 177 | advantages = [] 178 | 179 | for ep_idx, (ep_rewards, ep_states) in enumerate(zip(rewards, states)): 180 | assert len(ep_rewards) == len(ep_states) 181 | ep_dones = list(np.zeros_like(ep_rewards)) 182 | 183 | if ep_rewards[-1] != self.config['success_reward']: 184 | last_value = last_values[ep_idx] 185 | assert last_value is not None 186 | ep_returns = self.discount_with_dones(ep_rewards + [last_value], ep_dones+[0], self.gamma)[:-1] 187 | else: 188 | success_count += 1 189 | last_value = 0 190 | ep_dones[-1] = 1 191 | ep_returns = self.discount_with_dones(ep_rewards, ep_dones, self.gamma) 192 | 193 | returns += ep_returns 194 | ep_values = values[ep_idx] 195 | 196 | if not self.use_gae: 197 | # Here we calculate advantage A(s,a) = R + yV(s') - V(s) 198 | # rewards = R + yV(s') 199 | advantages += list((np.array(ep_returns) - np.array(ep_values)).astype(np.float32)) 200 | 201 | else: 202 | advantages += self.generalized_advantage_estimate(ep_rewards, ep_dones, ep_values, last_value, self.gamma, self.lamb) 203 | 204 | task_states += list(np.concatenate(states)) 205 | task_pis += list(np.concatenate(pis)) 206 | task_actions += list(np.concatenate(actions)) 207 | task_returns += returns 208 | task_advantages += advantages 209 | tasks += [i] * len(returns) 210 | 211 | task_sc.append(success_count) 212 | task_rws.append(rewards) 213 | task_rdds.append(redundants) 214 | 215 | mean_policy = {} 216 | policies = {} 217 | for (s, a, pi, t) in zip(task_states, task_actions, task_pis, tasks): 218 | if self.sharing[s]: 
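                # Descriptive note: for states reachable in both tasks (the
                # "sharing" states), mean_policy[s, a] collects every observed
                # probability of taking action a in state s (averaged over both
                # tasks below), while policies[s, t] collects task t's full action
                # distribution at s. Their ratio
                # policies[s, 1 - t][a] / mean_policy[s, a] is the importance
                # weight used further down to let the other task reuse this
                # sample's advantage, clipped to [0.8, 1.2] in PPO style.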
219 | try: 220 | mean_policy[s, a].append(pi[a]) 221 | except KeyError: 222 | mean_policy[s, a] = [pi[a]] 223 | 224 | try: 225 | policies[s, t].append(pi) 226 | except KeyError: 227 | policies[s, t] = [pi] 228 | 229 | 230 | 231 | for k in mean_policy.keys(): 232 | mean_policy[k] = np.mean(mean_policy[k]) 233 | 234 | for k in policies.keys(): 235 | policies[k] = np.mean(policies[k], 0) 236 | 237 | batch_ss, batch_as, batch_ads, batch_rs = [], [], [], [] 238 | share_ss, share_as, share_ads = [], [], [] 239 | for task_index in range(2): 240 | batch_ss.append([]) 241 | batch_as.append([]) 242 | batch_ads.append([]) 243 | batch_rs.append([]) 244 | 245 | share_ss.append([]) 246 | share_as.append([]) 247 | share_ads.append([]) 248 | 249 | for s, a, pi, r, ad, t in zip(task_states, task_actions, task_pis, task_returns, task_advantages, tasks): 250 | observation = self.env.state(s).reshape(1, -1).tolist() 251 | 252 | batch_ss[t].append(observation) 253 | batch_as[t].append(self.env.cv_action_onehot[a]) 254 | batch_rs[t].append(r) 255 | 256 | if self.sharing[s]: 257 | batch_ads[t].append(ad * policies[s, t][a] / mean_policy[s, a]) 258 | try: 259 | importance_weight = policies[s, 1 - t][a] / mean_policy[s, a] 260 | 261 | if importance_weight > 1.2: 262 | clipped_iw = 1.2 263 | elif importance_weight < 0.8: 264 | clipped_iw = 0.8 265 | else: 266 | clipped_iw = importance_weight 267 | 268 | if clipped_iw * ad < importance_weight * ad: 269 | share_ads[1 - t].append(clipped_iw * ad) 270 | else: 271 | share_ads[1 - t].append(importance_weight * ad) 272 | 273 | 274 | share_ss[1 - t].append(observation) 275 | share_as[1 - t].append(self.env.cv_action_onehot[a]) 276 | except KeyError: 277 | pass 278 | else: 279 | batch_ads[t].append(ad) 280 | 281 | 282 | return batch_ss,\ 283 | batch_as,\ 284 | batch_rs,\ 285 | batch_ads,\ 286 | share_ss,\ 287 | share_as,\ 288 | share_ads,\ 289 | task_rws,\ 290 | task_rdds,\ 291 | task_sc 292 | 293 | def train(self): 294 | total_samples = [0, 0] 295 | errors = 0 296 | 297 | start = time.time() 298 | for epoch in range(self.num_epochs): 299 | 300 | batch_ss,\ 301 | batch_as,\ 302 | batch_rs,\ 303 | batch_ads,\ 304 | share_ss,\ 305 | share_as,\ 306 | share_ads,\ 307 | rewards,\ 308 | redundants,\ 309 | task_sc = self._make_batch(self.sess) 310 | #---------------------------------------------------------------------------------------------------------------------# 311 | print('[{}-{}] Time elapsed: {:.3f}, epoch {}/{}, success_rate: {:.3f}'.format(\ 312 | self.training_scene, self.training_objects, (time.time() - start)/3600, epoch + 1, \ 313 | self.num_epochs, np.mean(task_sc) / self.num_episodes)) 314 | 315 | sum_dict = {} 316 | assert len(batch_ss) == len(batch_as) == len(batch_rs) == len(batch_ads) 317 | assert len(share_ss) == len(share_as) == len(share_ads) 318 | 319 | for i in range(2): 320 | policy_loss, value_loss, _, _ = self.PGNetworks[i].learn(self.sess, actor_states=batch_ss[i] + share_ss[i], 321 | advantages=batch_ads[i] + share_ads[i], 322 | actions=batch_as[i] + share_as[i], 323 | critic_states=batch_ss[i], returns=batch_rs[i]) 324 | 325 | sum_dict[self.PGNetworks[i].mean_reward] = np.sum(np.concatenate(rewards[i])) / self.num_episodes 326 | sum_dict[self.PGNetworks[i].success_rate] = task_sc[i] / self.num_episodes 327 | sum_dict[self.PGNetworks[i].mean_redundant] = np.mean(redundants[i]) 328 | 329 | total_samples[i] += len(list(np.concatenate(rewards[i]))) 330 | 331 | 
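        # Illustrative sketch, not part of the original module: the cross-task
        # advantage sharing built in _make_batch above, reduced to a standalone
        # helper. The function name and arguments are hypothetical; the 0.8/1.2
        # bounds mirror the hard-coded clipping above, and the final min() keeps
        # the more conservative of the clipped and unclipped estimates, as in
        # PPO's clipped objective.
        import numpy as np

        def shared_advantage(adv, pi_other_a, pi_mean_a, low=0.8, high=1.2):
            """Importance-weight an advantage so the other task can reuse it."""
            iw = pi_other_a / pi_mean_a           # pi_other(a|s) / mean pi(a|s)
            clipped_iw = float(np.clip(iw, low, high))
            return min(clipped_iw * adv, iw * adv)

        # e.g. shared_advantage(1.5, pi_other_a=0.6, pi_mean_a=0.4)
        # -> iw = 1.5, clipped to 1.2, result = min(1.8, 2.25) = 1.8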
#---------------------------------------------------------------------------------------------------------------------# 332 | 333 | 334 | # WRITE TF SUMMARIES 335 | #---------------------------------------------------------------------------------------------------------------------# 336 | summary = self.sess.run(self.write_op, feed_dict = sum_dict) 337 | 338 | self.writer.add_summary(summary, np.mean(total_samples)) 339 | self.writer.flush() 340 | #---------------------------------------------------------------------------------------------------------------------# 341 | 342 | self.saver.save(self.sess, self.log_folder + "/my-model") 343 | self.sess.close() 344 | # SAVE MODEL 345 | #---------------------------------------------------------------------------------------------------------------------# 346 | with open(self.log_folder + '/arguments.json', 'w') as outfile: 347 | json.dump(self.arguments, outfile) 348 | 349 | print("\nElapsed time: {}".format((time.time() - start)/3600)) 350 | #---------------------------------------------------------------------------------------------------------------------# -------------------------------------------------------------------------------- /tf_a2c/single_task.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import sys 5 | import json 6 | import time 7 | import h5py 8 | import sys 9 | 10 | sys.path.append('..') # to access env package 11 | 12 | from datetime import datetime 13 | from model import * 14 | from rollout import Rollout 15 | from env.ai2thor_env import AI2ThorDumpEnv 16 | 17 | class SingleTaskPolicy(object): 18 | 19 | def __init__( 20 | self, 21 | training_scene, 22 | training_object, 23 | config, 24 | arguments 25 | ): 26 | 27 | self.config = config 28 | self.arguments = arguments 29 | 30 | self.training_scene = training_scene 31 | self.training_object = training_object 32 | 33 | self.use_gae = arguments.get('use_gae') 34 | self.num_epochs = arguments.get('num_epochs') 35 | self.num_episodes = arguments.get('num_episodes') 36 | self.num_iters = arguments.get('num_iters') 37 | self.gamma = arguments.get('gamma') 38 | self.lamb = arguments.get('lamb') 39 | self.lr = arguments.get('lr') 40 | self.joint_loss = arguments.get('joint_loss') 41 | self.ec = arguments.get('ec') 42 | self.vc = arguments.get('vc') 43 | self.max_grad_norm = arguments.get('max_gradient_norm') 44 | self.dropout = arguments.get('dropout') 45 | self.decay = arguments.get('decay') 46 | self.reuse = arguments.get('share_latent') 47 | self.gpu_fraction = arguments.get('gpu_fraction') 48 | 49 | self.env = AI2ThorDumpEnv(training_scene, training_object, config, arguments) 50 | self.rollout = Rollout(training_scene, training_object, config, arguments) 51 | 52 | tf.reset_default_graph() 53 | 54 | self.PGNetwork = A2C(name='A2C', 55 | state_size=self.env.features.shape[1], 56 | action_size=self.env.action_space, 57 | embedding_size=-1 if arguments['mode'] != 2 else 300, 58 | history_size=arguments['history_size'], 59 | entropy_coeff=self.ec, 60 | value_function_coeff=self.vc, 61 | max_gradient_norm=self.max_grad_norm, 62 | dropout=self.dropout, 63 | joint_loss=self.joint_loss, 64 | learning_rate=self.lr, 65 | decay=self.decay, 66 | reuse=bool(self.reuse) 67 | ) 68 | 69 | if self.decay: 70 | self.PGNetwork.set_lr_decay(self.lr, self.num_epochs * self.num_episodes * self.num_iters) 71 | 72 | print("\nInitialized network with {} trainable 
weights.".format(len(self.PGNetwork.find_trainable_variables('A2C', True)))) 73 | 74 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.gpu_fraction) 75 | 76 | self.sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options)) 77 | self.sess.run(tf.global_variables_initializer()) 78 | 79 | self.saver = tf.train.Saver() 80 | 81 | timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), training_scene, training_object) 82 | self.log_folder = os.path.join(arguments.get('logging'), timer) 83 | self.writer = tf.summary.FileWriter(self.log_folder) 84 | 85 | self.timer = timer 86 | 87 | test_name = training_scene 88 | tf.summary.scalar(test_name + "/" + training_object + "/rewards", self.PGNetwork.mean_reward) 89 | tf.summary.scalar(test_name + "/" + training_object + "/success_rate", self.PGNetwork.success_rate) 90 | tf.summary.scalar(test_name + "/" + training_object + "/redundants", self.PGNetwork.mean_redundant) 91 | 92 | self.write_op = tf.summary.merge_all() 93 | 94 | def discount_with_dones(self, rewards, dones, gamma): 95 | discounted = [] 96 | r = 0 97 | # Start from downwards to upwards like Bellman backup operation. 98 | for reward, done in zip(rewards[::-1], dones[::-1]): 99 | r = reward + gamma * r * (1. - done) # fixed off by one bug 100 | discounted.append(r) 101 | return discounted[::-1] 102 | 103 | def generalized_advantage_estimate(self, rewards, dones, values, last_value, gamma, lamb): 104 | advantages = np.zeros_like(rewards) 105 | lastgaelam = 0 106 | 107 | # From last step to first step 108 | for t in reversed(range(len(rewards))): 109 | # If t == before last step 110 | if t == len(rewards) - 1: 111 | # If a state is done, nextnonterminal = 0 112 | # In fact nextnonterminal allows us to do that logic 113 | 114 | #if done (so nextnonterminal = 0): 115 | # delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 116 | # else (not done) 117 | #delta = R + gamma * V(st+1) 118 | nextnonterminal = 1.0 - dones[-1] 119 | 120 | # V(t+1) 121 | nextvalue = last_value 122 | else: 123 | nextnonterminal = 1.0 - dones[t] 124 | 125 | nextvalue = values[t+1] 126 | 127 | # Delta = R(t) + gamma * V(t+1) * nextnonterminal - V(t) 128 | delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t] 129 | 130 | # Advantage = delta + gamma * (lambda) * nextnonterminal * lastgaelam 131 | advantages[t] = lastgaelam = delta + gamma * lamb * nextnonterminal * lastgaelam 132 | 133 | # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 134 | return list(advantages) 135 | 136 | def _make_batch(self, sess): 137 | ''' 138 | states = [ 139 | [---episode_1---],...,[---episode_n---] 140 | ] 141 | same as actions, tasks, rewards, values, dones 142 | 143 | last_values = [ 144 | episode_1, ...., episode_n] 145 | ] 146 | same as redundants 147 | ''' 148 | states, task_logits, actions, rewards, values, last_values, redundants = self.rollout.rollout_batch(sess, self.PGNetwork) 149 | 150 | observations = [] 151 | converted_actions = [] 152 | logits = [] 153 | success_count = 0 154 | 155 | for ep_idx, ep_states in enumerate(states): 156 | observations += [s.tolist() for s in ep_states] 157 | converted_actions += [self.env.cv_action_onehot[a] for a in actions[ep_idx]] 158 | logits += task_logits[ep_idx] 159 | 160 | returns = [] 161 | advantages = [] 162 | 163 | for ep_idx, (ep_rewards, ep_states) in enumerate(zip(rewards, states)): 164 | assert len(ep_rewards) == len(ep_states) 165 | ep_dones = 
list(np.zeros_like(ep_rewards)) 166 | 167 | if ep_rewards[-1] != self.config['success_reward']: 168 | last_value = last_values[ep_idx] 169 | assert last_value is not None 170 | ep_returns = self.discount_with_dones(ep_rewards + [last_value], ep_dones+[0], self.gamma)[:-1] 171 | else: 172 | success_count += 1 173 | last_value = 0 174 | ep_dones[-1] = 1 175 | ep_returns = self.discount_with_dones(ep_rewards, ep_dones, self.gamma) 176 | 177 | returns += ep_returns 178 | ep_values = values[ep_idx] 179 | 180 | if not self.use_gae: 181 | # Here we calculate advantage A(s,a) = R + yV(s') - V(s) 182 | # rewards = R + yV(s') 183 | advantages += list((np.array(ep_returns) - np.array(ep_values)).astype(np.float32)) 184 | 185 | else: 186 | advantages += self.generalized_advantage_estimate(ep_rewards, ep_dones, ep_values, last_value, self.gamma, self.lamb) 187 | 188 | return observations,\ 189 | converted_actions,\ 190 | returns,\ 191 | advantages,\ 192 | logits,\ 193 | rewards,\ 194 | redundants,\ 195 | success_count 196 | 197 | def train(self): 198 | total_samples = 0 199 | errors = 0 200 | 201 | start = time.time() 202 | for epoch in range(self.num_epochs): 203 | # sys.stdout.flush() 204 | 205 | # ROLLOUT SAMPLE 206 | #---------------------------------------------------------------------------------------------------------------------# 207 | mb_states,\ 208 | mb_actions,\ 209 | mb_returns,\ 210 | mb_advantages,\ 211 | mb_logits,\ 212 | rewards,\ 213 | redundants,\ 214 | success_count = self._make_batch(self.sess) 215 | 216 | if len(np.asarray(mb_returns).shape) == 2: 217 | print("Error happened!") 218 | if not os.path.isdir(os.path.join("errors", self.timer)): 219 | os.mkdir(os.path.join("errors", self.timer)) 220 | 221 | f = h5py.File(os.path.join("errors", self.timer, "{}.hdf5".format(errors)), 'w') 222 | f.create_dataset("states", data=np.asarray(mb_states, np.float32)) 223 | f.create_dataset("actions", data=np.asarray(mb_actions, np.float32)) 224 | f.create_dataset("returns", data=np.asarray(mb_returns, np.float32)) 225 | f.create_dataset("advantages", data=np.asarray(mb_advantages, np.float32)) 226 | f.create_dataset("logits", data=np.asarray(mb_logits, np.float32)) 227 | f.create_dataset("rewards", data=np.asarray(rewards, np.float32)) 228 | f.close() 229 | 230 | errors += 1 231 | print("=======\n") 232 | 233 | mb_returns = [r[0] for r in mb_returns] 234 | #---------------------------------------------------------------------------------------------------------------------# 235 | print('[{}-{}] Time elapsed: {:.3f}, epoch {}/{}, success_rate: {:.3f}'.format(\ 236 | self.training_scene, self.training_object, (time.time() - start)/3600, epoch + 1, self.num_epochs, success_count / self.num_episodes)) 237 | 238 | sum_dict = {} 239 | assert len(mb_states) == len(mb_actions) == len(mb_returns) == len(mb_advantages) 240 | 241 | policy_loss, value_loss, _, _ = self.PGNetwork.learn(self.sess, actor_states=mb_states, 242 | advantages=mb_advantages, actions=mb_actions, 243 | critic_states=mb_states, returns=mb_returns) 244 | 245 | sum_dict[self.PGNetwork.mean_reward] = np.sum(np.concatenate(rewards)) / len(rewards) 246 | sum_dict[self.PGNetwork.success_rate] = success_count / self.num_episodes 247 | sum_dict[self.PGNetwork.mean_redundant] = np.mean(redundants) 248 | 249 | total_samples += len(list(np.concatenate(rewards))) 250 | 251 | #---------------------------------------------------------------------------------------------------------------------# 252 | 253 | 254 | # WRITE TF SUMMARIES 255 | 
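        # Illustrative sketch, not part of the original module: a toy check of how
        # _make_batch turns episode rewards into returns via discount_with_dones.
        # The numbers and gamma = 0.9 are hypothetical, chosen only for the example.
        def _discount_with_dones(rewards, dones, gamma):
            discounted, r = [], 0.0
            for reward, done in zip(rewards[::-1], dones[::-1]):
                r = reward + gamma * r * (1. - done)
                discounted.append(r)
            return discounted[::-1]

        # Successful episode: the last reward is the success reward, the final
        # step is marked done and nothing is bootstrapped.
        _discount_with_dones([0.0, 0.0, 10.0], [0, 0, 1], 0.9)      # ~ [8.1, 9.0, 10.0]

        # Timed-out episode: the critic's value of the last state is appended as a
        # pseudo-reward and the extra entry is dropped again, matching the
        # `ep_rewards + [last_value]` branch above.
        _discount_with_dones([0.0, 0.0, 2.0], [0, 0, 0], 0.9)[:-1]  # ~ [1.62, 1.8]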
#---------------------------------------------------------------------------------------------------------------------# 256 | summary = self.sess.run(self.write_op, feed_dict = sum_dict) 257 | 258 | self.writer.add_summary(summary, total_samples) 259 | self.writer.flush() 260 | #---------------------------------------------------------------------------------------------------------------------# 261 | 262 | self.saver.save(self.sess, self.log_folder + "/my-model") 263 | self.sess.close() 264 | # SAVE MODEL 265 | #---------------------------------------------------------------------------------------------------------------------# 266 | with open(self.log_folder + '/arguments.json', 'w') as outfile: 267 | json.dump(self.arguments, outfile) 268 | 269 | print("\nElapsed time: {}".format((time.time() - start)/3600)) 270 | #---------------------------------------------------------------------------------------------------------------------# -------------------------------------------------------------------------------- /tf_a2c/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def mse(predicted, ground_truth): 5 | # Mean-squared error 6 | return tf.square(predicted - ground_truth) / 2. 7 | 8 | def noise_and_argmax(logits): 9 | logits = np.asarray(logits, dtype = np.float32) 10 | # Add noise then take the argmax 11 | noise = np.random.uniform(0, 1, logits.shape) 12 | 13 | return np.argmax(logits - np.log(-np.log(noise))) 14 | 15 | def openai_entropy(logits): 16 | # Entropy proposed by OpenAI in their A2C baseline 17 | a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) 18 | ea0 = tf.exp(a0) 19 | z0 = tf.reduce_sum(ea0, 1, keep_dims=True) 20 | p0 = ea0 / z0 21 | return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1) 22 | 23 | class LearningRateDecay(object): 24 | def __init__(self, v, nvalues, lr_decay_method): 25 | self.n = 0. 26 | self.v = v 27 | self.nvalues = nvalues 28 | 29 | def constant(p): 30 | return 1 31 | 32 | def linear(p): 33 | return 1 - p 34 | 35 | lr_decay_methods = { 36 | 'linear': linear, 37 | 'constant': constant 38 | } 39 | 40 | self.decay = lr_decay_methods[lr_decay_method] 41 | 42 | def value(self): 43 | current_value = self.v * self.decay(self.n / self.nvalues) 44 | self.n += 1. 
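        # Descriptive note: n counts calls to value(); with the 'linear' schedule
        # the returned rate anneals from v toward 0 over nvalues calls, while
        # 'constant' leaves it unchanged. The trainers call set_lr_decay() with
        # num_epochs * num_episodes * num_iters, which presumably ends up here as
        # nvalues.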
45 | return current_value 46 | 47 | def get_value_for_steps(self, steps): 48 | return self.v * self.decay(steps / self.nvalues) -------------------------------------------------------------------------------- /tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/tsne.png -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | 7 | def foo_all(folder, smooth=5): 8 | chosen_cols = ['rewards', 'success_rate'] 9 | fig, axes = plt.subplots(nrows= 1, ncols=len(chosen_cols)) 10 | lines = [] 11 | labels = [] 12 | for ax, t in zip(axes, chosen_cols): 13 | ax.set_title(t if t != 'redundants' else 'redundant steps') 14 | files = [os.path.join(folder, f) for f in os.listdir(folder) if t in f] 15 | files = sorted(files, key=lambda x: x.split('/')[-1].split('_')[0]) 16 | for i, f in enumerate(files): 17 | labels.append(f.split("/")[-1].split('_')[0]) 18 | log = pd.read_csv(f) 19 | 20 | avg = log['Value'].tolist() 21 | smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))] 22 | 23 | ax.xaxis.set_major_locator(plt.MaxNLocator(4)) 24 | ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: format(x / 1e6, ',') + ' M')) 25 | 26 | line = ax.plot(log['Step'].tolist(), smoothed_y, c='C{}'.format(i)) 27 | ax.plot(log['Step'].tolist(), avg, c='C{}'.format(i), alpha=0.3) 28 | lines.append(line[0]) 29 | 30 | fig.set_size_inches(12, 4) 31 | 32 | # leg = fig.legend(lines[::2], ["{} {}".format(*folder.split('/')[-1].split('_')) for folder in folders], loc = 8, ncol = 2, bbox_to_anchor = (0.50, -0.00), fontsize ='large') 33 | leg = fig.legend(lines[:len(labels)//2], labels[:len(labels)//2], loc = 8, ncol = 3, bbox_to_anchor = (0.4, -0.00), fontsize ='large') 34 | 35 | # set the linewidth of each legend object 36 | for legobj in leg.get_lines(): 37 | legobj.set_linewidth(4.0) 38 | 39 | plt.subplots_adjust(wspace = 0.1, hspace = 0.3, bottom = 0.3) 40 | plt.savefig("All " + folder.split('/')[-1] + '.png', bbox_inches='tight', dpi = 250) 41 | 42 | def foo(folders, smooth=5): 43 | chosen_cols = ['rewards', 'success_rate'] 44 | fig, axes = plt.subplots(nrows= 1, ncols=len(chosen_cols)) 45 | colors = ['C0', 'C1'] 46 | lines = [] 47 | for i, folder in enumerate(folders): 48 | for ax, t in zip(axes, chosen_cols): 49 | ax.set_title(t if t != 'redundants' else 'redundant steps') 50 | files = [os.path.join(folder, f) for f in os.listdir(folder) if t in f] 51 | logs = [] 52 | for f in files: 53 | logs.append(pd.read_csv(f)) 54 | 55 | min_size = min([l.shape[0] for l in logs]) 56 | 57 | new_logs = [] 58 | for l in logs: 59 | new_logs.append(l['Value'].tolist()[:min_size]) 60 | 61 | avg = np.mean(np.vstack(new_logs), 0).tolist() 62 | smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))] 63 | 64 | ax.xaxis.set_major_locator(plt.MaxNLocator(4)) 65 | ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: format(x / 1e6, ',') + ' M')) 66 | 67 | line = ax.plot(logs[0]['Step'].tolist(), smoothed_y, c=colors[i]) 68 | ax.plot(logs[0]['Step'].tolist(), avg, c=colors[i], alpha=0.3) 69 | lines.append(line[0]) 70 | 71 
| fig.set_size_inches(12, 4) 72 | 73 | leg = fig.legend(lines[::2], ["{} {}".format(*folder.split('/')[-1].split('_')) for folder in folders], loc = 8, ncol = 2, bbox_to_anchor = (0.42, -0.00), fontsize ='large') 74 | # leg = fig.legend(lines[::2], ["4-stacked-frames", '1-frame'], loc = 8, ncol = 2, bbox_to_anchor = (0.40, -0.00), fontsize ='large') 75 | 76 | # set the linewidth of each legend object 77 | for legobj in leg.get_lines(): 78 | legobj.set_linewidth(4.0) 79 | 80 | plt.subplots_adjust(wspace = 0.1, hspace = 0.3, bottom = 0.2) 81 | plt.savefig("Compare " + folder.split('/')[-1].split('_')[0] + '.png', bbox_inches='tight', dpi = 250) 82 | 83 | if __name__ == '__main__': 84 | foo(["/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan1_4", "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan1_6"]) 85 | # foo_all("/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan28_6") --------------------------------------------------------------------------------
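For reference, visualize.py smooths each curve with a centered moving average before plotting. A minimal standalone version of that smoothing is sketched below; the function name moving_average is not from the repository, and the window handling simply mirrors the list comprehension in foo() and foo_all(). The CSVs those functions read appear to be TensorBoard scalar exports (columns 'Step' and 'Value') collected under training-history/.

import numpy as np

def moving_average(values, smooth=5):
    # Centered window of roughly 2 * smooth samples, mirroring foo()/foo_all();
    # like the original, the slice end stops one short of the final element.
    return [
        float(np.mean(values[max(0, i - smooth):min(i + smooth, len(values) - 1)]))
        for i in range(len(values))
    ]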