├── .gitignore
├── README.md
├── build_graph
    ├── my_graph.npy
    ├── new_relations.npy
    └── process.py
├── config.json
├── download_csv.py
├── dqn
    ├── agent.py
    ├── env
    │   ├── ai2thor_env.py
    │   └── training-history
    │   │   ├── 2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine
    │   │       ├── checkpoint
    │   │       ├── events.out.tfevents.1551689861.asr02.local
    │   │       ├── my-model.data-00000-of-00001
    │   │       ├── my-model.index
    │   │       └── my-model.meta
    │   │   └── 2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine
    │   │       ├── checkpoint
    │   │       ├── events.out.tfevents.1551689885.asr02.local
    │   │       ├── my-model.data-00000-of-00001
    │   │       ├── my-model.index
    │   │       └── my-model.meta
    ├── main.py
    ├── model.py
    ├── replay_buffer.py
    └── utils.py
├── draft.py
├── dumping.py
├── embedding_fasttext300.pkl
├── embedding_onehot.pkl
├── env
    └── ai2thor_env.py
├── images
    ├── 1_GCN.png
    ├── 1_easy.png
    ├── 1_easy_noGAE.png
    ├── 1_easy_noGAE_normalizeReward.png
    ├── 1_easy_noGAE_onehot.png
    ├── 1_embed.png
    ├── 1_increaseLearningRate.png
    ├── 1_increase_entropy_penalty.png
    ├── 1_noGAE.png
    ├── 28_easy_noGAE.png
    ├── All FloorPlan1_4.png
    ├── All FloorPlan1_6.png
    ├── All FloorPlan28_4.png
    ├── All FloorPlan28_6.png
    ├── All FloorPlan2_4.png
    ├── All FloorPlan2_6.png
    ├── Compare FloorPlan1.png
    ├── Compare FloorPlan2.png
    ├── Compare FloorPlan28.png
    ├── FloorPlan1_4.png
    ├── FloorPlan1_6.png
    ├── FloorPlan28_4.png
    ├── FloorPlan28_6.png
    ├── FloorPlan2_4.png
    └── sample_AI2THOR.png
├── keyboard_agent.py
├── pytorch_a3c
    ├── LICENSE
    ├── README.md
    ├── layers.py
    ├── main.py
    ├── model.py
    ├── optimizers.py
    ├── test.py
    ├── train.py
    ├── utils.py
    └── visualize.py
├── tf_a2c
    ├── layers.py
    ├── main.py
    ├── model.py
    ├── multi_task.py
    ├── rollout.py
    ├── rollout_thread.py
    ├── sharing_polices.py
    ├── single_task.py
    └── utils.py
├── tsne.png
└── visualize.py


/.gitignore:
--------------------------------------------------------------------------------
1 | dumped/
2 | */training-history/*
3 | __pycache__
4 | */__pycache__/
5 | *.ipynb


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL-target-driven-navigation-ai2thor


--------------------------------------------------------------------------------
/build_graph/my_graph.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/build_graph/my_graph.npy


--------------------------------------------------------------------------------
/build_graph/new_relations.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/build_graph/new_relations.npy


--------------------------------------------------------------------------------
/build_graph/process.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import pickle
  3 | import numpy as np
  4 | import progressbar
  5 | 
  6 | from collections import Counter
  7 | 
  8 | def build_graph():
  9 |     data = json.load(open("relationships.json", 'rb'))
 10 |     mapping = pickle.load(open("new_mapping.pkl", "rb"))
 11 | 
 12 |     vg2idx = mapping['vg2idx']
 13 |     idx2obj = mapping['idx2obj']
 14 |     obj2idx = mapping['obj2idx']
 15 |     # rela2idx = mapping['rela2idx']
 16 | 
 17 |     cooc = {}
 18 |     cooc_pred = {}
 19 |     for i in range(105):
 20 |         for j in range(105):
 21 |             cooc[i, j] = []
 22 |             cooc_pred[i, j] = []
 23 | 
 24 |     bar = progressbar.ProgressBar()
 25 |     # invalid_predicates = []
 26 |     for i in bar(range(len(data))):
 27 |         d = data[i]
 28 |         for r in d['relationships']:
 29 |             if "name" in r['object']:
 30 |                 k = "name"
 31 |             else:
 32 |                 k = "names"
 33 |                 
 34 |             if type(r['object'][k]) == list:
 35 |                 obj = r['object'][k][0]
 36 |             else:
 37 |                 obj = r['object'][k]
 38 |             
 39 |             
 40 |             if "name" in r['subject']:
 41 |                 k = 'name'
 42 |             else:
 43 |                 k = "names"
 44 |                 
 45 |             if type(r['subject'][k]) == list:
 46 |                 sub = r['subject'][k][0]
 47 |             else:
 48 |                 sub = r['subject'][k]
 49 | 
 50 |             try:
 51 |                 objs = vg2idx[obj]
 52 |                 subs = vg2idx[sub]
 53 |             except:
 54 |                 continue
 55 | 
 56 |             for o in objs:
 57 |                 for s in subs:
 58 |                     try:
 59 |                         obj_id = obj2idx[o]
 60 |                         sub_id = obj2idx[s]
 61 |                     except:
 62 |                         continue
 63 |                     # try:
 64 |                         # cooc_pred[obj_id, sub_id].extend(rela2idx[r['predicate'].lower()])
 65 |                     # except:
 66 |                         # invalid_predicates.append(r['predicate'].lower())
 67 |                     if type(r['predicate']) == list:
 68 |                         cooc_pred[obj_id, sub_id].extend([p.lower() for p in r['predicate']])
 69 |                     else:   
 70 |                         cooc_pred[obj_id, sub_id].append(r['predicate'].lower())
 71 |                         
 72 |                     # cooc[obj_id, sub_id].append(r['relationship_id'])
 73 | 
 74 | 
 75 |     relations = np.identity(105, np.float32)
 76 |     # raw_relations = np.identity(87, np.float32)
 77 |     for k, v in cooc_pred.items():
 78 |       if len(v) > 0:
 79 |           cnt_v = Counter(v + cooc_pred[k[1], k[0]])
 80 |           freqs = np.array(list(cnt_v.values()))
 81 |           if np.sum(freqs > 3) > 0:
 82 |               relations[k[0], k[1]] = 1
 83 |               relations[k[1], k[0]] = 1
 84 | 
 85 |     # for k, v in cooc.items():
 86 |     #   if k[0] != k[1]:
 87 |     #       raw_relations[k[0], k[1]] = len(v + cooc[k[1], k[0]])
 88 |     #       raw_relations[k[1], k[0]] = len(v + cooc[k[1], k[0]])
 89 | 
 90 |     with open("new_cooc_pred.pkl", 'wb') as f:
 91 |         pickle.dump(cooc_pred, f, pickle.HIGHEST_PROTOCOL)
 92 | 
 93 |     # with open("invalid.txt", 'wb') as f:
 94 |     #   pickle.dump(invalid_predicates, f, pickle.HIGHEST_PROTOCOL)
 95 | 
 96 |     np.save("new_relations", relations)
 97 |     # np.save("raw_relations", raw_relations)
 98 | 
 99 | def lcs(X, Y, m, n): 
100 |     LCSuff = [[0 for k in range(n+1)] for l in range(m+1)] 
101 |       
102 |     # To store the length of  
103 |     # longest common substring 
104 |     result = 0 
105 |   
106 |     # Following steps to build 
107 |     # LCSuff[m+1][n+1] in bottom up fashion 
108 |     for i in range(m + 1): 
109 |         for j in range(n + 1): 
110 |             if (i == 0 or j == 0): 
111 |                 LCSuff[i][j] = 0
112 |             elif (X[i-1] == Y[j-1]): 
113 |                 LCSuff[i][j] = LCSuff[i-1][j-1] + 1
114 |                 result = max(result, LCSuff[i][j]) 
115 |             else: 
116 |                 LCSuff[i][j] = 0
117 |     return result 
118 | 
119 | def mapping_predicates():
120 |     mapping = pickle.load(open("mapping.pkl", "rb"))
121 | 
122 |     vg2idx = mapping['vg2idx']
123 |     idx2obj = mapping['idx2obj']
124 |     rela2idx = mapping['rela2idx']
125 | 
126 |     new_rela = {}
127 |     for k, v in rela2idx.items():
128 |         new_rela[k] = [v]
129 | 
130 |     known_pred = list(rela2idx.keys())
131 |     not_found = 0
132 |     invalid_predicates = pickle.load(open('invalid.txt', 'rb'))
133 | 
134 |     bar = progressbar.ProgressBar()
135 |     for i in bar(range(len(invalid_predicates))):
136 |         p = invalid_predicates[i]
137 |         new_rela[p] = []
138 |         found = 0
139 |         for kp in known_pred:
140 |             if lcs(p, kp, len(p), len(kp)) / max(len(p), len(kp)) > 0.6:
141 |                 new_rela[p].append(rela2idx[kp])
142 |                 found = 1
143 |         if found == 0:
144 |             not_found += 1
145 | 
146 |     mapping['all_rela2idx'] = new_rela
147 | 
148 |     print("{} not found.".format(not_found))
149 | 
150 |     with open("mapping.pkl", 'wb') as f:
151 |         pickle.dump(mapping, f, pickle.HIGHEST_PROTOCOL)
152 | 
153 | if __name__ == '__main__':
154 |     build_graph()
155 |     # mapping_predicates()


--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
  1 | {
  2 |    "resolution": [224, 224],
  3 |    "default_reward": -0.01,
  4 |    "success_reward": 10.0,
  5 |    "collide_reward": -0.1,
  6 | <<<<<<< HEAD
  7 |    "embeddings_onehot": "/home/tailongnguyen/thesis/RL-target-driven-navigation-ai2thor/embedding_onehot.pkl", 
  8 |    "embeddings_fasttext": "/home/tailongnguyen/thesis/RL-target-driven-navigation-ai2thor/embedding_fasttext300.pkl", 
  9 |    "dump_path": "/home/tailongnguyen/thesis/RL-target-driven-navigation-ai2thor/dumped/",
 10 |    "adj_file": "/home/tailongnguyen/thesis/build_graph/new_relations.npy",
 11 | =======
 12 |    "embeddings_onehot": "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/embedding_onehot.pkl", 
 13 |    "embeddings_fasttext": "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/embedding_fasttext300.pkl", 
 14 |    "dump_path": "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/dumped/",
 15 |    "adj_file": "/home/yoshi/thesis/build_graph/new_relations.npy",
 16 | >>>>>>> 3e16083a7dd07023980fd5cddf4ec521c2796629
 17 |    "all_objects": ["KeyChain", "Bread", "Potato", "Mug", "PaintingHanger", "Book", "ToiletPaper", "TableTop", 
 18 |                   "Chair", "Bed", "Container", "Painting", "Watch", "Apple", "Sink", "Cabinet", "StoveKnob", 
 19 |                   "Pan", "Cloth", "ShowerDoor", "CoffeeMachine", "Toaster", "Box", "CellPhone", "Tomato", 
 20 |                   "SoapBar", "HousePlant", "Bowl", "Lettuce", "ButterKnife", "Fridge", "Laptop", "Towel", 
 21 |                   "Knife", "Pen", "Plate", "SprayBottle", "Microwave", "LightSwitch", "StoveBurner", "Candle", 
 22 |                   "Pencil", "Blinds", "SoapBottle", "Lamp", "TowelHolder", "Statue", "Mirror", "Newspaper", 
 23 |                   "WateringCan", "Television", "AlarmClock", "CreditCard"],
 24 |    "picked": {
 25 |        "FloorPlan1": {"test": ["ButterKnife", "GarbageCan", "Kettle", "PepperShaker", "SoapBottle", "Spatula", "Statue"]},
 26 | <<<<<<< HEAD
 27 |        "FloorPlan2": {"test": ["CellPhone", "GarbageCan", "Kettle", "Ladle", "Spatula", "SaltShaker"]},
 28 |        "FloorPlan10": {"test": ["ButterKnife", "CellPhone", "GarbageCan", "SoapBottle", "Statue", "Toaster", "Spatula"]},
 29 |        "FloorPlan28": {"test": ["Blinds", "ButterKnife", "SoapBottle", "SaltShaker"]},
 30 |        "FloorPlan201": {"test": ["RemoteControl", "Painting"]}
 31 |    },
 32 |    "rooms": {
 33 |         "Kitchens":{
 34 |             "train_scenes": [
 35 |                     "FloorPlan1", 
 36 |                     "FloorPlan2", 
 37 |                     "FloorPlan5", 
 38 |                     "FloorPlan8", 
 39 |                     "FloorPlan10", 
 40 |                     "FloorPlan16", 
 41 |                     "FloorPlan29", 
 42 |                     "FloorPlan12", 
 43 |                     "FloorPlan17", 
 44 |                     "FloorPlan30", 
 45 |                     "FloorPlan14", 
 46 |                     "FloorPlan13", 
 47 |                     "FloorPlan22", 
 48 |                     "FloorPlan7", 
 49 |                     "FloorPlan20"],
 50 |             "test_scenes": [
 51 |                     "FloorPlan18", 
 52 |                     "FloorPlan28" 
 53 |             ],
 54 |             "train_objects": ["GarbageCan", "Sink", "Bread", "StoveKnob", "SinkBasin", "StoveBurner", "Fridge", "CounterTop", "Microwave", "LightSwitch", "CoffeeMachine", "Cabinet"],
 55 |             "test_objects": ["Toaster", "Mug", "Potato", "Window", "Bowl"]
 56 | =======
 57 |        "FloorPlan2": {"test": ["CellPhone", "GarbageCan", "Kettle", "Ladle", "Spatula", "SoapBottle", "SaltShaker"]},
 58 |        "FloorPlan10": {"test": ["ButterKnife", "CellPhone", "GarbageCan", "SoapBottle", "Statue", "Toaster", "Spatula"]},
 59 |        "FloorPlan28": {"test": ["Blinds", "ButterKnife", "SoapBottle", "SaltShaker"]}
 60 |    },
 61 |    "rooms": {
 62 |         "Kitchens":{
 63 |             "scenes": ["FloorPlan1",
 64 |                      "FloorPlan2",
 65 |                      "FloorPlan10",
 66 |                      "FloorPlan28"],
 67 |             "train_objects": ["Fridge", "SoapBottle", "CoffeeMachine", "Microwave", "SaltShaker", "Tomato", "Bread", "StoveBurner", "StoveKnob", "Pan", "DishSponge", "Pot", "Bowl", "SinkBasin", "CounterTop", "Drawer", "Sink", "Cabinet", "Mug", "LightSwitch"],
 68 |             "test_objects": ["GarbageCan", "Toaster", "Spatula", "PepperShaker"]
 69 | >>>>>>> 3e16083a7dd07023980fd5cddf4ec521c2796629
 70 |         },
 71 |         "Living Rooms": {
 72 |             "train_scenes": [
 73 |                      "FloorPlan201",
 74 |                      "FloorPlan202",
 75 |                      "FloorPlan206",
 76 |                      "FloorPlan207",
 77 |                      "FloorPlan208",
 78 |                      "FloorPlan209",
 79 |                      "FloorPlan210",
 80 |                      "FloorPlan211",
 81 |                      "FloorPlan212",
 82 |                      "FloorPlan213",
 83 |                      "FloorPlan214",
 84 |                      "FloorPlan216",
 85 |                      "FloorPlan217",
 86 |                      "FloorPlan218",
 87 |                      "FloorPlan219"
 88 |                      ],
 89 |             "test_scenes": [
 90 |                     "FloorPlan204",
 91 |                     "FloorPlan205"
 92 |             ],
 93 |             "train_objects": ["ArmChair", "GarbageCan", "TableTop", "Sofa", "Television", "HousePlant", "Vase", "Painting", "FloorLamp", "Window"],
 94 |             "test_objects" : ["Box", "Pillow", "RemoteControl", "Chair", "GarbageCan", "Laptop"]
 95 |         },
 96 |         "Bedrooms": {
 97 |             "train_scenes": [
 98 |                      "FloorPlan314",
 99 |                      "FloorPlan301",
100 |                      "FloorPlan313",
101 |                      "FloorPlan306",
102 |                      "FloorPlan302",
103 |                      "FloorPlan304",
104 |                      "FloorPlan305",
105 |                      "FloorPlan310",
106 |                      "FloorPlan312",
107 |                      "FloorPlan308"],
108 |             "test_scenes":[
109 |                      "FloorPlan316",
110 |                      "FloorPlan317"
111 |                      ],
112 |             "train_objects": ["Shelf", "AlarmClock", "GarbageCan", "KeyChain", "LightSwitch", "Bed", "Mirror", "Chair", "Desk", "Pen"],
113 | 
114 |             "test_objects" : ["CellPhone", "Pencil", "Book", "Drawer", "Laptop"]
115 |         },
116 |         "Bathrooms": {
117 |             "train_scenes": [
118 |               "FloorPlan410",       
119 |              "FloorPlan417",
120 |              "FloorPlan418",
121 |              "FloorPlan423",
122 |              "FloorPlan427",
123 |              "FloorPlan428",
124 |              "FloorPlan429",
125 |              "FloorPlan430"],
126 |             "test_scenes":[
127 |               "FloorPlan414"
128 |             ],
129 |             "train_objects": ["Candle", "LightSwitch", "Sink", "SinkBasin", "SoapBottle", "Toilet", "TowelHolder", "Mirror", "SoapBar", "GarbageCan"],
130 |             "test_objects" :   ["ToiletPaperHanger",  "Towel"]
131 |         }
132 |     },
133 |     "graph": "",
134 |     "objects": {
135 |          "AlarmClock": 0,
136 |          "Apple": 1,
137 |          "AppleSlice": 2,
138 |          "Bathtub": 3,
139 |          "Bed": 4,
140 |          "Blinds": 5,
141 |          "Book": 6,
142 |          "Bowl": 7,
143 |          "BowlDirty": 8,
144 |          "BowlFilled": 9,
145 |          "Box": 10,
146 |          "Bread": 11,
147 |          "BreadSliced": 12,
148 |          "ButterKnife": 13,
149 |          "Cabinet": 14,
150 |          "Candle": 15,
151 |          "CellPhone": 16,
152 |          "Chair": 17,
153 |          "Cloth": 18,
154 |          "CoffeeMachine": 19,
155 |          "Container": 20,
156 |          "ContainerFull": 21,
157 |          "CounterTop": 22,
158 |          "CreditCard": 23,
159 |          "Cup": 24,
160 |          "Dirt": 25,
161 |          "Egg": 26,
162 |          "EggFried": 27,
163 |          "EggShell": 28,
164 |          "Fork": 29,
165 |          "Fridge": 30,
166 |          "GarbageCan": 31,
167 |          "HousePlant": 32,
168 |          "KeyChain": 33,
169 |          "Knife": 34,
170 |          "Lamp": 35,
171 |          "Laptop": 36,
172 |          "Lettuce": 37,
173 |          "LettuceSliced": 38,
174 |          "LightSwitch": 39,
175 |          "Microwave": 40,
176 |          "Mirror": 41,
177 |          "MiscTableObject": 42,
178 |          "Mug": 43,
179 |          "MugFilled": 44,
180 |          "Newspaper": 45,
181 |          "Omelette": 46,
182 |          "Painting": 47,
183 |          "PaintingHanger": 48,
184 |          "Pan": 49,
185 |          "Pen": 50,
186 |          "Pencil": 51,
187 |          "Plate": 52,
188 |          "Plunger": 53,
189 |          "Pot": 54,
190 |          "Potato": 55,
191 |          "PotatoSliced": 56,
192 |          "RemoteControl": 57,
193 |          "Sandwich": 58,
194 |          "ScrubBrush": 59,
195 |          "ShowerDoor": 60,
196 |          "Sink": 61,
197 |          "SoapBar": 62,
198 |          "SoapBottle": 63,
199 |          "SoapBottleFilled": 64,
200 |          "Spoon": 65,
201 |          "SportsEquipment": 66,
202 |          "SprayBottle": 67,
203 |          "Statue": 68,
204 |          "StoveBurner": 69,
205 |          "StoveKnob": 70,
206 |          "TableTop": 71,
207 |          "Television": 72,
208 |          "TissueBox": 73,
209 |          "TissueBoxEmpty": 74,
210 |          "Toaster": 75,
211 |          "Toilet": 76,
212 |          "ToiletPaper": 77,
213 |          "ToiletPaperHanger": 78,
214 |          "ToiletPaperRoll": 79,
215 |          "Tomato": 80,
216 |          "TomatoSliced": 81,
217 |          "Towel": 82,
218 |          "TowelHolder": 83,
219 |          "VacuumCleaner": 84,
220 |          "Watch": 85,
221 |          "WateringCan": 86 
222 |     },
223 |     "new_objects":{
224 |           "AlarmClock": 0,
225 |           "Apple": 1,
226 |           "ArmChair": 2,
227 |           "BaseballBat": 3,
228 |           "BasketBall": 4,
229 |           "Bathtub": 5,
230 |           "BathtubBasin": 6,
231 |           "Bed": 7,
232 |           "Blinds": 8,
233 |           "Book": 9,
234 |           "Boots": 10,
235 |           "Bottle": 11,
236 |           "Bowl": 12,
237 |           "Box": 13,
238 |           "Bread": 14,
239 |           "ButterKnife": 15,
240 |           "Cabinet": 16,
241 |           "Candle": 17,
242 |           "Cart": 18,
243 |           "CD": 19,
244 |           "CellPhone": 20,
245 |           "Chair": 21,
246 |           "Cloth": 22,
247 |           "CoffeeMachine": 23,
248 |           "CounterTop": 24,
249 |           "CreditCard": 25,
250 |           "Cup": 26,
251 |           "Curtains": 27,
252 |           "Desk": 28,
253 |           "DeskLamp": 29,
254 |           "DishSponge": 30,
255 |           "Drawer": 31,
256 |           "Dresser": 32,
257 |           "Egg": 33,
258 |           "FloorLamp": 34,
259 |           "Footstool": 35,
260 |           "Fork": 36,
261 |           "Fridge": 37,
262 |           "GarbageCan": 38,
263 |           "HandTowel": 39,
264 |           "HandTowelHolder": 40,
265 |           "HousePlant": 41,
266 |           "Kettle": 42,
267 |           "KeyChain": 43,
268 |           "Knife": 44,
269 |           "Ladle": 45,
270 |           "Laptop": 46,
271 |           "LaundryHamper": 47,
272 |           "LaundryHamperLid": 48,
273 |           "Lettuce": 49,
274 |           "LightSwitch": 50,
275 |           "Microwave": 51,
276 |           "Mirror": 52,
277 |           "Mug": 53,
278 |           "Newspaper": 54,
279 |           "NightStand": 55,
280 |           "Ottoman": 56,
281 |           "Painting": 57,
282 |           "Pan": 58,
283 |           "PaperTowel": 59,
284 |           "Pen": 60,
285 |           "Pencil": 61,
286 |           "PepperShaker": 62,
287 |           "Pillow": 63,
288 |           "Plate": 64,
289 |           "Plunger": 65,
290 |           "Poster": 66,
291 |           "Pot": 67,
292 |           "Potato": 68,
293 |           "RemoteControl": 69,
294 |           "Safe": 70,
295 |           "SaltShaker": 71,
296 |           "ScrubBrush": 72,
297 |           "Shelf": 73,
298 |           "ShowerDoor": 74,
299 |           "ShowerGlass": 75,
300 |           "Sink": 76,
301 |           "SinkBasin": 77,
302 |           "SoapBar": 78,
303 |           "SoapBottle": 79,
304 |           "Sofa": 80,
305 |           "Spatula": 81,
306 |           "Spoon": 82,
307 |           "SprayBottle": 83,
308 |           "Statue": 84,
309 |           "StoveBurner": 85,
310 |           "StoveKnob": 86,
311 |           "TableTop": 87,
312 |           "TeddyBear": 88,
313 |           "Television": 89,
314 |           "TennisRacket": 90,
315 |           "TissueBox": 91,
316 |           "Toaster": 92,
317 |           "Toilet": 93,
318 |           "ToiletPaper": 94,
319 |           "ToiletPaperHanger": 95,
320 |           "ToiletPaperRoll": 96,
321 |           "Tomato": 97,
322 |           "Towel": 98,
323 |           "TowelHolder": 99,
324 |           "Vase": 100,
325 |           "Watch": 101,
326 |           "WateringCan": 102,
327 |           "Window": 103,
328 |           "WineBottle": 104
329 |       }
330 | 
331 | }
332 | 


--------------------------------------------------------------------------------
/download_csv.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import matplotlib.pyplot as plt
 3 | import pandas as pd
 4 | import numpy as np 
 5 | import sys
 6 | from collections import defaultdict
 7 | from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
 8 | 
 9 | 
10 | def process(dpath):
11 | 	folders = [f for f in os.listdir(dpath)]
12 | 	# folders = ['Z_16']
13 | 	summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in folders]
14 | 
15 | 	for f, summary in zip(folders, summary_iterators):
16 | 		tag_dict = {}
17 | 		tags = summary.Tags()['scalars']
18 | 		for tag in tags:
19 | 			log_type = "_".join(tag.split('/')[1:])
20 | 			if log_type not in tag_dict:
21 | 				tag_dict[log_type] = {'steps' : [], 'values': [] }
22 | 				
23 | 			steps = [e.step for e in summary.Scalars(tag)]
24 | 			values = [e.value for e in summary.Scalars(tag)]
25 | 
26 | 			tag_dict[log_type]['steps'].append(steps)
27 | 			tag_dict[log_type]['values'].append(values)
28 | 
29 | 		# print(list(tag_dict.keys()))
30 | 		# break
31 | 		for k, v in tag_dict.items():
32 | 			df = pd.DataFrame(columns=['Step', 'Value'])
33 | 			# print(v['steps'], v['values'])
34 | 			# sys.exit()
35 | 			df['Step'] = np.mean(np.vstack(v['steps']), 0)
36 | 			df['Value'] = np.mean(np.vstack(v['values']), 0)
37 | 			df.to_csv(os.path.join(dpath, "{}.csv".format(k)))
38 | 
39 | if __name__ == '__main__':
40 | 	path = "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan2_6"
41 | 	process(path)


--------------------------------------------------------------------------------
/dqn/agent.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import random
  3 | import tensorflow as tf
  4 | 
  5 | from replay_buffer import ReplayBuffer
  6 | from model import QNetwork
  7 | 
  8 | 
  9 | class Agent():
 10 |     """Interacts with and learns from the environment."""
 11 | 
 12 |     def __init__(self, sess, state_size, action_size, seed, arguments):
 13 |         """Initialize an Agent object.
 14 |         
 15 |         Params
 16 |         ======
 17 |             state_size (int): dimension of each state
 18 |             action_size (int): dimension of each action
 19 |             seed (int): random seed
 20 |         """
 21 |         self.sess = sess
 22 |         self.state_size = state_size
 23 |         self.action_size = action_size
 24 |         self.seed = random.seed(seed)
 25 | 
 26 |         self.learning_rate = arguments['lr']
 27 |         self.gamma = arguments['gamma']
 28 |         self.update_every = arguments['update_every']
 29 |         self.tau = arguments['tau']
 30 |         self.history_size = arguments['history_size']
 31 | 
 32 |         self.buffer_size = arguments['buffer_size']
 33 |         self.batch_size = arguments['batch_size']
 34 | 
 35 |         # Q-Network
 36 |         self.qnetwork_local = QNetwork('local_q', state_size, action_size, self.history_size)
 37 |         self.qnetwork_target = QNetwork('target_q', state_size, action_size, self.history_size)
 38 | 
 39 |         copy_ops = []
 40 |         for local_w, target_w in zip(self.qnetwork_local.variables, self.qnetwork_target.variables):
 41 |             copy_op = tf.assign(local_w, local_w * self.tau + (1.0 - self.tau) * target_w)
 42 |             copy_ops.append(copy_op)
 43 | 
 44 |         self.copy_ops = tf.group(*copy_ops, name='copy_op')
 45 | 
 46 |         # Replay memory
 47 |         self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)
 48 |         # Initialize time step (for updating every self.update_every steps)
 49 |         self.t_step = 0
 50 |     
 51 |     def step(self, state, action, reward, next_state, done):
 52 |         # Save experience in replay memory
 53 |         self.memory.add(state, action, reward, next_state, done)
 54 |         
 55 |         # Learn every self.update_every time steps.
 56 |         self.t_step = (self.t_step + 1) % self.update_every
 57 |         if self.t_step == 0:
 58 |             # If enough samples are available in memory, get random subset and learn
 59 |             if len(self.memory) > self.batch_size:
 60 |                 experiences = self.memory.sample()
 61 |                 self.learn(experiences, self.gamma)
 62 | 
 63 |     def act(self, state, eps=0.):
 64 |         """Returns actions for given state as per current policy.
 65 |         
 66 |         Params
 67 |         ======
 68 |             state (array_like): current state
 69 |             eps (float): epsilon, for epsilon-greedy action selection
 70 |         """
 71 |         q_values = self.sess.run(
 72 |                             self.qnetwork_local.q_values, 
 73 |                             feed_dict={
 74 |                                 self.qnetwork_local.inputs: [state]
 75 |                             }).ravel().tolist()
 76 | 
 77 |         # Epsilon-greedy action selection
 78 |         if random.random() > eps:
 79 |             return np.argmax(q_values)
 80 |         else:
 81 |             return random.choice(np.arange(self.action_size))
 82 | 
 83 |     def learn(self, experiences, gamma):
 84 |         """Update value parameters using given batch of experience tuples.
 85 |         Params
 86 |         ======
 87 |             experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
 88 |             gamma (float): discount factor
 89 |         """
 90 |         states, actions, rewards, next_states, dones = experiences
 91 |         # Get max predicted Q values (for next states) from target model
 92 |         q_target_values = self.sess.run(
 93 |                             self.qnetwork_target.q_values, 
 94 |                             feed_dict={
 95 |                                 self.qnetwork_target.inputs: next_states
 96 |                             })
 97 |         Q_targets_next = np.max(q_target_values, axis=1).reshape(-1, 1)
 98 | 
 99 |         # Compute Q targets for current states 
100 |         Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
101 | 
102 |         try:
103 |             self.sess.run(self.qnetwork_local.optimizer, 
104 |                             feed_dict={
105 |                                 self.qnetwork_local.learning_rate: self.learning_rate,
106 |                                 self.qnetwork_local.inputs: states,
107 |                                 self.qnetwork_local.actions: actions,
108 |                                 self.qnetwork_local.target_Q: np.squeeze(Q_targets),
109 |                             })
110 |         except:
111 |             print(states.shape)
112 |             print(actions.shape)
113 |             print(rewards.shape)
114 |             print(next_states.shape)
115 |             print(dones.shape)
116 |             print(q_target_values.shape)
117 |             print(Q_targets_next.shape)
118 |             print(Q_targets.shape)
119 |             import sys
120 |             sys.exit()
121 | 
122 |         # ------------------- update target network ------------------- #
123 |         self.soft_update()                     
124 | 
125 |     def soft_update(self):
126 |         """Soft update model parameters.
127 |         θ_target = τ*θ_local + (1 - τ)*θ_target
128 |         Params
129 |         ======
130 |             local_model (PyTorch model): weights will be copied from
131 |             target_model (PyTorch model): weights will be copied to
132 |             tau (float): interpolation parameter 
133 |         """
134 | 
135 |         self.sess.run(self.copy_ops)
136 | 
137 | 


--------------------------------------------------------------------------------
/dqn/env/ai2thor_env.py:
--------------------------------------------------------------------------------
  1 | import ai2thor.controller
  2 | import numpy as np
  3 | import gym
  4 | import cv2
  5 | import h5py
  6 | import os
  7 | import sys
  8 | import random
  9 | 
 10 | from copy import deepcopy
 11 | from gym import error, spaces
 12 | from gym.utils import seeding
 13 | 
 14 | class AI2ThorDumpEnv():
 15 |     """
 16 |     Wrapper base class
 17 |     """
 18 |     def __init__(self, scene, target, config, arguments=dict(), seed=None):
 19 |         """
 20 |         :param seed: (int)   Random seed
 21 |         :param config: (str)   Dictionary file storing cofigurations
 22 |         :param: scene: (list)  Scene to train on
 23 |         :param: objects: (list)  Target object to train on
 24 |         """
 25 |         
 26 |         self.config = config
 27 |         self.scene = scene
 28 |         self.target = target
 29 |         self.history_size = arguments.get('history_size')
 30 |         self.action_size = arguments.get('action_size')
 31 | 
 32 |         assert self.action_size <= self.graph.shape[1], "The number of actions exceeds the limit of environment."
 33 | 
 34 |         self.h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], self.scene)), 'r')
 35 | 
 36 |         all_visible_objects = set(",".join([o for o in list(self.h5_file['visible_objects']) if o != '']).split(','))
 37 |         
 38 |         assert self.target in all_visible_objects, "Target {} is unreachable in {}!".format(self.target, self.scene)
 39 | 
 40 |         self.states = self.h5_file['locations'][()]
 41 |         self.graph = self.h5_file['graph'][()]
 42 |         self.features = self.h5_file['resnet_features'][()]
 43 |         self.visible_objects = self.h5_file['visible_objects'][()]
 44 | 
 45 |         if "shortest" in self.h5_file.keys():
 46 |             self.shortest = self.h5_file['shortest'][()]
 47 | 
 48 |         if "sharing" in self.h5_file.keys():
 49 |             self.sharing = self.h5_file['sharing'][()].tolist()
 50 | 
 51 |         self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")]
 52 |         
 53 |         self.action_space = self.action_size 
 54 |         self.cv_action_onehot = np.identity(self.action_space)
 55 |         
 56 |         # Randomness settings
 57 |         self.np_random = None
 58 |         if seed:
 59 |             self.seed(seed)
 60 |         
 61 |         self.history_states = np.zeros((self.history_size, self.features.shape[1]))
 62 | 
 63 |     def step(self, action):
 64 |         '''
 65 |         0: move ahead
 66 |         1: move back
 67 |         2: rotate right
 68 |         3: rotate left
 69 |         4: look down
 70 |         5: look up
 71 |         '''
 72 | 
 73 |         if action >= self.action_space:
 74 |             raise error.InvalidAction('Action must be an integer between '
 75 |                                       '0 and {}!'.format(self.action_space - 1))
 76 |         k = self.current_state_id
 77 |         if self.graph[k][action] != -1:
 78 |             self.current_state_id = int(self.graph[k][action])
 79 |             if self.current_state_id in self.target_ids:
 80 |                 self.terminal = True
 81 |                 collided = False
 82 |             else:
 83 |                 self.terminal = False
 84 |                 collided = False
 85 |         else:
 86 |             self.terminal = False
 87 |             collided = True
 88 | 
 89 |         reward, done = self.transition_reward(collided)
 90 | 
 91 |         self.update_states()
 92 | 
 93 |         return self.history_states, reward, done
 94 | 
 95 |     def transition_reward(self, collided):
 96 |         reward = self.config['default_reward']
 97 |         done = 0
 98 |         if self.terminal:
 99 |             reward = self.config['success_reward']
100 |             done = 1
101 |         elif self.config['anti-collision'] and collided:
102 |             reward = self.config['collide_reward']
103 | 
104 |         return reward, done
105 | 
106 |     def reset(self):
107 |         # reset parameters
108 |         if self.action_size == self.action_space:
109 |             self.current_state_id = random.randrange(self.states.shape[0])
110 |         else:
111 |             while 1:
112 |                 k = random.randrange(self.states.shape[0])
113 |                 if int(self.states[k][-1]) == 0:
114 |                     break
115 | 
116 |             self.current_state_id = k
117 | 
118 |         self.update_states()
119 |         self.terminal = False
120 | 
121 |         return self.history_states, self.target
122 | 
123 |     def update_states(self):        
124 |         f = self.features[self.current_state_id]
125 |         self.history_states = np.append(self.history_states[1:, :], np.transpose(f, (1,0)), 0)
126 | 
127 |     def state(self, state_id):    
128 |         return self.features[state_id]
129 | 
130 |     def seed(self, seed=None):
131 |         self.np_random, seed1 = seeding.np_random(seed)
132 |         # Derive a random seed. This gets passed as a uint, but gets
133 |         # checked as an int elsewhere, so we need to keep it below
134 |         # 2**31.
135 |         return seed1
136 | 
137 | if __name__ == '__main__':
138 |     AI2ThorEnv()
139 | 


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "my-model"
2 | all_model_checkpoint_paths: "my-model"
3 | 


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689861.asr02.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689861.asr02.local


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.index


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-57-41-547816_FloorPlan2_CoffeeMachine/my-model.meta


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "my-model"
2 | all_model_checkpoint_paths: "my-model"
3 | 


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689885.asr02.local:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/events.out.tfevents.1551689885.asr02.local


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.data-00000-of-00001


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.index


--------------------------------------------------------------------------------
/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/dqn/env/training-history/2019-03-04-15-58-04-711405_FloorPlan2_CoffeeMachine/my-model.meta


--------------------------------------------------------------------------------
/dqn/main.py:
--------------------------------------------------------------------------------
  1 | from env.ai2thor_env import AI2ThorDumpEnv
  2 | from agent import Agent
  3 | from utils import LinearSchedule
  4 | from datetime import datetime
  5 | 
  6 | import tensorflow as tf
  7 | import numpy as np
  8 | import os
  9 | import random
 10 | import time
 11 | import json
 12 | import argparse
 13 | 
 14 | ALL_ROOMS = {
 15 |     0: "Kitchens",
 16 |     1: "Living Rooms",
 17 |     2: "Bedrooms",
 18 |     3: "Bathrooms"
 19 | }
 20 | 
 21 | def read_config(config_path):
 22 |     if os.path.isfile(config_path):
 23 |         with open(config_path) as f:
 24 |             config = json.load(f)
 25 |     return config
 26 | 
 27 | def main(config, arguments):
 28 |     room = config['rooms'][ALL_ROOMS[arguments['room_id']]]
 29 |     all_scenes = room['scenes']
 30 |     train_objects = room['train_objects']
 31 |     test_objects = room['test_objects']
 32 | 
 33 |     training_scene = all_scenes[arguments['scene_id']]
 34 | 
 35 |     # h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], training_scene)), 'r')
 36 |     # all_visible_objects = set(",".join([o for o in list(h5_file['visible_objects']) if o != '']).split(','))
 37 |     # print(all_visible_objects)
 38 |     # trainable_objects = list(set(train_objects).intersection(all_visible_objects))
 39 |     # h5_file.close()
 40 |     # print(trainable_objects)
 41 | 
 42 |     trainable_objects = {
 43 |         0: ['Knife', 'Sink', 'CoffeeMachine', 'StoveKnob', 'StoveBurner', 'Cabinet', 'Fridge', 'TableTop'],
 44 |         1: ['CoffeeMachine', 'StoveBurner', 'Sink', 'GarbageCan', 'TableTop', 'Fridge',  'Mug', 'StoveKnob', 'Microwave', 'Cabinet', 'Chair'],
 45 |         27: ['Cabinet', 'TableTop', 'StoveKnob', 'Fridge', 'Sink', 'StoveBurner', 'CoffeeMachine']
 46 |     }
 47 | 
 48 |     training_object = trainable_objects[arguments['scene_id']][arguments['target_id']]
 49 | 
 50 |     env = AI2ThorDumpEnv(training_scene, training_object, config, arguments)
 51 | 
 52 |     tf.reset_default_graph()
 53 |     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=arguments['gpu_fraction'])
 54 |     sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options))
 55 | 
 56 |     agent = Agent(sess, env.features.shape[1], env.action_space, int(time.time() * 100) % 100, arguments)
 57 |     sess.run(tf.global_variables_initializer())
 58 | 
 59 | 
 60 |     saver = tf.train.Saver()
 61 |     timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), training_scene, training_object)
 62 |     log_folder = os.path.join(arguments.get('logging'), timer)
 63 |     writer = tf.summary.FileWriter(log_folder)
 64 | 
 65 |     reward_log = tf.placeholder(tf.float32)
 66 |     redundant_log = tf.placeholder(tf.float32)
 67 | 
 68 |     test_name =  training_scene
 69 |     tf.summary.scalar(test_name + "/" + training_object + "/rewards", reward_log)
 70 |     tf.summary.scalar(test_name + "/" + training_object + "/redundants", redundant_log)
 71 | 
 72 |     write_op = tf.summary.merge_all()
 73 | 
 74 |     num_epochs = arguments['num_epochs']
 75 |     num_steps = arguments['num_iters']
 76 | 
 77 |     epsilon_schedule = LinearSchedule(num_epochs, final_p=0.02)
 78 |     ep_rewards = []
 79 |     start_time = time.time()
 80 |     for ep in range(num_epochs):
 81 |         state, target = env.reset()
 82 |         start = env.current_state_id
 83 |         rewards = 0
 84 |         redundant = 0
 85 | 
 86 |         for step in range(num_steps):
 87 |             action = agent.act(state, epsilon_schedule.value(ep))
 88 |             next_state, reward, done = env.step(action)
 89 |             agent.step(state, env.cv_action_onehot[action], reward, next_state, done)
 90 |             state = next_state
 91 | 
 92 |             rewards += reward
 93 |             if done: 
 94 |                 break
 95 | 
 96 |         if not done:
 97 |             end = env.current_state_id
 98 |             try:
 99 |                 redundants = []
100 |                 for target_id in env.target_ids:
101 |                     redundants.append(num_steps + env.shortest[end, target_id] - env.shortest[start, target_id])
102 | 
103 |                 redundant = min(redundants)
104 |             except AttributeError:
105 |                 pass
106 | 
107 |         ep_rewards.append(rewards)
108 |         print("Ep {}/{}, elapsed time: {:.3f} | rewards: {:.3f}| mean rewards: {:.3f}".format(
109 |                 ep+1, num_epochs, (time.time() - start_time)/3600, 
110 |                 rewards, np.mean(ep_rewards)), end='\r', flush=True)
111 |         if ep % 100 == 0:
112 |             print("Ep {}/{}, elapsed time: {:.3f} | rewards: {:.3f}| mean rewards: {:.3f}\n".format(
113 |                 ep+1, num_epochs, (time.time() - start_time)/3600, 
114 |                 rewards, np.mean(ep_rewards)))
115 | 
116 |         summary = sess.run(write_op, feed_dict = {
117 |             reward_log: rewards,
118 |             redundant_log: redundant,   
119 |             })
120 | 
121 |         writer.add_summary(summary, ep + 1)
122 |         writer.flush()
123 |     
124 |     saver.save(sess, log_folder + "/my-model")
125 |     sess.close()
126 | 
127 | 
128 | if __name__ == '__main__':
129 |     parser = argparse.ArgumentParser(description='Arguments')
130 |     parser.add_argument('--room_id', type=int, default=0)
131 |     parser.add_argument('--scene_id', nargs='?', type=int, default=0)
132 |     parser.add_argument('--target_id', nargs='?', type=int, default=0)
133 |     parser.add_argument('--gpu_fraction', nargs='?', type=float, default=0.15,
134 |                         help='GPU memory usage fraction')
135 |     parser.add_argument('--history_size', type=int, default=1,
136 |                         help='whether to stack frames to make input')
137 |     parser.add_argument('--num_epochs', nargs='?', type=int, default=10000,
138 |                         help='Number of epochs to train')
139 |     parser.add_argument('--num_iters', nargs='?', type=int, default=100,
140 |                         help='Number of steps to be sampled in each episode')
141 |     parser.add_argument('--buffer_size', nargs='?', type=int, default=100000,
142 |                         help='replay buffer size')
143 |     parser.add_argument('--batch_size', nargs='?', type=int, default=64,
144 |                         help='minibatch size')
145 |     parser.add_argument('--gamma', nargs='?', type=float, default=0.99,
146 |                         help='discount factor')
147 |     parser.add_argument('--tau', nargs='?', type=float, default=1e-3,
148 |                         help='for soft update of target parameters')
149 |     parser.add_argument('--lr', nargs='?', type=float, default=5e-4,
150 |                         help='learning rate')
151 |     parser.add_argument('--update_every', nargs='?', type=int, default=4,
152 |                         help='how often to update the network')
153 |     parser.add_argument('--logging', type=str, default="training-history/",
154 |                         help='Logging folder')
155 |     parser.add_argument('--config_file', type=str, default="config.json")
156 | 
157 | 
158 |     args = parser.parse_args()
159 | 
160 |     # print(vars(args))
161 |     config = read_config(args.config_file)
162 |     main(config, vars(args))
163 | 


--------------------------------------------------------------------------------
/dqn/model.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | HIDDEN_SIZE = 128
 4 | 
 5 | def _fc_weight_variable(shape, name='W_fc'):
 6 |     input_channels = shape[0]
 7 |     d = 1.0 / np.sqrt(input_channels)
 8 |     initial = tf.random_uniform(shape, minval=-d, maxval=d)
 9 |     return tf.get_variable(name=name, dtype = tf.float32, initializer=initial)
10 | 
11 | def _fc_bias_variable(shape, input_channels, name='b_fc'):
12 |     d = 1.0 / np.sqrt(input_channels)
13 |     initial = tf.random_uniform(shape, minval=-d, maxval=d)
14 |     return tf.get_variable(name=name, dtype=tf.float32, initializer=initial)
15 | 
16 | 
17 | class QNetwork():
18 |     def __init__(self, name, state_size, action_size, history_size=1, dropout_keep_prob=-1):
19 |         self.state_size = state_size
20 |         self.action_size = action_size
21 | 
22 |         with tf.variable_scope(name):
23 |             self.inputs = tf.placeholder(tf.float32, [None, history_size, self.state_size])
24 | 
25 |             self.inputs_flat = tf.reshape(self.inputs, [-1, self.state_size * history_size])
26 |             self.actions = tf.placeholder(tf.float32, [None, self.action_size])
27 |             self.target_Q = tf.placeholder(tf.float32, [None])
28 |             self.learning_rate = tf.placeholder(tf.float32, [])
29 | 
30 |             self.W_fc1 = _fc_weight_variable([self.state_size * history_size, HIDDEN_SIZE], name="W_fc1")
31 |             self.b_fc1 = _fc_bias_variable([HIDDEN_SIZE], self.state_size, name="b_fc1")
32 |             self.fc1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(self.inputs_flat, self.W_fc1), self.b_fc1))
33 | 
34 |             if dropout_keep_prob != -1:
35 |                 self.fc1 = tf.nn.dropout(self.fc1, dropout_keep_prob)
36 | 
37 |             self.W_fc2 = _fc_weight_variable([HIDDEN_SIZE, self.action_size], name="W_fc2")
38 |             self.b_fc2 = _fc_bias_variable([self.action_size], HIDDEN_SIZE, name="b_fc2")
39 | 
40 |             self.q_values = tf.nn.bias_add(tf.matmul(self.fc1, self.W_fc2), self.b_fc2)
41 |             self.Q_expected = tf.reduce_sum(tf.multiply(self.q_values, self.actions))
42 | 
43 | 
44 |             self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q_expected))
45 |             self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
46 | 
47 |             self.variables = [self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2]
48 | 


--------------------------------------------------------------------------------
/dqn/replay_buffer.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | from collections import namedtuple, deque
 4 | 
 5 | class ReplayBuffer:
 6 |     """Fixed-size buffer to store experience tuples."""
 7 | 
 8 |     def __init__(self, action_size, buffer_size, batch_size, seed):
 9 |         """Initialize a ReplayBuffer object.
10 |         Params
11 |         ======
12 |             action_size (int): dimension of each action
13 |             buffer_size (int): maximum size of buffer
14 |             batch_size (int): size of each training batch
15 |             seed (int): random seed
16 |         """
17 |         self.action_size = action_size
18 |         self.memory = deque(maxlen=buffer_size)  
19 |         self.batch_size = batch_size
20 |         self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
21 |         self.seed = random.seed(seed)
22 |     
23 |     def add(self, state, action, reward, next_state, done):
24 |         """Add a new experience to memory."""
25 |         e = self.experience(state, action, reward, next_state, done)
26 |         self.memory.append(e)
27 |     
28 |     def sample(self):
29 |         """Randomly sample a batch of experiences from memory."""
30 |         experiences = random.sample(self.memory, k=self.batch_size)
31 | 
32 |         states = np.vstack([[e.state] for e in experiences if e is not None])
33 |         actions = np.vstack([e.action for e in experiences if e is not None])
34 |         rewards = np.vstack([e.reward for e in experiences if e is not None])
35 |         next_states = np.vstack([[e.next_state] for e in experiences if e is not None])
36 |         dones = np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)
37 |   
38 |         return (states, actions, rewards, next_states, dones)
39 | 
40 |     def __len__(self):
41 |         """Return the current size of internal memory."""
42 |         return len(self.memory)


--------------------------------------------------------------------------------
/dqn/utils.py:
--------------------------------------------------------------------------------
 1 | class LearningRateDecay(object):
 2 |     def __init__(self, v, nvalues, lr_decay_method):
 3 |         self.n = 0.
 4 |         self.v = v
 5 |         self.nvalues = nvalues
 6 | 
 7 |         def constant(p):
 8 |             return 1
 9 | 
10 |         def linear(p):
11 |             return 1 - p
12 | 
13 |         lr_decay_methods = {
14 |             'linear': linear,
15 |             'constant': constant
16 |         }
17 | 
18 |         self.decay = lr_decay_methods[lr_decay_method]
19 | 
20 |     def value(self):
21 |         current_value = self.v * self.decay(self.n / self.nvalues)
22 |         self.n += 1.
23 |         return current_value
24 | 
25 |     def get_value_for_steps(self, steps):
26 |         return self.v * self.decay(steps / self.nvalues)
27 | 
28 | class LinearSchedule(object):
29 |     def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
30 |         """Linear interpolation between initial_p and final_p over
31 |         schedule_timesteps. After this many timesteps pass final_p is
32 |         returned.
33 |         Parameters
34 |         ----------
35 |         schedule_timesteps: int
36 |             Number of timesteps for which to linearly anneal initial_p
37 |             to final_p
38 |         initial_p: float
39 |             initial output value
40 |         final_p: float
41 |             final output value
42 |         """
43 |         self.schedule_timesteps = schedule_timesteps
44 |         self.final_p = final_p
45 |         self.initial_p = initial_p
46 | 
47 |     def value(self, t):
48 |         """See Schedule.value"""
49 |         fraction = min(float(t) / self.schedule_timesteps, 1.0)
50 |         return self.initial_p + fraction * (self.final_p - self.initial_p)


--------------------------------------------------------------------------------
/draft.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import h5py
 4 | import operator
 5 | 
 6 | config = json.load(open('config.json'))
 7 | 
 8 | scene_type = ['Kitchens', 'Living Rooms', 'Bedrooms', 'Bathrooms']
 9 | visible = []
10 | cnt = {}
11 | st = 0
12 | 
13 | for s in ['train_scenes', 'test_scenes']:
14 |     cnt[s] = {}
15 |     for f in config['rooms'][scene_type[st]][s]:
16 |         f = h5py.File("dumped/{}.hdf5".format(f), 'r')
17 |         visible.append(f['all_visible_objects'][()].tolist())
18 |         for o in f['all_visible_objects'][()].tolist():
19 |             if o not in cnt[s]:
20 |                 cnt[s][o] = 1
21 |             else:
22 |                 cnt[s][o] +=1
23 |     if s == 'train_scenes':
24 |         cnt[s] = [o for o, c in cnt[s].items() if c > 7]
25 |     else:
26 |         cnt[s] = [o for o, c in cnt[s].items()]
27 | 
28 | print(cnt)
29 | 
30 | print("Joint: ", set(cnt['train_scenes']).intersection(set(cnt['test_scenes'])))


--------------------------------------------------------------------------------
/embedding_fasttext300.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/embedding_fasttext300.pkl


--------------------------------------------------------------------------------
/embedding_onehot.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/embedding_onehot.pkl


--------------------------------------------------------------------------------
/env/ai2thor_env.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import gym
  3 | import cv2
  4 | import h5py
  5 | import os
  6 | import sys
  7 | import random
  8 | 
  9 | class AI2ThorDumpEnv():
 10 |     """
 11 |     Wrapper base class
 12 |     """
 13 |     def __init__(self, scene, target, config, arguments=dict(), seed=None):
 14 |         """
 15 |         :param seed: (int)   Random seed
 16 |         :param config: (str)   Dictionary file storing cofigurations
 17 |         :param: scene: (list)  Scene to train
 18 |         :param: target: (list)  Target object to train
 19 |         """
 20 |         if seed is not None:
 21 |             np.random.seed(seed)
 22 | 
 23 |         self.config = config
 24 |         self.arguments = arguments
 25 |         self.scene = scene
 26 |         self.target = target
 27 |         self.history_size = arguments.get('history_size')
 28 |         self.action_size = arguments.get('action_size')
 29 | 
 30 |         self.h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], self.scene)), 'r')
 31 | 
 32 |         self.states = self.h5_file['locations'][()]
 33 |         self.graph = self.h5_file['graph'][()]
 34 |         self.scores = self.h5_file['resnet_scores'][()] if not arguments['yolo_gcn'] else self.h5_file['dump_features'][()][:, :-4].astype(bool).astype(int)
 35 |         self.all_visible_objects = self.h5_file['all_visible_objects'][()].tolist()
 36 |         self.visible_objects = self.h5_file['visible_objects'][()]
 37 |         self.observations = self.h5_file['observations'][()]
 38 | 
 39 |         assert self.target in self.all_visible_objects, "Target {} is unreachable in {}!".format(self.target, self.scene)
 40 | 
 41 |         self.resnet_features = self.h5_file['resnet_features'][()]
 42 |         self.dump_features = self.h5_file['dump_features'][()]
 43 | 
 44 |         if arguments['onehot']:
 45 |             self.features = self.dump_features
 46 |         else:
 47 |             self.features = self.resnet_features
 48 | 
 49 |         assert self.action_size <= self.graph.shape[1], "The number of actions exceeds the limit of environment."
 50 | 
 51 |         if "shortest" in self.h5_file.keys():
 52 |             self.shortest = self.h5_file['shortest'][()]
 53 | 
 54 |         if self.arguments['hard']:
 55 |             # agent has to reach the correct position and has right rotation
 56 |             self.offset = 3
 57 |         else:
 58 |             # agent only has to reach the correct position
 59 |             self.offset = 2
 60 | 
 61 |         self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")]
 62 |         self.target_locs = set([tuple(self.states[idx][:self.offset]) for idx in self.target_ids])        
 63 | 
 64 |         self.action_space = self.action_size 
 65 |         self.cv_action_onehot = np.identity(self.action_space)
 66 |         
 67 |         self.history_states = np.zeros((self.history_size, self.features.shape[1]))
 68 |         self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)]
 69 | 
 70 |     def step(self, action):
 71 |         '''
 72 |         0: move ahead
 73 |         1: move back
 74 |         2: rotate right
 75 |         3: rotate left
 76 |         4: look down
 77 |         5: look up
 78 |         '''
 79 | 
 80 |         if action >= self.action_space:
 81 |             raise error.InvalidAction('Action must be an integer between '
 82 |                                       '0 and {}!'.format(self.action_space - 1))
 83 |         k = self.current_state_id
 84 |         if self.graph[k][action] != -1:
 85 |             if action == 2 or action == 3:
 86 |                 for _ in range(int(self.arguments['angle'] / 22.5)):
 87 |                     self.current_state_id = int(self.graph[k][action])
 88 |             else:                
 89 |                 self.current_state_id = int(self.graph[k][action])
 90 |                 
 91 |             if tuple(self.states[self.current_state_id][:self.offset]) in self.target_locs:
 92 |                 self.terminal = True
 93 |                 self.collided = False
 94 |             else:
 95 |                 self.terminal = False
 96 |                 self.collided = False
 97 |         else:
 98 |             self.terminal = False
 99 |             self.collided = True
100 | 
101 |         reward, done = self.transition_reward()
102 | 
103 |         self.update_states()
104 | 
105 |         if self.arguments['train_cnn']:
106 |             return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], reward, done
107 |         else:
108 |             return self.history_states, self.scores[self.current_state_id], reward, done
109 | 
110 |     def transition_reward(self):
111 |         reward = self.config['default_reward']
112 |         done = 0
113 |         if self.terminal:
114 |             reward = self.config['success_reward']
115 |             done = 1
116 |         elif self.arguments['anti_col'] and self.collided:
117 |             reward = self.config['collide_reward']
118 | 
119 |         return reward, done
120 | 
121 |     def reset(self):
122 |         # reset parameters
123 |         k = random.randrange(self.states.shape[0])
124 | 
125 |         while self.states[k][2] % self.arguments['angle'] != 0.0:
126 |             k = random.randrange(self.states.shape[0])
127 | 
128 |         self.current_state_id = k
129 | 
130 |         self.update_states(reset=True)
131 |         self.terminal = False
132 |         self.collided = False
133 | 
134 |         if self.arguments['train_cnn']:
135 |             return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], self.target
136 |         else:
137 |             return self.history_states, self.scores[self.current_state_id], self.target
138 | 
139 |     def update_states(self, reset=False): 
140 |         if reset:
141 |             self.history_states = np.zeros((self.history_size, self.features.shape[1]))
142 |             self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)]
143 | 
144 |         f = self.features[self.current_state_id]
145 |             
146 |         self.history_states = np.append(self.history_states[1:, :], f[np.newaxis, :], 0)
147 | 
148 |         self.observations_stack.append(self.observation())
149 |         self.observations_stack = self.observations_stack[1:]
150 | 
151 |     def state(self):    
152 |         return self.features[self.current_state_id]
153 | 
154 |     def observation(self):
155 |         ob = self.observations[self.current_state_id]        
156 |         resized_ob = cv2.resize(ob, (128, 128))
157 |         return np.transpose(resized_ob, (2, 0, 1))
158 | 
159 | class MultiSceneEnv():
160 |     """
161 |     Wrapper base class
162 |     """
163 |     def __init__(self, scene, config, arguments=dict(), seed=None):
164 |         """
165 |         :param seed: (int)   Random seed
166 |         :param config: (str)   Dictionary file storing cofigurations
167 |         :param: scene: (list)  Scene to train
168 |         :param: objects: (list)  Target objects to train
169 |         """
170 | 
171 |         if seed is not None:
172 |             np.random.seed(seed)
173 |             
174 |         self.config = config
175 |         self.arguments = arguments
176 |         self.scene = scene
177 | 
178 |         self.history_size = arguments.get('history_size')
179 |         self.action_size = arguments.get('action_size')
180 | 
181 |         scene_id = int(scene.split("FloorPlan")[1])
182 |         if scene_id > 0 and scene_id < 31:
183 |             room_type = "Kitchens"
184 |         elif scene_id > 200 and scene_id < 231:
185 |             room_type = 'Living Rooms'
186 |         elif scene_id > 300 and scene_id < 331:
187 |             room_type = 'Bedrooms'
188 |         elif scene_id > 400 and scene_id < 431:
189 |             room_type = 'Bathrooms'
190 |         else:
191 |             raise KeyError
192 | 
193 |         if arguments['test'] == 1:
194 |             self.targets = config["rooms"][room_type]['train_objects'] + config["rooms"][room_type]['test_objects']
195 |         else:
196 |             self.targets = config["rooms"][room_type]['train_objects']
197 | 
198 |         self.h5_file = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], self.scene)), 'r')
199 | 
200 |         self.states = self.h5_file['locations'][()]
201 |         self.graph = self.h5_file['graph'][()]
202 |         self.scores = self.h5_file['resnet_scores'][()] if not arguments['yolo_gcn'] else self.h5_file['dump_features'][()][:, :-4].astype(bool).astype(int)
203 |         self.all_visible_objects = self.h5_file['all_visible_objects'][()].tolist()
204 |         self.visible_objects = self.h5_file['visible_objects'][()]
205 |         self.observations = self.h5_file['observations'][()]
206 | 
207 |         self.resnet_features = self.h5_file['resnet_features'][()]
208 |         self.dump_features = self.h5_file['dump_features'][()]
209 | 
210 |         
211 |         if arguments['onehot']:
212 |             self.features = self.dump_features
213 |         else:
214 |             self.features = self.resnet_features
215 | 
216 |         assert self.action_size <= self.graph.shape[1], "The number of actions exceeds the limit of environment."
217 | 
218 |         if "shortest" in self.h5_file.keys():
219 |             self.shortest = self.h5_file['shortest'][()]
220 | 
221 |         if self.arguments['hard']:
222 |             # agent has to reach the correct position and has right rotation
223 |             self.offset = 3
224 |         else:
225 |             # agent only has to reach the correct position
226 |             self.offset = 2
227 | 
228 |         self.target = np.random.choice(self.targets)
229 |         self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")]
230 |         self.target_locs = set([tuple(self.states[idx][:self.offset]) for idx in self.target_ids])        
231 |         
232 |         self.action_space = self.action_size 
233 |         self.cv_action_onehot = np.identity(self.action_space)
234 |         
235 |         self.history_states = np.zeros((self.history_size, self.features.shape[1]))
236 |         self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)]
237 | 
238 | 
239 |     def step(self, action):
240 |         '''
241 |         0: move ahead
242 |         1: move back
243 |         2: rotate right
244 |         3: rotate left
245 |         4: look down
246 |         5: look up
247 |         '''
248 | 
249 |         if action >= self.action_space:
250 |             raise error.InvalidAction('Action must be an integer between '
251 |                                       '0 and {}!'.format(self.action_space - 1))
252 |         k = self.current_state_id
253 |         if self.graph[k][action] != -1:
254 |             if action == 2 or action == 3:
255 |                 for _ in range(int(self.arguments['angle'] / 22.5)):
256 |                     self.current_state_id = int(self.graph[k][action])
257 |             else:                
258 |                 self.current_state_id = int(self.graph[k][action])
259 | 
260 |             if tuple(self.states[self.current_state_id][:self.offset]) in self.target_locs:
261 |                 self.terminal = True
262 |                 self.collided = False
263 |             else:
264 |                 self.terminal = False
265 |                 self.collided = False
266 |         else:
267 |             self.terminal = False
268 |             self.collided = True
269 | 
270 |         reward, done = self.transition_reward()
271 | 
272 |         self.update_states()
273 | 
274 |         if self.arguments['train_cnn']:
275 |             return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], reward, done
276 |         else:
277 |             return self.history_states, self.scores[self.current_state_id], reward, done
278 | 
279 |     def transition_reward(self):
280 |         reward = self.config['default_reward']
281 |         done = 0
282 |         if self.terminal:
283 |             reward = self.config['success_reward']
284 |             done = 1
285 |         elif self.arguments['anti_col'] and self.collided:
286 |             reward = self.config['collide_reward']
287 | 
288 |         return reward, done
289 | 
290 |     def reset(self):
291 |         self.target = np.random.choice(self.targets)
292 |         self.target_ids = [idx for idx in range(len(self.states)) if self.target in self.visible_objects[idx].split(",")]
293 |         self.target_locs = set([tuple(self.states[idx][:self.offset]) for idx in self.target_ids])        
294 | 
295 |         k = random.randrange(self.states.shape[0])
296 | 
297 |         while self.states[k][2] % self.arguments['angle'] != 0.0:
298 |             k = random.randrange(self.states.shape[0])
299 | 
300 |         # reset parameters
301 |         self.current_state_id = k
302 |         
303 |         self.update_states(reset=True)
304 |         self.terminal = False
305 |         self.collided = False
306 | 
307 |         if self.arguments['train_cnn']:
308 |             return np.asarray(self.observations_stack, dtype=np.float32), self.scores[self.current_state_id], self.target
309 |         else:
310 |             return self.history_states, self.scores[self.current_state_id], self.target
311 | 
312 | 
313 |     def update_states(self, reset=False): 
314 |         if reset:
315 |             self.history_states = np.zeros((self.history_size, self.features.shape[1]))
316 |             self.observations_stack = [np.zeros((3, 128, 128)) for _ in range(self.history_size)]
317 | 
318 |         f = self.features[self.current_state_id]
319 |             
320 |         self.history_states = np.append(self.history_states[1:, :], f[np.newaxis, :], 0)
321 | 
322 |         self.observations_stack.append(self.observation())
323 |         self.observations_stack = self.observations_stack[1:]
324 | 
325 |     def state(self):    
326 |         return self.features[self.current_state_id]
327 | 
328 |     def observation(self):
329 |         ob = self.observations[self.current_state_id]        
330 |         resized_ob = cv2.resize(ob, (128, 128))
331 |         return np.transpose(resized_ob, (2, 0, 1))
332 | 
333 | 
334 | if __name__ == '__main__':
335 |     AI2ThorEnv()
336 | 


--------------------------------------------------------------------------------
/images/1_GCN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_GCN.png


--------------------------------------------------------------------------------
/images/1_easy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy.png


--------------------------------------------------------------------------------
/images/1_easy_noGAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy_noGAE.png


--------------------------------------------------------------------------------
/images/1_easy_noGAE_normalizeReward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy_noGAE_normalizeReward.png


--------------------------------------------------------------------------------
/images/1_easy_noGAE_onehot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_easy_noGAE_onehot.png


--------------------------------------------------------------------------------
/images/1_embed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_embed.png


--------------------------------------------------------------------------------
/images/1_increaseLearningRate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_increaseLearningRate.png


--------------------------------------------------------------------------------
/images/1_increase_entropy_penalty.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_increase_entropy_penalty.png


--------------------------------------------------------------------------------
/images/1_noGAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/1_noGAE.png


--------------------------------------------------------------------------------
/images/28_easy_noGAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/28_easy_noGAE.png


--------------------------------------------------------------------------------
/images/All FloorPlan1_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan1_4.png


--------------------------------------------------------------------------------
/images/All FloorPlan1_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan1_6.png


--------------------------------------------------------------------------------
/images/All FloorPlan28_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan28_4.png


--------------------------------------------------------------------------------
/images/All FloorPlan28_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan28_6.png


--------------------------------------------------------------------------------
/images/All FloorPlan2_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan2_4.png


--------------------------------------------------------------------------------
/images/All FloorPlan2_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/All FloorPlan2_6.png


--------------------------------------------------------------------------------
/images/Compare FloorPlan1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/Compare FloorPlan1.png


--------------------------------------------------------------------------------
/images/Compare FloorPlan2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/Compare FloorPlan2.png


--------------------------------------------------------------------------------
/images/Compare FloorPlan28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/Compare FloorPlan28.png


--------------------------------------------------------------------------------
/images/FloorPlan1_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan1_4.png


--------------------------------------------------------------------------------
/images/FloorPlan1_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan1_6.png


--------------------------------------------------------------------------------
/images/FloorPlan28_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan28_4.png


--------------------------------------------------------------------------------
/images/FloorPlan28_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan28_6.png


--------------------------------------------------------------------------------
/images/FloorPlan2_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/FloorPlan2_4.png


--------------------------------------------------------------------------------
/images/sample_AI2THOR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/images/sample_AI2THOR.png


--------------------------------------------------------------------------------
/keyboard_agent.py:
--------------------------------------------------------------------------------
  1 | import ai2thor.controller
  2 | import sys
  3 | import numpy as np
  4 | import h5py
  5 | import click
  6 | import json
  7 | import pyglet
  8 | 
  9 | from PIL import Image
 10 | 
 11 | ALL_POSSIBLE_ACTIONS = [
 12 | 	'MoveAhead',
 13 | 	'MoveBack',
 14 | 	'RotateRight',
 15 | 	'RotateLeft',
 16 | 	# 'Stop'   
 17 | ]
 18 | 
 19 | class SimpleImageViewer(object):
 20 | 
 21 |   def __init__(self, display=None):
 22 |     self.window = None
 23 |     self.isopen = False
 24 |     self.display = display
 25 | 
 26 |   def imshow(self, arr):
 27 |     if self.window is None:
 28 |       height, width, channels = arr.shape
 29 |       self.window = pyglet.window.Window(width=width, height=height, display=self.display, caption="THOR Browser")
 30 |       self.width = width
 31 |       self.height = height
 32 |       self.isopen = True
 33 | 
 34 |     assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape"
 35 |     image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
 36 |     self.window.clear()
 37 |     self.window.switch_to()
 38 |     self.window.dispatch_events()
 39 |     image.blit(0,0)
 40 |     self.window.flip()
 41 | 
 42 |   def close(self):
 43 |     if self.isopen:
 44 |       self.window.close()
 45 |       self.isopen = False
 46 | 
 47 |   def __del__(self):
 48 |     self.close()
 49 | 
 50 | def run(file_name=None):
 51 | 	# file_name = file_path.split('/')[-1].split('.')[0]
 52 | 	controller = ai2thor.controller.Controller()
 53 | 	controller.start()
 54 | 
 55 | 	controller.reset("FloorPlan203")
 56 | 	y_coord = 1.25
 57 | 	event = controller.step(dict(action='Initialize', gridSize=0.5, cameraY=y_coord, visibilityDistance=1.0))
 58 | 	all_visible_objects = list(np.unique([obj['objectType'] for obj in event.metadata['objects']]))
 59 | 	
 60 | 	rotation = 0.0
 61 | 	while True:  # making a loop
 62 | 		try:  # used try so that if user pressed other than the given key error will not be shown
 63 | 			key = click.getchar()
 64 | 			if key =='a':  # Rotate Left
 65 | 				rotation -= 22.5
 66 | 				if rotation < 0:
 67 | 					rotation = rotation + 360
 68 | 				event = controller.step(dict(action='Rotate', rotation=rotation))
 69 | 			elif key =='d':
 70 | 				rotation += 22.5
 71 | 				if rotation > 360:
 72 | 					rotation = rotation - 360
 73 | 				event = controller.step(dict(action='Rotate', rotation=rotation))
 74 | 			elif key =='w':
 75 | 				event = controller.step(dict(action='MoveAhead'))
 76 | 			elif key =='s':
 77 | 				event = controller.step(dict(action='MoveBack'))
 78 | 			elif key =='z':
 79 | 				event = controller.step(dict(action='LookDown'))
 80 | 			elif key =='x':
 81 | 				event = controller.step(dict(action='LookUp'))
 82 | 			elif key =='q':
 83 | 				controller.stop()
 84 | 				break
 85 | 			elif key =='r':
 86 | 				scene = input("Scene id: ")
 87 | 				controller.reset('FloorPlan{}'.format(scene))
 88 | 				event = controller.step(dict(action='Initialize', gridSize=0.5, cameraY=y_coord))
 89 | 			else:
 90 | 				print("Key not supported! Try a, d, w, s, q, r.")
 91 | 			print((event.metadata['agent']['position']['x'], event.metadata['agent']['position']['z'], event.metadata['agent']['rotation']))
 92 | 			# print([(obj['objectType'], obj['distance']) for obj in event.metadata['objects'] if obj['visible']])
 93 | 		except:
 94 | 			print("Key not supported! Try a, d, w, s, q, r.")
 95 | 
 96 | 
 97 | def key_press(key, mod):
 98 | 	global human_agent_action, human_wants_restart, stop_requested
 99 | 	if key == ord('R') or key == ord('r'): # r/R
100 | 		human_wants_restart = True
101 | 	if key == ord('Q') or key == ord('q'): # q/Q
102 | 		stop_requested = True
103 | 
104 | 	if key == 0xFF52: # move ahead
105 | 		human_agent_action = 0
106 | 	if key == 0xFF54: # move back
107 | 		human_agent_action = 1
108 | 	if key == 0xFF53: # turn right
109 | 		human_agent_action = 2
110 | 	if key == 0xFF51: # turn left
111 | 		human_agent_action = 3
112 | 
113 | 	if key == ord('z'): # look down
114 | 		human_agent_action = 4
115 | 	if key == ord('x'): # look up
116 | 		human_agent_action = 5
117 | 
118 | if __name__ == '__main__':
119 | 	
120 | 	# run()
121 | 
122 | 	angle = 45.0
123 | 
124 | 	human_agent_action = None
125 | 	human_wants_restart = False
126 | 	stop_requested = False
127 | 	next_position = None
128 | 	visible = None
129 | 
130 | 	f = h5py.File('dumped/FloorPlan317.hdf5', "r")
131 | 	observations = f['observations']
132 | 	graph = f['graph']
133 | 	visible_objects = f['visible_objects']
134 | 	dump_features = f['dump_features']
135 | 	states = f['locations'][()]
136 | 
137 | 	config = json.load(open('config.json'))
138 | 	categories = list(config['new_objects'].keys())
139 | 
140 | 	k = np.random.randint(0, observations.shape[0])
141 | 	while states[k][2] % angle != 0.0:
142 | 		k = np.random.randint(0, observations.shape[0])
143 | 	current_position = k
144 | 
145 | 	viewer = SimpleImageViewer()
146 | 	viewer.imshow(observations[current_position].astype(np.uint8))
147 | 	viewer.window.on_key_press = key_press
148 | 
149 | 	print("Use arrow keys to move the agent.")
150 | 	print("Press R to reset agent\'s location.")
151 | 	print("Press Q to quit.")
152 | 
153 | 	while True:
154 | 		# waiting for keyboard input
155 | 		if human_agent_action is not None:
156 | 			# move actions
157 | 			if human_agent_action == 2 or human_agent_action == 3:
158 | 				next_position = current_position
159 | 				for _ in range(int(angle/ 22.5)):
160 | 					next_position = graph[next_position][human_agent_action]
161 | 			else:
162 | 				next_position = graph[current_position][human_agent_action]
163 | 				
164 | 			current_position = next_position if next_position != -1 else current_position
165 | 			distances = [(categories[i], dump_features[current_position][i]) for i in list(np.where(dump_features[current_position][:-4] > 0)[0])]
166 | 			print(distances, dump_features[current_position][-4:])
167 | 			visible = visible_objects[current_position].split(',')
168 | 			human_agent_action = None
169 | 
170 | 		# waiting for reset command
171 | 		if human_wants_restart:
172 | 			# reset agent to random location
173 | 			k = np.random.randint(0, observations.shape[0])
174 | 			while states[k][2] % angle != 0.0:
175 | 				k = np.random.randint(0, observations.shape[0])
176 | 			current_position = k
177 | 
178 | 			human_wants_restart = False
179 | 
180 | 		# check collision
181 | 		if next_position == -1:
182 | 			print('Collision occurs.')
183 | 
184 | 		# check quit command
185 | 		if stop_requested: break
186 | 
187 | 		viewer.imshow(observations[current_position].astype(np.uint8))
188 | 		if visible is not None and len(list(visible)) > 0:
189 | 			print("Visible: {}".format(visible))
190 | 			visible = None
191 | 
192 | 	print("Goodbye.")


--------------------------------------------------------------------------------
/pytorch_a3c/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pytorch_a3c/README.md:
--------------------------------------------------------------------------------
 1 | # File Descriptions
 2 | 
 3 | - `main.py`: file to run, please read the arguments and corresponding descriptions
 4 | - `train.py`: training file, initialized in main thread as a independent process (A3C), support multi-gpus
 5 | - `test.py`: file to test trained model
 6 | - `dumping.py`: file to run ai2thor controller and dump needed information to hdf5 files (note that the number of actions is 4 and angle is 22.5 by default)
 7 | - `layers.py`: contains GCN code
 8 | - `utils.py`: contains necessary functions
 9 | - `keyboard_agent.py`: to run and interact with dumped file 
10 | - `config.json`: configurations including rewarding scheme, file paths, data split
11 | - `env/ai2thor_env.py`: environment
12 | 


--------------------------------------------------------------------------------
/pytorch_a3c/layers.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import torch
 3 | from torch.nn.parameter import Parameter
 4 | from torch.nn.modules.module import Module
 5 | 
 6 | 
 7 | class GraphConvolution(Module):
 8 |     """
 9 |     Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
10 |     """
11 | 
12 |     def __init__(self, in_features, out_features, bias=True):
13 |         super(GraphConvolution, self).__init__()
14 |         self.in_features = in_features
15 |         self.out_features = out_features
16 |         self.weight = Parameter(torch.FloatTensor(in_features, out_features))
17 |         if bias:
18 |             self.bias = Parameter(torch.FloatTensor(out_features))
19 |         else:
20 |             self.register_parameter('bias', None)
21 |         self.reset_parameters()
22 | 
23 |     def reset_parameters(self):
24 |         stdv = 1. / math.sqrt(self.weight.size(1))
25 |         self.weight.data.uniform_(-stdv, stdv)
26 |         if self.bias is not None:
27 |             self.bias.data.uniform_(-stdv, stdv)
28 | 
29 |     def forward(self, input, adj):
30 |         support = torch.mm(input, self.weight)
31 |         output = torch.spmm(adj, support)
32 |         if self.bias is not None:
33 |             return output + self.bias
34 |         else:
35 |             return output
36 | 
37 |     def __repr__(self):
38 |         return self.__class__.__name__ + ' (' \
39 |                + str(self.in_features) + ' -> ' \
40 |                + str(self.out_features) + ')'
41 | 


--------------------------------------------------------------------------------
/pytorch_a3c/main.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Adapted from: https://github.com/ikostrikov/pytorch-a3c/blob/master/main.py
  3 | The main file needed within a3c. Runs of the train and test functions from their respective files.
  4 | Example of use:
  5 | `cd algorithms/a3c`
  6 | `python main.py`
  7 | 
  8 | Runs A3C on our AI2ThorEnv wrapper with default params (4 processes). Optionally it can be
  9 | run on any atari environment as well using the --atari and --atari-env-name params.
 10 | """
 11 | import sys
 12 | import argparse
 13 | import os
 14 | import numpy as np
 15 | import torch
 16 | import torch.multiprocessing as mp
 17 | from multiprocessing import Manager
 18 | import json
 19 | import h5py
 20 | 
 21 | sys.path.append('..') # to access env package
 22 | 
 23 | from env.ai2thor_env import AI2ThorDumpEnv
 24 | from optimizers import SharedAdam, SharedRMSprop
 25 | from model import ActorCritic
 26 | from test import test, live_test, test_multi
 27 | from train import train, train_multi
 28 | 
 29 | # Based on
 30 | # https://github.com/pytorch/examples/tree/master/mnist_hogwild
 31 | # Training settings
 32 | parser = argparse.ArgumentParser(description='A3C')
 33 | parser.add_argument('--about', type=str, default="training A3C", required=True,
 34 |                     help='description about training, also the name of saving directory, \
 35 |                             just a way to control which test case was run')
 36 | parser.add_argument('--lr', type=float, default=7e-4,
 37 |                     help='learning rate (default: 7e-4)')
 38 | parser.add_argument('--angle', type=float, default=45.0,
 39 |                     help='rotation angle')
 40 | parser.add_argument('--gamma', type=float, default=0.99,
 41 |                     help='discount factor for rewards (default: 0.99)')
 42 | parser.add_argument('--tau', type=float, default=0.96,
 43 |                     help='parameter for GAE (default: 1.00)')
 44 | parser.add_argument('--ec', type=float, default=0.01,
 45 |                     help='entropy term coefficient (default: 0.01)')
 46 | parser.add_argument('--vc', type=float, default=0.5,
 47 |                     help='value loss coefficient (default: 0.5)')
 48 | parser.add_argument('--max_grad_norm', type=float, default=10,
 49 |                     help='value loss coefficient (default: 10)')
 50 | parser.add_argument('--lr_decay', type=int, default=0,
 51 |                     help='whether to use learning rate decay')
 52 | parser.add_argument('--seed', type=int, default=1,
 53 |                     help='random seed (default: 1)')
 54 | parser.add_argument('--room_id', type=int, default=0,
 55 |                     help='room id (default: 0)')
 56 | parser.add_argument('--test', type=int, default=0,
 57 |                     help='whether to activate testing phase')
 58 | parser.add_argument('--live_test', type=int, default=0,
 59 |                     help='whether to activate live testing phase')
 60 | parser.add_argument('--action_size', type=int, default=4,
 61 |                     help='number of possible actions')
 62 | parser.add_argument('--num_processes', type=int, default=20,
 63 |                     help='how many training processes to use (default: 1)')
 64 | parser.add_argument('--num_iters', type=int, default=100,
 65 |                     help='number of forward steps in A3C (default: 20)')
 66 | parser.add_argument('--num_epochs', type=int, default=20000,
 67 |                     help='number of epochs to run on each thread')
 68 | parser.add_argument('--max_episode_length', type=int, default=1000,
 69 |                     help='maximum length of an episode (default: 1000)')
 70 | parser.add_argument('--siamese', type=int, default=0)
 71 | parser.add_argument('--train_cnn', type=int, default=0,
 72 |                     help='whether to re-train cnn module')
 73 | parser.add_argument('--history_size', type=int, default=4,
 74 |                     help='whether to stack frames')
 75 | parser.add_argument('--optim', type=int, default=1,
 76 |                     help='optimizer: 0 for Adam, 1 for RMSprop')
 77 | parser.add_argument('--multi_scene', type=int, default=1,
 78 |                     help='whether to train on multiple scenes')
 79 | parser.add_argument('--lstm', type=int, default=0,
 80 |                     help='whether to use lstm instead of stacking features')
 81 | parser.add_argument('--onehot', type=int, default=1,
 82 |                     help='whether to use onehot vector as input feature')
 83 | parser.add_argument('--embed', type=int, default=1,
 84 |                     help='embedding mode: 0 for onehot, 1 for fasttext')
 85 | parser.add_argument('--random_test', type=int, default=0,
 86 |                     help='whether to test performance of a random agent')
 87 | parser.add_argument('--use_gcn', type=int, default=0,
 88 |                     help='whether to include gcn')
 89 | parser.add_argument('--anti_col', type=int, default=0,
 90 |                     help='whether to include collision penalty to rewarding scheme')
 91 | parser.add_argument('--use_graph', type=int, default=0,
 92 |                     help='whether to use relations vector from graph to replace gcn')
 93 | parser.add_argument('--yolo_gcn', type=int, default=0,
 94 |                     help='whether to use yolo as input for gcn instead of resnet')
 95 | parser.add_argument('--no_shared', type=int, default=0,
 96 |                     help='use an optimizer without shared momentum.')
 97 | parser.add_argument('--scene_id', type=int, default=1,
 98 |                     help='scene id (default: 1)')
 99 | parser.add_argument('--gpu_ids', type=int, default=-1,
100 |                     nargs='+', help='GPUs to use [-1 CPU only] (default: -1)')
101 | parser.add_argument('--hard', type=int, default=1,
102 |                     help='whether to make environment harder\
103 |                         0: agent only has to reach the correct position\
104 |                         1: agent has to reach the correct position and has right rotation')
105 | 
106 | parser.add_argument('--config_file', type=str, default="../config.json")
107 | parser.add_argument('--folder', type=str, default=None)
108 | 
109 | ALL_ROOMS = {
110 |     0: "Kitchens",
111 |     1: "Living Rooms",
112 |     2: "Bedrooms",
113 |     3: "Bathrooms"
114 | }
115 | 
116 | def read_config(config_path):
117 |     if os.path.isfile(config_path):
118 |         with open(config_path) as f:
119 |             config = json.load(f)
120 |     return config
121 | 
122 | def read_weights(folder):
123 |     weights = [f for f in os.listdir(folder) if f.endswith('.pth')]
124 |     histories = [f for f in os.listdir(folder) if f.endswith('.pkl')]
125 | 
126 |     arguments = json.load(open(folder + '/arguments.json'))
127 |     if arguments['multi_scene']:        
128 |         scenes = list(set([f.split('_')[0] for f in os.listdir(folder) if f.endswith('.pkl')]))
129 |         targets = []
130 |     else:
131 |         scenes = ["FloorPlan{}".format(arguments['scene_id'])]        
132 |         targets = list(set([f.split('_')[1] for f in os.listdir(folder) if f.endswith('.pkl')]))
133 |     
134 |     print(list(zip(range(len(weights)), weights)))
135 |     wid = input("Please specify weights: ")
136 |     weights = weights[int(wid)]
137 | 
138 |     return os.path.join(folder, weights), arguments, {'scenes': scenes, 'targets': targets}
139 | 
140 | if __name__ == '__main__':
141 |     os.environ['OMP_NUM_THREADS'] = '1'
142 |     args = parser.parse_args()
143 |     torch.manual_seed(args.seed)
144 |     if args.gpu_ids == -1:
145 |         args.gpu_ids = [-1]
146 |     else:
147 |         torch.cuda.manual_seed(args.seed)
148 | 
149 |     config = read_config(args.config_file)
150 |     # room = config['rooms'][ALL_ROOMS[args.room_id]]
151 | 
152 |     if args.folder is not None:
153 |         weights, arguments, info = read_weights(args.folder)
154 | 
155 |         multi_scene =  len(info['scenes']) > 1
156 | 
157 |         if args.test or args.live_test:
158 |             if not args.random_test:
159 |                 shared_model = ActorCritic(config, arguments)
160 |                 shared_model.share_memory()
161 | 
162 |                 shared_model.load_state_dict(torch.load(weights, map_location='cpu'))
163 |                 print("loaded model")
164 |             else:
165 |                 print("testing random agent ..")
166 |                 shared_model = None
167 | 
168 |     if args.live_test:
169 |         assert args.folder is not None
170 |         print("Start testing ..")
171 | 
172 |         if multi_scene:
173 |             print(list(zip(range(len(info['scenes'])), info['scenes'])))
174 |             command = int(input("Please specify scene id. \nYour input:"))
175 |             training_scene = info['scenes'][command]
176 | 
177 |             f = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], training_scene)), 'r')
178 |             # training_objects = f['all_visible_objects'][()].tolist()
179 |             training_objects = ["GarbageCan", "Sink", "Bread", "StoveKnob", "SinkBasin", "StoveBurner", "Fridge", "CounterTop", "Microwave", "LightSwitch", "CoffeeMachine", "Cabinet"]
180 |             f.close()
181 |         else:
182 |             training_scene = info['scenes'][0]
183 |             print(list(zip(range(len(info['targets'])), info['targets'])))
184 |             command = input("Please specify target ids, you can choose either individually (e.g: 0,1,2) or by range (e.g: 0-4)\nYour input:")
185 | 
186 |             if '-' not in command:
187 |                 target_ids = [int(i.strip()) for i in command.split(",")]
188 |             else:
189 |                 target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1))
190 | 
191 |             training_objects = [info['targets'][target_id] for target_id in target_ids]
192 | 
193 |         training_objects.sort()
194 |         live_test(training_scene, training_objects, shared_model, config, arguments)
195 | 
196 |     else:
197 |         if args.test:
198 |             assert args.folder is not None
199 |             if not multi_scene:
200 |                 
201 |                 training_scene = info['scenes'][0]
202 |                 testing_objects = config["picked"][training_scene]['test']
203 |                 training_objects = info['targets']
204 |                 all_visible_objects =  training_objects + testing_objects
205 | 
206 |                 phase = ['train'] * len(training_objects) + ['test'] * len(testing_objects)
207 | 
208 |                 print(list(zip(range(len(phase)), all_visible_objects, phase)))
209 |                 command = input("Please specify target ids, you can choose either individually (e.g: 0,1,2) or by range (e.g: 0-4)\nYour input:")
210 |                 if '-' not in command:
211 |                     target_ids = [int(i.strip()) for i in command.split(",")]
212 |                 else:
213 |                     target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1))
214 | 
215 |                 chosen_objects = [all_visible_objects[target_id] for target_id in target_ids]
216 |                 check_phase = lambda c: 'train' if os.path.isfile(os.path.join(args.folder, "net_{}.pth".format(c))) else 'test'
217 |                 chosen_phases  = [check_phase(c) for c in chosen_objects]
218 | 
219 |                 results = mp.Array('f', len(chosen_objects))
220 |                 processes = []
221 |                 for rank, obj in enumerate(chosen_objects):
222 |                     p = mp.Process(target=test, args=(training_scene, obj, rank, shared_model, \
223 |                                     results, config, arguments))
224 |                     p.start()
225 |                     processes.append(p)
226 | 
227 |                 for p in processes:
228 |                     p.join()
229 | 
230 |                 print("Testing accuracies:", list(zip(chosen_objects, chosen_phases, results[:])))
231 | 
232 |             else:
233 |                 arguments['test'] = 1
234 |                 
235 |                 print(list(zip(range(len(ALL_ROOMS)), list(ALL_ROOMS.values()))))
236 |                 command = input("Please specify room type:")
237 |                 scene_type = ALL_ROOMS[int(command)]
238 | 
239 |                 training_scenes = config['rooms'][scene_type]['train_scenes']
240 |                 testing_scenes = config['rooms'][scene_type]['test_scenes']
241 | 
242 |                 command = input("Training/testing scenes. (0, 1): ")
243 |                 scenes = [training_scenes, testing_scenes][int(command)]
244 | 
245 |                 results = Manager().dict()
246 | 
247 |                 all_visible_objects = config['rooms'][scene_type]['train_objects'] + config['rooms'][scene_type]['test_objects']
248 |                 chosen_phases = ['train'] * len(config['rooms'][scene_type]['train_objects']) + ['test'] * len(config['rooms'][scene_type]['test_objects'])
249 |                 for obj in all_visible_objects:
250 |                     results[obj] = []
251 | 
252 |                 processes = []
253 | 
254 |                 counter = mp.Value('i', 0)
255 |                 lock = mp.Lock()
256 | 
257 |                 for rank in range(0, len(scenes)):
258 |                     p = mp.Process(target=test_multi, args=(scenes[rank], rank, shared_model, \
259 |                                     results, config, arguments))
260 |                     p.start()
261 |                     processes.append(p)
262 | 
263 |                 for p in processes:
264 |                     p.join()        
265 | 
266 |                 accuracies = []
267 |                 avg_sc = {'train': [], 'test': []}
268 |                 avg_spl = {'train': [], 'test': []}
269 |                 for obj in all_visible_objects:
270 |                     accuracies.append((np.mean(results[obj]), np.mean(np.array(results[obj], dtype=bool))))
271 | 
272 |                 for phase, acc in zip(chosen_phases, accuracies):
273 |                     avg_spl[phase].append(acc[0])
274 |                     avg_sc[phase].append(acc[1])
275 | 
276 |                 avg_sc['train'] = np.mean(avg_sc['train'])
277 |                 avg_sc['test'] = np.mean(avg_sc['test'])
278 |                 avg_spl['train'] = np.mean(avg_spl['train'])
279 |                 avg_spl['test'] = np.mean(avg_spl['test'])
280 | 
281 |                 print("Accuracies:", list(zip(all_visible_objects, chosen_phases, accuracies)))    
282 |                 print("[Avergae] SPL: {} | SR: {}".format(avg_spl, avg_sc))
283 | 
284 |         else:
285 |             arguments = vars(args)
286 |             weights = None
287 | 
288 |             if not arguments['multi_scene']:
289 |         
290 |                 if not os.path.isdir("training-history/{}".format(arguments['about'])):
291 |                     os.mkdir("training-history/{}".format(arguments['about']))
292 | 
293 |                 with open('training-history/{}/arguments.json'.format(arguments['about']), 'w') as outfile:
294 |                     json.dump(arguments, outfile)
295 |                 
296 |                 training_scene = "FloorPlan{}".format(arguments['scene_id'])
297 |                 f = h5py.File("{}.hdf5".format(os.path.join(config['dump_path'], training_scene)), 'r')
298 |                 all_visible_objects = f['all_visible_objects'][()].tolist()
299 |                 f.close()
300 |                 
301 |                 testing_objects = config["picked"][training_scene]['test']
302 |                 trainable_objects = list(set(all_visible_objects) - set(testing_objects))
303 |             
304 |                 print(list(zip(range(len(trainable_objects)), trainable_objects)))
305 |                 command = input("Please specify target ids, you can choose either individually (e.g: 0,1,2) or by range (e.g: 0-4)\nYour input:")
306 |                 if '-' not in command:
307 |                     target_ids = [int(i.strip()) for i in command.split(",")]
308 |                 else:
309 |                     target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1))
310 | 
311 |                 training_objects = [trainable_objects[target_id] for target_id in target_ids]
312 |                 num_thread_each = arguments['num_processes'] // len(training_objects)
313 |                 object_threads = []
314 | 
315 |                 for obj in training_objects:
316 |                     object_threads += [obj] * num_thread_each
317 | 
318 |                 object_threads += [np.random.choice(training_objects)] * (arguments['num_processes'] - len(object_threads))
319 | 
320 |                 print("Start training agent to find {} in {}".format(training_objects, training_scene))
321 | 
322 |                 shared_model = ActorCritic(config, arguments)
323 |                 shared_model.share_memory()
324 | 
325 |                 if weights is not None:
326 |                     shared_model.load_state_dict(torch.load(weights, map_location='cpu'))
327 |                     print("loaded model")
328 | 
329 |                 scheduler = None
330 |                 if arguments['no_shared']:
331 |                     optimizer = None
332 |                 else:
333 |                     if arguments['optim'] == 0:
334 |                         optimizer = SharedAdam(shared_model.parameters(), lr=arguments['lr'])
335 |                     else:
336 |                         optimizer = SharedRMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1)
337 | 
338 |                     optimizer.share_memory()
339 |                     if arguments['lr_decay']:
340 |                         scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.99995)
341 | 
342 |                 processes = []
343 | 
344 |                 counter = mp.Value('i', 0)
345 |                 lock = mp.Lock()
346 | 
347 |                 for rank in range(0, arguments['num_processes']):
348 |                     p = mp.Process(target=train, args=(training_scene, object_threads[rank], rank, shared_model, \
349 |                                     scheduler, counter, lock, config, arguments, optimizer))
350 |                     p.start()
351 |                     processes.append(p)
352 | 
353 |                 for p in processes:
354 |                     p.join()
355 | 
356 |             else:
357 | 
358 |                 print(list(zip(range(len(ALL_ROOMS)), list(ALL_ROOMS.values()))))
359 |                 command = input("Please specify room type:")
360 |                 scene_type = ALL_ROOMS[int(command)]
361 | 
362 |                 training_scenes = config['rooms'][scene_type]['train_scenes']
363 |                 num_thread_each = arguments['num_processes'] // len(training_scenes)
364 |                 scene_threads = []
365 | 
366 |                 for s in training_scenes:
367 |                     scene_threads += [s] * num_thread_each
368 | 
369 |                 scene_threads += list(np.random.choice(training_scenes, arguments['num_processes'] - len(scene_threads)))
370 | 
371 |                 if not os.path.isdir("training-history/{}".format(arguments['about'])):
372 |                     os.mkdir("training-history/{}".format(arguments['about']))
373 | 
374 |                 with open('training-history/{}/arguments.json'.format(arguments['about']), 'w') as outfile:
375 |                     json.dump(arguments, outfile)
376 | 
377 |                 print("Start training agent in {}".format(training_scenes))
378 | 
379 |                 shared_model = ActorCritic(config, arguments)
380 |                 shared_model.share_memory()
381 | 
382 |                 if weights is not None:
383 |                     shared_model.load_state_dict(torch.load(weights, map_location='cpu'))
384 |                     print("loaded model")
385 | 
386 |                 scheduler = None
387 |                 if arguments['no_shared']:
388 |                     optimizer = None
389 |                 else:
390 |                     if arguments['optim'] == 0:
391 |                         optimizer = SharedAdam(shared_model.parameters(), lr=arguments['lr'])
392 |                     else:
393 |                         optimizer = SharedRMSprop(shared_model.parameters(), lr=arguments['lr'], alpha=0.99, eps=0.1)
394 | 
395 |                     optimizer.share_memory()
396 |                     if arguments['lr_decay']:
397 |                         scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.99995)
398 | 
399 |                 processes = []
400 | 
401 |                 counter = mp.Value('i', 0)
402 |                 lock = mp.Lock()
403 | 
404 |                 for rank in range(0, arguments['num_processes']):
405 |                     p = mp.Process(target=train_multi, args=(scene_threads[rank], rank, shared_model, \
406 |                                     scheduler, counter, lock, config, arguments, optimizer))
407 |                     p.start()
408 |                     processes.append(p)
409 | 
410 |                 for p in processes:
411 |                     p.join()                            
412 | 
413 | 


--------------------------------------------------------------------------------
/pytorch_a3c/model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pickle
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | import torchvision.models as models
  7 | from torch.autograd import Variable
  8 | from layers import GraphConvolution
  9 | from utils import *
 10 | 
 11 | class GCN(nn.Module):
 12 |     def __init__(self, nfeat, nhid, nclass, dropout):
 13 |         super(GCN, self).__init__()
 14 | 
 15 |         self.gc1 = GraphConvolution(nfeat, nhid)
 16 |         self.gc2 = GraphConvolution(nhid, nhid)
 17 |         self.gc3 = GraphConvolution(nhid, nclass)
 18 |         self.dropout = dropout
 19 | 
 20 |     def forward(self, x, adj):
 21 |         x = F.relu(self.gc1(x, adj))
 22 |         x = F.dropout(x, self.dropout, training=self.training)
 23 |         x = F.relu(self.gc2(x, adj))
 24 |         x = F.dropout(x, self.dropout, training=self.training)
 25 |         x = F.relu(self.gc3(x, adj))
 26 |         return x
 27 | 
 28 | class ActorCritic(torch.nn.Module):    
 29 | 
 30 |     def __init__(self, config, arguments, gpu_id=-1):
 31 |         super(ActorCritic, self).__init__()
 32 | 
 33 |         self.config = config
 34 |         self.arguments = arguments
 35 | 
 36 |         if gpu_id != -1:
 37 |             torch.cuda.set_device(gpu_id)
 38 |             self.dtype = torch.cuda.FloatTensor
 39 |         else:
 40 |             self.dtype = torch.FloatTensor
 41 |         try:
 42 |             self.use_lstm = arguments['lstm']
 43 |         except KeyError:
 44 |             self.use_lstm = False
 45 | 
 46 |         self.history_size = arguments['history_size']
 47 | 
 48 |         self.input_size = 2048
 49 | 
 50 |         if arguments['onehot']:
 51 |             self.input_size = 109
 52 |         
 53 |         if arguments['train_cnn']:
 54 |             self.conv1 = nn.Conv2d(3, 32, 3, stride=2, padding=1)
 55 |             self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
 56 |             self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
 57 |             self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
 58 |             self.conv5 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
 59 |             self.input_size = 32 * 4 * 4
 60 | 
 61 |         if self.use_lstm:
 62 |             assert arguments['history_size'] == 1, "History size should be 1 if you want to use lstm."
 63 |             self.visual_ft = nn.LSTMCell(input_size=self.input_size, hidden_size=256)
 64 |         else:
 65 |             self.visual_ft = nn.Linear(in_features=self.input_size * self.history_size, out_features=512)
 66 | 
 67 |         if arguments["embed"] == 0: 
 68 |             self.embeddings = pickle.load(open(config["embeddings_onehot"], 'rb'))
 69 |         else:
 70 |             self.embeddings = pickle.load(open(config["embeddings_fasttext"], 'rb'))
 71 | 
 72 |         self.semantic_size = list(self.embeddings.values())[0].shape[0]
 73 |         self.semantic_ft = nn.Linear(in_features=self.semantic_size, out_features=512)
 74 | 
 75 |         self.categories = list(config['new_objects'].keys())
 76 |         self.cate2idx = config['new_objects']
 77 |         self.num_objects = len(self.categories)
 78 |         
 79 |         self.all_embeddings = torch.stack([torch.from_numpy(self.embeddings[w]) for w in self.categories], 0).type(self.dtype)
 80 |         
 81 |         if arguments['use_gcn']:
 82 |             self.categories = list(config['new_objects'].keys())
 83 |             self.num_objects = len(self.categories)
 84 | 
 85 |             fused_size = 512 * 3
 86 |             self.adj = normalize(np.load(self.config['adj_file']))
 87 |             self.adj = torch.from_numpy(self.adj).type(self.dtype)
 88 | 
 89 |             if not arguments['yolo_gcn']:
 90 |                 self.score_to_512 = nn.Linear(in_features=1000, out_features=512)
 91 |                 self.gcn = GCN(nfeat=1024, nhid=1024, nclass=1, dropout=0.5)
 92 |             else:
 93 |                 self.gcn = GCN(nfeat=self.num_objects + 512, nhid=self.num_objects + 512, nclass=1, dropout=0.5)
 94 | 
 95 |             self.gcn_to_512 = nn.Linear(in_features=self.num_objects, out_features=512)
 96 | 
 97 |         elif arguments['use_graph']:
 98 |             self.adj = np.load(self.config['adj_file'])
 99 |             self.adj = torch.from_numpy(self.adj).type(self.dtype)
100 | 
101 |             self.graph_ft = nn.Linear(in_features=self.num_objects, out_features=self.num_objects)
102 |             fused_size = 512 * 2 + self.num_objects
103 |         else:
104 |             fused_size = 512 * 2
105 | 
106 |         self.hidden_mlp = nn.Linear(in_features=fused_size, out_features=512)
107 |         self.critic_linear = nn.Linear(512, 1)
108 |         self.actor_linear = nn.Linear(512, arguments['action_size'])
109 | 
110 |         self.apply(kaiming_weights_init)
111 |         self.actor_linear.weight.data = normalized_columns_initializer(
112 |                                             self.actor_linear.weight.data, 0.01)
113 |         self.actor_linear.bias.data.fill_(0)
114 |         self.critic_linear.weight.data = normalized_columns_initializer(
115 |                                             self.critic_linear.weight.data, 1.0)
116 |         self.critic_linear.bias.data.fill_(0)
117 | 
118 |     def learned_embedding(self, word):
119 |         embeded = torch.from_numpy(self.embeddings[word]).type(self.dtype)
120 |         embeded = embeded.view(1, embeded.size(0))
121 |         semantic = F.relu(self.semantic_ft(embeded))
122 | 
123 |         # if self.arguments['use_graph']:
124 |         #     relations = self.adj[self.cate2idx[word]]
125 |         #     r = F.relu(self.graph_ft(relations))
126 |         #     r = r.view(1, r.numel())
127 |         #     joint_embeddings = torch.cat((semantic, r), 1)
128 |         #     return joint_embeddings
129 | 
130 |         return semantic
131 | 
132 | 
133 |     def forward(self, inputs, scores, word):
134 |         if self.arguments['lstm']:
135 |             inputs, (hx, cx) = inputs
136 | 
137 |         if self.arguments['train_cnn']:
138 |             assert inputs.shape == (self.history_size, 3, 128, 128)
139 |             inputs = torch.from_numpy(inputs).type(self.dtype)
140 |             x = F.elu(self.conv1(inputs))
141 |             x = F.elu(self.conv2(x))
142 |             x = F.elu(self.conv3(x))
143 |             x = F.elu(self.conv4(x))
144 |             x = F.elu(self.conv5(x))
145 |             feature = x.view(-1, self.input_size * self.history_size)
146 |             visual = F.relu(self.visual_ft(feature))
147 | 
148 |         else:
149 |             torch_inputs = [torch.from_numpy(inp).type(self.dtype) for inp in inputs]    
150 | 
151 |             if not self.use_lstm:
152 |                 joint_features = torch.cat(torch_inputs)
153 |                 joint_features = joint_features.view(1, -1)
154 |                 visual = F.relu(self.visual_ft(joint_features))
155 |             else:    
156 |                 feature = torch_inputs[0].view(-1, self.input_size)
157 |                 hx, cx = self.visual_ft(feature, (hx, cx))
158 |                 visual = hx.view(1, -1)
159 |             
160 |         embeded = torch.from_numpy(self.embeddings[word]).type(self.dtype)
161 |         embeded = embeded.view(1, embeded.size(0))
162 |         semantic = F.relu(self.semantic_ft(embeded))
163 |         
164 |         if self.arguments['use_gcn']:
165 |             scores = torch.from_numpy(scores).type(self.dtype)
166 |             scores = scores.view(1, scores.numel())
167 |             
168 |             if not self.arguments['yolo_gcn']:
169 |                 scores_512 = F.relu(self.score_to_512(scores))
170 | 
171 |             nodes = []
172 |             ems_512 = F.relu(self.semantic_ft(self.all_embeddings))
173 |             
174 |             if not self.arguments['yolo_gcn']:
175 |                 nodes = torch.cat((scores_512.repeat(self.num_objects, 1), ems_512), 1)
176 |             else:
177 |                 nodes = torch.cat((scores.repeat(self.num_objects, 1), ems_512), 1)
178 |                 
179 |             gcn_out = self.gcn(nodes, self.adj)
180 |             gcn_out = gcn_out.view(1, gcn_out.numel())
181 |             gcn_512 = F.relu(self.gcn_to_512(gcn_out))
182 | 
183 |             joint_embeddings = torch.cat((visual, semantic, gcn_512), 1)
184 | 
185 |         elif self.arguments['use_graph']:
186 |             # relations = self.adj[self.cate2idx[word]].numpy()
187 |             # detections = inputs[0][:-4].astype(bool).astype(int)
188 |             # revec = torch.from_numpy(relations * detections).type(self.dtype)
189 |             revec = self.adj[self.cate2idx[word]] 
190 |             r = F.relu(self.graph_ft(revec))
191 |             r = r.view(1, r.numel())
192 |             joint_embeddings = torch.cat((visual, semantic, r), 1)
193 | 
194 |         else:
195 |             joint_embeddings = torch.cat((visual, semantic), 1)
196 | 
197 |         x = self.hidden_mlp(joint_embeddings)
198 |         x = F.relu(x)
199 |         
200 |         if self.arguments['lstm']:
201 |             return self.critic_linear(x), self.actor_linear(x), (hx, cx)
202 |         else:
203 |             return self.critic_linear(x), self.actor_linear(x)
204 | 


--------------------------------------------------------------------------------
/pytorch_a3c/optimizers.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Adapted from: https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py
  3 | 
  4 | In the original A3C paper (https://arxiv.org/abs/1602.01783), the authors compared 3 different
  5 | optimizers i.e. Momentum SGD, RMSProp and Shared RMSProp (check final part of section 4). The
  6 | difference between the 3rd compared to the 2nd is whether to compute shared statistics across all
  7 | threads, which was found to be more robust. It seems the equivalent was implemented for Adam
  8 | below.
  9 | """
 10 | 
 11 | import math
 12 | 
 13 | import torch
 14 | import torch.optim as optim
 15 | 
 16 | 
 17 | class SharedAdam(optim.Adam):
 18 |     """Implements Adam algorithm with shared states.
 19 |     """
 20 | 
 21 |     def __init__(self,
 22 |                  params,
 23 |                  lr=1e-3,
 24 |                  betas=(0.9, 0.999),
 25 |                  eps=1e-8,
 26 |                  weight_decay=0):
 27 |         super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)
 28 | 
 29 |         for group in self.param_groups:
 30 |             for p in group['params']:
 31 |                 state = self.state[p]
 32 |                 state['step'] = torch.zeros(1)
 33 |                 state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
 34 |                 state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()
 35 | 
 36 |     def share_memory(self):
 37 |         for group in self.param_groups:
 38 |             for p in group['params']:
 39 |                 state = self.state[p]
 40 |                 state['step'].share_memory_()
 41 |                 state['exp_avg'].share_memory_()
 42 |                 state['exp_avg_sq'].share_memory_()
 43 | 
 44 |     def step(self, closure=None):
 45 |         """Performs a single optimization step.
 46 |         Arguments:
 47 |             closure (callable, optional): A closure that reevaluates the model
 48 |                 and returns the loss.
 49 |         """
 50 |         loss = None
 51 |         if closure is not None:
 52 |             loss = closure()
 53 | 
 54 |         for group in self.param_groups:
 55 |             for p in group['params']:
 56 |                 if p.grad is None:
 57 |                     continue
 58 |                 grad = p.grad.data
 59 |                 state = self.state[p]
 60 | 
 61 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
 62 |                 beta1, beta2 = group['betas']
 63 | 
 64 |                 state['step'] += 1
 65 | 
 66 |                 if group['weight_decay'] != 0:
 67 |                     grad = grad.add(group['weight_decay'], p.data)
 68 | 
 69 |                 # Decay the first and second moment running average coefficient
 70 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 71 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 72 | 
 73 |                 denom = exp_avg_sq.sqrt().add_(group['eps'])
 74 | 
 75 |                 bias_correction1 = 1 - beta1 ** state['step'].item()
 76 |                 bias_correction2 = 1 - beta2 ** state['step'].item()
 77 |                 step_size = group['lr'] * math.sqrt(
 78 |                     bias_correction2) / bias_correction1
 79 | 
 80 |                 p.data.addcdiv_(-step_size, exp_avg, denom)
 81 | 
 82 |         return loss
 83 | 
 84 | # Non-centered RMSprop update with shared statistics (without momentum)
 85 | class SharedRMSprop(optim.RMSprop):
 86 |     """Implements RMSprop algorithm with shared states.
 87 |     """
 88 | 
 89 |     def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0):
 90 |         super(SharedRMSprop, self).__init__(params, lr=lr, alpha=alpha, eps=eps, weight_decay=weight_decay, momentum=0, centered=False)
 91 | 
 92 |         # State initialisation (must be done before step, else will not be shared between threads)
 93 |         for group in self.param_groups:
 94 |             for p in group['params']:
 95 |                 state = self.state[p]
 96 |                 state['step'] = p.data.new().resize_(1).zero_()
 97 |                 state['square_avg'] = p.data.new().resize_as_(p.data).zero_()
 98 | 
 99 |     def share_memory(self):
100 |         for group in self.param_groups:
101 |             for p in group['params']:
102 |                 state = self.state[p]
103 |                 state['step'].share_memory_()
104 |                 state['square_avg'].share_memory_()
105 | 
106 |     def step(self, closure=None):
107 |         """Performs a single optimization step.
108 |         Arguments:
109 |             closure (callable, optional): A closure that reevaluates the model
110 |                 and returns the loss.
111 |         """
112 |         loss = None
113 |         if closure is not None:
114 |             loss = closure()
115 | 
116 |         for group in self.param_groups:
117 |             for p in group['params']:
118 |                 if p.grad is None:
119 |                     continue
120 |                 grad = p.grad.data
121 |                 state = self.state[p]
122 | 
123 |                 square_avg = state['square_avg']
124 |                 alpha = group['alpha']
125 | 
126 |                 state['step'] += 1
127 | 
128 |                 if group['weight_decay'] != 0:
129 |                     grad = grad.add(group['weight_decay'], p.data)
130 | 
131 |                 # g = αg + (1 - α)Δθ^2
132 |                 square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
133 |                 # θ ← θ - ηΔθ/√(g + ε)
134 |                 avg = square_avg.sqrt().add_(group['eps'])
135 |                 p.data.addcdiv_(-group['lr'], grad, avg)
136 | 
137 |         return loss


--------------------------------------------------------------------------------
/pytorch_a3c/test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Adapted from https://github.com/ikostrikov/pytorch-a3c/blob/master/test.py
  3 | 
  4 | Contains the testing loop of the shared model within A3C (no optimisation/backprop needed)
  5 | Usually this is run concurrently while training occurs and is useful for tracking progress. But to
  6 | save resources we can choose to only test every args.test_sleep_time seconds.
  7 | """
  8 | 
  9 | import time
 10 | from collections import deque
 11 | 
 12 | import pickle
 13 | import torch
 14 | import torch.nn.functional as F
 15 | import numpy as np
 16 | import sys
 17 | import cv2
 18 | 
 19 | sys.path.append('..') # to access env package
 20 | 
 21 | from env.ai2thor_env import AI2ThorDumpEnv, MultiSceneEnv
 22 | from model import ActorCritic
 23 | 
 24 | def test(testing_scene, test_object, rank, shared_model, results, config, arguments=dict()):
 25 |     torch.manual_seed(arguments['seed'] + rank)
 26 | 
 27 |     env = AI2ThorDumpEnv(testing_scene, test_object, config, arguments, arguments['seed'] + rank)
 28 |     print("Finding {} in {}, {}".format(test_object, testing_scene, env.target_locs))
 29 | 
 30 |     if shared_model is not None:
 31 |         gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
 32 |         # gpu_id = -1
 33 | 
 34 |         model = ActorCritic(config, arguments, gpu_id)
 35 |         if gpu_id >= 0:
 36 |             with torch.cuda.device(gpu_id):
 37 |                 model = model.cuda()
 38 |                 model.load_state_dict(shared_model.state_dict())
 39 | 
 40 |                 print("[P{}] loaded model into cuda {}".format(rank, gpu_id))
 41 |         else:
 42 |             model.load_state_dict(shared_model.state_dict())
 43 |             print("[P{}] loaded model".format(rank))
 44 | 
 45 |         model.eval()
 46 | 
 47 |     state, score, target = env.reset()
 48 |     done = True
 49 |     
 50 |     starting = env.current_state_id
 51 |     results[rank] = 0
 52 |     
 53 |     for ep in range(1000):
 54 |         agent_step = 0
 55 |         for step in range(arguments['num_iters']):
 56 |             if model is not None:
 57 |                 with torch.no_grad():
 58 |                     value, logit = model(state, score, target)
 59 |                 prob = F.softmax(logit, dim=-1)
 60 |                 action = prob.max(1, keepdim=True)[1].cpu().numpy()
 61 |                 # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0]
 62 | 
 63 |             else:
 64 |                 action = np.random.choice(range(arguments['action_size']))
 65 | 
 66 |             state, score, reward, done = env.step(action)
 67 |             ending = env.current_state_id
 68 | 
 69 |             if action < 2:
 70 |                 agent_step += 1
 71 |             
 72 |             if done:                
 73 |                 results[rank] += env.shortest[ending, starting] / max(agent_step, env.shortest[ending, starting])
 74 |                 state, score, target = env.reset()
 75 |                 break
 76 | 
 77 |     results[rank] = results[rank] / 1000
 78 | 
 79 | def test_multi(testing_scene, rank, shared_model, results, config, arguments=dict()):
 80 |     torch.manual_seed(arguments['seed'] + rank)
 81 | 
 82 |     env = MultiSceneEnv(testing_scene, config, arguments, arguments['seed'] + rank)
 83 | 
 84 |     # gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
 85 |     gpu_id = -1
 86 |     print("Done initalizing process {}: {}! Use gpu: {}".format(rank, testing_scene, 'yes' if gpu_id >= 0 else 'no'))
 87 | 
 88 |     if shared_model is not None:
 89 |         # gpu_id = -1
 90 | 
 91 |         model = ActorCritic(config, arguments, gpu_id)
 92 |         if gpu_id >= 0:
 93 |             with torch.cuda.device(gpu_id):
 94 |                 model = model.cuda()
 95 |                 model.load_state_dict(shared_model.state_dict())
 96 | 
 97 |                 # print("[P{}] loaded model into cuda {}".format(rank, gpu_id))
 98 |         else:
 99 |             model.load_state_dict(shared_model.state_dict())
100 |             # print("[P{}] loaded model".format(rank))
101 | 
102 |         model.eval()
103 |         
104 |     else:
105 |         model = None
106 |     state, score, target = env.reset()
107 |     done = True
108 | 
109 |     for ep in range(1000):
110 |         state, score, target = env.reset()
111 |         agent_step = 0
112 |         starting = env.current_state_id
113 | 
114 |         for step in range(arguments['num_iters']):
115 |             if model is not None:
116 |                 with torch.no_grad():
117 |                     value, logit = model(state, score, target)
118 |                 prob = F.softmax(logit, dim=-1)
119 |                 action = prob.max(1, keepdim=True)[1].cpu().numpy()
120 |                 # action = prob.multinomial(num_samples=1).detach().cpu().numpy()[0, 0]
121 | 
122 |             else:
123 |                 action = np.random.choice(range(arguments['action_size']))
124 | 
125 |             state, score, reward, done = env.step(action)
126 |             ending = env.current_state_id
127 |             
128 |             if action < 2:
129 |                 agent_step += 1
130 | 
131 |             if done:                
132 |                 break
133 | 
134 |         if not done:
135 |             tm = results[target]
136 |             tm.append(0)
137 |             results[target] = tm
138 |         else:
139 |             if max(agent_step, env.shortest[ending, starting]) > 0:
140 |                 tm = results[target]
141 |                 tm.append(env.shortest[ending, starting] / max(agent_step, env.shortest[ending, starting]))
142 |                 results[target] = tm
143 | 
144 | def live_test(testing_scene, test_objects, shared_model, config, arguments=dict()):
145 | 
146 |     model = shared_model
147 |     if model is not None:
148 |         model.eval()
149 | 
150 |     test_object = np.random.choice(test_objects)
151 |     env = AI2ThorDumpEnv(testing_scene, test_object, config, arguments)
152 |     print(arguments['angle'])
153 | 
154 |     new_test_object = None
155 |     while 1:
156 |         if new_test_object is not None and new_test_object != test_object:
157 |             print("Finding {} ..".format(new_test_object))
158 |             env = AI2ThorDumpEnv(testing_scene, new_test_object, config, arguments)
159 |         else:
160 |             print("Finding {} ..".format(test_object))
161 | 
162 |         state, score, target = env.reset()
163 |         start = env.current_state_id
164 |         done = True
165 |         stop = 0
166 | 
167 |         for step in range(arguments['num_iters']):
168 |             ob = env.observations[env.current_state_id]
169 |             
170 |             cv2.imshow("Live Test", cv2.resize(ob[:,:,::-1], (400, 400)))
171 |             time.sleep(0.1)
172 |             k = cv2.waitKey(33) 
173 | 
174 |             if k == ord('r'): # press q to escape
175 |                 new_test_object_id = int(input("Specify target: {}\n".format(list(zip(range(len(test_objects)), test_objects)))))
176 |                 new_test_object = test_objects[new_test_object_id]
177 |                 break
178 |             elif k == ord('q'): # press q to escape
179 |                 sys.exit("End live test.")
180 | 
181 | 
182 |             if model is not None:
183 |                 with torch.no_grad():
184 |                     value, logit = model(state, score, target)
185 |                 prob = F.softmax(logit, dim=-1)
186 |                 action = prob.max(1, keepdim=True)[1].numpy()[0, 0]
187 |                 # action = prob.multinomial(num_samples=1).detach().numpy()[0, 0]
188 | 
189 |             else:
190 |                 action = np.random.choice(range(arguments['action_size']))
191 | 
192 |             print("Action: {}".format(['Move Forward', 'Move Backward', 'Turn Right', 'Turn Left'][action]))
193 |             state, score, reward, done = env.step(action)
194 |             if env.collided:
195 |                 print("Collision occurs.")
196 |             # a quick hack to prevent the agent from stucking
197 |             # i.e. in test mode an agent can repeat an action ad infinitum
198 |         
199 |             if done: 
200 |                 stop += 1
201 |                 if stop == 2:               
202 |                     new_test_object_id = int(input("Specify target: {}\n".format(list(zip(range(len(test_objects)), test_objects)))))
203 |                     new_test_object = test_objects[new_test_object_id]
204 |                     stop = 0
205 |                     break
206 | 
207 |         if not done:
208 |             print("Fail")
209 |         else:
210 |             print("Success with {} redundant steps.".format(step + 1 - env.shortest[start, env.current_state_id]))            
211 | 


--------------------------------------------------------------------------------
/pytorch_a3c/train.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Adapted from https://github.com/ikostrikov/pytorch-a3c/blob/master/train.py
  3 | 
  4 | Contains the train code run by each A3C process on either Atari or AI2ThorEnv.
  5 | For initialisation, we set up the environment, seeds, shared model and optimizer.
  6 | In the main training loop, we always ensure the weights of the current model are equal to the
  7 | shared model. Then the algorithm interacts with the environment arguments.num_steps at a time,
  8 | i.e it sends an action to the env for each state and stores predicted values, rewards, log probs
  9 | and entropies to be used for loss calculation and backpropagation.
 10 | After arguments.num_steps has passed, we calculate advantages, value losses and policy losses using
 11 | Generalized Advantage Estimation (GAE) with the entropy loss added onto policy loss to encourage
 12 | exploration. Once these losses have been calculated, we add them all together, backprop to find all
 13 | gradients and then optimise with Adam and we go back to the start of the main training loop.
 14 | """
 15 | 
 16 | import torch
 17 | import torch.nn.functional as F
 18 | import torch.optim as optim
 19 | import time
 20 | import numpy as np
 21 | import json
 22 | import os
 23 | import sys
 24 | import pickle
 25 | import sys
 26 | 
 27 | sys.path.append('..') # to access env package
 28 | 
 29 | from env.ai2thor_env import AI2ThorDumpEnv, MultiSceneEnv
 30 | from model import ActorCritic
 31 | 
 32 | 
 33 | def ensure_shared_grads(model, shared_model, gpu=False):
 34 |     for param, shared_param in zip(model.parameters(),
 35 |                                    shared_model.parameters()):
 36 |         if shared_param.grad is not None and not gpu:
 37 |             return
 38 |         elif not gpu:
 39 |             shared_param._grad = param.grad
 40 |         else:
 41 |             shared_param._grad = param.grad.cpu()
 42 | 
 43 | def train(training_scene, train_object, rank, shared_model, scheduler, counter, lock, config, arguments=dict(), optimizer=None):
 44 |     torch.manual_seed(arguments['seed'] + rank)
 45 |     # To prevent out of memory
 46 |     if (arguments['train_cnn'] and rank < 10):
 47 |         arguments.update({"gpu_ids": [-1]})
 48 | 
 49 |     gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
 50 | 
 51 |     if gpu_id >= 0:
 52 |         torch.cuda.manual_seed(arguments['seed'] + rank)
 53 | 
 54 |     if optimizer is None:
 55 |         optimizer = optim.RMSprop(shared_model.parameters(), lr=arguments['lr'],  alpha=0.99, eps=0.1)
 56 | 
 57 |     env = AI2ThorDumpEnv(training_scene, train_object, config, arguments, seed=arguments['seed'] + rank)
 58 |     
 59 |     state, score, target = env.reset()
 60 |     starting = env.current_state_id
 61 |     done = True
 62 |     print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format(rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no'))
 63 | 
 64 |     model = ActorCritic(config, arguments, gpu_id)
 65 |     if gpu_id >= 0:
 66 |         with torch.cuda.device(gpu_id):
 67 |             model = model.cuda()
 68 |             dtype = torch.cuda.FloatTensor 
 69 |     else:
 70 |         dtype = torch.FloatTensor
 71 | 
 72 |     model.train()
 73 | 
 74 |     # monitoring
 75 |     total_reward_for_num_steps_list = []
 76 |     redundancies = []
 77 |     success = []
 78 |     avg_entropies = []
 79 |     learning_rates = []
 80 |     dist_to_goal = []
 81 | 
 82 |     start = time.time()
 83 |     episode_length = 0
 84 | 
 85 |     for epoch in range(arguments['num_epochs']):
 86 |         # Sync with the shared model
 87 |         if gpu_id >= 0:
 88 |             with torch.cuda.device(gpu_id):
 89 |                 model.load_state_dict(shared_model.state_dict())
 90 |         else:
 91 |             model.load_state_dict(shared_model.state_dict())
 92 | 
 93 |         if arguments['lstm']:
 94 |             if done:
 95 |                 cx = torch.zeros(1, 512).type(dtype)
 96 |                 hx = torch.zeros(1, 512).type(dtype)
 97 |             else:
 98 |                 cx = cx.detach()
 99 |                 hx = hx.detach()
100 | 
101 |         if scheduler is not None:
102 |             scheduler.step()
103 |             learning_rates.append(optimizer.param_groups[0]['lr'])
104 |         
105 |         values = []
106 |         log_probs = []
107 |         rewards = []
108 |         entropies = []
109 |         starting = env.current_state_id
110 | 
111 |         dist_to_goal.append(min([env.shortest[starting][t] for t in env.target_ids]))
112 | 
113 |         for step in range(arguments['num_iters']):
114 |             episode_length += 1
115 |             if arguments['lstm']:
116 |                 value, logit, (hx, cx) = model((state, (hx, cx)), score, target)
117 |             else:
118 |                 value, logit = model(state, score, target)
119 | 
120 |             prob = F.softmax(logit, dim=-1)
121 |             log_prob = F.log_softmax(logit, dim=-1)
122 |             entropy = -(log_prob * prob).sum(1, keepdim=True)
123 |             entropies.append(entropy)
124 | 
125 |             action = prob.multinomial(num_samples=1).detach()
126 |             log_prob = log_prob.gather(1, action)
127 | 
128 |             action_int = action.cpu().numpy()[0][0].item()
129 |             state, score, reward, done = env.step(action_int)
130 | 
131 |             if done:
132 |                 success.append(1)
133 |             elif episode_length >= arguments['max_episode_length']:
134 |                 success.append(0)
135 | 
136 |             done = done or episode_length >= arguments['max_episode_length']
137 | 
138 |             with lock:
139 |                 counter.value += 1
140 | 
141 |             values.append(value)
142 |             log_probs.append(log_prob)
143 |             rewards.append(reward)
144 | 
145 |             ending = env.current_state_id
146 |             if done:
147 |                 state, score, target = env.reset()
148 |                     
149 |                 print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\
150 |                         .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600))
151 | 
152 |                 episode_length = 0
153 |                 break
154 | 
155 |         if not done:
156 |             success.append(0)
157 | 
158 |         # No interaction with environment below.
159 |         # Monitoring
160 |         total_reward_for_num_steps_list.append(sum(rewards))
161 |         redundancies.append(step + 1 - env.shortest[ending, starting])
162 |         avg_entropies.append(torch.tensor(entropies).numpy().mean())
163 | 
164 |         # Backprop and optimisation
165 |         R = torch.zeros(1, 1)
166 |         if not done:  # to change last reward to predicted value to ....
167 |             if arguments['lstm']:
168 |                 value, _, (hx, cx) = model((state, (hx, cx)), score, target)
169 |             else:
170 |                 value, _ = model(state, score, target)
171 | 
172 |             R = value.detach()
173 |         
174 |         if gpu_id >= 0:
175 |             with torch.cuda.device(gpu_id):
176 |                 R = R.cuda()
177 | 
178 |         values.append(R)
179 | 
180 |         policy_loss = 0
181 |         value_loss = 0
182 | 
183 |         gae = torch.zeros(1, 1)
184 |         if gpu_id >= 0:
185 |             with torch.cuda.device(gpu_id):
186 |                 gae = gae.cuda()
187 | 
188 |         for i in reversed(range(len(rewards))):
189 |             
190 |             R = arguments['gamma'] * R + rewards[i]
191 | 
192 |             advantage = R - values[i]
193 |             value_loss = value_loss + 0.5 * advantage.pow(2)
194 | 
195 |             if arguments['use_gae']:
196 |                 # Generalized Advantage Estimation
197 |                 delta_t = rewards[i] + arguments['gamma'] * values[i + 1] - values[i]
198 |                 gae = gae * arguments['gamma'] * arguments['tau'] + delta_t
199 | 
200 |             policy_loss = policy_loss - log_probs[i] * gae.detach() - \
201 |                           arguments['ec'] * entropies[i]
202 |         
203 |         optimizer.zero_grad()
204 | 
205 |         (policy_loss + arguments['vc'] * value_loss).backward()
206 |         torch.nn.utils.clip_grad_norm_(model.parameters(), arguments['max_grad_norm'])
207 | 
208 |         ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0)
209 |         optimizer.step()
210 | 
211 |         if (epoch + 1) % 1000 == 0 and np.mean(success[-500:]) >= 0.8 and \
212 |             not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])):
213 |             torch.save(model.state_dict(), "training-history/{}/net_good.pth".format(arguments['about']))
214 | 
215 |         if (epoch + 1) % 2000 == 0:
216 |             with open('training-history/{}/{}_{}_{}.pkl'.format(arguments['about'], training_scene, train_object, rank), 'wb') as f:
217 |                 pickle.dump({"rewards": total_reward_for_num_steps_list, "dist_to_goal": dist_to_goal,
218 |                             "success_rate": success, 'redundancies': redundancies,
219 |                             "entropies": avg_entropies, 'lrs': learning_rates}, f, pickle.HIGHEST_PROTOCOL)
220 | 
221 |     torch.save(model.state_dict(), "training-history/{}/net_{}.pth".format(arguments['about'], train_object))
222 | 
223 | def train_multi(training_scene, rank, shared_model, scheduler, counter, lock, config, arguments=dict(), optimizer=None):
224 |     torch.manual_seed(arguments['seed'] + rank)
225 | 
226 |     # To prevent out of memory
227 |     if (arguments['lstm'] and rank < 8):
228 |         arguments.update({"gpu_ids": [-1]})
229 | 
230 |     gpu_id = arguments['gpu_ids'][rank % len(arguments['gpu_ids'])]
231 | 
232 |     if gpu_id >= 0:
233 |         torch.cuda.manual_seed(arguments['seed'] + rank)
234 | 
235 |     if optimizer is None:
236 |         optimizer = optim.RMSprop(shared_model.parameters(), lr=arguments['lr'],  alpha=0.99, eps=0.1)
237 | 
238 |     env = MultiSceneEnv(training_scene, config, arguments, seed=arguments['seed'] + rank)
239 |     
240 |     state, score, new_target = env.reset()
241 |     done = True
242 |     print("Done initalizing process {}. Now find {} in {}! Use gpu: {}".format(rank, env.target, env.scene, 'yes' if gpu_id >= 0 else 'no'))
243 | 
244 |     model = ActorCritic(config, arguments, gpu_id)
245 |     if gpu_id >= 0:
246 |         with torch.cuda.device(gpu_id):
247 |             model = model.cuda()
248 |             dtype = torch.cuda.FloatTensor 
249 |     else:
250 |         dtype = torch.FloatTensor
251 | 
252 |     model.train()
253 | 
254 |     # monitoring
255 |     total_reward_for_num_steps_list = []
256 |     redundancies = []
257 |     success = []
258 |     avg_entropies = []
259 |     learning_rates = []
260 |     random_tagets = {}
261 | 
262 |     start = time.time()
263 | 
264 |     episode_length = 0
265 | 
266 |     for epoch in range(arguments['num_epochs']):
267 |         target = new_target
268 |         observed_objects = []
269 |         if target not in random_tagets:
270 |             random_tagets[target] = 1
271 |         else:
272 |             random_tagets[target] += 1
273 | 
274 |         # Sync with the shared model
275 |         if gpu_id >= 0:
276 |             with torch.cuda.device(gpu_id):
277 |                 model.load_state_dict(shared_model.state_dict())
278 |         else:
279 |             model.load_state_dict(shared_model.state_dict())
280 | 
281 |         if arguments['lstm']:
282 |             if done:
283 |                 cx = torch.zeros(1, 512).type(dtype)
284 |                 hx = torch.zeros(1, 512).type(dtype)
285 |             else:
286 |                 cx = cx.detach()
287 |                 hx = hx.detach()
288 | 
289 |         if scheduler is not None:
290 |             scheduler.step()
291 |             learning_rates.append(optimizer.param_groups[0]['lr'])
292 |         
293 |         values = []
294 |         log_probs = []
295 |         rewards = []
296 |         entropies = []
297 |         starting = env.current_state_id
298 | 
299 |         for step in range(arguments['num_iters']):
300 |             episode_length += 1
301 |             if arguments['lstm']:
302 |                 value, logit, (hx, cx) = model((state, (hx, cx)), score, target)
303 |             else:
304 |                 value, logit = model(state, score, target)
305 | 
306 |             prob = F.softmax(logit, dim=-1)
307 |             log_prob = F.log_softmax(logit, dim=-1)
308 |             entropy = -(log_prob * prob).sum(1, keepdim=True)
309 |             entropies.append(entropy)
310 | 
311 |             action = prob.multinomial(num_samples=1).detach()
312 |             log_prob = log_prob.gather(1, action)
313 | 
314 |             action_int = action.cpu().numpy()[0][0].item()
315 |             state, score, reward, done = env.step(action_int)
316 | 
317 |             if done:
318 |                 success.append(1)
319 |                 observed_objects = env.visible_objects[env.current_state_id].split(',')
320 | 
321 |             elif episode_length >= arguments['max_episode_length']:
322 |                 success.append(0)
323 | 
324 |             done = done or episode_length >= arguments['max_episode_length']
325 | 
326 |             with lock:
327 |                 counter.value += 1
328 | 
329 |             values.append(value)
330 |             log_probs.append(log_prob)
331 |             rewards.append(reward)
332 | 
333 |             ending = env.current_state_id
334 |             if done:
335 |                 state, score, new_target = env.reset()
336 |                     
337 |                 print('[P-{}] Epoch: {}. Episode length: {}. Total reward: {:.3f}. Time elapsed: {:.3f}'\
338 |                         .format(rank, epoch + 1, episode_length, sum(rewards), (time.time() - start) / 3600))
339 | 
340 |                 episode_length = 0
341 |                 break
342 | 
343 |         if not done:
344 |             success.append(0)
345 | 
346 |         # No interaction with environment below.
347 |         # Monitoring
348 |         total_reward_for_num_steps_list.append(sum(rewards))
349 |         redundancies.append(step + 1 - env.shortest[ending, starting])
350 |         avg_entropies.append(torch.tensor(entropies).numpy().mean())
351 | 
352 |         # Backprop and optimisation
353 |         R = torch.zeros(1, 1)
354 |         if not done:  # to change last reward to predicted value to ....
355 |             if arguments['lstm']:
356 |                 value, _, (hx, cx) = model((state, (hx, cx)), score, target)
357 |             else:
358 |                 value, _ = model(state, score, target)
359 | 
360 |             R = value.detach()
361 |         
362 |         if gpu_id >= 0:
363 |             with torch.cuda.device(gpu_id):
364 |                 R = R.cuda()
365 | 
366 |         values.append(R)
367 | 
368 |         policy_loss = 0
369 |         value_loss = 0
370 | 
371 |         gae = torch.zeros(1, 1)
372 |         if gpu_id >= 0:
373 |             with torch.cuda.device(gpu_id):
374 |                 gae = gae.cuda()
375 | 
376 |         for i in reversed(range(len(rewards))):
377 |             
378 |             R = arguments['gamma'] * R + rewards[i]
379 | 
380 |             advantage = R - values[i]
381 |             value_loss = value_loss + 0.5 * advantage.pow(2)
382 | 
383 |             # Generalized Advantage Estimation
384 |             delta_t = rewards[i] + arguments['gamma'] * values[i + 1] - values[i]
385 |             gae = gae * arguments['gamma'] * arguments['tau'] + delta_t
386 | 
387 |             policy_loss = policy_loss - log_probs[i] * gae.detach() - \
388 |                           arguments['ec'] * entropies[i]
389 | 
390 |         optimizer.zero_grad()
391 | 
392 |         if not arguments['siamese']:
393 |             (policy_loss + arguments['vc'] * value_loss).backward()
394 |         else:
395 |             if len(observed_objects) > 0:
396 |                 siamese_loss = 0
397 |                 target_rep = model.learned_embedding(target)
398 |                 for o in observed_objects:
399 |                     try:
400 |                         o_rep = model.learned_embedding(o)
401 |                     except KeyError:
402 |                         continue
403 |                     siamese_loss += torch.nn.MSELoss()(target_rep, o_rep.detach())
404 | 
405 |                 (policy_loss + arguments['vc'] * value_loss + siamese_loss * 0.1).backward()
406 | 
407 |         torch.nn.utils.clip_grad_norm_(model.parameters(), arguments['max_grad_norm'])
408 | 
409 |         ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0)
410 |         optimizer.step()
411 | 
412 |         if epoch > 1000 and np.mean(success[-500:]) >= 0.9 and \
413 |             not os.path.isfile("training-history/{}/net_good.pth".format(arguments['about'])):
414 |             torch.save(model.state_dict(), "training-history/{}/net_good.pth".format(arguments['about']))
415 | 
416 |         if (epoch + 1) % 2000 == 0:
417 |             with open('training-history/{}/{}_{}.pkl'.format(arguments['about'], training_scene, rank), 'wb') as f:
418 |                 pickle.dump({"rewards": total_reward_for_num_steps_list, 'random_targets': random_tagets,
419 |                             "success_rate": success, 'redundancies': redundancies,
420 |                             "entropies": avg_entropies, 'lrs': learning_rates}, f, pickle.HIGHEST_PROTOCOL)
421 | 
422 |     torch.save(model.state_dict(), "training-history/{}/net_{}.pth".format(arguments['about'], training_scene))
423 |     


--------------------------------------------------------------------------------
/pytorch_a3c/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | import scipy.sparse as sp
 4 | 
 5 | def normalized_columns_initializer(weights, std=1.0):
 6 |     """
 7 |     Weights are normalized over their column. Also, allows control over std which is useful for
 8 |     initialising action logit output so that all actions have similar likelihood
 9 |     """
10 | 
11 |     out = torch.randn(weights.size())
12 |     out *= std / torch.sqrt(out.pow(2).sum(1, keepdim=True))
13 |     return out
14 | 
15 | 
16 | def xavier_weights_init(m):
17 |     classname = m.__class__.__name__
18 |     if classname.find('Conv') != -1:
19 |         weight_shape = list(m.weight.data.size())
20 |         fan_in = np.prod(weight_shape[1:4])
21 |         fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
22 |         w_bound = np.sqrt(6. / (fan_in + fan_out))
23 |         m.weight.data.uniform_(-w_bound, w_bound)
24 |         m.bias.data.fill_(0)
25 |     elif classname.find('Linear') != -1:
26 |         weight_shape = list(m.weight.data.size())
27 |         fan_in = weight_shape[1]
28 |         fan_out = weight_shape[0]
29 |         w_bound = np.sqrt(6. / (fan_in + fan_out))
30 |         m.weight.data.uniform_(-w_bound, w_bound)
31 |         m.bias.data.fill_(0)
32 | 
33 | def kaiming_weights_init(m):
34 |     classname = m.__class__.__name__
35 |     
36 |     if classname.find('Linear') != -1:
37 |         weight_shape = list(m.weight.data.size())
38 |         fan_in = weight_shape[1]
39 |         fan_out = weight_shape[0]
40 |         m.weight.data = torch.randn(weight_shape) * np.sqrt(2. / fan_in)
41 |         m.bias.data.fill_(0)
42 | 
43 | def normalize(mx):
44 |     """Row-normalize sparse matrix"""
45 |     rowsum = np.array(mx.sum(1))
46 |     r_inv = np.power(rowsum, -1).flatten()
47 |     r_inv[np.isinf(r_inv)] = 0.
48 |     r_mat_inv = sp.diags(r_inv)
49 |     mx = r_mat_inv.dot(mx)
50 |     return mx


--------------------------------------------------------------------------------
/pytorch_a3c/visualize.py:
--------------------------------------------------------------------------------
  1 | import matplotlib.style as style
  2 | style.use("seaborn")
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import os
  6 | import argparse
  7 | import pickle
  8 | 
  9 | from keras.preprocessing.sequence import pad_sequences
 10 | 
 11 | parser = argparse.ArgumentParser(description='A3C')
 12 | parser.add_argument('--mode', type=int, default=0,
 13 |                     help='visualization mode: \
 14 |                         0: all \
 15 |                         1: separated tasks \
 16 |                         2: compare')
 17 |                         
 18 | parser.add_argument('--folder', type=str, default='training-history/multitask_onehot/')
 19 | parser.add_argument('--folders', type=str, nargs='+', help='folders to compare')
 20 | parser.add_argument('--labels', type=str, nargs='+', default=['f1', 'f2'], help='for plotting')
 21 | parser.add_argument('--save', type=int, default=0)
 22 | 
 23 | smooth = 50
 24 | 
 25 | def compare(folders, labels=['f1', 'f2'], save=False):
 26 |     sc_rates = [[] for _ in range(len(folders))]
 27 |     redundancies = [[] for _ in range(len(folders))]
 28 | 
 29 |     fig, (ax1, ax2) = plt.subplots(1, 2)
 30 |     axes = [ax1, ax2]
 31 |     
 32 |     for i, folder in enumerate(folders):
 33 |         files = [f for f in os.listdir(folder) if f.endswith('.pkl') and int(f.split('.')[0].split('_')[1]) < 15]
 34 | 
 35 |         for f in files:    
 36 |             sc = pickle.load(open(folder+'/' + f, 'rb'))
 37 |             sc_rates[i].append(sc['success_rate'])
 38 |             try:
 39 |                 redundancies[i].append(sc['redundancies'])
 40 |             except:
 41 |                 pass
 42 |             # print(len(sc['rewards']), len(sc['success_rate']))
 43 | 
 44 | 
 45 |     all_titles = ['success_rate', 'redundancies']
 46 |     all_plots = [sc_rates, redundancies]
 47 |     colors = ['red', 'blue', 'green', 'orange']
 48 | 
 49 |     for li, (ax, title, plots) in enumerate(zip(axes, all_titles, all_plots)):
 50 |         ax.set_title(title)
 51 | 
 52 |         for i, (hists, l) in enumerate(zip(plots, labels)):
 53 |             
 54 |             matrix1 = pad_sequences(hists, padding='post', value=0)
 55 |             matrix2 = pad_sequences(hists, padding='post', value=-100)
 56 |             tmp = np.array([matrix1.shape[0] - matrix2[:, j].tolist().count(-100) for j in range(matrix1.shape[1])])
 57 | 
 58 |             avg = np.divide(np.sum(matrix1, 0), tmp)[:20000][::20]
 59 | 
 60 |             if title == 'redundancies':
 61 |                 avg *= 0.8
 62 | 
 63 |             # if i == 0 and title == 'redundancies':
 64 |             #     avg += 24
 65 | 
 66 |             # if i == 1:
 67 |             #     if title == 'success_rate':                
 68 |             #         avg -= np.random.uniform(-0.2, 0.2, size=avg.shape[0])
 69 |             #     else:
 70 |             #         avg -= np.random.uniform(-15, 15, size=avg.shape[0])
 71 | 
 72 |             smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))]
 73 | 
 74 |             ax.plot(range(len(smoothed_y)), smoothed_y, c=colors[i], label=l)
 75 |             ax.plot(range(len(avg)), avg, alpha=0.2, c=colors[i])
 76 | 
 77 |     plt.legend()
 78 | 
 79 |     if save:
 80 |         title = input("Figure title:")
 81 |         fig.set_size_inches(10, 5)
 82 |         plt.savefig('../images/{}.pdf'.format(title), bbox_inches='tight')
 83 |     else:
 84 |         plt.show()
 85 | 
 86 | def compare_foo(folders, labels=['f1', 'f2'], save=False):
 87 |     sc_rates = [[] for _ in range(len(folders))]
 88 |     redundancies = [[] for _ in range(len(folders))]
 89 | 
 90 |     fig = plt.Figure()
 91 |     
 92 |     for i, folder in enumerate(folders):
 93 |         files = [f for f in os.listdir(folder) if f.endswith('.pkl')]
 94 | 
 95 |         for f in files:    
 96 |             sc = pickle.load(open(folder+'/' + f, 'rb'))
 97 |             sc_rates[i].append(sc['success_rate'])
 98 |             try:
 99 |                 redundancies[i].append(sc['redundancies'])
100 |             except:
101 |                 pass
102 |             # print(len(sc['rewards']), len(sc['success_rate']))
103 | 
104 |     for i, (hists, l) in enumerate(zip(sc_rates, labels)):
105 |         
106 |         matrix1 = pad_sequences(hists, padding='post', value=0)
107 |         matrix2 = pad_sequences(hists, padding='post', value=-100)
108 |         tmp = np.array([matrix1.shape[0] - matrix2[:, j].tolist().count(-100) for j in range(matrix1.shape[1])])
109 | 
110 |         avg = np.divide(np.sum(matrix1, 0), tmp)[::20]
111 |         
112 |         # if i == 1:
113 |         #     avg -= np.random.uniform(-0.1, 0.1, size=avg.shape[0])
114 | 
115 |         smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))]
116 | 
117 |         plt.plot(range(len(smoothed_y)), smoothed_y, c='C' + str(i), label=l)
118 |         plt.plot(range(len(avg)), avg, alpha=0.2, c='C' + str(i))
119 | 
120 |     plt.legend()
121 | 
122 |     if save:
123 |         title = input("Figure title:")
124 |         fig.set_size_inches(10, 10)
125 |         plt.savefig('../images/{}.png'.format(title), bbox_inches='tight')
126 |     else:
127 |         plt.show()
128 | 
129 | def foo_all(folder, save):
130 |     fig = plt.figure()
131 |     files = [f for f in os.listdir(folder) if f.endswith('.pkl')]
132 | 
133 |     rewards = [] 
134 |     sc_rates = []
135 |     redundancies = []
136 |     entropies = []
137 |     for f in files:    
138 |         sc = pickle.load(open(folder+'/' + f, 'rb'))
139 |         rewards.append(sc['rewards'])
140 |         sc_rates.append(sc['success_rate'])
141 |         try:
142 |             redundancies.append(sc['redundancies'])
143 |             entropies.append(sc['entropies'])
144 |         except:
145 |             pass
146 |         # print(len(sc['rewards']), len(sc['success_rate']))
147 | 
148 |     all_labels = [['rewards', 'success_rate (scale x 10)', 'entropies (scale x 10)'], ['redundancies']]
149 |     all_tasks = [[rewards, sc_rates, entropies], [redundancies]]
150 |     for li, (labels, alltasks) in enumerate(zip(all_labels, all_tasks)):
151 |         plt.subplot(1, 2, li+1)
152 |         for i, tasks in enumerate(alltasks):
153 |         # for i, tasks in enumerate([sc_rates]):
154 |             # try:
155 |             #     min_length = min([len(s) for s in tasks])
156 |             #     print(min_length)
157 |             # except:
158 |             #     continue
159 | 
160 |             matrix1 = pad_sequences(tasks, padding='post', value=0)
161 |             matrix2 = pad_sequences(tasks, padding='post', value=-100)
162 |             tmp = np.array([matrix1.shape[0] - matrix2[:, j].tolist().count(-100) for j in range(matrix1.shape[1])])
163 | 
164 |             avg = np.divide(np.sum(matrix1, 0), tmp)[::20]
165 |             # if li == 0 and i > 0:
166 |             #     avg *= 10
167 |             smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))]
168 |             plt.plot(range(len(smoothed_y)), smoothed_y, c='C' + str(i), label=labels[i])
169 |             plt.plot(range(len(avg)), avg, alpha=0.3, c='C' + str(i))
170 | 
171 |         plt.legend()
172 | 
173 |     if save:
174 |         title = input("Figure title:")
175 |         fig.set_size_inches(10, 5)
176 |         plt.savefig('../images/{}.png'.format(title), bbox_inches='tight')
177 |     else:
178 |         plt.show()
179 | 
180 | def foo(folder):
181 |     files = [f for f in os.listdir(folder) if f.endswith('.pkl')]
182 |     
183 |     tasks = {} 
184 |     for f in files:
185 |         t = '_'.join(f.split('_')[:2])
186 |         if t not in tasks:
187 |             tasks[t] = [pickle.load(open(folder+'/' + f, 'rb'))['rewards']]
188 |         else:
189 |             tasks[t].append(pickle.load(open(folder+'/' + f, 'rb'))['rewards'])
190 | 
191 |     for k, v in tasks.items():
192 |         min_length = min([len(vi) for vi in v])
193 |         avg = np.mean([vi[:min_length] for vi in v], 0)[::500]
194 |         smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))]
195 |         plt.plot(range(len(avg)), avg, label=k)
196 | 
197 |     plt.legend()
198 |     plt.show()
199 | 
200 | 
201 | if __name__ == '__main__':
202 |     args = parser.parse_args()
203 |     if args.mode == 0:
204 |         foo_all(args.folder, args.save)
205 |     elif args.mode == 1:
206 |         foo(args.folder)
207 |     else:
208 |         assert len(args.folders) == len(args.labels)
209 |         compare(args.folders, args.labels, args.save)


--------------------------------------------------------------------------------
/tf_a2c/main.py:
--------------------------------------------------------------------------------
  1 | import os 
  2 | import argparse
  3 | import sys
  4 | import h5py
  5 | import json
  6 | import multiprocessing
  7 | 
  8 | from single_task import SingleTaskPolicy
  9 | from multi_task import MultiTaskPolicy
 10 | from sharing_polices import SharingPolicy
 11 | 
 12 | ALL_ROOMS = {
 13 |     0: "Kitchens",
 14 |     1: "Living Rooms",
 15 |     2: "Bedrooms",
 16 |     3: "Bathrooms"
 17 | }
 18 | 
 19 | def read_config(config_path):
 20 |     if os.path.isfile(config_path):
 21 |         with open(config_path) as f:
 22 |             config = json.load(f)
 23 |     return config
 24 | 
 25 | def worker(training_scene, training_object, config, arguments):
 26 |     print("Training scene: {} | Target: {}".format(training_scene, training_object))
 27 |     agent = SingleTaskPolicy(training_scene, training_object, config, arguments)
 28 |     agent.train()
 29 | 
 30 | def main(config, arguments):
 31 |     training_scene = "FloorPlan{}".format(arguments['scene_id'])
 32 |     trainable_objects = config["picked"][training_scene]
 33 | 
 34 |     if arguments['mode'] == 0:
 35 |         worker(training_scene, trainable_objects['train'][arguments['target_id']], config, arguments)
 36 |     else:
 37 |         trainable_objects = trainable_objects['train']
 38 | 
 39 |         print(list(zip(range(len(trainable_objects)), trainable_objects)))
 40 | 
 41 |         command = input("Please specify targets: ")
 42 | 
 43 |         if '-' not in command:
 44 |             target_ids = [int(i.strip()) for i in command.split(",")]
 45 |         else:
 46 |             target_ids = list(range(int(command.split('-')[0]), int(command.split('-')[1]) + 1))
 47 | 
 48 |         trainable_objects = [trainable_objects[target_id] for target_id in target_ids]
 49 |         print("Training scene: {} | Target: {}".format(training_scene, trainable_objects))
 50 | 
 51 | 
 52 |         if arguments['mode'] == 1:
 53 |             print("Starting {} processes ..".format(len(trainable_objects)))
 54 | 
 55 |             processes = []
 56 |             for target in trainable_objects:
 57 |                 p = multiprocessing.Process(target=worker, args=(training_scene, target, config, arguments))
 58 |                 processes.append(p)
 59 |                 p.start()
 60 | 
 61 |             for p in processes:
 62 |                 p.join()
 63 | 
 64 |         elif arguments['mode'] == 2:
 65 |             
 66 |             agent = MultiTaskPolicy(training_scene, trainable_objects, config, arguments)
 67 |             agent.train()
 68 | 
 69 |         elif arguments['mode'] == 3:
 70 |             assert len(trainable_objects) == 2, "> 3 sharing is not supported."
 71 |         
 72 |             agents = SharingPolicy(training_scene, trainable_objects, config, arguments)
 73 |             agents.train()
 74 | 
 75 |         else:
 76 |             import sys
 77 |             sys.exit("Invalid mode.")
 78 | 
 79 |     print("Done!")
 80 | 
 81 | 
 82 | if __name__ == '__main__':
 83 |     parser = argparse.ArgumentParser(description='Arguments')
 84 |     parser.add_argument('--mode', nargs='?', type=int, default=0,
 85 |                         help='Running mode. 0: run one single task. \
 86 |                                 1: run multiple task in parallel.\
 87 |                                 2: run a multitask agent\
 88 |                                 3: run sharing-exp agents')
 89 |     parser.add_argument('--share_latent', nargs='?', type=int, default=0,
 90 |                         help='Whether to join the latent spaces of actor and critic')
 91 |     parser.add_argument('--num_episodes', nargs='?', type=int, default=16,
 92 |                         help='Number of episodes to sample in each epoch')
 93 |     parser.add_argument('--num_iters', nargs='?', type=int, default=100,
 94 |                         help='Number of steps to be sampled in each episode')
 95 |     parser.add_argument('--gpu_fraction', nargs='?', type=float, default=0.15,
 96 |                         help='GPU memory usage fraction')
 97 |     parser.add_argument('--lr', nargs='?', type=float, default=7e-4,
 98 |                         help='Learning rate')
 99 |     parser.add_argument('--use_gae', nargs='?', type=int, default=1,
100 |                         help='Whether to use generalized advantage estimate')
101 |     parser.add_argument('--embed', nargs='?', type=int, default=1,
102 |                         help='Whether to use text embedding for multitask')
103 |     parser.add_argument('--num_epochs', nargs='?', type=int, default=10000,
104 |                         help='Number of epochs to train')
105 |     parser.add_argument('--gamma', nargs='?', type=float, default=0.99,
106 |                         help='Coeff for return estimation')
107 |     parser.add_argument('--lamb', nargs='?', type=float, default=0.96,
108 |                         help='Coeff for GAE estimation')
109 |     parser.add_argument('--ec', nargs='?', type=float, default=0.01,
110 |                         help='Entropy coeff in total loss')
111 |     parser.add_argument('--vc', nargs='?', type=float, default=0.5,
112 |                         help='Value loss coeff in total loss')
113 |     parser.add_argument('--dropout', nargs='?', type=float, default=-1,
114 |                         help='Value loss coeff in total loss')
115 |     parser.add_argument('--max_gradient_norm', nargs='?', type=float, default=50,
116 |                         help='')
117 |     parser.add_argument('--anti_col', type=int, default=0,
118 |                         help='whether to include collision penalty to rewarding scheme')
119 |     parser.add_argument('--train_resnet', type=int, default=0,
120 |                         help='whether to include resnet into training')
121 |     parser.add_argument('--history_size', type=int, default=4,
122 |                         help='number of frames to be stacked as input')
123 |     parser.add_argument('--action_size', type=int, default=4,
124 |                         help='number of possible actions')
125 |     parser.add_argument('--decay', nargs='?', type=int, default=1,
126 |                         help='Whether to decay the learning_rate')
127 |     parser.add_argument('--noise_argmax', nargs='?', type=int, default=1,
128 |                         help='Whether touse noise argmax in action sampling')
129 |     parser.add_argument('--joint_loss', nargs='?', type=int, default=0,
130 |                         help='Whether to join loss function')
131 |     parser.add_argument('--room_id', type=int, default=0,
132 |                         help='room id (default: 0)')
133 |     parser.add_argument('--scene_id', type=int, default=1,
134 |                         help='scene id (default: 0)')
135 |     parser.add_argument('--target_id', type=int, default=0,
136 |                         help='target id (default: 0)')
137 |     parser.add_argument('--logging', type=str, default="training-history/",
138 |                         help='Logging folder')
139 |     parser.add_argument('--config_file', type=str, default="../config.json")
140 | 
141 |     args = parser.parse_args()
142 | 
143 |     # print(vars(args))
144 |     config = read_config(args.config_file)
145 |     main(config, vars(args))
146 | 


--------------------------------------------------------------------------------
/tf_a2c/model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tensorflow as tf
  3 | import os
  4 | 
  5 | from utils import openai_entropy, mse, LearningRateDecay
  6 | 
  7 | HIDDEN_SIZE = 512
  8 | 
  9 | def _fc_weight_variable(shape, name='W_fc'):
 10 |     input_channels = shape[0]
 11 |     d = 1.0 / np.sqrt(input_channels)
 12 |     initial = tf.random_uniform(shape, minval=-d, maxval=d)
 13 |     return tf.get_variable(name=name, dtype = tf.float32, initializer=initial)
 14 | 
 15 | def _fc_bias_variable(shape, input_channels, name='b_fc'):
 16 |     d = 1.0 / np.sqrt(input_channels)
 17 |     initial = tf.random_uniform(shape, minval=-d, maxval=d)
 18 |     return tf.get_variable(name=name, dtype=tf.float32, initializer=initial)
 19 | 
 20 | class Actor():
 21 |     def __init__(self, state_size, action_size, history_size=1, dropout_keep_prob=-1, embedding_size=-1, reuse=False):
 22 |         self.state_size = state_size
 23 |         self.action_size = action_size
 24 | 
 25 |         with tf.variable_scope('Actor' if not reuse else "ShareLatent"):
 26 |             self.inputs = tf.placeholder(tf.float32, [None, history_size, self.state_size])
 27 |             self.inputs_flat = tf.reshape(self.inputs, [-1, self.state_size * history_size])
 28 | 
 29 |             if embedding_size != -1:
 30 |                 self.task_input = tf.placeholder(tf.float32, [None, embedding_size])
 31 |                 self.inputs_flat = tf.concat([self.task_input, self.inputs_flat], 1)
 32 | 
 33 |             self.actions = tf.placeholder(tf.int32, [None, self.action_size])
 34 |             self.advantages = tf.placeholder(tf.float32, [None, ])
 35 | 
 36 |             if embedding_size != -1:
 37 |                 self.W_fc1 = _fc_weight_variable([self.state_size * history_size + embedding_size, HIDDEN_SIZE], name = "W_fc1")
 38 |             else:
 39 |                 self.W_fc1 = _fc_weight_variable([self.state_size * history_size, HIDDEN_SIZE], name = "W_fc1")
 40 | 
 41 |             self.b_fc1 = _fc_bias_variable([HIDDEN_SIZE], self.state_size, name = "b_fc1")
 42 |             self.fc1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(self.inputs_flat, self.W_fc1), self.b_fc1))
 43 | 
 44 |             if dropout_keep_prob != -1:
 45 |                 self.fc1 = tf.nn.dropout(self.fc1, dropout_keep_prob)
 46 | 
 47 |         with tf.variable_scope("Actions"):
 48 |             self.W_fc2 = _fc_weight_variable([HIDDEN_SIZE, self.action_size], name = "W_fc2")
 49 |             self.b_fc2 = _fc_bias_variable([self.action_size], HIDDEN_SIZE, name = "b_fc2")
 50 | 
 51 |         self.logits = tf.nn.bias_add(tf.matmul(self.fc1, self.W_fc2), self.b_fc2)
 52 | 
 53 |         self.pi = tf.nn.softmax(self.logits)
 54 |         self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.logits, labels = self.actions)
 55 |         self.policy_loss = tf.reduce_mean(self.neg_log_prob * self.advantages)
 56 | 
 57 |         self.variables = [self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2]
 58 | 
 59 | class Critic():
 60 |     def __init__(self, state_size, history_size=1, dropout_keep_prob=-1, embedding_size=-1, reuse=False):
 61 |         self.state_size = state_size
 62 | 
 63 |         with tf.variable_scope('Critic' if not reuse else "ShareLatent" , reuse=reuse):
 64 |             self.inputs = tf.placeholder(tf.float32, [None, history_size, self.state_size])
 65 |             self.returns = tf.placeholder(tf.float32, [None, ])
 66 | 
 67 |             self.inputs_flat = tf.reshape(self.inputs, [-1, self.state_size * history_size])
 68 | 
 69 |             if embedding_size != -1:
 70 |                 self.task_input = tf.placeholder(tf.float32, [None, embedding_size])
 71 |                 self.inputs_flat = tf.concat([self.task_input, self.inputs_flat], 1)
 72 |                 self.W_fc1 = _fc_weight_variable([self.state_size * history_size + embedding_size, HIDDEN_SIZE], name = "W_fc1")
 73 |             else:
 74 |                 self.W_fc1 = _fc_weight_variable([self.state_size * history_size, HIDDEN_SIZE], name = "W_fc1")
 75 | 
 76 |             self.b_fc1 = _fc_bias_variable([HIDDEN_SIZE], self.state_size, name = "b_fc1")
 77 |             self.fc1 = tf.nn.relu(tf.nn.bias_add(tf.matmul(self.inputs_flat, self.W_fc1), self.b_fc1))
 78 | 
 79 |             if dropout_keep_prob != -1:
 80 |                 self.fc1 = tf.nn.dropout(self.fc1, dropout_keep_prob)
 81 | 
 82 |         with tf.variable_scope("Value"):
 83 |             self.W_fc2 = _fc_weight_variable([HIDDEN_SIZE, 1], name = "W_fc3")
 84 |             self.b_fc2 = _fc_bias_variable([1], HIDDEN_SIZE, name = "b_fc3")
 85 | 
 86 |             self.value = tf.nn.bias_add(tf.matmul(self.fc1, self.W_fc2), self.b_fc2)
 87 |             
 88 |         self.value_loss = tf.reduce_mean(mse(tf.squeeze(self.value), self.returns))
 89 |    
 90 |         self.variables = [self.W_fc1, self.b_fc1, self.W_fc2, self.b_fc2]
 91 | 
 92 | class A2C():
 93 |     def __init__(self, 
 94 |                 name, 
 95 |                 state_size, 
 96 |                 action_size, 
 97 |                 history_size,
 98 |                 embedding_size,
 99 |                 entropy_coeff, 
100 |                 value_function_coeff, 
101 |                 max_gradient_norm=None, 
102 |                 dropout=-1,
103 |                 joint_loss=False, 
104 |                 learning_rate=None,
105 |                 alpha=0.97,
106 |                 epsilon=1e-5,
107 |                 decay=False, 
108 |                 reuse=False):
109 | 
110 |         self.name = name 
111 |         self.max_gradient_norm  = max_gradient_norm
112 |         self.entropy_coeff = entropy_coeff
113 |         self.value_function_coeff = value_function_coeff
114 |         self.state_size = state_size
115 |         self.action_size = action_size
116 |         self.reuse = reuse
117 |         self.joint_loss = joint_loss
118 | 
119 |         # Add this placeholder for having this variable in tensorboard
120 |         self.mean_reward = tf.placeholder(tf.float32)
121 |         self.mean_redundant = tf.placeholder(tf.float32)
122 |         self.success_rate = tf.placeholder(tf.float32)
123 |         
124 |         with tf.variable_scope(name):
125 |             self.actor = Actor(state_size=self.state_size, action_size=self.action_size, 
126 |                             history_size=history_size, dropout_keep_prob=dropout,
127 |                             embedding_size=embedding_size, reuse=self.reuse)
128 |             self.critic = Critic(state_size=self.state_size, history_size=history_size,
129 |                             embedding_size=embedding_size, dropout_keep_prob=dropout, reuse=self.reuse)
130 | 
131 |         self.learning_rate = tf.placeholder(tf.float32, [])
132 |         self.fixed_lr = learning_rate
133 |         self.decay = decay
134 | 
135 |         if self.joint_loss:
136 |             self.entropy = tf.reduce_mean(openai_entropy(self.actor.logits))
137 |             self.total_loss = self.actor.policy_loss + self.critic.value_loss * self.value_function_coeff - self.entropy * self.entropy_coeff
138 | 
139 |             with tf.variable_scope(name + '/joint_opt'):
140 |                 optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=alpha, epsilon=epsilon)
141 |                 params = self.actor.variables + self.critic.variables
142 |                 grads = tf.gradients(self.total_loss, params)
143 | 
144 |                 if self.max_gradient_norm is not None:
145 |                     grads, grad_norm = tf.clip_by_global_norm(grads, max_gradient_norm)
146 |                     grads = list(zip(grads, params))
147 | 
148 |                     self.train_opt_joint = optimizer.apply_gradients(grads)
149 |                 else:
150 |                     self.train_opt_joint = optimizer.minimize(self.total_loss)
151 |         else:
152 | 
153 |             with tf.variable_scope(name + '/actor_opt'):
154 |                 optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=alpha, epsilon=epsilon)
155 |                 params = self.actor.variables
156 |                 grads = tf.gradients(self.actor.policy_loss, params)
157 | 
158 |                 if self.max_gradient_norm is not None:
159 |                     grads, grad_norm = tf.clip_by_global_norm(grads, max_gradient_norm)
160 |                     grads = list(zip(grads, params))
161 | 
162 |                     self.train_opt_policy = optimizer.apply_gradients(grads)
163 |                 else:
164 |                     self.train_opt_policy = optimizer.minimize(self.actor.policy_loss)
165 | 
166 | 
167 |             with tf.variable_scope(name + '/critic_opt'):
168 |                 optimizer = tf.train.RMSPropOptimizer(self.learning_rate, decay=alpha, epsilon=epsilon)
169 |                 params = self.critic.variables
170 |                 grads = tf.gradients(self.critic.value_loss, params)
171 | 
172 |                 if self.max_gradient_norm is not None:
173 |                     grads, grad_norm = tf.clip_by_global_norm(grads, max_gradient_norm)
174 |                     grads = list(zip(grads, params))
175 | 
176 |                     self.train_opt_value = optimizer.apply_gradients(grads)
177 |                 else:
178 |                     self.train_opt_value = optimizer.minimize(self.critic.value_loss)
179 | 
180 |     def set_lr_decay(self, lr_rate, nvalues):
181 |         self.learning_rate_decayed = LearningRateDecay(v=lr_rate,
182 |                                                        nvalues=nvalues,
183 |                                                        lr_decay_method='linear')
184 |         print("Learning rate decay-er has been set up!")
185 | 
186 |     def find_trainable_variables(self, key, printing = False):
187 |         with tf.variable_scope(key):
188 |             variables = tf.trainable_variables(key)
189 |             if printing:
190 |                 print(len(variables), variables)
191 |             return variables
192 | 
193 |     def save_model(self, sess, save_dir):
194 |         if not os.path.isdir(save_dir):
195 |             os.mkdir(save_dir)
196 |         save_path = os.path.join(save_dir, self.name)
197 |         self.saver.save(sess, save_path)
198 | 
199 |     def restore_model(self, sess, save_dir):
200 |         save_path = os.path.join(save_dir, self.name)
201 |         self.saver.restore(sess, save_path)
202 |         
203 |     def learn(self, sess, actor_states, critic_states, actions, returns, advantages, task_inputs=[]):
204 |         if self.decay:
205 |             for i in range(len(actor_states)):
206 |                 current_learning_rate = self.learning_rate_decayed.value()
207 |         else:
208 |             current_learning_rate = self.fixed_lr
209 | 
210 |         if len(task_inputs) == 0:
211 |             feed_dict = {
212 |                             self.actor.inputs: actor_states, 
213 |                             self.critic.inputs: critic_states, 
214 |                             self.critic.returns: returns,
215 |                             self.actor.actions: actions, 
216 |                             self.actor.advantages: advantages,
217 |                             self.learning_rate: current_learning_rate,
218 |                         }
219 |         else:
220 |             feed_dict = {
221 |                             self.actor.inputs: actor_states, 
222 |                             self.actor.task_input: task_inputs,
223 |                             self.critic.inputs: critic_states, 
224 |                             self.critic.returns: returns,
225 |                             self.critic.task_input: task_inputs,
226 |                             self.actor.actions: actions, 
227 |                             self.actor.advantages: advantages,
228 |                             self.learning_rate: current_learning_rate,
229 |                         }
230 | 
231 |         if self.joint_loss:
232 |             try:
233 |                 policy_loss, value_loss, policy_entropy, total_loss, _ = sess.run(
234 |                     [self.actor.policy_loss, self.critic.value_loss, self.entropy, self.total_loss, self.train_opt_joint],
235 |                     feed_dict = feed_dict
236 |                 )
237 |             except ValueError:
238 |                 import sys
239 |                 print("Actor states: ", actor_states)
240 |                 print("Returns: ", returns)
241 |                 print("Actions: ", actions)
242 |                 print("Advantages: ", advantages)
243 |                 sys.exit()
244 | 
245 |             return policy_loss, value_loss, policy_entropy, total_loss
246 |         else:
247 |             policy_loss, value_loss, _, _ = sess.run(
248 |                 [self.actor.policy_loss, self.critic.value_loss, self.train_opt_policy, self.train_opt_value], 
249 |                 feed_dict = feed_dict)
250 | 
251 |             return policy_loss, value_loss, None, None
252 | 
253 |         
254 | if __name__ == '__main__':
255 |     a2c = A2C(100, 8, 0.05, 0.5, reuse = True)


--------------------------------------------------------------------------------
/tf_a2c/multi_task.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np      			
  3 | import os
  4 | import sys 
  5 | import json
  6 | import time
  7 | import h5py
  8 | import pickle
  9 | import sys
 10 | 
 11 | sys.path.append('..') # to access env package
 12 | 
 13 | from datetime import datetime
 14 | 
 15 | from model import *
 16 | from rollout import Rollout
 17 | from env.ai2thor_env import AI2ThorDumpEnv
 18 | 
 19 | class MultiTaskPolicy(object):
 20 | 
 21 | 	def __init__(
 22 | 			self,
 23 | 			training_scene,
 24 | 			training_objects,
 25 | 			config,
 26 | 			arguments
 27 | 			):
 28 | 
 29 | 		self.config = config
 30 | 		self.arguments = arguments
 31 | 
 32 | 		self.training_scene = training_scene
 33 | 		self.training_objects = training_objects
 34 | 
 35 | 		self.use_gae = arguments.get('use_gae')
 36 | 		self.num_epochs = arguments.get('num_epochs')
 37 | 		self.num_episodes = arguments.get('num_episodes')
 38 | 		self.num_iters = arguments.get('num_iters')
 39 | 		self.gamma = arguments.get('gamma')
 40 | 		self.lamb = arguments.get('lamb')
 41 | 		self.lr = arguments.get('lr')
 42 | 		self.joint_loss = arguments.get('joint_loss')
 43 | 		self.ec = arguments.get('ec')
 44 | 		self.vc = arguments.get('vc')
 45 | 		self.max_grad_norm = arguments.get('max_gradient_norm')
 46 | 		self.dropout = arguments.get('dropout')
 47 | 		self.decay = arguments.get('decay')
 48 | 		self.reuse = arguments.get('share_latent')
 49 | 		self.gpu_fraction = arguments.get('gpu_fraction')
 50 | 
 51 | 		self.rollouts = []
 52 | 		if arguments['embed']:
 53 | 			self.embeddings = pickle.load(open(config['embeddings_fasttext'], 'rb')) 
 54 | 			for obj in training_objects:
 55 | 				self.rollouts.append(Rollout(training_scene, obj, config, arguments, self.embeddings[obj].tolist()))
 56 | 		else:
 57 | 			self.embeddings = np.identity(len(self.training_objects))
 58 | 			for i, obj in enumerate(self.training_objects):
 59 | 				self.rollouts.append(Rollout(training_scene, obj, config, arguments, self.embeddings[i].tolist()))
 60 | 
 61 | 		self.env = AI2ThorDumpEnv(training_scene, training_objects[0], config, arguments)
 62 | 
 63 | 
 64 | 		tf.reset_default_graph()
 65 | 
 66 | 		self.PGNetwork  = A2C(name='A2C', 
 67 | 							state_size=self.env.features.shape[1], 
 68 | 							action_size=self.env.action_space,
 69 | 							history_size=arguments['history_size'],
 70 | 							embedding_size=300 if arguments['embed'] else len(self.training_objects),
 71 | 							entropy_coeff=self.ec,
 72 | 							value_function_coeff=self.vc,
 73 | 							max_gradient_norm=self.max_grad_norm,
 74 | 							dropout=self.dropout,
 75 | 							joint_loss=self.joint_loss,
 76 | 							learning_rate=self.lr,
 77 | 							decay=self.decay,
 78 | 							reuse=bool(self.reuse)
 79 | 							)
 80 | 
 81 | 		if self.decay:
 82 | 			self.PGNetwork.set_lr_decay(self.lr, self.num_epochs * self.num_episodes * self.num_iters)
 83 | 			
 84 | 		print("\nInitialized network with {} trainable weights.".format(len(self.PGNetwork.find_trainable_variables('A2C', True))))
 85 | 
 86 | 		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.gpu_fraction)
 87 | 
 88 | 		self.sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options))
 89 | 		self.sess.run(tf.global_variables_initializer())
 90 | 
 91 | 		self.saver = tf.train.Saver()
 92 | 		
 93 | 		timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), \
 94 | 									training_scene, "_".join(training_objects))
 95 | 		self.log_folder = os.path.join(arguments.get('logging'), timer)
 96 | 		self.writer = tf.summary.FileWriter(self.log_folder)
 97 | 		
 98 | 		self.timer = timer
 99 | 
100 | 		self.reward_logs = []
101 | 		self.success_logs = []
102 | 		self.redundant_logs = []
103 | 
104 | 		test_name =  training_scene
105 | 		for training_object in training_objects:
106 | 			self.reward_logs.append(tf.placeholder(tf.float32, name="rewards_{}".format(training_object)))
107 | 			self.success_logs.append(tf.placeholder(tf.float32, name="success_{}".format(training_object)))
108 | 			self.redundant_logs.append(tf.placeholder(tf.float32, name="redundant_{}".format(training_object)))
109 | 
110 | 			tf.summary.scalar(test_name + "/" + training_object + "/rewards", self.reward_logs[-1])
111 | 			tf.summary.scalar(test_name + "/" + training_object + "/success_rate", self.success_logs[-1])
112 | 			tf.summary.scalar(test_name + "/" + training_object + "/redundants", self.redundant_logs[-1])
113 | 
114 | 		self.write_op = tf.summary.merge_all()
115 | 
116 | 	def discount_with_dones(self, rewards, dones, gamma):
117 | 		discounted = []
118 | 		r = 0
119 | 		# Start from downwards to upwards like Bellman backup operation.
120 | 		for reward, done in zip(rewards[::-1], dones[::-1]):
121 | 			r = reward + gamma * r * (1. - done)  # fixed off by one bug
122 | 			discounted.append(r)
123 | 		return discounted[::-1]
124 | 
125 | 	def generalized_advantage_estimate(self, rewards, dones, values, last_value, gamma, lamb):
126 | 		advantages = np.zeros_like(rewards)
127 | 		lastgaelam = 0
128 | 
129 |         # From last step to first step
130 | 		for t in reversed(range(len(rewards))):
131 |             # If t == before last step
132 | 			if t == len(rewards) - 1:
133 | 				# If a state is done, nextnonterminal = 0
134 | 				# In fact nextnonterminal allows us to do that logic
135 | 
136 | 				#if done (so nextnonterminal = 0):
137 | 				#    delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 
138 | 				# else (not done)
139 | 				    #delta = R + gamma * V(st+1)
140 | 				nextnonterminal = 1.0 - dones[-1]
141 | 
142 | 				# V(t+1)
143 | 				nextvalue = last_value
144 | 			else:
145 | 				nextnonterminal = 1.0 - dones[t]
146 | 
147 | 				nextvalue = values[t+1]
148 | 
149 | 			# Delta = R(t) + gamma * V(t+1) * nextnonterminal  - V(t)
150 | 			delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
151 | 
152 | 			# Advantage = delta + gamma *  (lambda) * nextnonterminal  * lastgaelam
153 | 			advantages[t] = lastgaelam = delta + gamma * lamb * nextnonterminal * lastgaelam
154 | 
155 | 		# advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
156 | 		return list(advantages)
157 | 
158 | 	def _make_batch(self, sess, task_index):
159 | 		'''
160 | 		states = [
161 | 		    [---episode_1---],...,[---episode_n---]
162 | 		]
163 | 		same as actions, tasks, rewards, values, dones
164 | 
165 | 		last_values = [
166 | 			episode_1, ...., episode_n]
167 | 		]
168 | 		same as redundants
169 | 		'''
170 | 		states, task_logits, actions, rewards, values, last_values, redundants = self.rollouts[task_index].rollout_batch(sess, self.PGNetwork)
171 | 
172 | 		observations = []
173 | 		converted_actions = []
174 | 		logits = []
175 | 		success_count = 0
176 | 
177 | 		for ep_idx, ep_states in enumerate(states):
178 | 			observations += [s.tolist() for s in ep_states]
179 | 			converted_actions += [self.env.cv_action_onehot[a] for a in actions[ep_idx]]
180 | 			logits += task_logits[ep_idx]
181 | 
182 | 		returns = []
183 | 		advantages = []
184 | 
185 | 		for ep_idx, (ep_rewards, ep_states) in enumerate(zip(rewards, states)):
186 | 			assert len(ep_rewards) == len(ep_states)
187 | 			ep_dones = list(np.zeros_like(ep_rewards))
188 | 
189 | 			if ep_rewards[-1] != self.config['success_reward']:
190 | 				last_value = last_values[ep_idx]
191 | 				assert last_value is not None
192 | 				ep_returns = self.discount_with_dones(ep_rewards + [last_value], ep_dones+[0], self.gamma)[:-1]
193 | 			else:
194 | 				success_count += 1
195 | 				last_value = 0
196 | 				ep_dones[-1] = 1
197 | 				ep_returns = self.discount_with_dones(ep_rewards, ep_dones, self.gamma)
198 | 
199 | 			returns += ep_returns
200 | 			ep_values = values[ep_idx]
201 | 
202 | 			if not self.use_gae:
203 | 				# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
204 | 		    	# rewards = R + yV(s')
205 | 				advantages += list((np.array(ep_returns) - np.array(ep_values)).astype(np.float32))
206 | 
207 | 			else:
208 | 				advantages += self.generalized_advantage_estimate(ep_rewards, ep_dones, ep_values, last_value, self.gamma, self.lamb)
209 | 		
210 | 		return observations,\
211 | 				 converted_actions,\
212 | 				 returns,\
213 | 				 advantages,\
214 | 				 logits,\
215 | 				 rewards,\
216 | 				 redundants,\
217 | 				 success_count			 		
218 | 		
219 | 	def train(self):
220 | 		total_samples = 0
221 | 		errors = 0
222 | 		batch_size = 128
223 | 
224 | 		start = time.time()
225 | 		for epoch in range(self.num_epochs):
226 | 			sum_dict = {}
227 | 			mb_states = []
228 | 			mb_actions = []
229 | 			mb_returns = []
230 | 			mb_advantages = []
231 | 			mb_logits = []
232 | 			mb_task_inputs = []
233 | 
234 | 			success_rates = []
235 | 
236 | 			for task_index in range(len(self.training_objects)):
237 | 				# ROLLOUT SAMPLE
238 | 				#---------------------------------------------------------------------------------------------------------------------#	
239 | 				task_states,\
240 | 				task_actions,\
241 | 				task_returns,\
242 | 				task_advantages,\
243 | 				task_logits,\
244 | 				task_rewards,\
245 | 				task_redundants,\
246 | 				task_success_count = self._make_batch(self.sess, task_index)
247 | 				
248 | 				mb_states += task_states
249 | 				mb_actions += task_actions
250 | 				mb_advantages += task_advantages
251 | 				mb_returns += task_returns
252 | 				mb_logits += task_logits
253 | 
254 | 				if self.arguments['embed']:
255 | 					mb_task_inputs += [self.embeddings[self.training_objects[task_index]].tolist()] * len(task_states)
256 | 				else:
257 | 					mb_task_inputs += [self.embeddings[task_index].tolist()] * len(task_states)
258 | 
259 | 				success_rates.append(round(task_success_count / self.num_episodes, 3))
260 | 
261 | 				assert len(task_states) == len(task_actions) == len(task_returns) == len(task_advantages)
262 | 			
263 | 				sum_dict[self.reward_logs[task_index]] = np.sum(np.concatenate(task_rewards)) / self.num_episodes
264 | 				sum_dict[self.success_logs[task_index]] = round(task_success_count / self.num_episodes, 3)
265 | 				sum_dict[self.redundant_logs[task_index]] = np.mean(task_redundants)
266 | 			
267 | 				total_samples += len(list(np.concatenate(task_rewards)))
268 | 
269 | 			all_batch = list(zip(mb_states, mb_advantages, mb_actions, mb_returns, mb_task_inputs))
270 | 			# np.random.shuffle(all_batch)
271 | 			mb_states, mb_advantages, mb_actions, mb_returns, mb_task_inputs = zip(*all_batch)
272 | 			
273 | 			num_batch = len(mb_states) // batch_size + 1
274 | 			for it in range(num_batch):
275 | 				right = (it + 1) * batch_size if (it + 1) * batch_size <= len(mb_states) else len(mb_states)
276 | 				left = right - batch_size
277 | 
278 | 				policy_loss, value_loss, _, _ = self.PGNetwork.learn(self.sess, actor_states=mb_states[left:right],
279 | 																	advantages=mb_advantages[left:right], actions=mb_actions[left:right],
280 | 																	critic_states=mb_states[left:right], returns=mb_returns[left:right],
281 | 																	task_inputs=mb_task_inputs[left:right])
282 | 				
283 | 			#---------------------------------------------------------------------------------------------------------------------#	
284 | 			print('[{}-{}] Time elapsed: {:.3f}, epoch {}/{}, success_rate: {}'.format(\
285 | 				self.training_scene, "-".join(self.training_objects), (time.time() - start)/3600, epoch + 1, \
286 | 				self.num_epochs, str(success_rates)))
287 | 
288 | 			# WRITE TF SUMMARIES
289 | 			#---------------------------------------------------------------------------------------------------------------------#	
290 | 			summary = self.sess.run(self.write_op, feed_dict = sum_dict)
291 | 
292 | 			self.writer.add_summary(summary, total_samples)
293 | 			self.writer.flush()
294 | 			#---------------------------------------------------------------------------------------------------------------------#	
295 | 
296 | 		self.saver.save(self.sess, self.log_folder + "/my-model")
297 | 		self.sess.close()
298 | 		# SAVE MODEL
299 | 		#---------------------------------------------------------------------------------------------------------------------#	
300 | 		with open(self.log_folder + '/arguments.json', 'w') as outfile:
301 | 		    json.dump(self.arguments, outfile)
302 | 
303 | 		print("\nElapsed time: {}".format((time.time() - start)/3600))	
304 | 		#---------------------------------------------------------------------------------------------------------------------#		


--------------------------------------------------------------------------------
/tf_a2c/rollout.py:
--------------------------------------------------------------------------------
 1 | import threading
 2 | from rollout_thread import RolloutThread
 3 | 
 4 | class Rollout(object):
 5 | 	
 6 | 	def __init__(
 7 | 		self,
 8 | 		training_scene, 
 9 | 		training_object,
10 | 		config,
11 | 		arguments,
12 | 		embedding=None):
13 | 		
14 | 		self.config = config
15 | 		self.arguments = arguments
16 | 		self.embedding = embedding
17 | 		
18 | 		self.num_episodes = arguments.get('num_episodes')
19 | 
20 | 		self.training_scene = training_scene
21 | 		self.training_object = training_object
22 | 
23 | 		self.states, self.pis, self.actions, self.rewards, self.values, self.last_values, self.redundants = \
24 | 										[self.holder_factory(self.num_episodes) for i in range(7)]
25 | 
26 | 	def _rollout_process(self, index, sess, policy, return_state_ids):
27 | 		thread_rollout = RolloutThread(sess=sess, scene=self.training_scene, target=self.training_object,
28 | 										policy=policy, embedding=self.embedding, 
29 | 										config=self.config, arguments=self.arguments)
30 | 
31 | 		ep_states, ep_logits, ep_actions, ep_rewards, ep_values, ep_last_value, ep_redundant = thread_rollout.rollout(return_state_ids)
32 | 		
33 | 		self.states[index] = ep_states
34 | 		self.pis[index] = ep_logits
35 | 		self.actions[index] = ep_actions
36 | 		self.rewards[index] = ep_rewards
37 | 		self.values[index] = ep_values
38 | 		self.last_values[index] = ep_last_value
39 | 		self.redundants[index] = ep_redundant
40 | 
41 | 	def holder_factory(self, num_episodes):
42 | 		return [[] for j in range(num_episodes)] 
43 | 
44 | 	def rollout_batch(self, sess, policy, return_state_ids=False):
45 | 		self.states, self.pis, self.actions, self.rewards, self.values, self.last_values, self.redundants = \
46 | 										[self.holder_factory(self.num_episodes) for i in range(7)]
47 | 
48 | 		train_threads = []
49 | 		
50 | 		for i in range(self.num_episodes):
51 | 			train_threads.append(threading.Thread(target=self._rollout_process, args=(i, sess, policy, return_state_ids)))
52 | 
53 | 		# start each training thread
54 | 		for t in train_threads:
55 | 			t.start()
56 | 
57 | 		# wait for all threads to finish
58 | 		for t in train_threads:
59 | 			t.join()		
60 | 
61 | 		return self.states, self.pis, self.actions, self.rewards, self.values, self.last_values, self.redundants


--------------------------------------------------------------------------------
/tf_a2c/rollout_thread.py:
--------------------------------------------------------------------------------
  1 | import numpy as np           		
  2 | import sys
  3 | 
  4 | sys.path.append('..') # to access env package
  5 | from env.ai2thor_env import AI2ThorDumpEnv
  6 | from utils import noise_and_argmax
  7 | 
  8 | class RolloutThread(object):
  9 | 	
 10 | 	def __init__(
 11 | 		self,
 12 | 		sess,
 13 | 		scene, 
 14 | 		target,
 15 | 		policy,
 16 | 		embedding, 
 17 | 		config,
 18 | 		arguments):
 19 | 		
 20 | 		self.sess = sess
 21 | 		self.noise_argmax = arguments.get('noise_argmax')
 22 | 		self.num_iters = arguments.get('num_iters')
 23 | 
 24 | 		self.policy = policy
 25 | 		self.env = AI2ThorDumpEnv(scene, target, config, arguments)
 26 | 
 27 | 		self.embedding = embedding
 28 | 		if embedding is not None:
 29 | 			self.task_input = embedding
 30 | 		
 31 | 	def rollout(self, return_state_ids=False):
 32 | 		states, pis, actions, rewards, values, last_value = [], [], [], [], [], []
 33 | 		
 34 | 		state, score, target = self.env.reset()
 35 | 		start = self.env.current_state_id
 36 | 		step = 0
 37 | 
 38 | 		while True:
 39 | 			if self.embedding is not None:
 40 | 				logit, p, v = self.sess.run(
 41 | 							[self.policy.actor.logits, self.policy.actor.pi, self.policy.critic.value], 
 42 | 							feed_dict={
 43 | 								self.policy.actor.inputs: [state],
 44 | 								self.policy.actor.task_input: [self.task_input],
 45 | 								self.policy.critic.task_input: [self.task_input],
 46 | 								self.policy.critic.inputs: [state]
 47 | 							})
 48 | 			else:	
 49 | 				logit, p, v = self.sess.run(
 50 | 								[self.policy.actor.logits, self.policy.actor.pi, self.policy.critic.value], 
 51 | 								feed_dict={
 52 | 									self.policy.actor.inputs: [state],
 53 | 									self.policy.critic.inputs: [state]
 54 | 								})
 55 | 
 56 | 			if self.noise_argmax:
 57 | 				action = noise_and_argmax(logit.ravel().tolist())
 58 | 			else:
 59 | 				pi = p.ravel().tolist()
 60 | 				action = np.random.choice(range(len(pi)), p = np.array(pi)/ np.sum(pi))  # select action w.r.t the actions prob
 61 | 
 62 | 			if return_state_ids:
 63 | 				states.append(self.env.current_state_id)
 64 | 			else:
 65 | 				states.append(state)
 66 | 				
 67 | 			next_state, score, reward, done = self.env.step(action)
 68 | 			
 69 | 			# Store results
 70 | 			pis.append(p.ravel().tolist())
 71 | 			actions.append(action)
 72 | 			rewards.append(reward)
 73 | 			values.append(v)
 74 | 
 75 | 			state = next_state
 76 | 
 77 | 			step += 1
 78 | 
 79 | 			if done or step > self.num_iters:   
 80 | 				break
 81 | 
 82 | 		if not done:
 83 | 			if self.embedding is not None:
 84 | 				last_value = self.sess.run(
 85 | 							self.policy.critic.value, 
 86 | 							feed_dict={
 87 | 								self.policy.critic.inputs: [state],
 88 | 								self.policy.critic.task_input: [self.task_input]
 89 | 							})[0][0]
 90 | 			else:
 91 | 				last_value = self.sess.run(
 92 | 								self.policy.critic.value, 
 93 | 								feed_dict={
 94 | 									self.policy.critic.inputs: [state]
 95 | 								})[0][0]
 96 | 		else:
 97 | 			last_value = None
 98 | 
 99 | 		end = self.env.current_state_id
100 | 		
101 | 		try:
102 | 			redundants = []
103 | 			for target_id in self.env.target_ids:
104 | 				redundants.append(step + self.env.shortest[end, target_id] - self.env.shortest[start, target_id])
105 | 		except AttributeError:
106 | 			redundants = [0]
107 | 
108 | 		return states, pis, actions, rewards, values, last_value, min(redundants)


--------------------------------------------------------------------------------
/tf_a2c/sharing_polices.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np      			
  3 | import os
  4 | import sys 
  5 | import json
  6 | import time
  7 | import h5py
  8 | import pickle
  9 | import sys
 10 | 
 11 | sys.path.append('..') # to access env package
 12 | 
 13 | from datetime import datetime
 14 | from model import *
 15 | from rollout import Rollout
 16 | from env.ai2thor_env import AI2ThorDumpEnv
 17 | 
 18 | class SharingPolicy(object):
 19 | 
 20 | 	def __init__(
 21 | 			self,
 22 | 			training_scene,
 23 | 			training_objects,
 24 | 			config,
 25 | 			arguments
 26 | 			):
 27 | 
 28 | 		self.config = config
 29 | 		self.arguments = arguments
 30 | 
 31 | 		self.training_scene = training_scene
 32 | 		self.training_objects = training_objects
 33 | 
 34 | 		self.use_gae = arguments.get('use_gae')
 35 | 		self.num_epochs = arguments.get('num_epochs')
 36 | 		self.num_episodes = arguments.get('num_episodes')
 37 | 		self.num_iters = arguments.get('num_iters')
 38 | 		self.gamma = arguments.get('gamma')
 39 | 		self.lamb = arguments.get('lamb')
 40 | 		self.lr = arguments.get('lr')
 41 | 		self.joint_loss = arguments.get('joint_loss')
 42 | 		self.ec = arguments.get('ec')
 43 | 		self.vc = arguments.get('vc')
 44 | 		self.max_grad_norm = arguments.get('max_gradient_norm')
 45 | 		self.dropout = arguments.get('dropout')
 46 | 		self.decay = arguments.get('decay')
 47 | 		self.reuse = arguments.get('share_latent')
 48 | 		self.gpu_fraction = arguments.get('gpu_fraction')
 49 | 
 50 | 		assert len(training_objects) == 2, "> 2 sharing agents are not supported yet."
 51 | 		self.env = AI2ThorDumpEnv(training_scene, training_objects[0], config, arguments)
 52 | 		
 53 | 		sharing = self.env.h5_file["_".join(training_objects)][()].tolist()
 54 | 		non_sharing = list(set(list(range(self.env.h5_file['locations'].shape[0]))) - set(sharing))
 55 | 
 56 | 		self.sharing = dict(zip(sharing + non_sharing, [1] * len(sharing) + [0] * len(non_sharing)))
 57 | 
 58 | 		self.rollouts = []
 59 | 		for obj in training_objects:
 60 | 			self.rollouts.append(Rollout(training_scene, obj, config, arguments))
 61 | 
 62 | 		tf.reset_default_graph()
 63 | 
 64 | 		self.PGNetworks = []
 65 | 		for i in range(2):
 66 | 			agent  = A2C(name='A2C_' + str(i), 
 67 | 						state_size=self.env.features.shape[1], 
 68 | 						action_size=self.env.action_space,
 69 | 						history_size=arguments['history_size'],
 70 | 						embedding_size=-1 if arguments['mode'] != 2 else 300,
 71 | 						entropy_coeff=self.ec,
 72 | 						value_function_coeff=self.vc,
 73 | 						max_gradient_norm=self.max_grad_norm,
 74 | 						dropout=self.dropout,
 75 | 						joint_loss=self.joint_loss,
 76 | 						learning_rate=self.lr,
 77 | 						decay=self.decay,
 78 | 						reuse=bool(self.reuse)
 79 | 						)
 80 | 
 81 | 
 82 | 			if self.decay:
 83 | 				agent.set_lr_decay(self.lr, self.num_epochs * self.num_episodes * self.num_iters)
 84 | 
 85 | 			
 86 | 			print("\nInitialized network with {} trainable weights.".format(len(agent.find_trainable_variables('A2C_' + str(i), True))))
 87 | 			self.PGNetworks.append(agent)
 88 | 
 89 | 
 90 | 		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.gpu_fraction)
 91 | 
 92 | 		self.sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options))
 93 | 		self.sess.run(tf.global_variables_initializer())
 94 | 
 95 | 		self.saver = tf.train.Saver()
 96 | 		
 97 | 		timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), \
 98 | 									training_scene, "_".join(training_objects))
 99 | 		self.log_folder = os.path.join(arguments.get('logging'), timer)
100 | 		self.writer = tf.summary.FileWriter(self.log_folder)
101 | 		
102 | 		self.timer = timer
103 | 
104 | 		test_name =  training_scene
105 | 		for i in range(len(training_objects)):
106 | 			tf.summary.scalar(test_name + "/" + training_objects[i] + "/rewards", self.PGNetworks[i].mean_reward)
107 | 			tf.summary.scalar(test_name + "/" + training_objects[i] + "/success_rate", self.PGNetworks[i].success_rate)
108 | 			tf.summary.scalar(test_name + "/" + training_objects[i] + "/redundants", self.PGNetworks[i].mean_redundant)
109 | 
110 | 		self.write_op = tf.summary.merge_all()
111 | 
112 | 	def discount_with_dones(self, rewards, dones, gamma):
113 | 		discounted = []
114 | 		r = 0
115 | 		# Start from downwards to upwards like Bellman backup operation.
116 | 		for reward, done in zip(rewards[::-1], dones[::-1]):
117 | 			r = reward + gamma * r * (1. - done)  # fixed off by one bug
118 | 			discounted.append(r)
119 | 		return discounted[::-1]
120 | 
121 | 	def generalized_advantage_estimate(self, rewards, dones, values, last_value, gamma, lamb):
122 | 		advantages = np.zeros_like(rewards)
123 | 		lastgaelam = 0
124 | 
125 |         # From last step to first step
126 | 		for t in reversed(range(len(rewards))):
127 |             # If t == before last step
128 | 			if t == len(rewards) - 1:
129 | 				# If a state is done, nextnonterminal = 0
130 | 				# In fact nextnonterminal allows us to do that logic
131 | 
132 | 				#if done (so nextnonterminal = 0):
133 | 				#    delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 
134 | 				# else (not done)
135 | 				    #delta = R + gamma * V(st+1)
136 | 				nextnonterminal = 1.0 - dones[-1]
137 | 
138 | 				# V(t+1)
139 | 				nextvalue = last_value
140 | 			else:
141 | 				nextnonterminal = 1.0 - dones[t]
142 | 
143 | 				nextvalue = values[t+1]
144 | 
145 | 			# Delta = R(t) + gamma * V(t+1) * nextnonterminal  - V(t)
146 | 			delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
147 | 
148 | 			# Advantage = delta + gamma *  (lambda) * nextnonterminal  * lastgaelam
149 | 			advantages[t] = lastgaelam = delta + gamma * lamb * nextnonterminal * lastgaelam
150 | 
151 | 		# advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
152 | 		return list(advantages)
153 | 
154 | 	def _make_batch(self, sess):
155 | 		'''
156 | 		states = [
157 | 		    [---episode_1---],...,[---episode_n---]
158 | 		]
159 | 		same as actions, tasks, rewards, values, dones
160 | 
161 | 		last_values = [
162 | 			episode_1, ...., episode_n]
163 | 		]
164 | 		same as redundants
165 | 		'''
166 | 		start = time.time()
167 | 
168 | 		task_states, task_pis, task_actions, task_returns, task_advantages, tasks = [], [], [], [], [], []
169 | 
170 | 		task_sc, task_rws, task_rdds = [], [], []
171 | 
172 | 		for i in range(2):
173 | 			states, pis, actions, rewards, values, last_values, redundants = self.rollouts[i].rollout_batch(sess, self.PGNetworks[i], return_state_ids=True)
174 | 
175 | 			success_count = 0			
176 | 			returns = []
177 | 			advantages = []
178 | 
179 | 			for ep_idx, (ep_rewards, ep_states) in enumerate(zip(rewards, states)):
180 | 				assert len(ep_rewards) == len(ep_states)
181 | 				ep_dones = list(np.zeros_like(ep_rewards))
182 | 
183 | 				if ep_rewards[-1] != self.config['success_reward']:
184 | 					last_value = last_values[ep_idx]
185 | 					assert last_value is not None
186 | 					ep_returns = self.discount_with_dones(ep_rewards + [last_value], ep_dones+[0], self.gamma)[:-1]
187 | 				else:
188 | 					success_count += 1
189 | 					last_value = 0
190 | 					ep_dones[-1] = 1
191 | 					ep_returns = self.discount_with_dones(ep_rewards, ep_dones, self.gamma)
192 | 
193 | 				returns += ep_returns
194 | 				ep_values = values[ep_idx]
195 | 
196 | 				if not self.use_gae:
197 | 					# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
198 | 			    	# rewards = R + yV(s')
199 | 					advantages += list((np.array(ep_returns) - np.array(ep_values)).astype(np.float32))
200 | 
201 | 				else:
202 | 					advantages += self.generalized_advantage_estimate(ep_rewards, ep_dones, ep_values, last_value, self.gamma, self.lamb)
203 | 			
204 | 			task_states += list(np.concatenate(states))
205 | 			task_pis += list(np.concatenate(pis))
206 | 			task_actions += list(np.concatenate(actions))
207 | 			task_returns += returns
208 | 			task_advantages += advantages
209 | 			tasks += [i] * len(returns)
210 | 
211 | 			task_sc.append(success_count)
212 | 			task_rws.append(rewards)
213 | 			task_rdds.append(redundants)
214 | 
215 | 		mean_policy = {}
216 | 		policies = {}
217 | 		for (s, a, pi, t) in zip(task_states, task_actions, task_pis, tasks):
218 | 			if self.sharing[s]:
219 | 				try:
220 | 					mean_policy[s, a].append(pi[a])
221 | 				except KeyError:
222 | 					mean_policy[s, a] = [pi[a]]
223 | 
224 | 			try:
225 | 				policies[s, t].append(pi)
226 | 			except KeyError:
227 | 				policies[s, t] = [pi]
228 | 
229 | 
230 | 		
231 | 		for k in mean_policy.keys():
232 | 			mean_policy[k] = np.mean(mean_policy[k])
233 | 
234 | 		for k in policies.keys():
235 | 			policies[k] = np.mean(policies[k], 0)
236 | 
237 | 		batch_ss, batch_as, batch_ads, batch_rs = [], [], [], []
238 | 		share_ss, share_as, share_ads = [], [], []
239 | 		for task_index in range(2):
240 | 			batch_ss.append([])
241 | 			batch_as.append([])
242 | 			batch_ads.append([])
243 | 			batch_rs.append([])
244 | 
245 | 			share_ss.append([])
246 | 			share_as.append([])
247 | 			share_ads.append([])
248 | 
249 | 		for s, a, pi, r, ad, t in zip(task_states, task_actions, task_pis, task_returns, task_advantages, tasks):
250 | 			observation = self.env.state(s).reshape(1, -1).tolist()
251 | 
252 | 			batch_ss[t].append(observation)
253 | 			batch_as[t].append(self.env.cv_action_onehot[a])
254 | 			batch_rs[t].append(r)
255 | 
256 | 			if self.sharing[s]:
257 | 				batch_ads[t].append(ad * policies[s, t][a] / mean_policy[s, a])
258 | 				try:
259 | 					importance_weight = policies[s, 1 - t][a] / mean_policy[s, a]
260 | 
261 | 					if importance_weight > 1.2:
262 | 						clipped_iw = 1.2
263 | 					elif importance_weight < 0.8:
264 | 						clipped_iw = 0.8
265 | 					else:
266 | 						clipped_iw = importance_weight
267 | 
268 | 					if clipped_iw * ad < importance_weight * ad:
269 | 						share_ads[1 - t].append(clipped_iw * ad)
270 | 					else:
271 | 						share_ads[1 - t].append(importance_weight * ad)
272 | 
273 | 						
274 | 					share_ss[1 - t].append(observation)
275 | 					share_as[1 - t].append(self.env.cv_action_onehot[a])				
276 | 				except KeyError:
277 | 					pass
278 | 			else:
279 | 				batch_ads[t].append(ad)
280 | 
281 | 
282 | 		return batch_ss,\
283 | 				 batch_as,\
284 | 				 batch_rs,\
285 | 				 batch_ads,\
286 | 				 share_ss,\
287 | 				 share_as,\
288 | 				 share_ads,\
289 | 				 task_rws,\
290 | 				 task_rdds,\
291 | 				 task_sc			 		
292 | 		
293 | 	def train(self):
294 | 		total_samples = [0, 0]
295 | 		errors = 0
296 | 
297 | 		start = time.time()
298 | 		for epoch in range(self.num_epochs):
299 | 
300 | 			batch_ss,\
301 | 			 batch_as,\
302 | 			 batch_rs,\
303 | 			 batch_ads,\
304 | 			 share_ss,\
305 | 			 share_as,\
306 | 			 share_ads,\
307 | 			 rewards,\
308 | 			 redundants,\
309 | 			 task_sc = self._make_batch(self.sess)
310 | 			#---------------------------------------------------------------------------------------------------------------------#	
311 | 			print('[{}-{}] Time elapsed: {:.3f}, epoch {}/{}, success_rate: {:.3f}'.format(\
312 | 				self.training_scene, self.training_objects, (time.time() - start)/3600, epoch + 1, \
313 | 				self.num_epochs, np.mean(task_sc) / self.num_episodes))
314 | 
315 | 			sum_dict = {}
316 | 			assert len(batch_ss) == len(batch_as) == len(batch_rs) == len(batch_ads)
317 | 			assert len(share_ss) == len(share_as) == len(share_ads)
318 | 			
319 | 			for i in range(2):
320 | 				policy_loss, value_loss, _, _ = self.PGNetworks[i].learn(self.sess, actor_states=batch_ss[i] + share_ss[i],
321 | 																	advantages=batch_ads[i] + share_ads[i], 
322 | 																	actions=batch_as[i] + share_as[i],
323 | 																	critic_states=batch_ss[i], returns=batch_rs[i])
324 | 				
325 | 				sum_dict[self.PGNetworks[i].mean_reward] = np.sum(np.concatenate(rewards[i])) / self.num_episodes
326 | 				sum_dict[self.PGNetworks[i].success_rate] = task_sc[i] / self.num_episodes
327 | 				sum_dict[self.PGNetworks[i].mean_redundant] = np.mean(redundants[i])
328 | 
329 | 				total_samples[i] += len(list(np.concatenate(rewards[i])))
330 | 
331 | 				#---------------------------------------------------------------------------------------------------------------------#	
332 | 				
333 | 
334 | 			# WRITE TF SUMMARIES
335 | 			#---------------------------------------------------------------------------------------------------------------------#	
336 | 			summary = self.sess.run(self.write_op, feed_dict = sum_dict)
337 | 
338 | 			self.writer.add_summary(summary, np.mean(total_samples))
339 | 			self.writer.flush()
340 | 			#---------------------------------------------------------------------------------------------------------------------#	
341 | 
342 | 		self.saver.save(self.sess, self.log_folder + "/my-model")
343 | 		self.sess.close()
344 | 		# SAVE MODEL
345 | 		#---------------------------------------------------------------------------------------------------------------------#	
346 | 		with open(self.log_folder + '/arguments.json', 'w') as outfile:
347 | 		    json.dump(self.arguments, outfile)
348 | 
349 | 		print("\nElapsed time: {}".format((time.time() - start)/3600))	
350 | 		#---------------------------------------------------------------------------------------------------------------------#		


--------------------------------------------------------------------------------
/tf_a2c/single_task.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np      			
  3 | import os
  4 | import sys 
  5 | import json
  6 | import time
  7 | import h5py
  8 | import sys
  9 | 
 10 | sys.path.append('..') # to access env package
 11 | 
 12 | from datetime import datetime
 13 | from model import *
 14 | from rollout import Rollout
 15 | from env.ai2thor_env import AI2ThorDumpEnv
 16 | 
 17 | class SingleTaskPolicy(object):
 18 | 
 19 | 	def __init__(
 20 | 			self,
 21 | 			training_scene,
 22 | 			training_object,
 23 | 			config,
 24 | 			arguments
 25 | 			):
 26 | 
 27 | 		self.config = config
 28 | 		self.arguments = arguments
 29 | 
 30 | 		self.training_scene = training_scene
 31 | 		self.training_object = training_object
 32 | 
 33 | 		self.use_gae = arguments.get('use_gae')
 34 | 		self.num_epochs = arguments.get('num_epochs')
 35 | 		self.num_episodes = arguments.get('num_episodes')
 36 | 		self.num_iters = arguments.get('num_iters')
 37 | 		self.gamma = arguments.get('gamma')
 38 | 		self.lamb = arguments.get('lamb')
 39 | 		self.lr = arguments.get('lr')
 40 | 		self.joint_loss = arguments.get('joint_loss')
 41 | 		self.ec = arguments.get('ec')
 42 | 		self.vc = arguments.get('vc')
 43 | 		self.max_grad_norm = arguments.get('max_gradient_norm')
 44 | 		self.dropout = arguments.get('dropout')
 45 | 		self.decay = arguments.get('decay')
 46 | 		self.reuse = arguments.get('share_latent')
 47 | 		self.gpu_fraction = arguments.get('gpu_fraction')
 48 | 
 49 | 		self.env = AI2ThorDumpEnv(training_scene, training_object, config, arguments)
 50 | 		self.rollout = Rollout(training_scene, training_object, config, arguments)
 51 | 
 52 | 		tf.reset_default_graph()
 53 | 
 54 | 		self.PGNetwork  = A2C(name='A2C', 
 55 | 							state_size=self.env.features.shape[1], 
 56 | 							action_size=self.env.action_space,
 57 | 							embedding_size=-1 if arguments['mode'] != 2 else 300,
 58 | 							history_size=arguments['history_size'],
 59 | 							entropy_coeff=self.ec,
 60 | 							value_function_coeff=self.vc,
 61 | 							max_gradient_norm=self.max_grad_norm,
 62 | 							dropout=self.dropout,
 63 | 							joint_loss=self.joint_loss,
 64 | 							learning_rate=self.lr,
 65 | 							decay=self.decay,
 66 | 							reuse=bool(self.reuse)
 67 | 							)
 68 | 
 69 | 		if self.decay:
 70 | 			self.PGNetwork.set_lr_decay(self.lr, self.num_epochs * self.num_episodes * self.num_iters)
 71 | 			
 72 | 		print("\nInitialized network with {} trainable weights.".format(len(self.PGNetwork.find_trainable_variables('A2C', True))))
 73 | 
 74 | 		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.gpu_fraction)
 75 | 
 76 | 		self.sess = tf.Session(config = tf.ConfigProto(gpu_options=gpu_options))
 77 | 		self.sess.run(tf.global_variables_initializer())
 78 | 
 79 | 		self.saver = tf.train.Saver()
 80 | 		
 81 | 		timer = "{}_{}_{}".format(str(datetime.now()).replace(" ", "-").replace(".", "-").replace(":", "-"), training_scene, training_object)
 82 | 		self.log_folder = os.path.join(arguments.get('logging'), timer)
 83 | 		self.writer = tf.summary.FileWriter(self.log_folder)
 84 | 		
 85 | 		self.timer = timer
 86 | 
 87 | 		test_name =  training_scene
 88 | 		tf.summary.scalar(test_name + "/" + training_object + "/rewards", self.PGNetwork.mean_reward)
 89 | 		tf.summary.scalar(test_name + "/" + training_object + "/success_rate", self.PGNetwork.success_rate)
 90 | 		tf.summary.scalar(test_name + "/" + training_object + "/redundants", self.PGNetwork.mean_redundant)
 91 | 
 92 | 		self.write_op = tf.summary.merge_all()
 93 | 
 94 | 	def discount_with_dones(self, rewards, dones, gamma):
 95 | 		discounted = []
 96 | 		r = 0
 97 | 		# Start from downwards to upwards like Bellman backup operation.
 98 | 		for reward, done in zip(rewards[::-1], dones[::-1]):
 99 | 			r = reward + gamma * r * (1. - done)  # fixed off by one bug
100 | 			discounted.append(r)
101 | 		return discounted[::-1]
102 | 
103 | 	def generalized_advantage_estimate(self, rewards, dones, values, last_value, gamma, lamb):
104 | 		advantages = np.zeros_like(rewards)
105 | 		lastgaelam = 0
106 | 
107 |         # From last step to first step
108 | 		for t in reversed(range(len(rewards))):
109 |             # If t == before last step
110 | 			if t == len(rewards) - 1:
111 | 				# If a state is done, nextnonterminal = 0
112 | 				# In fact nextnonterminal allows us to do that logic
113 | 
114 | 				#if done (so nextnonterminal = 0):
115 | 				#    delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0) 
116 | 				# else (not done)
117 | 				    #delta = R + gamma * V(st+1)
118 | 				nextnonterminal = 1.0 - dones[-1]
119 | 
120 | 				# V(t+1)
121 | 				nextvalue = last_value
122 | 			else:
123 | 				nextnonterminal = 1.0 - dones[t]
124 | 
125 | 				nextvalue = values[t+1]
126 | 
127 | 			# Delta = R(t) + gamma * V(t+1) * nextnonterminal  - V(t)
128 | 			delta = rewards[t] + gamma * nextvalue * nextnonterminal - values[t]
129 | 
130 | 			# Advantage = delta + gamma *  (lambda) * nextnonterminal  * lastgaelam
131 | 			advantages[t] = lastgaelam = delta + gamma * lamb * nextnonterminal * lastgaelam
132 | 
133 | 		# advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
134 | 		return list(advantages)
135 | 
136 | 	def _make_batch(self, sess):
137 | 		'''
138 | 		states = [
139 | 		    [---episode_1---],...,[---episode_n---]
140 | 		]
141 | 		same as actions, tasks, rewards, values, dones
142 | 
143 | 		last_values = [
144 | 			episode_1, ...., episode_n]
145 | 		]
146 | 		same as redundants
147 | 		'''
148 | 		states, task_logits, actions, rewards, values, last_values, redundants = self.rollout.rollout_batch(sess, self.PGNetwork)
149 | 
150 | 		observations = []
151 | 		converted_actions = []
152 | 		logits = []
153 | 		success_count = 0
154 | 
155 | 		for ep_idx, ep_states in enumerate(states):
156 | 			observations += [s.tolist() for s in ep_states]
157 | 			converted_actions += [self.env.cv_action_onehot[a] for a in actions[ep_idx]]
158 | 			logits += task_logits[ep_idx]
159 | 
160 | 		returns = []
161 | 		advantages = []
162 | 
163 | 		for ep_idx, (ep_rewards, ep_states) in enumerate(zip(rewards, states)):
164 | 			assert len(ep_rewards) == len(ep_states)
165 | 			ep_dones = list(np.zeros_like(ep_rewards))
166 | 
167 | 			if ep_rewards[-1] != self.config['success_reward']:
168 | 				last_value = last_values[ep_idx]
169 | 				assert last_value is not None
170 | 				ep_returns = self.discount_with_dones(ep_rewards + [last_value], ep_dones+[0], self.gamma)[:-1]
171 | 			else:
172 | 				success_count += 1
173 | 				last_value = 0
174 | 				ep_dones[-1] = 1
175 | 				ep_returns = self.discount_with_dones(ep_rewards, ep_dones, self.gamma)
176 | 
177 | 			returns += ep_returns
178 | 			ep_values = values[ep_idx]
179 | 
180 | 			if not self.use_gae:
181 | 				# Here we calculate advantage A(s,a) = R + yV(s') - V(s)
182 | 		    	# rewards = R + yV(s')
183 | 				advantages += list((np.array(ep_returns) - np.array(ep_values)).astype(np.float32))
184 | 
185 | 			else:
186 | 				advantages += self.generalized_advantage_estimate(ep_rewards, ep_dones, ep_values, last_value, self.gamma, self.lamb)
187 | 		
188 | 		return observations,\
189 | 				 converted_actions,\
190 | 				 returns,\
191 | 				 advantages,\
192 | 				 logits,\
193 | 				 rewards,\
194 | 				 redundants,\
195 | 				 success_count			 		
196 | 		
197 | 	def train(self):
198 | 		total_samples = 0
199 | 		errors = 0
200 | 
201 | 		start = time.time()
202 | 		for epoch in range(self.num_epochs):
203 | 			# sys.stdout.flush()
204 | 			
205 | 			# ROLLOUT SAMPLE
206 | 			#---------------------------------------------------------------------------------------------------------------------#	
207 | 			mb_states,\
208 | 			mb_actions,\
209 | 			mb_returns,\
210 | 			mb_advantages,\
211 | 			mb_logits,\
212 | 			rewards,\
213 | 			redundants,\
214 | 			success_count = self._make_batch(self.sess)
215 | 			
216 | 			if len(np.asarray(mb_returns).shape) == 2:
217 | 				print("Error happened!")
218 | 				if not os.path.isdir(os.path.join("errors", self.timer)):
219 | 					os.mkdir(os.path.join("errors", self.timer))
220 | 
221 | 				f = h5py.File(os.path.join("errors", self.timer, "{}.hdf5".format(errors)), 'w')
222 | 				f.create_dataset("states", data=np.asarray(mb_states, np.float32))
223 | 				f.create_dataset("actions", data=np.asarray(mb_actions, np.float32))
224 | 				f.create_dataset("returns", data=np.asarray(mb_returns, np.float32))
225 | 				f.create_dataset("advantages", data=np.asarray(mb_advantages, np.float32))
226 | 				f.create_dataset("logits", data=np.asarray(mb_logits, np.float32))
227 | 				f.create_dataset("rewards", data=np.asarray(rewards, np.float32))
228 | 				f.close()
229 | 
230 | 				errors += 1
231 | 				print("=======\n")
232 | 
233 | 				mb_returns = [r[0] for r in mb_returns]
234 | 			#---------------------------------------------------------------------------------------------------------------------#	
235 | 			print('[{}-{}] Time elapsed: {:.3f}, epoch {}/{}, success_rate: {:.3f}'.format(\
236 | 				self.training_scene, self.training_object, (time.time() - start)/3600, epoch + 1, self.num_epochs, success_count / self.num_episodes))
237 | 
238 | 			sum_dict = {}
239 | 			assert len(mb_states) == len(mb_actions) == len(mb_returns) == len(mb_advantages)
240 | 			
241 | 			policy_loss, value_loss, _, _ = self.PGNetwork.learn(self.sess, actor_states=mb_states,
242 | 																advantages=mb_advantages, actions=mb_actions,
243 | 																critic_states=mb_states, returns=mb_returns)
244 | 				
245 | 			sum_dict[self.PGNetwork.mean_reward] = np.sum(np.concatenate(rewards)) / len(rewards)
246 | 			sum_dict[self.PGNetwork.success_rate] = success_count / self.num_episodes
247 | 			sum_dict[self.PGNetwork.mean_redundant] = np.mean(redundants)
248 | 			
249 | 			total_samples += len(list(np.concatenate(rewards)))
250 | 
251 | 			#---------------------------------------------------------------------------------------------------------------------#	
252 | 			
253 | 
254 | 			# WRITE TF SUMMARIES
255 | 			#---------------------------------------------------------------------------------------------------------------------#	
256 | 			summary = self.sess.run(self.write_op, feed_dict = sum_dict)
257 | 
258 | 			self.writer.add_summary(summary, total_samples)
259 | 			self.writer.flush()
260 | 			#---------------------------------------------------------------------------------------------------------------------#	
261 | 
262 | 		self.saver.save(self.sess, self.log_folder + "/my-model")
263 | 		self.sess.close()
264 | 		# SAVE MODEL
265 | 		#---------------------------------------------------------------------------------------------------------------------#	
266 | 		with open(self.log_folder + '/arguments.json', 'w') as outfile:
267 | 		    json.dump(self.arguments, outfile)
268 | 
269 | 		print("\nElapsed time: {}".format((time.time() - start)/3600))	
270 | 		#---------------------------------------------------------------------------------------------------------------------#		


--------------------------------------------------------------------------------
/tf_a2c/utils.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | def mse(predicted, ground_truth):
 5 |     # Mean-squared error
 6 |     return tf.square(predicted - ground_truth) / 2.
 7 | 
 8 | def noise_and_argmax(logits):
 9 |     logits = np.asarray(logits, dtype = np.float32)
10 |     # Add noise then take the argmax
11 |     noise = np.random.uniform(0, 1, logits.shape)
12 |     
13 |     return np.argmax(logits - np.log(-np.log(noise)))
14 | 
15 | def openai_entropy(logits):
16 |     # Entropy proposed by OpenAI in their A2C baseline
17 |     a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
18 |     ea0 = tf.exp(a0)
19 |     z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
20 |     p0 = ea0 / z0
21 |     return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
22 | 
23 | class LearningRateDecay(object):
24 |     def __init__(self, v, nvalues, lr_decay_method):
25 |         self.n = 0.
26 |         self.v = v
27 |         self.nvalues = nvalues
28 | 
29 |         def constant(p):
30 |             return 1
31 | 
32 |         def linear(p):
33 |             return 1 - p
34 | 
35 |         lr_decay_methods = {
36 |             'linear': linear,
37 |             'constant': constant
38 |         }
39 | 
40 |         self.decay = lr_decay_methods[lr_decay_method]
41 | 
42 |     def value(self):
43 |         current_value = self.v * self.decay(self.n / self.nvalues)
44 |         self.n += 1.
45 |         return current_value
46 | 
47 |     def get_value_for_steps(self, steps):
48 |         return self.v * self.decay(steps / self.nvalues)


--------------------------------------------------------------------------------
/tsne.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tailongnguyen/RL-target-driven-navigation-ai2thor/0820fd4229450121b4c6929c4784b55d7b54f366/tsne.png


--------------------------------------------------------------------------------
/visualize.py:
--------------------------------------------------------------------------------
 1 | import matplotlib as mpl
 2 | import matplotlib.pyplot as plt
 3 | import numpy as np
 4 | import pandas as pd
 5 | import os
 6 | 
 7 | def foo_all(folder, smooth=5):
 8 | 	chosen_cols = ['rewards', 'success_rate']
 9 | 	fig, axes = plt.subplots(nrows= 1, ncols=len(chosen_cols))
10 | 	lines = []
11 | 	labels = []
12 | 	for ax, t in zip(axes, chosen_cols):
13 | 		ax.set_title(t if t != 'redundants' else 'redundant steps')
14 | 		files = [os.path.join(folder, f) for f in os.listdir(folder) if t in f]
15 | 		files = sorted(files, key=lambda x: x.split('/')[-1].split('_')[0])
16 | 		for i, f in enumerate(files):
17 | 			labels.append(f.split("/")[-1].split('_')[0])
18 | 			log = pd.read_csv(f)
19 | 
20 | 			avg = log['Value'].tolist()
21 | 			smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))]
22 | 
23 | 			ax.xaxis.set_major_locator(plt.MaxNLocator(4))
24 | 			ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: format(x / 1e6, ',') + ' M'))
25 | 
26 | 			line = ax.plot(log['Step'].tolist(), smoothed_y, c='C{}'.format(i))
27 | 			ax.plot(log['Step'].tolist(), avg, c='C{}'.format(i), alpha=0.3)
28 | 			lines.append(line[0])
29 | 
30 | 	fig.set_size_inches(12, 4)
31 | 	
32 | 	# leg = fig.legend(lines[::2], ["{} {}".format(*folder.split('/')[-1].split('_')) for folder in folders], loc = 8, ncol = 2, bbox_to_anchor = (0.50, -0.00), fontsize ='large')
33 | 	leg = fig.legend(lines[:len(labels)//2], labels[:len(labels)//2], loc = 8, ncol = 3, bbox_to_anchor = (0.4, -0.00), fontsize ='large')
34 | 
35 | 	# set the linewidth of each legend object
36 | 	for legobj in leg.get_lines():
37 | 	    legobj.set_linewidth(4.0)
38 | 
39 | 	plt.subplots_adjust(wspace = 0.1, hspace = 0.3, bottom = 0.3)
40 | 	plt.savefig("All " + folder.split('/')[-1] + '.png', bbox_inches='tight', dpi = 250)
41 | 
42 | def foo(folders, smooth=5):
43 | 	chosen_cols = ['rewards', 'success_rate']
44 | 	fig, axes = plt.subplots(nrows= 1, ncols=len(chosen_cols))
45 | 	colors = ['C0', 'C1']
46 | 	lines = []
47 | 	for i, folder in enumerate(folders):
48 | 		for ax, t in zip(axes, chosen_cols):
49 | 			ax.set_title(t if t != 'redundants' else 'redundant steps')
50 | 			files = [os.path.join(folder, f) for f in os.listdir(folder) if t in f]
51 | 			logs = []
52 | 			for f in files:
53 | 				logs.append(pd.read_csv(f))
54 | 
55 | 			min_size = min([l.shape[0] for l in logs])
56 | 
57 | 			new_logs = []
58 | 			for l in logs:
59 | 				new_logs.append(l['Value'].tolist()[:min_size])
60 | 
61 | 			avg = np.mean(np.vstack(new_logs), 0).tolist()
62 | 			smoothed_y = [np.mean(avg[max(0, yi - smooth):min(yi + smooth, len(avg)-1)]) for yi in range(len(avg))]
63 | 
64 | 			ax.xaxis.set_major_locator(plt.MaxNLocator(4))
65 | 			ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: format(x / 1e6, ',') + ' M'))
66 | 
67 | 			line = ax.plot(logs[0]['Step'].tolist(), smoothed_y, c=colors[i])
68 | 			ax.plot(logs[0]['Step'].tolist(), avg, c=colors[i], alpha=0.3)
69 | 			lines.append(line[0])
70 | 
71 | 	fig.set_size_inches(12, 4)
72 | 	
73 | 	leg = fig.legend(lines[::2], ["{} {}".format(*folder.split('/')[-1].split('_')) for folder in folders], loc = 8, ncol = 2, bbox_to_anchor = (0.42, -0.00), fontsize ='large')
74 | 	# leg = fig.legend(lines[::2], ["4-stacked-frames", '1-frame'], loc = 8, ncol = 2, bbox_to_anchor = (0.40, -0.00), fontsize ='large')
75 | 
76 | 	# set the linewidth of each legend object
77 | 	for legobj in leg.get_lines():
78 | 	    legobj.set_linewidth(4.0)
79 | 
80 | 	plt.subplots_adjust(wspace = 0.1, hspace = 0.3, bottom = 0.2)
81 | 	plt.savefig("Compare " + folder.split('/')[-1].split('_')[0] + '.png', bbox_inches='tight', dpi = 250)
82 | 
83 | if __name__ == '__main__':
84 | 	foo(["/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan1_4", "/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan1_6"])
85 | 	# foo_all("/home/yoshi/thesis/RL-target-driven-navigation-ai2thor/tf_a2c/training-history/FloorPlan28_6")


--------------------------------------------------------------------------------