├── .gitignore ├── LICENSE ├── README.md ├── experiments ├── .csv ├── TSP.py ├── compare.py ├── crazy_env │ ├── __init__.py │ ├── data_collection3-0604.py │ ├── data_collection4.py │ ├── env_setting3.py │ ├── log3.py │ ├── tsp_data_collection.py │ └── tsp_env_setting.py ├── env0 │ ├── __init__.py │ ├── data_collection0.py │ ├── env_setting0.py │ └── log0.py ├── image │ ├── __init__.py │ ├── flag.py │ ├── map.py │ └── mapM.py ├── poor_compare.py ├── random_generator.py ├── test.py ├── test_random.py ├── train.py └── visualization.py ├── maddpg ├── __init__.py ├── common │ ├── __init__.py │ ├── distributions.py │ ├── summary.py │ └── tf_util.py └── trainer │ ├── maddpg.py │ └── prioritized_rb │ ├── __init__.py │ ├── proportional.py │ ├── replay_buffer.py │ └── sum_tree.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # pycharm 124 | .idea 125 | .idea/ 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 BIT-MCS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Edics 2 | This is the code accompanying the paper: "[Energy-Efficient UAV Control for Effective and Fair Communication Coverage: A Deep Reinforcement Learning Approach](https://ieeexplore.ieee.org/document/8432464)", published in JSAC. 3 | 4 | ## :page_facing_up: Description 5 | Unmanned aerial vehicles (UAVs) can be used to serve as aerial base stations to enhance both the coverage and performance of communication networks in various scenarios, such as emergency communications and network access for remote areas. Mobile UAVs can establish communication links for ground users to deliver packets. However, UAVs have limited communication ranges and energy resources. Particularly, for a large region, they cannot cover the entire area all the time or keep flying for a long time. It is thus challenging to control a group of UAVs to achieve certain communication coverage in a long run, while preserving their connectivity and minimizing their energy consumption. Toward this end, we propose to leverage emerging deep reinforcement learning (DRL) for UAV control and present a novel and highly energy-efficient DRL-based method, which we call DRL-based energy-efficient control for coverage and connectivity ($DRL-EC^3$ ). 
The proposed method 1) maximizes a novel energy efficiency function with joint consideration for communications coverage, fairness, energy consumption and connectivity; 2) learns the environment and its dynamics; and 3) makes decisions under the guidance of two powerful deep neural networks. We conduct extensive simulations for performance evaluation. 6 | 7 | ## :wrench: Installation 8 | 1. Clone repo 9 | ```bash 10 | git clone https://github.com/BIT-MCS/DRL-EC3.git 11 | cd DRL-EC3 12 | ``` 13 | 2. Install dependencies (note: tensorflow-gpu 1.15 requires Python 3.7 or earlier) 14 | ```sh 15 | conda create -n mcs python==3.7 16 | conda activate mcs 17 | pip install tensorflow-gpu==1.15 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | 22 | ## :computer: Training 23 | 24 | Train our solution: 25 | ```bash 26 | python experiments/train.py 27 | ``` 28 | ## :checkered_flag: Testing 29 | 30 | Test with the trained models: 31 | 32 | ```sh 33 | python experiments/test.py --load-dir=your_model_path 34 | ``` 35 | 36 | Randomly test the environment: 37 | 38 | ```sh 39 | python experiments/test_random.py 40 | ``` 41 | Each test run reports the data collection ratio, geographical fairness, energy consumption, and energy efficiency (sketched below). 42 | ## :clap: Reference 43 | - https://github.com/openai/maddpg 44 | 45 | 46 | ## :scroll: Acknowledgement 47 | 48 | This work was supported in part by the National Natural Science Foundation of China under Grant 61772072 and in part by the National Key Research and Development Program of China under Grant 2018YFB1003701. 49 |
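The collection ratio, geographical fairness, and energy efficiency reported by the test scripts are the same quantities logged in `experiments/.csv`. A minimal standalone sketch of the fairness and per-step efficiency formulas, mirroring `__get_fairness` and `__get_eff1` in `experiments/crazy_env/tsp_data_collection.py` (the helper names and example numbers here are illustrative only, not part of the repository):

```python
import numpy as np

def jain_fairness(values):
    # Jain's fairness index over per-PoI visit/collection amounts; 1.0 means perfectly even.
    values = np.asarray(values, dtype=np.float64)
    sum_of_square = np.sum(np.square(values))
    if sum_of_square == 0:
        return 0.0
    return np.square(np.sum(values)) / (len(values) * sum_of_square)

def step_efficiency(collected, distance, alpha=1.0, epsilon=1e-4):
    # Data collected per unit of combined movement and collection cost (cf. __get_eff1).
    return collected / (distance + alpha * collected + epsilon)

# Example with illustrative numbers: near-even collection over four PoIs, a one-unit move.
print(jain_fairness([0.8, 0.9, 0.7, 0.85]))          # ~0.99
print(step_efficiency(collected=0.2, distance=1.0))  # ~0.17
```

Jain's index equals 1.0 when every PoI is served equally and falls toward 1/N when a single PoI dominates.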
50 | Corresponding author: Chi Harold Liu. 51 | 52 | ## :e-mail: Contact 53 | 54 | If you have any question, please email `daizipeng@bit.edu.cn`. 55 | 56 | ## Paper 57 | If you are interested in our work, please cite our paper as 58 | 59 | ``` 60 | @ARTICLE{liu2018energy, 61 | author={Liu, Chi Harold and Chen, Zheyu and Tang, Jian and Xu, Jie and Piao, Chengzhe}, 62 | journal={IEEE Journal on Selected Areas in Communications (JSAC)}, 63 | title={Energy-Efficient UAV Control for Effective and Fair Communication Coverage: A Deep Reinforcement Learning Approach}, 64 | year={2018}, 65 | volume={36}, 66 | number={9}, 67 | pages={2059-2070}, 68 | } 69 | ``` 70 | -------------------------------------------------------------------------------- /experiments/.csv: -------------------------------------------------------------------------------- 1 | ,test_model,collection_ratio,fairness,consumption of energy,efficiency 2 | 1,71,0.953,0.92802734375,1.401,0.4912266999477505 3 | 2,72,0.9604,0.92109375,1.363,0.4933548979332965 4 | 3,73,0.916,0.899365234375,1.532421875,0.42414871986096025 5 | 4,74,0.964,0.938623046875,1.444,0.4934789119789393 6 | 5,75,0.9316,0.907373046875,1.403,0.4689399282267934 7 | 6,76,0.991,0.95537109375,1.328,0.5371141743779904 8 | 7,77,0.9043,0.88916015625,1.54296875,0.4126325189324473 9 | 8,78,0.957,0.92060546875,1.448,0.4678810126150502 10 | 9,79,0.945,0.92197265625,1.520703125,0.4475759912848207 11 | -------------------------------------------------------------------------------- /experiments/TSP.py: -------------------------------------------------------------------------------- 1 | from experiments.crazy_env.tsp_data_collection import Env 2 | from experiments.crazy_env import log3 as Log 3 | import numpy as np 4 | import math 5 | import time 6 | import random, operator 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def myint(a): 12 | # return int(np.ceil(a)) 13 | return int(np.floor(a)) 14 | 15 | 16 | class City: 17 | def __init__(self, x, y, env): 18 | self.x = x 19 | self.y = y 20 | self.env = env 21 | 22 | def distance(self, tocity): 23 | dx = tocity.x - self.x 24 | dy = tocity.y - self.y 25 | 26 | if 0 <= self.x + dx < self.env.mapx and 0 <= self.x + dx < self.env.mapy and self.env.mapob[myint(self.x + dx)][ 27 | myint(self.y + dy)] != self.env.OB and \ 28 | self.env.mapob[myint(self.x + (dx / 2))][myint(self.y + (dy / 2))] != self.env.OB and \ 29 | self.env.mapob[myint(self.x + (dx / 3))][myint(self.y + (dy / 3))] != self.env.OB and \ 30 | self.env.mapob[myint(self.x + (2 * dx / 3))][myint(self.y + (2 * dy / 3))] != self.env.OB and \ 31 | self.env.mapob[myint(self.x + (dx / 4))][myint(self.y + (dy / 4))] != self.env.OB and \ 32 | self.env.mapob[myint(self.x + (3 * dx / 4))][myint(self.y + (3 * dy / 4))] != self.env.OB: 33 | 34 | distance = np.sqrt((dx ** 2) + (dy ** 2)) 35 | else: 36 | distance = 50 37 | 38 | return distance 39 | 40 | def __repr__(self): 41 | return "(" + str(self.x) + "," + str(self.y) + ")" 42 | 43 | 44 | class Fitness: 45 | def __init__(self, route): 46 | self.route = route 47 | self.distance = 0 48 | self.fitness = 0.0 49 | 50 | def routeDistance(self): 51 | if self.distance == 0: 52 | pathDistance = 0 53 | for i in range(0, len(self.route)): 54 | fromCity = self.route[i] 55 | toCity = None 56 | if i + 1 < len(self.route): 57 | toCity = self.route[i + 1] 58 | else: 59 | toCity = self.route[0] 60 | pathDistance += fromCity.distance(toCity) 61 | self.distance = pathDistance 62 | return self.distance 63 | 64 | def 
routeFitness(self): 65 | if self.fitness == 0: 66 | self.fitness = 1 / float(self.routeDistance()) 67 | return self.fitness 68 | 69 | 70 | def createRoute(cityList): 71 | route = random.sample(cityList, len(cityList)) 72 | return route 73 | 74 | 75 | def initialPopulation(popSize, cityList): 76 | population = [] 77 | 78 | for i in range(0, popSize): 79 | population.append(createRoute(cityList)) 80 | return population 81 | 82 | 83 | def rankRoutes(population): 84 | fitnessResults = {} 85 | for i in range(0, len(population)): 86 | fitnessResults[i] = Fitness(population[i]).routeFitness() 87 | return sorted(fitnessResults.items(), key=operator.itemgetter(1), reverse=True) 88 | 89 | 90 | def selection(popRanked, eliteSize): 91 | selectionResults = [] 92 | df = pd.DataFrame(np.array(popRanked), columns=["Index", "Fitness"]) 93 | df['cum_sum'] = df.Fitness.cumsum() 94 | df['cum_perc'] = 100 * df.cum_sum / df.Fitness.sum() 95 | 96 | for i in range(0, eliteSize): 97 | selectionResults.append(popRanked[i][0]) 98 | for i in range(0, len(popRanked) - eliteSize): 99 | pick = 100 * random.random() 100 | for i in range(0, len(popRanked)): 101 | if pick <= df.iat[i, 3]: 102 | selectionResults.append(popRanked[i][0]) 103 | break 104 | return selectionResults 105 | 106 | 107 | def matingPool(population, selectionResults): 108 | matingpool = [] 109 | for i in range(0, len(selectionResults)): 110 | index = selectionResults[i] 111 | matingpool.append(population[index]) 112 | return matingpool 113 | 114 | 115 | def breed(parent1, parent2): 116 | child = [] 117 | childP1 = [] 118 | childP2 = [] 119 | 120 | geneA = int(random.random() * len(parent1)) 121 | geneB = int(random.random() * len(parent1)) 122 | 123 | startGene = min(geneA, geneB) 124 | endGene = max(geneA, geneB) 125 | 126 | for i in range(startGene, endGene): 127 | childP1.append(parent1[i]) 128 | 129 | childP2 = [item for item in parent2 if item not in childP1] 130 | 131 | child = childP1 + childP2 132 | return child 133 | 134 | 135 | def breedPopulation(matingpool, eliteSize): 136 | children = [] 137 | length = len(matingpool) - eliteSize 138 | pool = random.sample(matingpool, len(matingpool)) 139 | 140 | for i in range(0, eliteSize): # to carry the best individuals into the next generation 141 | children.append(matingpool[i]) 142 | 143 | for i in range(0, length): 144 | child = breed(pool[i], pool[len(matingpool) - i - 1]) 145 | children.append(child) 146 | return children 147 | 148 | 149 | def mutate(individual, mutationRate): 150 | for swapped in range(len(individual)): 151 | if (random.random() < mutationRate): 152 | swapWith = int(random.random() * len(individual)) 153 | 154 | city1 = individual[swapped] 155 | city2 = individual[swapWith] 156 | 157 | individual[swapped] = city2 158 | individual[swapWith] = city1 159 | return individual 160 | 161 | 162 | def mutatePopulation(population, mutationRate): 163 | mutatedPop = [] 164 | 165 | for ind in range(0, len(population)): 166 | mutatedInd = mutate(population[ind], mutationRate) 167 | mutatedPop.append(mutatedInd) 168 | return mutatedPop 169 | 170 | 171 | def nextGeneration(currentGen, eliteSize, mutationRate): 172 | popRanked = rankRoutes(currentGen) 173 | selectionResults = selection(popRanked, eliteSize) 174 | matingpool = matingPool(currentGen, selectionResults) 175 | children = breedPopulation(matingpool, eliteSize) 176 | nextGeneration = mutatePopulation(children, mutationRate) 177 | return nextGeneration 178 | 179 | 180 | def geneticAlgorithm(population, popSize, eliteSize, mutationRate, 
generations): 181 | pop = initialPopulation(popSize, population) 182 | print("Initial distance: " + str(1 / rankRoutes(pop)[0][1])) 183 | 184 | for i in range(0, generations): 185 | pop = nextGeneration(pop, eliteSize, mutationRate) 186 | 187 | print("Final distance: " + str(1 / rankRoutes(pop)[0][1])) 188 | bestRouteIndex = rankRoutes(pop)[0][0] 189 | bestRoute = pop[bestRouteIndex] 190 | return bestRoute 191 | 192 | 193 | def geneticAlgorithmPlot(population, popSize, eliteSize, mutationRate, generations, env_log, reg_n): 194 | log_path = env_log.full_path 195 | pop = initialPopulation(popSize, population) 196 | print("Initial distance: " + str(1 / rankRoutes(pop)[0][1])) 197 | 198 | progress = [] 199 | progress.append(1 / rankRoutes(pop)[0][1]) 200 | 201 | for i in range(0, generations): 202 | pop = nextGeneration(pop, eliteSize, mutationRate) 203 | progress.append(1 / rankRoutes(pop)[0][1]) 204 | end = False 205 | if i % 10 == 0: 206 | plt.plot(progress) 207 | plt.ylabel('Distance') 208 | plt.xlabel('Generation') 209 | plt.savefig(log_path + '/Distance_generation_%d.png' % (reg_n)) 210 | plt.close() 211 | if i > 50: 212 | test_coverage = progress[i - 50:i] 213 | list_var = np.var(test_coverage) 214 | print("%d th var: %f" % (i, list_var)) 215 | else: 216 | list_var = 1e5 217 | print(i) 218 | 219 | if list_var < 1e-5: 220 | end = True 221 | break 222 | 223 | if end is True: 224 | break 225 | 226 | print("Final distance: " + str(1 / rankRoutes(pop)[0][1])) 227 | bestRouteIndex = rankRoutes(pop)[0][0] 228 | bestRoute = pop[bestRouteIndex] 229 | return bestRoute 230 | 231 | 232 | def train(num_uav): 233 | log = Log.Log() 234 | env = Env(log) 235 | print("training %d PoIs..." % (len(env.datas))) 236 | start = time.clock() 237 | 238 | for n in range(num_uav): 239 | cityList = [] 240 | 241 | for i in range(0, len(env.datas)): 242 | # 随机测试 243 | # cityList.append(City(x=random.random() * 16, y=random.random() * 16)) 244 | datax = env.datas[i][0] 245 | datay = env.datas[i][1] 246 | ab_reg = float(env.mapx) / num_uav 247 | if ab_reg * n <= datax <= ab_reg * (n + 1): 248 | cityList.append(City(x=datax, y=datay, env=env)) 249 | 250 | print("\nthe %dth region: %d PoI" % (n, len(cityList))) 251 | # geneticAlgorithm(population=cityList, popSize=100, eliteSize=20, mutationRate=0.01, generations=500) 252 | 253 | bestRoute = geneticAlgorithmPlot(population=cityList, popSize=300, eliteSize=50, mutationRate=0.01, 254 | generations=3000, 255 | env_log=log, 256 | reg_n=n) 257 | 258 | bestRoutelist = [] 259 | for poi in bestRoute: 260 | bestRoutelist.append([poi.x, poi.y]) 261 | 262 | bestRouteDataFrame = pd.DataFrame(np.array(bestRoutelist), columns=["x", "y"]) 263 | bestRouteDataFrame.to_csv(log.full_path + '/saved_route_uav%d.csv' % n) 264 | 265 | training_time = time.clock() - start 266 | print("\n\nTraining time: ", training_time) 267 | 268 | 269 | def __cusume_energy(env, uav, value, distance): 270 | # distance-0.1, alpha-1.0 271 | if (env.factor * distance + env.alpha * value < env.energy[uav]): 272 | env.energy[uav] -= (env.factor * distance + env.alpha * value) 273 | env.use_energy[uav] += (env.factor * distance + env.alpha * value) 274 | else: 275 | env.use_energy[uav] += env.energy[uav] 276 | distance = env.energy[uav] / env.factor 277 | env.energy[uav] = 0 278 | 279 | return env 280 | 281 | 282 | def test(num_uav, model_path): 283 | print("testing...") 284 | log = Log.Log() 285 | env = Env(log) 286 | _ = env.reset() 287 | 288 | for n in range(num_uav): 289 | df = 
pd.read_csv("%s/saved_route_uav%d.csv" % (model_path, n)) 290 | print("the %dth region: %d PoI" % (n, df.shape[0])) 291 | step = 0 292 | i = 0 293 | 294 | while step < 500: 295 | new_positions = [df.loc[i, 'x'], df.loc[i, 'y']] 296 | 297 | # charge 298 | _pos = np.repeat([new_positions], [env.fills.shape[0]], axis=0) # just repeat(On) NB! 299 | _minus = env.fills - _pos 300 | _power = np.power(_minus, 2) 301 | _dis = np.sum(_power, axis=1) 302 | for index, dis in enumerate(_dis): 303 | # sensing Fill Station(crange=1.1) 304 | if np.sqrt(dis) <= env.crange: 305 | # uodate poi data 306 | if env.fills_energy_remain[index] > 0: 307 | # TODO:加油站的信息更新 308 | if env.fspeed * env.maxenergy <= env.fills_energy_remain[index]: 309 | if env.energy[n] + env.fspeed * env.maxenergy <= env.maxenergy: 310 | env.fill_energy[n] += env.fspeed * env.maxenergy 311 | env.fills_energy_remain[index] -= env.fspeed * env.maxenergy 312 | env.energy[n] += env.fspeed * env.maxenergy 313 | else: 314 | env.fill_energy[n] += env.maxenergy - env.energy[n] 315 | env.fills_energy_remain[index] -= (env.maxenergy - env.energy[n]) 316 | env.energy[n] = env.maxenergy 317 | else: 318 | if env.energy[n] + env.fills_energy_remain[index] <= env.maxenergy: 319 | env.fill_energy[n] += env.fills_energy_remain[index] 320 | env.energy[n] += env.fills_energy_remain[index] 321 | env.fills_energy_remain[index] = 0 322 | else: 323 | env.fill_energy[n] += env.maxenergy - env.energy[n] 324 | env.fills_energy_remain[index] -= (env.maxenergy - env.energy[n]) 325 | env.energy[n] = env.maxenergy 326 | break 327 | 328 | # collect 329 | data = 0 330 | _pos = np.repeat([new_positions], [env.datas.shape[0]], axis=0) 331 | _minus = env.datas - _pos 332 | _power = np.power(_minus, 2) 333 | _dis = np.sum(_power, axis=1) 334 | for index, dis in enumerate(_dis): 335 | # sensing PoI(crange=1.1) 336 | if np.sqrt(dis) <= env.crange: 337 | # uodate poi data 338 | if env.mapmatrix[index] > 0: 339 | tmp_data = env._mapmatrix[index] * env.cspeed 340 | if env.energy[n] >= tmp_data * env.alpha: 341 | data += tmp_data 342 | env.mapmatrix[index] -= tmp_data 343 | if env.mapmatrix[index] < 0: 344 | env.mapmatrix[index] = 0. 345 | else: 346 | data += env.energy[n] 347 | env.mapmatrix[index] -= env.energy[n] 348 | if env.mapmatrix[index] < 0: 349 | env.mapmatrix[index] = 0. 350 | break 351 | 352 | value = data if env.energy[n] >= data * env.alpha else env.energy[n] 353 | env.collection[n] += value 354 | env = __cusume_energy(env, n, value, 0.) # collect 355 | 356 | if i == df.shape[0] - 1: 357 | # env.energy[n]=env.maxenergy # 不加! 
358 | ii = 0 359 | else: 360 | ii = i + 1 361 | 362 | distance = np.sqrt(((df.loc[ii, 'x'] - df.loc[i, 'x']) ** 2) + ((df.loc[ii, 'y'] - df.loc[i, 'y']) ** 2)) 363 | 364 | if distance <= env.maxdistance: 365 | env = __cusume_energy(env, n, 0, distance) # move 366 | 367 | # 撞墙 368 | dx = df.loc[ii, 'x'] - df.loc[i, 'x'] 369 | dy = df.loc[ii, 'y'] - df.loc[i, 'y'] 370 | if 0 <= df.loc[ii, 'x'] < env.mapx and 0 <= df.loc[ii, 'y'] < env.mapy and \ 371 | env.mapob[myint(df.loc[ii, 'x'])][ 372 | myint(df.loc[ii, 'y'])] != env.OB and \ 373 | env.mapob[myint(df.loc[i, 'x'] + (dx / 2))][myint(df.loc[i, 'y'] + (dy / 2))] != env.OB and \ 374 | env.mapob[myint(df.loc[i, 'x'] + (dx / 3))][myint(df.loc[i, 'y'] + (dy / 3))] != env.OB and \ 375 | env.mapob[myint(df.loc[i, 'x'] + (2 * dx / 3))][ 376 | myint(df.loc[i, 'y'] + (2 * dy / 3))] != env.OB and \ 377 | env.mapob[myint(df.loc[i, 'x'] + (dx / 4))][myint(df.loc[i, 'y'] + (dy / 4))] != env.OB and \ 378 | env.mapob[myint(df.loc[i, 'x'] + (3 * dx / 4))][myint(df.loc[i, 'y'] + (3 * dy / 4))] != env.OB: 379 | i = ii 380 | else: 381 | env = __cusume_energy(env, n, 0, env.maxdistance) # move 382 | newx = df.loc[i, 'x'] + (df.loc[ii, 'x'] - df.loc[i, 'x']) * (env.maxdistance / distance) 383 | newy = df.loc[i, 'y'] + (df.loc[ii, 'y'] - df.loc[i, 'y']) * (env.maxdistance / distance) 384 | 385 | dx = newx - df.loc[i, 'x'] 386 | dy = newy - df.loc[i, 'y'] 387 | if 0 <= newx < env.mapx and 0 <= newy < env.mapy and \ 388 | env.mapob[myint(newx)][myint(newy)] != env.OB and \ 389 | env.mapob[myint(df.loc[i, 'x'] + (dx / 2))][myint(df.loc[i, 'y'] + (dy / 2))] != env.OB and \ 390 | env.mapob[myint(df.loc[i, 'x'] + (dx / 3))][myint(df.loc[i, 'y'] + (dy / 3))] != env.OB and \ 391 | env.mapob[myint(df.loc[i, 'x'] + (2 * dx / 3))][ 392 | myint(df.loc[i, 'y'] + (2 * dy / 3))] != env.OB and \ 393 | env.mapob[myint(df.loc[i, 'x'] + (dx / 4))][myint(df.loc[i, 'y'] + (dy / 4))] != env.OB and \ 394 | env.mapob[myint(df.loc[i, 'x'] + (3 * dx / 4))][myint(df.loc[i, 'y'] + (3 * dy / 4))] != env.OB: 395 | df.loc[i, 'x'] = newx 396 | df.loc[i, 'y'] = newy 397 | step += 1 398 | 399 | print('efficiency: %.3f' % env.efficiency) 400 | print('data_collection_ratio: %.3f' % (1.0 - env.leftrewards)) 401 | print('fairness: %.3f' % env.collection_fairness) 402 | print('normal fairness: %.3f' % env.normal_collection_fairness) 403 | print('energy_consumption: %.3f' % (np.sum(env.normal_use_energy))) 404 | print('fill:', env.fills_energy_remain) 405 | 406 | 407 | if __name__ == '__main__': 408 | num_uav = 5 409 | # train(num_uav=num_uav) 410 | test(num_uav=num_uav, model_path='/home/dzp1997/PycharmProjects/maddpg-czy-DZP/experiments/2019/06-29/uav5') 411 | -------------------------------------------------------------------------------- /experiments/compare.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def error(input_list): 6 | input = np.array(input_list) 7 | input = input.transpose((1, 0)) 8 | error_low = input[0] - input[1] 9 | error_high = input[2] - input[0] 10 | error = [] 11 | error.append(error_low) 12 | error.append(error_high) 13 | return error 14 | 15 | 16 | def average(input_list): 17 | input = np.array(input_list) 18 | input = input.transpose((1, 0)) 19 | return input[0] 20 | 21 | 22 | def compare_plot_errorbar(xlabel, ylabel, x, eDivert, woApeX, woRNN, MADDPG): 23 | plt.xlabel(xlabel) 24 | plt.ylabel(ylabel) 25 | plt.errorbar(x=x, y=average(eDivert), yerr=error(eDivert), 
fmt='r-o', label='e-Divert', capsize=4) 26 | plt.errorbar(x=x, y=average(woApeX), yerr=error(woApeX), fmt='g-^', label='e-Divert w/o Ape-X', capsize=4) 27 | plt.errorbar(x=x, y=average(woRNN), yerr=error(woRNN), fmt='m-<', label='e-Divert w/o RNN', capsize=4) 28 | plt.errorbar(x=x, y=average(MADDPG), yerr=error(MADDPG), fmt='k-*', label='MADDPG', capsize=4) 29 | 30 | plt.ylim(ymin=0, ymax=1) 31 | plt.grid(True) 32 | plt.grid(linestyle='--') 33 | plt.legend() 34 | plt.show() 35 | 36 | 37 | def compare_plot(xlabel, ylabel, x,yrange, eDivert, woApeX, woRNN, MADDPG): 38 | plt.figure(figsize=(15, 20)) 39 | plt.xlabel(xlabel,fontsize=32) 40 | plt.ylabel(ylabel,fontsize=32) 41 | plt.xticks(fontsize=32) 42 | plt.yticks(fontsize=32) 43 | plt.plot(x,eDivert, color='b', marker='o', label='e-Divert',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 44 | plt.plot(x, woApeX, color='g',marker='^', label='e-Divert w/o Ape-X',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 45 | plt.plot(x, woRNN, color='m',marker='d', label='e-Divert w/o RNN',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 46 | plt.plot(x, MADDPG, color='k',marker='s', label='MADDPG',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 47 | # plt.plot(x,[3.62,4.62,5.62,6.62,7.62],color='red',linestyle='--',label="Maximum used energy",linewidth=4) 48 | 49 | plt.xticks(x,x) 50 | # plt.axhline(y=4.62, color='red', linestyle='--', label="Maximum used energy",linewidth=4) 51 | plt.ylim(yrange[0],yrange[1]) 52 | plt.grid(True) 53 | plt.grid(linestyle='--') 54 | plt.legend(loc='lower right',fontsize=22) 55 | plt.show() 56 | 57 | 58 | if __name__ == '__main__': 59 | # collection-range 60 | compare_plot(xlabel="Sensing range (unit)", 61 | ylabel="Data collection ratio", 62 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 63 | yrange=[0,1], 64 | eDivert=[0.706, 0.874, 0.916, 0.936, 0.952], 65 | woApeX=[0.584, 0.70, 0.871, 0.906, 0.949], 66 | woRNN=[0.205, 0.41, 0.463, 0.569, 0.722], 67 | MADDPG=[0.139, 0.245, 0.323, 0.360, 0.439], 68 | ) 69 | 70 | 71 | # fairness_range 72 | compare_plot(xlabel="Sensing range (unit)", 73 | ylabel="Geographical fairness", 74 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 75 | yrange=[0,1], 76 | eDivert=[0.784,0.909,0.936,0.951,0.970], 77 | woApeX=[0.675,0.729,0.903,0.935,0.963], 78 | woRNN=[0.294,0.467,0.573,0.650,0.777], 79 | MADDPG=[0.168,0.293,0.382,0.409,0.5], 80 | ) 81 | # # # 82 | # # energy_range 83 | # compare_plot(xlabel="Sensing range (unit)", 84 | # ylabel="Energy usage (# of full batteries)", 85 | # x=[0.6, 0.8, 1.0, 1.2, 1.4], 86 | # yrange=[0,5], 87 | # eDivert=[3.45,4.086,3.89,3.918,3.9], 88 | # woApeX=[3.39,3.588,4.617,4.43,4.48], 89 | # woRNN=[1.395,2.514,3.188,3.113,4.113], 90 | # MADDPG=[1.792,2.201,2.545,2.547,3.027], 91 | # ) 92 | 93 | # efficiency_range 94 | compare_plot(xlabel="Sensing range (unit)", 95 | ylabel="Energy efficiency", 96 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 97 | yrange=[-0.04,0.2], 98 | eDivert=[0.129,0.155,0.179,0.182,0.193], 99 | woApeX=[0.092,0.118,0.139,0.153,0.165], 100 | woRNN=[0.033,0.062,0.063,0.097,0.108], 101 | MADDPG=[0.011,0.027,0.039,0.048,0.058], 102 | ) 103 | 104 | # collection_uav 105 | compare_plot(xlabel="No. 
of vehicles", 106 | ylabel="Data collection ratio", 107 | x=[1, 2, 3, 4, 5], 108 | yrange=[0,1], 109 | eDivert=[0.88,0.943,0.916,0.912,0.911], 110 | woApeX=[0.769,0.871,0.746,0.738,0.764], 111 | woRNN=[0.842,0.722,0.636,0.682,0.772], 112 | MADDPG=[0.401,0.383,0.415,0.478,0.269], 113 | ) 114 | 115 | # fairness_uav 116 | compare_plot(xlabel="No. of vehicles", 117 | ylabel="Geographical fairness", 118 | x=[1, 2, 3, 4, 5], 119 | yrange=[0,1], 120 | eDivert=[0.912,0.958,0.943,0.944,0.935], 121 | woApeX=[0.814,0.902,0.795,0.790,0.819], 122 | woRNN=[0.874,0.777,0.714,0.732,0.815], 123 | MADDPG=[0.500,0.431,0.463,0.537,0.338], 124 | ) 125 | # # 126 | # # energy_uav 127 | # compare_plot(xlabel="No. of vehicles", 128 | # ylabel="Energy usage (# of full batteries)", 129 | # x=[1, 2, 3, 4, 5], 130 | # yrange=[1,8], 131 | # eDivert=[3.576,4,4.004,4.402,4.668], 132 | # woApeX=[3.244,4.273,4.562,5.156,5.953], 133 | # woRNN=[3.42,4.113,4.496,5.613,6.45], 134 | # MADDPG=[1.853,2.695,3.543,4.44,5.08], 135 | # ) 136 | 137 | # efficiency_uav 138 | compare_plot(xlabel="No. of vehicles", 139 | ylabel="Energy efficiency", 140 | x=[1, 2, 3, 4, 5], 141 | yrange=[-0.04,0.2], 142 | eDivert=[0.182,0.181,0.179,0.158,0.149], 143 | woApeX=[0.155,0.150,0.104,0.091,0.083], 144 | woRNN=[0.174,0.108,0.080,0.080,0.080], 145 | MADDPG=[0.085,0.050,0.045,0.046,0.015], 146 | ) 147 | 148 | # collection_fill 149 | compare_plot(xlabel="Charging proportion (%)", 150 | ylabel="Data collection ratio", 151 | x=[10, 20, 30, 40, 50], 152 | yrange=[0,1], 153 | eDivert=[0.927,0.911,0.937,0.905,0.939], 154 | woApeX=[0.736,0.766,0.761,0.791,0.838], 155 | woRNN=[0.638,0.702,0.713,0.680,0.672], 156 | MADDPG=[0.305,0.354,0.393,0.392,0.369], 157 | ) 158 | 159 | # fairness_fill 160 | compare_plot(xlabel="Charging proportion (%)", 161 | ylabel="Geographical fairness", 162 | x=[10, 20, 30, 40, 50], 163 | yrange=[0,1], 164 | eDivert=[0.951,0.935,0.958,0.941,0.959], 165 | woApeX=[0.804,0.829,0.821,0.843,0.880], 166 | woRNN=[0.704,0.745,0.776,0.722,0.727], 167 | MADDPG=[0.360,0.425,0.436,0.421,0.431], 168 | ) 169 | 170 | # # energy_fill 171 | # compare_plot(xlabel="Charging proportion (%)", 172 | # ylabel="Energy usage (# of full batteries)", 173 | # x=[10, 20, 30, 40, 50], 174 | # yrange=[0,5], 175 | # eDivert=[4.023,3.844,3.926,3.73,4], 176 | # woApeX=[3.463,3.771,3.74,3.889,4.348], 177 | # woRNN=[2.844,3.184,3.457,3.066,3.064], 178 | # MADDPG=[2.15,2.285,2.342,2.3,2.244], 179 | # ) 180 | # 181 | # efficiency_fill 182 | compare_plot(xlabel="Charging proportion (%)", 183 | ylabel="Energy efficiency", 184 | x=[10, 20, 30, 40, 50], 185 | yrange=[0,0.3], 186 | eDivert=[0.180,0.180,0.185,0.185,0.184], 187 | woApeX=[0.138,0.136,0.137,0.141,0.139], 188 | woRNN=[0.132,0.132,0.131,0.131,0.132], 189 | MADDPG=[0.044,0.055,0.059,0.061,0.057], 190 | ) 191 | 192 | # collection_station 193 | compare_plot(xlabel="No. of charging stations", 194 | ylabel="Data collection ratio", 195 | x=[1,2,3,4,5], 196 | yrange=[0,1], 197 | eDivert=[0.819,0.865,0.911,0.905,0.943], 198 | woApeX=[0.461,0.680,0.795,0.874,0.871], 199 | woRNN=[0.480,0.684,0.702,0.649,0.688], 200 | MADDPG=[0.366,0.366,0.332,0.336,0.371], 201 | ) 202 | 203 | # fairness_station 204 | compare_plot(xlabel="No. 
of charging stations", 205 | ylabel="Geographical fairness", 206 | x=[1, 2, 3, 4, 5], 207 | yrange=[0,1], 208 | eDivert=[0.865,0.906,0.935,0.934,0.958], 209 | woApeX=[0.526,0.734,0.851,0.903,0.902], 210 | woRNN=[0.547,0.710,0.745,0.694,0.758], 211 | MADDPG=[0.411,0.415,0.415,0.392,0.423], 212 | ) 213 | # 214 | # # energy_station 215 | # compare_plot(xlabel="No. of charging stations", 216 | # ylabel="Energy usage (# of full batteries)", 217 | # x=[1, 2, 3, 4, 5], 218 | # yrange=[0,5], 219 | # eDivert=[1.993,3.537,3.844,3.773,4], 220 | # woApeX=[2.092,3.135,3.855,4.383,4.273], 221 | # woRNN=[2.09,3.041,3.184,3.05,3.98], 222 | # MADDPG=[2.016,2.203,2.264,2.473,2.693], 223 | # ) 224 | 225 | # efficiency_station 226 | compare_plot(xlabel="No. of charging stations", 227 | ylabel="Energy efficiency", 228 | x=[1, 2, 3, 4, 5], 229 | yrange=[-0.04,0.2], 230 | eDivert=[0.138,0.177,0.180,0.181,0.181], 231 | woApeX=[0.093,0.128,0.142,0.148,0.150], 232 | woRNN=[0.101,0.126,0.132,0.119,0.104], 233 | MADDPG=[0.063,0.055,0.048,0.047,0.048], 234 | ) 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /experiments/crazy_env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/experiments/crazy_env/__init__.py -------------------------------------------------------------------------------- /experiments/crazy_env/env_setting3.py: -------------------------------------------------------------------------------- 1 | class Setting(object): 2 | def __init__(self, log): 3 | self.V = { 4 | 'MAP_X': 16, 5 | 'MAP_Y': 16, 6 | 'MAX_VALUE': 1., 7 | 'MIN_VALUE': 0., 8 | 'OBSTACLE': [ # todo:OBSTACLE 9 | [0, 3, 1, 1], 10 | [2, 9, 2, 1], 11 | [1, 3, 1, 2], 12 | [2, 15, 2, 1], 13 | [2, 0, 1, 1], 14 | [4, 4, 1, 1], 15 | [5, 4, 1,3], 16 | [5, 11, 1, 3], 17 | [10, 0, 3, 1], 18 | [10, 1, 1, 1], 19 | [10, 5, 1, 3], 20 | [8, 10, 3, 1], 21 | [9, 15, 1, 1], 22 | [13, 6, 1, 2], 23 | [13, 13, 1, 2], 24 | [12, 15, 4, 1], 25 | [15, 10, 1, 1] 26 | ], 27 | 'CHANNEL': 3, 28 | 29 | 'NUM_UAV': 2, # TODO:无人机个数 30 | 'INIT_POSITION': (0, 8, 8), 31 | 'MAX_ENERGY': 50., # TODO: 初始能量 32 | 'NUM_ACTION': 2, # 2 33 | 'SAFE_ENERGY_RATE': 0.2, 34 | 'RANGE': 1.1, # TODO:采集范围 35 | 'MAXDISTANCE': 1., 36 | 'COLLECTION_PROPORTION': 0.2, # c speed # TODO: 采集速度 37 | 'FILL_PROPORTION': 0.2, # fill speed # TODO:充电速度 38 | 39 | 'WALL_REWARD': -1., 40 | 'VISIT': 1. 
/ 1000., 41 | 'DATA_REWARD': 1., 42 | 'FILL_REWARD': 1., 43 | 'ALPHA': 1., 44 | 'BETA': 0.1, 45 | 'EPSILON': 1e-4, 46 | 'NORMALIZE': .1, 47 | 'FACTOR': 0.1, 48 | } 49 | self.LOG = log 50 | self.time = log.time 51 | 52 | def log(self): 53 | self.LOG.log(self.V) 54 | -------------------------------------------------------------------------------- /experiments/crazy_env/log3.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | class Log(object): 8 | def __init__(self): 9 | self.time = str(time.strftime("%Y/%m-%d/%H-%M-%S", time.localtime())) 10 | self.full_path = os.path.join('.', self.time) 11 | self.choose_color = ['blue', 'green', 'purple', 'red'] 12 | if os.path.exists(self.full_path): 13 | self.full_path = os.path.join(self.full_path, '*') 14 | else: 15 | pass 16 | 17 | os.makedirs(self.full_path) 18 | self.file_path = self.full_path + '/REPORT.txt' 19 | file = open(self.file_path, 'x') 20 | file.close() 21 | 22 | def log(self, values): 23 | if isinstance(values, dict): 24 | with open(self.file_path, 'a') as file: 25 | for key, value in values.items(): 26 | print(key, value, file=file) 27 | elif isinstance(values, list): 28 | with open(self.file_path, 'a') as file: 29 | for value in values: 30 | print(value, file=file) 31 | else: 32 | with open(self.file_path, 'a') as file: 33 | print(values, file=file) 34 | 35 | def circle(self, x, y, r, color='red', count=100): 36 | xarr = [] 37 | yarr = [] 38 | for i in range(count): 39 | j = float(i) / count * 2 * np.pi 40 | xarr.append(x + r * np.cos(j)) 41 | yarr.append(y + r * np.sin(j)) 42 | plt.plot(xarr, yarr, c=color, linewidth=2) 43 | 44 | def draw_path(self, env, env_i, meaningful_fill, meaningful_get): 45 | full_path = os.path.join(self.full_path, 'Path') 46 | if not os.path.exists(full_path): 47 | os.makedirs(full_path) 48 | xxx = [] 49 | colors = [] 50 | for x in range(env.mapx): 51 | xxx.append((x, 1)) 52 | for y in range(env.mapy): 53 | c = [] 54 | for x in range(env.mapx): 55 | # 1 represents obstacle,0 is blank 56 | if env.mapob[x][y] == 1: 57 | c.append((0, 0, 0, 1)) 58 | else: 59 | c.append((1, 1, 1, 1)) 60 | colors.append(c) 61 | 62 | Fig = plt.figure(figsize=(5, 5)) 63 | PATH = np.array(env.trace) 64 | ENERGY_PATH = np.array(env.energytrace) 65 | 66 | for i1 in range(env.mapy): 67 | plt.broken_barh(xxx, (i1, 1), facecolors=colors[i1]) 68 | 69 | plt.scatter(env.datas[:, 0], env.datas[:, 1], c=env.DATAs[:, 2], marker="s") 70 | 71 | for i in range(env.n): 72 | # M = Fig.add_subplot(1, 1, i + 1) 73 | plt.ylim(ymin=0, ymax=env.mapy) 74 | plt.xlim(xmin=0, xmax=env.mapx) 75 | color = self.choose_color[i] 76 | plt.plot(PATH[i, :, 1], PATH[i, :, 2], color=color) 77 | for j in range(len(PATH[i])): 78 | if PATH[i, j, 0] >= 0: 79 | plt.scatter(PATH[i, j, 1], PATH[i, j, 2], color=color, marker=".", norm=ENERGY_PATH[i]) 80 | else: 81 | plt.scatter(PATH[i, j, 1], PATH[i, j, 2], color=color, marker="+", norm=ENERGY_PATH[i]) 82 | # grid line 83 | plt.grid(True, linestyle='-.', color='black') 84 | # title 85 | plt.title('Meaningful Get:' + str(meaningful_get) + '\nMeaningful Fill:' + str( 86 | meaningful_fill) + '\nLeft Reward=' + str(env.leftrewards) + ' ( NAIVE VERSION^_^ )') 87 | 88 | plt.scatter(env.fills[:, 0], env.fills[:, 1], c='red', marker="*") 89 | for (x, y) in zip(env.fills[:, 0], env.fills[:, 1]): 90 | self.circle(x, y, env.crange) 91 | Fig.savefig(full_path + '/path_' + str(env_i) + '.png') 92 | 93 | 
plt.close() 94 | 95 | def step_information(self, action_n, env, step, env_i, meaningful_fill, meaningful_get, indicator): # -1 fill,1 get 96 | full_path = os.path.join(self.full_path, 'Path') 97 | if not os.path.exists(full_path): 98 | os.makedirs(full_path) 99 | debug_filename = full_path + '/path_' + str(env_i) + '.txt' 100 | 101 | with open(debug_filename, 'a+') as file: 102 | print("\nStep ", step, ":", file=file) 103 | for i in range(env.n): 104 | if indicator[i] == -1: 105 | print("UAV_", i, "------", "Decision: Filling ", env.tmp_energy[i], " energy,current Energy: ", 106 | env.energy[i], ", Reward: ", env.reward[i], ", Penalty: ", env.tmp_penalty[i], 107 | "\n\t\tAction detail:", action_n[i], " Station-energy Remain:", env.fills_energy_remain, "\n", 108 | file=file) 109 | 110 | if env.tmp_energy[i] > 0: 111 | meaningful_fill[i] += 1 112 | else: 113 | print("UAV_", i, "------", "Decision: Getting ", env.tmp_value[i], " POI,current Energy: ", 114 | env.energy[i], ", Reward: ", env.reward[i], ", Penalty: ", env.tmp_penalty[i], 115 | "\n\t\tAction detail:", action_n[i], " Station-energy Remain:", env.fills_energy_remain, "\n", 116 | file=file) 117 | if env.tmp_value[i] > 0: 118 | meaningful_get[i] += 1 119 | -------------------------------------------------------------------------------- /experiments/crazy_env/tsp_data_collection.py: -------------------------------------------------------------------------------- 1 | from experiments.crazy_env.tsp_env_setting import Setting 2 | from experiments.image.mapM import MapM 3 | import os 4 | import copy 5 | from os.path import join as pjoin 6 | import numpy as np 7 | import time 8 | import cv2 9 | import math 10 | from gym import spaces 11 | 12 | 13 | def mypjoin(path1, path2, paths=None): 14 | full_path = pjoin(path1, path2) 15 | if not os.path.exists(full_path): 16 | os.mkdir(full_path) 17 | if paths is not None: 18 | full_path = pjoin(full_path, paths) 19 | if not os.path.exists(full_path): 20 | os.mkdir(full_path) 21 | return full_path 22 | 23 | 24 | def myint(a): 25 | # return int(np.ceil(a)) 26 | return int(np.floor(a)) 27 | 28 | 29 | class Env(object): 30 | def __init__(self, log): 31 | # self.tr = tracker.SummaryTracker() 32 | self.sg = Setting(log) 33 | self.sg.log() 34 | 35 | # 6-19 00:42 36 | self.maxaction = 0 37 | self.minaction = 0 38 | # 39 | 40 | self.log_dir = log.full_path 41 | # self.log_dir = mypjoin('.', self.sg.time) 42 | # basis 43 | self.mapx = self.sg.V['MAP_X'] # 16 44 | self.mapy = self.sg.V['MAP_Y'] # 16 45 | self.map = MapM(self.log_dir) # [80,80] 46 | self.channel = self.sg.V['CHANNEL'] # 3 47 | self.image_data = None 48 | self.image_position = None 49 | self.safe_energy_rate = self.sg.V['SAFE_ENERGY_RATE'] # 0.1 50 | 51 | # num of uavs 52 | self.n = self.sg.V['NUM_UAV'] 53 | 54 | # [[80.80,3]] 55 | # Box用于实现连续数据构成的空间,其中包含两组参数:空间内数据范围(上限和下限),以及空间维度的大小 56 | self.observation_space = [spaces.Box(low=-1, high=1, shape=(self.map.width, self.map.height, self.channel)) for 57 | i in range(self.n)] 58 | 59 | # [[2]] 60 | # TODO:去掉了action-state(<0,>0),只留下 delta x, delta y 61 | self.action_space = [spaces.Box(low=-1, high=1, shape=(self.sg.V['NUM_ACTION'],)) for i in range(self.n)] 62 | 63 | self.maxenergy = self.sg.V['MAX_ENERGY'] # 100 64 | self.crange = self.sg.V['RANGE'] # 1.1 65 | self.maxdistance = self.sg.V['MAXDISTANCE'] # 1.0 66 | self.cspeed = np.float16(self.sg.V['COLLECTION_PROPORTION']) # 0.2 67 | self.fspeed = np.float16(self.sg.V['FILL_PROPORTION']) # 0.1 68 | self.alpha = self.sg.V['ALPHA'] # 1.0 69 | 
self.beta = self.sg.V['BETA'] # 0.1 70 | self.track = 1. / 1000. 71 | 72 | # ---- 6-8 14:48 add factor 73 | self.factor = self.sg.V['FACTOR'] 74 | self.epsilon = self.sg.V['EPSILON'] 75 | self.normalize = self.sg.V['NORMALIZE'] 76 | 77 | # mapob [16,16] 78 | self.mapob = np.zeros((self.mapx, self.mapy)).astype(np.int8) 79 | 80 | """ 81 | Initial Obstacles 82 | """ 83 | # obstacles 84 | self.OB = 1 85 | obs = self.sg.V['OBSTACLE'] 86 | 87 | # draw obstacles in mapob[16,16], the obstacle is 1, others is 0 88 | for i in obs: 89 | for x in range(i[0], i[0] + i[2], 1): 90 | for y in range(i[1], i[1] + i[3], 1): 91 | self.mapob[x][y] = self.OB 92 | # reward 93 | self.pwall = self.sg.V['WALL_REWARD'] # -1 94 | 95 | """ 96 | Initial POI(data) 97 | """ 98 | # POI [256,3] 3->[x,y,value] 99 | test = [[1.5454101562e-01, 2.2583007812e-02, 6.5332031250e-01], 100 | [2.1936035156e-01, 2.1618652344e-01, 8.2568359375e-01], 101 | [3.3813476562e-01, 4.4738769531e-02, 6.6406250000e-02], 102 | [6.5478515625e-01, 6.5429687500e-01, 8.7280273438e-02], 103 | [6.9970703125e-01, 7.5000000000e-01, 4.6923828125e-01], 104 | [3.2177734375e-01, 4.9145507812e-01, 8.8769531250e-01], 105 | [6.0595703125e-01, 8.5449218750e-01, 1.0772705078e-01], 106 | [7.1679687500e-01, 1.1370849609e-01, 5.3759765625e-01], 107 | [7.3046875000e-01, 9.5800781250e-01, 3.6157226562e-01], 108 | [9.7656250000e-01, 4.9365234375e-01, 2.5732421875e-01], 109 | [1.4416503906e-01, 7.8320312500e-01, 7.1679687500e-01], 110 | [7.1435546875e-01, 2.1618652344e-01, 4.7070312500e-01], 111 | [1.3830566406e-01, 6.8310546875e-01, 6.7675781250e-01], 112 | [6.2304687500e-01, 1.4045715332e-02, 4.3017578125e-01], 113 | [9.2919921875e-01, 9.7460937500e-01, 5.6494140625e-01], 114 | [9.5996093750e-01, 3.4423828125e-02, 1.2927246094e-01], 115 | [5.4443359375e-01, 7.9199218750e-01, 3.7622070312e-01], 116 | [4.6777343750e-01, 5.4394531250e-01, 7.2753906250e-01], 117 | [4.7558593750e-01, 7.0898437500e-01, 7.6562500000e-01], 118 | [8.5205078125e-01, 4.8364257812e-01, 3.9965820312e-01], 119 | [7.1240234375e-01, 1.6027832031e-01, 5.7421875000e-01], 120 | [4.7460937500e-01, 9.8937988281e-02, 3.8500976562e-01], 121 | [6.1914062500e-01, 1.2841796875e-01, 1.4758300781e-01], 122 | [6.7773437500e-01, 5.8593750000e-02, 5.6689453125e-01], 123 | [5.2099609375e-01, 1.2927246094e-01, 1.6943359375e-01], 124 | [3.0737304688e-01, 9.3066406250e-01, 9.1845703125e-01], 125 | [1.7565917969e-01, 9.7802734375e-01, 4.3847656250e-01], 126 | [4.1040039062e-01, 8.9794921875e-01, 2.6123046875e-01], 127 | [6.5234375000e-01, 6.9580078125e-01, 6.5429687500e-01], 128 | [9.8046875000e-01, 4.0161132812e-01, 5.4003906250e-01], 129 | [6.2597656250e-01, 7.5244140625e-01, 8.1640625000e-01], 130 | [5.6762695312e-02, 7.7734375000e-01, 2.2973632812e-01], 131 | [9.0380859375e-01, 6.3720703125e-01, 8.8183593750e-01], 132 | [5.9326171875e-01, 5.8740234375e-01, 7.3339843750e-01], 133 | [2.6318359375e-01, 6.7480468750e-01, 3.6206054688e-01], 134 | [2.6245117188e-01, 5.3613281250e-01, 3.1201171875e-01], 135 | [5.5468750000e-01, 3.2397460938e-01, 5.8496093750e-01], 136 | [9.3896484375e-01, 6.6601562500e-01, 2.0996093750e-02], 137 | [1.3537597656e-01, 2.8100585938e-01, 1.8847656250e-01], 138 | [9.5507812500e-01, 8.2421875000e-01, 6.2890625000e-01], 139 | [4.3505859375e-01, 9.8046875000e-01, 7.4169921875e-01], 140 | [4.8559570312e-01, 4.9853515625e-01, 2.4414062500e-01], 141 | [6.8457031250e-01, 2.5073242188e-01, 4.5385742188e-01], 142 | [5.1025390625e-01, 8.9990234375e-01, 6.6601562500e-01], 143 | [6.6992187500e-01, 
6.2011718750e-01, 6.6552734375e-01], 144 | [5.0292968750e-02, 8.3496093750e-01, 6.7968750000e-01], 145 | [7.8808593750e-01, 1.5332031250e-01, 9.0429687500e-01], 146 | [8.2128906250e-01, 7.9833984375e-01, 4.6142578125e-01], 147 | [3.0059814453e-02, 7.8125000000e-01, 4.9951171875e-01], 148 | [1.9006347656e-01, 7.3144531250e-01, 4.3994140625e-01], 149 | [8.3544921875e-01, 4.3237304688e-01, 8.6279296875e-01], 150 | [7.3437500000e-01, 9.9548339844e-02, 1.8688964844e-01], 151 | [2.6074218750e-01, 9.1699218750e-01, 5.9814453125e-01], 152 | [8.1689453125e-01, 1.9482421875e-01, 9.2675781250e-01], 153 | [8.7500000000e-01, 2.7221679688e-01, 7.4707031250e-01], 154 | [7.4121093750e-01, 6.7529296875e-01, 9.1601562500e-01], 155 | [9.3066406250e-01, 6.2207031250e-01, 8.2568359375e-01], 156 | [5.1220703125e-01, 1.7529296875e-01, 1.3122558594e-01], 157 | [8.9794921875e-01, 3.0053710938e-01, 8.1591796875e-01], 158 | [2.6953125000e-01, 6.9824218750e-01, 1.1224365234e-01], 159 | [7.1386718750e-01, 6.3134765625e-01, 1.3537597656e-01], 160 | [6.8066406250e-01, 6.5673828125e-01, 5.0195312500e-01], 161 | [5.4248046875e-01, 1.5234375000e-01, 1.6955566406e-01], 162 | [5.7568359375e-01, 1.5124511719e-01, 8.9599609375e-01], 163 | [1.7065429688e-01, 8.4411621094e-02, 2.5708007812e-01], 164 | [8.6474609375e-01, 2.2229003906e-01, 9.2675781250e-01], 165 | [9.3701171875e-01, 5.1849365234e-02, 3.6474609375e-01], 166 | [8.1298828125e-01, 7.8564453125e-01, 6.2402343750e-01], 167 | [4.1503906250e-01, 5.9423828125e-01, 5.0537109375e-01], 168 | [3.4179687500e-01, 4.7802734375e-01, 8.8818359375e-01], 169 | [3.9306640625e-01, 5.1074218750e-01, 3.0981445312e-01], 170 | [8.0566406250e-01, 1.6113281250e-01, 4.4848632812e-01], 171 | [8.8134765625e-02, 9.7705078125e-01, 8.5742187500e-01], 172 | [2.1984863281e-01, 7.5048828125e-01, 5.2978515625e-01], 173 | [8.5839843750e-01, 8.5058593750e-01, 4.6582031250e-01], 174 | [6.6259765625e-01, 6.6992187500e-01, 6.4404296875e-01], 175 | [8.7500000000e-01, 9.2138671875e-01, 3.1982421875e-01], 176 | [4.5800781250e-01, 5.3076171875e-01, 3.9868164062e-01], 177 | [5.2148437500e-01, 9.7705078125e-01, 8.2617187500e-01], 178 | [2.3986816406e-01, 5.0488281250e-01, 6.6650390625e-01]] 179 | 180 | # DATA shape:256*3 181 | self.DATAs = np.reshape(test, (-1, 3)).astype(np.float16) 182 | 183 | # # #TODO:调点 184 | # dx = [-0.2, -0.2, -0.2, 0, 0, 0, 0.2, 0.2, 0.2] 185 | # dy = [-0.2, 0, 0.2, -0.2, 0, 0.2, -0.2, 0, 0.2] 186 | # # replace the POI in obstacle position with the POI out of obstacle position 187 | # for index in range(self.DATAs.shape[0]): 188 | # need_adjust = True 189 | # while need_adjust: 190 | # need_adjust = False 191 | # for i in range(len(dx)): 192 | # if self.mapob[min(myint(self.DATAs[index][0] * self.mapx + dx[i]), self.mapx - 1)][ 193 | # min(myint(self.DATAs[index][1] * self.mapy + dy[i]), self.mapy - 1)] == self.OB: 194 | # need_adjust = True 195 | # break 196 | # if need_adjust is True: 197 | # self.DATAs[index] = np.random.rand(3).astype(np.float16) 198 | # 199 | # for i, poi_i in enumerate(self.DATAs): 200 | # if i == 0: 201 | # print("[[%.10e,%.10e,%.10e]," % (poi_i[0], poi_i[1], poi_i[2])) 202 | # elif i == len(self.DATAs) - 1: 203 | # print("[%.10e,%.10e,%.10e]]\n" % (poi_i[0], poi_i[1], poi_i[2])) 204 | # else: 205 | # print("[%.10e,%.10e,%.10e]," % (poi_i[0], poi_i[1], poi_i[2])) 206 | 207 | # POI data value [256] 208 | self._mapmatrix = copy.copy(self.DATAs[:, 2]) 209 | 210 | # POI data Position [256,2] 211 | self.datas = self.DATAs[:, 0:2] * self.mapx 212 | 213 | # sum of all 
POI data values 214 | self.totaldata = np.sum(self.DATAs[:, 2]) 215 | log.log(self.DATAs) 216 | 217 | """ 218 | Initial Fill Station 219 | """ 220 | # TODO:加入加油站的有限油量 221 | station = [ 222 | [0.1875, 0.8125, 50], 223 | [0.625, 0.8125, 50], 224 | [0.5, 0.5, 50], 225 | [0.375, 0.125, 50], 226 | [0.875, 0.25, 50] 227 | ] 228 | 229 | self.FILL = np.reshape(station, (-1, 3)).astype(np.float16) 230 | 231 | # Fill Station Position [5,2] 232 | self.fills = self.FILL[:, 0:2] * self.mapx 233 | 234 | # Fill Station remain energy [5] 235 | self.fills_energy_remain = copy.copy(self.FILL[:, 2]) 236 | 237 | # sum of all FIll Station remain energy 238 | self.total_fills_energy_remain = np.sum(self.FILL[:, 2]) 239 | 240 | log.log(self.FILL) 241 | 242 | """ 243 | Initial image information 244 | """ 245 | # [80,80] 246 | self._image_data = np.zeros((self.map.width, self.map.height)).astype(np.float16) 247 | 248 | # [n,80,80] 249 | self._image_position = np.zeros((self.sg.V['NUM_UAV'], self.map.width, self.map.height)).astype(np.float16) 250 | 251 | # [80,80] 252 | self._image_access = np.zeros((self.map.width, self.map.height)).astype(np.float16) 253 | 254 | # empty wall 255 | # draw walls in the border of the map (self._image_data) 256 | # the value of the wall is -1 257 | # the width of the wall is 4, which can be customized in image/flag.py 258 | # after adding four wall borders, the shape of the map is still [80,80] 259 | self.map.draw_wall(self._image_data) 260 | 261 | # PoI 262 | # draw PoIs in the map (self._image_data) 263 | # the position of PoI is [x*4+8,y*4+8] of the [80,80] map, 264 | # where x,y->[0~15] 265 | # the PoI's size is [2,2] in [80,80] map 266 | # the value of PoI in the map is the actual value of PoI (self._mapmatrix[i]) 267 | # PoI value->(0~1) 268 | for i, position in enumerate(self.datas): 269 | self.map.draw_point(position[0], position[1], self._mapmatrix[i], self._image_data) 270 | for obstacle in self.sg.V['OBSTACLE']: 271 | self.map.draw_obstacle(obstacle[0], obstacle[1], obstacle[2], obstacle[3], self._image_data) 272 | 273 | for i_n in range(self.n): 274 | # layer 2 275 | self.map.draw_UAV(self.sg.V['INIT_POSITION'][1], self.sg.V['INIT_POSITION'][2], 1., 276 | self._image_position[i_n]) 277 | for i, position in enumerate(self.fills): 278 | self.map.draw_FillStation(position[0], position[1], self.fills_energy_remain[i], 279 | self._image_position[i_n]) 280 | 281 | # 无人机随机颜色 282 | self.uav_render_color = [] 283 | for i in range(self.n): 284 | self.uav_render_color.append(np.random.randint(low=0, high=255, size=3, dtype=np.uint8)) 285 | 286 | self.pow_list = [] 287 | 288 | def reset(self): 289 | # initialize data map 290 | # tr = tracker.SummaryTracker() 291 | self.mapmatrix = copy.copy(self._mapmatrix) 292 | self.fills_energy_remain = copy.copy(self.FILL[:, 2]) 293 | 294 | # record data access times(per 0.001 default) 295 | self.maptrack = np.zeros(self.mapmatrix.shape) 296 | # ---- 297 | # initialize state(get POI/filling) and positions of uavs 298 | self.uav = [list(self.sg.V['INIT_POSITION']) for i in range(self.n)] 299 | self.eff = [0.] 
* self.n 300 | self.count = 0 301 | self.zero = 0 302 | 303 | self.trace = [[] for i in range(self.n)] 304 | self.energytrace = [[] for i in range(self.n)] 305 | # initialize remaining energy 306 | self.energy = np.ones(self.n).astype(np.float64) * self.maxenergy 307 | # initialize indicators 308 | self.collection = np.zeros(self.n).astype(np.float16) 309 | # energy use 310 | self.use_energy = np.zeros(self.n).astype(np.float16) 311 | # energy fill 312 | self.fill_energy = np.zeros(self.n).astype(np.float16) 313 | # energy max 314 | self.max_energy_array = np.array([self.maxenergy] * self.n).astype(np.float16) 315 | 316 | # walls 317 | self.walls = np.zeros(self.n).astype(np.int16) 318 | 319 | # time 320 | self.time_ = 0 321 | 322 | # initialize images 323 | self.state = self.__init_image() 324 | return self.__get_state() 325 | 326 | def __init_image(self): 327 | self.image_data = copy.copy(self._image_data) 328 | self.image_access = copy.copy(self._image_access) 329 | self.image_position = copy.copy(self._image_position) 330 | self.image_track = np.zeros(self.image_position.shape) 331 | # ---- 332 | state = [] 333 | for i in range(self.n): 334 | image = np.zeros((self.map.width, self.map.height, self.channel)).astype(np.float16) 335 | for width in range(image.shape[0]): 336 | for height in range(image.shape[1]): 337 | # god view 338 | image[width][height][0] = self.image_data[width][height] 339 | image[width][height][1] = self.image_position[i][width][height] 340 | image[width][height][2] = self.image_access[width][height] 341 | state.append(image) 342 | return state 343 | 344 | def __draw_image(self, clear_uav, update_point, update_station, update_track): 345 | # update 3 channels 346 | for n in range(self.n): 347 | for i, value in update_point: 348 | self.map.draw_point(self.datas[i][0], self.datas[i][1], value, self.state[n][:, :, 0]) 349 | for i, value in update_station: 350 | self.map.draw_point(self.fills[i][0], self.fills[i][1], value, self.state[n][:, :, 1]) 351 | self.map.clear_uav(clear_uav[n][1], clear_uav[n][2], self.state[n][:, :, 1]) 352 | self.map.draw_UAV(self.uav[n][1], self.uav[n][2], self.energy[n] / self.maxenergy, self.state[n][:, :, 1]) 353 | 354 | # ---- draw track 355 | for i, value in update_track: 356 | self.map.draw_point(self.datas[i][0], self.datas[i][1], value, self.state[n][:, :, 2]) 357 | 358 | def __get_state(self): 359 | return copy.deepcopy(self.state) 360 | 361 | # TODO: penalty加移动penalty,有待商榷 362 | def __get_reward(self, value, energy, distance, penalty, fairness, fairness_): 363 | factor0 = value / (self.factor * distance + self.alpha * value + self.epsilon) 364 | factor1 = energy / self.maxenergy / (self.factor * distance + self.epsilon) 365 | reward = factor0 + factor1 366 | if value == 0 and energy == 0: # 浪费生命的一步 367 | return penalty - self.normalize * distance 368 | else: 369 | return reward * fairness_ + penalty 370 | 371 | def __get_fairness(self, values): 372 | square_of_sum = np.square(np.sum(values)) 373 | sum_of_square = np.sum(np.square(values)) 374 | if sum_of_square == 0: 375 | return 0. 
376 | jain_fairness_index = square_of_sum / sum_of_square / float(len(values)) 377 | return jain_fairness_index 378 | 379 | def __get_eff1(self, value, distance): 380 | return value / (distance + self.alpha * value + self.epsilon) 381 | 382 | def __cusume_energy1(self, uav, value, distance): 383 | # distance-0.1, alpha-1.0 384 | if (self.factor * distance + self.alpha * value < self.energy[uav]): 385 | self.energy[uav] -= (self.factor * distance + self.alpha * value) 386 | self.use_energy[uav] += (self.factor * distance + self.alpha * value) 387 | else: 388 | self.use_energy[uav] += self.energy[uav] 389 | distance = self.energy[uav] / self.factor 390 | self.energy[uav] = 0 391 | 392 | def __fill_energy1(self, uav): 393 | # fspeed-0.1 394 | if self.energy[uav] + self.fspeed * self.maxenergy <= self.maxenergy: 395 | self.fill_energy[uav] += self.fspeed * self.maxenergy 396 | self.energy[uav] += self.fspeed * self.maxenergy 397 | else: 398 | self.fill_energy[uav] += self.maxenergy - self.energy[uav] 399 | self.energy[uav] = self.maxenergy 400 | 401 | def step(self, actions, indicator=None): 402 | # actions = actions.reshape((2, 3)) 403 | self.count += 1 404 | action = copy.deepcopy(actions) 405 | # 6-20 00:43 406 | if np.max(action) > self.maxaction: 407 | self.maxaction = np.max(action) 408 | # print(self.maxaction) 409 | if np.min(action) < self.minaction: 410 | self.minaction = np.min(action) 411 | # print(self.minaction) 412 | 413 | action = np.clip(action, -1e3, 1e3) 414 | 415 | normalize = self.normalize 416 | 417 | # TODO:梯度爆炸问题不可小觑, 418 | # 遇到nan直接卡掉 419 | for i in range(self.n): 420 | for ii in action[i]: 421 | if np.isnan(ii): 422 | print('Nan. What can I do? do!') 423 | while True: 424 | pass 425 | 426 | reward = [0] * self.n 427 | self.tmp_value = [0] * self.n 428 | self.tmp_energy = [0] * self.n 429 | self.tmp_distance = [0] * self.n 430 | self.tmp_penalty = [0] * self.n 431 | self.dn = [False] * self.n # no energy UAV 432 | update_points = [] # Updated PoI remained data 433 | update_stations = [] # Updated Station remained energy 434 | update_tracks = [] # Updated PoI access times 435 | clear_uav = copy.copy(self.uav) 436 | new_positions = [] 437 | c_f = self.__get_fairness(self.maptrack) 438 | 439 | # update positions of UAVs 440 | for i in range(self.n): 441 | self.trace[i].append(self.uav[i]) 442 | self.energytrace[i].append(self.energy[i] / self.maxenergy) 443 | 444 | # distance is from action(x,y), which is a kind of offset,[minaction,maxaction] 445 | distance = np.sqrt(np.power(action[i][0], 2) + np.power(action[i][1], 2)) 446 | data = 0.0 447 | value = 0.0 448 | energy = 0.0 449 | penalty = 0.0 450 | 451 | # think about distance and energy 452 | # 1.normal and enough energy 453 | # 2.so small 454 | # 3.so large(>maxdistance) enough energy 455 | # 4.so large(>energy) 456 | if distance <= self.maxdistance and self.energy[i] >= self.factor * distance: 457 | new_x = self.uav[i][1] + action[i][0] 458 | new_y = self.uav[i][2] + action[i][1] 459 | else: 460 | maxdistance = self.maxdistance if self.maxdistance <= self.energy[i] else \ 461 | self.energy[i] 462 | # distance>=0.001 463 | if distance <= self.epsilon: 464 | distance = self.epsilon 465 | print("very small.") 466 | new_x = self.uav[i][1] + maxdistance * action[i][0] / distance 467 | new_y = self.uav[i][2] + maxdistance * action[i][1] / distance 468 | distance = maxdistance 469 | 470 | self.__cusume_energy1(i, 0, distance) 471 | 472 | # penalty!! 
473 | # update position 474 | # if normal, save update 475 | # if reach OB or WALL, give negative reward, save original positon 476 | dx = new_x - self.uav[i][1] 477 | dy = new_y - self.uav[i][2] 478 | # TODO:简单的防夸张跳墙 479 | if 0 <= new_x < self.mapx and 0 <= new_y < self.mapy and self.mapob[myint(new_x)][ 480 | myint(new_y)] != self.OB and \ 481 | self.mapob[myint(self.uav[i][1] + (dx / 2))][myint(self.uav[i][2] + (dy / 2))] != self.OB and \ 482 | self.mapob[myint(self.uav[i][1] + (dx / 3))][myint(self.uav[i][2] + (dy / 3))] != self.OB and \ 483 | self.mapob[myint(self.uav[i][1] + (2 * dx / 3))][ 484 | myint(self.uav[i][2] + (2 * dy / 3))] != self.OB and \ 485 | self.mapob[myint(self.uav[i][1] + (dx / 4))][myint(self.uav[i][2] + (dy / 4))] != self.OB and \ 486 | self.mapob[myint(self.uav[i][1] + (3 * dx / 4))][myint(self.uav[i][2] + (3 * dy / 4))] != self.OB: 487 | new_positions.append([0, new_x, new_y]) 488 | else: 489 | new_positions.append([0, self.uav[i][1], self.uav[i][2]]) 490 | penalty += normalize * self.pwall 491 | self.walls[i] += 1 492 | 493 | # TODO:加完了会有惊喜的哈哈哈!!! 494 | if self.energy[i] < self.safe_energy_rate * self.maxenergy: 495 | penalty += -1. * distance * normalize 496 | 497 | # TODO:先看能否加油 498 | # calculate distances between UAV and FillStation points 499 | _pos = np.repeat([new_positions[-1][1:]], [self.fills.shape[0]], axis=0) # just repeat(On) NB! 500 | _minus = self.fills - _pos 501 | _power = np.power(_minus, 2) 502 | _dis = np.sum(_power, axis=1) 503 | __exists_FS = 0 504 | tmp = self.energy[i] 505 | for index, dis in enumerate(_dis): 506 | # sensing Fill Station(crange=1.1) 507 | if np.sqrt(dis) <= self.crange: 508 | __exists_FS = 1 509 | # uodate poi data 510 | if self.fills_energy_remain[index] > 0: 511 | # TODO:加油站的信息更新 512 | if self.fspeed * self.maxenergy <= self.fills_energy_remain[index]: 513 | if self.energy[i] + self.fspeed * self.maxenergy <= self.maxenergy: 514 | self.fill_energy[i] += self.fspeed * self.maxenergy 515 | self.fills_energy_remain[index] -= self.fspeed * self.maxenergy 516 | self.energy[i] += self.fspeed * self.maxenergy 517 | else: 518 | self.fill_energy[i] += self.maxenergy - self.energy[i] 519 | self.fills_energy_remain[index] -= (self.maxenergy - self.energy[i]) 520 | self.energy[i] = self.maxenergy 521 | else: 522 | if self.energy[i] + self.fills_energy_remain[index] <= self.maxenergy: 523 | self.fill_energy[i] += self.fills_energy_remain[index] 524 | self.energy[i] += self.fills_energy_remain[index] 525 | self.fills_energy_remain[index] = 0 526 | else: 527 | self.fill_energy[i] += self.maxenergy - self.energy[i] 528 | self.fills_energy_remain[index] -= (self.maxenergy - self.energy[i]) 529 | self.energy[i] = self.maxenergy 530 | update_stations.append([index, self.fills_energy_remain[index]]) 531 | break 532 | 533 | # 若在加油站范围内则加油,若不在任何一个加油站范围内,则采集数据 534 | if __exists_FS == 1: 535 | new_positions[-1][0] = -1 # 状态标识符置为-1 536 | if indicator is not None: 537 | indicator[i] = -1 538 | # fill energy!! 539 | energy = self.energy[i] - tmp 540 | 541 | 542 | else: 543 | new_positions[-1][0] = 1 # 状态标识符置为1 544 | if indicator is not None: 545 | indicator[i] = 1 546 | # calculate distances between UAV and data points 547 | _pos = np.repeat([new_positions[-1][1:]], [self.datas.shape[0]], axis=0) # just repeat(On) NB! 
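# Vectorised squared distances from the new UAV position to every PoI: tile
# the position, subtract, square and sum per row; the square root is only
# taken inside the sensing-range test below. (Plain NumPy broadcasting would
# avoid the explicit repeat, e.g.
#     _dis = np.sum((self.datas - np.asarray(new_positions[-1][1:])) ** 2, axis=1)
# gives the same result.)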
548 | _minus = self.datas - _pos
549 | _power = np.power(_minus, 2)
550 | _dis = np.sum(_power, axis=1)
551 | for index, dis in enumerate(_dis):
552 | # sensing PoI(crange=1.1)
553 | if np.sqrt(dis) <= self.crange:
554 | self.maptrack[index] += self.track
555 | update_tracks.append([index, self.maptrack[index]]) # update poi access times
556 |
557 | # update PoI data
558 | if self.mapmatrix[index] > 0:
559 | # cspeed is the percentage of a PoI's initial data collected per step
560 | data += self._mapmatrix[index] * self.cspeed
561 | self.mapmatrix[index] -= self._mapmatrix[index] * self.cspeed
562 | if self.mapmatrix[index] < 0:
563 | self.mapmatrix[index] = 0.
564 | update_points.append([index, self.mapmatrix[index]])
565 |
566 | # update info (collected data)
567 | # use energy to collect PoI data (consumes UAV energy, alpha per unit, default 1.0)
568 | value = data if self.energy[i] >= data * self.alpha else self.energy[i]
569 | self.__cusume_energy1(i, value, 0.)  # collect data
570 |
571 | # calculate fairness
572 | c_f_ = self.__get_fairness(self.maptrack)
573 |
574 | # reward
575 | reward[i] += self.__get_reward(value, energy, distance, penalty, c_f, c_f_)
576 |
577 | # TODO: debug
578 | self.tmp_value[i] = value
579 | self.tmp_energy[i] = energy
580 | self.tmp_distance[i] = distance
581 | self.tmp_penalty[i] = penalty
582 |
583 | # ----
584 | c_f = c_f_
585 |
586 | # efficiency
587 | self.eff[i] += self.__get_eff1(value, distance)
588 | self.collection[i] += value
589 |
590 | # mark UAVs that have run out of energy
591 | if self.energy[i] <= self.epsilon * self.maxenergy:
592 | self.dn[i] = True
593 |
594 | self.uav = new_positions
595 | t = time.time()
596 | self.__draw_image(clear_uav, update_points, update_stations, update_tracks)
597 | self.time_ += time.time() - t
598 |
599 | # TODO: reward scaling; why clip the reward manually?
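# Per-step rewards are divided by NORMALIZE and clipped to [-2, 2] before
# being returned, which keeps the TD targets bounded and protects the critic
# from the exploding gradients / NaN values guarded against elsewhere in step().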
600 | self.reward = list(np.clip(np.array(reward) / normalize, -2., 2.))
601 | # self.reward = list(np.array(reward) / normalize)
602 |
603 | info = None
604 | state = self.__get_state()
605 | for r in self.reward:
606 | if np.isnan(r):
607 | print('Reward is NaN')
608 | while True:
609 | pass
610 |
611 | # TODO: do not end the episode early; let UAVs experience running out of energy (in the extreme, all of them at once), even though this may hurt the TD error
612 | # done = True
613 | # for d in self.dn:
614 | # if d is False:
615 | # done = False
616 | # break
617 | # else:
618 | # continue
619 |
620 | done = False
621 | return state, self.reward, done, info, indicator
622 |
623 | def render(self):
624 | global power_list
625 | observ = list(self.__get_state())
626 | observ = np.array(observ)
627 | observ = observ.transpose((0, 2, 1, 3))
628 | observ_0 = observ[np.random.randint(low=0, high=self.n), :, :, 0]
629 | observ_1 = observ[np.random.randint(low=0, high=self.n), :, :, 2]
630 |
631 | img_0 = np.zeros([80, 80, 3], dtype=np.uint8)
632 | self.draw_convert(observ_0, img_0, max(self._mapmatrix), color=np.asarray([0, 255, 0]))
633 |
634 | max_visit_val = max(np.max(observ_1), self.sg.V['VISIT'] * 20)
635 | img_1 = np.zeros([80, 80, 3], dtype=np.uint8)
636 | self.draw_convert(observ_1, img_1, max_visit_val, color=np.asarray([0, 255, 0]))
637 |
638 | for i in range(self.n):
639 | power_list = self.draw_convert(observ[i, :, :, 1], img_0, self.maxenergy, color=self.uav_render_color[i],
640 | is_power=True)
641 |
642 | img = np.hstack([img_0, img_1])
643 | img = cv2.resize(img, (800, 400))
644 |
645 | for p in power_list:
646 | cv2.circle(img, (p[1] * 5, p[0] * 5), 25, (0, 0, 255))
647 |
648 | img = cv2.flip(img, 0, dst=None)
649 |
650 | cv2.imshow('show', img)
651 | cv2.waitKey(1)
652 |
653 | def draw_convert(self, observ, img, max_val, color, is_power=False):
654 | for i in range(80):
655 | for j in range(80):
656 |
657 | if observ[j, i] < 0 and not is_power:
658 | img[j, i, 0] = np.uint8(255)
659 | elif observ[j, i] < 0 and is_power:
660 | img[j, i, 2] = np.uint8(255)
661 | self.pow_list.append((j, i))
662 | elif observ[j, i] > 0 and is_power:
663 | img[j, i, :] = np.uint8(color * observ[j, i])
664 | elif observ[j, i] > 0 and not is_power:
665 | img[j, i, :] = np.uint8(color * observ[j, i] / max_val)
666 |
667 | if len(self.pow_list) > 0:
668 | return self.pow_list
669 |
670 | # TODO: maybe not useful now
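# Episode-level metrics exposed as properties below:
#   leftrewards        fraction of PoI data still uncollected,
#   efficiency         collection ratio times collection fairness divided by
#                      normalised energy use (the energy-efficiency indicator
#                      aggregated in test.py),
#   fairness variants  Jain's index over remaining data, collected data, and
#                      collected data normalised by each PoI's initial value.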
671 | @property 672 | def leftrewards(self): 673 | return np.sum(self.mapmatrix) / self.totaldata 674 | 675 | @property 676 | def efficiency(self): 677 | return np.sum(self.collection / self.totaldata) * self.collection_fairness / (np.sum(self.normal_use_energy)) 678 | 679 | @property 680 | def normal_use_energy(self): 681 | tmp = list(np.array(self.use_energy) / (self.max_energy_array)) 682 | # for i in range(len(tmp)): 683 | # if tmp[i] > 1.0: 684 | # tmp[i] = 1.0 685 | 686 | return tmp 687 | 688 | @property 689 | def fairness(self): 690 | square_of_sum = np.square(np.sum(self.mapmatrix[:])) 691 | sum_of_square = np.sum(np.square(self.mapmatrix[:])) 692 | fairness = square_of_sum / sum_of_square / float(len(self.mapmatrix)) 693 | return fairness 694 | 695 | @property 696 | def collection_fairness(self): 697 | collection = self._mapmatrix - self.mapmatrix 698 | square_of_sum = np.square(np.sum(collection)) 699 | sum_of_square = np.sum(np.square(collection)) 700 | fairness = square_of_sum / sum_of_square / float(len(collection)) 701 | return fairness 702 | 703 | @property 704 | def normal_collection_fairness(self): 705 | collection = self._mapmatrix - self.mapmatrix 706 | for index, i in enumerate(collection): 707 | collection[index] = i / self._mapmatrix[index] 708 | square_of_sum = np.square(np.sum(collection)) 709 | sum_of_square = np.sum(np.square(collection)) 710 | fairness = square_of_sum / sum_of_square / float(len(collection)) 711 | return fairness 712 | -------------------------------------------------------------------------------- /experiments/crazy_env/tsp_env_setting.py: -------------------------------------------------------------------------------- 1 | class Setting(object): 2 | def __init__(self, log): 3 | self.V = { 4 | 'MAP_X': 16, 5 | 'MAP_Y': 16, 6 | 'MAX_VALUE': 1., 7 | 'MIN_VALUE': 0., 8 | 'OBSTACLE': [ 9 | # [0, 4, 1, 1], 10 | # [0, 9, 1, 1], 11 | # [0, 10, 2, 1], 12 | # [2, 2, 2, 1], 13 | # [5, 13, 1, 1], 14 | # [6, 12, 2, 1], 15 | # [10, 5, 3, 1], 16 | # [11, 5, 1, 3], 17 | # [10, 13, 1, 2], 18 | # [11, 13, 2, 1], 19 | # [12, 0, 1, 2], 20 | # [12, 5, 1, 1], 21 | # [12, 7, 1, 1], 22 | # [15, 11, 1, 1] 23 | ], 24 | 'CHANNEL': 3, 25 | 26 | 'NUM_UAV': 5, 27 | 'INIT_POSITION': (0, 8, 8), 28 | 'MAX_ENERGY': 50., # must face the time of lack 29 | 'NUM_ACTION': 2, # 2 30 | 'SAFE_ENERGY_RATE': 0.2, 31 | 'RANGE': 1.1, 32 | 'MAXDISTANCE': 4., 33 | 'COLLECTION_PROPORTION': 0.2, # c speed 34 | 'FILL_PROPORTION': 0.2, # fill speed 35 | 36 | 'WALL_REWARD': -1., 37 | 'VISIT': 1. 
/ 1000., 38 | 'DATA_REWARD': 1., 39 | 'FILL_REWARD': 1., 40 | 'ALPHA': 1., 41 | 'BETA': 0.1, 42 | 'EPSILON': 1e-4, 43 | 'NORMALIZE': .1, 44 | 'FACTOR': 0.1, 45 | } 46 | self.LOG = log 47 | self.time = log.time 48 | 49 | def log(self): 50 | self.LOG.log(self.V) 51 | -------------------------------------------------------------------------------- /experiments/env0/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/experiments/env0/__init__.py -------------------------------------------------------------------------------- /experiments/env0/env_setting0.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | class Setting(object): 5 | def __init__(self, log): 6 | self.V = { 7 | 'MAP_X': 16, 8 | 'MAP_Y': 16, 9 | 'MAX_VALUE': 1., 10 | 'MIN_VALUE': 0., 11 | 'OBSTACLE': [ 12 | [0, 4, 1, 1], 13 | [0, 9, 1, 1], 14 | [0, 10, 2, 1], 15 | [2, 2, 2, 1], 16 | [3, 6, 4, 1], 17 | [4, 4, 1, 4], 18 | # [4,12, 1, 1], 19 | [5, 13, 1, 1], 20 | [6, 12, 2, 1], 21 | # [10,3, 1, 1], 22 | [10, 5, 3, 1], 23 | # [10,9, 1, 1], 24 | [11, 5, 1, 3], 25 | [10, 13, 1, 2], 26 | [11, 13, 2, 1], 27 | # [11,12,1, 2], 28 | [12, 0, 1, 2], 29 | [12, 5, 1, 1], 30 | [12, 7, 1, 1], 31 | # [12,13,2, 1], 32 | [15, 11, 1, 1] 33 | ], 34 | 'CHANNLE': 3, 35 | 36 | 'NUM_UAV': 2, 37 | 'INIT_POSITION': (8, 8), 38 | 'MAX_ENERGY': 500., 39 | 'NUM_ACTION': 2, 40 | 'RANGE' : 1.1, 41 | 'MAXDISTANCE': 1., 42 | 'COLLECTION_PROPORTION': 0.2, # c speed 43 | 44 | 'WALL_REWARD': -1., 45 | 'DATA_REWARD': 1., 46 | 'WASTE_STEP' : -.5, 47 | 'ALPHA': 1., 48 | # 'BETA': 0.01, 49 | 'EPSILON': 1e-4, 50 | 'NORMALIZE': .1, 51 | 'FACTOR': 0.1, 52 | } 53 | self.LOG = log 54 | self.time = log.time 55 | 56 | def log(self): 57 | # with open(os.path.join('.', self.time + '.txt'), 'x') as file: 58 | # for key, value in self.V.items(): 59 | # print(key, value, file=file) 60 | self.LOG.log(self.V) 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /experiments/env0/log0.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | class Log(object): 7 | def __init__(self): 8 | self.time = str(time.strftime("%Y/%m-%d/%H-%M-%S", time.localtime())) 9 | self.full_path = os.path.join('.', self.time) 10 | os.makedirs(self.full_path) 11 | self.file_path = self.full_path + '/REPORT.txt' 12 | file = open(self.file_path, 'x') 13 | file.close() 14 | 15 | def log(self, values): 16 | if isinstance(values, dict): 17 | with open(self.file_path, 'a') as file: 18 | for key, value in values.items(): 19 | print(key, value, file=file) 20 | elif isinstance(values, list): 21 | with open(self.file_path, 'a') as file: 22 | for value in values: 23 | print(value, file=file) 24 | else: 25 | with open(self.file_path, 'a') as file: 26 | print(values, file=file) 27 | 28 | def draw_path(self, env, step): 29 | full_path = os.path.join(self.full_path, 'Path') 30 | # ob_xy = np.zeros((FLAGS.map_x, FLAGS.map_y)) 31 | # for i in FLAGS.obstacle: 32 | # for x in range(i[0], i[0] + i[2], 1): 33 | # for y in range(i[1], i[1] + i[3], 1): 34 | # ob_xy[x][y] = 1 35 | if not os.path.exists(full_path): 36 | os.makedirs(full_path) 37 | xxx = [] 38 | colors = [] 39 | for x in range(env.mapx): 40 | xxx.append((x, 1)) 41 | for y in range(env.mapy): 42 | c 
= [] 43 | for x in range(env.mapx): 44 | if env.mapob[x][y] == 1: 45 | c.append((1, 0, 0, 1)) 46 | else: 47 | c.append((1, 1, 1, 1)) 48 | colors.append(c) 49 | 50 | Fig = plt.figure(figsize=(5, 5)) 51 | PATH = np.array(env.trace) 52 | for i1 in range(env.mapy): 53 | plt.broken_barh(xxx, (i1, 1), facecolors=colors[i1]) 54 | plt.scatter(env.datas[:,0], env.datas[:,1], c=env.DATAs[:,2]) 55 | for i in range(env.n): 56 | # M = Fig.add_subplot(1, 1, i + 1) 57 | plt.ylim(ymin=0, ymax=env.mapy) 58 | plt.xlim(xmin=0, xmax=env.mapx) 59 | color = np.random.random(3) 60 | plt.plot(PATH[i, :, 0], PATH[i, :, 1], color=color) 61 | plt.scatter(PATH[i, :, 0], PATH[i, :, 1], color=color,marker='.') 62 | plt.grid(True, linestyle='-.', color='r') 63 | plt.title(str(env.normal_energy) + ',\n' + str(env.leftrewards)) 64 | Fig.savefig(full_path + '/path_' + str(step) + '.png') 65 | 66 | plt.close() 67 | -------------------------------------------------------------------------------- /experiments/image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/experiments/image/__init__.py -------------------------------------------------------------------------------- /experiments/image/flag.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | flags = tf.app.flags 4 | FLAGS = flags.FLAGS 5 | 6 | # map info 7 | flags.DEFINE_integer('image_size', 80, 'the size of image') 8 | flags.DEFINE_integer('image_deepth', 2, 'the deepth of image') 9 | flags.DEFINE_integer('wall_value', -1, 'the value of wall') 10 | flags.DEFINE_integer('wall_width', 4, 'the width of wall') 11 | flags.DEFINE_integer('fill_value', -1, 'the value of FillStation') 12 | 13 | flags.DEFINE_integer('map_x', 16, 'the length of x-axis') 14 | flags.DEFINE_integer('map_y', 16, 'the length of y-axis') 15 | -------------------------------------------------------------------------------- /experiments/image/map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Map(object): 4 | def __init__(self, width, height): 5 | # self.__map = np.zeros((width, height)) 6 | self.__width = width 7 | self.__height = height 8 | 9 | # @property 10 | # def map(self): 11 | # return self.__map 12 | @property 13 | def width(self): 14 | return self.__width 15 | @property 16 | def height(self): 17 | return self.__height 18 | 19 | def get_value(self, x, y, map): 20 | return map[x][y] 21 | 22 | def draw_sqr(self, x, y, width, height, value, map): 23 | assert 0 <= x < self.__width and 0 <= y < self.__height, 'the position ({0}, {1}) is not correct.'.format(x, y) 24 | for i in range(x, x + width, 1): 25 | for j in range(y, y + height, 1): 26 | map[i][j] = value 27 | 28 | 29 | -------------------------------------------------------------------------------- /experiments/image/mapM.py: -------------------------------------------------------------------------------- 1 | from .map import Map 2 | from .flag import FLAGS 3 | from PIL import Image 4 | import time 5 | import os 6 | 7 | class MapM(Map): 8 | 9 | def __init__(self, log_path, width=80, height=80): 10 | super(MapM, self).__init__(width, height) 11 | self.__time = time.time() 12 | self.full_path = os.path.join(log_path, 'img_map') 13 | if not os.path.exists(self.full_path): 14 | os.makedirs(self.full_path) 15 | 16 | def draw_wall(self, map): 17 | wall = FLAGS.wall_value 18 
| width = FLAGS.wall_width 19 | for j in range(0, 80, 1): 20 | for i in range(80-width, 80, 1): 21 | self.draw_sqr(i, j, 1, 1, wall, map) 22 | for i in range(0, width, 1): 23 | self.draw_sqr(i, j, 1, 1, wall, map) 24 | for i in range(0, 80, 1): 25 | for j in range(0, width, 1): 26 | self.draw_sqr(i, j, 1, 1, wall, map) 27 | for j in range(80-width, 80, 1): 28 | self.draw_sqr(i, j, 1, 1, wall, map) 29 | 30 | def get_value(self, x, y, map): 31 | x, y = self.__trans(x, y) 32 | super(MapM, self).get_value(x, y, map) 33 | 34 | def __trans(self, x, y): 35 | return int(4 * x + FLAGS.wall_width*2), int(y * 4 + FLAGS.wall_width*2) 36 | 37 | def draw_obstacle(self, x, y, width, height, map): 38 | # self.clear_cell(x, y, map) 39 | x, y = self.__trans(x, y) 40 | self.draw_sqr(x, y, width * 4, height * 4, FLAGS.wall_value, map) 41 | 42 | def draw_chargestation(self, x, y, map): 43 | self.clear_cell(x, y, map) 44 | x, y = self.__trans(x, y) 45 | self.draw_sqr(x, y + 1, 4, 2, 1, map) 46 | self.draw_sqr(x + 1, y, 2, 4, 1, map) 47 | 48 | # xy transpose occur 49 | def draw_point(self, x, y, value, map): 50 | x, y = self.__trans(x, y) 51 | self.draw_sqr(x, y, 2, 2, value, map) 52 | 53 | def clear_point(self, x, y, map): 54 | x, y = self.__trans(x, y) 55 | self.draw_sqr(x, y, 2, 2, 0, map) 56 | 57 | def clear_uav(self, x, y, map): 58 | self.clear_cell(x, y, map) 59 | 60 | def draw_UAV(self, x, y, value, map): 61 | x = -1 if x < -1 else FLAGS.map_x if x > FLAGS.map_x else x 62 | y = -1 if y < -1 else FLAGS.map_y if y > FLAGS.map_y else y 63 | self.clear_cell(x, y, map) 64 | x, y = self.__trans(x, y) 65 | # self.draw_sqr(x, y + 1, 4, 2, value, map) 66 | # self.draw_sqr(x + 1, y, 2, 4, value, map) 67 | # value = self.get_value(x, y) 68 | self.draw_sqr(x, y, 4, 4, value, map) 69 | # self.draw_sqr(x, y, 4, 4, value) 70 | 71 | def clear_cell(self, x, y, map): 72 | x, y = self.__trans(x, y) 73 | self.draw_sqr(x, y, 4, 4, 0, map) 74 | 75 | def draw_goal(self, x, y, map): 76 | # x, y = self.__trans(x, y) 77 | # value = self.get_value(x + 2, y + 2, map) 78 | # self.draw_sqr(x, y, 4, 4, 1, map) 79 | # self.draw_sqr(x + 2, y + 2, 2, 2, value, map) 80 | pass 81 | 82 | def draw_FillStation(self,x,y,value,map): 83 | x, y = self.__trans(x, y) 84 | self.draw_sqr(x, y, 2, 2,value,map) 85 | 86 | 87 | def save_as_png(self, map, ip=None): 88 | img = Image.fromarray(map * 255) 89 | img = img.convert('L') 90 | # img.show() 91 | if ip is None: 92 | name = time.time() - self.__time 93 | else: 94 | name = str(ip) 95 | img.save(os.path.join(self.full_path, str(name)), 'png') 96 | 97 | -------------------------------------------------------------------------------- /experiments/poor_compare.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.ticker as ticker 3 | from matplotlib.backends.backend_pdf import PdfPages 4 | import os 5 | import numpy as np 6 | 7 | 8 | def error(input_list): 9 | input = np.array(input_list) 10 | input = input.transpose((1, 0)) 11 | error_low = input[0] - input[1] 12 | error_high = input[2] - input[0] 13 | error = [] 14 | error.append(error_low) 15 | error.append(error_high) 16 | return error 17 | 18 | 19 | def average(input_list): 20 | input = np.array(input_list) 21 | input = input.transpose((1, 0)) 22 | return input[0] 23 | 24 | 25 | def compare_plot_errorbar(xlabel, ylabel, x, eDivert, woApeX, woRNN, MADDPG): 26 | plt.xlabel(xlabel) 27 | plt.ylabel(ylabel) 28 | plt.errorbar(x=x, y=average(eDivert), 
yerr=error(eDivert), fmt='r-o', label='e-Divert', capsize=4) 29 | plt.errorbar(x=x, y=average(woApeX), yerr=error(woApeX), fmt='g-^', label='e-Divert w/o Ape-X', capsize=4) 30 | plt.errorbar(x=x, y=average(woRNN), yerr=error(woRNN), fmt='m-<', label='e-Divert w/o RNN', capsize=4) 31 | plt.errorbar(x=x, y=average(MADDPG), yerr=error(MADDPG), fmt='k-*', label='MADDPG', capsize=4) 32 | 33 | plt.ylim(ymin=0, ymax=1) 34 | plt.grid(True) 35 | plt.grid(linestyle='--') 36 | plt.legend() 37 | plt.show() 38 | 39 | 40 | def compare_plot(xlabel, ylabel, x, yrange, eDivert, TSP): 41 | if os.path.exists('./pdf') is False: 42 | os.makedirs('./pdf') 43 | pdf = PdfPages('./pdf/%s-%s.pdf' % (xlabel, ylabel)) 44 | plt.figure(figsize=(13, 13)) 45 | 46 | plt.xlabel(xlabel, fontsize=32) 47 | plt.ylabel(ylabel, fontsize=32) 48 | plt.xticks(fontsize=32) 49 | plt.yticks(fontsize=32) 50 | plt.plot(x, eDivert, color='b', marker='o', label='e-Divert', markersize=26, markeredgewidth=5, 51 | markerfacecolor='none', linewidth=4) 52 | plt.plot(x, TSP, color='orange', marker='s', label='GA-based route planning', markersize=26, markeredgewidth=5, 53 | markerfacecolor='none', linewidth=4) 54 | 55 | # if ylabel == "Energy usage (# of full batteries)": 56 | # if xlabel == "No. of vehicles": 57 | # plt.plot(x, [3.62, 4.62, 5.62, 6.62, 7.62], color='red', linestyle='--', label="Maximum used energy", 58 | # linewidth=4) 59 | # else: 60 | # plt.axhline(y=2.83, color='red', linestyle='--', label="Maximum used energy", linewidth=4) 61 | plt.xticks(x, x) 62 | plt.gca().yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f')) 63 | plt.ylim(yrange[0], yrange[1] * 1.5) 64 | plt.grid(True) 65 | plt.grid(linestyle='--') 66 | plt.legend(loc='upper left', fontsize=25, ncol=1, markerscale=0.9) 67 | plt.tight_layout() 68 | 69 | pdf.savefig() 70 | plt.close() 71 | pdf.close() 72 | 73 | 74 | if __name__ == '__main__': 75 | # collection-range 76 | compare_plot(xlabel="Sensing range (unit)", 77 | ylabel="Data collection ratio", 78 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 79 | yrange=[0, 0.8], 80 | eDivert=[0.704, 0.719, 0.746, 0.88, 0.95], 81 | TSP=[0.905, 0.917, 0.930, 0.952, 0.974], 82 | ) 83 | 84 | # fairness_range 85 | compare_plot(xlabel="Sensing range (unit)", 86 | ylabel="Geographical fairness", 87 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 88 | yrange=[0, 0.8], 89 | eDivert=[0.755, 0.766, 0.801, 0.91, 0.957], 90 | TSP=[0.919, 0.935, 0.950, 0.963, 0.980], 91 | ) 92 | # # 93 | # energy_range 94 | compare_plot(xlabel="Sensing range (unit)", 95 | ylabel="Energy usage (# of full batteries)", 96 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 97 | yrange=[0, 4], 98 | eDivert=[1.32, 1.329, 1.459, 1.57, 1.805], 99 | TSP=[3.855, 4.219, 4.234, 4.250, 4.270], 100 | ) 101 | 102 | # efficiency_range 103 | compare_plot(xlabel="Sensing range (unit)", 104 | ylabel="Energy efficiency", 105 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 106 | yrange=[0, 0.36], 107 | eDivert=[0.357, 0.362, 0.371, 0.382, 0.4], 108 | TSP=[0.189, 0.178, 0.183, 0.189, 0.196], 109 | ) 110 | 111 | # TODO 112 | # collection-range 113 | compare_plot(xlabel="No. of vehicles", 114 | ylabel="Data collection ratio", 115 | x=[1, 2, 3, 4, 5], 116 | yrange=[0, 0.8], 117 | eDivert=[0.841,0.852,0.902,0.942,0.94], 118 | TSP=[0.893,0.992,0.999,0.994,0.994], 119 | ) 120 | 121 | # fairness_range 122 | compare_plot(xlabel="No. 
of vehicles", 123 | ylabel="Geographical fairness", 124 | x=[1, 2, 3, 4, 5], 125 | yrange=[0, 0.8], 126 | eDivert=[0.862,0.878,0.921,0.943,0.939], 127 | TSP=[0.936,0.988,0.991,0.991,0.991], 128 | ) 129 | # # 130 | # energy_range 131 | compare_plot(xlabel="No. of vehicles", 132 | ylabel="Energy usage (# of full batteries)", 133 | x=[1, 2, 3, 4, 5], 134 | yrange=[0, 6], 135 | eDivert=[1.38,1.784,2.01,2.493,2.67], 136 | TSP=[3.395,4.324,4.941,7.402,7.996], 137 | ) 138 | 139 | # efficiency_range 140 | compare_plot(xlabel="No. of vehicles", 141 | ylabel="Energy efficiency", 142 | x=[1, 2, 3, 4, 5], 143 | yrange=[0, 0.32], 144 | eDivert=[0.386,0.371,0.349,0.311,0.302], 145 | TSP=[0.213,0.201,0.179,0.118,0.102], 146 | ) 147 | -------------------------------------------------------------------------------- /experiments/random_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | poi_data=np.random.random(size=(256,3)) 4 | 5 | for i,poi_i in enumerate(poi_data): 6 | if i==0: 7 | print("[[%.10e,%.10e,%.10e],"% (poi_i[0],poi_i[1],poi_i[2])) 8 | elif i==255: 9 | print("[%.10e,%.10e,%.10e]]" % (poi_i[0], poi_i[1], poi_i[2])) 10 | else: 11 | print("[%.10e,%.10e,%.10e]," % (poi_i[0], poi_i[1], poi_i[2])) 12 | 13 | 14 | test = [[1.5454101562e-01, 2.2583007812e-02, 6.5332031250e-01], 15 | [2.1936035156e-01, 2.1618652344e-01, 8.2568359375e-01], 16 | [3.3813476562e-01, 4.4738769531e-02, 6.6406250000e-02], 17 | [6.5478515625e-01, 6.5429687500e-01, 8.7280273438e-02], 18 | [6.9970703125e-01, 7.5000000000e-01, 4.6923828125e-01], 19 | [3.2177734375e-01, 4.9145507812e-01, 8.8769531250e-01], 20 | [6.0595703125e-01, 8.5449218750e-01, 1.0772705078e-01], 21 | [7.1679687500e-01, 1.1370849609e-01, 5.3759765625e-01], 22 | [7.3046875000e-01, 9.5800781250e-01, 3.6157226562e-01], 23 | [9.7656250000e-01, 4.9365234375e-01, 2.5732421875e-01], 24 | [1.4416503906e-01, 7.8320312500e-01, 7.1679687500e-01], 25 | [7.1435546875e-01, 2.1618652344e-01, 4.7070312500e-01], 26 | [1.3830566406e-01, 6.8310546875e-01, 6.7675781250e-01], 27 | [6.2304687500e-01, 1.4045715332e-02, 4.3017578125e-01], 28 | [9.2919921875e-01, 9.7460937500e-01, 5.6494140625e-01], 29 | [9.5996093750e-01, 3.4423828125e-02, 1.2927246094e-01], 30 | [5.4443359375e-01, 7.9199218750e-01, 3.7622070312e-01], 31 | [4.6777343750e-01, 5.4394531250e-01, 7.2753906250e-01], 32 | [4.7558593750e-01, 7.0898437500e-01, 7.6562500000e-01], 33 | [8.5205078125e-01, 4.8364257812e-01, 3.9965820312e-01], 34 | [7.1240234375e-01, 1.6027832031e-01, 5.7421875000e-01], 35 | [4.7460937500e-01, 9.8937988281e-02, 3.8500976562e-01], 36 | [6.1914062500e-01, 1.2841796875e-01, 1.4758300781e-01], 37 | [6.7773437500e-01, 5.8593750000e-02, 5.6689453125e-01], 38 | [5.2099609375e-01, 1.2927246094e-01, 1.6943359375e-01], 39 | [3.0737304688e-01, 9.3066406250e-01, 9.1845703125e-01], 40 | [1.7565917969e-01, 9.7802734375e-01, 4.3847656250e-01], 41 | [4.1040039062e-01, 8.9794921875e-01, 2.6123046875e-01], 42 | [6.5234375000e-01, 6.9580078125e-01, 6.5429687500e-01], 43 | [9.8046875000e-01, 4.0161132812e-01, 5.4003906250e-01], 44 | [6.2597656250e-01, 7.5244140625e-01, 8.1640625000e-01], 45 | [5.6762695312e-02, 7.7734375000e-01, 2.2973632812e-01], 46 | [9.0380859375e-01, 6.3720703125e-01, 8.8183593750e-01], 47 | [5.9326171875e-01, 5.8740234375e-01, 7.3339843750e-01], 48 | [2.6318359375e-01, 6.7480468750e-01, 3.6206054688e-01], 49 | [2.6245117188e-01, 5.3613281250e-01, 3.1201171875e-01], 50 | [5.5468750000e-01, 3.2397460938e-01, 
5.8496093750e-01], 51 | [9.3896484375e-01, 6.6601562500e-01, 2.0996093750e-02], 52 | [1.3537597656e-01, 2.8100585938e-01, 1.8847656250e-01], 53 | [9.5507812500e-01, 8.2421875000e-01, 6.2890625000e-01], 54 | [4.3505859375e-01, 9.8046875000e-01, 7.4169921875e-01], 55 | [4.8559570312e-01, 4.9853515625e-01, 2.4414062500e-01], 56 | [6.8457031250e-01, 2.5073242188e-01, 4.5385742188e-01], 57 | [5.1025390625e-01, 8.9990234375e-01, 6.6601562500e-01], 58 | [6.6992187500e-01, 6.2011718750e-01, 6.6552734375e-01], 59 | [5.0292968750e-02, 8.3496093750e-01, 6.7968750000e-01], 60 | [7.8808593750e-01, 1.5332031250e-01, 9.0429687500e-01], 61 | [8.2128906250e-01, 7.9833984375e-01, 4.6142578125e-01], 62 | [3.0059814453e-02, 7.8125000000e-01, 4.9951171875e-01], 63 | [1.9006347656e-01, 7.3144531250e-01, 4.3994140625e-01], 64 | [8.3544921875e-01, 4.3237304688e-01, 8.6279296875e-01], 65 | [7.3437500000e-01, 9.9548339844e-02, 1.8688964844e-01], 66 | [2.6074218750e-01, 9.1699218750e-01, 5.9814453125e-01], 67 | [8.1689453125e-01, 1.9482421875e-01, 9.2675781250e-01], 68 | [8.7500000000e-01, 2.7221679688e-01, 7.4707031250e-01], 69 | [7.4121093750e-01, 6.7529296875e-01, 9.1601562500e-01], 70 | [9.3066406250e-01, 6.2207031250e-01, 8.2568359375e-01], 71 | [5.1220703125e-01, 1.7529296875e-01, 1.3122558594e-01], 72 | [8.9794921875e-01, 3.0053710938e-01, 8.1591796875e-01], 73 | [2.6953125000e-01, 6.9824218750e-01, 1.1224365234e-01], 74 | [7.1386718750e-01, 6.3134765625e-01, 1.3537597656e-01], 75 | [6.8066406250e-01, 6.5673828125e-01, 5.0195312500e-01], 76 | [5.4248046875e-01, 1.5234375000e-01, 1.6955566406e-01], 77 | [5.7568359375e-01, 1.5124511719e-01, 8.9599609375e-01], 78 | [1.7065429688e-01, 8.4411621094e-02, 2.5708007812e-01], 79 | [8.6474609375e-01, 2.2229003906e-01, 9.2675781250e-01], 80 | [9.3701171875e-01, 5.1849365234e-02, 3.6474609375e-01], 81 | [8.1298828125e-01, 7.8564453125e-01, 6.2402343750e-01], 82 | [4.1503906250e-01, 5.9423828125e-01, 5.0537109375e-01], 83 | [3.4179687500e-01, 4.7802734375e-01, 8.8818359375e-01], 84 | [3.9306640625e-01, 5.1074218750e-01, 3.0981445312e-01], 85 | [8.0566406250e-01, 1.6113281250e-01, 4.4848632812e-01], 86 | [8.8134765625e-02, 9.7705078125e-01, 8.5742187500e-01], 87 | [2.1984863281e-01, 7.5048828125e-01, 5.2978515625e-01], 88 | [8.5839843750e-01, 8.5058593750e-01, 4.6582031250e-01], 89 | [6.6259765625e-01, 6.6992187500e-01, 6.4404296875e-01], 90 | [8.7500000000e-01, 9.2138671875e-01, 3.1982421875e-01], 91 | [4.5800781250e-01, 5.3076171875e-01, 3.9868164062e-01], 92 | [5.2148437500e-01, 9.7705078125e-01, 8.2617187500e-01], 93 | [2.3986816406e-01, 5.0488281250e-01, 6.6650390625e-01]] 94 | 95 | 96 | sum=0.0 97 | for i in test: 98 | sum+=i[2] 99 | 100 | print(sum) 101 | 102 | 103 | -------------------------------------------------------------------------------- /experiments/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | import pandas as pd 6 | 7 | import maddpg.common.tf_util as U 8 | from experiments.env0 import log0 as Log 9 | from experiments.env0.data_collection0 import Env 10 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 11 | 12 | 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 14 | 15 | # Hyperparameters 16 | ARGUMENTS = [ 17 | # Environment 18 | ["--scenario", str, "simple_adversary", "name of the scenario script"], 19 | ["--max-episode-len", int, 500, "maximum episode length"], 20 | ["--num-episodes", int, 5000, "number of episodes"], 
21 | ["--num-adversaries", int, 0, "number of adversaries(enemy)"], 22 | ["--good-policy", str, "maddpg", "policy for good agents"], 23 | ["--adv-policy", str, "maddpg", "policy of adversaries"], 24 | 25 | # Core training parameters 26 | ["--lr", float, 5e-4, "learning rate for Adam optimizer"], 27 | ["--decay_rate", float, 0.99995, "learning rate exponential decay"], # 作为初始学习率,后面尝试进行衰减,这个不着急加! 28 | ["--gamma", float, 0.95, "discount factor"], 29 | ["--batch-size", int, 32, "number of epochs to optimize at the same time"], 30 | ["--num-units", int, 600, "number of units in the mlp"], 31 | 32 | # Priority Replay Buffer ( weights not used ) 33 | ["--alpha", float, 0.5, "priority parameter"], 34 | ["--beta", float, 0.4, "IS parameter"], 35 | ["--epsilon", float, 0.5, "a small positive constant"], 36 | ["--buffer_size", int, 200000, "buffer size for each agent"], 37 | 38 | # N-steps 39 | ["--N", int, 5, "steps of N-step"], 40 | 41 | # TODO: Experiments 42 | # Ape-X 43 | ["--num_actor_workers", int,0, 44 | "number of environments one agent can deal with. if >1, use apex ; else, use simple maddpg"], 45 | ["--debug_dir", str, "/debug_list/", 46 | "save index,reward(n-step),priority, value,wi per every sample from experience"], 47 | 48 | # RNN 49 | ["--rnn_length", int,0, 50 | "time_step in rnn, try to use LSTM instead of N-steps. if ==0, not use rnn; else, use rnn."], 51 | ["--rnn_cell_size", int, 64, "LSTM-cell output's size"], 52 | 53 | # Checkpointing 保存model 54 | ["--exp-name", str, None, "name of the experiment"], 55 | ["--save-dir", str, "/policy/", "directory in which training state and model sho uld be saved"], 56 | ["--save-rate", int, 10, "save model once every time this many episodes are completed"], 57 | ["--model_to_keep", int, 100, "the number of saved models"], 58 | ["--load-dir", str, "/home/dzp1997/learning/my_experiment_model/alpha0.5_actor5_rnn3", # TODO:polcy之前一个路径 59 | "directory in which training state and model are loaded"], 60 | 61 | # Test 62 | ['--test_time', int, 10, "number of iterations run for testing"], 63 | ["--random_seed", int, 100, "random seed"], 64 | ["--start", int,90,"start model"], #TODO 65 | ["--end", int,100, "end model"] #todo 66 | ] 67 | 68 | ACTIONS = [ 69 | ["--restore", "store_true", False], 70 | ["--display", "store_true", False], 71 | ["--benchmark", "store_true", False] 72 | 73 | ] 74 | 75 | 76 | # 参数调节器 77 | def parse_args(): 78 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 79 | for arg in ARGUMENTS: 80 | parser.add_argument(arg[0], type=arg[1], default=arg[2], help=arg[3]) 81 | for action in ACTIONS: 82 | parser.add_argument(action[0], action=action[1], default=action[2]) 83 | return parser.parse_args() 84 | 85 | 86 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 87 | # 加入多个trainers 88 | trainers = [] 89 | trainer = MADDPGAgentTrainer 90 | 91 | # 对手agent个数 0 92 | for i in range(num_adversaries): 93 | trainers.append(trainer( 94 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 95 | local_q_func=(arglist.adv_policy == 'ddpg'))) 96 | 97 | # 盟友agent个数 env.n 每一个agent都有一个actor,critic,replay_buffer!!! 
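# (i.e. one MADDPGAgentTrainer per cooperative agent, each with its own actor,
#  critic and prioritized replay buffer; num_adversaries defaults to 0 here,
#  so every UAV is a "good" agent.)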
98 | for i in range(num_adversaries, env.n): 99 | trainers.append(trainer( 100 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 101 | local_q_func=(arglist.good_policy == 'ddpg'))) 102 | 103 | return trainers 104 | 105 | 106 | def test(arglist, log,full_load_dir,test_iteration): 107 | 108 | with U.multi_threaded_session() as sess: 109 | # Create environment for testing 110 | env=Env(log) 111 | log.log(ARGUMENTS) 112 | log.log(ACTIONS) 113 | 114 | # Create agent trainers 115 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 116 | num_adversaries = min(env.n, arglist.num_adversaries) # 0 117 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) # 定义所有数据结构和静态图 118 | 119 | # Initialize all the uninitialized variables in the global scope 120 | U.initialize() 121 | 122 | # debug 123 | from tensorflow.python import pywrap_tensorflow 124 | # Read data from checkpoint file 125 | reader = pywrap_tensorflow.NewCheckpointReader(full_load_dir) 126 | var_to_shape_map = reader.get_variable_to_shape_map() 127 | # Print tensor name and value 128 | f = open('trained_params.txt', 'w') 129 | for key in var_to_shape_map: # write tensors' names and values in file 130 | print(key, file=f) 131 | print(reader.get_tensor(key), file=f) 132 | f.close() 133 | 134 | f = open('test_params.txt', 'w') 135 | for variable_name in tf.global_variables(): 136 | print(variable_name,file=f) 137 | f.close() 138 | 139 | # TODO:加载已经训练好的模型 140 | saver = tf.train.Saver() 141 | saver.restore(sess, full_load_dir) 142 | 143 | episode_rewards = [0.0] # sum of rewards for all agents 144 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 145 | episode_step = 0 146 | 147 | start_env=env.reset() 148 | state_step = [] 149 | for _ in range(0, arglist.rnn_length - 1): 150 | state_step.append(start_env) 151 | obs_n=start_env 152 | 153 | 154 | print('Starting a new TEST iterations...') 155 | print('Log_dir:', env.log_dir) 156 | iteration = 0 157 | 158 | efficiency=[] 159 | fairness=[] 160 | normal_fairness=[] 161 | collection_ratio=[] 162 | energy_consumption=[] 163 | collision = [] 164 | steps = [] 165 | collect_frequency=[] 166 | charge_frequency=[] 167 | station_remain=[] 168 | 169 | indicator = [0] * env.n # TODO:状态指示器 170 | meaningful_fill = [0] * env.n 171 | meaningful_get = [0] * env.n 172 | 173 | # testing 174 | while iteration < arglist.test_time: 175 | if arglist.rnn_length > 0: 176 | action_n = [] 177 | state_step.append(obs_n) 178 | for i, agent, obs in zip(range(0, len(trainers)), trainers, obs_n): 179 | obs_sequence = [] 180 | for j in range(-1 * arglist.rnn_length, 0, 1): 181 | obs_sequence.append(state_step[j][i]) 182 | 183 | action_n.append(agent.action(np.array(obs_sequence))) 184 | else: 185 | action_n = [agent.action(obs[None]) for agent, obs in zip(trainers, obs_n)] 186 | 187 | new_obs_n, rew_n, done_n, info_n, indicator = env.step(actions=action_n, indicator=indicator) 188 | log.step_information(action_n, env, episode_step, iteration, meaningful_fill, meaningful_get, 189 | indicator) 190 | 191 | indicator = [0] * env.n 192 | episode_step += 1 # step per episode 193 | done = done_n 194 | terminal = (episode_step >= arglist.max_episode_len) 195 | obs_n = new_obs_n 196 | 197 | for i, rew in enumerate(rew_n): 198 | episode_rewards[-1] += rew # 每一个step的总reward 199 | agent_rewards[i][-1] += rew # 每一个step,每个agent自己的reward 200 | 201 | if done or terminal: 202 | efficiency.append(env.efficiency) 203 | fairness.append(env.collection_fairness) 204 | 
normal_fairness.append(env.normal_collection_fairness) 205 | collection_ratio.append(1.0-env.leftrewards) 206 | energy_consumption.append(np.sum(env.normal_use_energy)) 207 | collision.append(np.sum(env.walls)) 208 | steps.append(env.count) 209 | collect_frequency.append(np.sum(meaningful_get)) 210 | charge_frequency.append(np.sum(meaningful_fill)) 211 | station_remain.append(250-sum(env.fills_energy_remain)) # @TODO:这里写的比较傻逼 212 | 213 | log.draw_path(env, iteration, meaningful_fill, meaningful_get) 214 | 215 | iteration += 1 216 | meaningful_fill = [0] * env.n 217 | meaningful_get = [0] * env.n 218 | obs_n = env.reset() 219 | episode_step = 0 220 | episode_rewards.append(0) 221 | for a in agent_rewards: 222 | a.append(0) 223 | 224 | # for displaying learned policies 225 | if arglist.display: 226 | env.render() 227 | continue 228 | 229 | details = [ 230 | '\n\nindicator DETAILS:', 231 | '\n\tefficiency: ' + str(efficiency), 232 | '\n\tfairness: ' + str(fairness), 233 | '\n\tnormal_fairness: ' + str(normal_fairness), 234 | '\n\tcollection_ratio: ' + str(collection_ratio), 235 | '\n\tenergy_consumption: ' + str(energy_consumption), 236 | '\n\tcollision: ' + str(collision), 237 | '\n\tsteps: ' + str(steps), 238 | ] 239 | 240 | indicator = [ 241 | '\n\ntest_model: '+str(test_iteration)+' --indicator AVERAGE:', 242 | '\n\tefficiency: ' + str(np.mean(efficiency)), 243 | '\n\tfairness: ' + str(np.mean(fairness)), 244 | '\n\tnormal_fairness: ' + str(np.mean(normal_fairness)), 245 | '\n\tcollection_ratio: ' + str(np.mean(collection_ratio)), 246 | '\n\tenergy_consumption: ' + str(np.mean(energy_consumption)), 247 | '\n\tcollision: ' + str(np.mean(collision)), 248 | '\n\tsteps: ' + str(np.mean(steps)), 249 | ] 250 | 251 | for _ in indicator: 252 | print(_) 253 | 254 | indicator_to_pandas = [ 255 | str(test_iteration), 256 | 257 | str(np.mean(collection_ratio)), 258 | str(np.min(collection_ratio)), 259 | str(np.max(collection_ratio)), 260 | 261 | str(np.mean(normal_fairness)), 262 | str(np.min(normal_fairness)), 263 | str(np.max(normal_fairness)), 264 | 265 | str(np.mean(energy_consumption)), 266 | str(np.min(energy_consumption)), 267 | str(np.max(energy_consumption)), 268 | 269 | str(np.mean(efficiency)), 270 | str(np.min(efficiency)), 271 | str(np.max(efficiency)), 272 | 273 | str(np.mean(collect_frequency)), 274 | str(np.mean(charge_frequency)), 275 | str(np.mean(station_remain)) 276 | ] 277 | 278 | 279 | log.log(details) 280 | log.log(indicator) 281 | 282 | tf.reset_default_graph() 283 | return indicator_to_pandas 284 | 285 | if __name__ == '__main__': 286 | print('Loading the trained model...Now, Enjoy yourself!') 287 | arglist = parse_args() 288 | df=pd.DataFrame(columns=["test_model", 289 | "collection_ratio","cr_min","cr_max", 290 | "fairness","f_min","f_max", 291 | "consumption of energy","ce_min","ce_max", 292 | "efficiency","e_min","e_max","collect","charge","station remain"]) 293 | for i in range(arglist.start,arglist.end): 294 | full_load_dir=arglist.load_dir+"/policy/"+str(i)+".ckpt" 295 | log = Log.Log() 296 | indicator_to_pandas=test(arglist, log,full_load_dir,i) 297 | df.loc[i-70]=indicator_to_pandas 298 | 299 | df.sort_values("efficiency",inplace=True) 300 | df.to_csv(arglist.load_dir+"/choose_best_model.csv",index=0) 301 | print('\n', 'TEST finished') 302 | -------------------------------------------------------------------------------- /experiments/test_random.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy 
as np 3 | import os 4 | import tensorflow as tf 5 | import pandas as pd 6 | 7 | import maddpg.common.tf_util as U 8 | from experiments.env0 import log0 as Log 9 | from experiments.env0.data_collection0 import Env 10 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 11 | 12 | 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 14 | 15 | # Hyperparameters 16 | ARGUMENTS = [ 17 | # Environment 18 | ["--scenario", str, "simple_adversary", "name of the scenario script"], 19 | ["--max-episode-len", int, 500, "maximum episode length"], 20 | ["--num-episodes", int, 5000, "number of episodes"], 21 | ["--num-adversaries", int, 0, "number of adversaries(enemy)"], 22 | ["--good-policy", str, "maddpg", "policy for good agents"], 23 | ["--adv-policy", str, "maddpg", "policy of adversaries"], 24 | 25 | # Core training parameters 26 | ["--lr", float, 5e-4, "learning rate for Adam optimizer"], 27 | ["--decay_rate", float, 0.99995, "learning rate exponential decay"], # 作为初始学习率,后面尝试进行衰减,这个不着急加! 28 | ["--gamma", float, 0.95, "discount factor"], 29 | ["--batch-size", int, 512, "number of epochs to optimize at the same time"], 30 | ["--num-units", int, 600, "number of units in the mlp"], 31 | 32 | # Priority Replay Buffer ( weights not used ) 33 | ["--alpha", float, 0.5, "priority parameter"], 34 | ["--beta", float, 0.4, "IS parameter"], 35 | ["--epsilon", float, 0.5, "a small positive constant"], 36 | ["--buffer_size", int, 200000, "buffer size for each agent"], 37 | 38 | # N-steps 39 | ["--N", int, 5, "steps of N-step"], 40 | 41 | # TODO: Experiments 42 | # Ape-X 43 | ["--num_actor_workers", int,0, 44 | "number of environments one agent can deal with. if >1, use apex ; else, use simple maddpg"], 45 | ["--debug_dir", str, "/debug_list/", 46 | "save index,reward(n-step),priority, value,wi per every sample from experience"], 47 | 48 | # RNN 49 | ["--rnn_length", int,0, 50 | "time_step in rnn, try to use LSTM instead of N-steps. 
if ==0, not use rnn; else, use rnn."], 51 | ["--rnn_cell_size", int, 64, "LSTM-cell output's size"], 52 | 53 | # Checkpointing 保存model 54 | ["--exp-name", str, None, "name of the experiment"], 55 | ["--save-dir", str, "/policy/", "directory in which training state and model sho uld be saved"], 56 | ["--save-rate", int, 10, "save model once every time this many episodes are completed"], 57 | ["--model_to_keep", int, 100, "the number of saved models"], 58 | ["--load-dir", str, "/media/sda1/MCS_experiments/test_裸奔/uav5", 59 | "directory in which training state and model are loaded"], 60 | 61 | # Test 62 | ['--test_time', int, 10, "number of iterations run for testing"], 63 | ["--random_seed", int, 100, "random seed"], 64 | ["--start", int,0,"start model"], 65 | ["--end", int,5, "end model"] 66 | ] 67 | 68 | ACTIONS = [ 69 | ["--restore", "store_true", False], 70 | ["--display", "store_true", False], 71 | ["--benchmark", "store_true", False] 72 | 73 | ] 74 | 75 | 76 | # 参数调节器 77 | def parse_args(): 78 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 79 | for arg in ARGUMENTS: 80 | parser.add_argument(arg[0], type=arg[1], default=arg[2], help=arg[3]) 81 | for action in ACTIONS: 82 | parser.add_argument(action[0], action=action[1], default=action[2]) 83 | return parser.parse_args() 84 | 85 | 86 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 87 | # 加入多个trainers 88 | trainers = [] 89 | trainer = MADDPGAgentTrainer 90 | 91 | # 对手agent个数 0 92 | for i in range(num_adversaries): 93 | trainers.append(trainer( 94 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 95 | local_q_func=(arglist.adv_policy == 'ddpg'))) 96 | 97 | # 盟友agent个数 env.n 每一个agent都有一个actor,critic,replay_buffer!!! 98 | for i in range(num_adversaries, env.n): 99 | trainers.append(trainer( 100 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 101 | local_q_func=(arglist.good_policy == 'ddpg'))) 102 | 103 | return trainers 104 | 105 | 106 | def test(arglist, log,full_load_dir,test_iteration): 107 | 108 | with U.multi_threaded_session() as sess: 109 | # Create environment for testing 110 | env=Env(log) 111 | log.log(ARGUMENTS) 112 | log.log(ACTIONS) 113 | 114 | # Create agent trainers 115 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 116 | num_adversaries = min(env.n, arglist.num_adversaries) # 0 117 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) # 定义所有数据结构和静态图 118 | 119 | # Initialize all the uninitialized variables in the global scope 120 | U.initialize() 121 | 122 | # TODO:加载已经训练好的模型 123 | saver = tf.train.Saver() 124 | saver.restore(sess,full_load_dir) 125 | 126 | episode_rewards = [0.0] # sum of rewards for all agents 127 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 128 | episode_step = 0 129 | 130 | start_env=env.reset() 131 | state_step = [] 132 | for _ in range(0, arglist.rnn_length - 1): 133 | state_step.append(start_env) 134 | obs_n=start_env 135 | 136 | 137 | print('Starting a new TEST iterations...') 138 | print('Log_dir:', env.log_dir) 139 | iteration = 0 140 | 141 | efficiency=[] 142 | fairness=[] 143 | normal_fairness=[] 144 | collection_ratio=[] 145 | energy_consumption=[] 146 | collision = [] 147 | steps = [] 148 | 149 | indicator = [0] * env.n # TODO:状态指示器 150 | meaningful_fill = [0] * env.n 151 | meaningful_get = [0] * env.n 152 | 153 | # testing 154 | while iteration < arglist.test_time: 155 | if arglist.rnn_length > 0: 156 | action_n = [] 157 | 
state_step.append(obs_n) 158 | for i, agent, obs in zip(range(0, len(trainers)), trainers, obs_n): 159 | obs_sequence = [] 160 | for j in range(-1 * arglist.rnn_length, 0, 1): 161 | obs_sequence.append(state_step[j][i]) 162 | 163 | action_n.append(agent.action(np.array(obs_sequence))) 164 | else: 165 | action_n = [agent.action(obs[None]) for agent, obs in zip(trainers, obs_n)] 166 | 167 | action_n=np.array(action_n) 168 | random_action_n=np.random.uniform(low=-1,high=1,size=action_n.shape) 169 | new_obs_n, rew_n, done_n, info_n, indicator = env.step(actions=random_action_n, indicator=indicator) 170 | log.step_information(random_action_n, env, episode_step, iteration, meaningful_fill, meaningful_get, 171 | indicator) 172 | 173 | indicator = [0] * env.n 174 | episode_step += 1 # step per episode 175 | done = done_n 176 | terminal = (episode_step >= arglist.max_episode_len) 177 | obs_n = new_obs_n 178 | 179 | for i, rew in enumerate(rew_n): 180 | episode_rewards[-1] += rew # 每一个step的总reward 181 | agent_rewards[i][-1] += rew # 每一个step,每个agent自己的reward 182 | 183 | if done or terminal: 184 | efficiency.append(env.efficiency) 185 | fairness.append(env.collection_fairness) 186 | normal_fairness.append(env.normal_collection_fairness) 187 | collection_ratio.append(1.0-env.leftrewards) 188 | energy_consumption.append(np.sum(env.normal_use_energy)) 189 | collision.append(np.sum(env.walls)) 190 | steps.append(env.count) 191 | 192 | log.draw_path(env, iteration, meaningful_fill, meaningful_get) 193 | 194 | iteration += 1 195 | meaningful_fill = [0] * env.n 196 | meaningful_get = [0] * env.n 197 | obs_n = env.reset() 198 | episode_step = 0 199 | episode_rewards.append(0) 200 | for a in agent_rewards: 201 | a.append(0) 202 | 203 | # for displaying learned policies 204 | if arglist.display: 205 | env.render() 206 | continue 207 | 208 | details = [ 209 | '\n\nindicator DETAILS:', 210 | '\n\tefficiency: ' + str(efficiency), 211 | '\n\tfairness: ' + str(fairness), 212 | '\n\tnormal_fairness: ' + str(normal_fairness), 213 | '\n\tcollection_ratio: ' + str(collection_ratio), 214 | '\n\tenergy_consumption: ' + str(energy_consumption), 215 | '\n\tcollision: ' + str(collision), 216 | '\n\tsteps: ' + str(steps), 217 | ] 218 | 219 | indicator = [ 220 | '\n\ntest_model: '+str(test_iteration)+' --indicator AVERAGE:', 221 | '\n\tefficiency: ' + str(np.mean(efficiency)), 222 | '\n\tfairness: ' + str(np.mean(fairness)), 223 | '\n\tnormal_fairness: ' + str(np.mean(normal_fairness)), 224 | '\n\tcollection_ratio: ' + str(np.mean(collection_ratio)), 225 | '\n\tenergy_consumption: ' + str(np.mean(energy_consumption)), 226 | '\n\tcollision: ' + str(np.mean(collision)), 227 | '\n\tsteps: ' + str(np.mean(steps)), 228 | ] 229 | 230 | for _ in indicator: 231 | print(_) 232 | 233 | indicator_to_pandas = [ 234 | str(test_iteration), 235 | 236 | str(np.mean(collection_ratio)), 237 | str(np.min(collection_ratio)), 238 | str(np.max(collection_ratio)), 239 | 240 | str(np.mean(normal_fairness)), 241 | str(np.min(normal_fairness)), 242 | str(np.max(normal_fairness)), 243 | 244 | str(np.mean(energy_consumption)), 245 | str(np.min(energy_consumption)), 246 | str(np.max(energy_consumption)), 247 | 248 | str(np.mean(efficiency)), 249 | str(np.min(efficiency)), 250 | str(np.max(efficiency)), 251 | ] 252 | 253 | 254 | log.log(details) 255 | log.log(indicator) 256 | 257 | tf.reset_default_graph() 258 | return indicator_to_pandas 259 | 260 | if __name__ == '__main__': 261 | print('Loading the trained model...Now, Enjoy yourself!') 262 | arglist = 
parse_args() 263 | df=pd.DataFrame(columns=["test_model", 264 | "collection_ratio","cr_min","cr_max", 265 | "fairness","f_min","f_max", 266 | "consumption of energy","ce_min","ce_max", 267 | "efficiency","e_min","e_max"]) 268 | for i in range(arglist.start,arglist.end): 269 | full_load_dir=arglist.load_dir+"/policy/"+str(i)+".ckpt" 270 | log = Log.Log() 271 | indicator_to_pandas=test(arglist, log,full_load_dir,i) 272 | df.loc[i-70]=indicator_to_pandas 273 | 274 | df.sort_values("efficiency",inplace=True) 275 | df.to_csv(arglist.load_dir+"/瞎跑uav5.csv",index=0) 276 | print('\n', 'TEST finished') 277 | -------------------------------------------------------------------------------- /experiments/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | import time 6 | 7 | import maddpg.common.tf_util as U 8 | from experiments.env0 import log0 as Log 9 | from experiments.env0.data_collection0 import Env 10 | from maddpg.common.summary import Summary 11 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 12 | 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 14 | 15 | # Hyperparameters 16 | ARGUMENTS = [ 17 | # Environment 18 | ["--scenario", str, "simple_adversary", "name of the scenario script"], 19 | ["--max-episode-len", int, 500, "maximum episode length"], 20 | ["--num-episodes", int, 500, "number of episodes"], 21 | ["--num-adversaries", int, 0, "number of adversaries(enemy)"], 22 | ["--good-policy", str, "maddpg", "policy for good agents"], 23 | ["--adv-policy", str, "maddpg", "policy of adversaries"], 24 | 25 | # Core training parameters 26 | ["--lr", float, 5e-4, "learning rate for Adam optimizer"], 27 | ["--decay_rate", float, 0.99995, "learning rate exponential decay"], # 作为初始学习率,后面尝试进行衰减,这个不着急加! 28 | ["--gamma", float, 0.95, "discount factor"], 29 | ["--batch-size", int, 32, "number of epochs to optimize at the same time"], # 512 30 | ["--num-units", int, 600, "number of units in the mlp"], 31 | 32 | # Priority Replay Buffer ( weights not used ) 33 | ["--alpha", float, 0.5, "priority parameter"], 34 | ["--beta", float, 0.4, "IS parameter"], 35 | ["--epsilon", float, 0.5, "a small positive constant"], 36 | ["--buffer_size", int, 200000, "buffer size for each agent"] , 37 | 38 | # N-steps 39 | ["--N", int, 5, "steps of N-step"], 40 | 41 | # TODO: Experiments 42 | # Ape-X 43 | ["--num_actor_workers", int, 0, 44 | "number of environments one agent can deal with. if >1, use apex ; else, use simple maddpg"], 45 | ["--debug_dir", str, "/debug_list/", 46 | "save index,reward(n-step),priority, value,wi per every sample from experience"], 47 | 48 | # RNN 49 | ["--rnn_length", int, 0, 50 | "time_step in rnn, try to use LSTM instead of N-steps. 
if ==0, not use rnn; else, use rnn."], 51 | ["--rnn_cell_size", int, 64, "LSTM-cell output's size"], 52 | 53 | # Checkpointing 保存model 54 | ["--exp-name", str, None, "name of the experiment"], 55 | ["--save-dir", str, "/policy/", "directory in which training state and model should be saved"], 56 | ["--save-rate", int, 2, "save model once every time this many episodes are completed"], 57 | ["--model_to_keep", int, 100, "the number of saved models"], 58 | ["--load-dir", str, "/home/linc/Desktop/maddpg-final/saved_state.ckpt", 59 | "directory in which training state and model are loaed"], 60 | 61 | # Evaluation 62 | ["--benchmark-iters", int, 100000, "number of iterations run for benchmarking"], 63 | ["--benchmark-dir", str, "./benchm", "directory where benchmark data is saved"], 64 | ["--plots-dir", str, "./learning_curves/", "directory where plot data is saved"], 65 | 66 | # Training 67 | ["--random_seed", int, 0, "random seed"] 68 | ] 69 | 70 | ACTIONS = [ 71 | ["--restore", "store_true", False], 72 | ["--display", "store_true", False], 73 | ["--benchmark", "store_true", False] 74 | 75 | ] 76 | 77 | 78 | # 参数调节器 79 | def parse_args(): 80 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 81 | for arg in ARGUMENTS: 82 | parser.add_argument(arg[0], type=arg[1], default=arg[2], help=arg[3]) 83 | for action in ACTIONS: 84 | parser.add_argument(action[0], action=action[1], default=action[2]) 85 | return parser.parse_args() 86 | 87 | 88 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 89 | # 加入多个trainers 90 | trainers = [] 91 | trainer = MADDPGAgentTrainer 92 | 93 | # 对手agent个数 0 94 | for i in range(num_adversaries): 95 | trainers.append(trainer( 96 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 97 | local_q_func=(arglist.adv_policy == 'ddpg'))) 98 | 99 | # 盟友agent个数 env.n 每一个agent都有一个actor,critic,replay_buffer!!! 
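# (Same construction as in test.py: one trainer per cooperative agent. In
#  train() below the trainers are built from envs[0] and shared by all Ape-X
#  actor environments, so every environment feeds the same per-agent replay
#  buffers.)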
100 | for i in range(num_adversaries, env.n): 101 | trainers.append(trainer( 102 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 103 | local_q_func=(arglist.good_policy == 'ddpg'))) 104 | 105 | return trainers 106 | 107 | 108 | def train(arglist, log): 109 | 110 | with U.multi_threaded_session() as sess: 111 | # Create environment(use Ape-X) 112 | envs = [Env(log) for _ in range(arglist.num_actor_workers)] 113 | log.log(ARGUMENTS) 114 | log.log(ACTIONS) 115 | 116 | # Create summary 117 | summary = Summary(sess, envs[0].log_dir) 118 | for i in range(envs[0].n): 119 | summary.add_variable(tf.Variable(0.), 'reward_%d' % i) 120 | summary.add_variable(tf.Variable(0.), 'loss_%d' % i) 121 | summary.add_variable(tf.Variable(0.), 'wall_%d' % i) 122 | summary.add_variable(tf.Variable(0.), 'energy_%d' % i) 123 | summary.add_variable(tf.Variable(0.), 'gained_info_%d' % i) 124 | summary.add_variable(tf.Variable(0.), 'buffer_size') 125 | summary.add_variable(tf.Variable(0.), 'acc_reward') 126 | summary.add_variable(tf.Variable(0.), 'leftrewards') 127 | summary.add_variable(tf.Variable(0.), 'efficiency') 128 | summary.build() 129 | 130 | # Create agent trainers 131 | obs_shape_n = [envs[0].observation_space[i].shape for i in range(envs[0].n)] 132 | 133 | # 计算对手个数 134 | num_adversaries = min(envs[0].n, arglist.num_adversaries) # 0 135 | 136 | # 定义所有数据结构和静态图 137 | trainers = get_trainers(envs[0], num_adversaries, obs_shape_n, arglist) 138 | 139 | # # 我方和敌方采用不同策略(适用于多智能体的双方竞争环境) 140 | # print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) 141 | 142 | # Initialize all the uninitialized variables in the global scope 143 | U.initialize() 144 | 145 | if arglist.restore: 146 | print('Loading previous state...') 147 | U.load_state(arglist.load_dir) 148 | 149 | # 保存模型的个数 100 150 | saver = tf.train.Saver(max_to_keep=arglist.model_to_keep) 151 | 152 | episode_rewards = [[0.0] for env in envs] # sum of rewards for all agents 153 | agent_rewards = [[[0.0] for _ in range(env.n)] for env in envs] # individual agent reward 154 | final_ep_rewards = [] # sum of rewards for training curve 155 | final_ep_ag_rewards = [] # agent rewards for training curve 156 | agent_info = [[[]]] # placeholder for benchmarking info 157 | obs_n = [] 158 | state_step_n = [] 159 | for env in envs: 160 | start_env = env.reset() 161 | state_step_i = [] 162 | for _ in range(0, arglist.rnn_length - 1): 163 | state_step_i.append(start_env) 164 | state_step_n.append(state_step_i) 165 | obs_n.append(start_env) 166 | episode_step = [0 for env in envs] 167 | t_start = [time.time() for env in envs] 168 | 169 | m_time = t_start.copy() 170 | print('Starting iterations...') 171 | print('Log_dir:', envs[0].log_dir) 172 | iteration = 0 173 | global_total_step = 0 # episode step 174 | loss = [0.] 
* envs[0].n 175 | model_index = 0 176 | efficiency = 0 177 | indicator = [0] * envs[0].n # TODO:状态指示器 178 | meaningful_fill = [0] * envs[0].n 179 | meaningful_get = [0] * envs[0].n 180 | 181 | # training 182 | while iteration < arglist.num_episodes: 183 | global_total_step += 1 # sum step id 184 | terminal_done_0=False 185 | # TODO:DEBUG 186 | # print("global-step: ",global_total_step) 187 | rew_n_master = [] 188 | for env_i, env in enumerate(envs): 189 | # get action 各取各的 190 | # TODO:LSTM try 191 | if arglist.rnn_length > 0: 192 | action_n = [] 193 | state_step_n[env_i].append(obs_n[env_i]) 194 | for i, agent, obs in zip(range(0, len(trainers)), trainers, obs_n[env_i]): 195 | obs_sequence = [] 196 | 197 | for j in range(-1 * arglist.rnn_length, 0, 1): 198 | obs_sequence.append(state_step_n[env_i][j][i]) 199 | 200 | action_n.append(agent.action(np.array(obs_sequence))) 201 | else: 202 | action_n = [agent.action(obs[None]) for agent, obs in zip(trainers, obs_n[env_i])] 203 | 204 | # environment step 205 | if env_i == 0: 206 | # TODO:加入状态指示器放在step里面进行每一步的更新 207 | new_obs_n, rew_n, done_n, info_n, indicator = env.step(actions=action_n, indicator=indicator) 208 | # TODO:step-debug 209 | log.step_information(action_n, env, episode_step[0], iteration, meaningful_fill, meaningful_get, 210 | indicator) 211 | rew_n_master = rew_n 212 | indicator = [0] * envs[0].n 213 | else: 214 | new_obs_n, rew_n, done_n, info_n, _ = env.step(actions=action_n) 215 | episode_step[env_i] += 1 # step per episode 216 | done = done_n 217 | terminal = (episode_step[env_i] >= arglist.max_episode_len) 218 | 219 | # collect experience 添加buffer是各加各的 220 | for i, agent in enumerate(trainers): 221 | agent.experience(obs_n[env_i][i], action_n[i], rew_n[i], new_obs_n[i], done_n, terminal, 222 | arglist.num_actor_workers) 223 | obs_n[env_i] = new_obs_n 224 | 225 | for i, rew in enumerate(rew_n): 226 | episode_rewards[env_i][-1] += rew # 每一个step的总reward 227 | agent_rewards[env_i][i][-1] += rew # 每一个step,每个agent自己的reward 228 | 229 | if done or terminal: 230 | # report 231 | if env_i == 0: 232 | terminal_done_0=True 233 | print('\n%d th episode:\n' % iteration) 234 | print('\tthe %d env,%d steps,%.2f seconds, wasted %.2f seconds.' 
% ( 235 | env_i, episode_step[env_i], time.time() - m_time[env_i], env.time_)) 236 | # print('rewards:', agent_rewards[0][-1], agent_rewards[1][-1]) 237 | print('\tobstacle collisions:', env.walls) 238 | print('\tdata collection:', env.collection / env.totaldata) 239 | print('\treminding energy:', env.energy) 240 | efficiency = env.efficiency 241 | # log.draw_path(env, iteration) 242 | log.draw_path(env, iteration, meaningful_fill, meaningful_get) 243 | iteration += 1 244 | 245 | meaningful_fill = [0] * envs[0].n 246 | meaningful_get = [0] * envs[0].n 247 | m_time[env_i] = time.time() 248 | obs_n[env_i] = env.reset() 249 | episode_step[env_i] = 0 250 | episode_rewards[env_i].append(0) 251 | for a in agent_rewards[env_i]: 252 | a.append(0) 253 | agent_info.append([[]]) 254 | 255 | # for displaying learned policies 256 | if arglist.display: 257 | envs[0].render() 258 | continue 259 | 260 | # update all trainers, if not in display or benchmark mode 261 | _loss = [] 262 | 263 | # update 每一个agent自己更新自己的PQ参数 264 | for agent in trainers: # 将buffer采样初始化 265 | agent.preupdate() 266 | for agent in trainers: 267 | _loss.append(agent.update(envs[0], trainers, global_total_step)[0]) 268 | if np.sum(_loss) != 0: # 在buffer没有填满的时候不加loss 269 | loss = _loss 270 | 271 | # summary vistalize for all UAVs 272 | feed_dict = {} 273 | for i_summary in range(envs[0].n): 274 | feed_dict['reward_%d' % i_summary] = rew_n_master[i_summary] 275 | feed_dict['loss_%d' % i_summary] = loss[i_summary] 276 | feed_dict['wall_%d' % i_summary] = envs[0].walls[i_summary] / (float(episode_step[0]) + 1e-4) 277 | feed_dict['energy_%d' % i_summary] = envs[0].energy[i_summary] 278 | feed_dict['gained_info_%d' % i_summary] = envs[0].collection[i_summary] 279 | feed_dict['buffer_size'] = trainers[0].filled_size 280 | feed_dict['leftrewards'] = envs[0].leftrewards 281 | feed_dict['acc_reward'] = episode_rewards[0][-1] 282 | feed_dict['efficiency'] = efficiency 283 | summary.run(feed_dict=feed_dict, step=global_total_step) 284 | 285 | # save model, display training output 286 | if terminal_done_0 is True and (len(episode_rewards[0]) + 1) % arglist.save_rate == 0: 287 | U.save_state( 288 | envs[0].log_dir + arglist.save_dir + "/" + str(model_index % arglist.model_to_keep) + ".ckpt", 289 | saver=saver) 290 | model_index += 1 291 | # print statement depends on whether or not there are adversaries 292 | if num_adversaries == 0: 293 | print("------------------------------------------------------------------------------------------") 294 | print("Master: steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 295 | global_total_step, len(episode_rewards[0]) - 1, 296 | np.mean(episode_rewards[0][-arglist.save_rate:]), 297 | round(time.time() - t_start[0], 3))) 298 | print("------------------------------------------------------------------------------------------") 299 | else: 300 | print("------------------------------------------------------------------------------------------") 301 | print( 302 | "Master: steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 303 | global_total_step, len(episode_rewards[0]) - 1, 304 | np.mean(episode_rewards[0][-arglist.save_rate:]), 305 | [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], 306 | round(time.time() - t_start[0], 3))) 307 | print("------------------------------------------------------------------------------------------") 308 | t_start = [time.time() for env in envs] 309 | # Keep track of final episode reward 310 | 
final_ep_rewards.append(np.mean(episode_rewards[0][-arglist.save_rate:])) 311 | for rew in agent_rewards: 312 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) 313 | 314 | 315 | if __name__ == '__main__': 316 | print('Let\'s train, go! go! go!') 317 | arglist = parse_args() 318 | log = Log.Log() 319 | train(arglist, log) 320 | -------------------------------------------------------------------------------- /experiments/visualization.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # training 6 | eff=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-efficiency.csv") 7 | plt.figure(figsize=(17,17)) 8 | plt.plot(eff['Step'],eff['Value'],color='black',linewidth=2) 9 | plt.xlim(xmax=130000,xmin=10240) 10 | plt.ylim() 11 | plt.xticks(fontsize=32) 12 | plt.yticks(fontsize=32) 13 | plt.xlabel("Training epoch (1e5)",fontsize=32) 14 | plt.ylabel("Energy efficiency",fontsize=32) 15 | plt.grid(True) 16 | plt.grid(linestyle='--') 17 | ax=plt.gca() 18 | ax.xaxis.get_major_formatter().set_powerlimits((0,1)) 19 | plt.show() 20 | 21 | rew=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-acc_reward.csv") 22 | plt.figure(figsize=(17,17)) 23 | plt.xticks(fontsize=32) 24 | plt.yticks(fontsize=32) 25 | plt.plot(rew['Step'],rew['Value'],color='black',linewidth=2) 26 | plt.xlim(xmax=130000,xmin=10240) 27 | plt.xlabel("Training epoch (1e5)",fontsize=32) 28 | plt.ylabel("Accumulated reward",fontsize=32) 29 | plt.grid(True) 30 | plt.grid(linestyle='--') 31 | ax=plt.gca() 32 | ax.xaxis.get_major_formatter().set_powerlimits((0,1)) 33 | plt.show() 34 | 35 | loss0=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-loss_0.csv") 36 | loss1=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-loss_1.csv") 37 | plt.figure(figsize=(17,17)) 38 | plt.xticks(fontsize=32) 39 | plt.yticks(fontsize=32) 40 | plt.plot(loss0['Step'],loss0['Value'],label='Vehicle 1',color='black',linewidth=2) 41 | plt.plot(loss1['Step'],loss1['Value'],label='Vehicle 2',color='blue',linewidth=2) 42 | plt.xlim(xmax=130000,xmin=10240) 43 | plt.ylim(ymax=30,ymin=5) 44 | plt.xlabel("Training epoch (1e5)",fontsize=32) 45 | plt.ylabel("Loss",fontsize=32) 46 | plt.grid(True) 47 | plt.grid(linestyle='--') 48 | plt.legend(fontsize=32) 49 | ax=plt.gca() 50 | ax.xaxis.get_major_formatter().set_powerlimits((0,1)) 51 | plt.show() 52 | 53 | 54 | # consumption 55 | 56 | plt.figure(figsize=(17,17)) 57 | num = [1, 2, 3, 4, 5] 58 | sum_comsumption = [3.576, 4, 4.004, 4.402, 4.668] 59 | average_comsumption=np.true_divide(np.array(sum_comsumption),np.array(num)) 60 | plt.xticks(num,num[::1]) 61 | plt.xticks(fontsize=32) 62 | plt.yticks(fontsize=32) 63 | plt.plot(num,sum_comsumption,label='Total energy consumption',marker='o',markersize=26,markeredgewidth=5,markerfacecolor='none', 64 | color='blue',linewidth=4) 65 | plt.plot(num,average_comsumption,label='Average energy consumption per vehicle',marker='s',markersize=26,markeredgewidth=5, 66 | markerfacecolor='none',color='black',linewidth=4) 67 | 68 | plt.xlabel("No. 
of vehicles",fontsize=32) 69 | plt.ylabel("Energy usage (# of full batteries)",fontsize=32) 70 | plt.axhline(y=1,color='red',linestyle='--',label="Initial energy reserve",linewidth=4) 71 | plt.grid(True) 72 | plt.grid(linestyle='--') 73 | plt.legend(fontsize=26) 74 | plt.show() 75 | # 76 | # # charge amount 77 | # num = [1, 2, 3, 4, 5] 78 | # sum_charge_amount = [166.317, 204.866, 201.16, 196.783, 192.793] 79 | # plt.xticks(num,num[::1]) 80 | # plt.plot(num,sum_charge_amount) 81 | # plt.xlabel("Num of vehicles") 82 | # plt.ylabel("Total charge amount") 83 | # plt.axhline(y=250,color='green',linestyle='--') 84 | # plt.grid(True) 85 | # plt.grid(linestyle='--') 86 | # plt.legend() 87 | # plt.show() 88 | # 89 | # # charge frequency 90 | # num = [1, 2, 3, 4, 5] 91 | # sum_charge_frequency = [49, 232, 227, 512, 514] 92 | # plt.xticks(num,num[::1]) 93 | # plt.plot(num,sum_charge_frequency) 94 | # plt.xlabel("Num of vehicles") 95 | # plt.ylabel("Total charge frequency") 96 | # plt.grid(True) 97 | # plt.grid(linestyle='--') 98 | # plt.legend() 99 | # plt.show() 100 | 101 | 102 | # yy 103 | num = [1, 2, 3, 4, 5] 104 | plt.figure(figsize=(17,17)) 105 | sum_charge_amount = [166.317/50, 204.866/50, 201.16/50, 196.783/50, 192.793/50] 106 | sum_charge_frequency = [49, 232, 227, 512, 514] 107 | plt.xlabel("No. of vehicles",fontsize=32) 108 | plt.plot(num,sum_charge_amount,label='Total # of charged full battery',color='red',marker='o', 109 | markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 110 | plt.ylim(ymax=4.4,ymin=3) 111 | plt.xticks(fontsize=32) 112 | plt.yticks(fontsize=32) 113 | plt.ylabel("Total # of charged full battery",fontsize=32) 114 | plt.legend(loc='upper left',fontsize=26) 115 | plt.grid(True) 116 | plt.grid(linestyle='--') 117 | 118 | plt.twinx() 119 | plt.xticks(fontsize=32) 120 | plt.yticks(fontsize=32) 121 | plt.plot(num,sum_charge_frequency,label='Total charging frequency',color='blue',marker='s', 122 | markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 123 | plt.ylim(ymax=700,ymin=0) 124 | plt.ylabel("Total charging frequency",fontsize=32) 125 | plt.legend(loc='lower right',fontsize=26) 126 | 127 | plt.xticks(num,num[::1]) 128 | 129 | plt.grid(True) 130 | plt.grid(linestyle='--') 131 | plt.show() 132 | 133 | -------------------------------------------------------------------------------- /maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() -------------------------------------------------------------------------------- /maddpg/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/maddpg/common/__init__.py -------------------------------------------------------------------------------- /maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from tensorflow.python.ops 
import nn 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def logp(self, x): 16 | raise NotImplementedError 17 | def kl(self, other): 18 | raise NotImplementedError 19 | def entropy(self): 20 | raise NotImplementedError 21 | def sample(self): 22 | raise NotImplementedError 23 | 24 | class PdType(object): 25 | """ 26 | Parametrized family of probability distributions 27 | """ 28 | def pdclass(self): 29 | raise NotImplementedError 30 | def pdfromflat(self, flat): 31 | return self.pdclass()(flat) 32 | def param_shape(self): 33 | raise NotImplementedError 34 | def sample_shape(self): 35 | raise NotImplementedError 36 | def sample_dtype(self): 37 | raise NotImplementedError 38 | 39 | def param_placeholder(self, prepend_shape, name=None): 40 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 41 | def sample_placeholder(self, prepend_shape, name=None): 42 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 43 | 44 | class CategoricalPdType(PdType): 45 | def __init__(self, ncat): 46 | self.ncat = ncat 47 | def pdclass(self): 48 | return CategoricalPd 49 | def param_shape(self): 50 | return [self.ncat] 51 | def sample_shape(self): 52 | return [] 53 | def sample_dtype(self): 54 | return tf.int32 55 | 56 | class SoftCategoricalPdType(PdType): 57 | def __init__(self, ncat): 58 | self.ncat = ncat 59 | def pdclass(self): 60 | return SoftCategoricalPd 61 | def param_shape(self): 62 | return [self.ncat] 63 | def sample_shape(self): 64 | return [self.ncat] 65 | def sample_dtype(self): 66 | return tf.float32 67 | 68 | class MultiCategoricalPdType(PdType): 69 | def __init__(self, low, high): 70 | self.low = low 71 | self.high = high 72 | self.ncats = high - low + 1 73 | def pdclass(self): 74 | return MultiCategoricalPd 75 | def pdfromflat(self, flat): 76 | return MultiCategoricalPd(self.low, self.high, flat) 77 | def param_shape(self): 78 | return [sum(self.ncats)] 79 | def sample_shape(self): 80 | return [len(self.ncats)] 81 | def sample_dtype(self): 82 | return tf.int32 83 | 84 | class SoftMultiCategoricalPdType(PdType): 85 | def __init__(self, low, high): 86 | self.low = low 87 | self.high = high 88 | self.ncats = high - low + 1 89 | def pdclass(self): 90 | return SoftMultiCategoricalPd 91 | def pdfromflat(self, flat): 92 | return SoftMultiCategoricalPd(self.low, self.high, flat) 93 | def param_shape(self): 94 | return [sum(self.ncats)] 95 | def sample_shape(self): 96 | return [sum(self.ncats)] 97 | def sample_dtype(self): 98 | return tf.float32 99 | 100 | class DiagGaussianPdType(PdType): 101 | def __init__(self, size): 102 | self.size = size 103 | def pdclass(self): 104 | return DiagGaussianPd 105 | def param_shape(self): 106 | return [2*self.size] 107 | def sample_shape(self): 108 | return [self.size] 109 | def sample_dtype(self): 110 | return tf.float32 111 | 112 | class BernoulliPdType(PdType): 113 | def __init__(self, size): 114 | self.size = size 115 | def pdclass(self): 116 | return BernoulliPd 117 | def param_shape(self): 118 | return [self.size] 119 | def sample_shape(self): 120 | return [self.size] 121 | def sample_dtype(self): 122 | return tf.int32 123 | 124 | # WRONG SECOND DERIVATIVES 125 | # class CategoricalPd(Pd): 126 | # def __init__(self, logits): 127 | # self.logits = logits 128 | # self.ps = tf.nn.softmax(logits) 129 | # @classmethod 130 | # def 
fromflat(cls, flat): 131 | # return cls(flat) 132 | # def flatparam(self): 133 | # return self.logits 134 | # def mode(self): 135 | # return U.argmax(self.logits, axis=1) 136 | # def logp(self, x): 137 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 138 | # def kl(self, other): 139 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 140 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 141 | # def entropy(self): 142 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 143 | # def sample(self): 144 | # u = tf.random_uniform(tf.shape(self.logits)) 145 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 146 | 147 | class CategoricalPd(Pd): 148 | def __init__(self, logits): 149 | self.logits = logits 150 | def flatparam(self): 151 | return self.logits 152 | def mode(self): 153 | return U.argmax(self.logits, axis=1) 154 | def logp(self, x): 155 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 156 | def kl(self, other): 157 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 158 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 159 | ea0 = tf.exp(a0) 160 | ea1 = tf.exp(a1) 161 | z0 = U.sum(ea0, axis=1, keepdims=True) 162 | z1 = U.sum(ea1, axis=1, keepdims=True) 163 | p0 = ea0 / z0 164 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 165 | def entropy(self): 166 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 167 | ea0 = tf.exp(a0) 168 | z0 = U.sum(ea0, axis=1, keepdims=True) 169 | p0 = ea0 / z0 170 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 171 | def sample(self): 172 | u = tf.random_uniform(tf.shape(self.logits)) 173 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 174 | @classmethod 175 | def fromflat(cls, flat): 176 | return cls(flat) 177 | 178 | class SoftCategoricalPd(Pd): 179 | def __init__(self, logits): 180 | self.logits = logits 181 | def flatparam(self): 182 | return self.logits 183 | def mode(self): 184 | return U.softmax(self.logits, axis=-1) 185 | def logp(self, x): 186 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 187 | def kl(self, other): 188 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 189 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 190 | ea0 = tf.exp(a0) 191 | ea1 = tf.exp(a1) 192 | z0 = U.sum(ea0, axis=1, keepdims=True) 193 | z1 = U.sum(ea1, axis=1, keepdims=True) 194 | p0 = ea0 / z0 195 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 196 | def entropy(self): 197 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 198 | ea0 = tf.exp(a0) 199 | z0 = U.sum(ea0, axis=1, keepdims=True) 200 | p0 = ea0 / z0 201 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 202 | def sample(self): 203 | u = tf.random_uniform(tf.shape(self.logits)) 204 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 205 | @classmethod 206 | def fromflat(cls, flat): 207 | return cls(flat) 208 | 209 | class MultiCategoricalPd(Pd): 210 | def __init__(self, low, high, flat): 211 | self.flat = flat 212 | self.low = tf.constant(low, dtype=tf.int32) 213 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 214 | def flatparam(self): 215 | return self.flat 216 | def mode(self): 217 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 218 | def logp(self, x): 219 | return tf.add_n([p.logp(px) for p, px in 
zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 220 | def kl(self, other): 221 | return tf.add_n([ 222 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 223 | ]) 224 | def entropy(self): 225 | return tf.add_n([p.entropy() for p in self.categoricals]) 226 | def sample(self): 227 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 228 | @classmethod 229 | def fromflat(cls, flat): 230 | return cls(flat) 231 | 232 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 233 | def __init__(self, low, high, flat): 234 | self.flat = flat 235 | self.low = tf.constant(low, dtype=tf.float32) 236 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 237 | def flatparam(self): 238 | return self.flat 239 | def mode(self): 240 | x = [] 241 | for i in range(len(self.categoricals)): 242 | x.append(self.low[i] + self.categoricals[i].mode()) 243 | return tf.concat(x, axis=-1) 244 | def logp(self, x): 245 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 246 | def kl(self, other): 247 | return tf.add_n([ 248 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 249 | ]) 250 | def entropy(self): 251 | return tf.add_n([p.entropy() for p in self.categoricals]) 252 | def sample(self): 253 | x = [] 254 | for i in range(len(self.categoricals)): 255 | x.append(self.low[i] + self.categoricals[i].sample()) 256 | return tf.concat(x, axis=-1) 257 | @classmethod 258 | def fromflat(cls, flat): 259 | return cls(flat) 260 | 261 | class DiagGaussianPd(Pd): 262 | def __init__(self, flat): 263 | self.flat = flat 264 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 265 | self.mean = mean 266 | self.logstd = logstd 267 | self.std = tf.exp(logstd) #e^(log(std)) 268 | def flatparam(self): 269 | return self.flat 270 | def mode(self): 271 | return self.mean 272 | def logp(self, x): 273 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 274 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 275 | - U.sum(self.logstd, axis=1) 276 | def kl(self, other): 277 | assert isinstance(other, DiagGaussianPd) 278 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 279 | def entropy(self): 280 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 281 | def sample(self): 282 | # tf.random_normal: Outputs random values from a normal distribution. 
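        # Added note: this is a reparameterized draw, sample = mean + std * eps with eps ~ N(0, I),
        # so continuous (Box) actions are sampled around the actor's predicted mean with a learned log-std.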
283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) # , self.mean, self.std 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | # 建立概率分布 311 | def make_pdtype(ac_space): 312 | from gym import spaces 313 | if isinstance(ac_space, spaces.Box): 314 | assert len(ac_space.shape) == 1 315 | return DiagGaussianPdType(ac_space.shape[0]) 316 | elif isinstance(ac_space, spaces.Discrete): 317 | # return CategoricalPdType(ac_space.n) 318 | return SoftCategoricalPdType(ac_space.n) 319 | elif isinstance(ac_space, spaces.MultiDiscrete): 320 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 321 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 322 | elif isinstance(ac_space, spaces.MultiBinary): 323 | return BernoulliPdType(ac_space.n) 324 | else: 325 | raise NotImplementedError 326 | 327 | def shape_el(v, i): 328 | maybe = v.get_shape()[i] 329 | if maybe is not None: 330 | return maybe 331 | else: 332 | return tf.shape(v)[i] 333 | -------------------------------------------------------------------------------- /maddpg/common/summary.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | 4 | class Summary: 5 | 6 | def __init__(self, session, dir_summary): 7 | self.__sess = session 8 | self.__vars = {} 9 | self.__ops = None 10 | self.__dir = dir_summary 11 | self.__writer = tf.summary.FileWriter(dir_summary, session.graph) 12 | 13 | def add_variable(self, var, name="name"): 14 | tf.summary.scalar(name, var) 15 | assert name not in self.__vars, "Already has " + name 16 | self.__vars[name] = var 17 | 18 | def build(self): 19 | self.__ops = tf.summary.merge_all() 20 | 21 | def run(self, feed_dict, step): 22 | feed_dict_final = {} 23 | for key, val in feed_dict.items(): 24 | feed_dict_final[self.__vars[key]] = val 25 | str_summary = self.__sess.run(self.__ops, feed_dict_final) 26 | self.__writer.add_summary(str_summary, step) 27 | self.__writer.flush() 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | import multiprocessing as mp 6 | 7 | 8 | def sum(x, axis=None, keepdims=False): 9 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims=keepdims) 10 | 11 | 12 | def mean(x, axis=None, keepdims=False): 13 | return tf.reduce_mean(x, 
axis=None if axis is None else [axis], keep_dims=keepdims) 14 | 15 | 16 | def var(x, axis=None, keepdims=False): 17 | meanx = mean(x, axis=axis, keepdims=keepdims) 18 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 19 | 20 | 21 | def std(x, axis=None, keepdims=False): 22 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 23 | 24 | 25 | def max(x, axis=None, keepdims=False): 26 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims=keepdims) 27 | 28 | 29 | def min(x, axis=None, keepdims=False): 30 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims=keepdims) 31 | 32 | 33 | def concatenate(arrs, axis=0): 34 | return tf.concat(axis=axis, values=arrs) 35 | 36 | 37 | def argmax(x, axis=None): 38 | return tf.argmax(x, axis=axis) 39 | 40 | 41 | def softmax(x, axis=None): 42 | return tf.nn.softmax(x, dim=axis) 43 | 44 | 45 | # ================================================================ 46 | # Misc 47 | # ================================================================ 48 | 49 | 50 | def is_placeholder(x): 51 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 52 | 53 | 54 | # ================================================================ 55 | # Inputs 56 | # ================================================================ 57 | 58 | 59 | class TfInput(object): 60 | def __init__(self, name="(unnamed)"): 61 | """Generalized Tensorflow placeholder. The main differences are: 62 | - possibly uses multiple placeholders internally and returns multiple values 63 | - can apply light postprocessing to the value feed to placeholder. 64 | """ 65 | self.name = name 66 | 67 | def get(self): 68 | """Return the tf variable(s) representing the possibly postprocessed value 69 | of placeholder(s). 70 | """ 71 | raise NotImplemented() 72 | 73 | def make_feed_dict(data): 74 | """Given data input it to the placeholder(s).""" 75 | raise NotImplemented() 76 | 77 | 78 | class PlacholderTfInput(TfInput): 79 | def __init__(self, placeholder): 80 | """Wrapper for regular tensorflow placeholder.""" 81 | super().__init__(placeholder.name) 82 | self._placeholder = placeholder 83 | 84 | def get(self): 85 | return self._placeholder 86 | 87 | def make_feed_dict(self, data): 88 | return {self._placeholder: data} 89 | 90 | 91 | class BatchInput(PlacholderTfInput): 92 | def __init__(self, shape, dtype=tf.float32, name=None): 93 | """Creates a placeholder for a batch of tensors of a given shape and dtype 94 | 95 | Parameters 96 | ---------- 97 | shape: [int] 98 | shape of a single elemenet of the batch 99 | dtype: tf.dtype 100 | number representation used for tensor contents 101 | name: str 102 | name of the underlying placeholder 103 | """ 104 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 105 | 106 | 107 | class Uint8Input(PlacholderTfInput): 108 | def __init__(self, shape, name=None): 109 | """Takes input in uint8 format which is cast to float32 and divided by 255 110 | before passing it to the model. 111 | 112 | On GPU this ensures lower data transfer times. 113 | 114 | Parameters 115 | ---------- 116 | shape: [int] 117 | shape of the tensor. 
118 |         name: str
119 |             name of the underlying placeholder
120 |         """
121 | 
122 |         super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
123 |         self._shape = shape
124 |         self._output = tf.cast(super().get(), tf.float32) / 255.0
125 | 
126 |     def get(self):
127 |         return self._output
128 | 
129 | 
130 | def ensure_tf_input(thing):
131 |     """Takes either tf.placeholder or TfInput and outputs equivalent TfInput"""
132 |     if isinstance(thing, TfInput):
133 |         return thing
134 |     elif is_placeholder(thing):
135 |         return PlacholderTfInput(thing)
136 |     else:
137 |         raise ValueError("Must be a placeholder or TfInput")
138 | 
139 | 
140 | # ================================================================
141 | # Mathematical utils
142 | # ================================================================
143 | 
144 | 
145 | def huber_loss(x, delta=1.0):
146 |     """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
147 |     return tf.where(
148 |         tf.abs(x) < delta,
149 |         tf.square(x) * 0.5,
150 |         delta * (tf.abs(x) - 0.5 * delta)
151 |     )
152 | 
153 | 
154 | # ================================================================
155 | # Optimizer utils
156 | # ================================================================
157 | 
158 | # Minimize the loss while clipping the gradient norm
159 | def minimize_and_clip(optimizer, objective, var_list, global_step, clip_val=10):
160 |     """Minimize `objective` using `optimizer` w.r.t. variables in
161 |     `var_list` while ensuring the norm of the gradients for each
162 |     variable is clipped to `clip_val`
163 |     """
164 |     if clip_val is None:
165 |         return optimizer.minimize(objective, var_list=var_list, global_step=global_step)
166 |     else:
167 |         gradients = optimizer.compute_gradients(objective, var_list=var_list)
168 |         for i, (grad, var) in enumerate(gradients):
169 |             if grad is not None:
170 |                 # Given a tensor grad and a maximum clip value clip_val,
171 |                 # this operation rescales grad so that its L2-norm is
172 |                 # less than or equal to clip_val
173 |                 gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
174 |         return optimizer.apply_gradients(gradients)
175 | 
176 | 
177 | # ================================================================
178 | # Global session
179 | # ================================================================
180 | 
181 | def get_session():
182 |     """Returns the most recently made Tensorflow session"""
183 |     return tf.get_default_session()
184 | 
185 | 
186 | def make_session(num_cpu):
187 |     """Returns a session that will use CPUs only"""
188 |     # Limit the CPU resources the session may use
189 |     tf_config = tf.ConfigProto(
190 |         inter_op_parallelism_threads=num_cpu,
191 |         intra_op_parallelism_threads=num_cpu
192 |     )
193 |     tf_config.gpu_options.allow_growth = True
194 |     return tf.Session(config=tf_config)
195 | 
196 | 
197 | def multi_threaded_session():
198 |     """Returns a session that uses multiple CPU threads"""
199 |     # TODO: many CPUs could be used here as well; this is the only multi-threaded part, so it is really a pseudo-distributed setup!
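    # Added note: the actor-worker environments created in train.py all share this single
    # in-process session, so the parallelism here comes from TensorFlow's inter-/intra-op
    # threads (10 below), not from separate worker processes.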
200 | return make_session(num_cpu=10) 201 | 202 | 203 | ALREADY_INITIALIZED = set() 204 | 205 | 206 | def initialize(): 207 | """Initialize all the uninitialized variables in the global scope.""" 208 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 209 | get_session().run(tf.variables_initializer(new_variables)) 210 | ALREADY_INITIALIZED.update(new_variables) 211 | 212 | 213 | # ================================================================ 214 | # Scopes 215 | # ================================================================ 216 | 217 | # 按照命名空间得到variables变量 218 | def scope_vars(scope, trainable_only=False): 219 | """ 220 | Get variables inside a scope 221 | The scope can be specified as a string 222 | 223 | Parameters 224 | ---------- 225 | scope: str or VariableScope 226 | scope in which the variables reside. 227 | trainable_only: bool 228 | whether or not to return only the variables that were marked as trainable. 229 | 230 | Returns 231 | ------- 232 | vars: [tf.Variable] 233 | list of variables in `scope`. 234 | """ 235 | return tf.get_collection( 236 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 237 | scope=scope if isinstance(scope, str) else scope.name 238 | ) 239 | 240 | 241 | def scope_name(): 242 | """Returns the name of current scope as a string, e.g. deepq/q_func""" 243 | return tf.get_variable_scope().name 244 | 245 | 246 | def absolute_scope_name(relative_scope_name): 247 | """Appends parent scope name to `relative_scope_name`""" 248 | return scope_name() + "/" + relative_scope_name 249 | 250 | 251 | # ================================================================ 252 | # Saving variables 253 | # ================================================================ 254 | 255 | 256 | def load_state(fname, saver=None): 257 | """Load all the variables to the current session from the location """ 258 | if saver is None: 259 | saver = tf.train.Saver() 260 | saver.restore(get_session(), fname) 261 | return saver 262 | 263 | 264 | def save_state(fname, saver=None): 265 | """Save all the variables in the current session to the location """ 266 | os.makedirs(os.path.dirname(fname), exist_ok=True) 267 | if saver is None: 268 | saver = tf.train.Saver() 269 | saver.save(get_session(), fname) 270 | return saver 271 | 272 | 273 | # ================================================================ 274 | # Theano-like Function 275 | # ================================================================ 276 | 277 | # 建立一个函数 278 | def function(inputs, outputs, updates=None, givens=None): 279 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 280 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 281 | values to be feed to the inputs placeholders and produces the values of the experessions 282 | in outputs. 283 | 284 | Input values can be passed in the same order as inputs or can be provided as kwargs based 285 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 
286 | 287 | Example: 288 | x = tf.placeholder(tf.int32, (), name="x") 289 | y = tf.placeholder(tf.int32, (), name="y") 290 | z = 3 * x + 2 * y 291 | lin = function([x, y], z, givens={y: 0}) 292 | 293 | with single_threaded_session(): 294 | initialize() 295 | 296 | assert lin(2) == 6 297 | assert lin(x=3) == 9 298 | assert lin(2, 2) == 10 299 | assert lin(x=2, y=3) == 12 300 | 301 | Parameters 302 | ---------- 303 | inputs: [tf.placeholder or TfInput] 304 | list of input arguments 305 | outputs: [tf.Variable] or tf.Variable 306 | list of outputs or a single output to be returned from function. Returned 307 | value will also have the same shape. 308 | """ 309 | if isinstance(outputs, list): 310 | return _Function(inputs, outputs, updates, givens=givens) 311 | elif isinstance(outputs, (dict, collections.OrderedDict)): 312 | f = _Function(inputs, outputs.values(), updates, givens=givens) 313 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 314 | else: 315 | f = _Function(inputs, [outputs], updates, givens=givens) 316 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 317 | 318 | 319 | class _Function(object): 320 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 321 | for inpt in inputs: 322 | if not issubclass(type(inpt), TfInput): 323 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 324 | self.inputs = inputs 325 | updates = updates or [] 326 | # tf.group() 327 | # Create an op that groups multiple operations. 328 | # When this op finishes, all ops in inputs have finished. 329 | # This op has no output. 330 | self.update_group = tf.group(*updates) 331 | self.outputs_update = list(outputs) + [self.update_group] 332 | self.givens = {} if givens is None else givens 333 | self.check_nan = check_nan 334 | 335 | def _feed_input(self, feed_dict, inpt, value): 336 | if issubclass(type(inpt), TfInput): 337 | feed_dict.update(inpt.make_feed_dict(value)) 338 | elif is_placeholder(inpt): 339 | feed_dict[inpt] = value 340 | 341 | def __call__(self, *args, **kwargs): 342 | assert len(args) <= len(self.inputs), "Too many arguments provided" 343 | feed_dict = {} 344 | # Update the args 345 | for inpt, value in zip(self.inputs, args): 346 | self._feed_input(feed_dict, inpt, value) 347 | # Update the kwargs 348 | kwargs_passed_inpt_names = set() 349 | for inpt in self.inputs[len(args):]: 350 | inpt_name = inpt.name.split(':')[0] 351 | inpt_name = inpt_name.split('/')[-1] 352 | assert inpt_name not in kwargs_passed_inpt_names, \ 353 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 354 | if inpt_name in kwargs: 355 | kwargs_passed_inpt_names.add(inpt_name) 356 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 357 | else: 358 | assert inpt in self.givens, "Missing argument " + inpt_name 359 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 360 | # Update feed dict with givens. 
361 | for inpt in self.givens: 362 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 363 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 364 | if self.check_nan: 365 | if any(np.isnan(r).any() for r in results): 366 | raise RuntimeError("Nan detected") 367 | return results 368 | -------------------------------------------------------------------------------- /maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.prioritized_rb.replay_buffer import ReplayBuffer 9 | import tensorflow.contrib.layers as layers 10 | 11 | 12 | def discount_with_dones(rewards, dones, gamma): 13 | discounted = [] 14 | r = 0 15 | for reward, done in zip(rewards[::-1], dones[::-1]): 16 | r = reward + gamma * r 17 | r = r * (1. - done) 18 | discounted.append(r) 19 | return discounted[::-1] 20 | 21 | 22 | def make_update_exp(vals, target_vals): 23 | polyak = 1.0 - 1e-2 24 | expression = [] 25 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 26 | # update target network parameters (once) 27 | expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var)) 28 | expression = tf.group(*expression) 29 | return U.function([], [], updates=[expression]) 30 | 31 | 32 | def CNN(state_input, reuse=tf.AUTO_REUSE, scope='CNN'): 33 | with tf.variable_scope(scope, reuse=reuse): 34 | state = tf.layers.conv2d(state_input, 16, 3, activation='relu', strides=2, padding='VALID') 35 | state = tf.layers.conv2d(state, 32, 3, activation='relu', strides=2, padding='VALID') 36 | state = tf.layers.conv2d(state, 64, 3, activation='relu', strides=2, padding='VALID') 37 | temp = 64 * 9 * 9 38 | 39 | state = tf.layers.batch_normalization(state) 40 | input_1 = tf.reshape(state, [-1]) 41 | input_s = tf.reshape(input_1, [-1, temp]) 42 | return input_s 43 | 44 | 45 | # TODO: RNN!!! 
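# Added note: CNN above pushes an 80x80x3 observation through three stride-2 VALID convolutions
# (80 -> 39 -> 19 -> 9 spatially), so the flattened feature vector has 9*9*64 = 5184 entries
# (the `temp` constant). RNN below runs a layer-normalized LSTM over a time-major sequence of
# such CNN features, shaped [rnn_length, batch, 5184], and returns only the last time step's output.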
46 | def RNN(state_input, reuse=tf.AUTO_REUSE, scope='RNN', cell_size=None, initial_state=None): 47 | with tf.variable_scope(scope, reuse=reuse): 48 | rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=cell_size, 49 | layer_norm=True, norm_gain=1.0, norm_shift=0.0, 50 | dropout_keep_prob=0.75, dropout_prob_seed=None) 51 | outputs, final_state = tf.nn.dynamic_rnn( 52 | cell=rnn_cell, inputs=state_input, initial_state=initial_state, time_major=True, dtype=tf.float32) 53 | cell_out = outputs[-1, :, :] 54 | return cell_out, final_state 55 | 56 | 57 | # 多层感知机 Actor/Critic-Net 互相独立 58 | # TODO:输出加了tanh确实不会梯度爆炸,但是收敛效果变得不是很好 59 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, ac_fn=None): 60 | # This model takes as input an observation and returns values of all actions 61 | with tf.variable_scope(scope, reuse=reuse): 62 | out = input 63 | 64 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu, 65 | weights_regularizer=tf.contrib.layers.l2_regularizer(1e-2), 66 | biases_regularizer=tf.contrib.layers.l2_regularizer(1e-2)) 67 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu, 68 | weights_regularizer=tf.contrib.layers.l2_regularizer(1e-2), 69 | biases_regularizer=tf.contrib.layers.l2_regularizer(1e-2)) 70 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None, 71 | weights_regularizer=tf.contrib.layers.l2_regularizer(1e-2), 72 | biases_regularizer=tf.contrib.layers.l2_regularizer(1e-2)) 73 | return out 74 | 75 | 76 | # actor 77 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,global_step, grad_norm_clipping=None, local_q_func=False, 78 | num_units=64, scope="trainer", reuse=None, args=None): 79 | with tf.variable_scope(scope, reuse=reuse): 80 | # create distribtuions - DiagGaussian 81 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 82 | # make_pdtype(box(2,))->DiagGaussianPdType(2) 83 | 84 | # set up placeholders 85 | obs_ph_n = make_obs_ph_n # n * [None,80,80,3] 86 | # n * [None, 3] 87 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))] 88 | 89 | # actor output shape(p-shape):[None,2*3] 2:mean,std 3:one_uav_action_n 90 | # add cnn for actor here! p_input=CNN(obs_ph_n[p_index]) 91 | if args.rnn_length > 0: 92 | cnn_output = tf.reshape(CNN(state_input=obs_ph_n[p_index], scope='p_func'), 93 | [args.rnn_length, -1, 64 * 9 * 9]) 94 | p_input, _ = RNN(state_input=cnn_output, scope='p_func', cell_size=args.rnn_cell_size) 95 | else: 96 | p_input = CNN(obs_ph_n[p_index], scope='p_func') 97 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units, 98 | ac_fn=tf.nn.tanh) # TODO:actor的输出加tanh,避免爆炸 99 | 100 | # 提取CNN+BATCH_NORMALIZATION+MLP里面的参数 101 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 102 | 103 | # wrap parameters in distribution 104 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 105 | # PdType.pdfromflat(p) => DGPT.pdclass()(p) =>DiagGaussianPd(p) 106 | # shape of p [None, 4] => mean:[None, 2] std:[None, 2] 107 | 108 | act_sample = act_pd.sample() # action == mean + std * tf.random_normal 109 | mean, logstd = act_pd.mean, act_pd.logstd 110 | # act_pd.flatparam() === p 111 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 112 | 113 | act_input_n = act_ph_n + [] 114 | act_input_n[p_index] = act_pd.sample() # 每个agent更新自己的action,跑多次这个函数最后会形成新的action_input_n 115 | 116 | # add cnn for critic here! 
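        # Added note: every agent's observation placeholder is encoded by the CNN(+RNN) that lives
        # in the shared 'q_func' scope, so the critic input assembled below concatenates all agents'
        # encoded observations with all agents' actions (unless local_q_func keeps only this agent's).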
117 | cnn_obs_ph_n = [] 118 | for obs_ph in obs_ph_n: 119 | # rnn 120 | if args.rnn_length > 0: 121 | cnn_output = tf.reshape(CNN(state_input=obs_ph, scope='q_func'), [args.rnn_length, -1, 64 * 9 * 9]) 122 | cell_out, _ = RNN(state_input=cnn_output, scope='q_func', cell_size=args.rnn_cell_size) 123 | cnn_obs_ph_n.append(cell_out) 124 | else: 125 | cnn_obs_ph_n.append(CNN(state_input=obs_ph, scope='q_func')) 126 | 127 | q_input = tf.concat(cnn_obs_ph_n + act_input_n, 1) 128 | if local_q_func: 129 | q_input = tf.concat([cnn_obs_ph_n[p_index], act_input_n[p_index]], 1) 130 | # reuse=True, the same critic. 会使用q_func空间里的critic,来更新actor的loss 131 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] 132 | pg_loss = -tf.reduce_mean(q) 133 | 134 | loss = pg_loss + p_reg * 1e-3 135 | 136 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,global_step, grad_norm_clipping) 137 | 138 | # Create callable functions 139 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 140 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) # 单个agent的evaluate-actor动作输出函数 141 | p_values = U.function([obs_ph_n[p_index]], p) 142 | 143 | # target network 144 | # add cnn for actor here! p_target_input=CNN(obs_ph_n[p_index]) 145 | if args.rnn_length > 0: 146 | cnn_output = tf.reshape(CNN(state_input=obs_ph_n[p_index], scope='target_p_func'), 147 | [args.rnn_length, -1, 64 * 9 * 9]) 148 | p_target_input, _ = RNN(state_input=cnn_output,scope='target_p_func',cell_size=args.rnn_cell_size) 149 | else: 150 | p_target_input = CNN(obs_ph_n[p_index], scope='target_p_func') 151 | 152 | target_p = p_func(p_target_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", 153 | num_units=num_units, ac_fn=tf.nn.tanh) 154 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 155 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) # 更新target network 156 | 157 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 158 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) # 单个agent的target-actor动作输出函数 159 | 160 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 161 | 162 | 163 | # critic 164 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, global_step, grad_norm_clipping=None, local_q_func=False, 165 | scope="trainer", reuse=None, num_units=64, args=None): 166 | with tf.variable_scope(scope, reuse=reuse): 167 | # create distribtuions (n * action_n) 168 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # dialog 高斯分布 169 | 170 | # set up placeholders 171 | obs_ph_n = make_obs_ph_n # n * [None,80,80,3] 172 | 173 | # add CNN for critic here. 
cnn_obs_ph_n=CNN(obs_ph_n) 174 | cnn_obs_ph_n = [] # n * [None,5184] 175 | for obs_ph in obs_ph_n: 176 | # rnn 177 | if args.rnn_length > 0: 178 | cnn_output = tf.reshape(CNN(state_input=obs_ph, scope='q_func'), [args.rnn_length, -1, 64 * 9 * 9]) 179 | cell_out, _ = RNN(state_input=cnn_output, scope='q_func', cell_size=args.rnn_cell_size) 180 | cnn_obs_ph_n.append(cell_out) 181 | else: 182 | cnn_obs_ph_n.append(CNN(state_input=obs_ph, scope='q_func')) 183 | 184 | # multi-state-placeholder(num_agents) 185 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))] 186 | target_ph = tf.placeholder(tf.float32, [None], name="target") 187 | 188 | # q_input = n * (s cat a) shape:[None,n*(5184+3)] 189 | q_input = tf.concat(cnn_obs_ph_n + act_ph_n, 1) 190 | if local_q_func: # false 191 | q_input = tf.concat([cnn_obs_ph_n[q_index], act_ph_n[q_index]], 1) 192 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] # MLP 输出值,增加一维 shape=[None,0] 193 | 194 | # 得到训练所需要的所有(在scope命名空间内)的variable变量(weight/bias/batch_normalization) 195 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 196 | 197 | q_square = tf.square(q - target_ph) 198 | q_loss = tf.reduce_mean(q_square) # Square loss (from batch to one) 199 | 200 | # viscosity solution to Bellman differential equation in place of an initial condition 201 | q_reg = tf.reduce_mean(tf.square(q)) 202 | loss = q_loss # + 1e-3 * q_reg 203 | 204 | # optimizer(Adam) 205 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,global_step,grad_norm_clipping) 206 | 207 | # Create callable functions 建立了一个pipeline,方便后续训练 208 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=[loss, q_square, q, q_input], 209 | updates=[optimize_expr]) 210 | q_values = U.function(obs_ph_n + act_ph_n, q) 211 | 212 | # add CNN for critic here. 
tcnn_obs_ph_n=CNN(obs_ph_n) 213 | tcnn_obs_ph_n = [] # n * [None,5184] for target 214 | for obs_ph in obs_ph_n: 215 | # rnn 216 | if args.rnn_length > 0: 217 | cnn_output = tf.reshape(CNN(state_input=obs_ph, scope='target_q_func'), 218 | [args.rnn_length, -1, 64 * 9 * 9]) 219 | cell_out, _ = RNN(state_input=cnn_output, scope='target_q_func', cell_size=args.rnn_cell_size) 220 | tcnn_obs_ph_n.append(cell_out) 221 | else: 222 | tcnn_obs_ph_n.append(CNN(state_input=obs_ph, scope='target_q_func')) 223 | 224 | q_target_input = tf.concat(tcnn_obs_ph_n + act_ph_n, 1) 225 | if local_q_func: # false 226 | q_target_input = tf.concat([tcnn_obs_ph_n[q_index], act_ph_n[q_index]], 1) 227 | 228 | # target network 229 | target_q = q_func(q_target_input, 1, scope="target_q_func", num_units=num_units)[:, 0] 230 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 231 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 232 | 233 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 234 | 235 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 236 | 237 | 238 | class MADDPGAgentTrainer(AgentTrainer): # 按照agent_index挨个建立trainer 239 | def __init__(self, name, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 240 | self.name = name 241 | self.n = len(obs_shape_n) # 2 242 | self.agent_index = agent_index 243 | self.args = args 244 | 245 | # TODO:加一个自适应学习率衰减(有很多tricks) 246 | self.global_train_step = tf.Variable(tf.constant(0.0), trainable=False) 247 | self.decey_lr = tf.train.exponential_decay(learning_rate=self.args.lr, global_step=self.global_train_step, 248 | decay_steps=100, decay_rate=self.args.decay_rate, staircase=True) 249 | # multi-state-placeholder(num_agents) 250 | obs_ph_n = [] 251 | for i in range(self.n): 252 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) 253 | 254 | # Create all the functions necessary to train the model 255 | 256 | # critic 257 | # q_train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=[loss, q_square, q, q_input], 258 | # updates=[optimize_expr]) 259 | # q_update = make_update_exp(q_func_vars, target_q_func_vars) 260 | # q_values = U.function(obs_ph_n + act_ph_n, q) 261 | # target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 262 | # self.q_debug={'q_values': q_values, 'target_q_values': target_q_values} 263 | self.q_train, self.q_update, self.q_debug = q_train( 264 | scope=self.name, 265 | make_obs_ph_n=obs_ph_n, 266 | act_space_n=act_space_n, 267 | q_index=agent_index, 268 | q_func=mlp_model, # mlp 269 | optimizer=tf.train.AdamOptimizer(learning_rate=self.decey_lr), #args.lr 270 | grad_norm_clipping=0.5, 271 | local_q_func=local_q_func, # false 272 | num_units=args.num_units, # 600 273 | args=args, 274 | global_step=self.global_train_step 275 | ) 276 | 277 | # actor 278 | # self.act 算的是第agent_index个agent的action_sample! 
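        # Added note: p_train below builds this agent's actor (CNN/RNN encoder + mlp_model head) and
        # reuses the critic from the 'q_func' scope for the policy-gradient loss. Both optimizers use
        # self.decey_lr, an exponential decay of args.lr every 100 steps of global_train_step; as
        # written, minimize_and_clip only passes global_step when clip_val is None, so with
        # grad_norm_clipping=0.5 the decay counter does not appear to advance.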
279 | # self.p-_debug={'p_values': p_values, 'target_act': target_act} 280 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 281 | scope=self.name, 282 | make_obs_ph_n=obs_ph_n, 283 | act_space_n=act_space_n, 284 | p_index=agent_index, 285 | p_func=mlp_model, # mlp 286 | q_func=mlp_model, # mlp 287 | optimizer=tf.train.AdamOptimizer(learning_rate=self.decey_lr), 288 | grad_norm_clipping=0.5, 289 | local_q_func=local_q_func, # false 290 | num_units=args.num_units, # 600 291 | args=args, 292 | global_step = self.global_train_step 293 | ) 294 | 295 | # Create experience buffer 296 | self.buffer_size = self.args.buffer_size # 1e6 297 | self.beta = self.args.beta 298 | self.replay_buffer = ReplayBuffer(int(self.buffer_size), int(self.args.batch_size), self.args.alpha, 299 | self.args.epsilon) 300 | self.replay_sample_index = None 301 | 302 | @property 303 | def filled_size(self): 304 | return len(self.replay_buffer) 305 | 306 | def action(self, obs): 307 | actor_output = self.act(obs)[0] 308 | return actor_output 309 | 310 | def experience(self, obs, act, rew, new_obs, done, terminal, num_actor_workers): 311 | # Store transition in the replay buffer. 312 | self.replay_buffer.add(obs, act, rew, new_obs, float(done), self.args.N, self.args.gamma, num_actor_workers) 313 | 314 | def preupdate(self): 315 | self.replay_sample_index = None 316 | 317 | def update(self, env, agents, t): 318 | # replay buffer is not large enough 没填满的时候不训练 319 | # if len(self.replay_buffer) < 10000: 320 | if len(self.replay_buffer) < 100 * self.args.batch_size: 321 | return [0] 322 | if not t % 10 == 0: # only update every 10 steps 323 | return [0] 324 | 325 | # 随着训练的进行,让β从某个小于1的值渐进地靠近1 326 | if self.beta < 1.: 327 | self.beta *= 1. + 1e-4 328 | 329 | # sample from one agent(batch:1024) 之后根据β算出来的weights没有用到呢!!! 
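        # Added note: the prioritized buffer returns (transitions, importance weights, priorities,
        # indices); beta is annealed toward 1 above, but the importance-sampling weights are only
        # written to the debug file further down and never multiply the TD loss inside q_train.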
330 | (obs, act, rew, obs_next, done), weights, priorities, self.replay_sample_index = self.replay_buffer.sample( 331 | self.args.batch_size, self.beta, self.args.num_actor_workers, self.args.rnn_length) # batch-size=1024 332 | 333 | # collect replay sample from all agents 334 | obs_n = [] 335 | obs_next_n = [] 336 | act_n = [] 337 | 338 | index = self.replay_sample_index # index数组 339 | for i in range(self.n): 340 | obs_, _, rew_, obs_next_, _ = agents[i].replay_buffer.sample_index(index, self.args.num_actor_workers, 341 | self.args.rnn_length) 342 | _, act_, _, _, done_ = agents[i].replay_buffer.sample_index(index, 0, 0) 343 | 344 | if self.args.rnn_length > 0: 345 | obs_ = obs_.transpose((1, 0, 2, 3, 4)) 346 | obs_next_ = obs_next_.transpose((1, 0, 2, 3, 4)) 347 | obs_shape = obs_.shape 348 | obs_ = obs_.reshape(-1, obs_shape[-3], obs_shape[-2], obs_shape[-1]) 349 | obs_next_ = obs_next_.reshape(-1, obs_shape[-3], obs_shape[-2], obs_shape[-1]) 350 | 351 | obs_n.append(obs_) 352 | obs_next_n.append(obs_next_) 353 | act_n.append(act_) 354 | 355 | # train q network 356 | num_sample = 1 357 | target_q = 0.0 358 | 359 | # TODO: 在target network里面采用兼顾过去和未来的一长段RNN 计算Qt+n 360 | # use functions defined (batch:1024) 361 | for i in range(num_sample): 362 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 363 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 364 | target_q += rew + self.args.gamma ** self.args.N * (1.0 - done) * target_q_next # N-step(N=5) 365 | target_q /= num_sample 366 | 367 | [q_loss, q_td, Q, q_input] = self.q_train(*(obs_n + act_n + [target_q])) 368 | 369 | debug_dir = env.log_dir + self.args.debug_dir 370 | if os.path.exists(debug_dir) is False: 371 | os.makedirs(debug_dir) 372 | with open(debug_dir + "current_step_information_{}.txt".format(self.name), 'w+') as file: 373 | for i, r, p, q, w in zip(index, rew, priorities, Q, weights): 374 | print(self.name, " current_global_step: ", t, "-----index: ", i, " reward(n-step): ", r, " priority: ", 375 | p, " Q: ", q, " Wi: ", w, file=file) 376 | 377 | # priority replay buffer update (use TD-error) 378 | values = np.fabs(q_td) 379 | self.replay_buffer.priority_update(self.replay_sample_index, values) 380 | 381 | # train p network 382 | p_loss = self.p_train(*(obs_n + act_n)) 383 | 384 | self.p_update() # actor update 385 | self.q_update() # critic update 386 | 387 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 388 | -------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/maddpg/trainer/prioritized_rb/__init__.py -------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/proportional.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | from . import sum_tree 5 | 6 | 7 | class Experience(object): 8 | """ The class represents prioritized experience replay buffer. 9 | The class has functions: store samples, pick samples with 10 | probability in proportion to sample's priority, update 11 | each sample's priority, reset alpha. 12 | see https://arxiv.org/pdf/1511.05952.pdf . 
13 | """ 14 | 15 | def __init__(self, memory_size, batch_size, alpha): 16 | """ Prioritized experience replay buffer initialization. 17 | 18 | Parameters 19 | ---------- 20 | memory_size : int 21 | sample size to be stored 22 | batch_size : int 23 | batch size to be selected by `select` method 24 | alpha: float 25 | exponent determine how much prioritization. 26 | Prob_i \sim priority_i**alpha/sum(priority**alpha) 27 | """ 28 | self.tree = sum_tree.SumTree(memory_size) 29 | self.memory_size = memory_size 30 | self.batch_size = batch_size 31 | self.alpha = alpha 32 | 33 | def add(self, data, priority): 34 | """ Add new sample. 35 | 36 | Parameters 37 | ---------- 38 | data : object 39 | new sample 40 | priority : float 41 | sample's priority 42 | """ 43 | self.tree.add(data, priority ** self.alpha) 44 | 45 | def n_step(self, n, r_position, x_position, gamma,num_actor_workers): 46 | current_index = self.tree.cursor - 1 47 | current_value = self.tree.data[current_index][r_position] # reward 48 | begin_index = current_index 49 | for i in range(1, n, 1): 50 | index = begin_index - i*num_actor_workers 51 | if index < 0 and index + self.tree.filled_size() <= current_index: 52 | break 53 | i_gamma = np.power(gamma, i) 54 | self.tree.data[index][r_position] += i_gamma * current_value # n-step 的处理,加了4个真实的reward 55 | if self.tree.filled_size() >= n: 56 | n_step_back = current_index - n 57 | if n_step_back < 0 and n_step_back + self.tree.filled_size() <= current_index: 58 | return 59 | self.tree.data[n_step_back][x_position] = copy.deepcopy(self.tree.data[current_index][x_position]) 60 | 61 | def select(self, beta,num_actor_workers,rnn_length): 62 | """ The method return samples randomly. 63 | 64 | Parameters 65 | ---------- 66 | beta : float 67 | 68 | Returns 69 | ------- 70 | out : 71 | list of samples 72 | weights: 73 | list of weight 74 | indices: 75 | list of sample indices 76 | The indices indicate sample positions in a sum tree. 77 | """ 78 | 79 | if self.tree.filled_size() < self.batch_size: 80 | return None, None, None 81 | ranges = np.linspace(0, self.tree.tree[0], num=self.batch_size + 1) 82 | out = [] 83 | indices = [] 84 | weights = [] 85 | priorities = [] 86 | for i in range(self.batch_size): 87 | while True: 88 | r = random.uniform(ranges[i], ranges[i+1]) 89 | data, priority, index = self.tree.find(r, norm=False) 90 | if index < (rnn_length-1)*num_actor_workers: 91 | index += (rnn_length-1)*num_actor_workers 92 | data=self.tree.data[index] 93 | priority=self.tree.tree[index + (2 ** (self.tree.tree_level - 1) - 1)] 94 | if data is not None: 95 | break 96 | priorities.append(priority) 97 | weights.append((1. / self.memory_size / priority) ** beta if priority > 1e-16 else 0) 98 | indices.append(index) 99 | out.append(data) 100 | 101 | weights = list(np.array(weights) / max(weights)) # Normalize for stability 102 | 103 | return out, weights,priorities, indices 104 | 105 | def priority_update(self, indices, priorities): 106 | """ The methods update samples's priority. 107 | 108 | Parameters 109 | ---------- 110 | indices : 111 | list of sample indices 112 | """ 113 | for i, p in zip(indices, priorities): 114 | self.tree.val_update(i, p ** self.alpha) 115 | 116 | def reset_alpha(self, alpha): 117 | """ Reset a exponent alpha. 
118 | Parameters 119 | ---------- 120 | alpha : float 121 | """ 122 | self.alpha, old_alpha = alpha, self.alpha 123 | priorities = [self.tree.get_val(i) ** -old_alpha for i in range(self.tree.filled_size())] 124 | self.priority_update(range(self.tree.filled_size()), priorities) 125 | 126 | 127 | 128 | 129 |
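Before the wrapper in replay_buffer.py below, a brief note on how `Experience` is driven end to end: transitions are added with an initial priority, `select` draws a batch proportionally to priority and returns importance-sampling weights w_i proportional to (1 / (N * P(i)))**beta normalized by their maximum, and `priority_update` writes the new, TD-error-based priorities back. A small illustrative sketch; the toy transitions and hyper-parameter values are made up, not taken from the experiments.

```python
import numpy as np
from maddpg.trainer.prioritized_rb.proportional import Experience

buf = Experience(memory_size=1024, batch_size=4, alpha=0.6)

# Store toy transitions [obs, act, rew, obs_next, done] with a constant initial priority.
for t in range(32):
    buf.add([np.zeros(3), np.zeros(2), float(t), np.zeros(3), False], priority=1.0)

# Draw a batch proportionally to priority; beta controls the importance-sampling correction.
samples, weights, priorities, indices = buf.select(beta=0.4, num_actor_workers=1, rnn_length=0)

# After computing TD errors elsewhere, push their magnitudes back as the new priorities.
buf.priority_update(indices, [abs(td) for td in np.random.randn(len(indices))])
```
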
-------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from .proportional import Experience 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size, batch_size, alpha, epsilon): 7 | """Create Prioritized Replay buffer. 8 | 9 | Parameters 10 | ---------- 11 | size: int 12 | Max number of transitions to store in the buffer. When the buffer 13 | overflows the old memories are dropped. 14 | """ 15 | self.rb = Experience(size, batch_size, alpha) 16 | self.epsilon = epsilon 17 | # self._storage = [] 18 | # self._maxsize = int(size) 19 | # self._next_idx = 0 20 | 21 | def __len__(self): 22 | # return len(self._storage) 23 | return self.rb.tree.filled_size() 24 | 25 | def clear(self): 26 | # self._storage = [] 27 | # self._next_idx = 0 28 | self.rb = Experience(self.rb.memory_size, self.rb.batch_size, self.rb.alpha) 29 | 30 | def add(self, obs_t, action, reward, obs_tp1, done, n, gamma, num_actor_workers): 31 | data = [obs_t, action, reward, obs_tp1, done] 32 | priority = self.rb.tree.max_value + self.epsilon 33 | self.rb.add(data, priority) 34 | reward_index = 2 # the reward is stored at position 2 of data 35 | x__index = 3 # the next state is stored at position 3 of data 36 | 37 | # TODO: Prof. Liu wants to explore improvements to this n-step handling 38 | self.rb.n_step(n, reward_index, x__index, gamma, num_actor_workers) 39 | 40 | def _encode_sample(self, data): 41 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 42 | for i in data: 43 | # data = self.rb.tree.data[i] 44 | obs_t, action, reward, obs_tp1, done = i 45 | obses_t.append(np.array(obs_t, copy=False)) 46 | actions.append(np.array(action, copy=False)) 47 | rewards.append(reward) 48 | obses_tp1.append(np.array(obs_tp1, copy=False)) 49 | dones.append(done) 50 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 51 | 52 | def _encode_sample_index(self, index, num_actor_workers, rnn_length): 53 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 54 | for i in index: 55 | if rnn_length > 0: 56 | obs_t_rnn, action_rnn, reward_rnn, obs_tp1_rnn, done_rnn = [], [], [], [], [] 57 | for j in range(rnn_length - 1, -1, -1): 58 | pindex = i - j * num_actor_workers 59 | obs_t, action, reward, obs_tp1, done = self.rb.tree.data[pindex] 60 | 61 | obs_t_rnn.append(np.array(obs_t)) 62 | action_rnn.append(np.array(action)) 63 | reward_rnn.append(np.array(reward)) 64 | obs_tp1_rnn.append(np.array(obs_tp1)) 65 | done_rnn.append(np.array(done)) 66 | obses_t.append(np.array(obs_t_rnn, copy=False)) 67 | actions.append(np.array(action_rnn, copy=False)) 68 | rewards.append(reward_rnn) 69 | obses_tp1.append(np.array(obs_tp1_rnn, copy=False)) 70 | dones.append(done_rnn) 71 | 72 | else: 73 | obs_t, action, reward, obs_tp1, done = self.rb.tree.data[i] 74 | obses_t.append(np.array(obs_t, copy=False)) 75 | actions.append(np.array(action, copy=False)) 76 | rewards.append(reward) 77 | obses_tp1.append(np.array(obs_tp1, copy=False)) 78 | dones.append(done) 79 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 80 | 81 | # def make_index(self, batch_size): 82 | # return [random.randint(0, self.rb.tree.filled_size() - 1) for _ in range(batch_size)] 83 | # 84 | # def make_latest_index(self, batch_size): 85 | # idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 86 | # np.random.shuffle(idx) 87 | # return idx 88 | 89 | def sample_index(self, idxes, num_actor_workers, rnn_length): 90 | return self._encode_sample_index(idxes, num_actor_workers, rnn_length) 91 | 92 | def sample(self, batch_size, beta, num_actor_workers, rnn_length): 93 | """Sample a batch of experiences. 94 | 95 | Parameters 96 | ---------- 97 | batch_size: int 98 | How many transitions to sample. 99 | 100 | Returns 101 | ------- 102 | obs_batch: np.array 103 | batch of observations 104 | act_batch: np.array 105 | batch of actions executed given obs_batch 106 | rew_batch: np.array 107 | rewards received as results of executing act_batch 108 | next_obs_batch: np.array 109 | next set of observations seen after executing act_batch 110 | done_mask: np.array 111 | done_mask[i] = 1 if executing act_batch[i] resulted in 112 | the end of an episode and 0 otherwise. 113 | """ 114 | data, weight, priorities, indices = self.rb.select(beta, num_actor_workers, rnn_length) 115 | return self._encode_sample(data), weight, priorities, indices 116 | 117 | def priority_update(self, indices, priorities): 118 | priorities = list(np.array(priorities) + self.epsilon) 119 | self.rb.priority_update(indices=indices, priorities=priorities) 120 | 121 | def reset_alpha(self, alpha): 122 | self.rb.reset_alpha(alpha) 123 | 124 | # def collect(self): 125 | # return self.sample(-1) 126 | -------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/sum_tree.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import math 4 | 5 | 6 | class SumTree(object): 7 | def __init__(self, max_size): 8 | self.max_size = max_size 9 | self.tree_level = math.ceil(math.log(max_size + 1, 2)) + 1 10 | self.tree_size = 2 ** self.tree_level - 1 11 | self.tree = [0 for i in range(self.tree_size)] 12 | self.max_value = 0 13 | self.data = [None for i in range(self.max_size)] 14 | self.size = 0 15 | self.cursor = 0 16 | 17 | def add(self, contents, value): 18 | if value > self.max_value: 19 | self.max_value = value 20 | index = self.cursor 21 | self.cursor = (self.cursor + 1) % self.max_size # wrap around and overwrite the oldest entries once the buffer is full 22 | self.size = min(self.size + 1, self.max_size) 23 | 24 | self.data[index] = contents 25 | self.val_update(index, value) 26 | 27 | def get_val(self, index): 28 | tree_index = 2 ** (self.tree_level - 1) - 1 + index 29 | return self.tree[tree_index] 30 | 31 | def val_update(self, index, value): 32 | if value > self.max_value: 33 | self.max_value = value 34 | tree_index = 2 ** (self.tree_level - 1) - 1 + index 35 | diff = value - self.tree[tree_index] 36 | self.reconstruct(tree_index, diff) 37 | 38 | def reconstruct(self, tindex, diff): 39 | self.tree[tindex] += diff 40 | if not tindex == 0: 41 | tindex = int((tindex - 1) / 2) 42 | self.reconstruct(tindex, diff) 43 | 44 | def find(self, value, norm=True): 45 | if norm: 46 | value *= self.tree[0] 47 | return self._find(value, 0) 48 | 49 | def _find(self, value, index): 50 | if 2 ** (self.tree_level - 1) - 1 <= index: 51 | return self.data[index - (2 ** (self.tree_level - 1) - 1)], self.tree[index], index - ( 52 | 2 ** (self.tree_level - 1) - 1) 53 | 54 | left = self.tree[2 * index + 1] 55 | 56 | if value <= left: 57 | return self._find(value, 2 * index + 1) 58 | else: 59
| return self._find(value - left, 2 * (index + 1)) 60 | 61 | def print_tree(self): 62 | for k in range(1, self.tree_level + 1): 63 | for j in range(2 ** (k - 1) - 1, 2 ** k - 1): 64 | print(self.tree[j], end=' ') 65 | print() 66 | 67 | def filled_size(self): 68 | return self.size 69 | 70 | 71 | if __name__ == '__main__': 72 | s = SumTree(20) 73 | for i in range(20): 74 | s.add(2 ** i, i) 75 | s.print_tree() 76 | print(s.find(0.5)) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | numpy 4 | scipy 5 | seaborn 6 | tqdm 7 | colorcet 8 | panel 9 | pyviz-comms 10 | pandas 11 | tensorboard 12 | --------------------------------------------------------------------------------
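Putting the three pieces above together (SumTree for storage, Experience for proportional sampling, ReplayBuffer as the n-step wrapper the trainer talks to), a minimal end-to-end sketch of the intended call pattern is shown below; the toy shapes and hyper-parameter values are illustrative only and are not taken from the training scripts.

```python
import numpy as np
from maddpg.trainer.prioritized_rb.replay_buffer import ReplayBuffer

rb = ReplayBuffer(size=1024, batch_size=4, alpha=0.6, epsilon=1e-6)

# Store toy transitions; n and gamma drive the in-buffer n-step reward accumulation.
for t in range(64):
    rb.add(obs_t=np.zeros(3), action=np.zeros(2), reward=1.0, obs_tp1=np.zeros(3),
           done=False, n=5, gamma=0.95, num_actor_workers=1)

# Proportional sampling with importance weights, as consumed by the trainer's update().
(obs, act, rew, obs_next, done), weights, priorities, indices = rb.sample(
    batch_size=4, beta=0.4, num_actor_workers=1, rnn_length=0)

# Feed the absolute TD errors back so future sampling tracks the latest critic.
rb.priority_update(indices, np.abs(np.random.randn(len(indices))))
```
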