├── .gitignore ├── LICENSE ├── README.md ├── experiments ├── .csv ├── TSP.py ├── compare.py ├── crazy_env │ ├── __init__.py │ ├── data_collection3-0604.py │ ├── data_collection4.py │ ├── env_setting3.py │ ├── log3.py │ ├── tsp_data_collection.py │ └── tsp_env_setting.py ├── env0 │ ├── __init__.py │ ├── data_collection0.py │ ├── env_setting0.py │ └── log0.py ├── image │ ├── __init__.py │ ├── flag.py │ ├── map.py │ └── mapM.py ├── poor_compare.py ├── random_generator.py ├── test.py ├── test_random.py ├── train.py └── visualization.py ├── maddpg ├── __init__.py ├── common │ ├── __init__.py │ ├── distributions.py │ ├── summary.py │ └── tf_util.py └── trainer │ ├── maddpg.py │ └── prioritized_rb │ ├── __init__.py │ ├── proportional.py │ ├── replay_buffer.py │ └── sum_tree.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # pycharm 124 | .idea 125 | .idea/ 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 BIT-MCS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Edics 2 | This is the code accompanying the paper: "[Energy-Efficient UAV Control for Effective and Fair Communication Coverage: A Deep Reinforcement Learning Approach](https://ieeexplore.ieee.org/document/8432464)", published in JSAC. 3 | 4 | ## :page_facing_up: Description 5 | Unmanned aerial vehicles (UAVs) can be used to serve as aerial base stations to enhance both the coverage and performance of communication networks in various scenarios, such as emergency communications and network access for remote areas. Mobile UAVs can establish communication links for ground users to deliver packets. However, UAVs have limited communication ranges and energy resources. Particularly, for a large region, they cannot cover the entire area all the time or keep flying for a long time. It is thus challenging to control a group of UAVs to achieve certain communication coverage in a long run, while preserving their connectivity and minimizing their energy consumption. Toward this end, we propose to leverage emerging deep reinforcement learning (DRL) for UAV control and present a novel and highly energy-efficient DRL-based method, which we call DRL-based energy-efficient control for coverage and connectivity ($DRL-EC^3$ ). 
The proposed method 1) maximizes a novel energy efficiency function with joint consideration for communications coverage, fairness, energy consumption and connectivity; 2) learns the environment and its dynamics; and 3) makes decisions under the guidance of two powerful deep neural networks. We conduct extensive simulations for performance evaluation. 6 | 7 | ## :wrench: Installation 8 | 1. Clone repo 9 | ```bash 10 | git clone https://github.com/BIT-MCS/DRL-EC3.git 11 | cd DRL-EC3 12 | ``` 13 | 2. Install dependencies (note: tensorflow-gpu 1.15 requires Python 3.7 or earlier) 14 | ```sh 15 | conda create -n mcs python==3.7 16 | conda activate mcs 17 | pip install tensorflow-gpu==1.15 18 | pip install -r requirements.txt 19 | ``` 20 | 21 | 22 | ## :computer: Training 23 | 24 | Train our solution: 25 | ```bash 26 | python experiments/train.py 27 | ``` 28 | ## :checkered_flag: Testing 29 | 30 | Test with the trained models: 31 | 32 | ```sh 33 | python experiments/test.py --load-dir=your_model_path 34 | ``` 35 | 36 | Randomly test the environment: 37 | 38 | ```sh 39 | python experiments/test_random.py 40 | ``` 41 | Each test run reports the data collection ratio, geographical fairness, energy consumption, and energy efficiency (sketched below). 42 | ## :clap: Reference 43 | - https://github.com/openai/maddpg 44 | 45 | 46 | ## :scroll: Acknowledgement 47 | 48 | This work was supported in part by the National Natural Science Foundation of China under Grant 61772072 and in part by the National Key Research and Development Program of China under Grant 2018YFB1003701. 49 |
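The collection ratio, geographical fairness, and energy efficiency reported by the test scripts are the same quantities logged in `experiments/.csv`. A minimal standalone sketch of the fairness and per-step efficiency formulas, mirroring `__get_fairness` and `__get_eff1` in `experiments/crazy_env/tsp_data_collection.py` (the helper names and example numbers here are illustrative only, not part of the repository):

```python
import numpy as np

def jain_fairness(values):
    # Jain's fairness index over per-PoI visit/collection amounts; 1.0 means perfectly even.
    values = np.asarray(values, dtype=np.float64)
    sum_of_square = np.sum(np.square(values))
    if sum_of_square == 0:
        return 0.0
    return np.square(np.sum(values)) / (len(values) * sum_of_square)

def step_efficiency(collected, distance, alpha=1.0, epsilon=1e-4):
    # Data collected per unit of combined movement and collection cost (cf. __get_eff1).
    return collected / (distance + alpha * collected + epsilon)

# Example with illustrative numbers: near-even collection over four PoIs, a one-unit move.
print(jain_fairness([0.8, 0.9, 0.7, 0.85]))          # ~0.99
print(step_efficiency(collected=0.2, distance=1.0))  # ~0.17
```

Jain's index equals 1.0 when every PoI is served equally and falls toward 1/N when a single PoI dominates.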
50 | Corresponding author: Chi Harold Liu. 51 | 52 | ## :e-mail: Contact 53 | 54 | If you have any question, please email `daizipeng@bit.edu.cn`. 55 | 56 | ## Paper 57 | If you are interested in our work, please cite our paper as 58 | 59 | ``` 60 | @ARTICLE{liu2018energy, 61 | author={Liu, Chi Harold and Chen, Zheyu and Tang, Jian and Xu, Jie and Piao, Chengzhe}, 62 | journal={IEEE Journal on Selected Areas in Communications (JSAC)}, 63 | title={Energy-Efficient UAV Control for Effective and Fair Communication Coverage: A Deep Reinforcement Learning Approach}, 64 | year={2018}, 65 | volume={36}, 66 | number={9}, 67 | pages={2059-2070}, 68 | } 69 | ``` 70 | -------------------------------------------------------------------------------- /experiments/.csv: -------------------------------------------------------------------------------- 1 | ,test_model,collection_ratio,fairness,consumption of energy,efficiency 2 | 1,71,0.953,0.92802734375,1.401,0.4912266999477505 3 | 2,72,0.9604,0.92109375,1.363,0.4933548979332965 4 | 3,73,0.916,0.899365234375,1.532421875,0.42414871986096025 5 | 4,74,0.964,0.938623046875,1.444,0.4934789119789393 6 | 5,75,0.9316,0.907373046875,1.403,0.4689399282267934 7 | 6,76,0.991,0.95537109375,1.328,0.5371141743779904 8 | 7,77,0.9043,0.88916015625,1.54296875,0.4126325189324473 9 | 8,78,0.957,0.92060546875,1.448,0.4678810126150502 10 | 9,79,0.945,0.92197265625,1.520703125,0.4475759912848207 11 | -------------------------------------------------------------------------------- /experiments/TSP.py: -------------------------------------------------------------------------------- 1 | from experiments.crazy_env.tsp_data_collection import Env 2 | from experiments.crazy_env import log3 as Log 3 | import numpy as np 4 | import math 5 | import time 6 | import random, operator 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def myint(a): 12 | # return int(np.ceil(a)) 13 | return int(np.floor(a)) 14 | 15 | 16 | class City: 17 | def __init__(self, x, y, env): 18 | self.x = x 19 | self.y = y 20 | self.env = env 21 | 22 | def distance(self, tocity): 23 | dx = tocity.x - self.x 24 | dy = tocity.y - self.y 25 | 26 | if 0 <= self.x + dx < self.env.mapx and 0 <= self.x + dx < self.env.mapy and self.env.mapob[myint(self.x + dx)][ 27 | myint(self.y + dy)] != self.env.OB and \ 28 | self.env.mapob[myint(self.x + (dx / 2))][myint(self.y + (dy / 2))] != self.env.OB and \ 29 | self.env.mapob[myint(self.x + (dx / 3))][myint(self.y + (dy / 3))] != self.env.OB and \ 30 | self.env.mapob[myint(self.x + (2 * dx / 3))][myint(self.y + (2 * dy / 3))] != self.env.OB and \ 31 | self.env.mapob[myint(self.x + (dx / 4))][myint(self.y + (dy / 4))] != self.env.OB and \ 32 | self.env.mapob[myint(self.x + (3 * dx / 4))][myint(self.y + (3 * dy / 4))] != self.env.OB: 33 | 34 | distance = np.sqrt((dx ** 2) + (dy ** 2)) 35 | else: 36 | distance = 50 37 | 38 | return distance 39 | 40 | def __repr__(self): 41 | return "(" + str(self.x) + "," + str(self.y) + ")" 42 | 43 | 44 | class Fitness: 45 | def __init__(self, route): 46 | self.route = route 47 | self.distance = 0 48 | self.fitness = 0.0 49 | 50 | def routeDistance(self): 51 | if self.distance == 0: 52 | pathDistance = 0 53 | for i in range(0, len(self.route)): 54 | fromCity = self.route[i] 55 | toCity = None 56 | if i + 1 < len(self.route): 57 | toCity = self.route[i + 1] 58 | else: 59 | toCity = self.route[0] 60 | pathDistance += fromCity.distance(toCity) 61 | self.distance = pathDistance 62 | return self.distance 63 | 64 | def 
routeFitness(self): 65 | if self.fitness == 0: 66 | self.fitness = 1 / float(self.routeDistance()) 67 | return self.fitness 68 | 69 | 70 | def createRoute(cityList): 71 | route = random.sample(cityList, len(cityList)) 72 | return route 73 | 74 | 75 | def initialPopulation(popSize, cityList): 76 | population = [] 77 | 78 | for i in range(0, popSize): 79 | population.append(createRoute(cityList)) 80 | return population 81 | 82 | 83 | def rankRoutes(population): 84 | fitnessResults = {} 85 | for i in range(0, len(population)): 86 | fitnessResults[i] = Fitness(population[i]).routeFitness() 87 | return sorted(fitnessResults.items(), key=operator.itemgetter(1), reverse=True) 88 | 89 | 90 | def selection(popRanked, eliteSize): 91 | selectionResults = [] 92 | df = pd.DataFrame(np.array(popRanked), columns=["Index", "Fitness"]) 93 | df['cum_sum'] = df.Fitness.cumsum() 94 | df['cum_perc'] = 100 * df.cum_sum / df.Fitness.sum() 95 | 96 | for i in range(0, eliteSize): 97 | selectionResults.append(popRanked[i][0]) 98 | for i in range(0, len(popRanked) - eliteSize): 99 | pick = 100 * random.random() 100 | for i in range(0, len(popRanked)): 101 | if pick <= df.iat[i, 3]: 102 | selectionResults.append(popRanked[i][0]) 103 | break 104 | return selectionResults 105 | 106 | 107 | def matingPool(population, selectionResults): 108 | matingpool = [] 109 | for i in range(0, len(selectionResults)): 110 | index = selectionResults[i] 111 | matingpool.append(population[index]) 112 | return matingpool 113 | 114 | 115 | def breed(parent1, parent2): 116 | child = [] 117 | childP1 = [] 118 | childP2 = [] 119 | 120 | geneA = int(random.random() * len(parent1)) 121 | geneB = int(random.random() * len(parent1)) 122 | 123 | startGene = min(geneA, geneB) 124 | endGene = max(geneA, geneB) 125 | 126 | for i in range(startGene, endGene): 127 | childP1.append(parent1[i]) 128 | 129 | childP2 = [item for item in parent2 if item not in childP1] 130 | 131 | child = childP1 + childP2 132 | return child 133 | 134 | 135 | def breedPopulation(matingpool, eliteSize): 136 | children = [] 137 | length = len(matingpool) - eliteSize 138 | pool = random.sample(matingpool, len(matingpool)) 139 | 140 | for i in range(0, eliteSize): # to carry the best individuals into the next generation 141 | children.append(matingpool[i]) 142 | 143 | for i in range(0, length): 144 | child = breed(pool[i], pool[len(matingpool) - i - 1]) 145 | children.append(child) 146 | return children 147 | 148 | 149 | def mutate(individual, mutationRate): 150 | for swapped in range(len(individual)): 151 | if (random.random() < mutationRate): 152 | swapWith = int(random.random() * len(individual)) 153 | 154 | city1 = individual[swapped] 155 | city2 = individual[swapWith] 156 | 157 | individual[swapped] = city2 158 | individual[swapWith] = city1 159 | return individual 160 | 161 | 162 | def mutatePopulation(population, mutationRate): 163 | mutatedPop = [] 164 | 165 | for ind in range(0, len(population)): 166 | mutatedInd = mutate(population[ind], mutationRate) 167 | mutatedPop.append(mutatedInd) 168 | return mutatedPop 169 | 170 | 171 | def nextGeneration(currentGen, eliteSize, mutationRate): 172 | popRanked = rankRoutes(currentGen) 173 | selectionResults = selection(popRanked, eliteSize) 174 | matingpool = matingPool(currentGen, selectionResults) 175 | children = breedPopulation(matingpool, eliteSize) 176 | nextGeneration = mutatePopulation(children, mutationRate) 177 | return nextGeneration 178 | 179 | 180 | def geneticAlgorithm(population, popSize, eliteSize, mutationRate, 
generations): 181 | pop = initialPopulation(popSize, population) 182 | print("Initial distance: " + str(1 / rankRoutes(pop)[0][1])) 183 | 184 | for i in range(0, generations): 185 | pop = nextGeneration(pop, eliteSize, mutationRate) 186 | 187 | print("Final distance: " + str(1 / rankRoutes(pop)[0][1])) 188 | bestRouteIndex = rankRoutes(pop)[0][0] 189 | bestRoute = pop[bestRouteIndex] 190 | return bestRoute 191 | 192 | 193 | def geneticAlgorithmPlot(population, popSize, eliteSize, mutationRate, generations, env_log, reg_n): 194 | log_path = env_log.full_path 195 | pop = initialPopulation(popSize, population) 196 | print("Initial distance: " + str(1 / rankRoutes(pop)[0][1])) 197 | 198 | progress = [] 199 | progress.append(1 / rankRoutes(pop)[0][1]) 200 | 201 | for i in range(0, generations): 202 | pop = nextGeneration(pop, eliteSize, mutationRate) 203 | progress.append(1 / rankRoutes(pop)[0][1]) 204 | end = False 205 | if i % 10 == 0: 206 | plt.plot(progress) 207 | plt.ylabel('Distance') 208 | plt.xlabel('Generation') 209 | plt.savefig(log_path + '/Distance_generation_%d.png' % (reg_n)) 210 | plt.close() 211 | if i > 50: 212 | test_coverage = progress[i - 50:i] 213 | list_var = np.var(test_coverage) 214 | print("%d th var: %f" % (i, list_var)) 215 | else: 216 | list_var = 1e5 217 | print(i) 218 | 219 | if list_var < 1e-5: 220 | end = True 221 | break 222 | 223 | if end is True: 224 | break 225 | 226 | print("Final distance: " + str(1 / rankRoutes(pop)[0][1])) 227 | bestRouteIndex = rankRoutes(pop)[0][0] 228 | bestRoute = pop[bestRouteIndex] 229 | return bestRoute 230 | 231 | 232 | def train(num_uav): 233 | log = Log.Log() 234 | env = Env(log) 235 | print("training %d PoIs..." % (len(env.datas))) 236 | start = time.clock() 237 | 238 | for n in range(num_uav): 239 | cityList = [] 240 | 241 | for i in range(0, len(env.datas)): 242 | # 随机测试 243 | # cityList.append(City(x=random.random() * 16, y=random.random() * 16)) 244 | datax = env.datas[i][0] 245 | datay = env.datas[i][1] 246 | ab_reg = float(env.mapx) / num_uav 247 | if ab_reg * n <= datax <= ab_reg * (n + 1): 248 | cityList.append(City(x=datax, y=datay, env=env)) 249 | 250 | print("\nthe %dth region: %d PoI" % (n, len(cityList))) 251 | # geneticAlgorithm(population=cityList, popSize=100, eliteSize=20, mutationRate=0.01, generations=500) 252 | 253 | bestRoute = geneticAlgorithmPlot(population=cityList, popSize=300, eliteSize=50, mutationRate=0.01, 254 | generations=3000, 255 | env_log=log, 256 | reg_n=n) 257 | 258 | bestRoutelist = [] 259 | for poi in bestRoute: 260 | bestRoutelist.append([poi.x, poi.y]) 261 | 262 | bestRouteDataFrame = pd.DataFrame(np.array(bestRoutelist), columns=["x", "y"]) 263 | bestRouteDataFrame.to_csv(log.full_path + '/saved_route_uav%d.csv' % n) 264 | 265 | training_time = time.clock() - start 266 | print("\n\nTraining time: ", training_time) 267 | 268 | 269 | def __cusume_energy(env, uav, value, distance): 270 | # distance-0.1, alpha-1.0 271 | if (env.factor * distance + env.alpha * value < env.energy[uav]): 272 | env.energy[uav] -= (env.factor * distance + env.alpha * value) 273 | env.use_energy[uav] += (env.factor * distance + env.alpha * value) 274 | else: 275 | env.use_energy[uav] += env.energy[uav] 276 | distance = env.energy[uav] / env.factor 277 | env.energy[uav] = 0 278 | 279 | return env 280 | 281 | 282 | def test(num_uav, model_path): 283 | print("testing...") 284 | log = Log.Log() 285 | env = Env(log) 286 | _ = env.reset() 287 | 288 | for n in range(num_uav): 289 | df = 
pd.read_csv("%s/saved_route_uav%d.csv" % (model_path, n)) 290 | print("the %dth region: %d PoI" % (n, df.shape[0])) 291 | step = 0 292 | i = 0 293 | 294 | while step < 500: 295 | new_positions = [df.loc[i, 'x'], df.loc[i, 'y']] 296 | 297 | # charge 298 | _pos = np.repeat([new_positions], [env.fills.shape[0]], axis=0) # just repeat(On) NB! 299 | _minus = env.fills - _pos 300 | _power = np.power(_minus, 2) 301 | _dis = np.sum(_power, axis=1) 302 | for index, dis in enumerate(_dis): 303 | # sensing Fill Station(crange=1.1) 304 | if np.sqrt(dis) <= env.crange: 305 | # uodate poi data 306 | if env.fills_energy_remain[index] > 0: 307 | # TODO:加油站的信息更新 308 | if env.fspeed * env.maxenergy <= env.fills_energy_remain[index]: 309 | if env.energy[n] + env.fspeed * env.maxenergy <= env.maxenergy: 310 | env.fill_energy[n] += env.fspeed * env.maxenergy 311 | env.fills_energy_remain[index] -= env.fspeed * env.maxenergy 312 | env.energy[n] += env.fspeed * env.maxenergy 313 | else: 314 | env.fill_energy[n] += env.maxenergy - env.energy[n] 315 | env.fills_energy_remain[index] -= (env.maxenergy - env.energy[n]) 316 | env.energy[n] = env.maxenergy 317 | else: 318 | if env.energy[n] + env.fills_energy_remain[index] <= env.maxenergy: 319 | env.fill_energy[n] += env.fills_energy_remain[index] 320 | env.energy[n] += env.fills_energy_remain[index] 321 | env.fills_energy_remain[index] = 0 322 | else: 323 | env.fill_energy[n] += env.maxenergy - env.energy[n] 324 | env.fills_energy_remain[index] -= (env.maxenergy - env.energy[n]) 325 | env.energy[n] = env.maxenergy 326 | break 327 | 328 | # collect 329 | data = 0 330 | _pos = np.repeat([new_positions], [env.datas.shape[0]], axis=0) 331 | _minus = env.datas - _pos 332 | _power = np.power(_minus, 2) 333 | _dis = np.sum(_power, axis=1) 334 | for index, dis in enumerate(_dis): 335 | # sensing PoI(crange=1.1) 336 | if np.sqrt(dis) <= env.crange: 337 | # uodate poi data 338 | if env.mapmatrix[index] > 0: 339 | tmp_data = env._mapmatrix[index] * env.cspeed 340 | if env.energy[n] >= tmp_data * env.alpha: 341 | data += tmp_data 342 | env.mapmatrix[index] -= tmp_data 343 | if env.mapmatrix[index] < 0: 344 | env.mapmatrix[index] = 0. 345 | else: 346 | data += env.energy[n] 347 | env.mapmatrix[index] -= env.energy[n] 348 | if env.mapmatrix[index] < 0: 349 | env.mapmatrix[index] = 0. 350 | break 351 | 352 | value = data if env.energy[n] >= data * env.alpha else env.energy[n] 353 | env.collection[n] += value 354 | env = __cusume_energy(env, n, value, 0.) # collect 355 | 356 | if i == df.shape[0] - 1: 357 | # env.energy[n]=env.maxenergy # 不加! 
358 | ii = 0 359 | else: 360 | ii = i + 1 361 | 362 | distance = np.sqrt(((df.loc[ii, 'x'] - df.loc[i, 'x']) ** 2) + ((df.loc[ii, 'y'] - df.loc[i, 'y']) ** 2)) 363 | 364 | if distance <= env.maxdistance: 365 | env = __cusume_energy(env, n, 0, distance) # move 366 | 367 | # 撞墙 368 | dx = df.loc[ii, 'x'] - df.loc[i, 'x'] 369 | dy = df.loc[ii, 'y'] - df.loc[i, 'y'] 370 | if 0 <= df.loc[ii, 'x'] < env.mapx and 0 <= df.loc[ii, 'y'] < env.mapy and \ 371 | env.mapob[myint(df.loc[ii, 'x'])][ 372 | myint(df.loc[ii, 'y'])] != env.OB and \ 373 | env.mapob[myint(df.loc[i, 'x'] + (dx / 2))][myint(df.loc[i, 'y'] + (dy / 2))] != env.OB and \ 374 | env.mapob[myint(df.loc[i, 'x'] + (dx / 3))][myint(df.loc[i, 'y'] + (dy / 3))] != env.OB and \ 375 | env.mapob[myint(df.loc[i, 'x'] + (2 * dx / 3))][ 376 | myint(df.loc[i, 'y'] + (2 * dy / 3))] != env.OB and \ 377 | env.mapob[myint(df.loc[i, 'x'] + (dx / 4))][myint(df.loc[i, 'y'] + (dy / 4))] != env.OB and \ 378 | env.mapob[myint(df.loc[i, 'x'] + (3 * dx / 4))][myint(df.loc[i, 'y'] + (3 * dy / 4))] != env.OB: 379 | i = ii 380 | else: 381 | env = __cusume_energy(env, n, 0, env.maxdistance) # move 382 | newx = df.loc[i, 'x'] + (df.loc[ii, 'x'] - df.loc[i, 'x']) * (env.maxdistance / distance) 383 | newy = df.loc[i, 'y'] + (df.loc[ii, 'y'] - df.loc[i, 'y']) * (env.maxdistance / distance) 384 | 385 | dx = newx - df.loc[i, 'x'] 386 | dy = newy - df.loc[i, 'y'] 387 | if 0 <= newx < env.mapx and 0 <= newy < env.mapy and \ 388 | env.mapob[myint(newx)][myint(newy)] != env.OB and \ 389 | env.mapob[myint(df.loc[i, 'x'] + (dx / 2))][myint(df.loc[i, 'y'] + (dy / 2))] != env.OB and \ 390 | env.mapob[myint(df.loc[i, 'x'] + (dx / 3))][myint(df.loc[i, 'y'] + (dy / 3))] != env.OB and \ 391 | env.mapob[myint(df.loc[i, 'x'] + (2 * dx / 3))][ 392 | myint(df.loc[i, 'y'] + (2 * dy / 3))] != env.OB and \ 393 | env.mapob[myint(df.loc[i, 'x'] + (dx / 4))][myint(df.loc[i, 'y'] + (dy / 4))] != env.OB and \ 394 | env.mapob[myint(df.loc[i, 'x'] + (3 * dx / 4))][myint(df.loc[i, 'y'] + (3 * dy / 4))] != env.OB: 395 | df.loc[i, 'x'] = newx 396 | df.loc[i, 'y'] = newy 397 | step += 1 398 | 399 | print('efficiency: %.3f' % env.efficiency) 400 | print('data_collection_ratio: %.3f' % (1.0 - env.leftrewards)) 401 | print('fairness: %.3f' % env.collection_fairness) 402 | print('normal fairness: %.3f' % env.normal_collection_fairness) 403 | print('energy_consumption: %.3f' % (np.sum(env.normal_use_energy))) 404 | print('fill:', env.fills_energy_remain) 405 | 406 | 407 | if __name__ == '__main__': 408 | num_uav = 5 409 | # train(num_uav=num_uav) 410 | test(num_uav=num_uav, model_path='/home/dzp1997/PycharmProjects/maddpg-czy-DZP/experiments/2019/06-29/uav5') 411 | -------------------------------------------------------------------------------- /experiments/compare.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def error(input_list): 6 | input = np.array(input_list) 7 | input = input.transpose((1, 0)) 8 | error_low = input[0] - input[1] 9 | error_high = input[2] - input[0] 10 | error = [] 11 | error.append(error_low) 12 | error.append(error_high) 13 | return error 14 | 15 | 16 | def average(input_list): 17 | input = np.array(input_list) 18 | input = input.transpose((1, 0)) 19 | return input[0] 20 | 21 | 22 | def compare_plot_errorbar(xlabel, ylabel, x, eDivert, woApeX, woRNN, MADDPG): 23 | plt.xlabel(xlabel) 24 | plt.ylabel(ylabel) 25 | plt.errorbar(x=x, y=average(eDivert), yerr=error(eDivert), 
fmt='r-o', label='e-Divert', capsize=4) 26 | plt.errorbar(x=x, y=average(woApeX), yerr=error(woApeX), fmt='g-^', label='e-Divert w/o Ape-X', capsize=4) 27 | plt.errorbar(x=x, y=average(woRNN), yerr=error(woRNN), fmt='m-<', label='e-Divert w/o RNN', capsize=4) 28 | plt.errorbar(x=x, y=average(MADDPG), yerr=error(MADDPG), fmt='k-*', label='MADDPG', capsize=4) 29 | 30 | plt.ylim(ymin=0, ymax=1) 31 | plt.grid(True) 32 | plt.grid(linestyle='--') 33 | plt.legend() 34 | plt.show() 35 | 36 | 37 | def compare_plot(xlabel, ylabel, x,yrange, eDivert, woApeX, woRNN, MADDPG): 38 | plt.figure(figsize=(15, 20)) 39 | plt.xlabel(xlabel,fontsize=32) 40 | plt.ylabel(ylabel,fontsize=32) 41 | plt.xticks(fontsize=32) 42 | plt.yticks(fontsize=32) 43 | plt.plot(x,eDivert, color='b', marker='o', label='e-Divert',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 44 | plt.plot(x, woApeX, color='g',marker='^', label='e-Divert w/o Ape-X',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 45 | plt.plot(x, woRNN, color='m',marker='d', label='e-Divert w/o RNN',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 46 | plt.plot(x, MADDPG, color='k',marker='s', label='MADDPG',markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 47 | # plt.plot(x,[3.62,4.62,5.62,6.62,7.62],color='red',linestyle='--',label="Maximum used energy",linewidth=4) 48 | 49 | plt.xticks(x,x) 50 | # plt.axhline(y=4.62, color='red', linestyle='--', label="Maximum used energy",linewidth=4) 51 | plt.ylim(yrange[0],yrange[1]) 52 | plt.grid(True) 53 | plt.grid(linestyle='--') 54 | plt.legend(loc='lower right',fontsize=22) 55 | plt.show() 56 | 57 | 58 | if __name__ == '__main__': 59 | # collection-range 60 | compare_plot(xlabel="Sensing range (unit)", 61 | ylabel="Data collection ratio", 62 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 63 | yrange=[0,1], 64 | eDivert=[0.706, 0.874, 0.916, 0.936, 0.952], 65 | woApeX=[0.584, 0.70, 0.871, 0.906, 0.949], 66 | woRNN=[0.205, 0.41, 0.463, 0.569, 0.722], 67 | MADDPG=[0.139, 0.245, 0.323, 0.360, 0.439], 68 | ) 69 | 70 | 71 | # fairness_range 72 | compare_plot(xlabel="Sensing range (unit)", 73 | ylabel="Geographical fairness", 74 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 75 | yrange=[0,1], 76 | eDivert=[0.784,0.909,0.936,0.951,0.970], 77 | woApeX=[0.675,0.729,0.903,0.935,0.963], 78 | woRNN=[0.294,0.467,0.573,0.650,0.777], 79 | MADDPG=[0.168,0.293,0.382,0.409,0.5], 80 | ) 81 | # # # 82 | # # energy_range 83 | # compare_plot(xlabel="Sensing range (unit)", 84 | # ylabel="Energy usage (# of full batteries)", 85 | # x=[0.6, 0.8, 1.0, 1.2, 1.4], 86 | # yrange=[0,5], 87 | # eDivert=[3.45,4.086,3.89,3.918,3.9], 88 | # woApeX=[3.39,3.588,4.617,4.43,4.48], 89 | # woRNN=[1.395,2.514,3.188,3.113,4.113], 90 | # MADDPG=[1.792,2.201,2.545,2.547,3.027], 91 | # ) 92 | 93 | # efficiency_range 94 | compare_plot(xlabel="Sensing range (unit)", 95 | ylabel="Energy efficiency", 96 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 97 | yrange=[-0.04,0.2], 98 | eDivert=[0.129,0.155,0.179,0.182,0.193], 99 | woApeX=[0.092,0.118,0.139,0.153,0.165], 100 | woRNN=[0.033,0.062,0.063,0.097,0.108], 101 | MADDPG=[0.011,0.027,0.039,0.048,0.058], 102 | ) 103 | 104 | # collection_uav 105 | compare_plot(xlabel="No. 
of vehicles", 106 | ylabel="Data collection ratio", 107 | x=[1, 2, 3, 4, 5], 108 | yrange=[0,1], 109 | eDivert=[0.88,0.943,0.916,0.912,0.911], 110 | woApeX=[0.769,0.871,0.746,0.738,0.764], 111 | woRNN=[0.842,0.722,0.636,0.682,0.772], 112 | MADDPG=[0.401,0.383,0.415,0.478,0.269], 113 | ) 114 | 115 | # fairness_uav 116 | compare_plot(xlabel="No. of vehicles", 117 | ylabel="Geographical fairness", 118 | x=[1, 2, 3, 4, 5], 119 | yrange=[0,1], 120 | eDivert=[0.912,0.958,0.943,0.944,0.935], 121 | woApeX=[0.814,0.902,0.795,0.790,0.819], 122 | woRNN=[0.874,0.777,0.714,0.732,0.815], 123 | MADDPG=[0.500,0.431,0.463,0.537,0.338], 124 | ) 125 | # # 126 | # # energy_uav 127 | # compare_plot(xlabel="No. of vehicles", 128 | # ylabel="Energy usage (# of full batteries)", 129 | # x=[1, 2, 3, 4, 5], 130 | # yrange=[1,8], 131 | # eDivert=[3.576,4,4.004,4.402,4.668], 132 | # woApeX=[3.244,4.273,4.562,5.156,5.953], 133 | # woRNN=[3.42,4.113,4.496,5.613,6.45], 134 | # MADDPG=[1.853,2.695,3.543,4.44,5.08], 135 | # ) 136 | 137 | # efficiency_uav 138 | compare_plot(xlabel="No. of vehicles", 139 | ylabel="Energy efficiency", 140 | x=[1, 2, 3, 4, 5], 141 | yrange=[-0.04,0.2], 142 | eDivert=[0.182,0.181,0.179,0.158,0.149], 143 | woApeX=[0.155,0.150,0.104,0.091,0.083], 144 | woRNN=[0.174,0.108,0.080,0.080,0.080], 145 | MADDPG=[0.085,0.050,0.045,0.046,0.015], 146 | ) 147 | 148 | # collection_fill 149 | compare_plot(xlabel="Charging proportion (%)", 150 | ylabel="Data collection ratio", 151 | x=[10, 20, 30, 40, 50], 152 | yrange=[0,1], 153 | eDivert=[0.927,0.911,0.937,0.905,0.939], 154 | woApeX=[0.736,0.766,0.761,0.791,0.838], 155 | woRNN=[0.638,0.702,0.713,0.680,0.672], 156 | MADDPG=[0.305,0.354,0.393,0.392,0.369], 157 | ) 158 | 159 | # fairness_fill 160 | compare_plot(xlabel="Charging proportion (%)", 161 | ylabel="Geographical fairness", 162 | x=[10, 20, 30, 40, 50], 163 | yrange=[0,1], 164 | eDivert=[0.951,0.935,0.958,0.941,0.959], 165 | woApeX=[0.804,0.829,0.821,0.843,0.880], 166 | woRNN=[0.704,0.745,0.776,0.722,0.727], 167 | MADDPG=[0.360,0.425,0.436,0.421,0.431], 168 | ) 169 | 170 | # # energy_fill 171 | # compare_plot(xlabel="Charging proportion (%)", 172 | # ylabel="Energy usage (# of full batteries)", 173 | # x=[10, 20, 30, 40, 50], 174 | # yrange=[0,5], 175 | # eDivert=[4.023,3.844,3.926,3.73,4], 176 | # woApeX=[3.463,3.771,3.74,3.889,4.348], 177 | # woRNN=[2.844,3.184,3.457,3.066,3.064], 178 | # MADDPG=[2.15,2.285,2.342,2.3,2.244], 179 | # ) 180 | # 181 | # efficiency_fill 182 | compare_plot(xlabel="Charging proportion (%)", 183 | ylabel="Energy efficiency", 184 | x=[10, 20, 30, 40, 50], 185 | yrange=[0,0.3], 186 | eDivert=[0.180,0.180,0.185,0.185,0.184], 187 | woApeX=[0.138,0.136,0.137,0.141,0.139], 188 | woRNN=[0.132,0.132,0.131,0.131,0.132], 189 | MADDPG=[0.044,0.055,0.059,0.061,0.057], 190 | ) 191 | 192 | # collection_station 193 | compare_plot(xlabel="No. of charging stations", 194 | ylabel="Data collection ratio", 195 | x=[1,2,3,4,5], 196 | yrange=[0,1], 197 | eDivert=[0.819,0.865,0.911,0.905,0.943], 198 | woApeX=[0.461,0.680,0.795,0.874,0.871], 199 | woRNN=[0.480,0.684,0.702,0.649,0.688], 200 | MADDPG=[0.366,0.366,0.332,0.336,0.371], 201 | ) 202 | 203 | # fairness_station 204 | compare_plot(xlabel="No. 
of charging stations", 205 | ylabel="Geographical fairness", 206 | x=[1, 2, 3, 4, 5], 207 | yrange=[0,1], 208 | eDivert=[0.865,0.906,0.935,0.934,0.958], 209 | woApeX=[0.526,0.734,0.851,0.903,0.902], 210 | woRNN=[0.547,0.710,0.745,0.694,0.758], 211 | MADDPG=[0.411,0.415,0.415,0.392,0.423], 212 | ) 213 | # 214 | # # energy_station 215 | # compare_plot(xlabel="No. of charging stations", 216 | # ylabel="Energy usage (# of full batteries)", 217 | # x=[1, 2, 3, 4, 5], 218 | # yrange=[0,5], 219 | # eDivert=[1.993,3.537,3.844,3.773,4], 220 | # woApeX=[2.092,3.135,3.855,4.383,4.273], 221 | # woRNN=[2.09,3.041,3.184,3.05,3.98], 222 | # MADDPG=[2.016,2.203,2.264,2.473,2.693], 223 | # ) 224 | 225 | # efficiency_station 226 | compare_plot(xlabel="No. of charging stations", 227 | ylabel="Energy efficiency", 228 | x=[1, 2, 3, 4, 5], 229 | yrange=[-0.04,0.2], 230 | eDivert=[0.138,0.177,0.180,0.181,0.181], 231 | woApeX=[0.093,0.128,0.142,0.148,0.150], 232 | woRNN=[0.101,0.126,0.132,0.119,0.104], 233 | MADDPG=[0.063,0.055,0.048,0.047,0.048], 234 | ) 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /experiments/crazy_env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/experiments/crazy_env/__init__.py -------------------------------------------------------------------------------- /experiments/crazy_env/env_setting3.py: -------------------------------------------------------------------------------- 1 | class Setting(object): 2 | def __init__(self, log): 3 | self.V = { 4 | 'MAP_X': 16, 5 | 'MAP_Y': 16, 6 | 'MAX_VALUE': 1., 7 | 'MIN_VALUE': 0., 8 | 'OBSTACLE': [ # todo:OBSTACLE 9 | [0, 3, 1, 1], 10 | [2, 9, 2, 1], 11 | [1, 3, 1, 2], 12 | [2, 15, 2, 1], 13 | [2, 0, 1, 1], 14 | [4, 4, 1, 1], 15 | [5, 4, 1,3], 16 | [5, 11, 1, 3], 17 | [10, 0, 3, 1], 18 | [10, 1, 1, 1], 19 | [10, 5, 1, 3], 20 | [8, 10, 3, 1], 21 | [9, 15, 1, 1], 22 | [13, 6, 1, 2], 23 | [13, 13, 1, 2], 24 | [12, 15, 4, 1], 25 | [15, 10, 1, 1] 26 | ], 27 | 'CHANNEL': 3, 28 | 29 | 'NUM_UAV': 2, # TODO:无人机个数 30 | 'INIT_POSITION': (0, 8, 8), 31 | 'MAX_ENERGY': 50., # TODO: 初始能量 32 | 'NUM_ACTION': 2, # 2 33 | 'SAFE_ENERGY_RATE': 0.2, 34 | 'RANGE': 1.1, # TODO:采集范围 35 | 'MAXDISTANCE': 1., 36 | 'COLLECTION_PROPORTION': 0.2, # c speed # TODO: 采集速度 37 | 'FILL_PROPORTION': 0.2, # fill speed # TODO:充电速度 38 | 39 | 'WALL_REWARD': -1., 40 | 'VISIT': 1. 
/ 1000., 41 | 'DATA_REWARD': 1., 42 | 'FILL_REWARD': 1., 43 | 'ALPHA': 1., 44 | 'BETA': 0.1, 45 | 'EPSILON': 1e-4, 46 | 'NORMALIZE': .1, 47 | 'FACTOR': 0.1, 48 | } 49 | self.LOG = log 50 | self.time = log.time 51 | 52 | def log(self): 53 | self.LOG.log(self.V) 54 | -------------------------------------------------------------------------------- /experiments/crazy_env/log3.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | class Log(object): 8 | def __init__(self): 9 | self.time = str(time.strftime("%Y/%m-%d/%H-%M-%S", time.localtime())) 10 | self.full_path = os.path.join('.', self.time) 11 | self.choose_color = ['blue', 'green', 'purple', 'red'] 12 | if os.path.exists(self.full_path): 13 | self.full_path = os.path.join(self.full_path, '*') 14 | else: 15 | pass 16 | 17 | os.makedirs(self.full_path) 18 | self.file_path = self.full_path + '/REPORT.txt' 19 | file = open(self.file_path, 'x') 20 | file.close() 21 | 22 | def log(self, values): 23 | if isinstance(values, dict): 24 | with open(self.file_path, 'a') as file: 25 | for key, value in values.items(): 26 | print(key, value, file=file) 27 | elif isinstance(values, list): 28 | with open(self.file_path, 'a') as file: 29 | for value in values: 30 | print(value, file=file) 31 | else: 32 | with open(self.file_path, 'a') as file: 33 | print(values, file=file) 34 | 35 | def circle(self, x, y, r, color='red', count=100): 36 | xarr = [] 37 | yarr = [] 38 | for i in range(count): 39 | j = float(i) / count * 2 * np.pi 40 | xarr.append(x + r * np.cos(j)) 41 | yarr.append(y + r * np.sin(j)) 42 | plt.plot(xarr, yarr, c=color, linewidth=2) 43 | 44 | def draw_path(self, env, env_i, meaningful_fill, meaningful_get): 45 | full_path = os.path.join(self.full_path, 'Path') 46 | if not os.path.exists(full_path): 47 | os.makedirs(full_path) 48 | xxx = [] 49 | colors = [] 50 | for x in range(env.mapx): 51 | xxx.append((x, 1)) 52 | for y in range(env.mapy): 53 | c = [] 54 | for x in range(env.mapx): 55 | # 1 represents obstacle,0 is blank 56 | if env.mapob[x][y] == 1: 57 | c.append((0, 0, 0, 1)) 58 | else: 59 | c.append((1, 1, 1, 1)) 60 | colors.append(c) 61 | 62 | Fig = plt.figure(figsize=(5, 5)) 63 | PATH = np.array(env.trace) 64 | ENERGY_PATH = np.array(env.energytrace) 65 | 66 | for i1 in range(env.mapy): 67 | plt.broken_barh(xxx, (i1, 1), facecolors=colors[i1]) 68 | 69 | plt.scatter(env.datas[:, 0], env.datas[:, 1], c=env.DATAs[:, 2], marker="s") 70 | 71 | for i in range(env.n): 72 | # M = Fig.add_subplot(1, 1, i + 1) 73 | plt.ylim(ymin=0, ymax=env.mapy) 74 | plt.xlim(xmin=0, xmax=env.mapx) 75 | color = self.choose_color[i] 76 | plt.plot(PATH[i, :, 1], PATH[i, :, 2], color=color) 77 | for j in range(len(PATH[i])): 78 | if PATH[i, j, 0] >= 0: 79 | plt.scatter(PATH[i, j, 1], PATH[i, j, 2], color=color, marker=".", norm=ENERGY_PATH[i]) 80 | else: 81 | plt.scatter(PATH[i, j, 1], PATH[i, j, 2], color=color, marker="+", norm=ENERGY_PATH[i]) 82 | # grid line 83 | plt.grid(True, linestyle='-.', color='black') 84 | # title 85 | plt.title('Meaningful Get:' + str(meaningful_get) + '\nMeaningful Fill:' + str( 86 | meaningful_fill) + '\nLeft Reward=' + str(env.leftrewards) + ' ( NAIVE VERSION^_^ )') 87 | 88 | plt.scatter(env.fills[:, 0], env.fills[:, 1], c='red', marker="*") 89 | for (x, y) in zip(env.fills[:, 0], env.fills[:, 1]): 90 | self.circle(x, y, env.crange) 91 | Fig.savefig(full_path + '/path_' + str(env_i) + '.png') 92 | 93 | 
plt.close() 94 | 95 | def step_information(self, action_n, env, step, env_i, meaningful_fill, meaningful_get, indicator): # -1 fill,1 get 96 | full_path = os.path.join(self.full_path, 'Path') 97 | if not os.path.exists(full_path): 98 | os.makedirs(full_path) 99 | debug_filename = full_path + '/path_' + str(env_i) + '.txt' 100 | 101 | with open(debug_filename, 'a+') as file: 102 | print("\nStep ", step, ":", file=file) 103 | for i in range(env.n): 104 | if indicator[i] == -1: 105 | print("UAV_", i, "------", "Decision: Filling ", env.tmp_energy[i], " energy,current Energy: ", 106 | env.energy[i], ", Reward: ", env.reward[i], ", Penalty: ", env.tmp_penalty[i], 107 | "\n\t\tAction detail:", action_n[i], " Station-energy Remain:", env.fills_energy_remain, "\n", 108 | file=file) 109 | 110 | if env.tmp_energy[i] > 0: 111 | meaningful_fill[i] += 1 112 | else: 113 | print("UAV_", i, "------", "Decision: Getting ", env.tmp_value[i], " POI,current Energy: ", 114 | env.energy[i], ", Reward: ", env.reward[i], ", Penalty: ", env.tmp_penalty[i], 115 | "\n\t\tAction detail:", action_n[i], " Station-energy Remain:", env.fills_energy_remain, "\n", 116 | file=file) 117 | if env.tmp_value[i] > 0: 118 | meaningful_get[i] += 1 119 | -------------------------------------------------------------------------------- /experiments/crazy_env/tsp_data_collection.py: -------------------------------------------------------------------------------- 1 | from experiments.crazy_env.tsp_env_setting import Setting 2 | from experiments.image.mapM import MapM 3 | import os 4 | import copy 5 | from os.path import join as pjoin 6 | import numpy as np 7 | import time 8 | import cv2 9 | import math 10 | from gym import spaces 11 | 12 | 13 | def mypjoin(path1, path2, paths=None): 14 | full_path = pjoin(path1, path2) 15 | if not os.path.exists(full_path): 16 | os.mkdir(full_path) 17 | if paths is not None: 18 | full_path = pjoin(full_path, paths) 19 | if not os.path.exists(full_path): 20 | os.mkdir(full_path) 21 | return full_path 22 | 23 | 24 | def myint(a): 25 | # return int(np.ceil(a)) 26 | return int(np.floor(a)) 27 | 28 | 29 | class Env(object): 30 | def __init__(self, log): 31 | # self.tr = tracker.SummaryTracker() 32 | self.sg = Setting(log) 33 | self.sg.log() 34 | 35 | # 6-19 00:42 36 | self.maxaction = 0 37 | self.minaction = 0 38 | # 39 | 40 | self.log_dir = log.full_path 41 | # self.log_dir = mypjoin('.', self.sg.time) 42 | # basis 43 | self.mapx = self.sg.V['MAP_X'] # 16 44 | self.mapy = self.sg.V['MAP_Y'] # 16 45 | self.map = MapM(self.log_dir) # [80,80] 46 | self.channel = self.sg.V['CHANNEL'] # 3 47 | self.image_data = None 48 | self.image_position = None 49 | self.safe_energy_rate = self.sg.V['SAFE_ENERGY_RATE'] # 0.1 50 | 51 | # num of uavs 52 | self.n = self.sg.V['NUM_UAV'] 53 | 54 | # [[80.80,3]] 55 | # Box用于实现连续数据构成的空间,其中包含两组参数:空间内数据范围(上限和下限),以及空间维度的大小 56 | self.observation_space = [spaces.Box(low=-1, high=1, shape=(self.map.width, self.map.height, self.channel)) for 57 | i in range(self.n)] 58 | 59 | # [[2]] 60 | # TODO:去掉了action-state(<0,>0),只留下 delta x, delta y 61 | self.action_space = [spaces.Box(low=-1, high=1, shape=(self.sg.V['NUM_ACTION'],)) for i in range(self.n)] 62 | 63 | self.maxenergy = self.sg.V['MAX_ENERGY'] # 100 64 | self.crange = self.sg.V['RANGE'] # 1.1 65 | self.maxdistance = self.sg.V['MAXDISTANCE'] # 1.0 66 | self.cspeed = np.float16(self.sg.V['COLLECTION_PROPORTION']) # 0.2 67 | self.fspeed = np.float16(self.sg.V['FILL_PROPORTION']) # 0.1 68 | self.alpha = self.sg.V['ALPHA'] # 1.0 69 | 
self.beta = self.sg.V['BETA'] # 0.1 70 | self.track = 1. / 1000. 71 | 72 | # ---- 6-8 14:48 add factor 73 | self.factor = self.sg.V['FACTOR'] 74 | self.epsilon = self.sg.V['EPSILON'] 75 | self.normalize = self.sg.V['NORMALIZE'] 76 | 77 | # mapob [16,16] 78 | self.mapob = np.zeros((self.mapx, self.mapy)).astype(np.int8) 79 | 80 | """ 81 | Initial Obstacles 82 | """ 83 | # obstacles 84 | self.OB = 1 85 | obs = self.sg.V['OBSTACLE'] 86 | 87 | # draw obstacles in mapob[16,16], the obstacle is 1, others is 0 88 | for i in obs: 89 | for x in range(i[0], i[0] + i[2], 1): 90 | for y in range(i[1], i[1] + i[3], 1): 91 | self.mapob[x][y] = self.OB 92 | # reward 93 | self.pwall = self.sg.V['WALL_REWARD'] # -1 94 | 95 | """ 96 | Initial POI(data) 97 | """ 98 | # POI [256,3] 3->[x,y,value] 99 | test = [[1.5454101562e-01, 2.2583007812e-02, 6.5332031250e-01], 100 | [2.1936035156e-01, 2.1618652344e-01, 8.2568359375e-01], 101 | [3.3813476562e-01, 4.4738769531e-02, 6.6406250000e-02], 102 | [6.5478515625e-01, 6.5429687500e-01, 8.7280273438e-02], 103 | [6.9970703125e-01, 7.5000000000e-01, 4.6923828125e-01], 104 | [3.2177734375e-01, 4.9145507812e-01, 8.8769531250e-01], 105 | [6.0595703125e-01, 8.5449218750e-01, 1.0772705078e-01], 106 | [7.1679687500e-01, 1.1370849609e-01, 5.3759765625e-01], 107 | [7.3046875000e-01, 9.5800781250e-01, 3.6157226562e-01], 108 | [9.7656250000e-01, 4.9365234375e-01, 2.5732421875e-01], 109 | [1.4416503906e-01, 7.8320312500e-01, 7.1679687500e-01], 110 | [7.1435546875e-01, 2.1618652344e-01, 4.7070312500e-01], 111 | [1.3830566406e-01, 6.8310546875e-01, 6.7675781250e-01], 112 | [6.2304687500e-01, 1.4045715332e-02, 4.3017578125e-01], 113 | [9.2919921875e-01, 9.7460937500e-01, 5.6494140625e-01], 114 | [9.5996093750e-01, 3.4423828125e-02, 1.2927246094e-01], 115 | [5.4443359375e-01, 7.9199218750e-01, 3.7622070312e-01], 116 | [4.6777343750e-01, 5.4394531250e-01, 7.2753906250e-01], 117 | [4.7558593750e-01, 7.0898437500e-01, 7.6562500000e-01], 118 | [8.5205078125e-01, 4.8364257812e-01, 3.9965820312e-01], 119 | [7.1240234375e-01, 1.6027832031e-01, 5.7421875000e-01], 120 | [4.7460937500e-01, 9.8937988281e-02, 3.8500976562e-01], 121 | [6.1914062500e-01, 1.2841796875e-01, 1.4758300781e-01], 122 | [6.7773437500e-01, 5.8593750000e-02, 5.6689453125e-01], 123 | [5.2099609375e-01, 1.2927246094e-01, 1.6943359375e-01], 124 | [3.0737304688e-01, 9.3066406250e-01, 9.1845703125e-01], 125 | [1.7565917969e-01, 9.7802734375e-01, 4.3847656250e-01], 126 | [4.1040039062e-01, 8.9794921875e-01, 2.6123046875e-01], 127 | [6.5234375000e-01, 6.9580078125e-01, 6.5429687500e-01], 128 | [9.8046875000e-01, 4.0161132812e-01, 5.4003906250e-01], 129 | [6.2597656250e-01, 7.5244140625e-01, 8.1640625000e-01], 130 | [5.6762695312e-02, 7.7734375000e-01, 2.2973632812e-01], 131 | [9.0380859375e-01, 6.3720703125e-01, 8.8183593750e-01], 132 | [5.9326171875e-01, 5.8740234375e-01, 7.3339843750e-01], 133 | [2.6318359375e-01, 6.7480468750e-01, 3.6206054688e-01], 134 | [2.6245117188e-01, 5.3613281250e-01, 3.1201171875e-01], 135 | [5.5468750000e-01, 3.2397460938e-01, 5.8496093750e-01], 136 | [9.3896484375e-01, 6.6601562500e-01, 2.0996093750e-02], 137 | [1.3537597656e-01, 2.8100585938e-01, 1.8847656250e-01], 138 | [9.5507812500e-01, 8.2421875000e-01, 6.2890625000e-01], 139 | [4.3505859375e-01, 9.8046875000e-01, 7.4169921875e-01], 140 | [4.8559570312e-01, 4.9853515625e-01, 2.4414062500e-01], 141 | [6.8457031250e-01, 2.5073242188e-01, 4.5385742188e-01], 142 | [5.1025390625e-01, 8.9990234375e-01, 6.6601562500e-01], 143 | [6.6992187500e-01, 
6.2011718750e-01, 6.6552734375e-01], 144 | [5.0292968750e-02, 8.3496093750e-01, 6.7968750000e-01], 145 | [7.8808593750e-01, 1.5332031250e-01, 9.0429687500e-01], 146 | [8.2128906250e-01, 7.9833984375e-01, 4.6142578125e-01], 147 | [3.0059814453e-02, 7.8125000000e-01, 4.9951171875e-01], 148 | [1.9006347656e-01, 7.3144531250e-01, 4.3994140625e-01], 149 | [8.3544921875e-01, 4.3237304688e-01, 8.6279296875e-01], 150 | [7.3437500000e-01, 9.9548339844e-02, 1.8688964844e-01], 151 | [2.6074218750e-01, 9.1699218750e-01, 5.9814453125e-01], 152 | [8.1689453125e-01, 1.9482421875e-01, 9.2675781250e-01], 153 | [8.7500000000e-01, 2.7221679688e-01, 7.4707031250e-01], 154 | [7.4121093750e-01, 6.7529296875e-01, 9.1601562500e-01], 155 | [9.3066406250e-01, 6.2207031250e-01, 8.2568359375e-01], 156 | [5.1220703125e-01, 1.7529296875e-01, 1.3122558594e-01], 157 | [8.9794921875e-01, 3.0053710938e-01, 8.1591796875e-01], 158 | [2.6953125000e-01, 6.9824218750e-01, 1.1224365234e-01], 159 | [7.1386718750e-01, 6.3134765625e-01, 1.3537597656e-01], 160 | [6.8066406250e-01, 6.5673828125e-01, 5.0195312500e-01], 161 | [5.4248046875e-01, 1.5234375000e-01, 1.6955566406e-01], 162 | [5.7568359375e-01, 1.5124511719e-01, 8.9599609375e-01], 163 | [1.7065429688e-01, 8.4411621094e-02, 2.5708007812e-01], 164 | [8.6474609375e-01, 2.2229003906e-01, 9.2675781250e-01], 165 | [9.3701171875e-01, 5.1849365234e-02, 3.6474609375e-01], 166 | [8.1298828125e-01, 7.8564453125e-01, 6.2402343750e-01], 167 | [4.1503906250e-01, 5.9423828125e-01, 5.0537109375e-01], 168 | [3.4179687500e-01, 4.7802734375e-01, 8.8818359375e-01], 169 | [3.9306640625e-01, 5.1074218750e-01, 3.0981445312e-01], 170 | [8.0566406250e-01, 1.6113281250e-01, 4.4848632812e-01], 171 | [8.8134765625e-02, 9.7705078125e-01, 8.5742187500e-01], 172 | [2.1984863281e-01, 7.5048828125e-01, 5.2978515625e-01], 173 | [8.5839843750e-01, 8.5058593750e-01, 4.6582031250e-01], 174 | [6.6259765625e-01, 6.6992187500e-01, 6.4404296875e-01], 175 | [8.7500000000e-01, 9.2138671875e-01, 3.1982421875e-01], 176 | [4.5800781250e-01, 5.3076171875e-01, 3.9868164062e-01], 177 | [5.2148437500e-01, 9.7705078125e-01, 8.2617187500e-01], 178 | [2.3986816406e-01, 5.0488281250e-01, 6.6650390625e-01]] 179 | 180 | # DATA shape:256*3 181 | self.DATAs = np.reshape(test, (-1, 3)).astype(np.float16) 182 | 183 | # # #TODO:调点 184 | # dx = [-0.2, -0.2, -0.2, 0, 0, 0, 0.2, 0.2, 0.2] 185 | # dy = [-0.2, 0, 0.2, -0.2, 0, 0.2, -0.2, 0, 0.2] 186 | # # replace the POI in obstacle position with the POI out of obstacle position 187 | # for index in range(self.DATAs.shape[0]): 188 | # need_adjust = True 189 | # while need_adjust: 190 | # need_adjust = False 191 | # for i in range(len(dx)): 192 | # if self.mapob[min(myint(self.DATAs[index][0] * self.mapx + dx[i]), self.mapx - 1)][ 193 | # min(myint(self.DATAs[index][1] * self.mapy + dy[i]), self.mapy - 1)] == self.OB: 194 | # need_adjust = True 195 | # break 196 | # if need_adjust is True: 197 | # self.DATAs[index] = np.random.rand(3).astype(np.float16) 198 | # 199 | # for i, poi_i in enumerate(self.DATAs): 200 | # if i == 0: 201 | # print("[[%.10e,%.10e,%.10e]," % (poi_i[0], poi_i[1], poi_i[2])) 202 | # elif i == len(self.DATAs) - 1: 203 | # print("[%.10e,%.10e,%.10e]]\n" % (poi_i[0], poi_i[1], poi_i[2])) 204 | # else: 205 | # print("[%.10e,%.10e,%.10e]," % (poi_i[0], poi_i[1], poi_i[2])) 206 | 207 | # POI data value [256] 208 | self._mapmatrix = copy.copy(self.DATAs[:, 2]) 209 | 210 | # POI data Position [256,2] 211 | self.datas = self.DATAs[:, 0:2] * self.mapx 212 | 213 | # sum of all 
POI data values 214 | self.totaldata = np.sum(self.DATAs[:, 2]) 215 | log.log(self.DATAs) 216 | 217 | """ 218 | Initial Fill Station 219 | """ 220 | # TODO:加入加油站的有限油量 221 | station = [ 222 | [0.1875, 0.8125, 50], 223 | [0.625, 0.8125, 50], 224 | [0.5, 0.5, 50], 225 | [0.375, 0.125, 50], 226 | [0.875, 0.25, 50] 227 | ] 228 | 229 | self.FILL = np.reshape(station, (-1, 3)).astype(np.float16) 230 | 231 | # Fill Station Position [5,2] 232 | self.fills = self.FILL[:, 0:2] * self.mapx 233 | 234 | # Fill Station remain energy [5] 235 | self.fills_energy_remain = copy.copy(self.FILL[:, 2]) 236 | 237 | # sum of all FIll Station remain energy 238 | self.total_fills_energy_remain = np.sum(self.FILL[:, 2]) 239 | 240 | log.log(self.FILL) 241 | 242 | """ 243 | Initial image information 244 | """ 245 | # [80,80] 246 | self._image_data = np.zeros((self.map.width, self.map.height)).astype(np.float16) 247 | 248 | # [n,80,80] 249 | self._image_position = np.zeros((self.sg.V['NUM_UAV'], self.map.width, self.map.height)).astype(np.float16) 250 | 251 | # [80,80] 252 | self._image_access = np.zeros((self.map.width, self.map.height)).astype(np.float16) 253 | 254 | # empty wall 255 | # draw walls in the border of the map (self._image_data) 256 | # the value of the wall is -1 257 | # the width of the wall is 4, which can be customized in image/flag.py 258 | # after adding four wall borders, the shape of the map is still [80,80] 259 | self.map.draw_wall(self._image_data) 260 | 261 | # PoI 262 | # draw PoIs in the map (self._image_data) 263 | # the position of PoI is [x*4+8,y*4+8] of the [80,80] map, 264 | # where x,y->[0~15] 265 | # the PoI's size is [2,2] in [80,80] map 266 | # the value of PoI in the map is the actual value of PoI (self._mapmatrix[i]) 267 | # PoI value->(0~1) 268 | for i, position in enumerate(self.datas): 269 | self.map.draw_point(position[0], position[1], self._mapmatrix[i], self._image_data) 270 | for obstacle in self.sg.V['OBSTACLE']: 271 | self.map.draw_obstacle(obstacle[0], obstacle[1], obstacle[2], obstacle[3], self._image_data) 272 | 273 | for i_n in range(self.n): 274 | # layer 2 275 | self.map.draw_UAV(self.sg.V['INIT_POSITION'][1], self.sg.V['INIT_POSITION'][2], 1., 276 | self._image_position[i_n]) 277 | for i, position in enumerate(self.fills): 278 | self.map.draw_FillStation(position[0], position[1], self.fills_energy_remain[i], 279 | self._image_position[i_n]) 280 | 281 | # 无人机随机颜色 282 | self.uav_render_color = [] 283 | for i in range(self.n): 284 | self.uav_render_color.append(np.random.randint(low=0, high=255, size=3, dtype=np.uint8)) 285 | 286 | self.pow_list = [] 287 | 288 | def reset(self): 289 | # initialize data map 290 | # tr = tracker.SummaryTracker() 291 | self.mapmatrix = copy.copy(self._mapmatrix) 292 | self.fills_energy_remain = copy.copy(self.FILL[:, 2]) 293 | 294 | # record data access times(per 0.001 default) 295 | self.maptrack = np.zeros(self.mapmatrix.shape) 296 | # ---- 297 | # initialize state(get POI/filling) and positions of uavs 298 | self.uav = [list(self.sg.V['INIT_POSITION']) for i in range(self.n)] 299 | self.eff = [0.] 
* self.n 300 | self.count = 0 301 | self.zero = 0 302 | 303 | self.trace = [[] for i in range(self.n)] 304 | self.energytrace = [[] for i in range(self.n)] 305 | # initialize remaining energy 306 | self.energy = np.ones(self.n).astype(np.float64) * self.maxenergy 307 | # initialize indicators 308 | self.collection = np.zeros(self.n).astype(np.float16) 309 | # energy use 310 | self.use_energy = np.zeros(self.n).astype(np.float16) 311 | # energy fill 312 | self.fill_energy = np.zeros(self.n).astype(np.float16) 313 | # energy max 314 | self.max_energy_array = np.array([self.maxenergy] * self.n).astype(np.float16) 315 | 316 | # walls 317 | self.walls = np.zeros(self.n).astype(np.int16) 318 | 319 | # time 320 | self.time_ = 0 321 | 322 | # initialize images 323 | self.state = self.__init_image() 324 | return self.__get_state() 325 | 326 | def __init_image(self): 327 | self.image_data = copy.copy(self._image_data) 328 | self.image_access = copy.copy(self._image_access) 329 | self.image_position = copy.copy(self._image_position) 330 | self.image_track = np.zeros(self.image_position.shape) 331 | # ---- 332 | state = [] 333 | for i in range(self.n): 334 | image = np.zeros((self.map.width, self.map.height, self.channel)).astype(np.float16) 335 | for width in range(image.shape[0]): 336 | for height in range(image.shape[1]): 337 | # god view 338 | image[width][height][0] = self.image_data[width][height] 339 | image[width][height][1] = self.image_position[i][width][height] 340 | image[width][height][2] = self.image_access[width][height] 341 | state.append(image) 342 | return state 343 | 344 | def __draw_image(self, clear_uav, update_point, update_station, update_track): 345 | # update 3 channels 346 | for n in range(self.n): 347 | for i, value in update_point: 348 | self.map.draw_point(self.datas[i][0], self.datas[i][1], value, self.state[n][:, :, 0]) 349 | for i, value in update_station: 350 | self.map.draw_point(self.fills[i][0], self.fills[i][1], value, self.state[n][:, :, 1]) 351 | self.map.clear_uav(clear_uav[n][1], clear_uav[n][2], self.state[n][:, :, 1]) 352 | self.map.draw_UAV(self.uav[n][1], self.uav[n][2], self.energy[n] / self.maxenergy, self.state[n][:, :, 1]) 353 | 354 | # ---- draw track 355 | for i, value in update_track: 356 | self.map.draw_point(self.datas[i][0], self.datas[i][1], value, self.state[n][:, :, 2]) 357 | 358 | def __get_state(self): 359 | return copy.deepcopy(self.state) 360 | 361 | # TODO: penalty加移动penalty,有待商榷 362 | def __get_reward(self, value, energy, distance, penalty, fairness, fairness_): 363 | factor0 = value / (self.factor * distance + self.alpha * value + self.epsilon) 364 | factor1 = energy / self.maxenergy / (self.factor * distance + self.epsilon) 365 | reward = factor0 + factor1 366 | if value == 0 and energy == 0: # 浪费生命的一步 367 | return penalty - self.normalize * distance 368 | else: 369 | return reward * fairness_ + penalty 370 | 371 | def __get_fairness(self, values): 372 | square_of_sum = np.square(np.sum(values)) 373 | sum_of_square = np.sum(np.square(values)) 374 | if sum_of_square == 0: 375 | return 0. 
376 | jain_fairness_index = square_of_sum / sum_of_square / float(len(values)) 377 | return jain_fairness_index 378 | 379 | def __get_eff1(self, value, distance): 380 | return value / (distance + self.alpha * value + self.epsilon) 381 | 382 | def __cusume_energy1(self, uav, value, distance): 383 | # distance-0.1, alpha-1.0 384 | if (self.factor * distance + self.alpha * value < self.energy[uav]): 385 | self.energy[uav] -= (self.factor * distance + self.alpha * value) 386 | self.use_energy[uav] += (self.factor * distance + self.alpha * value) 387 | else: 388 | self.use_energy[uav] += self.energy[uav] 389 | distance = self.energy[uav] / self.factor 390 | self.energy[uav] = 0 391 | 392 | def __fill_energy1(self, uav): 393 | # fspeed-0.1 394 | if self.energy[uav] + self.fspeed * self.maxenergy <= self.maxenergy: 395 | self.fill_energy[uav] += self.fspeed * self.maxenergy 396 | self.energy[uav] += self.fspeed * self.maxenergy 397 | else: 398 | self.fill_energy[uav] += self.maxenergy - self.energy[uav] 399 | self.energy[uav] = self.maxenergy 400 | 401 | def step(self, actions, indicator=None): 402 | # actions = actions.reshape((2, 3)) 403 | self.count += 1 404 | action = copy.deepcopy(actions) 405 | # 6-20 00:43 406 | if np.max(action) > self.maxaction: 407 | self.maxaction = np.max(action) 408 | # print(self.maxaction) 409 | if np.min(action) < self.minaction: 410 | self.minaction = np.min(action) 411 | # print(self.minaction) 412 | 413 | action = np.clip(action, -1e3, 1e3) 414 | 415 | normalize = self.normalize 416 | 417 | # TODO:梯度爆炸问题不可小觑, 418 | # 遇到nan直接卡掉 419 | for i in range(self.n): 420 | for ii in action[i]: 421 | if np.isnan(ii): 422 | print('Nan. What can I do? do!') 423 | while True: 424 | pass 425 | 426 | reward = [0] * self.n 427 | self.tmp_value = [0] * self.n 428 | self.tmp_energy = [0] * self.n 429 | self.tmp_distance = [0] * self.n 430 | self.tmp_penalty = [0] * self.n 431 | self.dn = [False] * self.n # no energy UAV 432 | update_points = [] # Updated PoI remained data 433 | update_stations = [] # Updated Station remained energy 434 | update_tracks = [] # Updated PoI access times 435 | clear_uav = copy.copy(self.uav) 436 | new_positions = [] 437 | c_f = self.__get_fairness(self.maptrack) 438 | 439 | # update positions of UAVs 440 | for i in range(self.n): 441 | self.trace[i].append(self.uav[i]) 442 | self.energytrace[i].append(self.energy[i] / self.maxenergy) 443 | 444 | # distance is from action(x,y), which is a kind of offset,[minaction,maxaction] 445 | distance = np.sqrt(np.power(action[i][0], 2) + np.power(action[i][1], 2)) 446 | data = 0.0 447 | value = 0.0 448 | energy = 0.0 449 | penalty = 0.0 450 | 451 | # think about distance and energy 452 | # 1.normal and enough energy 453 | # 2.so small 454 | # 3.so large(>maxdistance) enough energy 455 | # 4.so large(>energy) 456 | if distance <= self.maxdistance and self.energy[i] >= self.factor * distance: 457 | new_x = self.uav[i][1] + action[i][0] 458 | new_y = self.uav[i][2] + action[i][1] 459 | else: 460 | maxdistance = self.maxdistance if self.maxdistance <= self.energy[i] else \ 461 | self.energy[i] 462 | # distance>=0.001 463 | if distance <= self.epsilon: 464 | distance = self.epsilon 465 | print("very small.") 466 | new_x = self.uav[i][1] + maxdistance * action[i][0] / distance 467 | new_y = self.uav[i][2] + maxdistance * action[i][1] / distance 468 | distance = maxdistance 469 | 470 | self.__cusume_energy1(i, 0, distance) 471 | 472 | # penalty!! 
473 | # update position 474 | # if normal, save update 475 | # if reach OB or WALL, give negative reward, save original positon 476 | dx = new_x - self.uav[i][1] 477 | dy = new_y - self.uav[i][2] 478 | # TODO:简单的防夸张跳墙 479 | if 0 <= new_x < self.mapx and 0 <= new_y < self.mapy and self.mapob[myint(new_x)][ 480 | myint(new_y)] != self.OB and \ 481 | self.mapob[myint(self.uav[i][1] + (dx / 2))][myint(self.uav[i][2] + (dy / 2))] != self.OB and \ 482 | self.mapob[myint(self.uav[i][1] + (dx / 3))][myint(self.uav[i][2] + (dy / 3))] != self.OB and \ 483 | self.mapob[myint(self.uav[i][1] + (2 * dx / 3))][ 484 | myint(self.uav[i][2] + (2 * dy / 3))] != self.OB and \ 485 | self.mapob[myint(self.uav[i][1] + (dx / 4))][myint(self.uav[i][2] + (dy / 4))] != self.OB and \ 486 | self.mapob[myint(self.uav[i][1] + (3 * dx / 4))][myint(self.uav[i][2] + (3 * dy / 4))] != self.OB: 487 | new_positions.append([0, new_x, new_y]) 488 | else: 489 | new_positions.append([0, self.uav[i][1], self.uav[i][2]]) 490 | penalty += normalize * self.pwall 491 | self.walls[i] += 1 492 | 493 | # TODO:加完了会有惊喜的哈哈哈!!! 494 | if self.energy[i] < self.safe_energy_rate * self.maxenergy: 495 | penalty += -1. * distance * normalize 496 | 497 | # TODO:先看能否加油 498 | # calculate distances between UAV and FillStation points 499 | _pos = np.repeat([new_positions[-1][1:]], [self.fills.shape[0]], axis=0) # just repeat(On) NB! 500 | _minus = self.fills - _pos 501 | _power = np.power(_minus, 2) 502 | _dis = np.sum(_power, axis=1) 503 | __exists_FS = 0 504 | tmp = self.energy[i] 505 | for index, dis in enumerate(_dis): 506 | # sensing Fill Station(crange=1.1) 507 | if np.sqrt(dis) <= self.crange: 508 | __exists_FS = 1 509 | # uodate poi data 510 | if self.fills_energy_remain[index] > 0: 511 | # TODO:加油站的信息更新 512 | if self.fspeed * self.maxenergy <= self.fills_energy_remain[index]: 513 | if self.energy[i] + self.fspeed * self.maxenergy <= self.maxenergy: 514 | self.fill_energy[i] += self.fspeed * self.maxenergy 515 | self.fills_energy_remain[index] -= self.fspeed * self.maxenergy 516 | self.energy[i] += self.fspeed * self.maxenergy 517 | else: 518 | self.fill_energy[i] += self.maxenergy - self.energy[i] 519 | self.fills_energy_remain[index] -= (self.maxenergy - self.energy[i]) 520 | self.energy[i] = self.maxenergy 521 | else: 522 | if self.energy[i] + self.fills_energy_remain[index] <= self.maxenergy: 523 | self.fill_energy[i] += self.fills_energy_remain[index] 524 | self.energy[i] += self.fills_energy_remain[index] 525 | self.fills_energy_remain[index] = 0 526 | else: 527 | self.fill_energy[i] += self.maxenergy - self.energy[i] 528 | self.fills_energy_remain[index] -= (self.maxenergy - self.energy[i]) 529 | self.energy[i] = self.maxenergy 530 | update_stations.append([index, self.fills_energy_remain[index]]) 531 | break 532 | 533 | # 若在加油站范围内则加油,若不在任何一个加油站范围内,则采集数据 534 | if __exists_FS == 1: 535 | new_positions[-1][0] = -1 # 状态标识符置为-1 536 | if indicator is not None: 537 | indicator[i] = -1 538 | # fill energy!! 539 | energy = self.energy[i] - tmp 540 | 541 | 542 | else: 543 | new_positions[-1][0] = 1 # 状态标识符置为1 544 | if indicator is not None: 545 | indicator[i] = 1 546 | # calculate distances between UAV and data points 547 | _pos = np.repeat([new_positions[-1][1:]], [self.datas.shape[0]], axis=0) # just repeat(On) NB! 
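# Vectorised squared distances from the new UAV position to every PoI: tile
# the position, subtract, square and sum per row; the square root is only
# taken inside the sensing-range test below. (Plain NumPy broadcasting would
# avoid the explicit repeat, e.g.
#     _dis = np.sum((self.datas - np.asarray(new_positions[-1][1:])) ** 2, axis=1)
# gives the same result.)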
548 | _minus = self.datas - _pos
549 | _power = np.power(_minus, 2)
550 | _dis = np.sum(_power, axis=1)
551 | for index, dis in enumerate(_dis):
552 | # sensing PoI(crange=1.1)
553 | if np.sqrt(dis) <= self.crange:
554 | self.maptrack[index] += self.track
555 | update_tracks.append([index, self.maptrack[index]]) # update poi access times
556 |
557 | # update PoI data
558 | if self.mapmatrix[index] > 0:
559 | # cspeed is the percentage of a PoI's initial data collected per step
560 | data += self._mapmatrix[index] * self.cspeed
561 | self.mapmatrix[index] -= self._mapmatrix[index] * self.cspeed
562 | if self.mapmatrix[index] < 0:
563 | self.mapmatrix[index] = 0.
564 | update_points.append([index, self.mapmatrix[index]])
565 |
566 | # update info (collected data)
567 | # use energy to collect PoI data (consumes UAV energy, alpha per unit, default 1.0)
568 | value = data if self.energy[i] >= data * self.alpha else self.energy[i]
569 | self.__cusume_energy1(i, value, 0.)  # collect data
570 |
571 | # calculate fairness
572 | c_f_ = self.__get_fairness(self.maptrack)
573 |
574 | # reward
575 | reward[i] += self.__get_reward(value, energy, distance, penalty, c_f, c_f_)
576 |
577 | # TODO: debug
578 | self.tmp_value[i] = value
579 | self.tmp_energy[i] = energy
580 | self.tmp_distance[i] = distance
581 | self.tmp_penalty[i] = penalty
582 |
583 | # ----
584 | c_f = c_f_
585 |
586 | # efficiency
587 | self.eff[i] += self.__get_eff1(value, distance)
588 | self.collection[i] += value
589 |
590 | # mark UAVs that have run out of energy
591 | if self.energy[i] <= self.epsilon * self.maxenergy:
592 | self.dn[i] = True
593 |
594 | self.uav = new_positions
595 | t = time.time()
596 | self.__draw_image(clear_uav, update_points, update_stations, update_tracks)
597 | self.time_ += time.time() - t
598 |
599 | # TODO: reward scaling; why clip the reward manually?
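# Per-step rewards are divided by NORMALIZE and clipped to [-2, 2] before
# being returned, which keeps the TD targets bounded and protects the critic
# from the exploding gradients / NaN values guarded against elsewhere in step().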
600 | self.reward = list(np.clip(np.array(reward) / normalize, -2., 2.))
601 | # self.reward = list(np.array(reward) / normalize)
602 |
603 | info = None
604 | state = self.__get_state()
605 | for r in self.reward:
606 | if np.isnan(r):
607 | print('Reward is NaN')
608 | while True:
609 | pass
610 |
611 | # TODO: do not end the episode early; let UAVs experience running out of energy (in the extreme, all of them at once), even though this may hurt the TD error
612 | # done = True
613 | # for d in self.dn:
614 | # if d is False:
615 | # done = False
616 | # break
617 | # else:
618 | # continue
619 |
620 | done = False
621 | return state, self.reward, done, info, indicator
622 |
623 | def render(self):
624 | global power_list
625 | observ = list(self.__get_state())
626 | observ = np.array(observ)
627 | observ = observ.transpose((0, 2, 1, 3))
628 | observ_0 = observ[np.random.randint(low=0, high=self.n), :, :, 0]
629 | observ_1 = observ[np.random.randint(low=0, high=self.n), :, :, 2]
630 |
631 | img_0 = np.zeros([80, 80, 3], dtype=np.uint8)
632 | self.draw_convert(observ_0, img_0, max(self._mapmatrix), color=np.asarray([0, 255, 0]))
633 |
634 | max_visit_val = max(np.max(observ_1), self.sg.V['VISIT'] * 20)
635 | img_1 = np.zeros([80, 80, 3], dtype=np.uint8)
636 | self.draw_convert(observ_1, img_1, max_visit_val, color=np.asarray([0, 255, 0]))
637 |
638 | for i in range(self.n):
639 | power_list = self.draw_convert(observ[i, :, :, 1], img_0, self.maxenergy, color=self.uav_render_color[i],
640 | is_power=True)
641 |
642 | img = np.hstack([img_0, img_1])
643 | img = cv2.resize(img, (800, 400))
644 |
645 | for p in power_list:
646 | cv2.circle(img, (p[1] * 5, p[0] * 5), 25, (0, 0, 255))
647 |
648 | img = cv2.flip(img, 0, dst=None)
649 |
650 | cv2.imshow('show', img)
651 | cv2.waitKey(1)
652 |
653 | def draw_convert(self, observ, img, max_val, color, is_power=False):
654 | for i in range(80):
655 | for j in range(80):
656 |
657 | if observ[j, i] < 0 and not is_power:
658 | img[j, i, 0] = np.uint8(255)
659 | elif observ[j, i] < 0 and is_power:
660 | img[j, i, 2] = np.uint8(255)
661 | self.pow_list.append((j, i))
662 | elif observ[j, i] > 0 and is_power:
663 | img[j, i, :] = np.uint8(color * observ[j, i])
664 | elif observ[j, i] > 0 and not is_power:
665 | img[j, i, :] = np.uint8(color * observ[j, i] / max_val)
666 |
667 | if len(self.pow_list) > 0:
668 | return self.pow_list
669 |
670 | # TODO: maybe not useful now
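# Episode-level metrics exposed as properties below:
#   leftrewards        fraction of PoI data still uncollected,
#   efficiency         collection ratio times collection fairness divided by
#                      normalised energy use (the energy-efficiency indicator
#                      aggregated in test.py),
#   fairness variants  Jain's index over remaining data, collected data, and
#                      collected data normalised by each PoI's initial value.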
671 | @property 672 | def leftrewards(self): 673 | return np.sum(self.mapmatrix) / self.totaldata 674 | 675 | @property 676 | def efficiency(self): 677 | return np.sum(self.collection / self.totaldata) * self.collection_fairness / (np.sum(self.normal_use_energy)) 678 | 679 | @property 680 | def normal_use_energy(self): 681 | tmp = list(np.array(self.use_energy) / (self.max_energy_array)) 682 | # for i in range(len(tmp)): 683 | # if tmp[i] > 1.0: 684 | # tmp[i] = 1.0 685 | 686 | return tmp 687 | 688 | @property 689 | def fairness(self): 690 | square_of_sum = np.square(np.sum(self.mapmatrix[:])) 691 | sum_of_square = np.sum(np.square(self.mapmatrix[:])) 692 | fairness = square_of_sum / sum_of_square / float(len(self.mapmatrix)) 693 | return fairness 694 | 695 | @property 696 | def collection_fairness(self): 697 | collection = self._mapmatrix - self.mapmatrix 698 | square_of_sum = np.square(np.sum(collection)) 699 | sum_of_square = np.sum(np.square(collection)) 700 | fairness = square_of_sum / sum_of_square / float(len(collection)) 701 | return fairness 702 | 703 | @property 704 | def normal_collection_fairness(self): 705 | collection = self._mapmatrix - self.mapmatrix 706 | for index, i in enumerate(collection): 707 | collection[index] = i / self._mapmatrix[index] 708 | square_of_sum = np.square(np.sum(collection)) 709 | sum_of_square = np.sum(np.square(collection)) 710 | fairness = square_of_sum / sum_of_square / float(len(collection)) 711 | return fairness 712 | -------------------------------------------------------------------------------- /experiments/crazy_env/tsp_env_setting.py: -------------------------------------------------------------------------------- 1 | class Setting(object): 2 | def __init__(self, log): 3 | self.V = { 4 | 'MAP_X': 16, 5 | 'MAP_Y': 16, 6 | 'MAX_VALUE': 1., 7 | 'MIN_VALUE': 0., 8 | 'OBSTACLE': [ 9 | # [0, 4, 1, 1], 10 | # [0, 9, 1, 1], 11 | # [0, 10, 2, 1], 12 | # [2, 2, 2, 1], 13 | # [5, 13, 1, 1], 14 | # [6, 12, 2, 1], 15 | # [10, 5, 3, 1], 16 | # [11, 5, 1, 3], 17 | # [10, 13, 1, 2], 18 | # [11, 13, 2, 1], 19 | # [12, 0, 1, 2], 20 | # [12, 5, 1, 1], 21 | # [12, 7, 1, 1], 22 | # [15, 11, 1, 1] 23 | ], 24 | 'CHANNEL': 3, 25 | 26 | 'NUM_UAV': 5, 27 | 'INIT_POSITION': (0, 8, 8), 28 | 'MAX_ENERGY': 50., # must face the time of lack 29 | 'NUM_ACTION': 2, # 2 30 | 'SAFE_ENERGY_RATE': 0.2, 31 | 'RANGE': 1.1, 32 | 'MAXDISTANCE': 4., 33 | 'COLLECTION_PROPORTION': 0.2, # c speed 34 | 'FILL_PROPORTION': 0.2, # fill speed 35 | 36 | 'WALL_REWARD': -1., 37 | 'VISIT': 1. 
/ 1000., 38 | 'DATA_REWARD': 1., 39 | 'FILL_REWARD': 1., 40 | 'ALPHA': 1., 41 | 'BETA': 0.1, 42 | 'EPSILON': 1e-4, 43 | 'NORMALIZE': .1, 44 | 'FACTOR': 0.1, 45 | } 46 | self.LOG = log 47 | self.time = log.time 48 | 49 | def log(self): 50 | self.LOG.log(self.V) 51 | -------------------------------------------------------------------------------- /experiments/env0/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/experiments/env0/__init__.py -------------------------------------------------------------------------------- /experiments/env0/env_setting0.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | class Setting(object): 5 | def __init__(self, log): 6 | self.V = { 7 | 'MAP_X': 16, 8 | 'MAP_Y': 16, 9 | 'MAX_VALUE': 1., 10 | 'MIN_VALUE': 0., 11 | 'OBSTACLE': [ 12 | [0, 4, 1, 1], 13 | [0, 9, 1, 1], 14 | [0, 10, 2, 1], 15 | [2, 2, 2, 1], 16 | [3, 6, 4, 1], 17 | [4, 4, 1, 4], 18 | # [4,12, 1, 1], 19 | [5, 13, 1, 1], 20 | [6, 12, 2, 1], 21 | # [10,3, 1, 1], 22 | [10, 5, 3, 1], 23 | # [10,9, 1, 1], 24 | [11, 5, 1, 3], 25 | [10, 13, 1, 2], 26 | [11, 13, 2, 1], 27 | # [11,12,1, 2], 28 | [12, 0, 1, 2], 29 | [12, 5, 1, 1], 30 | [12, 7, 1, 1], 31 | # [12,13,2, 1], 32 | [15, 11, 1, 1] 33 | ], 34 | 'CHANNLE': 3, 35 | 36 | 'NUM_UAV': 2, 37 | 'INIT_POSITION': (8, 8), 38 | 'MAX_ENERGY': 500., 39 | 'NUM_ACTION': 2, 40 | 'RANGE' : 1.1, 41 | 'MAXDISTANCE': 1., 42 | 'COLLECTION_PROPORTION': 0.2, # c speed 43 | 44 | 'WALL_REWARD': -1., 45 | 'DATA_REWARD': 1., 46 | 'WASTE_STEP' : -.5, 47 | 'ALPHA': 1., 48 | # 'BETA': 0.01, 49 | 'EPSILON': 1e-4, 50 | 'NORMALIZE': .1, 51 | 'FACTOR': 0.1, 52 | } 53 | self.LOG = log 54 | self.time = log.time 55 | 56 | def log(self): 57 | # with open(os.path.join('.', self.time + '.txt'), 'x') as file: 58 | # for key, value in self.V.items(): 59 | # print(key, value, file=file) 60 | self.LOG.log(self.V) 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /experiments/env0/log0.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | class Log(object): 7 | def __init__(self): 8 | self.time = str(time.strftime("%Y/%m-%d/%H-%M-%S", time.localtime())) 9 | self.full_path = os.path.join('.', self.time) 10 | os.makedirs(self.full_path) 11 | self.file_path = self.full_path + '/REPORT.txt' 12 | file = open(self.file_path, 'x') 13 | file.close() 14 | 15 | def log(self, values): 16 | if isinstance(values, dict): 17 | with open(self.file_path, 'a') as file: 18 | for key, value in values.items(): 19 | print(key, value, file=file) 20 | elif isinstance(values, list): 21 | with open(self.file_path, 'a') as file: 22 | for value in values: 23 | print(value, file=file) 24 | else: 25 | with open(self.file_path, 'a') as file: 26 | print(values, file=file) 27 | 28 | def draw_path(self, env, step): 29 | full_path = os.path.join(self.full_path, 'Path') 30 | # ob_xy = np.zeros((FLAGS.map_x, FLAGS.map_y)) 31 | # for i in FLAGS.obstacle: 32 | # for x in range(i[0], i[0] + i[2], 1): 33 | # for y in range(i[1], i[1] + i[3], 1): 34 | # ob_xy[x][y] = 1 35 | if not os.path.exists(full_path): 36 | os.makedirs(full_path) 37 | xxx = [] 38 | colors = [] 39 | for x in range(env.mapx): 40 | xxx.append((x, 1)) 41 | for y in range(env.mapy): 42 | c 
= [] 43 | for x in range(env.mapx): 44 | if env.mapob[x][y] == 1: 45 | c.append((1, 0, 0, 1)) 46 | else: 47 | c.append((1, 1, 1, 1)) 48 | colors.append(c) 49 | 50 | Fig = plt.figure(figsize=(5, 5)) 51 | PATH = np.array(env.trace) 52 | for i1 in range(env.mapy): 53 | plt.broken_barh(xxx, (i1, 1), facecolors=colors[i1]) 54 | plt.scatter(env.datas[:,0], env.datas[:,1], c=env.DATAs[:,2]) 55 | for i in range(env.n): 56 | # M = Fig.add_subplot(1, 1, i + 1) 57 | plt.ylim(ymin=0, ymax=env.mapy) 58 | plt.xlim(xmin=0, xmax=env.mapx) 59 | color = np.random.random(3) 60 | plt.plot(PATH[i, :, 0], PATH[i, :, 1], color=color) 61 | plt.scatter(PATH[i, :, 0], PATH[i, :, 1], color=color,marker='.') 62 | plt.grid(True, linestyle='-.', color='r') 63 | plt.title(str(env.normal_energy) + ',\n' + str(env.leftrewards)) 64 | Fig.savefig(full_path + '/path_' + str(step) + '.png') 65 | 66 | plt.close() 67 | -------------------------------------------------------------------------------- /experiments/image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/experiments/image/__init__.py -------------------------------------------------------------------------------- /experiments/image/flag.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | flags = tf.app.flags 4 | FLAGS = flags.FLAGS 5 | 6 | # map info 7 | flags.DEFINE_integer('image_size', 80, 'the size of image') 8 | flags.DEFINE_integer('image_deepth', 2, 'the deepth of image') 9 | flags.DEFINE_integer('wall_value', -1, 'the value of wall') 10 | flags.DEFINE_integer('wall_width', 4, 'the width of wall') 11 | flags.DEFINE_integer('fill_value', -1, 'the value of FillStation') 12 | 13 | flags.DEFINE_integer('map_x', 16, 'the length of x-axis') 14 | flags.DEFINE_integer('map_y', 16, 'the length of y-axis') 15 | -------------------------------------------------------------------------------- /experiments/image/map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Map(object): 4 | def __init__(self, width, height): 5 | # self.__map = np.zeros((width, height)) 6 | self.__width = width 7 | self.__height = height 8 | 9 | # @property 10 | # def map(self): 11 | # return self.__map 12 | @property 13 | def width(self): 14 | return self.__width 15 | @property 16 | def height(self): 17 | return self.__height 18 | 19 | def get_value(self, x, y, map): 20 | return map[x][y] 21 | 22 | def draw_sqr(self, x, y, width, height, value, map): 23 | assert 0 <= x < self.__width and 0 <= y < self.__height, 'the position ({0}, {1}) is not correct.'.format(x, y) 24 | for i in range(x, x + width, 1): 25 | for j in range(y, y + height, 1): 26 | map[i][j] = value 27 | 28 | 29 | -------------------------------------------------------------------------------- /experiments/image/mapM.py: -------------------------------------------------------------------------------- 1 | from .map import Map 2 | from .flag import FLAGS 3 | from PIL import Image 4 | import time 5 | import os 6 | 7 | class MapM(Map): 8 | 9 | def __init__(self, log_path, width=80, height=80): 10 | super(MapM, self).__init__(width, height) 11 | self.__time = time.time() 12 | self.full_path = os.path.join(log_path, 'img_map') 13 | if not os.path.exists(self.full_path): 14 | os.makedirs(self.full_path) 15 | 16 | def draw_wall(self, map): 17 | wall = FLAGS.wall_value 18 
| width = FLAGS.wall_width 19 | for j in range(0, 80, 1): 20 | for i in range(80-width, 80, 1): 21 | self.draw_sqr(i, j, 1, 1, wall, map) 22 | for i in range(0, width, 1): 23 | self.draw_sqr(i, j, 1, 1, wall, map) 24 | for i in range(0, 80, 1): 25 | for j in range(0, width, 1): 26 | self.draw_sqr(i, j, 1, 1, wall, map) 27 | for j in range(80-width, 80, 1): 28 | self.draw_sqr(i, j, 1, 1, wall, map) 29 | 30 | def get_value(self, x, y, map): 31 | x, y = self.__trans(x, y) 32 | super(MapM, self).get_value(x, y, map) 33 | 34 | def __trans(self, x, y): 35 | return int(4 * x + FLAGS.wall_width*2), int(y * 4 + FLAGS.wall_width*2) 36 | 37 | def draw_obstacle(self, x, y, width, height, map): 38 | # self.clear_cell(x, y, map) 39 | x, y = self.__trans(x, y) 40 | self.draw_sqr(x, y, width * 4, height * 4, FLAGS.wall_value, map) 41 | 42 | def draw_chargestation(self, x, y, map): 43 | self.clear_cell(x, y, map) 44 | x, y = self.__trans(x, y) 45 | self.draw_sqr(x, y + 1, 4, 2, 1, map) 46 | self.draw_sqr(x + 1, y, 2, 4, 1, map) 47 | 48 | # xy transpose occur 49 | def draw_point(self, x, y, value, map): 50 | x, y = self.__trans(x, y) 51 | self.draw_sqr(x, y, 2, 2, value, map) 52 | 53 | def clear_point(self, x, y, map): 54 | x, y = self.__trans(x, y) 55 | self.draw_sqr(x, y, 2, 2, 0, map) 56 | 57 | def clear_uav(self, x, y, map): 58 | self.clear_cell(x, y, map) 59 | 60 | def draw_UAV(self, x, y, value, map): 61 | x = -1 if x < -1 else FLAGS.map_x if x > FLAGS.map_x else x 62 | y = -1 if y < -1 else FLAGS.map_y if y > FLAGS.map_y else y 63 | self.clear_cell(x, y, map) 64 | x, y = self.__trans(x, y) 65 | # self.draw_sqr(x, y + 1, 4, 2, value, map) 66 | # self.draw_sqr(x + 1, y, 2, 4, value, map) 67 | # value = self.get_value(x, y) 68 | self.draw_sqr(x, y, 4, 4, value, map) 69 | # self.draw_sqr(x, y, 4, 4, value) 70 | 71 | def clear_cell(self, x, y, map): 72 | x, y = self.__trans(x, y) 73 | self.draw_sqr(x, y, 4, 4, 0, map) 74 | 75 | def draw_goal(self, x, y, map): 76 | # x, y = self.__trans(x, y) 77 | # value = self.get_value(x + 2, y + 2, map) 78 | # self.draw_sqr(x, y, 4, 4, 1, map) 79 | # self.draw_sqr(x + 2, y + 2, 2, 2, value, map) 80 | pass 81 | 82 | def draw_FillStation(self,x,y,value,map): 83 | x, y = self.__trans(x, y) 84 | self.draw_sqr(x, y, 2, 2,value,map) 85 | 86 | 87 | def save_as_png(self, map, ip=None): 88 | img = Image.fromarray(map * 255) 89 | img = img.convert('L') 90 | # img.show() 91 | if ip is None: 92 | name = time.time() - self.__time 93 | else: 94 | name = str(ip) 95 | img.save(os.path.join(self.full_path, str(name)), 'png') 96 | 97 | -------------------------------------------------------------------------------- /experiments/poor_compare.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.ticker as ticker 3 | from matplotlib.backends.backend_pdf import PdfPages 4 | import os 5 | import numpy as np 6 | 7 | 8 | def error(input_list): 9 | input = np.array(input_list) 10 | input = input.transpose((1, 0)) 11 | error_low = input[0] - input[1] 12 | error_high = input[2] - input[0] 13 | error = [] 14 | error.append(error_low) 15 | error.append(error_high) 16 | return error 17 | 18 | 19 | def average(input_list): 20 | input = np.array(input_list) 21 | input = input.transpose((1, 0)) 22 | return input[0] 23 | 24 | 25 | def compare_plot_errorbar(xlabel, ylabel, x, eDivert, woApeX, woRNN, MADDPG): 26 | plt.xlabel(xlabel) 27 | plt.ylabel(ylabel) 28 | plt.errorbar(x=x, y=average(eDivert), 
yerr=error(eDivert), fmt='r-o', label='e-Divert', capsize=4) 29 | plt.errorbar(x=x, y=average(woApeX), yerr=error(woApeX), fmt='g-^', label='e-Divert w/o Ape-X', capsize=4) 30 | plt.errorbar(x=x, y=average(woRNN), yerr=error(woRNN), fmt='m-<', label='e-Divert w/o RNN', capsize=4) 31 | plt.errorbar(x=x, y=average(MADDPG), yerr=error(MADDPG), fmt='k-*', label='MADDPG', capsize=4) 32 | 33 | plt.ylim(ymin=0, ymax=1) 34 | plt.grid(True) 35 | plt.grid(linestyle='--') 36 | plt.legend() 37 | plt.show() 38 | 39 | 40 | def compare_plot(xlabel, ylabel, x, yrange, eDivert, TSP): 41 | if os.path.exists('./pdf') is False: 42 | os.makedirs('./pdf') 43 | pdf = PdfPages('./pdf/%s-%s.pdf' % (xlabel, ylabel)) 44 | plt.figure(figsize=(13, 13)) 45 | 46 | plt.xlabel(xlabel, fontsize=32) 47 | plt.ylabel(ylabel, fontsize=32) 48 | plt.xticks(fontsize=32) 49 | plt.yticks(fontsize=32) 50 | plt.plot(x, eDivert, color='b', marker='o', label='e-Divert', markersize=26, markeredgewidth=5, 51 | markerfacecolor='none', linewidth=4) 52 | plt.plot(x, TSP, color='orange', marker='s', label='GA-based route planning', markersize=26, markeredgewidth=5, 53 | markerfacecolor='none', linewidth=4) 54 | 55 | # if ylabel == "Energy usage (# of full batteries)": 56 | # if xlabel == "No. of vehicles": 57 | # plt.plot(x, [3.62, 4.62, 5.62, 6.62, 7.62], color='red', linestyle='--', label="Maximum used energy", 58 | # linewidth=4) 59 | # else: 60 | # plt.axhline(y=2.83, color='red', linestyle='--', label="Maximum used energy", linewidth=4) 61 | plt.xticks(x, x) 62 | plt.gca().yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f')) 63 | plt.ylim(yrange[0], yrange[1] * 1.5) 64 | plt.grid(True) 65 | plt.grid(linestyle='--') 66 | plt.legend(loc='upper left', fontsize=25, ncol=1, markerscale=0.9) 67 | plt.tight_layout() 68 | 69 | pdf.savefig() 70 | plt.close() 71 | pdf.close() 72 | 73 | 74 | if __name__ == '__main__': 75 | # collection-range 76 | compare_plot(xlabel="Sensing range (unit)", 77 | ylabel="Data collection ratio", 78 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 79 | yrange=[0, 0.8], 80 | eDivert=[0.704, 0.719, 0.746, 0.88, 0.95], 81 | TSP=[0.905, 0.917, 0.930, 0.952, 0.974], 82 | ) 83 | 84 | # fairness_range 85 | compare_plot(xlabel="Sensing range (unit)", 86 | ylabel="Geographical fairness", 87 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 88 | yrange=[0, 0.8], 89 | eDivert=[0.755, 0.766, 0.801, 0.91, 0.957], 90 | TSP=[0.919, 0.935, 0.950, 0.963, 0.980], 91 | ) 92 | # # 93 | # energy_range 94 | compare_plot(xlabel="Sensing range (unit)", 95 | ylabel="Energy usage (# of full batteries)", 96 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 97 | yrange=[0, 4], 98 | eDivert=[1.32, 1.329, 1.459, 1.57, 1.805], 99 | TSP=[3.855, 4.219, 4.234, 4.250, 4.270], 100 | ) 101 | 102 | # efficiency_range 103 | compare_plot(xlabel="Sensing range (unit)", 104 | ylabel="Energy efficiency", 105 | x=[0.6, 0.8, 1.0, 1.2, 1.4], 106 | yrange=[0, 0.36], 107 | eDivert=[0.357, 0.362, 0.371, 0.382, 0.4], 108 | TSP=[0.189, 0.178, 0.183, 0.189, 0.196], 109 | ) 110 | 111 | # TODO 112 | # collection-range 113 | compare_plot(xlabel="No. of vehicles", 114 | ylabel="Data collection ratio", 115 | x=[1, 2, 3, 4, 5], 116 | yrange=[0, 0.8], 117 | eDivert=[0.841,0.852,0.902,0.942,0.94], 118 | TSP=[0.893,0.992,0.999,0.994,0.994], 119 | ) 120 | 121 | # fairness_range 122 | compare_plot(xlabel="No. 
of vehicles", 123 | ylabel="Geographical fairness", 124 | x=[1, 2, 3, 4, 5], 125 | yrange=[0, 0.8], 126 | eDivert=[0.862,0.878,0.921,0.943,0.939], 127 | TSP=[0.936,0.988,0.991,0.991,0.991], 128 | ) 129 | # # 130 | # energy_range 131 | compare_plot(xlabel="No. of vehicles", 132 | ylabel="Energy usage (# of full batteries)", 133 | x=[1, 2, 3, 4, 5], 134 | yrange=[0, 6], 135 | eDivert=[1.38,1.784,2.01,2.493,2.67], 136 | TSP=[3.395,4.324,4.941,7.402,7.996], 137 | ) 138 | 139 | # efficiency_range 140 | compare_plot(xlabel="No. of vehicles", 141 | ylabel="Energy efficiency", 142 | x=[1, 2, 3, 4, 5], 143 | yrange=[0, 0.32], 144 | eDivert=[0.386,0.371,0.349,0.311,0.302], 145 | TSP=[0.213,0.201,0.179,0.118,0.102], 146 | ) 147 | -------------------------------------------------------------------------------- /experiments/random_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | poi_data=np.random.random(size=(256,3)) 4 | 5 | for i,poi_i in enumerate(poi_data): 6 | if i==0: 7 | print("[[%.10e,%.10e,%.10e],"% (poi_i[0],poi_i[1],poi_i[2])) 8 | elif i==255: 9 | print("[%.10e,%.10e,%.10e]]" % (poi_i[0], poi_i[1], poi_i[2])) 10 | else: 11 | print("[%.10e,%.10e,%.10e]," % (poi_i[0], poi_i[1], poi_i[2])) 12 | 13 | 14 | test = [[1.5454101562e-01, 2.2583007812e-02, 6.5332031250e-01], 15 | [2.1936035156e-01, 2.1618652344e-01, 8.2568359375e-01], 16 | [3.3813476562e-01, 4.4738769531e-02, 6.6406250000e-02], 17 | [6.5478515625e-01, 6.5429687500e-01, 8.7280273438e-02], 18 | [6.9970703125e-01, 7.5000000000e-01, 4.6923828125e-01], 19 | [3.2177734375e-01, 4.9145507812e-01, 8.8769531250e-01], 20 | [6.0595703125e-01, 8.5449218750e-01, 1.0772705078e-01], 21 | [7.1679687500e-01, 1.1370849609e-01, 5.3759765625e-01], 22 | [7.3046875000e-01, 9.5800781250e-01, 3.6157226562e-01], 23 | [9.7656250000e-01, 4.9365234375e-01, 2.5732421875e-01], 24 | [1.4416503906e-01, 7.8320312500e-01, 7.1679687500e-01], 25 | [7.1435546875e-01, 2.1618652344e-01, 4.7070312500e-01], 26 | [1.3830566406e-01, 6.8310546875e-01, 6.7675781250e-01], 27 | [6.2304687500e-01, 1.4045715332e-02, 4.3017578125e-01], 28 | [9.2919921875e-01, 9.7460937500e-01, 5.6494140625e-01], 29 | [9.5996093750e-01, 3.4423828125e-02, 1.2927246094e-01], 30 | [5.4443359375e-01, 7.9199218750e-01, 3.7622070312e-01], 31 | [4.6777343750e-01, 5.4394531250e-01, 7.2753906250e-01], 32 | [4.7558593750e-01, 7.0898437500e-01, 7.6562500000e-01], 33 | [8.5205078125e-01, 4.8364257812e-01, 3.9965820312e-01], 34 | [7.1240234375e-01, 1.6027832031e-01, 5.7421875000e-01], 35 | [4.7460937500e-01, 9.8937988281e-02, 3.8500976562e-01], 36 | [6.1914062500e-01, 1.2841796875e-01, 1.4758300781e-01], 37 | [6.7773437500e-01, 5.8593750000e-02, 5.6689453125e-01], 38 | [5.2099609375e-01, 1.2927246094e-01, 1.6943359375e-01], 39 | [3.0737304688e-01, 9.3066406250e-01, 9.1845703125e-01], 40 | [1.7565917969e-01, 9.7802734375e-01, 4.3847656250e-01], 41 | [4.1040039062e-01, 8.9794921875e-01, 2.6123046875e-01], 42 | [6.5234375000e-01, 6.9580078125e-01, 6.5429687500e-01], 43 | [9.8046875000e-01, 4.0161132812e-01, 5.4003906250e-01], 44 | [6.2597656250e-01, 7.5244140625e-01, 8.1640625000e-01], 45 | [5.6762695312e-02, 7.7734375000e-01, 2.2973632812e-01], 46 | [9.0380859375e-01, 6.3720703125e-01, 8.8183593750e-01], 47 | [5.9326171875e-01, 5.8740234375e-01, 7.3339843750e-01], 48 | [2.6318359375e-01, 6.7480468750e-01, 3.6206054688e-01], 49 | [2.6245117188e-01, 5.3613281250e-01, 3.1201171875e-01], 50 | [5.5468750000e-01, 3.2397460938e-01, 
5.8496093750e-01], 51 | [9.3896484375e-01, 6.6601562500e-01, 2.0996093750e-02], 52 | [1.3537597656e-01, 2.8100585938e-01, 1.8847656250e-01], 53 | [9.5507812500e-01, 8.2421875000e-01, 6.2890625000e-01], 54 | [4.3505859375e-01, 9.8046875000e-01, 7.4169921875e-01], 55 | [4.8559570312e-01, 4.9853515625e-01, 2.4414062500e-01], 56 | [6.8457031250e-01, 2.5073242188e-01, 4.5385742188e-01], 57 | [5.1025390625e-01, 8.9990234375e-01, 6.6601562500e-01], 58 | [6.6992187500e-01, 6.2011718750e-01, 6.6552734375e-01], 59 | [5.0292968750e-02, 8.3496093750e-01, 6.7968750000e-01], 60 | [7.8808593750e-01, 1.5332031250e-01, 9.0429687500e-01], 61 | [8.2128906250e-01, 7.9833984375e-01, 4.6142578125e-01], 62 | [3.0059814453e-02, 7.8125000000e-01, 4.9951171875e-01], 63 | [1.9006347656e-01, 7.3144531250e-01, 4.3994140625e-01], 64 | [8.3544921875e-01, 4.3237304688e-01, 8.6279296875e-01], 65 | [7.3437500000e-01, 9.9548339844e-02, 1.8688964844e-01], 66 | [2.6074218750e-01, 9.1699218750e-01, 5.9814453125e-01], 67 | [8.1689453125e-01, 1.9482421875e-01, 9.2675781250e-01], 68 | [8.7500000000e-01, 2.7221679688e-01, 7.4707031250e-01], 69 | [7.4121093750e-01, 6.7529296875e-01, 9.1601562500e-01], 70 | [9.3066406250e-01, 6.2207031250e-01, 8.2568359375e-01], 71 | [5.1220703125e-01, 1.7529296875e-01, 1.3122558594e-01], 72 | [8.9794921875e-01, 3.0053710938e-01, 8.1591796875e-01], 73 | [2.6953125000e-01, 6.9824218750e-01, 1.1224365234e-01], 74 | [7.1386718750e-01, 6.3134765625e-01, 1.3537597656e-01], 75 | [6.8066406250e-01, 6.5673828125e-01, 5.0195312500e-01], 76 | [5.4248046875e-01, 1.5234375000e-01, 1.6955566406e-01], 77 | [5.7568359375e-01, 1.5124511719e-01, 8.9599609375e-01], 78 | [1.7065429688e-01, 8.4411621094e-02, 2.5708007812e-01], 79 | [8.6474609375e-01, 2.2229003906e-01, 9.2675781250e-01], 80 | [9.3701171875e-01, 5.1849365234e-02, 3.6474609375e-01], 81 | [8.1298828125e-01, 7.8564453125e-01, 6.2402343750e-01], 82 | [4.1503906250e-01, 5.9423828125e-01, 5.0537109375e-01], 83 | [3.4179687500e-01, 4.7802734375e-01, 8.8818359375e-01], 84 | [3.9306640625e-01, 5.1074218750e-01, 3.0981445312e-01], 85 | [8.0566406250e-01, 1.6113281250e-01, 4.4848632812e-01], 86 | [8.8134765625e-02, 9.7705078125e-01, 8.5742187500e-01], 87 | [2.1984863281e-01, 7.5048828125e-01, 5.2978515625e-01], 88 | [8.5839843750e-01, 8.5058593750e-01, 4.6582031250e-01], 89 | [6.6259765625e-01, 6.6992187500e-01, 6.4404296875e-01], 90 | [8.7500000000e-01, 9.2138671875e-01, 3.1982421875e-01], 91 | [4.5800781250e-01, 5.3076171875e-01, 3.9868164062e-01], 92 | [5.2148437500e-01, 9.7705078125e-01, 8.2617187500e-01], 93 | [2.3986816406e-01, 5.0488281250e-01, 6.6650390625e-01]] 94 | 95 | 96 | sum=0.0 97 | for i in test: 98 | sum+=i[2] 99 | 100 | print(sum) 101 | 102 | 103 | -------------------------------------------------------------------------------- /experiments/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | import pandas as pd 6 | 7 | import maddpg.common.tf_util as U 8 | from experiments.env0 import log0 as Log 9 | from experiments.env0.data_collection0 import Env 10 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 11 | 12 | 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 14 | 15 | # Hyperparameters 16 | ARGUMENTS = [ 17 | # Environment 18 | ["--scenario", str, "simple_adversary", "name of the scenario script"], 19 | ["--max-episode-len", int, 500, "maximum episode length"], 20 | ["--num-episodes", int, 5000, "number of episodes"], 
21 | ["--num-adversaries", int, 0, "number of adversaries(enemy)"], 22 | ["--good-policy", str, "maddpg", "policy for good agents"], 23 | ["--adv-policy", str, "maddpg", "policy of adversaries"], 24 | 25 | # Core training parameters 26 | ["--lr", float, 5e-4, "learning rate for Adam optimizer"], 27 | ["--decay_rate", float, 0.99995, "learning rate exponential decay"], # 作为初始学习率,后面尝试进行衰减,这个不着急加! 28 | ["--gamma", float, 0.95, "discount factor"], 29 | ["--batch-size", int, 32, "number of epochs to optimize at the same time"], 30 | ["--num-units", int, 600, "number of units in the mlp"], 31 | 32 | # Priority Replay Buffer ( weights not used ) 33 | ["--alpha", float, 0.5, "priority parameter"], 34 | ["--beta", float, 0.4, "IS parameter"], 35 | ["--epsilon", float, 0.5, "a small positive constant"], 36 | ["--buffer_size", int, 200000, "buffer size for each agent"], 37 | 38 | # N-steps 39 | ["--N", int, 5, "steps of N-step"], 40 | 41 | # TODO: Experiments 42 | # Ape-X 43 | ["--num_actor_workers", int,0, 44 | "number of environments one agent can deal with. if >1, use apex ; else, use simple maddpg"], 45 | ["--debug_dir", str, "/debug_list/", 46 | "save index,reward(n-step),priority, value,wi per every sample from experience"], 47 | 48 | # RNN 49 | ["--rnn_length", int,0, 50 | "time_step in rnn, try to use LSTM instead of N-steps. if ==0, not use rnn; else, use rnn."], 51 | ["--rnn_cell_size", int, 64, "LSTM-cell output's size"], 52 | 53 | # Checkpointing 保存model 54 | ["--exp-name", str, None, "name of the experiment"], 55 | ["--save-dir", str, "/policy/", "directory in which training state and model sho uld be saved"], 56 | ["--save-rate", int, 10, "save model once every time this many episodes are completed"], 57 | ["--model_to_keep", int, 100, "the number of saved models"], 58 | ["--load-dir", str, "/home/dzp1997/learning/my_experiment_model/alpha0.5_actor5_rnn3", # TODO:polcy之前一个路径 59 | "directory in which training state and model are loaded"], 60 | 61 | # Test 62 | ['--test_time', int, 10, "number of iterations run for testing"], 63 | ["--random_seed", int, 100, "random seed"], 64 | ["--start", int,90,"start model"], #TODO 65 | ["--end", int,100, "end model"] #todo 66 | ] 67 | 68 | ACTIONS = [ 69 | ["--restore", "store_true", False], 70 | ["--display", "store_true", False], 71 | ["--benchmark", "store_true", False] 72 | 73 | ] 74 | 75 | 76 | # 参数调节器 77 | def parse_args(): 78 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 79 | for arg in ARGUMENTS: 80 | parser.add_argument(arg[0], type=arg[1], default=arg[2], help=arg[3]) 81 | for action in ACTIONS: 82 | parser.add_argument(action[0], action=action[1], default=action[2]) 83 | return parser.parse_args() 84 | 85 | 86 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 87 | # 加入多个trainers 88 | trainers = [] 89 | trainer = MADDPGAgentTrainer 90 | 91 | # 对手agent个数 0 92 | for i in range(num_adversaries): 93 | trainers.append(trainer( 94 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 95 | local_q_func=(arglist.adv_policy == 'ddpg'))) 96 | 97 | # 盟友agent个数 env.n 每一个agent都有一个actor,critic,replay_buffer!!! 
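# (i.e. one MADDPGAgentTrainer per cooperative agent, each with its own actor,
#  critic and prioritized replay buffer; num_adversaries defaults to 0 here,
#  so every UAV is a "good" agent.)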
98 | for i in range(num_adversaries, env.n): 99 | trainers.append(trainer( 100 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 101 | local_q_func=(arglist.good_policy == 'ddpg'))) 102 | 103 | return trainers 104 | 105 | 106 | def test(arglist, log,full_load_dir,test_iteration): 107 | 108 | with U.multi_threaded_session() as sess: 109 | # Create environment for testing 110 | env=Env(log) 111 | log.log(ARGUMENTS) 112 | log.log(ACTIONS) 113 | 114 | # Create agent trainers 115 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 116 | num_adversaries = min(env.n, arglist.num_adversaries) # 0 117 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) # 定义所有数据结构和静态图 118 | 119 | # Initialize all the uninitialized variables in the global scope 120 | U.initialize() 121 | 122 | # debug 123 | from tensorflow.python import pywrap_tensorflow 124 | # Read data from checkpoint file 125 | reader = pywrap_tensorflow.NewCheckpointReader(full_load_dir) 126 | var_to_shape_map = reader.get_variable_to_shape_map() 127 | # Print tensor name and value 128 | f = open('trained_params.txt', 'w') 129 | for key in var_to_shape_map: # write tensors' names and values in file 130 | print(key, file=f) 131 | print(reader.get_tensor(key), file=f) 132 | f.close() 133 | 134 | f = open('test_params.txt', 'w') 135 | for variable_name in tf.global_variables(): 136 | print(variable_name,file=f) 137 | f.close() 138 | 139 | # TODO:加载已经训练好的模型 140 | saver = tf.train.Saver() 141 | saver.restore(sess, full_load_dir) 142 | 143 | episode_rewards = [0.0] # sum of rewards for all agents 144 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 145 | episode_step = 0 146 | 147 | start_env=env.reset() 148 | state_step = [] 149 | for _ in range(0, arglist.rnn_length - 1): 150 | state_step.append(start_env) 151 | obs_n=start_env 152 | 153 | 154 | print('Starting a new TEST iterations...') 155 | print('Log_dir:', env.log_dir) 156 | iteration = 0 157 | 158 | efficiency=[] 159 | fairness=[] 160 | normal_fairness=[] 161 | collection_ratio=[] 162 | energy_consumption=[] 163 | collision = [] 164 | steps = [] 165 | collect_frequency=[] 166 | charge_frequency=[] 167 | station_remain=[] 168 | 169 | indicator = [0] * env.n # TODO:状态指示器 170 | meaningful_fill = [0] * env.n 171 | meaningful_get = [0] * env.n 172 | 173 | # testing 174 | while iteration < arglist.test_time: 175 | if arglist.rnn_length > 0: 176 | action_n = [] 177 | state_step.append(obs_n) 178 | for i, agent, obs in zip(range(0, len(trainers)), trainers, obs_n): 179 | obs_sequence = [] 180 | for j in range(-1 * arglist.rnn_length, 0, 1): 181 | obs_sequence.append(state_step[j][i]) 182 | 183 | action_n.append(agent.action(np.array(obs_sequence))) 184 | else: 185 | action_n = [agent.action(obs[None]) for agent, obs in zip(trainers, obs_n)] 186 | 187 | new_obs_n, rew_n, done_n, info_n, indicator = env.step(actions=action_n, indicator=indicator) 188 | log.step_information(action_n, env, episode_step, iteration, meaningful_fill, meaningful_get, 189 | indicator) 190 | 191 | indicator = [0] * env.n 192 | episode_step += 1 # step per episode 193 | done = done_n 194 | terminal = (episode_step >= arglist.max_episode_len) 195 | obs_n = new_obs_n 196 | 197 | for i, rew in enumerate(rew_n): 198 | episode_rewards[-1] += rew # 每一个step的总reward 199 | agent_rewards[i][-1] += rew # 每一个step,每个agent自己的reward 200 | 201 | if done or terminal: 202 | efficiency.append(env.efficiency) 203 | fairness.append(env.collection_fairness) 204 | 
normal_fairness.append(env.normal_collection_fairness) 205 | collection_ratio.append(1.0-env.leftrewards) 206 | energy_consumption.append(np.sum(env.normal_use_energy)) 207 | collision.append(np.sum(env.walls)) 208 | steps.append(env.count) 209 | collect_frequency.append(np.sum(meaningful_get)) 210 | charge_frequency.append(np.sum(meaningful_fill)) 211 | station_remain.append(250-sum(env.fills_energy_remain)) # @TODO:这里写的比较傻逼 212 | 213 | log.draw_path(env, iteration, meaningful_fill, meaningful_get) 214 | 215 | iteration += 1 216 | meaningful_fill = [0] * env.n 217 | meaningful_get = [0] * env.n 218 | obs_n = env.reset() 219 | episode_step = 0 220 | episode_rewards.append(0) 221 | for a in agent_rewards: 222 | a.append(0) 223 | 224 | # for displaying learned policies 225 | if arglist.display: 226 | env.render() 227 | continue 228 | 229 | details = [ 230 | '\n\nindicator DETAILS:', 231 | '\n\tefficiency: ' + str(efficiency), 232 | '\n\tfairness: ' + str(fairness), 233 | '\n\tnormal_fairness: ' + str(normal_fairness), 234 | '\n\tcollection_ratio: ' + str(collection_ratio), 235 | '\n\tenergy_consumption: ' + str(energy_consumption), 236 | '\n\tcollision: ' + str(collision), 237 | '\n\tsteps: ' + str(steps), 238 | ] 239 | 240 | indicator = [ 241 | '\n\ntest_model: '+str(test_iteration)+' --indicator AVERAGE:', 242 | '\n\tefficiency: ' + str(np.mean(efficiency)), 243 | '\n\tfairness: ' + str(np.mean(fairness)), 244 | '\n\tnormal_fairness: ' + str(np.mean(normal_fairness)), 245 | '\n\tcollection_ratio: ' + str(np.mean(collection_ratio)), 246 | '\n\tenergy_consumption: ' + str(np.mean(energy_consumption)), 247 | '\n\tcollision: ' + str(np.mean(collision)), 248 | '\n\tsteps: ' + str(np.mean(steps)), 249 | ] 250 | 251 | for _ in indicator: 252 | print(_) 253 | 254 | indicator_to_pandas = [ 255 | str(test_iteration), 256 | 257 | str(np.mean(collection_ratio)), 258 | str(np.min(collection_ratio)), 259 | str(np.max(collection_ratio)), 260 | 261 | str(np.mean(normal_fairness)), 262 | str(np.min(normal_fairness)), 263 | str(np.max(normal_fairness)), 264 | 265 | str(np.mean(energy_consumption)), 266 | str(np.min(energy_consumption)), 267 | str(np.max(energy_consumption)), 268 | 269 | str(np.mean(efficiency)), 270 | str(np.min(efficiency)), 271 | str(np.max(efficiency)), 272 | 273 | str(np.mean(collect_frequency)), 274 | str(np.mean(charge_frequency)), 275 | str(np.mean(station_remain)) 276 | ] 277 | 278 | 279 | log.log(details) 280 | log.log(indicator) 281 | 282 | tf.reset_default_graph() 283 | return indicator_to_pandas 284 | 285 | if __name__ == '__main__': 286 | print('Loading the trained model...Now, Enjoy yourself!') 287 | arglist = parse_args() 288 | df=pd.DataFrame(columns=["test_model", 289 | "collection_ratio","cr_min","cr_max", 290 | "fairness","f_min","f_max", 291 | "consumption of energy","ce_min","ce_max", 292 | "efficiency","e_min","e_max","collect","charge","station remain"]) 293 | for i in range(arglist.start,arglist.end): 294 | full_load_dir=arglist.load_dir+"/policy/"+str(i)+".ckpt" 295 | log = Log.Log() 296 | indicator_to_pandas=test(arglist, log,full_load_dir,i) 297 | df.loc[i-70]=indicator_to_pandas 298 | 299 | df.sort_values("efficiency",inplace=True) 300 | df.to_csv(arglist.load_dir+"/choose_best_model.csv",index=0) 301 | print('\n', 'TEST finished') 302 | -------------------------------------------------------------------------------- /experiments/test_random.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy 
as np 3 | import os 4 | import tensorflow as tf 5 | import pandas as pd 6 | 7 | import maddpg.common.tf_util as U 8 | from experiments.env0 import log0 as Log 9 | from experiments.env0.data_collection0 import Env 10 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 11 | 12 | 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 14 | 15 | # Hyperparameters 16 | ARGUMENTS = [ 17 | # Environment 18 | ["--scenario", str, "simple_adversary", "name of the scenario script"], 19 | ["--max-episode-len", int, 500, "maximum episode length"], 20 | ["--num-episodes", int, 5000, "number of episodes"], 21 | ["--num-adversaries", int, 0, "number of adversaries(enemy)"], 22 | ["--good-policy", str, "maddpg", "policy for good agents"], 23 | ["--adv-policy", str, "maddpg", "policy of adversaries"], 24 | 25 | # Core training parameters 26 | ["--lr", float, 5e-4, "learning rate for Adam optimizer"], 27 | ["--decay_rate", float, 0.99995, "learning rate exponential decay"], # 作为初始学习率,后面尝试进行衰减,这个不着急加! 28 | ["--gamma", float, 0.95, "discount factor"], 29 | ["--batch-size", int, 512, "number of epochs to optimize at the same time"], 30 | ["--num-units", int, 600, "number of units in the mlp"], 31 | 32 | # Priority Replay Buffer ( weights not used ) 33 | ["--alpha", float, 0.5, "priority parameter"], 34 | ["--beta", float, 0.4, "IS parameter"], 35 | ["--epsilon", float, 0.5, "a small positive constant"], 36 | ["--buffer_size", int, 200000, "buffer size for each agent"], 37 | 38 | # N-steps 39 | ["--N", int, 5, "steps of N-step"], 40 | 41 | # TODO: Experiments 42 | # Ape-X 43 | ["--num_actor_workers", int,0, 44 | "number of environments one agent can deal with. if >1, use apex ; else, use simple maddpg"], 45 | ["--debug_dir", str, "/debug_list/", 46 | "save index,reward(n-step),priority, value,wi per every sample from experience"], 47 | 48 | # RNN 49 | ["--rnn_length", int,0, 50 | "time_step in rnn, try to use LSTM instead of N-steps. 
if ==0, not use rnn; else, use rnn."], 51 | ["--rnn_cell_size", int, 64, "LSTM-cell output's size"], 52 | 53 | # Checkpointing 保存model 54 | ["--exp-name", str, None, "name of the experiment"], 55 | ["--save-dir", str, "/policy/", "directory in which training state and model sho uld be saved"], 56 | ["--save-rate", int, 10, "save model once every time this many episodes are completed"], 57 | ["--model_to_keep", int, 100, "the number of saved models"], 58 | ["--load-dir", str, "/media/sda1/MCS_experiments/test_裸奔/uav5", 59 | "directory in which training state and model are loaded"], 60 | 61 | # Test 62 | ['--test_time', int, 10, "number of iterations run for testing"], 63 | ["--random_seed", int, 100, "random seed"], 64 | ["--start", int,0,"start model"], 65 | ["--end", int,5, "end model"] 66 | ] 67 | 68 | ACTIONS = [ 69 | ["--restore", "store_true", False], 70 | ["--display", "store_true", False], 71 | ["--benchmark", "store_true", False] 72 | 73 | ] 74 | 75 | 76 | # 参数调节器 77 | def parse_args(): 78 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 79 | for arg in ARGUMENTS: 80 | parser.add_argument(arg[0], type=arg[1], default=arg[2], help=arg[3]) 81 | for action in ACTIONS: 82 | parser.add_argument(action[0], action=action[1], default=action[2]) 83 | return parser.parse_args() 84 | 85 | 86 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 87 | # 加入多个trainers 88 | trainers = [] 89 | trainer = MADDPGAgentTrainer 90 | 91 | # 对手agent个数 0 92 | for i in range(num_adversaries): 93 | trainers.append(trainer( 94 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 95 | local_q_func=(arglist.adv_policy == 'ddpg'))) 96 | 97 | # 盟友agent个数 env.n 每一个agent都有一个actor,critic,replay_buffer!!! 98 | for i in range(num_adversaries, env.n): 99 | trainers.append(trainer( 100 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 101 | local_q_func=(arglist.good_policy == 'ddpg'))) 102 | 103 | return trainers 104 | 105 | 106 | def test(arglist, log,full_load_dir,test_iteration): 107 | 108 | with U.multi_threaded_session() as sess: 109 | # Create environment for testing 110 | env=Env(log) 111 | log.log(ARGUMENTS) 112 | log.log(ACTIONS) 113 | 114 | # Create agent trainers 115 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 116 | num_adversaries = min(env.n, arglist.num_adversaries) # 0 117 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) # 定义所有数据结构和静态图 118 | 119 | # Initialize all the uninitialized variables in the global scope 120 | U.initialize() 121 | 122 | # TODO:加载已经训练好的模型 123 | saver = tf.train.Saver() 124 | saver.restore(sess,full_load_dir) 125 | 126 | episode_rewards = [0.0] # sum of rewards for all agents 127 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 128 | episode_step = 0 129 | 130 | start_env=env.reset() 131 | state_step = [] 132 | for _ in range(0, arglist.rnn_length - 1): 133 | state_step.append(start_env) 134 | obs_n=start_env 135 | 136 | 137 | print('Starting a new TEST iterations...') 138 | print('Log_dir:', env.log_dir) 139 | iteration = 0 140 | 141 | efficiency=[] 142 | fairness=[] 143 | normal_fairness=[] 144 | collection_ratio=[] 145 | energy_consumption=[] 146 | collision = [] 147 | steps = [] 148 | 149 | indicator = [0] * env.n # TODO:状态指示器 150 | meaningful_fill = [0] * env.n 151 | meaningful_get = [0] * env.n 152 | 153 | # testing 154 | while iteration < arglist.test_time: 155 | if arglist.rnn_length > 0: 156 | action_n = [] 157 | 
state_step.append(obs_n) 158 | for i, agent, obs in zip(range(0, len(trainers)), trainers, obs_n): 159 | obs_sequence = [] 160 | for j in range(-1 * arglist.rnn_length, 0, 1): 161 | obs_sequence.append(state_step[j][i]) 162 | 163 | action_n.append(agent.action(np.array(obs_sequence))) 164 | else: 165 | action_n = [agent.action(obs[None]) for agent, obs in zip(trainers, obs_n)] 166 | 167 | action_n=np.array(action_n) 168 | random_action_n=np.random.uniform(low=-1,high=1,size=action_n.shape) 169 | new_obs_n, rew_n, done_n, info_n, indicator = env.step(actions=random_action_n, indicator=indicator) 170 | log.step_information(random_action_n, env, episode_step, iteration, meaningful_fill, meaningful_get, 171 | indicator) 172 | 173 | indicator = [0] * env.n 174 | episode_step += 1 # step per episode 175 | done = done_n 176 | terminal = (episode_step >= arglist.max_episode_len) 177 | obs_n = new_obs_n 178 | 179 | for i, rew in enumerate(rew_n): 180 | episode_rewards[-1] += rew # 每一个step的总reward 181 | agent_rewards[i][-1] += rew # 每一个step,每个agent自己的reward 182 | 183 | if done or terminal: 184 | efficiency.append(env.efficiency) 185 | fairness.append(env.collection_fairness) 186 | normal_fairness.append(env.normal_collection_fairness) 187 | collection_ratio.append(1.0-env.leftrewards) 188 | energy_consumption.append(np.sum(env.normal_use_energy)) 189 | collision.append(np.sum(env.walls)) 190 | steps.append(env.count) 191 | 192 | log.draw_path(env, iteration, meaningful_fill, meaningful_get) 193 | 194 | iteration += 1 195 | meaningful_fill = [0] * env.n 196 | meaningful_get = [0] * env.n 197 | obs_n = env.reset() 198 | episode_step = 0 199 | episode_rewards.append(0) 200 | for a in agent_rewards: 201 | a.append(0) 202 | 203 | # for displaying learned policies 204 | if arglist.display: 205 | env.render() 206 | continue 207 | 208 | details = [ 209 | '\n\nindicator DETAILS:', 210 | '\n\tefficiency: ' + str(efficiency), 211 | '\n\tfairness: ' + str(fairness), 212 | '\n\tnormal_fairness: ' + str(normal_fairness), 213 | '\n\tcollection_ratio: ' + str(collection_ratio), 214 | '\n\tenergy_consumption: ' + str(energy_consumption), 215 | '\n\tcollision: ' + str(collision), 216 | '\n\tsteps: ' + str(steps), 217 | ] 218 | 219 | indicator = [ 220 | '\n\ntest_model: '+str(test_iteration)+' --indicator AVERAGE:', 221 | '\n\tefficiency: ' + str(np.mean(efficiency)), 222 | '\n\tfairness: ' + str(np.mean(fairness)), 223 | '\n\tnormal_fairness: ' + str(np.mean(normal_fairness)), 224 | '\n\tcollection_ratio: ' + str(np.mean(collection_ratio)), 225 | '\n\tenergy_consumption: ' + str(np.mean(energy_consumption)), 226 | '\n\tcollision: ' + str(np.mean(collision)), 227 | '\n\tsteps: ' + str(np.mean(steps)), 228 | ] 229 | 230 | for _ in indicator: 231 | print(_) 232 | 233 | indicator_to_pandas = [ 234 | str(test_iteration), 235 | 236 | str(np.mean(collection_ratio)), 237 | str(np.min(collection_ratio)), 238 | str(np.max(collection_ratio)), 239 | 240 | str(np.mean(normal_fairness)), 241 | str(np.min(normal_fairness)), 242 | str(np.max(normal_fairness)), 243 | 244 | str(np.mean(energy_consumption)), 245 | str(np.min(energy_consumption)), 246 | str(np.max(energy_consumption)), 247 | 248 | str(np.mean(efficiency)), 249 | str(np.min(efficiency)), 250 | str(np.max(efficiency)), 251 | ] 252 | 253 | 254 | log.log(details) 255 | log.log(indicator) 256 | 257 | tf.reset_default_graph() 258 | return indicator_to_pandas 259 | 260 | if __name__ == '__main__': 261 | print('Loading the trained model...Now, Enjoy yourself!') 262 | arglist = 
parse_args() 263 | df=pd.DataFrame(columns=["test_model", 264 | "collection_ratio","cr_min","cr_max", 265 | "fairness","f_min","f_max", 266 | "consumption of energy","ce_min","ce_max", 267 | "efficiency","e_min","e_max"]) 268 | for i in range(arglist.start,arglist.end): 269 | full_load_dir=arglist.load_dir+"/policy/"+str(i)+".ckpt" 270 | log = Log.Log() 271 | indicator_to_pandas=test(arglist, log,full_load_dir,i) 272 | df.loc[i-70]=indicator_to_pandas 273 | 274 | df.sort_values("efficiency",inplace=True) 275 | df.to_csv(arglist.load_dir+"/瞎跑uav5.csv",index=0) 276 | print('\n', 'TEST finished') 277 | -------------------------------------------------------------------------------- /experiments/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | import time 6 | 7 | import maddpg.common.tf_util as U 8 | from experiments.env0 import log0 as Log 9 | from experiments.env0.data_collection0 import Env 10 | from maddpg.common.summary import Summary 11 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 12 | 13 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 14 | 15 | # Hyperparameters 16 | ARGUMENTS = [ 17 | # Environment 18 | ["--scenario", str, "simple_adversary", "name of the scenario script"], 19 | ["--max-episode-len", int, 500, "maximum episode length"], 20 | ["--num-episodes", int, 500, "number of episodes"], 21 | ["--num-adversaries", int, 0, "number of adversaries(enemy)"], 22 | ["--good-policy", str, "maddpg", "policy for good agents"], 23 | ["--adv-policy", str, "maddpg", "policy of adversaries"], 24 | 25 | # Core training parameters 26 | ["--lr", float, 5e-4, "learning rate for Adam optimizer"], 27 | ["--decay_rate", float, 0.99995, "learning rate exponential decay"], # 作为初始学习率,后面尝试进行衰减,这个不着急加! 28 | ["--gamma", float, 0.95, "discount factor"], 29 | ["--batch-size", int, 32, "number of epochs to optimize at the same time"], # 512 30 | ["--num-units", int, 600, "number of units in the mlp"], 31 | 32 | # Priority Replay Buffer ( weights not used ) 33 | ["--alpha", float, 0.5, "priority parameter"], 34 | ["--beta", float, 0.4, "IS parameter"], 35 | ["--epsilon", float, 0.5, "a small positive constant"], 36 | ["--buffer_size", int, 200000, "buffer size for each agent"] , 37 | 38 | # N-steps 39 | ["--N", int, 5, "steps of N-step"], 40 | 41 | # TODO: Experiments 42 | # Ape-X 43 | ["--num_actor_workers", int, 0, 44 | "number of environments one agent can deal with. if >1, use apex ; else, use simple maddpg"], 45 | ["--debug_dir", str, "/debug_list/", 46 | "save index,reward(n-step),priority, value,wi per every sample from experience"], 47 | 48 | # RNN 49 | ["--rnn_length", int, 0, 50 | "time_step in rnn, try to use LSTM instead of N-steps. 
if ==0, not use rnn; else, use rnn."], 51 | ["--rnn_cell_size", int, 64, "LSTM-cell output's size"], 52 | 53 | # Checkpointing 保存model 54 | ["--exp-name", str, None, "name of the experiment"], 55 | ["--save-dir", str, "/policy/", "directory in which training state and model should be saved"], 56 | ["--save-rate", int, 2, "save model once every time this many episodes are completed"], 57 | ["--model_to_keep", int, 100, "the number of saved models"], 58 | ["--load-dir", str, "/home/linc/Desktop/maddpg-final/saved_state.ckpt", 59 | "directory in which training state and model are loaed"], 60 | 61 | # Evaluation 62 | ["--benchmark-iters", int, 100000, "number of iterations run for benchmarking"], 63 | ["--benchmark-dir", str, "./benchm", "directory where benchmark data is saved"], 64 | ["--plots-dir", str, "./learning_curves/", "directory where plot data is saved"], 65 | 66 | # Training 67 | ["--random_seed", int, 0, "random seed"] 68 | ] 69 | 70 | ACTIONS = [ 71 | ["--restore", "store_true", False], 72 | ["--display", "store_true", False], 73 | ["--benchmark", "store_true", False] 74 | 75 | ] 76 | 77 | 78 | # 参数调节器 79 | def parse_args(): 80 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 81 | for arg in ARGUMENTS: 82 | parser.add_argument(arg[0], type=arg[1], default=arg[2], help=arg[3]) 83 | for action in ACTIONS: 84 | parser.add_argument(action[0], action=action[1], default=action[2]) 85 | return parser.parse_args() 86 | 87 | 88 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 89 | # 加入多个trainers 90 | trainers = [] 91 | trainer = MADDPGAgentTrainer 92 | 93 | # 对手agent个数 0 94 | for i in range(num_adversaries): 95 | trainers.append(trainer( 96 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 97 | local_q_func=(arglist.adv_policy == 'ddpg'))) 98 | 99 | # 盟友agent个数 env.n 每一个agent都有一个actor,critic,replay_buffer!!! 
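# (Same construction as in test.py: one trainer per cooperative agent. In
#  train() below the trainers are built from envs[0] and shared by all Ape-X
#  actor environments, so every environment feeds the same per-agent replay
#  buffers.)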
100 | for i in range(num_adversaries, env.n): 101 | trainers.append(trainer( 102 | "agent_%d" % i, obs_shape_n, env.action_space, i, arglist, 103 | local_q_func=(arglist.good_policy == 'ddpg'))) 104 | 105 | return trainers 106 | 107 | 108 | def train(arglist, log): 109 | 110 | with U.multi_threaded_session() as sess: 111 | # Create environment(use Ape-X) 112 | envs = [Env(log) for _ in range(arglist.num_actor_workers)] 113 | log.log(ARGUMENTS) 114 | log.log(ACTIONS) 115 | 116 | # Create summary 117 | summary = Summary(sess, envs[0].log_dir) 118 | for i in range(envs[0].n): 119 | summary.add_variable(tf.Variable(0.), 'reward_%d' % i) 120 | summary.add_variable(tf.Variable(0.), 'loss_%d' % i) 121 | summary.add_variable(tf.Variable(0.), 'wall_%d' % i) 122 | summary.add_variable(tf.Variable(0.), 'energy_%d' % i) 123 | summary.add_variable(tf.Variable(0.), 'gained_info_%d' % i) 124 | summary.add_variable(tf.Variable(0.), 'buffer_size') 125 | summary.add_variable(tf.Variable(0.), 'acc_reward') 126 | summary.add_variable(tf.Variable(0.), 'leftrewards') 127 | summary.add_variable(tf.Variable(0.), 'efficiency') 128 | summary.build() 129 | 130 | # Create agent trainers 131 | obs_shape_n = [envs[0].observation_space[i].shape for i in range(envs[0].n)] 132 | 133 | # 计算对手个数 134 | num_adversaries = min(envs[0].n, arglist.num_adversaries) # 0 135 | 136 | # 定义所有数据结构和静态图 137 | trainers = get_trainers(envs[0], num_adversaries, obs_shape_n, arglist) 138 | 139 | # # 我方和敌方采用不同策略(适用于多智能体的双方竞争环境) 140 | # print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) 141 | 142 | # Initialize all the uninitialized variables in the global scope 143 | U.initialize() 144 | 145 | if arglist.restore: 146 | print('Loading previous state...') 147 | U.load_state(arglist.load_dir) 148 | 149 | # 保存模型的个数 100 150 | saver = tf.train.Saver(max_to_keep=arglist.model_to_keep) 151 | 152 | episode_rewards = [[0.0] for env in envs] # sum of rewards for all agents 153 | agent_rewards = [[[0.0] for _ in range(env.n)] for env in envs] # individual agent reward 154 | final_ep_rewards = [] # sum of rewards for training curve 155 | final_ep_ag_rewards = [] # agent rewards for training curve 156 | agent_info = [[[]]] # placeholder for benchmarking info 157 | obs_n = [] 158 | state_step_n = [] 159 | for env in envs: 160 | start_env = env.reset() 161 | state_step_i = [] 162 | for _ in range(0, arglist.rnn_length - 1): 163 | state_step_i.append(start_env) 164 | state_step_n.append(state_step_i) 165 | obs_n.append(start_env) 166 | episode_step = [0 for env in envs] 167 | t_start = [time.time() for env in envs] 168 | 169 | m_time = t_start.copy() 170 | print('Starting iterations...') 171 | print('Log_dir:', envs[0].log_dir) 172 | iteration = 0 173 | global_total_step = 0 # episode step 174 | loss = [0.] 
* envs[0].n 175 | model_index = 0 176 | efficiency = 0 177 | indicator = [0] * envs[0].n # TODO:状态指示器 178 | meaningful_fill = [0] * envs[0].n 179 | meaningful_get = [0] * envs[0].n 180 | 181 | # training 182 | while iteration < arglist.num_episodes: 183 | global_total_step += 1 # sum step id 184 | terminal_done_0=False 185 | # TODO:DEBUG 186 | # print("global-step: ",global_total_step) 187 | rew_n_master = [] 188 | for env_i, env in enumerate(envs): 189 | # get action 各取各的 190 | # TODO:LSTM try 191 | if arglist.rnn_length > 0: 192 | action_n = [] 193 | state_step_n[env_i].append(obs_n[env_i]) 194 | for i, agent, obs in zip(range(0, len(trainers)), trainers, obs_n[env_i]): 195 | obs_sequence = [] 196 | 197 | for j in range(-1 * arglist.rnn_length, 0, 1): 198 | obs_sequence.append(state_step_n[env_i][j][i]) 199 | 200 | action_n.append(agent.action(np.array(obs_sequence))) 201 | else: 202 | action_n = [agent.action(obs[None]) for agent, obs in zip(trainers, obs_n[env_i])] 203 | 204 | # environment step 205 | if env_i == 0: 206 | # TODO:加入状态指示器放在step里面进行每一步的更新 207 | new_obs_n, rew_n, done_n, info_n, indicator = env.step(actions=action_n, indicator=indicator) 208 | # TODO:step-debug 209 | log.step_information(action_n, env, episode_step[0], iteration, meaningful_fill, meaningful_get, 210 | indicator) 211 | rew_n_master = rew_n 212 | indicator = [0] * envs[0].n 213 | else: 214 | new_obs_n, rew_n, done_n, info_n, _ = env.step(actions=action_n) 215 | episode_step[env_i] += 1 # step per episode 216 | done = done_n 217 | terminal = (episode_step[env_i] >= arglist.max_episode_len) 218 | 219 | # collect experience 添加buffer是各加各的 220 | for i, agent in enumerate(trainers): 221 | agent.experience(obs_n[env_i][i], action_n[i], rew_n[i], new_obs_n[i], done_n, terminal, 222 | arglist.num_actor_workers) 223 | obs_n[env_i] = new_obs_n 224 | 225 | for i, rew in enumerate(rew_n): 226 | episode_rewards[env_i][-1] += rew # 每一个step的总reward 227 | agent_rewards[env_i][i][-1] += rew # 每一个step,每个agent自己的reward 228 | 229 | if done or terminal: 230 | # report 231 | if env_i == 0: 232 | terminal_done_0=True 233 | print('\n%d th episode:\n' % iteration) 234 | print('\tthe %d env,%d steps,%.2f seconds, wasted %.2f seconds.' 
% ( 235 | env_i, episode_step[env_i], time.time() - m_time[env_i], env.time_)) 236 | # print('rewards:', agent_rewards[0][-1], agent_rewards[1][-1]) 237 | print('\tobstacle collisions:', env.walls) 238 | print('\tdata collection:', env.collection / env.totaldata) 239 | print('\treminding energy:', env.energy) 240 | efficiency = env.efficiency 241 | # log.draw_path(env, iteration) 242 | log.draw_path(env, iteration, meaningful_fill, meaningful_get) 243 | iteration += 1 244 | 245 | meaningful_fill = [0] * envs[0].n 246 | meaningful_get = [0] * envs[0].n 247 | m_time[env_i] = time.time() 248 | obs_n[env_i] = env.reset() 249 | episode_step[env_i] = 0 250 | episode_rewards[env_i].append(0) 251 | for a in agent_rewards[env_i]: 252 | a.append(0) 253 | agent_info.append([[]]) 254 | 255 | # for displaying learned policies 256 | if arglist.display: 257 | envs[0].render() 258 | continue 259 | 260 | # update all trainers, if not in display or benchmark mode 261 | _loss = [] 262 | 263 | # update 每一个agent自己更新自己的PQ参数 264 | for agent in trainers: # 将buffer采样初始化 265 | agent.preupdate() 266 | for agent in trainers: 267 | _loss.append(agent.update(envs[0], trainers, global_total_step)[0]) 268 | if np.sum(_loss) != 0: # 在buffer没有填满的时候不加loss 269 | loss = _loss 270 | 271 | # summary vistalize for all UAVs 272 | feed_dict = {} 273 | for i_summary in range(envs[0].n): 274 | feed_dict['reward_%d' % i_summary] = rew_n_master[i_summary] 275 | feed_dict['loss_%d' % i_summary] = loss[i_summary] 276 | feed_dict['wall_%d' % i_summary] = envs[0].walls[i_summary] / (float(episode_step[0]) + 1e-4) 277 | feed_dict['energy_%d' % i_summary] = envs[0].energy[i_summary] 278 | feed_dict['gained_info_%d' % i_summary] = envs[0].collection[i_summary] 279 | feed_dict['buffer_size'] = trainers[0].filled_size 280 | feed_dict['leftrewards'] = envs[0].leftrewards 281 | feed_dict['acc_reward'] = episode_rewards[0][-1] 282 | feed_dict['efficiency'] = efficiency 283 | summary.run(feed_dict=feed_dict, step=global_total_step) 284 | 285 | # save model, display training output 286 | if terminal_done_0 is True and (len(episode_rewards[0]) + 1) % arglist.save_rate == 0: 287 | U.save_state( 288 | envs[0].log_dir + arglist.save_dir + "/" + str(model_index % arglist.model_to_keep) + ".ckpt", 289 | saver=saver) 290 | model_index += 1 291 | # print statement depends on whether or not there are adversaries 292 | if num_adversaries == 0: 293 | print("------------------------------------------------------------------------------------------") 294 | print("Master: steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 295 | global_total_step, len(episode_rewards[0]) - 1, 296 | np.mean(episode_rewards[0][-arglist.save_rate:]), 297 | round(time.time() - t_start[0], 3))) 298 | print("------------------------------------------------------------------------------------------") 299 | else: 300 | print("------------------------------------------------------------------------------------------") 301 | print( 302 | "Master: steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 303 | global_total_step, len(episode_rewards[0]) - 1, 304 | np.mean(episode_rewards[0][-arglist.save_rate:]), 305 | [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], 306 | round(time.time() - t_start[0], 3))) 307 | print("------------------------------------------------------------------------------------------") 308 | t_start = [time.time() for env in envs] 309 | # Keep track of final episode reward 310 | 
final_ep_rewards.append(np.mean(episode_rewards[0][-arglist.save_rate:])) 311 | for rew in agent_rewards: 312 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) 313 | 314 | 315 | if __name__ == '__main__': 316 | print('Let\'s train, go! go! go!') 317 | arglist = parse_args() 318 | log = Log.Log() 319 | train(arglist, log) 320 | -------------------------------------------------------------------------------- /experiments/visualization.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # training 6 | eff=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-efficiency.csv") 7 | plt.figure(figsize=(17,17)) 8 | plt.plot(eff['Step'],eff['Value'],color='black',linewidth=2) 9 | plt.xlim(xmax=130000,xmin=10240) 10 | plt.ylim() 11 | plt.xticks(fontsize=32) 12 | plt.yticks(fontsize=32) 13 | plt.xlabel("Training epoch (1e5)",fontsize=32) 14 | plt.ylabel("Energy efficiency",fontsize=32) 15 | plt.grid(True) 16 | plt.grid(linestyle='--') 17 | ax=plt.gca() 18 | ax.xaxis.get_major_formatter().set_powerlimits((0,1)) 19 | plt.show() 20 | 21 | rew=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-acc_reward.csv") 22 | plt.figure(figsize=(17,17)) 23 | plt.xticks(fontsize=32) 24 | plt.yticks(fontsize=32) 25 | plt.plot(rew['Step'],rew['Value'],color='black',linewidth=2) 26 | plt.xlim(xmax=130000,xmin=10240) 27 | plt.xlabel("Training epoch (1e5)",fontsize=32) 28 | plt.ylabel("Accumulated reward",fontsize=32) 29 | plt.grid(True) 30 | plt.grid(linestyle='--') 31 | ax=plt.gca() 32 | ax.xaxis.get_major_formatter().set_powerlimits((0,1)) 33 | plt.show() 34 | 35 | loss0=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-loss_0.csv") 36 | loss1=pd.read_csv("/home/linc/桌面/标杆/run_.-tag-loss_1.csv") 37 | plt.figure(figsize=(17,17)) 38 | plt.xticks(fontsize=32) 39 | plt.yticks(fontsize=32) 40 | plt.plot(loss0['Step'],loss0['Value'],label='Vehicle 1',color='black',linewidth=2) 41 | plt.plot(loss1['Step'],loss1['Value'],label='Vehicle 2',color='blue',linewidth=2) 42 | plt.xlim(xmax=130000,xmin=10240) 43 | plt.ylim(ymax=30,ymin=5) 44 | plt.xlabel("Training epoch (1e5)",fontsize=32) 45 | plt.ylabel("Loss",fontsize=32) 46 | plt.grid(True) 47 | plt.grid(linestyle='--') 48 | plt.legend(fontsize=32) 49 | ax=plt.gca() 50 | ax.xaxis.get_major_formatter().set_powerlimits((0,1)) 51 | plt.show() 52 | 53 | 54 | # consumption 55 | 56 | plt.figure(figsize=(17,17)) 57 | num = [1, 2, 3, 4, 5] 58 | sum_comsumption = [3.576, 4, 4.004, 4.402, 4.668] 59 | average_comsumption=np.true_divide(np.array(sum_comsumption),np.array(num)) 60 | plt.xticks(num,num[::1]) 61 | plt.xticks(fontsize=32) 62 | plt.yticks(fontsize=32) 63 | plt.plot(num,sum_comsumption,label='Total energy consumption',marker='o',markersize=26,markeredgewidth=5,markerfacecolor='none', 64 | color='blue',linewidth=4) 65 | plt.plot(num,average_comsumption,label='Average energy consumption per vehicle',marker='s',markersize=26,markeredgewidth=5, 66 | markerfacecolor='none',color='black',linewidth=4) 67 | 68 | plt.xlabel("No. 
of vehicles",fontsize=32) 69 | plt.ylabel("Energy usage (# of full batteries)",fontsize=32) 70 | plt.axhline(y=1,color='red',linestyle='--',label="Initial energy reserve",linewidth=4) 71 | plt.grid(True) 72 | plt.grid(linestyle='--') 73 | plt.legend(fontsize=26) 74 | plt.show() 75 | # 76 | # # charge amount 77 | # num = [1, 2, 3, 4, 5] 78 | # sum_charge_amount = [166.317, 204.866, 201.16, 196.783, 192.793] 79 | # plt.xticks(num,num[::1]) 80 | # plt.plot(num,sum_charge_amount) 81 | # plt.xlabel("Num of vehicles") 82 | # plt.ylabel("Total charge amount") 83 | # plt.axhline(y=250,color='green',linestyle='--') 84 | # plt.grid(True) 85 | # plt.grid(linestyle='--') 86 | # plt.legend() 87 | # plt.show() 88 | # 89 | # # charge frequency 90 | # num = [1, 2, 3, 4, 5] 91 | # sum_charge_frequency = [49, 232, 227, 512, 514] 92 | # plt.xticks(num,num[::1]) 93 | # plt.plot(num,sum_charge_frequency) 94 | # plt.xlabel("Num of vehicles") 95 | # plt.ylabel("Total charge frequency") 96 | # plt.grid(True) 97 | # plt.grid(linestyle='--') 98 | # plt.legend() 99 | # plt.show() 100 | 101 | 102 | # yy 103 | num = [1, 2, 3, 4, 5] 104 | plt.figure(figsize=(17,17)) 105 | sum_charge_amount = [166.317/50, 204.866/50, 201.16/50, 196.783/50, 192.793/50] 106 | sum_charge_frequency = [49, 232, 227, 512, 514] 107 | plt.xlabel("No. of vehicles",fontsize=32) 108 | plt.plot(num,sum_charge_amount,label='Total # of charged full battery',color='red',marker='o', 109 | markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 110 | plt.ylim(ymax=4.4,ymin=3) 111 | plt.xticks(fontsize=32) 112 | plt.yticks(fontsize=32) 113 | plt.ylabel("Total # of charged full battery",fontsize=32) 114 | plt.legend(loc='upper left',fontsize=26) 115 | plt.grid(True) 116 | plt.grid(linestyle='--') 117 | 118 | plt.twinx() 119 | plt.xticks(fontsize=32) 120 | plt.yticks(fontsize=32) 121 | plt.plot(num,sum_charge_frequency,label='Total charging frequency',color='blue',marker='s', 122 | markersize=26,markeredgewidth=5,markerfacecolor='none',linewidth=4) 123 | plt.ylim(ymax=700,ymin=0) 124 | plt.ylabel("Total charging frequency",fontsize=32) 125 | plt.legend(loc='lower right',fontsize=26) 126 | 127 | plt.xticks(num,num[::1]) 128 | 129 | plt.grid(True) 130 | plt.grid(linestyle='--') 131 | plt.show() 132 | 133 | -------------------------------------------------------------------------------- /maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() -------------------------------------------------------------------------------- /maddpg/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/maddpg/common/__init__.py -------------------------------------------------------------------------------- /maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from tensorflow.python.ops 
import nn 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def logp(self, x): 16 | raise NotImplementedError 17 | def kl(self, other): 18 | raise NotImplementedError 19 | def entropy(self): 20 | raise NotImplementedError 21 | def sample(self): 22 | raise NotImplementedError 23 | 24 | class PdType(object): 25 | """ 26 | Parametrized family of probability distributions 27 | """ 28 | def pdclass(self): 29 | raise NotImplementedError 30 | def pdfromflat(self, flat): 31 | return self.pdclass()(flat) 32 | def param_shape(self): 33 | raise NotImplementedError 34 | def sample_shape(self): 35 | raise NotImplementedError 36 | def sample_dtype(self): 37 | raise NotImplementedError 38 | 39 | def param_placeholder(self, prepend_shape, name=None): 40 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 41 | def sample_placeholder(self, prepend_shape, name=None): 42 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 43 | 44 | class CategoricalPdType(PdType): 45 | def __init__(self, ncat): 46 | self.ncat = ncat 47 | def pdclass(self): 48 | return CategoricalPd 49 | def param_shape(self): 50 | return [self.ncat] 51 | def sample_shape(self): 52 | return [] 53 | def sample_dtype(self): 54 | return tf.int32 55 | 56 | class SoftCategoricalPdType(PdType): 57 | def __init__(self, ncat): 58 | self.ncat = ncat 59 | def pdclass(self): 60 | return SoftCategoricalPd 61 | def param_shape(self): 62 | return [self.ncat] 63 | def sample_shape(self): 64 | return [self.ncat] 65 | def sample_dtype(self): 66 | return tf.float32 67 | 68 | class MultiCategoricalPdType(PdType): 69 | def __init__(self, low, high): 70 | self.low = low 71 | self.high = high 72 | self.ncats = high - low + 1 73 | def pdclass(self): 74 | return MultiCategoricalPd 75 | def pdfromflat(self, flat): 76 | return MultiCategoricalPd(self.low, self.high, flat) 77 | def param_shape(self): 78 | return [sum(self.ncats)] 79 | def sample_shape(self): 80 | return [len(self.ncats)] 81 | def sample_dtype(self): 82 | return tf.int32 83 | 84 | class SoftMultiCategoricalPdType(PdType): 85 | def __init__(self, low, high): 86 | self.low = low 87 | self.high = high 88 | self.ncats = high - low + 1 89 | def pdclass(self): 90 | return SoftMultiCategoricalPd 91 | def pdfromflat(self, flat): 92 | return SoftMultiCategoricalPd(self.low, self.high, flat) 93 | def param_shape(self): 94 | return [sum(self.ncats)] 95 | def sample_shape(self): 96 | return [sum(self.ncats)] 97 | def sample_dtype(self): 98 | return tf.float32 99 | 100 | class DiagGaussianPdType(PdType): 101 | def __init__(self, size): 102 | self.size = size 103 | def pdclass(self): 104 | return DiagGaussianPd 105 | def param_shape(self): 106 | return [2*self.size] 107 | def sample_shape(self): 108 | return [self.size] 109 | def sample_dtype(self): 110 | return tf.float32 111 | 112 | class BernoulliPdType(PdType): 113 | def __init__(self, size): 114 | self.size = size 115 | def pdclass(self): 116 | return BernoulliPd 117 | def param_shape(self): 118 | return [self.size] 119 | def sample_shape(self): 120 | return [self.size] 121 | def sample_dtype(self): 122 | return tf.int32 123 | 124 | # WRONG SECOND DERIVATIVES 125 | # class CategoricalPd(Pd): 126 | # def __init__(self, logits): 127 | # self.logits = logits 128 | # self.ps = tf.nn.softmax(logits) 129 | # @classmethod 130 | # def 
fromflat(cls, flat): 131 | # return cls(flat) 132 | # def flatparam(self): 133 | # return self.logits 134 | # def mode(self): 135 | # return U.argmax(self.logits, axis=1) 136 | # def logp(self, x): 137 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 138 | # def kl(self, other): 139 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 140 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 141 | # def entropy(self): 142 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 143 | # def sample(self): 144 | # u = tf.random_uniform(tf.shape(self.logits)) 145 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 146 | 147 | class CategoricalPd(Pd): 148 | def __init__(self, logits): 149 | self.logits = logits 150 | def flatparam(self): 151 | return self.logits 152 | def mode(self): 153 | return U.argmax(self.logits, axis=1) 154 | def logp(self, x): 155 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 156 | def kl(self, other): 157 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 158 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 159 | ea0 = tf.exp(a0) 160 | ea1 = tf.exp(a1) 161 | z0 = U.sum(ea0, axis=1, keepdims=True) 162 | z1 = U.sum(ea1, axis=1, keepdims=True) 163 | p0 = ea0 / z0 164 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 165 | def entropy(self): 166 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 167 | ea0 = tf.exp(a0) 168 | z0 = U.sum(ea0, axis=1, keepdims=True) 169 | p0 = ea0 / z0 170 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 171 | def sample(self): 172 | u = tf.random_uniform(tf.shape(self.logits)) 173 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 174 | @classmethod 175 | def fromflat(cls, flat): 176 | return cls(flat) 177 | 178 | class SoftCategoricalPd(Pd): 179 | def __init__(self, logits): 180 | self.logits = logits 181 | def flatparam(self): 182 | return self.logits 183 | def mode(self): 184 | return U.softmax(self.logits, axis=-1) 185 | def logp(self, x): 186 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 187 | def kl(self, other): 188 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 189 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 190 | ea0 = tf.exp(a0) 191 | ea1 = tf.exp(a1) 192 | z0 = U.sum(ea0, axis=1, keepdims=True) 193 | z1 = U.sum(ea1, axis=1, keepdims=True) 194 | p0 = ea0 / z0 195 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 196 | def entropy(self): 197 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 198 | ea0 = tf.exp(a0) 199 | z0 = U.sum(ea0, axis=1, keepdims=True) 200 | p0 = ea0 / z0 201 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 202 | def sample(self): 203 | u = tf.random_uniform(tf.shape(self.logits)) 204 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 205 | @classmethod 206 | def fromflat(cls, flat): 207 | return cls(flat) 208 | 209 | class MultiCategoricalPd(Pd): 210 | def __init__(self, low, high, flat): 211 | self.flat = flat 212 | self.low = tf.constant(low, dtype=tf.int32) 213 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 214 | def flatparam(self): 215 | return self.flat 216 | def mode(self): 217 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 218 | def logp(self, x): 219 | return tf.add_n([p.logp(px) for p, px in 
zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 220 | def kl(self, other): 221 | return tf.add_n([ 222 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 223 | ]) 224 | def entropy(self): 225 | return tf.add_n([p.entropy() for p in self.categoricals]) 226 | def sample(self): 227 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 228 | @classmethod 229 | def fromflat(cls, flat): 230 | return cls(flat) 231 | 232 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 233 | def __init__(self, low, high, flat): 234 | self.flat = flat 235 | self.low = tf.constant(low, dtype=tf.float32) 236 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 237 | def flatparam(self): 238 | return self.flat 239 | def mode(self): 240 | x = [] 241 | for i in range(len(self.categoricals)): 242 | x.append(self.low[i] + self.categoricals[i].mode()) 243 | return tf.concat(x, axis=-1) 244 | def logp(self, x): 245 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 246 | def kl(self, other): 247 | return tf.add_n([ 248 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 249 | ]) 250 | def entropy(self): 251 | return tf.add_n([p.entropy() for p in self.categoricals]) 252 | def sample(self): 253 | x = [] 254 | for i in range(len(self.categoricals)): 255 | x.append(self.low[i] + self.categoricals[i].sample()) 256 | return tf.concat(x, axis=-1) 257 | @classmethod 258 | def fromflat(cls, flat): 259 | return cls(flat) 260 | 261 | class DiagGaussianPd(Pd): 262 | def __init__(self, flat): 263 | self.flat = flat 264 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 265 | self.mean = mean 266 | self.logstd = logstd 267 | self.std = tf.exp(logstd) #e^(log(std)) 268 | def flatparam(self): 269 | return self.flat 270 | def mode(self): 271 | return self.mean 272 | def logp(self, x): 273 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 274 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 275 | - U.sum(self.logstd, axis=1) 276 | def kl(self, other): 277 | assert isinstance(other, DiagGaussianPd) 278 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 279 | def entropy(self): 280 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 281 | def sample(self): 282 | # tf.random_normal: Outputs random values from a normal distribution. 
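        # Added note: this is a reparameterized draw, sample = mean + std * eps with eps ~ N(0, I),
        # so continuous (Box) actions are sampled around the actor's predicted mean with a learned log-std.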
283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) # , self.mean, self.std 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | # 建立概率分布 311 | def make_pdtype(ac_space): 312 | from gym import spaces 313 | if isinstance(ac_space, spaces.Box): 314 | assert len(ac_space.shape) == 1 315 | return DiagGaussianPdType(ac_space.shape[0]) 316 | elif isinstance(ac_space, spaces.Discrete): 317 | # return CategoricalPdType(ac_space.n) 318 | return SoftCategoricalPdType(ac_space.n) 319 | elif isinstance(ac_space, spaces.MultiDiscrete): 320 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 321 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 322 | elif isinstance(ac_space, spaces.MultiBinary): 323 | return BernoulliPdType(ac_space.n) 324 | else: 325 | raise NotImplementedError 326 | 327 | def shape_el(v, i): 328 | maybe = v.get_shape()[i] 329 | if maybe is not None: 330 | return maybe 331 | else: 332 | return tf.shape(v)[i] 333 | -------------------------------------------------------------------------------- /maddpg/common/summary.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | 4 | class Summary: 5 | 6 | def __init__(self, session, dir_summary): 7 | self.__sess = session 8 | self.__vars = {} 9 | self.__ops = None 10 | self.__dir = dir_summary 11 | self.__writer = tf.summary.FileWriter(dir_summary, session.graph) 12 | 13 | def add_variable(self, var, name="name"): 14 | tf.summary.scalar(name, var) 15 | assert name not in self.__vars, "Already has " + name 16 | self.__vars[name] = var 17 | 18 | def build(self): 19 | self.__ops = tf.summary.merge_all() 20 | 21 | def run(self, feed_dict, step): 22 | feed_dict_final = {} 23 | for key, val in feed_dict.items(): 24 | feed_dict_final[self.__vars[key]] = val 25 | str_summary = self.__sess.run(self.__ops, feed_dict_final) 26 | self.__writer.add_summary(str_summary, step) 27 | self.__writer.flush() 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | import multiprocessing as mp 6 | 7 | 8 | def sum(x, axis=None, keepdims=False): 9 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims=keepdims) 10 | 11 | 12 | def mean(x, axis=None, keepdims=False): 13 | return tf.reduce_mean(x, 
axis=None if axis is None else [axis], keep_dims=keepdims) 14 | 15 | 16 | def var(x, axis=None, keepdims=False): 17 | meanx = mean(x, axis=axis, keepdims=keepdims) 18 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 19 | 20 | 21 | def std(x, axis=None, keepdims=False): 22 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 23 | 24 | 25 | def max(x, axis=None, keepdims=False): 26 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims=keepdims) 27 | 28 | 29 | def min(x, axis=None, keepdims=False): 30 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims=keepdims) 31 | 32 | 33 | def concatenate(arrs, axis=0): 34 | return tf.concat(axis=axis, values=arrs) 35 | 36 | 37 | def argmax(x, axis=None): 38 | return tf.argmax(x, axis=axis) 39 | 40 | 41 | def softmax(x, axis=None): 42 | return tf.nn.softmax(x, dim=axis) 43 | 44 | 45 | # ================================================================ 46 | # Misc 47 | # ================================================================ 48 | 49 | 50 | def is_placeholder(x): 51 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 52 | 53 | 54 | # ================================================================ 55 | # Inputs 56 | # ================================================================ 57 | 58 | 59 | class TfInput(object): 60 | def __init__(self, name="(unnamed)"): 61 | """Generalized Tensorflow placeholder. The main differences are: 62 | - possibly uses multiple placeholders internally and returns multiple values 63 | - can apply light postprocessing to the value feed to placeholder. 64 | """ 65 | self.name = name 66 | 67 | def get(self): 68 | """Return the tf variable(s) representing the possibly postprocessed value 69 | of placeholder(s). 70 | """ 71 | raise NotImplemented() 72 | 73 | def make_feed_dict(data): 74 | """Given data input it to the placeholder(s).""" 75 | raise NotImplemented() 76 | 77 | 78 | class PlacholderTfInput(TfInput): 79 | def __init__(self, placeholder): 80 | """Wrapper for regular tensorflow placeholder.""" 81 | super().__init__(placeholder.name) 82 | self._placeholder = placeholder 83 | 84 | def get(self): 85 | return self._placeholder 86 | 87 | def make_feed_dict(self, data): 88 | return {self._placeholder: data} 89 | 90 | 91 | class BatchInput(PlacholderTfInput): 92 | def __init__(self, shape, dtype=tf.float32, name=None): 93 | """Creates a placeholder for a batch of tensors of a given shape and dtype 94 | 95 | Parameters 96 | ---------- 97 | shape: [int] 98 | shape of a single elemenet of the batch 99 | dtype: tf.dtype 100 | number representation used for tensor contents 101 | name: str 102 | name of the underlying placeholder 103 | """ 104 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 105 | 106 | 107 | class Uint8Input(PlacholderTfInput): 108 | def __init__(self, shape, name=None): 109 | """Takes input in uint8 format which is cast to float32 and divided by 255 110 | before passing it to the model. 111 | 112 | On GPU this ensures lower data transfer times. 113 | 114 | Parameters 115 | ---------- 116 | shape: [int] 117 | shape of the tensor. 
118 |         name: str
119 |             name of the underlying placeholder
120 |         """
121 | 
122 |         super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
123 |         self._shape = shape
124 |         self._output = tf.cast(super().get(), tf.float32) / 255.0
125 | 
126 |     def get(self):
127 |         return self._output
128 | 
129 | 
130 | def ensure_tf_input(thing):
131 |     """Takes either tf.placeholder or TfInput and outputs equivalent TfInput"""
132 |     if isinstance(thing, TfInput):
133 |         return thing
134 |     elif is_placeholder(thing):
135 |         return PlacholderTfInput(thing)
136 |     else:
137 |         raise ValueError("Must be a placeholder or TfInput")
138 | 
139 | 
140 | # ================================================================
141 | # Mathematical utils
142 | # ================================================================
143 | 
144 | 
145 | def huber_loss(x, delta=1.0):
146 |     """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
147 |     return tf.where(
148 |         tf.abs(x) < delta,
149 |         tf.square(x) * 0.5,
150 |         delta * (tf.abs(x) - 0.5 * delta)
151 |     )
152 | 
153 | 
154 | # ================================================================
155 | # Optimizer utils
156 | # ================================================================
157 | 
158 | # Minimize the loss while clipping the gradient norm
159 | def minimize_and_clip(optimizer, objective, var_list, global_step, clip_val=10):
160 |     """Minimize `objective` using `optimizer` w.r.t. variables in
161 |     `var_list` while ensuring the norm of the gradients for each
162 |     variable is clipped to `clip_val`
163 |     """
164 |     if clip_val is None:
165 |         return optimizer.minimize(objective, var_list=var_list, global_step=global_step)
166 |     else:
167 |         gradients = optimizer.compute_gradients(objective, var_list=var_list)
168 |         for i, (grad, var) in enumerate(gradients):
169 |             if grad is not None:
170 |                 # Given a tensor grad and a maximum clip value clip_val,
171 |                 # this operation rescales grad so that its L2-norm is
172 |                 # less than or equal to clip_val
173 |                 gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
174 |         return optimizer.apply_gradients(gradients)
175 | 
176 | 
177 | # ================================================================
178 | # Global session
179 | # ================================================================
180 | 
181 | def get_session():
182 |     """Returns the most recently made Tensorflow session"""
183 |     return tf.get_default_session()
184 | 
185 | 
186 | def make_session(num_cpu):
187 |     """Returns a session that will use CPUs only"""
188 |     # Limit the CPU resources the session may use
189 |     tf_config = tf.ConfigProto(
190 |         inter_op_parallelism_threads=num_cpu,
191 |         intra_op_parallelism_threads=num_cpu
192 |     )
193 |     tf_config.gpu_options.allow_growth = True
194 |     return tf.Session(config=tf_config)
195 | 
196 | 
197 | def multi_threaded_session():
198 |     """Returns a session that uses multiple CPU threads"""
199 |     # TODO: many CPUs could be used here as well; this is the only multi-threaded part, so it is really a pseudo-distributed setup!
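    # Added note: the actor-worker environments created in train.py all share this single
    # in-process session, so the parallelism here comes from TensorFlow's inter-/intra-op
    # threads (10 below), not from separate worker processes.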
200 | return make_session(num_cpu=10) 201 | 202 | 203 | ALREADY_INITIALIZED = set() 204 | 205 | 206 | def initialize(): 207 | """Initialize all the uninitialized variables in the global scope.""" 208 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 209 | get_session().run(tf.variables_initializer(new_variables)) 210 | ALREADY_INITIALIZED.update(new_variables) 211 | 212 | 213 | # ================================================================ 214 | # Scopes 215 | # ================================================================ 216 | 217 | # 按照命名空间得到variables变量 218 | def scope_vars(scope, trainable_only=False): 219 | """ 220 | Get variables inside a scope 221 | The scope can be specified as a string 222 | 223 | Parameters 224 | ---------- 225 | scope: str or VariableScope 226 | scope in which the variables reside. 227 | trainable_only: bool 228 | whether or not to return only the variables that were marked as trainable. 229 | 230 | Returns 231 | ------- 232 | vars: [tf.Variable] 233 | list of variables in `scope`. 234 | """ 235 | return tf.get_collection( 236 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 237 | scope=scope if isinstance(scope, str) else scope.name 238 | ) 239 | 240 | 241 | def scope_name(): 242 | """Returns the name of current scope as a string, e.g. deepq/q_func""" 243 | return tf.get_variable_scope().name 244 | 245 | 246 | def absolute_scope_name(relative_scope_name): 247 | """Appends parent scope name to `relative_scope_name`""" 248 | return scope_name() + "/" + relative_scope_name 249 | 250 | 251 | # ================================================================ 252 | # Saving variables 253 | # ================================================================ 254 | 255 | 256 | def load_state(fname, saver=None): 257 | """Load all the variables to the current session from the location """ 258 | if saver is None: 259 | saver = tf.train.Saver() 260 | saver.restore(get_session(), fname) 261 | return saver 262 | 263 | 264 | def save_state(fname, saver=None): 265 | """Save all the variables in the current session to the location """ 266 | os.makedirs(os.path.dirname(fname), exist_ok=True) 267 | if saver is None: 268 | saver = tf.train.Saver() 269 | saver.save(get_session(), fname) 270 | return saver 271 | 272 | 273 | # ================================================================ 274 | # Theano-like Function 275 | # ================================================================ 276 | 277 | # 建立一个函数 278 | def function(inputs, outputs, updates=None, givens=None): 279 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 280 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 281 | values to be feed to the inputs placeholders and produces the values of the experessions 282 | in outputs. 283 | 284 | Input values can be passed in the same order as inputs or can be provided as kwargs based 285 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 
286 | 287 | Example: 288 | x = tf.placeholder(tf.int32, (), name="x") 289 | y = tf.placeholder(tf.int32, (), name="y") 290 | z = 3 * x + 2 * y 291 | lin = function([x, y], z, givens={y: 0}) 292 | 293 | with single_threaded_session(): 294 | initialize() 295 | 296 | assert lin(2) == 6 297 | assert lin(x=3) == 9 298 | assert lin(2, 2) == 10 299 | assert lin(x=2, y=3) == 12 300 | 301 | Parameters 302 | ---------- 303 | inputs: [tf.placeholder or TfInput] 304 | list of input arguments 305 | outputs: [tf.Variable] or tf.Variable 306 | list of outputs or a single output to be returned from function. Returned 307 | value will also have the same shape. 308 | """ 309 | if isinstance(outputs, list): 310 | return _Function(inputs, outputs, updates, givens=givens) 311 | elif isinstance(outputs, (dict, collections.OrderedDict)): 312 | f = _Function(inputs, outputs.values(), updates, givens=givens) 313 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 314 | else: 315 | f = _Function(inputs, [outputs], updates, givens=givens) 316 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 317 | 318 | 319 | class _Function(object): 320 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 321 | for inpt in inputs: 322 | if not issubclass(type(inpt), TfInput): 323 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 324 | self.inputs = inputs 325 | updates = updates or [] 326 | # tf.group() 327 | # Create an op that groups multiple operations. 328 | # When this op finishes, all ops in inputs have finished. 329 | # This op has no output. 330 | self.update_group = tf.group(*updates) 331 | self.outputs_update = list(outputs) + [self.update_group] 332 | self.givens = {} if givens is None else givens 333 | self.check_nan = check_nan 334 | 335 | def _feed_input(self, feed_dict, inpt, value): 336 | if issubclass(type(inpt), TfInput): 337 | feed_dict.update(inpt.make_feed_dict(value)) 338 | elif is_placeholder(inpt): 339 | feed_dict[inpt] = value 340 | 341 | def __call__(self, *args, **kwargs): 342 | assert len(args) <= len(self.inputs), "Too many arguments provided" 343 | feed_dict = {} 344 | # Update the args 345 | for inpt, value in zip(self.inputs, args): 346 | self._feed_input(feed_dict, inpt, value) 347 | # Update the kwargs 348 | kwargs_passed_inpt_names = set() 349 | for inpt in self.inputs[len(args):]: 350 | inpt_name = inpt.name.split(':')[0] 351 | inpt_name = inpt_name.split('/')[-1] 352 | assert inpt_name not in kwargs_passed_inpt_names, \ 353 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 354 | if inpt_name in kwargs: 355 | kwargs_passed_inpt_names.add(inpt_name) 356 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 357 | else: 358 | assert inpt in self.givens, "Missing argument " + inpt_name 359 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 360 | # Update feed dict with givens. 
361 | for inpt in self.givens: 362 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 363 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 364 | if self.check_nan: 365 | if any(np.isnan(r).any() for r in results): 366 | raise RuntimeError("Nan detected") 367 | return results 368 | -------------------------------------------------------------------------------- /maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.prioritized_rb.replay_buffer import ReplayBuffer 9 | import tensorflow.contrib.layers as layers 10 | 11 | 12 | def discount_with_dones(rewards, dones, gamma): 13 | discounted = [] 14 | r = 0 15 | for reward, done in zip(rewards[::-1], dones[::-1]): 16 | r = reward + gamma * r 17 | r = r * (1. - done) 18 | discounted.append(r) 19 | return discounted[::-1] 20 | 21 | 22 | def make_update_exp(vals, target_vals): 23 | polyak = 1.0 - 1e-2 24 | expression = [] 25 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 26 | # update target network parameters (once) 27 | expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var)) 28 | expression = tf.group(*expression) 29 | return U.function([], [], updates=[expression]) 30 | 31 | 32 | def CNN(state_input, reuse=tf.AUTO_REUSE, scope='CNN'): 33 | with tf.variable_scope(scope, reuse=reuse): 34 | state = tf.layers.conv2d(state_input, 16, 3, activation='relu', strides=2, padding='VALID') 35 | state = tf.layers.conv2d(state, 32, 3, activation='relu', strides=2, padding='VALID') 36 | state = tf.layers.conv2d(state, 64, 3, activation='relu', strides=2, padding='VALID') 37 | temp = 64 * 9 * 9 38 | 39 | state = tf.layers.batch_normalization(state) 40 | input_1 = tf.reshape(state, [-1]) 41 | input_s = tf.reshape(input_1, [-1, temp]) 42 | return input_s 43 | 44 | 45 | # TODO: RNN!!! 
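# Added note: CNN above pushes an 80x80x3 observation through three stride-2 VALID convolutions
# (80 -> 39 -> 19 -> 9 spatially), so the flattened feature vector has 9*9*64 = 5184 entries
# (the `temp` constant). RNN below runs a layer-normalized LSTM over a time-major sequence of
# such CNN features, shaped [rnn_length, batch, 5184], and returns only the last time step's output.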
46 | def RNN(state_input, reuse=tf.AUTO_REUSE, scope='RNN', cell_size=None, initial_state=None): 47 | with tf.variable_scope(scope, reuse=reuse): 48 | rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=cell_size, 49 | layer_norm=True, norm_gain=1.0, norm_shift=0.0, 50 | dropout_keep_prob=0.75, dropout_prob_seed=None) 51 | outputs, final_state = tf.nn.dynamic_rnn( 52 | cell=rnn_cell, inputs=state_input, initial_state=initial_state, time_major=True, dtype=tf.float32) 53 | cell_out = outputs[-1, :, :] 54 | return cell_out, final_state 55 | 56 | 57 | # 多层感知机 Actor/Critic-Net 互相独立 58 | # TODO:输出加了tanh确实不会梯度爆炸,但是收敛效果变得不是很好 59 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, ac_fn=None): 60 | # This model takes as input an observation and returns values of all actions 61 | with tf.variable_scope(scope, reuse=reuse): 62 | out = input 63 | 64 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu, 65 | weights_regularizer=tf.contrib.layers.l2_regularizer(1e-2), 66 | biases_regularizer=tf.contrib.layers.l2_regularizer(1e-2)) 67 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu, 68 | weights_regularizer=tf.contrib.layers.l2_regularizer(1e-2), 69 | biases_regularizer=tf.contrib.layers.l2_regularizer(1e-2)) 70 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None, 71 | weights_regularizer=tf.contrib.layers.l2_regularizer(1e-2), 72 | biases_regularizer=tf.contrib.layers.l2_regularizer(1e-2)) 73 | return out 74 | 75 | 76 | # actor 77 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer,global_step, grad_norm_clipping=None, local_q_func=False, 78 | num_units=64, scope="trainer", reuse=None, args=None): 79 | with tf.variable_scope(scope, reuse=reuse): 80 | # create distribtuions - DiagGaussian 81 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 82 | # make_pdtype(box(2,))->DiagGaussianPdType(2) 83 | 84 | # set up placeholders 85 | obs_ph_n = make_obs_ph_n # n * [None,80,80,3] 86 | # n * [None, 3] 87 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))] 88 | 89 | # actor output shape(p-shape):[None,2*3] 2:mean,std 3:one_uav_action_n 90 | # add cnn for actor here! p_input=CNN(obs_ph_n[p_index]) 91 | if args.rnn_length > 0: 92 | cnn_output = tf.reshape(CNN(state_input=obs_ph_n[p_index], scope='p_func'), 93 | [args.rnn_length, -1, 64 * 9 * 9]) 94 | p_input, _ = RNN(state_input=cnn_output, scope='p_func', cell_size=args.rnn_cell_size) 95 | else: 96 | p_input = CNN(obs_ph_n[p_index], scope='p_func') 97 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units, 98 | ac_fn=tf.nn.tanh) # TODO:actor的输出加tanh,避免爆炸 99 | 100 | # 提取CNN+BATCH_NORMALIZATION+MLP里面的参数 101 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 102 | 103 | # wrap parameters in distribution 104 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 105 | # PdType.pdfromflat(p) => DGPT.pdclass()(p) =>DiagGaussianPd(p) 106 | # shape of p [None, 4] => mean:[None, 2] std:[None, 2] 107 | 108 | act_sample = act_pd.sample() # action == mean + std * tf.random_normal 109 | mean, logstd = act_pd.mean, act_pd.logstd 110 | # act_pd.flatparam() === p 111 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 112 | 113 | act_input_n = act_ph_n + [] 114 | act_input_n[p_index] = act_pd.sample() # 每个agent更新自己的action,跑多次这个函数最后会形成新的action_input_n 115 | 116 | # add cnn for critic here! 
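        # Added note: every agent's observation placeholder is encoded by the CNN(+RNN) that lives
        # in the shared 'q_func' scope, so the critic input assembled below concatenates all agents'
        # encoded observations with all agents' actions (unless local_q_func keeps only this agent's).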
117 | cnn_obs_ph_n = [] 118 | for obs_ph in obs_ph_n: 119 | # rnn 120 | if args.rnn_length > 0: 121 | cnn_output = tf.reshape(CNN(state_input=obs_ph, scope='q_func'), [args.rnn_length, -1, 64 * 9 * 9]) 122 | cell_out, _ = RNN(state_input=cnn_output, scope='q_func', cell_size=args.rnn_cell_size) 123 | cnn_obs_ph_n.append(cell_out) 124 | else: 125 | cnn_obs_ph_n.append(CNN(state_input=obs_ph, scope='q_func')) 126 | 127 | q_input = tf.concat(cnn_obs_ph_n + act_input_n, 1) 128 | if local_q_func: 129 | q_input = tf.concat([cnn_obs_ph_n[p_index], act_input_n[p_index]], 1) 130 | # reuse=True, the same critic. 会使用q_func空间里的critic,来更新actor的loss 131 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0] 132 | pg_loss = -tf.reduce_mean(q) 133 | 134 | loss = pg_loss + p_reg * 1e-3 135 | 136 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,global_step, grad_norm_clipping) 137 | 138 | # Create callable functions 139 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 140 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) # 单个agent的evaluate-actor动作输出函数 141 | p_values = U.function([obs_ph_n[p_index]], p) 142 | 143 | # target network 144 | # add cnn for actor here! p_target_input=CNN(obs_ph_n[p_index]) 145 | if args.rnn_length > 0: 146 | cnn_output = tf.reshape(CNN(state_input=obs_ph_n[p_index], scope='target_p_func'), 147 | [args.rnn_length, -1, 64 * 9 * 9]) 148 | p_target_input, _ = RNN(state_input=cnn_output,scope='target_p_func',cell_size=args.rnn_cell_size) 149 | else: 150 | p_target_input = CNN(obs_ph_n[p_index], scope='target_p_func') 151 | 152 | target_p = p_func(p_target_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", 153 | num_units=num_units, ac_fn=tf.nn.tanh) 154 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 155 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) # 更新target network 156 | 157 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 158 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) # 单个agent的target-actor动作输出函数 159 | 160 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 161 | 162 | 163 | # critic 164 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, global_step, grad_norm_clipping=None, local_q_func=False, 165 | scope="trainer", reuse=None, num_units=64, args=None): 166 | with tf.variable_scope(scope, reuse=reuse): 167 | # create distribtuions (n * action_n) 168 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] # dialog 高斯分布 169 | 170 | # set up placeholders 171 | obs_ph_n = make_obs_ph_n # n * [None,80,80,3] 172 | 173 | # add CNN for critic here. 
cnn_obs_ph_n=CNN(obs_ph_n) 174 | cnn_obs_ph_n = [] # n * [None,5184] 175 | for obs_ph in obs_ph_n: 176 | # rnn 177 | if args.rnn_length > 0: 178 | cnn_output = tf.reshape(CNN(state_input=obs_ph, scope='q_func'), [args.rnn_length, -1, 64 * 9 * 9]) 179 | cell_out, _ = RNN(state_input=cnn_output, scope='q_func', cell_size=args.rnn_cell_size) 180 | cnn_obs_ph_n.append(cell_out) 181 | else: 182 | cnn_obs_ph_n.append(CNN(state_input=obs_ph, scope='q_func')) 183 | 184 | # multi-state-placeholder(num_agents) 185 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))] 186 | target_ph = tf.placeholder(tf.float32, [None], name="target") 187 | 188 | # q_input = n * (s cat a) shape:[None,n*(5184+3)] 189 | q_input = tf.concat(cnn_obs_ph_n + act_ph_n, 1) 190 | if local_q_func: # false 191 | q_input = tf.concat([cnn_obs_ph_n[q_index], act_ph_n[q_index]], 1) 192 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0] # MLP 输出值,增加一维 shape=[None,0] 193 | 194 | # 得到训练所需要的所有(在scope命名空间内)的variable变量(weight/bias/batch_normalization) 195 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 196 | 197 | q_square = tf.square(q - target_ph) 198 | q_loss = tf.reduce_mean(q_square) # Square loss (from batch to one) 199 | 200 | # viscosity solution to Bellman differential equation in place of an initial condition 201 | q_reg = tf.reduce_mean(tf.square(q)) 202 | loss = q_loss # + 1e-3 * q_reg 203 | 204 | # optimizer(Adam) 205 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,global_step,grad_norm_clipping) 206 | 207 | # Create callable functions 建立了一个pipeline,方便后续训练 208 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=[loss, q_square, q, q_input], 209 | updates=[optimize_expr]) 210 | q_values = U.function(obs_ph_n + act_ph_n, q) 211 | 212 | # add CNN for critic here. 
tcnn_obs_ph_n=CNN(obs_ph_n) 213 | tcnn_obs_ph_n = [] # n * [None,5184] for target 214 | for obs_ph in obs_ph_n: 215 | # rnn 216 | if args.rnn_length > 0: 217 | cnn_output = tf.reshape(CNN(state_input=obs_ph, scope='target_q_func'), 218 | [args.rnn_length, -1, 64 * 9 * 9]) 219 | cell_out, _ = RNN(state_input=cnn_output, scope='target_q_func', cell_size=args.rnn_cell_size) 220 | tcnn_obs_ph_n.append(cell_out) 221 | else: 222 | tcnn_obs_ph_n.append(CNN(state_input=obs_ph, scope='target_q_func')) 223 | 224 | q_target_input = tf.concat(tcnn_obs_ph_n + act_ph_n, 1) 225 | if local_q_func: # false 226 | q_target_input = tf.concat([tcnn_obs_ph_n[q_index], act_ph_n[q_index]], 1) 227 | 228 | # target network 229 | target_q = q_func(q_target_input, 1, scope="target_q_func", num_units=num_units)[:, 0] 230 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 231 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 232 | 233 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 234 | 235 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 236 | 237 | 238 | class MADDPGAgentTrainer(AgentTrainer): # 按照agent_index挨个建立trainer 239 | def __init__(self, name, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 240 | self.name = name 241 | self.n = len(obs_shape_n) # 2 242 | self.agent_index = agent_index 243 | self.args = args 244 | 245 | # TODO:加一个自适应学习率衰减(有很多tricks) 246 | self.global_train_step = tf.Variable(tf.constant(0.0), trainable=False) 247 | self.decey_lr = tf.train.exponential_decay(learning_rate=self.args.lr, global_step=self.global_train_step, 248 | decay_steps=100, decay_rate=self.args.decay_rate, staircase=True) 249 | # multi-state-placeholder(num_agents) 250 | obs_ph_n = [] 251 | for i in range(self.n): 252 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) 253 | 254 | # Create all the functions necessary to train the model 255 | 256 | # critic 257 | # q_train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=[loss, q_square, q, q_input], 258 | # updates=[optimize_expr]) 259 | # q_update = make_update_exp(q_func_vars, target_q_func_vars) 260 | # q_values = U.function(obs_ph_n + act_ph_n, q) 261 | # target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 262 | # self.q_debug={'q_values': q_values, 'target_q_values': target_q_values} 263 | self.q_train, self.q_update, self.q_debug = q_train( 264 | scope=self.name, 265 | make_obs_ph_n=obs_ph_n, 266 | act_space_n=act_space_n, 267 | q_index=agent_index, 268 | q_func=mlp_model, # mlp 269 | optimizer=tf.train.AdamOptimizer(learning_rate=self.decey_lr), #args.lr 270 | grad_norm_clipping=0.5, 271 | local_q_func=local_q_func, # false 272 | num_units=args.num_units, # 600 273 | args=args, 274 | global_step=self.global_train_step 275 | ) 276 | 277 | # actor 278 | # self.act 算的是第agent_index个agent的action_sample! 
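        # Added note: p_train below builds this agent's actor (CNN/RNN encoder + mlp_model head) and
        # reuses the critic from the 'q_func' scope for the policy-gradient loss. Both optimizers use
        # self.decey_lr, an exponential decay of args.lr every 100 steps of global_train_step; as
        # written, minimize_and_clip only passes global_step when clip_val is None, so with
        # grad_norm_clipping=0.5 the decay counter does not appear to advance.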
279 | # self.p-_debug={'p_values': p_values, 'target_act': target_act} 280 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 281 | scope=self.name, 282 | make_obs_ph_n=obs_ph_n, 283 | act_space_n=act_space_n, 284 | p_index=agent_index, 285 | p_func=mlp_model, # mlp 286 | q_func=mlp_model, # mlp 287 | optimizer=tf.train.AdamOptimizer(learning_rate=self.decey_lr), 288 | grad_norm_clipping=0.5, 289 | local_q_func=local_q_func, # false 290 | num_units=args.num_units, # 600 291 | args=args, 292 | global_step = self.global_train_step 293 | ) 294 | 295 | # Create experience buffer 296 | self.buffer_size = self.args.buffer_size # 1e6 297 | self.beta = self.args.beta 298 | self.replay_buffer = ReplayBuffer(int(self.buffer_size), int(self.args.batch_size), self.args.alpha, 299 | self.args.epsilon) 300 | self.replay_sample_index = None 301 | 302 | @property 303 | def filled_size(self): 304 | return len(self.replay_buffer) 305 | 306 | def action(self, obs): 307 | actor_output = self.act(obs)[0] 308 | return actor_output 309 | 310 | def experience(self, obs, act, rew, new_obs, done, terminal, num_actor_workers): 311 | # Store transition in the replay buffer. 312 | self.replay_buffer.add(obs, act, rew, new_obs, float(done), self.args.N, self.args.gamma, num_actor_workers) 313 | 314 | def preupdate(self): 315 | self.replay_sample_index = None 316 | 317 | def update(self, env, agents, t): 318 | # replay buffer is not large enough 没填满的时候不训练 319 | # if len(self.replay_buffer) < 10000: 320 | if len(self.replay_buffer) < 100 * self.args.batch_size: 321 | return [0] 322 | if not t % 10 == 0: # only update every 10 steps 323 | return [0] 324 | 325 | # 随着训练的进行,让β从某个小于1的值渐进地靠近1 326 | if self.beta < 1.: 327 | self.beta *= 1. + 1e-4 328 | 329 | # sample from one agent(batch:1024) 之后根据β算出来的weights没有用到呢!!! 
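        # Added note: the prioritized buffer returns (transitions, importance weights, priorities,
        # indices); beta is annealed toward 1 above, but the importance-sampling weights are only
        # written to the debug file further down and never multiply the TD loss inside q_train.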
330 | (obs, act, rew, obs_next, done), weights, priorities, self.replay_sample_index = self.replay_buffer.sample( 331 | self.args.batch_size, self.beta, self.args.num_actor_workers, self.args.rnn_length) # batch-size=1024 332 | 333 | # collect replay sample from all agents 334 | obs_n = [] 335 | obs_next_n = [] 336 | act_n = [] 337 | 338 | index = self.replay_sample_index # index数组 339 | for i in range(self.n): 340 | obs_, _, rew_, obs_next_, _ = agents[i].replay_buffer.sample_index(index, self.args.num_actor_workers, 341 | self.args.rnn_length) 342 | _, act_, _, _, done_ = agents[i].replay_buffer.sample_index(index, 0, 0) 343 | 344 | if self.args.rnn_length > 0: 345 | obs_ = obs_.transpose((1, 0, 2, 3, 4)) 346 | obs_next_ = obs_next_.transpose((1, 0, 2, 3, 4)) 347 | obs_shape = obs_.shape 348 | obs_ = obs_.reshape(-1, obs_shape[-3], obs_shape[-2], obs_shape[-1]) 349 | obs_next_ = obs_next_.reshape(-1, obs_shape[-3], obs_shape[-2], obs_shape[-1]) 350 | 351 | obs_n.append(obs_) 352 | obs_next_n.append(obs_next_) 353 | act_n.append(act_) 354 | 355 | # train q network 356 | num_sample = 1 357 | target_q = 0.0 358 | 359 | # TODO: 在target network里面采用兼顾过去和未来的一长段RNN 计算Qt+n 360 | # use functions defined (batch:1024) 361 | for i in range(num_sample): 362 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 363 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 364 | target_q += rew + self.args.gamma ** self.args.N * (1.0 - done) * target_q_next # N-step(N=5) 365 | target_q /= num_sample 366 | 367 | [q_loss, q_td, Q, q_input] = self.q_train(*(obs_n + act_n + [target_q])) 368 | 369 | debug_dir = env.log_dir + self.args.debug_dir 370 | if os.path.exists(debug_dir) is False: 371 | os.makedirs(debug_dir) 372 | with open(debug_dir + "current_step_information_{}.txt".format(self.name), 'w+') as file: 373 | for i, r, p, q, w in zip(index, rew, priorities, Q, weights): 374 | print(self.name, " current_global_step: ", t, "-----index: ", i, " reward(n-step): ", r, " priority: ", 375 | p, " Q: ", q, " Wi: ", w, file=file) 376 | 377 | # priority replay buffer update (use TD-error) 378 | values = np.fabs(q_td) 379 | self.replay_buffer.priority_update(self.replay_sample_index, values) 380 | 381 | # train p network 382 | p_loss = self.p_train(*(obs_n + act_n)) 383 | 384 | self.p_update() # actor update 385 | self.q_update() # critic update 386 | 387 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 388 | -------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BIT-MCS/DRL-EC3/3f6fc8afe7ddea615e0e8f3f9f0fdfd6a6cd6db6/maddpg/trainer/prioritized_rb/__init__.py -------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/proportional.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | from . import sum_tree 5 | 6 | 7 | class Experience(object): 8 | """ The class represents prioritized experience replay buffer. 9 | The class has functions: store samples, pick samples with 10 | probability in proportion to sample's priority, update 11 | each sample's priority, reset alpha. 12 | see https://arxiv.org/pdf/1511.05952.pdf . 
13 | """ 14 | 15 | def __init__(self, memory_size, batch_size, alpha): 16 | """ Prioritized experience replay buffer initialization. 17 | 18 | Parameters 19 | ---------- 20 | memory_size : int 21 | sample size to be stored 22 | batch_size : int 23 | batch size to be selected by `select` method 24 | alpha: float 25 | exponent determine how much prioritization. 26 | Prob_i \sim priority_i**alpha/sum(priority**alpha) 27 | """ 28 | self.tree = sum_tree.SumTree(memory_size) 29 | self.memory_size = memory_size 30 | self.batch_size = batch_size 31 | self.alpha = alpha 32 | 33 | def add(self, data, priority): 34 | """ Add new sample. 35 | 36 | Parameters 37 | ---------- 38 | data : object 39 | new sample 40 | priority : float 41 | sample's priority 42 | """ 43 | self.tree.add(data, priority ** self.alpha) 44 | 45 | def n_step(self, n, r_position, x_position, gamma,num_actor_workers): 46 | current_index = self.tree.cursor - 1 47 | current_value = self.tree.data[current_index][r_position] # reward 48 | begin_index = current_index 49 | for i in range(1, n, 1): 50 | index = begin_index - i*num_actor_workers 51 | if index < 0 and index + self.tree.filled_size() <= current_index: 52 | break 53 | i_gamma = np.power(gamma, i) 54 | self.tree.data[index][r_position] += i_gamma * current_value # n-step 的处理,加了4个真实的reward 55 | if self.tree.filled_size() >= n: 56 | n_step_back = current_index - n 57 | if n_step_back < 0 and n_step_back + self.tree.filled_size() <= current_index: 58 | return 59 | self.tree.data[n_step_back][x_position] = copy.deepcopy(self.tree.data[current_index][x_position]) 60 | 61 | def select(self, beta,num_actor_workers,rnn_length): 62 | """ The method return samples randomly. 63 | 64 | Parameters 65 | ---------- 66 | beta : float 67 | 68 | Returns 69 | ------- 70 | out : 71 | list of samples 72 | weights: 73 | list of weight 74 | indices: 75 | list of sample indices 76 | The indices indicate sample positions in a sum tree. 77 | """ 78 | 79 | if self.tree.filled_size() < self.batch_size: 80 | return None, None, None 81 | ranges = np.linspace(0, self.tree.tree[0], num=self.batch_size + 1) 82 | out = [] 83 | indices = [] 84 | weights = [] 85 | priorities = [] 86 | for i in range(self.batch_size): 87 | while True: 88 | r = random.uniform(ranges[i], ranges[i+1]) 89 | data, priority, index = self.tree.find(r, norm=False) 90 | if index < (rnn_length-1)*num_actor_workers: 91 | index += (rnn_length-1)*num_actor_workers 92 | data=self.tree.data[index] 93 | priority=self.tree.tree[index + (2 ** (self.tree.tree_level - 1) - 1)] 94 | if data is not None: 95 | break 96 | priorities.append(priority) 97 | weights.append((1. / self.memory_size / priority) ** beta if priority > 1e-16 else 0) 98 | indices.append(index) 99 | out.append(data) 100 | 101 | weights = list(np.array(weights) / max(weights)) # Normalize for stability 102 | 103 | return out, weights,priorities, indices 104 | 105 | def priority_update(self, indices, priorities): 106 | """ The methods update samples's priority. 107 | 108 | Parameters 109 | ---------- 110 | indices : 111 | list of sample indices 112 | """ 113 | for i, p in zip(indices, priorities): 114 | self.tree.val_update(i, p ** self.alpha) 115 | 116 | def reset_alpha(self, alpha): 117 | """ Reset a exponent alpha. 
118 | Parameters 119 | ---------- 120 | alpha : float 121 | """ 122 | self.alpha, old_alpha = alpha, self.alpha 123 | priorities = [self.tree.get_val(i) ** -old_alpha for i in range(self.tree.filled_size())] 124 | self.priority_update(range(self.tree.filled_size()), priorities) 125 | 126 | 127 | 128 | 129 |
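Before the wrapper in replay_buffer.py below, a brief note on how `Experience` is driven end to end: transitions are added with an initial priority, `select` draws a batch proportionally to priority and returns importance-sampling weights w_i proportional to (1 / (N * P(i)))**beta normalized by their maximum, and `priority_update` writes the new, TD-error-based priorities back. A small illustrative sketch; the toy transitions and hyper-parameter values are made up, not taken from the experiments.

```python
import numpy as np
from maddpg.trainer.prioritized_rb.proportional import Experience

buf = Experience(memory_size=1024, batch_size=4, alpha=0.6)

# Store toy transitions [obs, act, rew, obs_next, done] with a constant initial priority.
for t in range(32):
    buf.add([np.zeros(3), np.zeros(2), float(t), np.zeros(3), False], priority=1.0)

# Draw a batch proportionally to priority; beta controls the importance-sampling correction.
samples, weights, priorities, indices = buf.select(beta=0.4, num_actor_workers=1, rnn_length=0)

# After computing TD errors elsewhere, push their magnitudes back as the new priorities.
buf.priority_update(indices, [abs(td) for td in np.random.randn(len(indices))])
```
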
-------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from .proportional import Experience 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size, batch_size, alpha, epsilon): 7 | """Create Prioritized Replay buffer. 8 | 9 | Parameters 10 | ---------- 11 | size: int 12 | Max number of transitions to store in the buffer. When the buffer 13 | overflows the old memories are dropped. 14 | """ 15 | self.rb = Experience(size, batch_size, alpha) 16 | self.epsilon = epsilon 17 | # self._storage = [] 18 | # self._maxsize = int(size) 19 | # self._next_idx = 0 20 | 21 | def __len__(self): 22 | # return len(self._storage) 23 | return self.rb.tree.filled_size() 24 | 25 | def clear(self): 26 | # self._storage = [] 27 | # self._next_idx = 0 28 | self.rb = Experience(self.rb.memory_size, self.rb.batch_size, self.rb.alpha) 29 | 30 | def add(self, obs_t, action, reward, obs_tp1, done, n, gamma, num_actor_workers): 31 | data = [obs_t, action, reward, obs_tp1, done] 32 | priority = self.rb.tree.max_value + self.epsilon 33 | self.rb.add(data, priority) 34 | reward_index = 2 # the reward is stored at position 2 of data 35 | x__index = 3 # the next state is stored at position 3 of data 36 | 37 | # TODO: Prof. Liu wants to explore improvements to this n-step handling 38 | self.rb.n_step(n, reward_index, x__index, gamma, num_actor_workers) 39 | 40 | def _encode_sample(self, data): 41 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 42 | for i in data: 43 | # data = self.rb.tree.data[i] 44 | obs_t, action, reward, obs_tp1, done = i 45 | obses_t.append(np.array(obs_t, copy=False)) 46 | actions.append(np.array(action, copy=False)) 47 | rewards.append(reward) 48 | obses_tp1.append(np.array(obs_tp1, copy=False)) 49 | dones.append(done) 50 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 51 | 52 | def _encode_sample_index(self, index, num_actor_workers, rnn_length): 53 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 54 | for i in index: 55 | if rnn_length > 0: 56 | obs_t_rnn, action_rnn, reward_rnn, obs_tp1_rnn, done_rnn = [], [], [], [], [] 57 | for j in range(rnn_length - 1, -1, -1): 58 | pindex = i - j * num_actor_workers 59 | obs_t, action, reward, obs_tp1, done = self.rb.tree.data[pindex] 60 | 61 | obs_t_rnn.append(np.array(obs_t)) 62 | action_rnn.append(np.array(action)) 63 | reward_rnn.append(np.array(reward)) 64 | obs_tp1_rnn.append(np.array(obs_tp1)) 65 | done_rnn.append(np.array(done)) 66 | obses_t.append(np.array(obs_t_rnn, copy=False)) 67 | actions.append(np.array(action_rnn, copy=False)) 68 | rewards.append(reward_rnn) 69 | obses_tp1.append(np.array(obs_tp1_rnn, copy=False)) 70 | dones.append(done_rnn) 71 | 72 | else: 73 | obs_t, action, reward, obs_tp1, done = self.rb.tree.data[i] 74 | obses_t.append(np.array(obs_t, copy=False)) 75 | actions.append(np.array(action, copy=False)) 76 | rewards.append(reward) 77 | obses_tp1.append(np.array(obs_tp1, copy=False)) 78 | dones.append(done) 79 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 80 | 81 | # def make_index(self, batch_size): 82 | # return [random.randint(0, self.rb.tree.filled_size() - 1) for _ in range(batch_size)] 83 | # 84 | # def make_latest_index(self, batch_size): 85 | # idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 86 | # np.random.shuffle(idx) 87 | # return idx 88 | 89 | def sample_index(self, idxes, num_actor_workers, rnn_length): 90 | return self._encode_sample_index(idxes, num_actor_workers, rnn_length) 91 | 92 | def sample(self, batch_size, beta, num_actor_workers, rnn_length): 93 | """Sample a batch of experiences. 94 | 95 | Parameters 96 | ---------- 97 | batch_size: int 98 | How many transitions to sample. 99 | 100 | Returns 101 | ------- 102 | obs_batch: np.array 103 | batch of observations 104 | act_batch: np.array 105 | batch of actions executed given obs_batch 106 | rew_batch: np.array 107 | rewards received as results of executing act_batch 108 | next_obs_batch: np.array 109 | next set of observations seen after executing act_batch 110 | done_mask: np.array 111 | done_mask[i] = 1 if executing act_batch[i] resulted in 112 | the end of an episode and 0 otherwise. 113 | """ 114 | data, weight, priorities, indices = self.rb.select(beta, num_actor_workers, rnn_length) 115 | return self._encode_sample(data), weight, priorities, indices 116 | 117 | def priority_update(self, indices, priorities): 118 | priorities = list(np.array(priorities) + self.epsilon) 119 | self.rb.priority_update(indices=indices, priorities=priorities) 120 | 121 | def reset_alpha(self, alpha): 122 | self.rb.reset_alpha(alpha) 123 | 124 | # def collect(self): 125 | # return self.sample(-1) 126 | -------------------------------------------------------------------------------- /maddpg/trainer/prioritized_rb/sum_tree.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import math 4 | 5 | 6 | class SumTree(object): 7 | def __init__(self, max_size): 8 | self.max_size = max_size 9 | self.tree_level = math.ceil(math.log(max_size + 1, 2)) + 1 10 | self.tree_size = 2 ** self.tree_level - 1 11 | self.tree = [0 for i in range(self.tree_size)] 12 | self.max_value = 0 13 | self.data = [None for i in range(self.max_size)] 14 | self.size = 0 15 | self.cursor = 0 16 | 17 | def add(self, contents, value): 18 | if value > self.max_value: 19 | self.max_value = value 20 | index = self.cursor 21 | self.cursor = (self.cursor + 1) % self.max_size # wrap around and overwrite the oldest entries once the buffer is full 22 | self.size = min(self.size + 1, self.max_size) 23 | 24 | self.data[index] = contents 25 | self.val_update(index, value) 26 | 27 | def get_val(self, index): 28 | tree_index = 2 ** (self.tree_level - 1) - 1 + index 29 | return self.tree[tree_index] 30 | 31 | def val_update(self, index, value): 32 | if value > self.max_value: 33 | self.max_value = value 34 | tree_index = 2 ** (self.tree_level - 1) - 1 + index 35 | diff = value - self.tree[tree_index] 36 | self.reconstruct(tree_index, diff) 37 | 38 | def reconstruct(self, tindex, diff): 39 | self.tree[tindex] += diff 40 | if not tindex == 0: 41 | tindex = int((tindex - 1) / 2) 42 | self.reconstruct(tindex, diff) 43 | 44 | def find(self, value, norm=True): 45 | if norm: 46 | value *= self.tree[0] 47 | return self._find(value, 0) 48 | 49 | def _find(self, value, index): 50 | if 2 ** (self.tree_level - 1) - 1 <= index: 51 | return self.data[index - (2 ** (self.tree_level - 1) - 1)], self.tree[index], index - ( 52 | 2 ** (self.tree_level - 1) - 1) 53 | 54 | left = self.tree[2 * index + 1] 55 | 56 | if value <= left: 57 | return self._find(value, 2 * index + 1) 58 | else: 59
| return self._find(value - left, 2 * (index + 1)) 60 | 61 | def print_tree(self): 62 | for k in range(1, self.tree_level + 1): 63 | for j in range(2 ** (k - 1) - 1, 2 ** k - 1): 64 | print(self.tree[j], end=' ') 65 | print() 66 | 67 | def filled_size(self): 68 | return self.size 69 | 70 | 71 | if __name__ == '__main__': 72 | s = SumTree(20) 73 | for i in range(20): 74 | s.add(2 ** i, i) 75 | s.print_tree() 76 | print(s.find(0.5)) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | numpy 4 | scipy 5 | seaborn 6 | tqdm 7 | colorcet 8 | panel 9 | pyviz-comms 10 | pandas 11 | tensorboard 12 | --------------------------------------------------------------------------------
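Putting the three pieces above together (SumTree for storage, Experience for proportional sampling, ReplayBuffer as the n-step wrapper the trainer talks to), a minimal end-to-end sketch of the intended call pattern is shown below; the toy shapes and hyper-parameter values are illustrative only and are not taken from the training scripts.

```python
import numpy as np
from maddpg.trainer.prioritized_rb.replay_buffer import ReplayBuffer

rb = ReplayBuffer(size=1024, batch_size=4, alpha=0.6, epsilon=1e-6)

# Store toy transitions; n and gamma drive the in-buffer n-step reward accumulation.
for t in range(64):
    rb.add(obs_t=np.zeros(3), action=np.zeros(2), reward=1.0, obs_tp1=np.zeros(3),
           done=False, n=5, gamma=0.95, num_actor_workers=1)

# Proportional sampling with importance weights, as consumed by the trainer's update().
(obs, act, rew, obs_next, done), weights, priorities, indices = rb.sample(
    batch_size=4, beta=0.4, num_actor_workers=1, rnn_length=0)

# Feed the absolute TD errors back so future sampling tracks the latest critic.
rb.priority_update(indices, np.abs(np.random.randn(len(indices))))
```
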