├── CMakeLists.txt
├── LICENSE
├── README.md
├── include
│   ├── Qlearning.h
│   └── env.h
├── result_image
│   ├── Qlearning.png
│   └── algorithm.png
└── src
    ├── Qlearning.cpp
    └── main.cpp

/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.6)
project(Qlearning)
# c++11
set(CMAKE_CXX_FLAGS "-std=c++11")
# debug build
set(CMAKE_BUILD_TYPE "Debug")

find_package(OpenCV 4.7.0 REQUIRED)  # find the installed OpenCV 4.7 path
include_directories(${OpenCV_INCLUDE_DIRS})  # load the OpenCV header files
include_directories(include)
add_executable(Qlearning src/main.cpp src/Qlearning.cpp)  # create the executable
target_link_libraries(Qlearning ${OpenCV_LIBS})  # link the OpenCV libraries to the executable
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 dongtaihong

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# QLearningVisualization
Global path planning with the Q-Learning algorithm, visualized with OpenCV


## Dependencies
- opencv 4.7.0
```shell
# installation guide: https://blog.csdn.net/weixin_43863869/article/details/128552342
```
## Build and Run
```shell
git clone git@github.com:dongtaihong/QLearningVisualization.git
cd QLearningVisualization
mkdir build && cd build
cmake .. && make
./Qlearning
```
## Result
**Note**: the GridMap function in Qlearning.cpp places obstacles at random, so it can easily produce a map with no feasible path. If the map turns out to be unsolvable, simply rerun the program.

![image](./result_image/Qlearning.png)
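
The repository itself never checks whether a randomly generated map is solvable. If you would rather detect that automatically than rerun by hand, a plain BFS reachability test over the same 0/1 occupancy grid is enough. The helper below is only a sketch of that idea and is **not** part of this project; the function name `IsSolvable` is made up for illustration.

```cpp
// Sketch (not in the repository): BFS reachability check on a 0/1 grid,
// where 0 marks a free cell and 1 marks an obstacle.
#include <queue>
#include <utility>
#include <vector>

bool IsSolvable(const std::vector<std::vector<int>>& grid, int sx, int sy,
                int gx, int gy) {
  const int h = grid.size(), w = grid[0].size();
  std::vector<std::vector<bool>> visited(h, std::vector<bool>(w, false));
  std::queue<std::pair<int, int>> open;
  open.push({sx, sy});
  visited[sx][sy] = true;
  const int dx[4] = {-1, 1, 0, 0};
  const int dy[4] = {0, 0, -1, 1};
  while (!open.empty()) {
    std::pair<int, int> cur = open.front();
    open.pop();
    if (cur.first == gx && cur.second == gy) return true;  // goal reached
    for (int k = 0; k < 4; ++k) {
      int nx = cur.first + dx[k], ny = cur.second + dy[k];
      if (nx < 0 || nx >= h || ny < 0 || ny >= w) continue;  // off the map
      if (grid[nx][ny] == 1 || visited[nx][ny]) continue;    // blocked or seen
      visited[nx][ny] = true;
      open.push({nx, ny});
    }
  }
  return false;  // the goal cell is unreachable
}
```

Such a check could be called right after `GridMap(...)` in `main.cpp`, regenerating the obstacles until a solvable map is produced.
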

## Principle
**Essence**: The environment is a Markov decision process (MDP) (S, A, P, R, γ). At the start, the agent has no idea how valuable the available actions are in each state, so a value-based reinforcement-learning method first defines a reward scheme for the environment (for path planning: a large reward for reaching the goal, a large penalty for hitting an obstacle, and a small penalty for every step taken, so that shorter paths are preferred). With the reward scheme in place, the agent trains by interacting with the environment over many episodes (the value iteration comes from dynamic programming, i.e., the Bellman optimality equation) until the MDP converges, which shows up as the Q-table converging. Once the Q-table has converged, the state-value structure of the environment is fixed, and path planning reduces to walking through the states while looking up the best action in the Q-table at each step.

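Concretely, with a grid cell encoded as the state index `row * map_w + col` (as in env.h), the learning step in Qlearning.cpp is the standard tabular Q-learning update. The snippet below restates that update on its own; the helper name `UpdateQ` is only illustrative, and it uses `std::max_element` where Qlearning.cpp sorts the four action values and takes the largest, which is equivalent.

```cpp
// Tabular Q-learning update:
//   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#include <algorithm>
#include <vector>

void UpdateQ(std::vector<std::vector<double>>& q_table, int state, int action,
             double reward, int next_state, double alpha, double gamma) {
  // value of the best action available in the next state
  double q_max = *std::max_element(q_table[next_state].begin(),
                                   q_table[next_state].end());
  // move Q(s, a) toward the Bellman optimality target r + gamma * max Q(s', .)
  q_table[state][action] +=
      alpha * (reward + gamma * q_max - q_table[state][action]);
}
```

In this project `alpha` is 0.1 and `gamma` is 0.9 (see main.cpp), and the reward is +10000 for reaching the goal, -10000 (`UNREACHABLE`) for stepping into an obstacle, and -10 for every ordinary move (see `QLearning::step`).
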
**Extension**: Q-learning finds the environment's value function (the Q-table) by rigorous iteration on the Bellman optimality equation. This is fundamentally a data-driven process: large amounts of experience keep refining the Q-table, which is discrete. Since the process is data-driven anyway, a natural step is to fit the Q-function directly with a deep neural network instead; that is DQN. DQN makes reinforcement learning much more flexible to apply, but the mathematically rigorous Bellman dynamic-programming step is then replaced by a deep network, so the method is no longer a white box and its interpretability suffers.

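For reference, where tabular Q-learning stores one value per state-action pair, DQN fits a parameterized function Q(s, a; θ) by minimizing the squared Bellman error. The objective below is the standard DQN loss (it is not implemented anywhere in this repository); θ⁻ denotes the parameters of a periodically refreshed target network.

```math
L(\theta) = \mathbb{E}_{(s,a,r,s')}\left[\left(r + \gamma \max_{a'} Q(s', a'; \theta^{-}) - Q(s, a; \theta)\right)^{2}\right]
```
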
![image](./result_image/algorithm.png)
--------------------------------------------------------------------------------
/include/Qlearning.h:
--------------------------------------------------------------------------------
/*
 * @Author: 董泰宏 2396203400@qq.com
 * @Date: 2023-04-28 09:29:31
 * @LastEditors: 董泰宏 2396203400@qq.com
 * @LastEditTime: 2023-05-04 09:52:53
 * @FilePath: /QLearningVisualization/include/Qlearning.h
 * @Description:
 * Copyright (c) 2023 by 董泰宏 email: 2396203400@qq.com, All Rights Reserved.
 */
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <random>

#include "env.h"

#define UNREACHABLE -10000
extern vector<vector<int>> map;
extern cv::Mat visualMap;

class QLearning {
 public:
  QLearning(Environment& env, int episodes, double alpha, double gamma,
            double epsilon, int end_x, int end_y);
  ~QLearning() = default;
  vector<vector<double>> result_q_table_;

 private:
  int episodes_;
  double alpha_;
  double gamma_;
  double epsilon_;
  vector<int> position{0, 0};
  int EGreedy(Environment& env, vector<int> position);  // epsilon-greedy action selection
  vector<int> step(Environment& env, int& action, double& reward, int end_x,
                   int end_y);  // take one training step
};

void GridMap(double start_x, double start_y, double end_x, double end_y);
void PrintFinalPath(QLearning& rl);
--------------------------------------------------------------------------------
/include/env.h:
--------------------------------------------------------------------------------
/*
 * @Author: 董泰宏 2396203400@qq.com
 * @Date: 2023-04-28 09:29:14
 * @LastEditors: 董泰宏 2396203400@qq.com
 * @LastEditTime: 2023-05-04 10:13:12
 * @FilePath: /QLearningVisualization/include/env.h
 * @Description:
 * Copyright (c) 2023 by 董泰宏 email: 2396203400@qq.com, All Rights Reserved.
 */

#pragma once

#include <iostream>
#include <vector>
using std::cout;
using std::endl;
using std::vector;

// map dimensions
const int map_h = 30;
const int map_w = 30;
const int size_x = 20;
const int size_y = 20;

#define UP 0
#define DOWN 1
#define LEFT 2
#define RIGHT 3
#define Free 0
#define OBSTACLE 1

/**
 * @description: Stores the reinforcement-learning value table (Q-table) and
 * the set of available actions
 * @return {*}
 */
class Environment {
 public:
  Environment(int map_h_size, int map_w_size) {
    vector<vector<double>> temp_table(map_h_size * map_w_size,
                                      vector<double>(4, 0));
    q_table_ = std::move(temp_table);
    action_ = {UP, DOWN, LEFT, RIGHT};
  }
  ~Environment() = default;
  vector<vector<double>> q_table_;
  vector<int> action_;
};
--------------------------------------------------------------------------------
/result_image/Qlearning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongtaihong/QLearningVisualization/d525fd89ce015097f02fffd7651f53b8bf7eaac6/result_image/Qlearning.png
--------------------------------------------------------------------------------
/result_image/algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongtaihong/QLearningVisualization/d525fd89ce015097f02fffd7651f53b8bf7eaac6/result_image/algorithm.png
--------------------------------------------------------------------------------
/src/Qlearning.cpp:
--------------------------------------------------------------------------------
/*
 * @Author: 董泰宏 2396203400@qq.com
 * @Date: 2023-04-28 09:29:41
 * @LastEditors: 董泰宏 2396203400@qq.com
 * @LastEditTime: 2023-05-04 15:32:05
 * @FilePath: /QLearningVisualization/src/Qlearning.cpp
 * @Description:
 * Copyright (c) 2023 by 董泰宏 email: 2396203400@qq.com, All Rights Reserved.
 */
#include "Qlearning.h"

#include "env.h"

// global definitions of the map variables
vector<vector<int>> map = vector<vector<int>>(map_h, vector<int>(map_w, 0));
cv::Mat visualMap(map_h * size_y, map_w * size_x, CV_8UC3,
                  cv::Scalar(255, 255, 255));

/**
 * @description: Draw the grid; fill the start and goal cells in red and the
 * obstacles in black
 * @param {double} start_x
 * @param {double} start_y
 * @param {double} end_x
 * @param {double} end_y
 * @return {*}
 */
void GridMap(double start_x, double start_y, double end_x, double end_y) {
  // draw the horizontal grid lines
  for (int r = 0; r <= map_w; r++) {
    cv::Point p1(0, r * size_y);
    cv::Point p2(map_w * size_x, r * size_y);
    cv::line(visualMap, p1, p2, cv::Scalar(0, 0, 0), 1, cv::LINE_AA);
  }

  // draw the vertical grid lines
  for (int c = 0; c <= map_h; c++) {
    cv::Point p1(c * size_x, 0);
    cv::Point p2(c * size_x, map_h * size_y);
    cv::line(visualMap, p1, p2, cv::Scalar(0, 0, 0), 1, cv::LINE_AA);
  }

  // Randomly generate obstacles. Note: this may well produce an unsolvable
  // map; if it does, simply rerun the program.
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<> disRow(0, map_w - 1);
  std::uniform_int_distribution<> disCol(0, map_h - 1);
  const double prob = 0.2;
  for (int i = 0; i < map_w; ++i) {
    for (int j = 0; j < map_h; ++j) {
      double randNum = static_cast<double>(disRow(gen)) / (map_w - 1);
      if (randNum < prob) {
        cv::rectangle(visualMap,
                      cv::Rect(i * size_x, j * size_y, size_x, size_y),
                      cv::Scalar(0, 0, 0), -1);
        map[i][j] = 1;
      }
    }
  }
  // start and goal cells
  cv::rectangle(visualMap,
                cv::Rect(start_x * size_x, start_y * size_y, size_x, size_y),
                cv::Scalar(0, 0, 255), -1);
  map[start_x][start_y] = 0;
  cv::rectangle(visualMap,
                cv::Rect(end_x * size_x, end_y * size_y, size_x, size_y),
                cv::Scalar(0, 0, 255), -1);
  map[end_x][end_y] = 0;
}

/**
 * @description: Choose an index according to the given probability distribution
 * @return {*} the chosen index
 */
int chooseAction(const std::vector<double>& probabilities) {
  // set up a discrete-distribution random number generator
  std::random_device rd;
  std::mt19937 gen(rd());
  std::discrete_distribution<> dis(probabilities.begin(), probabilities.end());

  // draw a random index and return the chosen action
  return dis(gen);
}

/**
 * @description: At the current position, choose the next action with an
 * epsilon-greedy policy
 * @param {vector<int>} position
 * @return {*} the chosen action
 */
int QLearning::EGreedy(Environment& env, vector<int> position) {
  // current best action
  int flag = 0;
  double reward_flag = -10000;
  for (int i = 0; i < 4; i++) {
    if (env.q_table_[position[0] * map_w + position[1]][i] > reward_flag) {
      reward_flag = env.q_table_[position[0] * map_w + position[1]][i];
      flag = i;
    }
  }
  // probability of each action
  int action_optimal = flag;
  vector<double> prob(4, 0);
  for (int i = 0; i < 4; i++) {
    prob[i] = double(epsilon_ / double(3.0));
    if (i == flag) {
      prob[i] = 1 - epsilon_;
    }
  }
  if (position[0] == 0) {
    prob[UP] = 0;
  }
  if (position[0] == map_h - 1) {
    prob[DOWN] = 0;
  }
  if (position[1] == 0) {
    prob[LEFT] = 0;
  }
  if (position[1] == map_w - 1) {
    prob[RIGHT] = 0;
  }

  // sample an action from the current probability distribution
  int action_probb = chooseAction(prob);
  return action_probb;
}

/**
 * @description: Compute the next position and the reward for the given action
 * @return {*} the next position
 */
vector<int> QLearning::step(Environment& env, int& action, double& reward,
                            int end_x, int end_y) {
  if (action == UP) {
    reward = -10;
    if (map[position[0] - 1][position[1]] == 1) {
      reward = UNREACHABLE;
    }
    if (position[0] - 1 == end_x && position[1] == end_y) reward = 10000;
    return vector<int>{position[0] - 1, position[1]};
  } else if (action == DOWN) {
    reward = -10;
    if (map[position[0] + 1][position[1]] == 1) {
      reward = UNREACHABLE;
    }
    if (position[0] + 1 == end_x && position[1] == end_y) reward = 10000;
    return vector<int>{position[0] + 1, position[1]};
  } else if (action == LEFT) {
    reward = -10;
    if (map[position[0]][position[1] - 1] == 1) {
      reward = UNREACHABLE;
    }
    if (position[0] == end_x && position[1] - 1 == end_y) reward = 10000;
    return vector<int>{position[0], position[1] - 1};
  } else {
    reward = -10;
    if (map[position[0]][position[1] + 1] == 1) {
      reward = UNREACHABLE;
    }
    if (position[0] == end_x && position[1] + 1 == end_y) reward = 10000;
    return vector<int>{position[0], position[1] + 1};
  }
  cout << "can not move" << endl;
  return vector<int>{};
}

/**
 * @description: Main body of the algorithm
 * @return {*}
 */
QLearning::QLearning(Environment& env, int episodes, double alpha,
                     double gamma, double epsilon, int end_x, int end_y)
    : episodes_(episodes), alpha_(alpha), gamma_(gamma), epsilon_(epsilon) {
  // initialize the boundary cells
  for (int i = 0; i < map_h; i++) {
    for (int j = 0; j < map_w; j++) {
      if (i == 0)
        env.q_table_[i * map_w + j][0] = UNREACHABLE;  // first row: cannot move up
      if (i == (map_h - 1))
        env.q_table_[i * map_w + j][1] = UNREACHABLE;  // last row: cannot move down
      if (j == 0)
        env.q_table_[i * map_w + j][2] = UNREACHABLE;  // first column: cannot move left
      if (j == (map_w - 1))
        env.q_table_[i * map_w + j][3] = UNREACHABLE;  // last column: cannot move right
    }
  }
  int action_prob = 0;
  for (int i = 0; i < episodes; i++) {
    // back to the start cell
    position[0] = 0;
    position[1] = 0;
    vector<int> final_x;
    vector<int> final_y;
    final_x.push_back(position[0]);
    final_y.push_back(position[1]);
    while (true) {
      action_prob = EGreedy(env, position);  // action chosen by the epsilon-greedy policy

      double reward = 0;
      vector<int> position_next =
          step(env, action_prob, reward, end_x, end_y);  // take one step

      vector<double> temp_Q =
          env.q_table_[position_next[0] * map_w + position_next[1]];
      std::sort(temp_Q.begin(), temp_Q.end());  // sort in ascending order
      double Q_max = temp_Q[3];
      env.q_table_[position[0] * map_w + position[1]][action_prob] +=
          alpha * (reward + gamma * Q_max -
                   env.q_table_[position[0] * map_w + position[1]]
                               [action_prob]);  // update the Q-table

      if (position[0] == end_x && position[1] == end_y) break;
      position = std::move(position_next);
      final_x.push_back(position[0]);
      final_y.push_back(position[1]);
    }
    cv::imshow("QLearning", visualMap);
    cv::waitKey(1);
  }
  result_q_table_ = std::move(env.q_table_);
}

/**
 * @description: Print the final path
 * @param {QLearning&} rl
 * @return {*}
 */
void PrintFinalPath(QLearning& rl) {
  int point_x = 0;
  int point_y = 0;
  int Num = 1;  // length of the path
  // print the result
  while (point_x != (map_h - 1) || point_y != (map_w - 1)) {
    cout << "(" << point_x << "," << point_y << ")"
         << "->";
    if (map[point_x][point_y] == 1) {
      cout << "crashed into an obstacle" << endl;
    }
    cv::rectangle(visualMap,
                  cv::Rect(point_x * size_x, point_y * size_y, size_x * 0.8,
                           size_y * 0.8),
                  cv::Scalar(0, 0, 255), -1);

    int flag = 0;
    double reward_flag = UNREACHABLE;
    for (int i = 0; i < 4; i++) {
      if (rl.result_q_table_[point_x * map_w + point_y][i] > reward_flag) {
        reward_flag = rl.result_q_table_[point_x * map_w + point_y][i];
        flag = i;
      }
    }
    if (flag == 0) point_x = point_x - 1;
    if (flag == 1) point_x = point_x + 1;
    if (flag == 2) point_y = point_y - 1;
    if (flag == 3) point_y = point_y + 1;
    Num++;
    // if the path gets too long, give up
    if (Num > map_h * 20) {
      cout << "failed" << endl;
      break;
    }
  }
  cout << "(" << point_x << "," << point_y << ") " << endl;
  cout << Num << endl;

  cv::imshow("QLearning", visualMap);
  cv::waitKey(18000);
}
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
/*
 * @Author: 董泰宏 2396203400@qq.com
 * @Date: 2023-04-28 09:29:48
 * @LastEditors: 董泰宏 2396203400@qq.com
 * @LastEditTime: 2023-05-04 15:27:28
 * @FilePath: /QLearningVisualization/src/main.cpp
 * @Description: Grid-map path planning with Q-Learning
 * Copyright (c) 2023 by 董泰宏 email: 2396203400@qq.com, All Rights Reserved.
 */
#include "Qlearning.h"

int main(int argc, char* argv[]) {
  // initialize the grid map, the start/goal cells and the obstacles
  GridMap(0, 0, map_w - 1, map_h - 1);

  // initialize the environment
  Environment myenv(map_h, map_w);

  // train in the current environment
  QLearning rl(myenv, 3500, 0.1, 0.9, 0.1, map_h - 1, map_w - 1);

  // print the final path on the grid map
  PrintFinalPath(rl);
  return 0;
}
--------------------------------------------------------------------------------