├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── demo ├── Makefile ├── io_uring_coroutine_echo_server.cpp ├── io_uring_echo_server.c ├── io_uring_echo_server.cpp ├── liburing_cat.c ├── test_echo_server.sh ├── uring_cat.c └── uring_server.c ├── document ├── 128.png ├── benchmark.md ├── coroutine │ └── Coroutines-on-a-Separate-Thread.md ├── io_uring-by-example │ ├── io_uring-by-example1.md │ ├── io_uring-by-example2.md │ └── io_uring-by-example3.md └── part1.md ├── server ├── http_conn.h ├── io_uring.h ├── main.cpp ├── server.h ├── stream.h ├── task.h └── utils.h └── test └── client.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.14)
2 | project(co-uring-server LANGUAGES CXX)
3 |
4 | set(CMAKE_CXX_STANDARD 20)
5 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
6 |
7 | # Default to a Debug build, but only when the user has not chosen a build
8 | # type and the generator is single-config; an explicit -DCMAKE_BUILD_TYPE wins.
9 | get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
10 | if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
11 |   set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type" FORCE)
12 | endif()
13 |
14 | # AddressSanitizer instruments Debug builds unless explicitly switched off.
15 | option(CO_URING_ENABLE_ASAN "Enable AddressSanitizer in Debug builds" ON)
16 | set(asan_in_debug "$<AND:$<CONFIG:Debug>,$<BOOL:${CO_URING_ENABLE_ASAN}>>")
17 |
18 | # Usage requirements shared by every executable in this project.
19 | add_library(co_uring_options INTERFACE)
20 | target_compile_options(co_uring_options INTERFACE
21 |   -Wall
22 |   "$<$<CXX_COMPILER_ID:GNU>:-fcoroutines>"
23 |   "$<$<CONFIG:Debug>:-fno-omit-frame-pointer>"
24 |   "$<${asan_in_debug}:-fsanitize=address>"
25 | )
26 | # The sanitizer runtime must also be requested at link time.
27 | target_link_options(co_uring_options INTERFACE
28 |   "$<${asan_in_debug}:-fsanitize=address>"
29 | )
30 | # The project headers sit directly in server/.
31 | target_include_directories(co_uring_options INTERFACE
32 |   "${PROJECT_SOURCE_DIR}/server"
33 | )
34 | target_link_libraries(co_uring_options INTERFACE uring)
35 |
36 | add_executable(co-uring-server server/main.cpp)
37 | target_link_libraries(co-uring-server PRIVATE co_uring_options)
38 |
39 | add_executable(test-client test/client.cpp)
40 | target_link_libraries(test-client PRIVATE co_uring_options)
41 |
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 云微 4 | 5 | Permission is hereby
granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # co-uring-webserver 2 | 3 | Co-uring-webserver is a simple high performance Web Server written in cpp20 for learning: 4 | 5 | - io_uring support for non-blocking IO 6 | - write concurrent code using C++20 coroutine 7 | 8 | This repo also contains a series of learning materials for c++20 and io_uring, along with demos. 
9 | 10 | 本项目为 C++20 编写的 Web 服务器,可处理静态资源, 同时也包含了一些学习 c++20 与 io_uring 的相关资料: 11 | 12 | - 使用带有 IORING_OP_PROVIDE_BUFFERS 和 IORING_FEAT_FAST_POLL 的 io_uring 技术进行非阻塞 IO 请求,避免系统调用开销以及用户态-内核态内存复制; 13 | - 采用 c++20 std::coroutine 处理异步回调; 14 | - TODO: 线程池 15 | - TODO 16 | 17 | ## demos to learn C++20, io_uring and test benchmark 18 | 19 | - [io_uring_coroutine_echo_server.cpp](demo/io_uring_coroutine_echo_server.cpp) an echo server with liburing and cpp coroutine 20 | - [io_uring_echo_server.c](demo/io_uring_echo_server.c) an echo server with liburing in c 21 | - [io_uring_echo_server.cpp](demo/io_uring_echo_server.cpp) an echo server with liburing in cpp 22 | - [uring_server.c](demo/uring_server.c) a basic static server with liburing 23 | - [liburing_cat.c](demo/liburing_cat.c) a cat program with liburing 24 | - [uring_cat.c](demo/uring_cat.c) a cat program using io_uring syscall interface 25 | 26 | ## requirements 27 | 28 | - Linux 5.7 or higher with IORING_FEAT_FAST_POLL and IORING_OP_PROVIDE_BUFFERS required 29 | - gcc 10.0 or higher 30 | 31 | ## build 32 | 33 | ## Document in Chinese 34 | 35 | 这里也包含了一些在学习 c++20 与 io_uring 相关知识的时候,翻译和撰写的中文文档,以及对应的原型实现: 36 | 37 | - io_uring 从原理到动手实践: 关于 io_uring 的基本使用细节 38 | - [io_uring 从原理到动手实践 part1: 使用系统调用接口实现 cat 程序](document/io_uring-by-example/io_uring-by-example1.md) 39 | - [io_uring 从原理到动手实践 part2: liburing](document/io_uring-by-example/io_uring-by-example2.md) 40 | - [io_uring 从原理到动手实践 part3: 使用 liburing 实现的一个网络服务器](document/io_uring-by-example/io_uring-by-example3.md) TODO 41 | - c++20 协程: 42 | - TODO 43 | - [使用 c++20 协程与 io_uring 实现高性能web服务器 part1:从 echo server 开始](document/part1.md) 44 | 45 | ## benchmark 46 | 47 | - for echo server: see [document/part1.md](document/part1.md) 48 | - TODO 49 | 50 | ## reference 51 | 52 | repos: 53 | 54 | - the liburing library [github.com/axboe/liburing](https://github.com/axboe/liburing) 55 | - an echo server with liburing in c
[github.com/frevib/io_uring-echo-server](https://github.com/frevib/io_uring-echo-server) 56 | - a simple rust echo benchmark tester [github.com/haraldh/rust_echo_bench](https://github.com/haraldh/rust_echo_bench) 57 | - [github.com/lewissbaker/cppcoro](https://github.com/lewissbaker/cppcoro) 58 | - [github.com/facebookexperimental/libunifex](https://github.com/facebookexperimental/libunifex) 59 | - [github.com/netcan/asyncio](https://github.com/netcan/asyncio) 60 | 61 | articles: 62 | 63 | - cpp reference [en.cppreference.com/w/cpp/language/coroutines](https://en.cppreference.com/w/cpp/language/coroutines) 64 | - modernescpp [www.modernescpp.com/index.php/tag/coroutines](https://www.modernescpp.com/index.php/tag/coroutines) 65 | - TODO -------------------------------------------------------------------------------- /demo/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS ?= -Wall -O2 -D_GNU_SOURCE -luring -std=c++2a -fcoroutines 2 | all_targets = io_uring_echo_server io_uring_coroutine_echo_server 3 | build_dir = ../build/demo/ 4 | 5 | .PHONY: all clean liburing make_build io_uring_echo_server io_uring_coroutine_echo_server benchmark 6 | 7 | all: $(all_targets) 8 | 9 | clean: 10 | rm -rf $(build_dir) 11 | 12 | liburing: 13 | +$(MAKE) -C ./liburing 14 | 15 | # -p: ../build may not exist yet on a fresh checkout 16 | make_build: clean 17 | mkdir -p $(build_dir) 18 | 19 | io_uring_echo_server: make_build 20 | $(CXX) io_uring_echo_server.cpp -o $(build_dir)io_uring_echo_server ${CXXFLAGS} 21 | 22 | io_uring_coroutine_echo_server: make_build 23 | $(CXX) io_uring_coroutine_echo_server.cpp -o $(build_dir)io_uring_coroutine_echo_server ${CXXFLAGS} 24 | 25 | # the server binary is written to $(build_dir), not the current directory 26 | benchmark: io_uring_coroutine_echo_server 27 | $(build_dir)io_uring_coroutine_echo_server 8888 28 | ./test_echo_server.sh 8888 29 | 30 | -------------------------------------------------------------------------------- /demo/io_uring_coroutine_echo_server.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include
6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "liburing.h" 17 | 18 | #define MAX_CONNECTIONS 4096 19 | #define BACKLOG 512 20 | #define MAX_MESSAGE_LEN 2048 21 | #define BUFFERS_COUNT MAX_CONNECTIONS 22 | 23 | void add_accept(struct io_uring *ring, int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags); 24 | void add_socket_read(struct io_uring *ring, int fd, unsigned gid, size_t size, unsigned flags); 25 | void add_socket_write(struct io_uring *ring, int fd, __u16 bid, size_t size, unsigned flags); 26 | void add_provide_buf(struct io_uring *ring, __u16 bid, unsigned gid); 27 | 28 | enum { 29 | ACCEPT, 30 | READ, 31 | WRITE, 32 | PROV_BUF, 33 | }; 34 | 35 | struct conn_info { 36 | __u32 fd; 37 | __u16 type; 38 | __u16 bid; 39 | }; 40 | 41 | char bufs[BUFFERS_COUNT][MAX_MESSAGE_LEN] = {0}; 42 | constexpr int group_id = 1337; 43 | 44 | struct conn_task { 45 | struct promise_type 46 | { 47 | using Handle = std::coroutine_handle<promise_type>; // typed handle: from_promise() and promise() need the promise type 48 | conn_task get_return_object() 49 | { 50 | return conn_task{Handle::from_promise(*this)}; 51 | } 52 | std::suspend_always initial_suspend() noexcept { 53 | return {}; 54 | } 55 | std::suspend_never final_suspend() noexcept { return {}; } 56 | void return_void() noexcept {} 57 | void unhandled_exception() noexcept {} 58 | struct io_uring *ring; 59 | struct conn_info conn_info; 60 | size_t res; 61 | }; 62 | explicit conn_task(promise_type::Handle handler) : handler(handler) {} 63 | void destroy() { handler.destroy(); } 64 | conn_task(const conn_task &) = delete; 65 | conn_task &operator=(const conn_task &) = delete; 66 | conn_task(conn_task &&t) noexcept : handler(t.handler) { t.handler = {}; } 67 | conn_task &operator=(conn_task &&t) noexcept 68 | { 69 | if (this == &t) 70 | return *this; 71 | if (handler) 72 | handler.destroy(); 73 | handler = t.handler; 74 | t.handler = {}; 75 | return *this; 76 | } 77 |
promise_type::Handle handler; 78 | }; 79 | 80 | auto echo_read(size_t message_size, unsigned flags) { 81 | struct awaitable { 82 | bool await_ready() { return false; } 83 | void await_suspend(std::coroutine_handle<conn_task::promise_type> h) { 84 | auto &p = h.promise(); 85 | struct io_uring_sqe *sqe = io_uring_get_sqe(p.ring); 86 | io_uring_prep_recv(sqe, p.conn_info.fd, NULL, message_size, 0); 87 | io_uring_sqe_set_flags(sqe, flags); 88 | sqe->buf_group = group_id; 89 | p.conn_info.type = READ; 90 | memcpy(&sqe->user_data, &p.conn_info, sizeof(conn_info)); 91 | this->p = &p; 92 | } 93 | size_t await_resume() { 94 | return p->res; 95 | } 96 | size_t message_size; 97 | unsigned flags; 98 | conn_task::promise_type* p = NULL; 99 | }; 100 | return awaitable{message_size, flags}; 101 | } 102 | 103 | auto echo_write(size_t message_size, unsigned flags) { 104 | struct awaitable { 105 | bool await_ready() { return false; } 106 | void await_suspend(std::coroutine_handle<conn_task::promise_type> h) { 107 | auto &p = h.promise(); 108 | struct io_uring_sqe *sqe = io_uring_get_sqe(p.ring); 109 | io_uring_prep_send(sqe, p.conn_info.fd, &bufs[p.conn_info.bid], message_size, 0); 110 | io_uring_sqe_set_flags(sqe, flags); 111 | p.conn_info.type = WRITE; 112 | memcpy(&sqe->user_data, &p.conn_info, sizeof(conn_info)); 113 | } 114 | size_t await_resume() { 115 | return 0; 116 | } 117 | size_t message_size; 118 | unsigned flags; 119 | }; 120 | return awaitable{message_size, flags}; 121 | } 122 | 123 | auto echo_add_buffer() { 124 | struct awaitable { 125 | bool await_ready() { return false; } 126 | bool await_suspend(std::coroutine_handle<conn_task::promise_type> h) { 127 | auto &p = h.promise(); 128 | struct io_uring_sqe *sqe = io_uring_get_sqe(p.ring); 129 | io_uring_prep_provide_buffers(sqe, bufs[p.conn_info.bid], MAX_MESSAGE_LEN, 1, group_id, p.conn_info.bid); 130 | p.conn_info.type = PROV_BUF; 131 | memcpy(&sqe->user_data, &p.conn_info, sizeof(conn_info)); 132 | return false; // the SQE is only queued here: continue without suspending instead of resuming the coroutine from inside its own await_suspend 133 | } 134 | size_t await_resume() { 135 | return 0; 136 | } 137 | }; 138 |
return awaitable{}; 139 | } 140 | 141 | std::map connections; 142 | 143 | conn_task handle_echo(int fd) { 144 | while (true) { 145 | size_t size_r = co_await echo_read(MAX_MESSAGE_LEN, IOSQE_BUFFER_SELECT); 146 | if (size_r <= 0) { 147 | co_await echo_add_buffer(); 148 | shutdown(fd, SHUT_RDWR); 149 | connections.erase(fd); 150 | co_return; 151 | } 152 | co_await echo_write(size_r, 0); 153 | co_await echo_add_buffer(); 154 | } 155 | 156 | } 157 | 158 | int main(int argc, char *argv[]) { 159 | if (argc < 2) { 160 | printf("Please give a port number: ./io_uring_echo_server [port]\n"); 161 | exit(0); 162 | } 163 | 164 | // some variables we need 165 | int portno = strtol(argv[1], NULL, 10); 166 | struct sockaddr_in serv_addr, client_addr; 167 | socklen_t client_len = sizeof(client_addr); 168 | 169 | // setup socket 170 | int sock_listen_fd = socket(AF_INET, SOCK_STREAM, 0); 171 | const int val = 1; 172 | setsockopt(sock_listen_fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); 173 | 174 | memset(&serv_addr, 0, sizeof(serv_addr)); 175 | serv_addr.sin_family = AF_INET; 176 | serv_addr.sin_port = htons(portno); 177 | serv_addr.sin_addr.s_addr = INADDR_ANY; 178 | 179 | // bind and listen 180 | if (bind(sock_listen_fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { 181 | perror("Error binding socket...\n"); 182 | exit(1); 183 | } 184 | if (listen(sock_listen_fd, BACKLOG) < 0) { 185 | perror("Error listening on socket...\n"); 186 | exit(1); 187 | } 188 | printf("io_uring echo server listening for connections on port: %d\n", portno); 189 | 190 | // initialize io_uring 191 | struct io_uring_params params; 192 | struct io_uring ring; 193 | memset(¶ms, 0, sizeof(params)); 194 | 195 | if (io_uring_queue_init_params(2048, &ring, ¶ms) < 0) { 196 | perror("io_uring_init_failed...\n"); 197 | exit(1); 198 | } 199 | 200 | // check if IORING_FEAT_FAST_POLL is supported 201 | if (!(params.features & IORING_FEAT_FAST_POLL)) { 202 | printf("IORING_FEAT_FAST_POLL not available in 
the kernel, quiting...\n"); 203 | exit(0); 204 | } 205 | 206 | // check if buffer selection is supported 207 | struct io_uring_probe *probe; 208 | probe = io_uring_get_probe_ring(&ring); 209 | if (!probe || !io_uring_opcode_supported(probe, IORING_OP_PROVIDE_BUFFERS)) { 210 | printf("Buffer select not supported, skipping...\n"); 211 | exit(0); 212 | } 213 | free(probe); 214 | 215 | // register buffers for buffer selection 216 | struct io_uring_sqe *sqe; 217 | struct io_uring_cqe *cqe; 218 | 219 | sqe = io_uring_get_sqe(&ring); 220 | io_uring_prep_provide_buffers(sqe, bufs, MAX_MESSAGE_LEN, BUFFERS_COUNT, group_id, 0); 221 | 222 | io_uring_submit(&ring); 223 | io_uring_wait_cqe(&ring, &cqe); 224 | if (cqe->res < 0) { 225 | printf("cqe->res = %d\n", cqe->res); 226 | exit(1); 227 | } 228 | io_uring_cqe_seen(&ring, cqe); 229 | 230 | // add first accept SQE to monitor for new incoming connections 231 | add_accept(&ring, sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 232 | 233 | // start event loop 234 | while (1) { 235 | io_uring_submit_and_wait(&ring, 1); 236 | struct io_uring_cqe *cqe; 237 | unsigned head; 238 | unsigned count = 0; 239 | 240 | // go through all CQEs 241 | io_uring_for_each_cqe(&ring, head, cqe) { 242 | ++count; 243 | struct conn_info conn_i; 244 | memcpy(&conn_i, &cqe->user_data, sizeof(conn_i)); 245 | 246 | int type = conn_i.type; 247 | if (cqe->res == -ENOBUFS) { 248 | fprintf(stdout, "bufs in automatic buffer selection empty, this should not happen...\n"); 249 | fflush(stdout); 250 | exit(1); 251 | } else if (type == PROV_BUF) { 252 | if (cqe->res < 0) { 253 | printf("cqe->res = %d\n", cqe->res); 254 | exit(1); 255 | } 256 | } else if (type == ACCEPT) { 257 | int sock_conn_fd = cqe->res; 258 | // only read when there is no error, >= 0 259 | if (sock_conn_fd >= 0) { 260 | connections.emplace(sock_conn_fd, handle_echo(sock_conn_fd)); 261 | auto &h = connections.at(sock_conn_fd).handler; 262 | auto &p = h.promise(); 263 | 
p.conn_info.fd = sock_conn_fd; 264 | p.ring = &ring; 265 | h.resume(); 266 | } 267 | 268 | // new connected client; read data from socket and re-add accept to monitor for new connections 269 | add_accept(&ring, sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 270 | } else if (type == READ) { 271 | auto &h = connections.at(conn_i.fd).handler; 272 | auto &p = h.promise(); 273 | p.conn_info.bid = cqe->flags >> 16; 274 | p.res = cqe->res; 275 | h.resume(); 276 | } else if (type == WRITE) { 277 | auto &h = connections.at(conn_i.fd).handler; 278 | h.resume(); 279 | } 280 | } 281 | 282 | io_uring_cq_advance(&ring, count); 283 | } 284 | } 285 | 286 | void add_accept(struct io_uring *ring, int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags) { 287 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 288 | io_uring_prep_accept(sqe, fd, client_addr, client_len, 0); 289 | io_uring_sqe_set_flags(sqe, flags); 290 | 291 | conn_info conn_i = { 292 | .fd = (unsigned int)fd, 293 | .type = ACCEPT, 294 | .bid = 0 295 | }; 296 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 297 | } 298 | -------------------------------------------------------------------------------- /demo/io_uring_echo_server.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "liburing.h" 13 | 14 | #define MAX_CONNECTIONS 4096 15 | #define BACKLOG 512 16 | #define MAX_MESSAGE_LEN 2048 17 | #define BUFFERS_COUNT MAX_CONNECTIONS 18 | 19 | void add_accept(struct io_uring *ring, int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags); 20 | void add_socket_read(struct io_uring *ring, int fd, unsigned gid, size_t size, unsigned flags); 21 | void add_socket_write(struct io_uring *ring, int fd, __u16 bid, size_t size, unsigned flags); 22 | void add_provide_buf(struct io_uring *ring,
__u16 bid, unsigned gid); 23 | 24 | enum { 25 | ACCEPT, 26 | READ, 27 | WRITE, 28 | PROV_BUF, 29 | }; 30 | 31 | typedef struct conn_info { 32 | __u32 fd; 33 | __u16 type; 34 | __u16 bid; 35 | } conn_info; 36 | 37 | char bufs[BUFFERS_COUNT][MAX_MESSAGE_LEN] = {0}; 38 | int group_id = 1337; 39 | 40 | int main(int argc, char *argv[]) { 41 | if (argc < 2) { 42 | printf("Please give a port number: ./io_uring_echo_server [port]\n"); 43 | exit(0); 44 | } 45 | 46 | // some variables we need 47 | int portno = strtol(argv[1], NULL, 10); 48 | struct sockaddr_in serv_addr, client_addr; 49 | socklen_t client_len = sizeof(client_addr); 50 | 51 | // setup socket 52 | int sock_listen_fd = socket(AF_INET, SOCK_STREAM, 0); 53 | const int val = 1; 54 | setsockopt(sock_listen_fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); 55 | 56 | memset(&serv_addr, 0, sizeof(serv_addr)); 57 | serv_addr.sin_family = AF_INET; 58 | serv_addr.sin_port = htons(portno); 59 | serv_addr.sin_addr.s_addr = INADDR_ANY; 60 | 61 | // bind and listen 62 | if (bind(sock_listen_fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { 63 | perror("Error binding socket...\n"); 64 | exit(1); 65 | } 66 | if (listen(sock_listen_fd, BACKLOG) < 0) { 67 | perror("Error listening on socket...\n"); 68 | exit(1); 69 | } 70 | printf("io_uring echo server listening for connections on port: %d\n", portno); 71 | 72 | // initialize io_uring 73 | struct io_uring_params params; 74 | struct io_uring ring; 75 | memset(¶ms, 0, sizeof(params)); 76 | 77 | if (io_uring_queue_init_params(2048, &ring, ¶ms) < 0) { 78 | perror("io_uring_init_failed...\n"); 79 | exit(1); 80 | } 81 | 82 | // check if IORING_FEAT_FAST_POLL is supported 83 | if (!(params.features & IORING_FEAT_FAST_POLL)) { 84 | printf("IORING_FEAT_FAST_POLL not available in the kernel, quiting...\n"); 85 | exit(0); 86 | } 87 | 88 | // check if buffer selection is supported 89 | struct io_uring_probe *probe; 90 | probe = io_uring_get_probe_ring(&ring); 91 | if (!probe || 
!io_uring_opcode_supported(probe, IORING_OP_PROVIDE_BUFFERS)) { 92 | printf("Buffer select not supported, skipping...\n"); 93 | exit(0); 94 | } 95 | free(probe); 96 | 97 | // register buffers for buffer selection 98 | struct io_uring_sqe *sqe; 99 | struct io_uring_cqe *cqe; 100 | 101 | sqe = io_uring_get_sqe(&ring); 102 | io_uring_prep_provide_buffers(sqe, bufs, MAX_MESSAGE_LEN, BUFFERS_COUNT, group_id, 0); 103 | 104 | io_uring_submit(&ring); 105 | io_uring_wait_cqe(&ring, &cqe); 106 | if (cqe->res < 0) { 107 | printf("cqe->res = %d\n", cqe->res); 108 | exit(1); 109 | } 110 | io_uring_cqe_seen(&ring, cqe); 111 | 112 | // add first accept SQE to monitor for new incoming connections 113 | add_accept(&ring, sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 114 | 115 | // start event loop 116 | while (1) { 117 | io_uring_submit_and_wait(&ring, 1); 118 | struct io_uring_cqe *cqe; 119 | unsigned head; 120 | unsigned count = 0; 121 | 122 | // go through all CQEs 123 | io_uring_for_each_cqe(&ring, head, cqe) { 124 | ++count; 125 | struct conn_info conn_i; 126 | memcpy(&conn_i, &cqe->user_data, sizeof(conn_i)); 127 | 128 | int type = conn_i.type; 129 | if (cqe->res == -ENOBUFS) { 130 | fprintf(stdout, "bufs in automatic buffer selection empty, this should not happen...\n"); 131 | fflush(stdout); 132 | exit(1); 133 | } else if (type == PROV_BUF) { 134 | if (cqe->res < 0) { 135 | printf("cqe->res = %d\n", cqe->res); 136 | exit(1); 137 | } 138 | } else if (type == ACCEPT) { 139 | int sock_conn_fd = cqe->res; 140 | // only read when there is no error, >= 0 141 | if (sock_conn_fd >= 0) { 142 | add_socket_read(&ring, sock_conn_fd, group_id, MAX_MESSAGE_LEN, IOSQE_BUFFER_SELECT); 143 | } 144 | 145 | // new connected client; read data from socket and re-add accept to monitor for new connections 146 | add_accept(&ring, sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 147 | } else if (type == READ) { 148 | int bytes_read = cqe->res; 149 | int bid = 
cqe->flags >> 16; 150 | if (cqe->res <= 0) { 151 | // read failed, re-add the buffer 152 | add_provide_buf(&ring, bid, group_id); 153 | // connection closed or error 154 | shutdown(conn_i.fd, SHUT_RDWR); 155 | } else { 156 | // bytes have been read into bufs, now add write to socket sqe 157 | add_socket_write(&ring, conn_i.fd, bid, bytes_read, 0); 158 | } 159 | } else if (type == WRITE) { 160 | // write has been completed, first re-add the buffer 161 | add_provide_buf(&ring, conn_i.bid, group_id); 162 | // add a new read for the existing connection 163 | add_socket_read(&ring, conn_i.fd, group_id, MAX_MESSAGE_LEN, IOSQE_BUFFER_SELECT); 164 | } 165 | } 166 | 167 | io_uring_cq_advance(&ring, count); 168 | } 169 | } 170 | 171 | void add_accept(struct io_uring *ring, int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags) { 172 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 173 | io_uring_prep_accept(sqe, fd, client_addr, client_len, 0); 174 | io_uring_sqe_set_flags(sqe, flags); 175 | 176 | conn_info conn_i = { 177 | .fd = fd, 178 | .type = ACCEPT, 179 | }; 180 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 181 | } 182 | 183 | void add_socket_read(struct io_uring *ring, int fd, unsigned gid, size_t message_size, unsigned flags) { 184 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 185 | io_uring_prep_recv(sqe, fd, NULL, message_size, 0); 186 | io_uring_sqe_set_flags(sqe, flags); 187 | sqe->buf_group = gid; 188 | 189 | conn_info conn_i = { 190 | .fd = fd, 191 | .type = READ, 192 | }; 193 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 194 | } 195 | 196 | void add_socket_write(struct io_uring *ring, int fd, __u16 bid, size_t message_size, unsigned flags) { 197 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 198 | io_uring_prep_send(sqe, fd, &bufs[bid], message_size, 0); 199 | io_uring_sqe_set_flags(sqe, flags); 200 | 201 | conn_info conn_i = { 202 | .fd = fd, 203 | .type = WRITE, 204 | .bid = bid, 205 | }; 206 | 
memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 207 | } 208 | 209 | void add_provide_buf(struct io_uring *ring, __u16 bid, unsigned gid) { 210 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 211 | io_uring_prep_provide_buffers(sqe, bufs[bid], MAX_MESSAGE_LEN, 1, gid, bid); 212 | 213 | conn_info conn_i = { 214 | .fd = 0, 215 | .type = PROV_BUF, 216 | }; 217 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 218 | } 219 | -------------------------------------------------------------------------------- /demo/io_uring_echo_server.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "liburing.h" 13 | 14 | #define MAX_CONNECTIONS 4096 15 | #define BACKLOG 512 16 | #define MAX_MESSAGE_LEN 2048 17 | #define BUFFERS_COUNT MAX_CONNECTIONS 18 | 19 | void add_accept(struct io_uring *ring, int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags); 20 | void add_socket_read(struct io_uring *ring, int fd, unsigned gid, size_t size, unsigned flags); 21 | void add_socket_write(struct io_uring *ring, int fd, __u16 bid, size_t size, unsigned flags); 22 | void add_provide_buf(struct io_uring *ring, __u16 bid, unsigned gid); 23 | 24 | enum { 25 | ACCEPT, 26 | READ, 27 | WRITE, 28 | PROV_BUF, 29 | }; 30 | 31 | typedef struct conn_info { 32 | __u32 fd; 33 | __u16 type; 34 | __u16 bid; 35 | } conn_info; 36 | 37 | char bufs[BUFFERS_COUNT][MAX_MESSAGE_LEN] = {0}; 38 | int group_id = 1337; 39 | 40 | int main(int argc, char *argv[]) { 41 | if (argc < 2) { 42 | printf("Please give a port number: ./io_uring_echo_server [port]\n"); 43 | exit(0); 44 | } 45 | 46 | // some variables we need 47 | int portno = strtol(argv[1], NULL, 10); 48 | struct sockaddr_in serv_addr, client_addr; 49 | socklen_t client_len = sizeof(client_addr); 50 | 51 | // setup socket 52 | int sock_listen_fd = 
socket(AF_INET, SOCK_STREAM, 0); 53 | const int val = 1; 54 | setsockopt(sock_listen_fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); 55 | 56 | memset(&serv_addr, 0, sizeof(serv_addr)); 57 | serv_addr.sin_family = AF_INET; 58 | serv_addr.sin_port = htons(portno); 59 | serv_addr.sin_addr.s_addr = INADDR_ANY; 60 | 61 | // bind and listen 62 | if (bind(sock_listen_fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { 63 | perror("Error binding socket...\n"); 64 | exit(1); 65 | } 66 | if (listen(sock_listen_fd, BACKLOG) < 0) { 67 | perror("Error listening on socket...\n"); 68 | exit(1); 69 | } 70 | printf("io_uring echo server listening for connections on port: %d\n", portno); 71 | 72 | // initialize io_uring 73 | struct io_uring_params params; 74 | struct io_uring ring; 75 | memset(¶ms, 0, sizeof(params)); 76 | 77 | if (io_uring_queue_init_params(2048, &ring, ¶ms) < 0) { 78 | perror("io_uring_init_failed...\n"); 79 | exit(1); 80 | } 81 | 82 | // check if IORING_FEAT_FAST_POLL is supported 83 | if (!(params.features & IORING_FEAT_FAST_POLL)) { 84 | printf("IORING_FEAT_FAST_POLL not available in the kernel, quiting...\n"); 85 | exit(0); 86 | } 87 | 88 | // check if buffer selection is supported 89 | struct io_uring_probe *probe; 90 | probe = io_uring_get_probe_ring(&ring); 91 | if (!probe || !io_uring_opcode_supported(probe, IORING_OP_PROVIDE_BUFFERS)) { 92 | printf("Buffer select not supported, skipping...\n"); 93 | exit(0); 94 | } 95 | free(probe); 96 | 97 | // register buffers for buffer selection 98 | struct io_uring_sqe *sqe; 99 | struct io_uring_cqe *cqe; 100 | 101 | sqe = io_uring_get_sqe(&ring); 102 | io_uring_prep_provide_buffers(sqe, bufs, MAX_MESSAGE_LEN, BUFFERS_COUNT, group_id, 0); 103 | 104 | io_uring_submit(&ring); 105 | io_uring_wait_cqe(&ring, &cqe); 106 | if (cqe->res < 0) { 107 | printf("cqe->res = %d\n", cqe->res); 108 | exit(1); 109 | } 110 | io_uring_cqe_seen(&ring, cqe); 111 | 112 | // add first accept SQE to monitor for new incoming 
connections 113 | add_accept(&ring, sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 114 | 115 | // start event loop 116 | while (1) { 117 | io_uring_submit_and_wait(&ring, 1); 118 | struct io_uring_cqe *cqe; 119 | unsigned head; 120 | unsigned count = 0; 121 | 122 | // go through all CQEs 123 | io_uring_for_each_cqe(&ring, head, cqe) { 124 | ++count; 125 | struct conn_info conn_i; 126 | memcpy(&conn_i, &cqe->user_data, sizeof(conn_i)); 127 | 128 | int type = conn_i.type; 129 | if (cqe->res == -ENOBUFS) { 130 | fprintf(stdout, "bufs in automatic buffer selection empty, this should not happen...\n"); 131 | fflush(stdout); 132 | exit(1); 133 | } else if (type == PROV_BUF) { 134 | if (cqe->res < 0) { 135 | printf("cqe->res = %d\n", cqe->res); 136 | exit(1); 137 | } 138 | } else if (type == ACCEPT) { 139 | int sock_conn_fd = cqe->res; 140 | // only read when there is no error, >= 0 141 | if (sock_conn_fd >= 0) { 142 | add_socket_read(&ring, sock_conn_fd, group_id, MAX_MESSAGE_LEN, IOSQE_BUFFER_SELECT); 143 | } 144 | 145 | // new connected client; read data from socket and re-add accept to monitor for new connections 146 | add_accept(&ring, sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 147 | } else if (type == READ) { 148 | int bytes_read = cqe->res; 149 | int bid = cqe->flags >> 16; 150 | if (cqe->res <= 0) { 151 | // read failed, re-add the buffer 152 | add_provide_buf(&ring, bid, group_id); 153 | // connection closed or error 154 | shutdown(conn_i.fd, SHUT_RDWR); 155 | } else { 156 | // bytes have been read into bufs, now add write to socket sqe 157 | add_socket_write(&ring, conn_i.fd, bid, bytes_read, 0); 158 | } 159 | } else if (type == WRITE) { 160 | // write has been completed, first re-add the buffer 161 | add_provide_buf(&ring, conn_i.bid, group_id); 162 | // add a new read for the existing connection 163 | add_socket_read(&ring, conn_i.fd, group_id, MAX_MESSAGE_LEN, IOSQE_BUFFER_SELECT); 164 | } 165 | } 166 | 167 | 
io_uring_cq_advance(&ring, count); 168 | } 169 | } 170 | 171 | void add_accept(struct io_uring *ring, int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags) { 172 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 173 | io_uring_prep_accept(sqe, fd, client_addr, client_len, 0); 174 | io_uring_sqe_set_flags(sqe, flags); 175 | 176 | conn_info conn_i = { 177 | .fd = fd, 178 | .type = ACCEPT, 179 | }; 180 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 181 | } 182 | 183 | void add_socket_read(struct io_uring *ring, int fd, unsigned gid, size_t message_size, unsigned flags) { 184 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 185 | io_uring_prep_recv(sqe, fd, NULL, message_size, 0); 186 | io_uring_sqe_set_flags(sqe, flags); 187 | sqe->buf_group = gid; 188 | 189 | conn_info conn_i = { 190 | .fd = fd, 191 | .type = READ, 192 | }; 193 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 194 | } 195 | 196 | void add_socket_write(struct io_uring *ring, int fd, __u16 bid, size_t message_size, unsigned flags) { 197 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 198 | io_uring_prep_send(sqe, fd, &bufs[bid], message_size, 0); 199 | io_uring_sqe_set_flags(sqe, flags); 200 | 201 | conn_info conn_i = { 202 | .fd = fd, 203 | .type = WRITE, 204 | .bid = bid, 205 | }; 206 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 207 | } 208 | 209 | void add_provide_buf(struct io_uring *ring, __u16 bid, unsigned gid) { 210 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 211 | io_uring_prep_provide_buffers(sqe, bufs[bid], MAX_MESSAGE_LEN, 1, gid, bid); 212 | 213 | conn_info conn_i = { 214 | .fd = 0, 215 | .type = PROV_BUF, 216 | }; 217 | memcpy(&sqe->user_data, &conn_i, sizeof(conn_i)); 218 | } 219 | -------------------------------------------------------------------------------- /demo/liburing_cat.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 
/*
 * Return the size in bytes of the file behind the open descriptor `fd`.
 *
 * Regular files report st_size; block devices are queried with the
 * BLKGETSIZE64 ioctl. Any other file type, or a failing fstat()/ioctl(),
 * yields -1 (the failing call is reported via perror()).
 */
off_t get_file_size(int fd) {
    struct stat info;

    if (fstat(fd, &info) < 0) {
        perror("fstat");
        return -1;
    }

    if (S_ISREG(info.st_mode))
        return info.st_size;

    if (!S_ISBLK(info.st_mode))
        return -1;

    unsigned long long device_bytes;
    if (ioctl(fd, BLKGETSIZE64, &device_bytes) != 0) {
        perror("ioctl");
        return -1;
    }
    return device_bytes;
}

/*
 * Write `len` bytes starting at `buf` to stdout, one character at a time.
 * stdout is buffered by stdio, so the per-character calls stay cheap.
 */
void output_to_console(char *buf, int len) {
    for (; len; --len)
        fputc(*buf++, stdout);
}
56 | * */ 57 | 58 | int get_completion_and_print(struct io_uring *ring) { 59 | struct io_uring_cqe *cqe; 60 | int ret = io_uring_wait_cqe(ring, &cqe); 61 | if (ret < 0) { 62 | perror("io_uring_wait_cqe"); 63 | return 1; 64 | } 65 | if (cqe->res < 0) { 66 | fprintf(stderr, "Async readv failed.\n"); 67 | return 1; 68 | } 69 | struct file_info *fi = io_uring_cqe_get_data(cqe); 70 | int blocks = (int) fi->file_sz / BLOCK_SZ; 71 | if (fi->file_sz % BLOCK_SZ) blocks++; 72 | for (int i = 0; i < blocks; i ++) 73 | output_to_console(fi->iovecs[i].iov_base, fi->iovecs[i].iov_len); 74 | 75 | io_uring_cqe_seen(ring, cqe); 76 | return 0; 77 | } 78 | 79 | /* 80 | * Submit the readv request via liburing 81 | * */ 82 | 83 | int submit_read_request(char *file_path, struct io_uring *ring) { 84 | int file_fd = open(file_path, O_RDONLY); 85 | if (file_fd < 0) { 86 | perror("open"); 87 | return 1; 88 | } 89 | off_t file_sz = get_file_size(file_fd); 90 | off_t bytes_remaining = file_sz; 91 | off_t offset = 0; 92 | int current_block = 0; 93 | int blocks = (int) file_sz / BLOCK_SZ; 94 | if (file_sz % BLOCK_SZ) blocks++; 95 | struct file_info *fi = malloc(sizeof(*fi) + 96 | (sizeof(struct iovec) * blocks)); 97 | 98 | /* 99 | * For each block of the file we need to read, we allocate an iovec struct 100 | * which is indexed into the iovecs array. This array is passed in as part 101 | * of the submission. If you don't understand this, then you need to look 102 | * up how the readv() and writev() system calls work. 
103 | * */ 104 | while (bytes_remaining) { 105 | off_t bytes_to_read = bytes_remaining; 106 | if (bytes_to_read > BLOCK_SZ) 107 | bytes_to_read = BLOCK_SZ; 108 | 109 | offset += bytes_to_read; 110 | fi->iovecs[current_block].iov_len = bytes_to_read; 111 | 112 | void *buf; 113 | if( posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) { 114 | perror("posix_memalign"); 115 | return 1; 116 | } 117 | fi->iovecs[current_block].iov_base = buf; 118 | 119 | current_block++; 120 | bytes_remaining -= bytes_to_read; 121 | } 122 | fi->file_sz = file_sz; 123 | 124 | /* Get an SQE */ 125 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 126 | /* Setup a readv operation */ 127 | io_uring_prep_readv(sqe, file_fd, fi->iovecs, blocks, 0); 128 | /* Set user data */ 129 | io_uring_sqe_set_data(sqe, fi); 130 | /* Finally, submit the request */ 131 | io_uring_submit(ring); 132 | 133 | return 0; 134 | } 135 | 136 | int main(int argc, char *argv[]) { 137 | struct io_uring ring; 138 | 139 | if (argc < 2) { 140 | fprintf(stderr, "Usage: %s [file name] <[file name] ...>\n", 141 | argv[0]); 142 | return 1; 143 | } 144 | 145 | /* Initialize io_uring */ 146 | io_uring_queue_init(QUEUE_DEPTH, &ring, 0); 147 | 148 | for (int i = 1; i < argc; i++) { 149 | int ret = submit_read_request(argv[i], &ring); 150 | if (ret) { 151 | fprintf(stderr, "Error reading file: %s\n", argv[i]); 152 | return 1; 153 | } 154 | get_completion_and_print(&ring); 155 | } 156 | 157 | /* Call the clean-up function. 
#!/bin/bash
# Benchmark driver for an echo server that is already listening on a TCP port.
#
# Usage: test_echo_server.sh [port]
#
# Pins the server process to CPU 0, then runs the cargo-based benchmark
# client from the current directory for every combination of message size
# and connection count, pausing between runs.
uname -a

if [ "$#" -ne 1 ]; then
    echo "Please give port where echo server is running: $0 [port]" >&2
    # A usage error must not report success to the caller.
    exit 1
fi

port="$1"

# Second line of the lsof output is the first process row after the header;
# its second column is the PID.
PID=$(lsof -itcp:"$port" | sed -n -e 2p | awk '{print $2}')
if [ -z "$PID" ]; then
    echo "No process found listening on TCP port $port" >&2
    exit 1
fi
taskset -cp 0 "$PID"

for bytes in 1 128 512 1000
do
    for connections in 1 50 150 300 500
    do
        cargo run --release -- --address "localhost:$port" --number "$connections" --duration 60 --length "$bytes"
        sleep 4
    done
done
}; 53 | 54 | /* 55 | * 这段代码是在没有io_uring相关系统调用的年代写的 56 | * 标准 C 库的一部分。所以,我们推出自己的系统调用包装器. 57 | * */ 58 | 59 | int io_uring_setup(unsigned entries, struct io_uring_params *p) 60 | { 61 | return (int) syscall(__NR_io_uring_setup, entries, p); 62 | } 63 | 64 | int io_uring_enter(int ring_fd, unsigned int to_submit, 65 | unsigned int min_complete, unsigned int flags) 66 | { 67 | return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete, 68 | flags, NULL, 0); 69 | } 70 | 71 | /* 72 | * 返回传入其打开文件描述符的文件的大小。 73 | * 正确处理常规文件和块设备。 74 | * */ 75 | 76 | off_t get_file_size(int fd) { 77 | struct stat st; 78 | 79 | if(fstat(fd, &st) < 0) { 80 | perror("fstat"); 81 | return -1; 82 | } 83 | if (S_ISBLK(st.st_mode)) { 84 | unsigned long long bytes; 85 | if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) { 86 | perror("ioctl"); 87 | return -1; 88 | } 89 | return bytes; 90 | } else if (S_ISREG(st.st_mode)) 91 | return st.st_size; 92 | 93 | return -1; 94 | } 95 | 96 | /* 97 | * io_uring 需要很多设置,看起来很麻烦 98 | * 所以 io_uring 的作者创建了 liburing,比较好用。 99 | * 但是,您应该花时间了解此代码。 100 | * */ 101 | 102 | int app_setup_uring(struct submitter *s) { 103 | struct app_io_sq_ring *sring = &s->sq_ring; 104 | struct app_io_cq_ring *cring = &s->cq_ring; 105 | struct io_uring_params p; 106 | void *sq_ptr, *cq_ptr; 107 | 108 | /* 109 | * 我们需要将 io_uring_params 结构体传递给 io_uring_setup() 去置0初始化。 110 | * 我们可以设置任何想要的标记。 111 | * */ 112 | memset(&p, 0, sizeof(p)); 113 | s->ring_fd = io_uring_setup(QUEUE_DEPTH, &p); 114 | if (s->ring_fd < 0) { 115 | perror("io_uring_setup"); 116 | return 1; 117 | } 118 | 119 | /* 120 | * io_uring 通信通过 2 个共享的内核用户空间环形缓冲区进行, 121 | * 可以在内核中通过 mmap() 调用映射。 122 | * 虽然完成队列是直接映射进去的, 但提交队列里面有个数组,我们也把它映射进* 去 123 | * */ 124 | 125 | int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned); 126 | int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe); 127 | 128 | /* 在内核版本 5.4 及以上, 129 | * 可以使用单个 mmap() 调用同时完成两个缓冲区的映射。 130 | * 关于内核版本,可以检查 io_uring_params 的字段,并使用 mask 获取。 
131 | * 如果 IORING_FEAT_SINGLE_MMAP 已设置,我们可以不用第二个 mmap() 去映* 射。 132 | * */ 133 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 134 | if (cring_sz > sring_sz) { 135 | sring_sz = cring_sz; 136 | } 137 | cring_sz = sring_sz; 138 | } 139 | 140 | /* 在提交和完成队列环形缓冲区中映射。 141 | * 不过,较旧的内核仅映射到提交队列中。 142 | * */ 143 | sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE, 144 | MAP_SHARED | MAP_POPULATE, 145 | s->ring_fd, IORING_OFF_SQ_RING); 146 | if (sq_ptr == MAP_FAILED) { 147 | perror("mmap"); 148 | return 1; 149 | } 150 | 151 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 152 | cq_ptr = sq_ptr; 153 | } else { 154 | /* 分别映射到旧内核中的完成队列环形缓冲区 */ 155 | cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE, 156 | MAP_SHARED | MAP_POPULATE, 157 | s->ring_fd, IORING_OFF_CQ_RING); 158 | if (cq_ptr == MAP_FAILED) { 159 | perror("mmap"); 160 | return 1; 161 | } 162 | } 163 | 164 | /* 将有用的字段保存在全局 app_io_sq_ring 结构中以备后用 165 | * 简单的一个参考 */ 166 | sring->head = sq_ptr + p.sq_off.head; 167 | sring->tail = sq_ptr + p.sq_off.tail; 168 | sring->ring_mask = sq_ptr + p.sq_off.ring_mask; 169 | sring->ring_entries = sq_ptr + p.sq_off.ring_entries; 170 | sring->flags = sq_ptr + p.sq_off.flags; 171 | sring->array = sq_ptr + p.sq_off.array; 172 | 173 | /* 映射到提交队列条目数组 */ 174 | s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), 175 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, 176 | s->ring_fd, IORING_OFF_SQES); 177 | if (s->sqes == MAP_FAILED) { 178 | perror("mmap"); 179 | return 1; 180 | } 181 | 182 | /* 将有用的字段保存在全局 app_io_cq_ring 结构中以备后用 183 | * 简单参考 */ 184 | cring->head = cq_ptr + p.cq_off.head; 185 | cring->tail = cq_ptr + p.cq_off.tail; 186 | cring->ring_mask = cq_ptr + p.cq_off.ring_mask; 187 | cring->ring_entries = cq_ptr + p.cq_off.ring_entries; 188 | cring->cqes = cq_ptr + p.cq_off.cqes; 189 | 190 | return 0; 191 | } 192 | 193 | /* 194 | * 输出长度为 len 的字符串到 stdout 195 | * 我们在这里使用缓冲输出以提高效率, 196 | * 因为我们需要逐个字符地输出。 197 | * */ 198 | void output_to_console(char *buf, int len) { 199 | while 
(len--) { 200 | fputc(*buf++, stdout); 201 | } 202 | } 203 | 204 | /* 205 | * 从完成队列中读取。 206 | * 在这个函数中,我们从完成队列中读取完成事件, 207 | * 得到包含文件数据并将其打印到控制台的数据缓冲区。 208 | * */ 209 | 210 | void read_from_cq(struct submitter *s) { 211 | struct file_info *fi; 212 | struct app_io_cq_ring *cring = &s->cq_ring; 213 | struct io_uring_cqe *cqe; 214 | unsigned head, reaped = 0; 215 | 216 | head = *cring->head; 217 | 218 | do { 219 | read_barrier(); 220 | /* 221 | * 请记住,这是一个环形缓冲区。如果头==尾,则表示 222 | * 缓冲区为空。 223 | * */ 224 | if (head == *cring->tail) 225 | break; 226 | 227 | /* 获取条目 */ 228 | cqe = &cring->cqes[head & *s->cq_ring.ring_mask]; 229 | fi = (struct file_info*) cqe->user_data; 230 | if (cqe->res < 0) 231 | fprintf(stderr, "Error: %s\n", strerror(abs(cqe->res))); 232 | 233 | int blocks = (int) fi->file_sz / BLOCK_SZ; 234 | if (fi->file_sz % BLOCK_SZ) blocks++; 235 | 236 | for (int i = 0; i < blocks; i++) 237 | output_to_console(fi->iovecs[i].iov_base, fi->iovecs[i].iov_len); 238 | 239 | head++; 240 | } while (1); 241 | 242 | *cring->head = head; 243 | write_barrier(); 244 | } 245 | /* 246 | * 提交到提交队列。 247 | * 在这个函数中,我们将请求提交到提交队列。你可以提交 248 | * 我们的将是 readv() 请求,通过 IORING_OP_READV 指定。 249 | * 250 | * */ 251 | int submit_to_sq(char *file_path, struct submitter *s) { 252 | struct file_info *fi; 253 | 254 | int file_fd = open(file_path, O_RDONLY); 255 | if (file_fd < 0 ) { 256 | perror("open"); 257 | return 1; 258 | } 259 | 260 | struct app_io_sq_ring *sring = &s->sq_ring; 261 | unsigned index = 0, current_block = 0, tail = 0, next_tail = 0; 262 | 263 | off_t file_sz = get_file_size(file_fd); 264 | if (file_sz < 0) 265 | return 1; 266 | off_t bytes_remaining = file_sz; 267 | int blocks = (int) file_sz / BLOCK_SZ; 268 | if (file_sz % BLOCK_SZ) blocks++; 269 | 270 | fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks); 271 | if (!fi) { 272 | fprintf(stderr, "Unable to allocate memory\n"); 273 | return 1; 274 | } 275 | fi->file_sz = file_sz; 276 | 277 | /* 278 | * 
对于我们需要读取的文件的每个块,我们分配一个iovec struct 279 | * 索引到 iovecs 数组中。这个数组作为一部分提交传入。 280 | * 如果你不明白这一点,那么你需要去 281 | * 了解一下 readv() 和 writev() 系统调用的工作方式。 282 | * */ 283 | while (bytes_remaining) { 284 | off_t bytes_to_read = bytes_remaining; 285 | if (bytes_to_read > BLOCK_SZ) 286 | bytes_to_read = BLOCK_SZ; 287 | 288 | fi->iovecs[current_block].iov_len = bytes_to_read; 289 | 290 | void *buf; 291 | if( posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) { 292 | perror("posix_memalign"); 293 | return 1; 294 | } 295 | fi->iovecs[current_block].iov_base = buf; 296 | 297 | current_block++; 298 | bytes_remaining -= bytes_to_read; 299 | } 300 | 301 | /* 将我们的提交队列条目添加到 SQE 环形缓冲区的尾部 */ 302 | next_tail = tail = *sring->tail; 303 | next_tail++; 304 | read_barrier(); 305 | index = tail & *s->sq_ring.ring_mask; 306 | struct io_uring_sqe *sqe = &s->sqes[index]; 307 | sqe->fd = file_fd; 308 | sqe->flags = 0; 309 | sqe->opcode = IORING_OP_READV; 310 | sqe->addr = (unsigned long) fi->iovecs; 311 | sqe->len = blocks; 312 | sqe->off = 0; 313 | sqe->user_data = (unsigned long long) fi; 314 | sring->array[index] = index; 315 | tail = next_tail; 316 | 317 | /* 更新尾部以便内核可以看到它 */ 318 | if(*sring->tail != tail) { 319 | *sring->tail = tail; 320 | write_barrier(); 321 | } 322 | 323 | /* 324 | * 告诉内核我们已经用 io_uring_enter() 提交了事件。 325 | * 们还传入了 IOURING_ENTER_GETEVENTS 标志,这会导致 326 | * io_uring_enter() 调用等待 min_complete 事件完成后返回。 327 | * */ 328 | int ret = io_uring_enter(s->ring_fd, 1,1, 329 | IORING_ENTER_GETEVENTS); 330 | if(ret < 0) { 331 | perror("io_uring_enter"); 332 | return 1; 333 | } 334 | 335 | return 0; 336 | } 337 | 338 | int main(int argc, char *argv[]) { 339 | struct submitter *s; 340 | 341 | if (argc < 2) { 342 | fprintf(stderr, "Usage: %s \n", argv[0]); 343 | return 1; 344 | } 345 | 346 | s = malloc(sizeof(*s)); 347 | if (!s) { 348 | perror("malloc"); 349 | return 1; 350 | } 351 | memset(s, 0, sizeof(*s)); 352 | 353 | if(app_setup_uring(s)) { 354 | fprintf(stderr, "Unable to setup uring!\n"); 355 | return 
1; 356 | } 357 | 358 | for (int i = 1; i < argc; i++) { 359 | if(submit_to_sq(argv[i], s)) { 360 | fprintf(stderr, "Error reading file\n"); 361 | return 1; 362 | } 363 | read_from_cq(s); 364 | } 365 | 366 | return 0; 367 | } 368 | -------------------------------------------------------------------------------- /demo/uring_server.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define SERVER_STRING "Server: zerohttpd/0.1\r\n" 13 | #define DEFAULT_SERVER_PORT 8000 14 | #define QUEUE_DEPTH 256 15 | #define READ_SZ 8192 16 | 17 | #define EVENT_TYPE_ACCEPT 0 18 | #define EVENT_TYPE_READ 1 19 | #define EVENT_TYPE_WRITE 2 20 | 21 | struct request { 22 | int event_type; 23 | int iovec_count; 24 | int client_socket; 25 | struct iovec iov[]; 26 | }; 27 | 28 | struct io_uring ring; 29 | 30 | const char *unimplemented_content = \ 31 | "HTTP/1.0 400 Bad Request\r\n" 32 | "Content-type: text/html\r\n" 33 | "\r\n" 34 | "" 35 | "" 36 | "ZeroHTTPd: Unimplemented" 37 | "" 38 | "" 39 | "

Bad Request (Unimplemented)

" 40 | "

Your client sent a request ZeroHTTPd did not understand and it is probably not your fault.

" 41 | "" 42 | ""; 43 | 44 | const char *http_404_content = \ 45 | "HTTP/1.0 404 Not Found\r\n" 46 | "Content-type: text/html\r\n" 47 | "\r\n" 48 | "" 49 | "" 50 | "ZeroHTTPd: Not Found" 51 | "" 52 | "" 53 | "

Not Found (404)

" 54 | "

Your client is asking for an object that was not found on this server.

/*
 * Utility function to convert a string to lower case, in place.
 */
void strtolower(char *str) {
    for (; *str; ++str) {
        /* tolower() is only defined for values representable as unsigned
         * char (or EOF); a plain char may be negative for bytes >= 0x80. */
        *str = (char) tolower((unsigned char) *str);
    }
}

/*
 * Print the name of the failing system call together with the error
 * details, then exit with status 1.
 */
void fatal_error(const char *syscall) {
    perror(syscall);
    exit(1);
}

/*
 * malloc() wrapper that never returns NULL: it prints a message and exits
 * with status 1 when the allocation fails.
 */
void *zh_malloc(size_t size) {
    void *buf = malloc(size);
    if (!buf) {
        fprintf(stderr, "Fatal error: unable to allocate memory.\n");
        exit(1);
    }
    return buf;
}
115 | * */ 116 | if (bind(sock, 117 | (const struct sockaddr *)&srv_addr, 118 | sizeof(srv_addr)) < 0) 119 | fatal_error("bind()"); 120 | 121 | if (listen(sock, 10) < 0) 122 | fatal_error("listen()"); 123 | 124 | return (sock); 125 | } 126 | 127 | int add_accept_request(int server_socket, struct sockaddr_in *client_addr, 128 | socklen_t *client_addr_len) { 129 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 130 | io_uring_prep_accept(sqe, server_socket, (struct sockaddr *) client_addr, 131 | client_addr_len, 0); 132 | struct request *req = malloc(sizeof(*req)); 133 | req->event_type = EVENT_TYPE_ACCEPT; 134 | io_uring_sqe_set_data(sqe, req); 135 | io_uring_submit(&ring); 136 | 137 | return 0; 138 | } 139 | 140 | int add_read_request(int client_socket) { 141 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 142 | struct request *req = malloc(sizeof(*req) + sizeof(struct iovec)); 143 | req->iov[0].iov_base = malloc(READ_SZ); 144 | req->iov[0].iov_len = READ_SZ; 145 | req->event_type = EVENT_TYPE_READ; 146 | req->client_socket = client_socket; 147 | memset(req->iov[0].iov_base, 0, READ_SZ); 148 | /* Linux kernel 5.5 has support for readv, but not for recv() or read() */ 149 | io_uring_prep_readv(sqe, client_socket, &req->iov[0], 1, 0); 150 | io_uring_sqe_set_data(sqe, req); 151 | io_uring_submit(&ring); 152 | return 0; 153 | } 154 | 155 | int add_write_request(struct request *req) { 156 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 157 | req->event_type = EVENT_TYPE_WRITE; 158 | io_uring_prep_writev(sqe, req->client_socket, req->iov, req->iovec_count, 0); 159 | io_uring_sqe_set_data(sqe, req); 160 | io_uring_submit(&ring); 161 | return 0; 162 | } 163 | 164 | void _send_static_string_content(const char *str, int client_socket) { 165 | struct request *req = zh_malloc(sizeof(*req) + sizeof(struct iovec)); 166 | unsigned long slen = strlen(str); 167 | req->iovec_count = 1; 168 | req->client_socket = client_socket; 169 | req->iov[0].iov_base = 
zh_malloc(slen); 170 | req->iov[0].iov_len = slen; 171 | memcpy(req->iov[0].iov_base, str, slen); 172 | add_write_request(req); 173 | } 174 | 175 | /* 176 | * When ZeroHTTPd encounters any other HTTP method other than GET or POST, this function 177 | * is used to inform the client. 178 | * */ 179 | 180 | void handle_unimplemented_method(int client_socket) { 181 | _send_static_string_content(unimplemented_content, client_socket); 182 | } 183 | 184 | /* 185 | * This function is used to send a "HTTP Not Found" code and message to the client in 186 | * case the file requested is not found. 187 | * */ 188 | 189 | void handle_http_404(int client_socket) { 190 | _send_static_string_content(http_404_content, client_socket); 191 | } 192 | 193 | /* 194 | * Once a static file is identified to be served, this function is used to read the file 195 | * and write it over the client socket using Linux's sendfile() system call. This saves us 196 | * the hassle of transferring file buffers from kernel to user space and back. 197 | * */ 198 | 199 | void copy_file_contents(char *file_path, off_t file_size, struct iovec *iov) { 200 | int fd; 201 | 202 | char *buf = zh_malloc(file_size); 203 | fd = open(file_path, O_RDONLY); 204 | if (fd < 0) 205 | fatal_error("open"); 206 | 207 | /* We should really check for short reads here */ 208 | int ret = read(fd, buf, file_size); 209 | if (ret < file_size) { 210 | fprintf(stderr, "Encountered a short read.\n"); 211 | } 212 | close(fd); 213 | 214 | iov->iov_base = buf; 215 | iov->iov_len = file_size; 216 | } 217 | 218 | /* 219 | * Simple function to get the file extension of the file that we are about to serve. 
220 | * */ 221 | 222 | const char *get_filename_ext(const char *filename) { 223 | const char *dot = strrchr(filename, '.'); 224 | if (!dot || dot == filename) 225 | return ""; 226 | return dot + 1; 227 | } 228 | 229 | /* 230 | * Sends the HTTP 200 OK header, the server string, for a few types of files, it can also 231 | * send the content type based on the file extension. It also sends the content length 232 | * header. Finally it send a '\r\n' in a line by itself signalling the end of headers 233 | * and the beginning of any content. 234 | * */ 235 | 236 | void send_headers(const char *path, off_t len, struct iovec *iov) { 237 | char small_case_path[1024]; 238 | char send_buffer[1024]; 239 | strcpy(small_case_path, path); 240 | strtolower(small_case_path); 241 | 242 | char *str = "HTTP/1.0 200 OK\r\n"; 243 | unsigned long slen = strlen(str); 244 | iov[0].iov_base = zh_malloc(slen); 245 | iov[0].iov_len = slen; 246 | memcpy(iov[0].iov_base, str, slen); 247 | 248 | slen = strlen(SERVER_STRING); 249 | iov[1].iov_base = zh_malloc(slen); 250 | iov[1].iov_len = slen; 251 | memcpy(iov[1].iov_base, SERVER_STRING, slen); 252 | 253 | /* 254 | * Check the file extension for certain common types of files 255 | * on web pages and send the appropriate content-type header. 256 | * Since extensions can be mixed case like JPG, jpg or Jpg, 257 | * we turn the extension into lower case before checking. 
258 | * */ 259 | const char *file_ext = get_filename_ext(small_case_path); 260 | if (strcmp("jpg", file_ext) == 0) 261 | strcpy(send_buffer, "Content-Type: image/jpeg\r\n"); 262 | if (strcmp("jpeg", file_ext) == 0) 263 | strcpy(send_buffer, "Content-Type: image/jpeg\r\n"); 264 | if (strcmp("png", file_ext) == 0) 265 | strcpy(send_buffer, "Content-Type: image/png\r\n"); 266 | if (strcmp("gif", file_ext) == 0) 267 | strcpy(send_buffer, "Content-Type: image/gif\r\n"); 268 | if (strcmp("htm", file_ext) == 0) 269 | strcpy(send_buffer, "Content-Type: text/html\r\n"); 270 | if (strcmp("html", file_ext) == 0) 271 | strcpy(send_buffer, "Content-Type: text/html\r\n"); 272 | if (strcmp("js", file_ext) == 0) 273 | strcpy(send_buffer, "Content-Type: application/javascript\r\n"); 274 | if (strcmp("css", file_ext) == 0) 275 | strcpy(send_buffer, "Content-Type: text/css\r\n"); 276 | if (strcmp("txt", file_ext) == 0) 277 | strcpy(send_buffer, "Content-Type: text/plain\r\n"); 278 | slen = strlen(send_buffer); 279 | iov[2].iov_base = zh_malloc(slen); 280 | iov[2].iov_len = slen; 281 | memcpy(iov[2].iov_base, send_buffer, slen); 282 | 283 | /* Send the content-length header, which is the file size in this case. */ 284 | sprintf(send_buffer, "content-length: %ld\r\n", len); 285 | slen = strlen(send_buffer); 286 | iov[3].iov_base = zh_malloc(slen); 287 | iov[3].iov_len = slen; 288 | memcpy(iov[3].iov_base, send_buffer, slen); 289 | 290 | /* 291 | * When the browser sees a '\r\n' sequence in a line on its own, 292 | * it understands there are no more headers. Content may follow. 
293 | * */ 294 | strcpy(send_buffer, "\r\n"); 295 | slen = strlen(send_buffer); 296 | iov[4].iov_base = zh_malloc(slen); 297 | iov[4].iov_len = slen; 298 | memcpy(iov[4].iov_base, send_buffer, slen); 299 | } 300 | 301 | void handle_get_method(char *path, int client_socket) { 302 | char final_path[1024]; 303 | 304 | /* 305 | If a path ends in a trailing slash, the client probably wants the index 306 | file inside of that directory. 307 | */ 308 | if (path[strlen(path) - 1] == '/') { 309 | strcpy(final_path, "public"); 310 | strcat(final_path, path); 311 | strcat(final_path, "index.html"); 312 | } 313 | else { 314 | strcpy(final_path, "public"); 315 | strcat(final_path, path); 316 | } 317 | 318 | /* The stat() system call will give you information about the file 319 | * like type (regular file, directory, etc), size, etc. */ 320 | struct stat path_stat; 321 | if (stat(final_path, &path_stat) == -1) { 322 | printf("404 Not Found: %s (%s)\n", final_path, path); 323 | handle_http_404(client_socket); 324 | } 325 | else { 326 | /* Check if this is a normal/regular file and not a directory or something else */ 327 | if (S_ISREG(path_stat.st_mode)) { 328 | struct request *req = zh_malloc(sizeof(*req) + (sizeof(struct iovec) * 6)); 329 | req->iovec_count = 6; 330 | req->client_socket = client_socket; 331 | send_headers(final_path, path_stat.st_size, req->iov); 332 | copy_file_contents(final_path, path_stat.st_size, &req->iov[5]); 333 | printf("200 %s %ld bytes\n", final_path, path_stat.st_size); 334 | add_write_request( req); 335 | } 336 | else { 337 | handle_http_404(client_socket); 338 | printf("404 Not Found: %s\n", final_path); 339 | } 340 | } 341 | } 342 | 343 | /* 344 | * This function looks at method used and calls the appropriate handler function. 345 | * Since we only implement GET and POST methods, it calls handle_unimplemented_method() 346 | * in case both these don't match. This sends an error to the client. 
347 | * */ 348 | 349 | void handle_http_method(char *method_buffer, int client_socket) { 350 | char *method, *path, *saveptr; 351 | 352 | method = strtok_r(method_buffer, " ", &saveptr); 353 | strtolower(method); 354 | path = strtok_r(NULL, " ", &saveptr); 355 | 356 | if (strcmp(method, "get") == 0) { 357 | handle_get_method(path, client_socket); 358 | } 359 | else { 360 | handle_unimplemented_method(client_socket); 361 | } 362 | } 363 | 364 | int get_line(const char *src, char *dest, int dest_sz) { 365 | for (int i = 0; i < dest_sz; i++) { 366 | dest[i] = src[i]; 367 | if (src[i] == '\r' && src[i+1] == '\n') { 368 | dest[i] = '\0'; 369 | return 0; 370 | } 371 | } 372 | return 1; 373 | } 374 | 375 | int handle_client_request(struct request *req) { 376 | char http_request[1024]; 377 | /* Get the first line, which will be the request */ 378 | if(get_line(req->iov[0].iov_base, http_request, sizeof(http_request))) { 379 | fprintf(stderr, "Malformed request\n"); 380 | exit(1); 381 | } 382 | handle_http_method(http_request, req->client_socket); 383 | return 0; 384 | } 385 | 386 | void server_loop(int server_socket) { 387 | struct io_uring_cqe *cqe; 388 | struct sockaddr_in client_addr; 389 | socklen_t client_addr_len = sizeof(client_addr); 390 | 391 | add_accept_request(server_socket, &client_addr, &client_addr_len); 392 | 393 | while (1) { 394 | int ret = io_uring_wait_cqe(&ring, &cqe); 395 | if (ret < 0) 396 | fatal_error("io_uring_wait_cqe"); 397 | struct request *req = (struct request *) cqe->user_data; 398 | if (cqe->res < 0) { 399 | fprintf(stderr, "Async request failed: %s for event: %d\n", 400 | strerror(-cqe->res), req->event_type); 401 | exit(1); 402 | } 403 | 404 | switch (req->event_type) { 405 | case EVENT_TYPE_ACCEPT: 406 | add_accept_request(server_socket, &client_addr, &client_addr_len); 407 | add_read_request(cqe->res); 408 | free(req); 409 | break; 410 | case EVENT_TYPE_READ: 411 | if (!cqe->res) { 412 | fprintf(stderr, "Empty request!\n"); 413 | break; 
414 | } 415 | handle_client_request(req); 416 | free(req->iov[0].iov_base); 417 | free(req); 418 | break; 419 | case EVENT_TYPE_WRITE: 420 | for (int i = 0; i < req->iovec_count; i++) { 421 | free(req->iov[i].iov_base); 422 | } 423 | close(req->client_socket); 424 | free(req); 425 | break; 426 | } 427 | /* Mark this request as processed */ 428 | io_uring_cqe_seen(&ring, cqe); 429 | } 430 | } 431 | 432 | void sigint_handler(int signo) { 433 | printf("^C pressed. Shutting down.\n"); 434 | io_uring_queue_exit(&ring); 435 | exit(0); 436 | } 437 | 438 | int main() { 439 | int server_socket = setup_listening_socket(DEFAULT_SERVER_PORT); 440 | 441 | signal(SIGINT, sigint_handler); 442 | io_uring_queue_init(QUEUE_DEPTH, &ring, 0); 443 | server_loop(server_socket); 444 | 445 | return 0; 446 | } 447 | -------------------------------------------------------------------------------- /document/128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunwei37/co-uring-WebServer/878911c6669ac8bf1abb1e11b224e6cdc86aa6f7/document/128.png -------------------------------------------------------------------------------- /document/benchmark.md: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /document/coroutine/Coroutines-on-a-Separate-Thread.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunwei37/co-uring-WebServer/878911c6669ac8bf1abb1e11b224e6cdc86aa6f7/document/coroutine/Coroutines-on-a-Separate-Thread.md -------------------------------------------------------------------------------- /document/io_uring-by-example/io_uring-by-example1.md: -------------------------------------------------------------------------------- 1 | # io_uring 从原理到动手实践 part1: 使用系统调用接口实现 cat 程序 2 | 3 | ## 原文 4 | 5 | 感觉目前看到介绍 io_uring 
的文章还是比较少,大部分都集中在对其原理性的介绍和简单的对官方文档的翻译,真正结合实际的例子还是比较少。本文翻译整理自一篇博客: 6 | 7 | [io-uring-by-example-part-1-introduction](https://unixism.net/2020/04/io-uring-by-example-part-1-introduction/) 8 | 9 | 我也增加了一些自己的理解和其他的参考材料。另外,在 2020 年,C++ 也正式将协程 coroutine 加入标准,我尝试使用 io_uring 和 c++20 协程实现了一个高性能web服务器,并进行了一些性能测试,具体代码会放在这个仓库里面,同时也包含了这篇文档以及所需的 demo 代码: 10 | 11 | [https://github.com/yunwei37/co-uring-WebServer](https://github.com/yunwei37/co-uring-WebServer) 12 | 13 | 14 | 15 | - [io_uring 从原理到动手实践 part1: 使用系统调用接口实现 cat 程序](#io_uring-从原理到动手实践-part1-使用系统调用接口实现-cat-程序) 16 | - [原文](#原文) 17 | - [介绍](#介绍) 18 | - [一个简单的 cat 程序](#一个简单的-cat-程序) 19 | - [Cat io_uring](#cat-io_uring) 20 | - [io_uring 接口](#io_uring-接口) 21 | - [完成队列条目 (Completion Queue Entry)](#完成队列条目-completion-queue-entry) 22 | - [顺序](#顺序) 23 | - [提交队列条目(SQE)](#提交队列条目sqe) 24 | - [io_uring 版本的 cat](#io_uring-版本的-cat) 25 | - [初始设置](#初始设置) 26 | - [处理共享的环形缓冲区](#处理共享的环形缓冲区) 27 | - [读取完成队列条目](#读取完成队列条目) 28 | - [提交](#提交) 29 | 30 | 31 | 32 | ## 介绍 33 | 34 | 事实上,只有 I/O 和计算是计算机真正做的两件事。在 Linux 下,对于计算,您可以在进程或线程之间进行选择;对于 I/O,Linux 既有同步 I/O,也称为阻塞 I/O,和异步 I/O。尽管异步 I/O(aio系统调用系列)已经成为 Linux 的一部分有一段历史了,但它们仅适用于直接 I/O 而不适用于缓冲 I/O。对于以缓冲模式打开的文件,aio就像常规的阻塞系统调用一样。这不是一个令人愉快的限制。除此之外,Linux 当前的aio 接口还有很多系统调用开销。 35 | 36 | 考虑到项目的复杂性,提出一个提供高性能异步 I/O 的 Linux 子系统并不容易,因此对 io_uring 的大肆宣传是绝对合理的。不仅io_uring提供了一个优雅的内核/用户空间接口,它还通过允许一种特殊的轮询模式,完全取消从内核到用户空间获取数据的系统调用,从而提供了卓越的性能。 37 | 38 | 然而,对于大多数的异步编程完全是另一回事。如果你已经试过在像 C 这样的低级语言中用select/ poll/epoll 异步编程,你会明白我的意思。我们不太擅长异步思考,换句话说,使用线程。线程有一个“从这里开始”、“做 1-2-3 件事”和“从这里结束”的进展。尽管它们被操作系统多次阻塞和启动,但这种错觉对程序员来说是隐藏的,因此它是一个相对简单的心理模型,可以吸收和适应您的需求。但这并不意味着异步编程很难:它通常是程序中的最低层。一旦你编写了一个抽象层出来,你就会很舒服并忙于做你的应用程序真正打算做的事情,你的用户主要关心的事情。 39 | 40 | 说到抽象,io_uring 确实提供了一个更高级的库 liburing,它实现并隐藏了很多io_uring 需要的模板代码,同时提供了一个更简单的接口供您处理。但是,如果不先了解 io_uring 底层是如何工作的,那么使用 liburing 的乐趣何在?知道了这一点,您也可以更好地使用 liburing:您会了解极端情况,并且可以更好地了解其背后工作的原理。这是一件好事。为此,我们将使用 liburing 构建大多数示例,但我们同时也会使用系统调用接口构建它们。 41 | 42 | ## 一个简单的 cat 程序 43 | 44 | 让我们以同步或阻塞的方式使用 readv() 系统调用,构建一个简单的 
cat 等效命令。这将使您熟悉 readv(),它是启用分散/聚集 I/O 的系统调用集的一部分,也称为向量 I/O。如果您熟悉 readv() 工作方式,则可以跳到下一节。 45 | 46 | 比起 read() 和 write() 将文件描述符、缓冲区及其长度作为参数,readv() 和 writev() 将文件描述符、指向struct iovec结构数组的指针和最后一个表示该数组长度的参数作为参数。现在让我们来看看struct iovec。 47 | 48 | ```c 49 | struct iovec { 50 | void *iov_base; /* 起始地址 */ 51 | size_t iov_len; /* 要传输的字节数 */ 52 | }; 53 | ``` 54 | > 函数原型: 55 | > ssize_t readv(int fd, const struct iovec *iov, int iovcnt); 56 | > ssize_t writev(int fd, const struct iovec *iov, int iovcnt); 57 | > 58 | > 关于 readv/writev 的性能分析,可以参考 https://zhuanlan.zhihu.com/p/341366946 59 | 60 | 每个结构简单地指向一个缓冲区。一个基地址和一个长度。 61 | 62 | 您可能会问,比起常规 read() 和 write(),使用矢量或分散/收集 I/O 有什么意义。答案是使用 readv() 和 writev() 更自然。例如,使用readv(),您可以填充一个 struct 的许多成员,而无需求助于复制缓冲区或多次调用read(),这两种方法的效率都相对较低。同样的优势适用于writev(). 此外,这些调用是原子的,而多次调用read()和write()不是,如果您出于某种原因碰巧关心它。 63 | 64 | 虽然主要用于将文件的内容打印到控制台,但 cat 命令 concatenates(意味着连接在一起)并打印作为命令参数传入的文件的内容。在我们的cat示例中,我们将使用 readv() 从文件中读取数据以打印到控制台。我们将逐块读取文件,并且每个块都将由一个iovec 结构指向。readv() 被阻塞,当它返回时,假设没有错误,这些 struct iovec 结构指向一组包含文件数据的缓冲区。然后我们将它们打印到控制台。这足够简单。 65 | 66 | ```c 67 | #include 68 | #include 69 | #include 70 | #include 71 | #include 72 | #include 73 | #include 74 | 75 | #define BLOCK_SZ 4096 76 | 77 | /* 78 | * 返回传入文件描述符的文件的大小。 79 | * 正确处理常规文件和块设备。 80 | * */ 81 | 82 | off_t get_file_size(int fd) { 83 | struct stat st; 84 | 85 | if(fstat(fd, &st) < 0) { 86 | perror("fstat"); 87 | return -1; 88 | } 89 | if (S_ISBLK(st.st_mode)) { 90 | unsigned long long bytes; 91 | if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) { 92 | perror("ioctl"); 93 | return -1; 94 | } 95 | return bytes; 96 | } else if (S_ISREG(st.st_mode)) 97 | return st.st_size; 98 | 99 | return -1; 100 | } 101 | 102 | /* 103 | * 输出长度为 len 的字符串到 stdout 104 | * 我们在这里使用缓冲输出以提高效率, 105 | * 因为我们需要逐个字符地输出。 106 | * */ 107 | void output_to_console(char *buf, int len) { 108 | while (len--) { 109 | fputc(*buf++, stdout); 110 | } 111 | } 112 | 113 | int read_and_print_file(char *file_name) { 114 | struct iovec *iovecs; 115 | 
int file_fd = open(file_name, O_RDONLY); 116 | if (file_fd < 0) { 117 | perror("open"); 118 | return 1; 119 | } 120 | 121 | off_t file_sz = get_file_size(file_fd); 122 | off_t bytes_remaining = file_sz; 123 | int blocks = (int) file_sz / BLOCK_SZ; 124 | if (file_sz % BLOCK_SZ) blocks++; 125 | iovecs = malloc(sizeof(struct iovec) * blocks); 126 | 127 | int current_block = 0; 128 | 129 | /* 130 | * 对于我们正在读取的文件,分配足够的块来容纳 131 | * 文件数据。每个块都在一个 iovec 结构中描述, 132 | * 它作为 iovecs 数组的一部分传递给 readv。 133 | * */ 134 | while (bytes_remaining) { 135 | off_t bytes_to_read = bytes_remaining; 136 | if (bytes_to_read > BLOCK_SZ) 137 | bytes_to_read = BLOCK_SZ; 138 | 139 | 140 | void *buf; 141 | if( posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) { 142 | perror("posix_memalign"); 143 | return 1; 144 | } 145 | iovecs[current_block].iov_base = buf; 146 | iovecs[current_block].iov_len = bytes_to_read; 147 | current_block++; 148 | bytes_remaining -= bytes_to_read; 149 | } 150 | 151 | /* 152 | * readv() 调用将阻塞,直到所有 iovec 缓冲区被填满 153 | * 文件数据。一旦它返回,我们应该能够从 iovecs 访问文件数据 154 | * 并在控制台上打印它们。 155 | * */ 156 | int ret = readv(file_fd, iovecs, blocks); 157 | if (ret < 0) { 158 | perror("readv"); 159 | return 1; 160 | } 161 | 162 | for (int i = 0; i < blocks; i++) 163 | output_to_console(iovecs[i].iov_base, iovecs[i].iov_len); 164 | 165 | return 0; 166 | } 167 | 168 | int main(int argc, char *argv[]) { 169 | if (argc < 2) { 170 | fprintf(stderr, "Usage: %s [ ...]\n", 171 | argv[0]); 172 | return 1; 173 | } 174 | 175 | for (int i = 1; i < argc; i++) { 176 | if(read_and_print_file(argv[i])) { 177 | fprintf(stderr, "Error reading file\n"); 178 | return 1; 179 | } 180 | } 181 | 182 | return 0; 183 | } 184 | ``` 185 | 186 | 这是一个足够简单的程序。我们现在讨论这个,以便我们可以将它与我们接下来将采用的 io_uring 方法进行比较和对比。这个程序的核心是一个循环,它通过首先找到文件的大小,来计算保存我们正在读取的文件数据所需的块数,为所有需要的iovec结构分配内存。我们迭代等于文件大小的块数的计数,分配块大小的内存来保存实际数据,最后调用 readv() 读取数据。就像我们之前讨论过的,readv() 这里是同步的。这意味着它会阻塞,直到它满足了它被调用的请求。当它返回时,我们分配并指向的内存块iovec结构用文件数据填充。然后我们通过调用该 output_to_console() 
函数将文件数据打印到控制台。 187 | 188 | ## Cat io_uring 189 | 190 | 现在让我们使用 io_uring 编写一个功能等效的程序。我们在 io_uring 中使用的操作是 readv。 191 | 192 | ### io_uring 接口 193 | 194 | io_uring 的接口很简单。有一个提交队列,有一个完成队列。在提交队列中,您提交有关要完成的各种操作的信息,例如,对于我们当前的程序,我们想要用 readv() 读取文件,因此我们把请求作为提交队列条目 (SQE) 的一部分放入提交队列。由于它是一个队列,您可以发出许多请求。这些操作可以是读、写等的混合。然后,我们调用 io_uring_enter() 系统调用来告诉内核,我们已经向提交队列添加了请求。内核然后执行它的任务,一旦它完成了这些请求的处理,它就会将结果作为完成队列条目 (CQE) 的一部分放入完成队列,每个 SQE 都对应一个完成队列条目。这些 CQE 可以从用户空间访问。 195 | 196 | 精明的读者会注意到,这种用多个 I/O 请求填充队列然后进行单个系统调用的接口,相比对每个 I/O 请求进行一次系统调用,已经更高效了。为了进一步提高效率,io_uring 支持一种模式,在这种模式下,内核轮询您放入提交队列的条目,而您甚至不必调用 io_uring_enter() 通知内核有关更新提交队列条目的信息。另一点需要注意的是,在 Spectre 和 Meltdown 硬件漏洞被发现,并且操作系统为其创建解决方法之后,系统调用比以往任何时候都更加昂贵。因此,对于高性能应用程序来说,减少系统调用的数量确实是一件大事。 197 | 198 | 在您执行任何这些操作之前,您需要设置队列,它们实际上是具有特定深度/长度的环形缓冲区。您调用 io_uring_setup() 系统调用来完成此操作。我们通过将提交队列条目添加到环形缓冲区,并从完成队列环形缓冲区,读取完成的队列条目来完成实际工作。这是对 io_uring 接口的概述。 199 | 200 | #### 完成队列条目 (Completion Queue Entry) 201 | 202 | 现在我们有了一个关于事物如何运作的心智模型,让我们更详细地看看这是如何完成的。与提交队列条目(SQE)相比,完成队列条目(CQE)非常简单,所以,让我们先来看看。SQE 是一个结构体,您可以使用它来提交请求, 将其添加到提交环形缓冲区。CQE 是一个结构体的实例,内核用它来响应添加到提交队列的每个 SQE 结构体实例。它包含您通过 SQE 实例请求的操作的结果: 203 | 204 | ```c 205 | struct io_uring_cqe { 206 | __u64 user_data; /* sqe->user_data submission passed back */ 207 | __s32 res; /* result code for this event */ 208 | __u32 flags; 209 | }; 210 | ``` 211 | 212 | 如代码注释中所述, user_data 字段是按原样从 SQE 传递到 CQE 实例的内容, 假设您在提交队列中提交了一堆请求,它们并不一定以相同的顺序完成并到达完成队列。以以下场景为例:您的机器上有两个磁盘,一个是旋转速度较慢的硬盘驱动器,另一个是超快的 SSD。您在提交队列中提交了 2 个请求,第一个在较慢的旋转硬盘上读取 100kB 文件,第二个在较快的 SSD 上读取相同大小的文件。如果要保持顺序,即使 SSD 上文件中的数据预计会更快到达,内核是否也应该等待旋转硬盘驱动器上文件中的数据可用?这是一个坏主意,因为这会阻止我们尽可能快地完成所有的任务。所以,当 CQE 可用时,它们可以按任何顺序到达,无论哪个操作快速完成,它都会立即可用。 213 | 214 | 由于没有指定 CQE 到达的顺序,您如何识别特定 CQE 对应于哪个 SQE 请求?一种方法是使用该 user_data 字段来识别它。并不是说你会设置一个唯一的 ID 或其他东西,而是你通常会传递一个指针进去。如果这令人困惑,请等到稍后在这里看到一个清晰的示例。 215 | 216 | 完成队列条目很简单,因为它主要关注系统调用的返回值,该值在其 res 字段中返回。例如,如果您将 read 操作加入队列,成功完成后,它将包含读取的字节数。如果有错误,它将包含 -errno。
基本上就是 read() 系统调用本身会返回的东西。 217 | 218 | #### 顺序 219 | 220 | 虽然我确实提到, CQE 可以按任何顺序到达,但您可以使用 SQE 排序强制对某些操作进行排序,实际上是将它们链接起来。我不会在本系列文章中讨论排序,但您可以阅读当前的 io_uring 规范参考,以了解如何执行此操作。 221 | 222 | > [https://kernel.dk/io_uring.pdf](https://kernel.dk/io_uring.pdf) 223 | 224 | #### 提交队列条目(SQE) 225 | 226 | 227 | 提交队列条目比完成队列条目稍微复杂一些,因为它需要足够通用,以表示和处理当今 Linux 可能的各种 I/O 操作。 228 | 229 | ```c 230 | struct io_uring_sqe { 231 | __u8 opcode; /* type of operation for this sqe */ 232 | __u8 flags; /* IOSQE_ flags */ 233 | __u16 ioprio; /* ioprio for the request */ 234 | __s32 fd; /* file descriptor to do IO on */ 235 | __u64 off; /* offset into file */ 236 | __u64 addr; /* pointer to buffer or iovecs */ 237 | __u32 len; /* buffer size or number of iovecs */ 238 | union { 239 | __kernel_rwf_t rw_flags; 240 | __u32 fsync_flags; 241 | __u16 poll_events; 242 | __u32 sync_range_flags; 243 | __u32 msg_flags; 244 | }; 245 | __u64 user_data; /* data to be passed back at completion time */ 246 | union { 247 | __u16 buf_index; /* index into fixed buffers, if used */ 248 | __u64 __pad2[3]; 249 | }; 250 | }; 251 | ``` 252 | 253 | 我知道这个结构看起来很大。更常用的字段只有几个,这很容易用一个简单的例子来解释,比如我们正在处理的那个:cat。您将使用readv()系统调用读取文件: 254 | 255 | - opcode 用于指定操作,在我们的例子中,readv() 使用 IORING_OP_READV 常量。 256 | - fd 用于指定我们要读取的文件。 257 | - addr 用于指向 iovec 结构数组,该数组保存我们为 I/O 分配的缓冲区的地址和长度。 258 | - 最后,len 用于保存 iovecs 数组的长度。 259 | 260 | 261 | 现在这并不太难。您填写这些值,让 io_uring 知道该做什么。您可以将多个 SQE 加入队列,并在您希望内核开始处理您的请求时最终调用 io_uring_enter()。 262 | 263 | ### io_uring 版本的 cat 264 | 265 | 让我们看看如何在我们cat程序的 io_uring 版本中实际完成这项工作: 266 | 267 | ```c 268 | #include <stdio.h> 269 | #include <stdlib.h> 270 | #include <sys/stat.h> 271 | #include <sys/ioctl.h> 272 | #include <sys/syscall.h> 273 | #include <sys/mman.h> 274 | #include <sys/uio.h> 275 | #include <linux/fs.h> 276 | #include <fcntl.h> 277 | #include <unistd.h> 278 | #include <string.h> 279 | 280 | /* 如果你的编译失败是因为缺少下面的头文件, 281 | * 您的内核可能太旧,无法支持 io_uring。 282 | * */ 283 | #include <linux/io_uring.h> 284 | 285 | #define QUEUE_DEPTH 1 286 | #define BLOCK_SZ 1024 287 | 288 | /* This is x86 specific */ 289 | #define read_barrier() __asm__ __volatile__("":::"memory") 290 |
#define write_barrier() __asm__ __volatile__("":::"memory") 291 | 292 | struct app_io_sq_ring { 293 | unsigned *head; 294 | unsigned *tail; 295 | unsigned *ring_mask; 296 | unsigned *ring_entries; 297 | unsigned *flags; 298 | unsigned *array; 299 | }; 300 | 301 | struct app_io_cq_ring { 302 | unsigned *head; 303 | unsigned *tail; 304 | unsigned *ring_mask; 305 | unsigned *ring_entries; 306 | struct io_uring_cqe *cqes; 307 | }; 308 | 309 | struct submitter { 310 | int ring_fd; 311 | struct app_io_sq_ring sq_ring; 312 | struct io_uring_sqe *sqes; 313 | struct app_io_cq_ring cq_ring; 314 | }; 315 | 316 | struct file_info { 317 | off_t file_sz; 318 | struct iovec iovecs[]; /* Referred by readv/writev */ 319 | }; 320 | 321 | /* 322 | * 这段代码是在没有io_uring相关系统调用的年代写的 323 | * 标准 C 库的一部分。所以,我们推出自己的系统调用包装器. 324 | * */ 325 | 326 | int io_uring_setup(unsigned entries, struct io_uring_params *p) 327 | { 328 | return (int) syscall(__NR_io_uring_setup, entries, p); 329 | } 330 | 331 | int io_uring_enter(int ring_fd, unsigned int to_submit, 332 | unsigned int min_complete, unsigned int flags) 333 | { 334 | return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete, 335 | flags, NULL, 0); 336 | } 337 | 338 | /* 339 | * 返回传入其打开文件描述符的文件的大小。 340 | * 正确处理常规文件和块设备。 341 | * */ 342 | 343 | off_t get_file_size(int fd) { 344 | struct stat st; 345 | 346 | if(fstat(fd, &st) < 0) { 347 | perror("fstat"); 348 | return -1; 349 | } 350 | if (S_ISBLK(st.st_mode)) { 351 | unsigned long long bytes; 352 | if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) { 353 | perror("ioctl"); 354 | return -1; 355 | } 356 | return bytes; 357 | } else if (S_ISREG(st.st_mode)) 358 | return st.st_size; 359 | 360 | return -1; 361 | } 362 | 363 | /* 364 | * io_uring 需要很多设置,看起来很麻烦 365 | * 所以 io_uring 的作者创建了 liburing,比较好用。 366 | * 但是,您应该花时间了解此代码。 367 | * */ 368 | 369 | int app_setup_uring(struct submitter *s) { 370 | struct app_io_sq_ring *sring = &s->sq_ring; 371 | struct app_io_cq_ring *cring = &s->cq_ring; 372 | 
struct io_uring_params p; 373 | void *sq_ptr, *cq_ptr; 374 | 375 | /* 376 | * 我们需要将 io_uring_params 结构体传递给 io_uring_setup() 去置0初始化。 377 | * 我们可以设置任何想要的标记。 378 | * */ 379 | memset(&p, 0, sizeof(p)); 380 | s->ring_fd = io_uring_setup(QUEUE_DEPTH, &p); 381 | if (s->ring_fd < 0) { 382 | perror("io_uring_setup"); 383 | return 1; 384 | } 385 | 386 | /* 387 | * io_uring 通信通过 2 个共享的内核用户空间环形缓冲区进行, 388 | * 可以在内核中通过 mmap() 调用映射。 389 | * 虽然完成队列是直接映射进去的, 但提交队列里面有个数组,我们也把它映射进* 去 390 | * */ 391 | 392 | int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned); 393 | int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe); 394 | 395 | /* 在内核版本 5.4 及以上, 396 | * 可以使用单个 mmap() 调用同时完成两个缓冲区的映射。 397 | * 关于内核版本,可以检查 io_uring_params 的字段,并使用 mask 获取。 398 | * 如果 IORING_FEAT_SINGLE_MMAP 已设置,我们可以不用第二个 mmap() 去映* 射。 399 | * */ 400 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 401 | if (cring_sz > sring_sz) { 402 | sring_sz = cring_sz; 403 | } 404 | cring_sz = sring_sz; 405 | } 406 | 407 | /* 在提交和完成队列环形缓冲区中映射。 408 | * 不过,较旧的内核仅映射到提交队列中。 409 | * */ 410 | sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE, 411 | MAP_SHARED | MAP_POPULATE, 412 | s->ring_fd, IORING_OFF_SQ_RING); 413 | if (sq_ptr == MAP_FAILED) { 414 | perror("mmap"); 415 | return 1; 416 | } 417 | 418 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 419 | cq_ptr = sq_ptr; 420 | } else { 421 | /* 分别映射到旧内核中的完成队列环形缓冲区 */ 422 | cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE, 423 | MAP_SHARED | MAP_POPULATE, 424 | s->ring_fd, IORING_OFF_CQ_RING); 425 | if (cq_ptr == MAP_FAILED) { 426 | perror("mmap"); 427 | return 1; 428 | } 429 | } 430 | 431 | /* 将有用的字段保存在全局 app_io_sq_ring 结构中以备后用 432 | * 简单的一个参考 */ 433 | sring->head = sq_ptr + p.sq_off.head; 434 | sring->tail = sq_ptr + p.sq_off.tail; 435 | sring->ring_mask = sq_ptr + p.sq_off.ring_mask; 436 | sring->ring_entries = sq_ptr + p.sq_off.ring_entries; 437 | sring->flags = sq_ptr + p.sq_off.flags; 438 | sring->array = sq_ptr + p.sq_off.array; 439 | 440 | /* 映射到提交队列条目数组 
*/ 441 | s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), 442 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, 443 | s->ring_fd, IORING_OFF_SQES); 444 | if (s->sqes == MAP_FAILED) { 445 | perror("mmap"); 446 | return 1; 447 | } 448 | 449 | /* 将有用的字段保存在全局 app_io_cq_ring 结构中以备后用 450 | * 简单参考 */ 451 | cring->head = cq_ptr + p.cq_off.head; 452 | cring->tail = cq_ptr + p.cq_off.tail; 453 | cring->ring_mask = cq_ptr + p.cq_off.ring_mask; 454 | cring->ring_entries = cq_ptr + p.cq_off.ring_entries; 455 | cring->cqes = cq_ptr + p.cq_off.cqes; 456 | 457 | return 0; 458 | } 459 | 460 | /* 461 | * 输出长度为 len 的字符串到 stdout 462 | * 我们在这里使用缓冲输出以提高效率, 463 | * 因为我们需要逐个字符地输出。 464 | * */ 465 | void output_to_console(char *buf, int len) { 466 | while (len--) { 467 | fputc(*buf++, stdout); 468 | } 469 | } 470 | 471 | /* 472 | * 从完成队列中读取。 473 | * 在这个函数中,我们从完成队列中读取完成事件, 474 | * 得到包含文件数据并将其打印到控制台的数据缓冲区。 475 | * */ 476 | 477 | void read_from_cq(struct submitter *s) { 478 | struct file_info *fi; 479 | struct app_io_cq_ring *cring = &s->cq_ring; 480 | struct io_uring_cqe *cqe; 481 | unsigned head, reaped = 0; 482 | 483 | head = *cring->head; 484 | 485 | do { 486 | read_barrier(); 487 | /* 488 | * 请记住,这是一个环形缓冲区。如果头==尾,则表示 489 | * 缓冲区为空。 490 | * */ 491 | if (head == *cring->tail) 492 | break; 493 | 494 | /* 获取条目 */ 495 | cqe = &cring->cqes[head & *s->cq_ring.ring_mask]; 496 | fi = (struct file_info*) cqe->user_data; 497 | if (cqe->res < 0) 498 | fprintf(stderr, "Error: %s\n", strerror(abs(cqe->res))); 499 | 500 | int blocks = (int) fi->file_sz / BLOCK_SZ; 501 | if (fi->file_sz % BLOCK_SZ) blocks++; 502 | 503 | for (int i = 0; i < blocks; i++) 504 | output_to_console(fi->iovecs[i].iov_base, fi->iovecs[i].iov_len); 505 | 506 | head++; 507 | } while (1); 508 | 509 | *cring->head = head; 510 | write_barrier(); 511 | } 512 | /* 513 | * 提交到提交队列。 514 | * 在这个函数中,我们将请求提交到提交队列。你可以提交 515 | * 我们的将是 readv() 请求,通过 IORING_OP_READV 指定。 516 | * 517 | * */ 518 | int submit_to_sq(char *file_path, 
struct submitter *s) { 519 | struct file_info *fi; 520 | 521 | int file_fd = open(file_path, O_RDONLY); 522 | if (file_fd < 0 ) { 523 | perror("open"); 524 | return 1; 525 | } 526 | 527 | struct app_io_sq_ring *sring = &s->sq_ring; 528 | unsigned index = 0, current_block = 0, tail = 0, next_tail = 0; 529 | 530 | off_t file_sz = get_file_size(file_fd); 531 | if (file_sz < 0) 532 | return 1; 533 | off_t bytes_remaining = file_sz; 534 | int blocks = (int) file_sz / BLOCK_SZ; 535 | if (file_sz % BLOCK_SZ) blocks++; 536 | 537 | fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks); 538 | if (!fi) { 539 | fprintf(stderr, "Unable to allocate memory\n"); 540 | return 1; 541 | } 542 | fi->file_sz = file_sz; 543 | 544 | /* 545 | * 对于我们需要读取的文件的每个块,我们分配一个iovec struct 546 | * 索引到 iovecs 数组中。这个数组作为一部分提交传入。 547 | * 如果你不明白这一点,那么你需要去 548 | * 了解一下 readv() 和 writev() 系统调用的工作方式。 549 | * */ 550 | while (bytes_remaining) { 551 | off_t bytes_to_read = bytes_remaining; 552 | if (bytes_to_read > BLOCK_SZ) 553 | bytes_to_read = BLOCK_SZ; 554 | 555 | fi->iovecs[current_block].iov_len = bytes_to_read; 556 | 557 | void *buf; 558 | if( posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) { 559 | perror("posix_memalign"); 560 | return 1; 561 | } 562 | fi->iovecs[current_block].iov_base = buf; 563 | 564 | current_block++; 565 | bytes_remaining -= bytes_to_read; 566 | } 567 | 568 | /* 将我们的提交队列条目添加到 SQE 环形缓冲区的尾部 */ 569 | next_tail = tail = *sring->tail; 570 | next_tail++; 571 | read_barrier(); 572 | index = tail & *s->sq_ring.ring_mask; 573 | struct io_uring_sqe *sqe = &s->sqes[index]; 574 | sqe->fd = file_fd; 575 | sqe->flags = 0; 576 | sqe->opcode = IORING_OP_READV; 577 | sqe->addr = (unsigned long) fi->iovecs; 578 | sqe->len = blocks; 579 | sqe->off = 0; 580 | sqe->user_data = (unsigned long long) fi; 581 | sring->array[index] = index; 582 | tail = next_tail; 583 | 584 | /* 更新尾部以便内核可以看到它 */ 585 | if(*sring->tail != tail) { 586 | *sring->tail = tail; 587 | write_barrier(); 588 | } 589 | 590 | /* 591 | * 
告诉内核我们已经用 io_uring_enter() 提交了事件。 592 | * 们还传入了 IOURING_ENTER_GETEVENTS 标志,这会导致 593 | * io_uring_enter() 调用等待 min_complete 事件完成后返回。 594 | * */ 595 | int ret = io_uring_enter(s->ring_fd, 1,1, 596 | IORING_ENTER_GETEVENTS); 597 | if(ret < 0) { 598 | perror("io_uring_enter"); 599 | return 1; 600 | } 601 | 602 | return 0; 603 | } 604 | 605 | int main(int argc, char *argv[]) { 606 | struct submitter *s; 607 | 608 | if (argc < 2) { 609 | fprintf(stderr, "Usage: %s \n", argv[0]); 610 | return 1; 611 | } 612 | 613 | s = malloc(sizeof(*s)); 614 | if (!s) { 615 | perror("malloc"); 616 | return 1; 617 | } 618 | memset(s, 0, sizeof(*s)); 619 | 620 | if(app_setup_uring(s)) { 621 | fprintf(stderr, "Unable to setup uring!\n"); 622 | return 1; 623 | } 624 | 625 | for (int i = 1; i < argc; i++) { 626 | if(submit_to_sq(argv[i], s)) { 627 | fprintf(stderr, "Error reading file\n"); 628 | return 1; 629 | } 630 | read_from_cq(s); 631 | } 632 | 633 | return 0; 634 | } 635 | ``` 636 | 637 | 使用 gcc 进行编译的时候,需要加上 `-luring` 标识。 638 | 639 | #### 初始设置 640 | 641 | 从 main() 开始,我们调用 app_setup_uring(),它执行我们使用 io_uring 所需的初始化工作。首先,我们使用我们需要的队列深度,和初始化为零的结构实例 io_uring_params 调用系统调用 io_uring_setup() 。当调用返回时,内核将填充此结构成员中的值。io_uring_params 是这样的: 642 | 643 | ```c 644 | struct io_uring_params { 645 | __u32 sq_entries; 646 | __u32 cq_entries; 647 | __u32 flags; 648 | __u32 sq_thread_cpu; 649 | __u32 sq_thread_idle; 650 | __u32 resv[5]; 651 | struct io_sqring_offsets sq_off; 652 | struct io_cqring_offsets cq_off; 653 | }; 654 | ``` 655 | 656 | 在将此结构作为 io_uring_setup() 系统调用的一部分传递之前,您唯一可以指定的是flags结构成员,但在此示例中,我们没有要传递的标志。此外,在本例中,我们一个接一个地处理文件,我们不会做任何并行 I/O,因为这是一个简单的例子,主要是为了理解io_uring. 
为此,我们将队列深度设置为一。 657 | 658 | io_uring_params 结构中的返回值、文件描述符和其他字段随后将用于调用 mmap() ,将两个环形缓冲区和一个提交队列条目数组映射到用户空间。我删除了一些周围的代码以专注于mmap()s。 659 | 660 | ```c 661 | /* 在提交和完成队列环形缓冲区中映射。 662 | * 不过,较旧的内核仅映射到提交队列中。 663 | * */ 664 | sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE, 665 | MAP_SHARED | MAP_POPULATE, 666 | s->ring_fd, IORING_OFF_SQ_RING); 667 | if (sq_ptr == MAP_FAILED) { 668 | perror("mmap"); 669 | return 1; 670 | } 671 | 672 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 673 | cq_ptr = sq_ptr; 674 | } else { 675 | /* 分别映射到旧内核中的完成队列环形缓冲区 */ 676 | cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE, 677 | MAP_SHARED | MAP_POPULATE, 678 | s->ring_fd, IORING_OFF_CQ_RING); 679 | if (cq_ptr == MAP_FAILED) { 680 | perror("mmap"); 681 | return 1; 682 | } 683 | } 684 | 685 | .... 686 | 687 | /* 映射到提交队列条目数组 */ 688 | s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), 689 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, 690 | s->ring_fd, IORING_OFF_SQES); 691 | if (s->sqes == MAP_FAILED) { 692 | perror("mmap"); 693 | return 1; 694 | } 695 | ``` 696 | 697 | 我们将重要的细节保存在我们的 app_io_sq_ring 和 app_io_cq_ring 结构中,以供日后参考。虽然我们分别映射了两个用于提交和完成的环形缓冲区,但您可能想知道第二个映射是做什么用的:虽然完成队列环直接索引 CQE 的共享数组,但提交环与 SQE 数组之间有一个间接数组。提交端环形缓冲区是该数组的索引,该数组又包含 SQE 的索引。这对于将提交请求嵌入内部数据结构的某些应用程序很有用。这种设置允许他们一次性提交多个提交条目,同时让他们更容易采用 io_uring。 698 | 699 | 注意:在内核版本 5.4 及更高版本中,单个 mmap() 映射同时映射提交和完成队列。然而,在较旧的内核中,它们需要单独映射。您可以通过检查 IORING_FEAT_SINGLE_MMAP 功能标志,来检查内核是否能用一次映射同时映射两个队列,而不是检查内核版本,就像我们在上面的代码中所做的那样。 700 | 701 | #### 处理共享的环形缓冲区 702 | 703 | 在常规编程中,我们习惯于处理用户空间和内核之间非常清晰的接口:系统调用。然而,系统调用确实有成本,并且对于像 io_uring 那样的高性能接口,我们希望尽可能多地省去它们。前面我们看到,使用 io_uring 允许我们批量处理许多 I/O 请求并对 io_uring_enter() 系统调用进行一次调用,而不是像通常那样进行多次系统调用。在轮询模式下,甚至不需要调用。 704 | 705 | 从用户空间读取或更新共享环形缓冲区时,需要注意确保读取时看到的是最新数据,更新后“刷新”或“同步”写入,以便内核会看到您的更新。这是因为 CPU 可以重新排序读取和写入,编译器也可以。当这发生在同一 CPU 上时,这通常不是问题。但是在 io_uring 中,当在两个不同的上下文(用户空间和内核)中涉及共享缓冲区时,在上下文切换后,它们可以在不同的 CPU 上运行。您需要从用户空间确保在读取之前,旧的写入是可见的。或者,当您在 SQE 中填写详细信息并更新提交环形缓冲区的尾部时,您希望确保对 SQE
成员所做的写入,在更新环形缓冲区尾部的写入之前是按顺序的。如果这些写入没有按顺序进行,内核可能会看到尾部更新,但是当它读取 SQE 时,它可能找不到它读取时需要的所有数据。在轮询模式下,内核自动发现尾部的变化,这会成为一个真正的问题。这完全是因为 CPU 和编译器会重新排序读取和写入,以进行优化。 706 | 707 | #### 读取完成队列条目 708 | 709 | 与往常一样,我们首先处理事情的完成方面,因为它比提交方面更简单。对于完成事件,内核将 CQE 添加到环形缓冲区并更新尾部,而我们在用户空间从头部读取。与任何环形缓冲区一样,如果头部和尾部相等,则表示环形缓冲区为空。看看下面的代码: 710 | 711 | ```c 712 | unsigned head; 713 | head = cqring->head; 714 | read_barrier(); /* ensure previous writes are visible */ 715 | if (head != cqring->tail) { 716 | /* There is data available in the ring buffer */ 717 | struct io_uring_cqe *cqe; 718 | unsigned index; 719 | index = head & (cqring->mask); 720 | cqe = &cqring->cqes[index]; 721 | /* process completed cqe here */ 722 | ... 723 | /* we've now consumed this entry */ 724 | head++; 725 | } 726 | cqring->head = head; 727 | write_barrier(); 728 | ``` 729 | 730 | 要获取头部的索引,应用程序需要用环形缓冲区的大小掩码对头部做按位与运算。请记住,上面代码中的任何行都可以在上下文切换后运行。所以,就在比较之前,我们有一个 read_barrier()。这样,如果内核确实更新了尾部,我们可以在 if 语句中将其作为比较的一部分读取。一旦我们获得 CQE 并处理它,我们就会更新头部,让内核知道我们已经消耗了环形缓冲区中的一个条目。最后的 write_barrier() 确保我们所做的写入变得可见,以便内核知道它。 731 | 732 | #### 提交 733 | 734 | 提交与读取完成事件相反。在完成时内核将条目添加到尾部,我们从环形缓冲区的头部读取条目,但在提交时,我们添加到尾部,内核从环形缓冲区的头部读取条目。 735 | 736 | ```c 737 | struct io_uring_sqe *sqe; 738 | unsigned tail, index; 739 | tail = sqring->tail; 740 | index = tail & (*sqring->ring_mask); 741 | sqe = &sqring->sqes[index]; 742 | /* this function call fills in the SQE details for this IO request */ 743 | app_init_io(sqe); 744 | /* fill the SQE index into the SQ ring array */ 745 | sqring->array[index] = index; 746 | tail++; 747 | write_barrier(); 748 | sqring->tail = tail; 749 | write_barrier(); 750 | ``` 751 | 752 | 在上面的代码片段中,app_init_io() 函数填写提交请求的详细信息。在更新尾部之前,我们有一个 write_barrier(),用来确保此前的写入在尾部更新之前是有序的。然后我们更新尾部, 并再次调用 write_barrier() 以确保我们的更新被内核看到。 -------------------------------------------------------------------------------- /document/io_uring-by-example/io_uring-by-example2.md: -------------------------------------------------------------------------------- 1 | # io_uring
从原理到动手实践 part2: liburing 2 | 3 | ## 前言 4 | 5 | 上一篇提及了 io_uring 的基本使用方式 6 | 7 | ## 使用 liburing 的 cat 8 | 9 | 使用 io_uring 构建像读取文件的程序这样简单的东西可能并不像 io_uring 那么直观. 事实证明,它比使用同步 I/O 读取文件的程序有更多的代码。但是,如果您分析 cat_uring 的代码,您会发现大部分代码都有样板代码,可以很容易地将其隐藏在单独的文件中,并且不影响应用程序逻辑。在任何情况下,我们都是有目的地学习 io_uring 的低级细节,以便更好地理解它是如何工作的。但是,如果您打算在正在构建的实际应用程序中使用 io_uring,您可能不应该直接使用原始接口。您应该改用 liburing,这是一个很好的高级包装器. 10 | 11 | 现在让我们看看如何使用 liburing 构建 cat_uring. 我们将称之为cat_liburing。 12 | 13 | ```c 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #define QUEUE_DEPTH 1 23 | #define BLOCK_SZ 1024 24 | 25 | ... 26 | 27 | /* 28 | * Wait for a completion to be available, fetch the data from 29 | * the readv operation and print it to the console. 30 | * */ 31 | 32 | int get_completion_and_print(struct io_uring *ring) { 33 | struct io_uring_cqe *cqe; 34 | int ret = io_uring_wait_cqe(ring, &cqe); 35 | if (ret < 0) { 36 | perror("io_uring_wait_cqe"); 37 | return 1; 38 | } 39 | if (cqe->res < 0) { 40 | fprintf(stderr, "Async readv failed.\n"); 41 | return 1; 42 | } 43 | struct file_info *fi = io_uring_cqe_get_data(cqe); 44 | int blocks = (int) fi->file_sz / BLOCK_SZ; 45 | if (fi->file_sz % BLOCK_SZ) blocks++; 46 | for (int i = 0; i < blocks; i ++) 47 | output_to_console(fi->iovecs[i].iov_base, fi->iovecs[i].iov_len); 48 | 49 | io_uring_cqe_seen(ring, cqe); 50 | return 0; 51 | } 52 | 53 | /* 54 | * Submit the readv request via liburing 55 | * */ 56 | 57 | int submit_read_request(char *file_path, struct io_uring *ring) { 58 | int file_fd = open(file_path, O_RDONLY); 59 | if (file_fd < 0) { 60 | perror("open"); 61 | return 1; 62 | } 63 | off_t file_sz = get_file_size(file_fd); 64 | off_t bytes_remaining = file_sz; 65 | off_t offset = 0; 66 | int current_block = 0; 67 | int blocks = (int) file_sz / BLOCK_SZ; 68 | if (file_sz % BLOCK_SZ) blocks++; 69 | struct file_info *fi = malloc(sizeof(*fi) + 70 | (sizeof(struct iovec) * blocks)); 71 | 72 | /* 73 | * For each 
block of the file we need to read, we allocate an iovec struct 74 | * which is indexed into the iovecs array. This array is passed in as part 75 | * of the submission. If you don't understand this, then you need to look 76 | * up how the readv() and writev() system calls work. 77 | * */ 78 | while (bytes_remaining) { 79 | off_t bytes_to_read = bytes_remaining; 80 | if (bytes_to_read > BLOCK_SZ) 81 | bytes_to_read = BLOCK_SZ; 82 | 83 | offset += bytes_to_read; 84 | fi->iovecs[current_block].iov_len = bytes_to_read; 85 | 86 | void *buf; 87 | if( posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) { 88 | perror("posix_memalign"); 89 | return 1; 90 | } 91 | fi->iovecs[current_block].iov_base = buf; 92 | 93 | current_block++; 94 | bytes_remaining -= bytes_to_read; 95 | } 96 | fi->file_sz = file_sz; 97 | 98 | /* Get an SQE */ 99 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 100 | /* Setup a readv operation */ 101 | io_uring_prep_readv(sqe, file_fd, fi->iovecs, blocks, 0); 102 | /* Set user data */ 103 | io_uring_sqe_set_data(sqe, fi); 104 | /* Finally, submit the request */ 105 | io_uring_submit(ring); 106 | 107 | return 0; 108 | } 109 | 110 | int main(int argc, char *argv[]) { 111 | struct io_uring ring; 112 | 113 | if (argc < 2) { 114 | fprintf(stderr, "Usage: %s [file name] <[file name] ...>\n", 115 | argv[0]); 116 | return 1; 117 | } 118 | 119 | /* Initialize io_uring */ 120 | io_uring_queue_init(QUEUE_DEPTH, &ring, 0); 121 | 122 | for (int i = 1; i < argc; i++) { 123 | int ret = submit_read_request(argv[i], &ring); 124 | if (ret) { 125 | fprintf(stderr, "Error reading file: %s\n", argv[i]); 126 | return 1; 127 | } 128 | get_completion_and_print(&ring); 129 | } 130 | 131 | /* Call the clean-up function. 
*/ 132 | io_uring_queue_exit(&ring); 133 | return 0; 134 | } 135 | ``` 136 | 137 | 让我们比较每个实现所花费的行数: 138 | 139 | 普通 cat:~120 行 140 | 原始 io_uring 的 cat:~360 行 141 | 使用 liburing 的 cat:~160 行 142 | 现在,使用 liburing, 随着所有样板代码的消失,逻辑就会变得明显起来。让我们快速浏览一下。我们这样初始化 io_uring: 143 | 144 | ```c 145 | io_uring_queue_init(QUEUE_DEPTH, &ring, 0); 146 | ``` 147 | 148 | 在函数 submit_read_request() 中,我们获得一个 SQE,为readv操作准备它,并提交它。 149 | 150 | ```c 151 | /* Get an SQE */ 152 | struct io_uring_sqe *sqe = io_uring_get_sqe(ring); 153 | /* Setup a readv operation */ 154 | io_uring_prep_readv(sqe, file_fd, fi->iovecs, blocks, 0); 155 | /* Set user data */ 156 | io_uring_sqe_set_data(sqe, fi); 157 | /* Finally, submit the request */ 158 | io_uring_submit(ring); 159 | ``` 160 | 161 | 我们等待事件完成并获取我们在提交端设置的用户数据,如下所示: 162 | 163 | ```c 164 | struct io_uring_cqe *cqe; 165 | int ret = io_uring_wait_cqe(ring, &cqe); 166 | struct file_info *fi = io_uring_cqe_get_data(cqe); 167 | ``` 168 | 169 | 当然,与使用原始接口相比,这使用起来要简单得多。 -------------------------------------------------------------------------------- /document/io_uring-by-example/io_uring-by-example3.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yunwei37/co-uring-WebServer/878911c6669ac8bf1abb1e11b224e6cdc86aa6f7/document/io_uring-by-example/io_uring-by-example3.md -------------------------------------------------------------------------------- /document/part1.md: -------------------------------------------------------------------------------- 1 | # 使用 c++20 协程与 io_uring 实现高性能 web 服务器 part1:一个最简单的 echo server 2 | 3 | 如果您不熟悉 io_uring 和 c++20 协程的,可以参考这个仓库里的其他一些文章和示例代码: 4 | 5 | [github.com/yunwei37/co-uring-WebServer](https://github.com/yunwei37/co-uring-WebServer) 6 | 7 | 这个版本的代码由 https://github.com/frevib/io_uring-echo-server 改造而来,是希望通过在 io_uring 的基础上,尝试实现最基本的协程 IO 模式,然后进行性能对比。之前的版本使用了一个 event loop 的模式,并通过 io_uring 的 IORING_OP_PROVIDE_BUFFERS 参数和 IORING_FEAT_FAST_POLL 参数,实现了零拷贝和内核线程的 
polling,不需要额外的系统调用开销。 8 | 9 | 本文在 io_uring-echo-server 的基础上增添了一个简易的协程实现,完整的 demo 代码实现在这里:[github.com/yunwei37/co-uring-WebServer/blob/master/demo/io_uring_coroutine_echo_server.cpp](https://github.com/yunwei37/co-uring-WebServer/blob/master/demo/io_uring_coroutine_echo_server.cpp) 10 | 11 | ## 协程实现 12 | 13 | 原先的代码包含一个 event loop,大致是这样(忽略具体细节),进行 IO 和完成 IO 的逻辑是完全分开的: 14 | 15 | ```cpp 16 | 17 | add_accept(....); 18 | // start event loop 19 | while (1) { 20 | io_uring_for_each_cqe(&ring, head, cqe) { 21 | if (type == ACCEPT) { 22 | add_read(....); 23 | add_accept(....); 24 | } else if (type == READ) { 25 | if (read_cout <= 0) { 26 | close(); 27 | } 28 | add_write(....); 29 | } else if (type == WRITE) { 30 | add_read(....); 31 | } 32 | } 33 | } 34 | ``` 35 | 36 | 这里简单介绍一下协程的实现方式。使用协程的 co_await 关键字,可以让 IO 的异步回调变得更自然,例如对于一个连接进行 echo,我们的协程版本可以写成这样: 37 | 38 | ```cpp 39 | conn_task handle_echo(int fd) { 40 | while (true) { 41 | size_t size_r = co_await echo_read(MAX_MESSAGE_LEN, IOSQE_BUFFER_SELECT); 42 | if (size_r <= 0) { 43 | co_await echo_add_buffer(); 44 | shutdown(fd, SHUT_RDWR); 45 | connections.erase(fd); 46 | co_return; 47 | } 48 | co_await echo_write(size_r, 0); 49 | co_await echo_add_buffer(); 50 | } 51 | } 52 | ``` 53 | 54 | 这里 handle_echo 里面的 read 和 write,和同步的调用编写方式基本一样,只是在前面使用了 co_await 关键字,指明了该函数是个协程。 55 | 根据 C++ 的规范,这里的协程是无栈协程,需要实现一个 task 和 promise_type,例如: 56 | 57 | ```cpp 58 | struct conn_task { 59 | struct promise_type 60 | { 61 | using Handle = std::coroutine_handle; 62 | conn_task get_return_object() 63 | { 64 | return conn_task{Handle::from_promise(*this)}; 65 | } 66 | std::suspend_always initial_suspend() noexcept { 67 | return {}; 68 | } 69 | std::suspend_never final_suspend() noexcept { return {}; } 70 | void return_void() noexcept {} 71 | void unhandled_exception() noexcept {} 72 | struct io_uring *ring; 73 | struct conn_info conn_info; 74 | size_t res; 75 | }; 76 | promise_type::Handle handler; 77 | }; 78 | ``` 79 | 80 | 以 write 为例,它返回一个 awaitable 
对象: 81 | 82 | ```cpp 83 | auto echo_write(size_t message_size, unsigned flags) { 84 | struct awaitable { 85 | bool await_ready() { return false; } 86 | void await_suspend(std::coroutine_handle h) { 87 | auto &p = h.promise(); 88 | struct io_uring_sqe *sqe = io_uring_get_sqe(p.ring); 89 | io_uring_prep_send(sqe, p.conn_info.fd, &bufs[p.conn_info.bid], message_size, 0); 90 | io_uring_sqe_set_flags(sqe, flags); 91 | p.conn_info.type = WRITE; 92 | memcpy(&sqe->user_data, &p.conn_info, sizeof(conn_info)); 93 | } 94 | size_t await_resume() { 95 | return 0; 96 | } 97 | size_t message_size; 98 | unsigned flags; 99 | }; 100 | return awaitable{message_size, flags}; 101 | } 102 | ``` 103 | 104 | 实际上,在运行到 write 调用时,由于 awaitable 对象中的 await_ready 返回 false,协程会在调用 await_suspend 之后停下来,回到主循环,在主循环中,当我们接收到 write 的调用时,只需要简单地通过协程句柄让协程继续运行: 105 | 106 | ```cpp 107 | .... 108 | if (type == WRITE) { 109 | h.resume(); 110 | } 111 | .... 112 | ``` 113 | 114 | 此时协程会从 await_resume() 中继续运行,并将 await_resume 的返回值作为 write 的返回值。具体细节可以参考仓库中的代码实现。 115 | 116 | ## benchmark 117 | 118 | - 运行环境:`Linux ubuntu 5.11.0-41-generic #45~20.04.1-Ubuntu SMP Wed Nov 10 10:20:10 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux` 119 | - vmware 16, 8GB ram,Intel Core i7-10750H,2 cores,4 Logical processors; 120 | - 编译指令:`g++ io_uring_echo_server.cpp -o ./io_uring_echo_server -Wall -O3 -D_GNU_SOURCE -luring -std=c++2a -fcoroutines` 121 | - benchmark tool:[https://github.com/haraldh/rust_echo_bench](https://github.com/haraldh/rust_echo_bench),使用 taskset 将其与一个核心绑定: 122 | 123 | ### io_uring with coroutine 124 | 125 | source:[github.com/yunwei37/co-uring-WebServer/blob/master/demo/io_uring_coroutine_echo_server.cpp](https://github.com/yunwei37/co-uring-WebServer/blob/master/demo/io_uring_coroutine_echo_server.cpp) 126 | 127 | request/sec, 60 sec: 128 | 129 | | clients | 1 | 50 | 150 | 300 | 500 | 130 | |:----------:|:-----:|:------:|:------:|:------:|:------:| 131 | | 128 bytes | 28635 | 39206 | 38985 | 35658 | 35013 | 132 | | 512 
bytes | 34693 | 40981 | 40536 | 36040 | 35251 | 133 | | 1000 bytes | 22304 | 46619 | 43915 | 35162 | 34618 | 134 | 135 | ### io_uring without coroutine 136 | 137 | source:[github.com/yunwei37/co-uring-WebServer/blob/master/demo/io_uring_echo_server.cpp](https://github.com/yunwei37/co-uring-WebServer/blob/master/demo/io_uring_echo_server.cpp) 138 | 139 | request/sec, 60 sec: 140 | 141 | | clients | 1 | 50 | 150 | 300 | 500 | 142 | |:----------:|:-----:|:------:|:------:|:------:|:------:| 143 | | 128 bytes | 25405 | 35736 | 37010 | 28093 | 26337 | 144 | | 512 bytes | 26207 | 36847 | 39342 | 32921 | 38786 | 145 | | 1000 bytes | 26077 | 36865 | 39312 | 35115 | 52847 | 146 | 147 | 可能是机器性能的原因,在多线程情况下提升并没有很大。 148 | 149 | 简单画个图对比一下,可以发现仅仅是简单应用协程的情况下,不仅异步编程模型清晰了不少,性能也获得了一点提升: 150 | 151 | ![128](128.png) 152 | 153 | 测试脚本: 154 | 155 | ```bash 156 | #!/bin/bash 157 | echo $(uname -a) 158 | 159 | if [ "$#" -ne 1 ]; then 160 | echo "Please give port where echo server is running: $0 [port]" 161 | exit 162 | fi 163 | 164 | PID=$(lsof -itcp:$1 | sed -n -e 2p | awk '{print $2}') 165 | taskset -cp 0 $PID 166 | 167 | for bytes in 1 128 512 1000 168 | do 169 | for connections in 1 50 150 300 500 170 | do 171 | cargo run --release -- --address "localhost:$1" --number $connections --duration 60 --length $bytes 172 | sleep 4 173 | done 174 | done 175 | ``` 176 | 177 | # /usr/bin/time 178 | 179 | ``` 180 | yunwei@ubuntu:~/Desktop/co-uring$ /usr/bin/time --verbose build/demo/io_uring_coroutine_echo_server 1234 181 | io_uring echo server listening for connections on port: 1234 182 | Command terminated by signal 2 183 | Command being timed: "build/demo/io_uring_coroutine_echo_server 1234" 184 | User time (seconds): 0.75 185 | System time (seconds): 51.64 186 | Percent of CPU this job got: 80% 187 | Elapsed (wall clock) time (h:mm:ss or m:ss): 1:05.09 188 | Average shared text size (kbytes): 0 189 | Average unshared data size (kbytes): 0 190 | Average stack size (kbytes): 0 191 | Average 
total size (kbytes): 0 192 | Maximum resident set size (kbytes): 3656 193 | Average resident set size (kbytes): 0 194 | Major (requiring I/O) page faults: 0 195 | Minor (reclaiming a frame) page faults: 202 196 | Voluntary context switches: 38959 197 | Involuntary context switches: 56786 198 | Swaps: 0 199 | File system inputs: 0 200 | File system outputs: 0 201 | Socket messages sent: 0 202 | Socket messages received: 0 203 | Signals delivered: 0 204 | Page size (bytes): 4096 205 | Exit status: 0 206 | yunwei@ubuntu:~/Desktop/co-uring$ /usr/bin/time --verbose build/demo/io_uring_echo_server 1234 207 | io_uring echo server listening for connections on port: 1234 208 | Command terminated by signal 2 209 | Command being timed: "build/demo/io_uring_echo_server 1234" 210 | User time (seconds): 0.32 211 | System time (seconds): 51.45 212 | Percent of CPU this job got: 76% 213 | Elapsed (wall clock) time (h:mm:ss or m:ss): 1:07.60 214 | Average shared text size (kbytes): 0 215 | Average unshared data size (kbytes): 0 216 | Average stack size (kbytes): 0 217 | Average total size (kbytes): 0 218 | Maximum resident set size (kbytes): 1776 219 | Average resident set size (kbytes): 0 220 | Major (requiring I/O) page faults: 0 221 | Minor (reclaiming a frame) page faults: 135 222 | Voluntary context switches: 36851 223 | Involuntary context switches: 53051 224 | Swaps: 0 225 | File system inputs: 0 226 | File system outputs: 0 227 | Socket messages sent: 0 228 | Socket messages received: 0 229 | Signals delivered: 0 230 | Page size (bytes): 4096 231 | Exit status: 0 232 | ``` 233 | 234 | 235 | ## reference 236 | 237 | - [https://github.com/frevib/io_uring-echo-server](https://github.com/frevib/io_uring-echo-server) 238 | - [https://git.kernel.dk/cgit/liburing/](https://git.kernel.dk/cgit/liburing/) -------------------------------------------------------------------------------- /server/http_conn.h: 
-------------------------------------------------------------------------------- 1 | #ifndef HTTPCONNECTION_H 2 | #define HTTPCONNECTION_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "stream.h" 11 | #include 12 | 13 | constexpr int FILENAME_LEN = 200; 14 | constexpr int READ_BUFFER_SIZE = 2048; 15 | constexpr int WRITE_BUFFER_SIZE = 1024; 16 | 17 | class http_conn 18 | { 19 | public: 20 | enum METHOD 21 | { 22 | GET = 0, 23 | POST, 24 | HEAD, 25 | PUT, 26 | DELETE, 27 | TRACE, 28 | OPTIONS, 29 | CONNECT, 30 | PATH, 31 | UNIMPLEMENT_MOTHOD 32 | }; 33 | enum HTTP_CODE 34 | { 35 | NO_REQUEST, 36 | GET_REQUEST, 37 | BAD_REQUEST, 38 | NO_FOUND, 39 | FORBIDDEN_REQUEST, 40 | FILE_REQUEST, 41 | INTERNAL_ERROR, 42 | CLOSED_CONNECTION, 43 | }; 44 | 45 | public: 46 | http_conn() {} 47 | ~http_conn() {} 48 | HTTP_CODE handle_request(char *text_buffer); 49 | size_t get_response_size(); 50 | 51 | private: 52 | HTTP_CODE parse_request(char *text); 53 | bool process_write(HTTP_CODE ret); 54 | HTTP_CODE do_request(); 55 | void add_content(); 56 | void add_headers(); 57 | void add_content_type(); 58 | void add_content_length(); 59 | bool add_linger(); 60 | bool add_blank_line(); 61 | 62 | private: 63 | int fd; 64 | METHOD method; 65 | size_t response_size; 66 | char final_path[256]; 67 | char* response_pointer; 68 | size_t content_size; 69 | static const char *unimplemented_content; 70 | static const char *http_404_content; 71 | static const char *ok_header; 72 | }; 73 | 74 | const char *http_conn::unimplemented_content = \ 75 | "HTTP/1.0 400 Bad Request\r\n" 76 | "Content-type: text/html\r\n" 77 | "\r\n" 78 | "" 79 | "" 80 | "ZeroHTTPd: Unimplemented" 81 | "" 82 | "" 83 | "

Bad Request (Unimplemented)

" 84 | "

Your client sent a request ZeroHTTPd did not understand and it is probably not your fault.

" 85 | "" 86 | ""; 87 | 88 | const char *http_conn::http_404_content = \ 89 | "HTTP/1.0 404 Not Found\r\n" 90 | "Content-type: text/html\r\n" 91 | "\r\n" 92 | "" 93 | "" 94 | "ZeroHTTPd: Not Found" 95 | "" 96 | "" 97 | "

Not Found (404)

" 98 | "

Your client is asking for an object that was not found on this server.

" 99 | "" 100 | ""; 101 | 102 | const char* http_conn::ok_header = "HTTP/1.0 200 OK\r\n" 103 | "Server: zerohttpd/0.1\r\n"; 104 | /* Appends the contents of final_path at response_pointer, NUL-terminates the result and advances response_pointer past the bytes read. A failed read() is reported with perror() and treated as an empty body, so the negative return value is never used as an index. NOTE(review): up to 2048 bytes are read no matter how much of the destination buffer the headers already occupy -- confirm the buffer is large enough. */ 105 | void http_conn::add_content() { 106 | int fd; 107 | 108 | fd = open(final_path, O_RDONLY); 109 | if (fd < 0) 110 | fatal_error("open"); 111 | 112 | /* We should really check for short reads here */ 113 | ssize_t ret = read(fd, response_pointer, 2048); if (ret < 0) { perror("read"); ret = 0; } 114 | response_pointer[ret] = 0; 115 | response_pointer += ret; 116 | close(fd); 117 | } 118 | 119 | /* 120 | * Simple function to get the file extension of the file that we are about to serve. 121 | * */ 122 | 123 | const char *get_filename_ext(const char *filename) { 124 | const char *dot = strrchr(filename, '.'); 125 | if (!dot || dot == filename) 126 | return ""; 127 | return dot + 1; 128 | } 129 | 130 | void http_conn::add_content_type() { 131 | const char *file_ext = get_filename_ext(final_path); 132 | if (strcmp("jpg", file_ext) == 0) 133 | strcpy(response_pointer, "Content-Type: image/jpeg\r\n"); 134 | if (strcmp("jpeg", file_ext) == 0) 135 | strcpy(response_pointer, "Content-Type: image/jpeg\r\n"); 136 | if (strcmp("png", file_ext) == 0) 137 | strcpy(response_pointer, "Content-Type: image/png\r\n"); 138 | if (strcmp("gif", file_ext) == 0) 139 | strcpy(response_pointer, "Content-Type: image/gif\r\n"); 140 | if (strcmp("htm", file_ext) == 0) 141 | strcpy(response_pointer, "Content-Type: text/html\r\n"); 142 | if (strcmp("html", file_ext) == 0) 143 | strcpy(response_pointer, "Content-Type: text/html\r\n"); 144 | if (strcmp("js", file_ext) == 0) 145 | strcpy(response_pointer, "Content-Type: application/javascript\r\n"); 146 | if (strcmp("css", file_ext) == 0) 147 | strcpy(response_pointer, "Content-Type: text/css\r\n"); 148 | if (strcmp("txt", file_ext) == 0) 149 | strcpy(response_pointer, "Content-Type: text/plain\r\n"); 150 | response_pointer += strlen(response_pointer); 151 | } 152 | 153 | void http_conn::add_content_length() { 154 | sprintf(response_pointer, "content-length: 
%zu\r\n", content_size); /* content_size is a size_t, so %zu is the matching conversion */ 155 | response_pointer += strlen(response_pointer); 156 | } 157 | 158 | void http_conn::add_headers() { 159 | strcpy(response_pointer, ok_header); 160 | response_pointer += strlen(response_pointer); 161 | add_content_type(); 162 | add_content_length(); 163 | strcpy(response_pointer, "\r\n"); 164 | response_pointer += strlen(response_pointer); 165 | } 166 | /* Parses the request held in text_buffer and overwrites text_buffer with the response: the canned 400 page for BAD_REQUEST, the canned 404 page for NO_FOUND, or headers followed by the file contents for GET_REQUEST. In those three cases response_size is set to the length of what was written; for any other code it is left unchanged. Returns the parse result. */ 167 | http_conn::HTTP_CODE http_conn::handle_request(char *text_buffer) { 168 | HTTP_CODE code = parse_request(text_buffer); 169 | if (code == BAD_REQUEST) { 170 | strcpy(text_buffer, unimplemented_content); 171 | response_size = strlen(unimplemented_content); 172 | } else if (code == NO_FOUND) { 173 | strcpy(text_buffer, http_404_content); 174 | /* size of the page actually copied above, not of the 400 page */ response_size = strlen(http_404_content); 175 | } else if (code == GET_REQUEST) { 176 | response_pointer = text_buffer; 177 | add_headers(); 178 | add_content(); 179 | response_size = response_pointer - text_buffer; 180 | } 181 | return code; 182 | } 183 | 184 | int get_first_line(char *src, size_t max_length) { 185 | size_t length = strlen(src) > max_length? 
max_length:strlen(src); 186 | for (size_t i = 0; i < length; i++) { 187 | if (src[i] == '\r' && src[i+1] == '\n') { 188 | src[i] = '\0'; 189 | return 1; 190 | } 191 | } 192 | return 0; 193 | } 194 | /* Parses the request line in text (modified in place): the first space-separated token is the method, the second the path. Returns BAD_REQUEST when the method is missing or is not GET, or when the path is missing or too long for final_path; NO_FOUND when the target cannot be stat()ed or is not a regular file; GET_REQUEST otherwise, with final_path and content_size filled in. A path ending in '/' is mapped to "index.html". */ 195 | http_conn::HTTP_CODE http_conn::parse_request(char *text) { 196 | char *method, *path, *saveptr; 197 | struct stat path_stat; 198 | 199 | get_first_line(text, 2048); 200 | method = strtok_r(text, " ", &saveptr); /* strtok_r returns NULL for an empty or all-blank request line */ if (method == NULL) { this->method = UNIMPLEMENT_MOTHOD; return BAD_REQUEST; } 201 | strtolower(method); 202 | path = strtok_r(NULL, " ", &saveptr); 203 | 204 | if (strcmp(method, "get") == 0) { 205 | this->method = GET; 206 | } else { 207 | this->method = UNIMPLEMENT_MOTHOD; 208 | return BAD_REQUEST; 209 | } 210 | /* reject a missing path, or one that would not fit in final_path together with its terminator */ if (path == NULL || strlen(path) >= sizeof(final_path)) return BAD_REQUEST; 211 | if (path[strlen(path) - 1] == '/') { 212 | strcpy(final_path, "index.html"); 213 | } 214 | else { 215 | strcpy(final_path, path); 216 | } 217 | if (stat(final_path, &path_stat) == -1) { 218 | printf("404 Not Found: %s (%s)\n", final_path, path); 219 | return NO_FOUND; 220 | } 221 | if (S_ISREG(path_stat.st_mode)) { 222 | content_size = path_stat.st_size; 223 | return GET_REQUEST; 224 | } else { 225 | return NO_FOUND; 226 | } 227 | } 228 | 229 | size_t http_conn::get_response_size() { 230 | return response_size; 231 | } 232 | 233 | #endif 234 | -------------------------------------------------------------------------------- /server/io_uring.h: -------------------------------------------------------------------------------- 1 | #ifndef IO_URING_H 2 | #define IO_URING_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "utils.h" 14 | #include "task.h" 15 | 16 | constexpr size_t MAX_MESSAGE_LEN = 2048; 17 | constexpr size_t BUFFERS_COUNT = 4096; 18 | 19 | class io_uring_handler 20 | { 21 | public: 22 | io_uring_handler(unsigned entries, int sock_listen_fd); 23 | void event_loop(task func(int)); 24 | void setup_first_buffer(); 25 | ~io_uring_handler(); 26 | void add_read_request(int fd, request &req); 27 | void add_write_request(int 
fd, size_t message_size, request &req); 28 | void add_accept_request(int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags); 29 | void add_buffer_request(request &req); 30 | void add_open_request(); 31 | void add_close_request(int fd); 32 | 33 | char* get_buffer_pointer(int bid) { 34 | return buffer[bid]; 35 | } 36 | 37 | int get_buffer_id(char* buffer) { 38 | return (buffer - (char*)this->buffer.get()) / MAX_MESSAGE_LEN; 39 | } 40 | 41 | private: 42 | struct io_uring ring; 43 | std::unique_ptr buffer; 44 | std::map connections; 45 | struct sockaddr_in client_addr; 46 | socklen_t client_len = sizeof(client_addr); 47 | int sock_listen_fd; 48 | }; 49 | 50 | io_uring_handler::io_uring_handler(unsigned entries, int sock_listen_fd) 51 | { 52 | struct io_uring_params params; 53 | memset(¶ms, 0, sizeof(params)); 54 | this->sock_listen_fd = sock_listen_fd; 55 | 56 | if (io_uring_queue_init_params(entries, &ring, ¶ms) < 0) 57 | { 58 | perror("io_uring_init_failed...\n"); 59 | exit(1); 60 | } 61 | 62 | // check if IORING_FEAT_FAST_POLL is supported 63 | if (!(params.features & IORING_FEAT_FAST_POLL)) 64 | { 65 | printf("IORING_FEAT_FAST_POLL not available in the kernel, quiting...\n"); 66 | exit(0); 67 | } 68 | 69 | struct io_uring_probe *probe; 70 | probe = io_uring_get_probe_ring(&ring); 71 | if (!probe || !io_uring_opcode_supported(probe, IORING_OP_PROVIDE_BUFFERS)) 72 | { 73 | printf("Buffer select not supported, skipping...\n"); 74 | exit(0); 75 | } 76 | free(probe); 77 | 78 | setup_first_buffer(); 79 | } 80 | 81 | void io_uring_handler::event_loop(task handle_event(int)) 82 | { 83 | // start event loop 84 | log("start event loop"); 85 | add_accept_request(sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 86 | while (1) 87 | { 88 | io_uring_submit_and_wait(&ring, 1); 89 | struct io_uring_cqe *cqe; 90 | unsigned head; 91 | unsigned count = 0; 92 | 93 | // go through all CQEs 94 | io_uring_for_each_cqe(&ring, head, cqe) 95 | { 96 | 
++count; 97 | request conn_i; 98 | memcpy(&conn_i, &cqe->user_data, sizeof(conn_i)); 99 | 100 | int type = conn_i.event_type; 101 | if (cqe->res == -ENOBUFS) 102 | { 103 | fprintf(stdout, "bufs in automatic buffer selection empty, this should not happen...\n"); 104 | fflush(stdout); 105 | exit(1); 106 | } 107 | else if (type == PROV_BUF) 108 | { 109 | if (cqe->res < 0) 110 | { 111 | printf("cqe->res = %d\n", cqe->res); 112 | exit(1); 113 | } 114 | } 115 | else if (type == ACCEPT) 116 | { 117 | int sock_conn_fd = cqe->res; 118 | // only read when there is no error, >= 0 119 | log("accept in io_uring_for_each_cqe"); 120 | if (sock_conn_fd >= 0) 121 | { 122 | connections.emplace(sock_conn_fd, handle_event(sock_conn_fd)); 123 | auto &h = connections.at(sock_conn_fd).handler; 124 | auto &p = h.promise(); 125 | p.request_info.client_socket = sock_conn_fd; 126 | p.uring = this; 127 | h.resume(); 128 | } 129 | 130 | // new connected client; read data from socket and re-add accept to monitor for new connections 131 | add_accept_request(sock_listen_fd, (struct sockaddr *)&client_addr, &client_len, 0); 132 | } 133 | else if (type == READ) 134 | { 135 | auto &h = connections.at(conn_i.client_socket).handler; 136 | auto &p = h.promise(); 137 | p.request_info.bid = cqe->flags >> 16; 138 | p.res = cqe->res; 139 | h.resume(); 140 | } 141 | else if (type == WRITE) 142 | { 143 | auto &h = connections.at(conn_i.client_socket).handler; 144 | h.promise().res = cqe->res; 145 | h.resume(); 146 | } 147 | } 148 | 149 | io_uring_cq_advance(&ring, count); 150 | } 151 | } 152 | 153 | void io_uring_handler::setup_first_buffer() 154 | { 155 | buffer.reset(new char[BUFFERS_COUNT][MAX_MESSAGE_LEN]); 156 | 157 | // register buffers for buffer selection 158 | struct io_uring_sqe *sqe; 159 | struct io_uring_cqe *cqe; 160 | 161 | sqe = io_uring_get_sqe(&ring); 162 | io_uring_prep_provide_buffers(sqe, buffer.get(), MAX_MESSAGE_LEN, BUFFERS_COUNT, group_id, 0); 163 | 164 | io_uring_submit(&ring); 165 | 
io_uring_wait_cqe(&ring, &cqe); 166 | if (cqe->res < 0) 167 | { 168 | printf("cqe->res = %d\n", cqe->res); 169 | exit(1); 170 | } 171 | io_uring_cqe_seen(&ring, cqe); 172 | } 173 | 174 | io_uring_handler::~io_uring_handler() 175 | { 176 | io_uring_queue_exit(&ring); 177 | } 178 | 179 | void io_uring_handler::add_read_request(int fd, request &req) 180 | { 181 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 182 | io_uring_prep_recv(sqe, fd, NULL, MAX_MESSAGE_LEN, 0); 183 | io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT); 184 | sqe->buf_group = group_id; 185 | req.event_type = READ; 186 | sqe->user_data = req.uring_data; 187 | } 188 | 189 | void io_uring_handler::add_close_request(int fd) 190 | { 191 | shutdown(fd, SHUT_RDWR); 192 | connections.erase(fd); 193 | } 194 | 195 | 196 | void io_uring_handler::add_write_request(int fd, size_t message_size, request &req) 197 | { 198 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 199 | io_uring_prep_send(sqe, fd, &buffer[req.bid], message_size, 0); 200 | io_uring_sqe_set_flags(sqe, 0); 201 | req.event_type = WRITE; 202 | sqe->user_data = req.uring_data; 203 | log("add_write_request %lu", message_size); 204 | } 205 | 206 | void io_uring_handler::add_accept_request(int fd, struct sockaddr *client_addr, socklen_t *client_len, unsigned flags) 207 | { 208 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 209 | io_uring_prep_accept(sqe, fd, client_addr, client_len, 0); 210 | io_uring_sqe_set_flags(sqe, flags); 211 | 212 | request conn_i; 213 | conn_i.event_type = ACCEPT; 214 | conn_i.bid = 0; 215 | conn_i.client_socket = fd; 216 | 217 | sqe->user_data = conn_i.uring_data; 218 | } 219 | 220 | void io_uring_handler::add_buffer_request(request &req) 221 | { 222 | struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); 223 | io_uring_prep_provide_buffers(sqe, buffer[req.bid], MAX_MESSAGE_LEN, 1, group_id, req.bid); 224 | req.event_type = PROV_BUF; 225 | sqe->user_data = req.uring_data; 226 | } 227 | 228 | void 
io_uring_handler::add_open_request() 229 | { 230 | } 231 | 232 | #endif 233 | -------------------------------------------------------------------------------- /server/main.cpp: -------------------------------------------------------------------------------- 1 | #include "server.h" 2 | #include 3 | 4 | int main() { 5 | log("main()\n"); 6 | server server(8000); 7 | server.start(); 8 | return 0; 9 | } 10 | -------------------------------------------------------------------------------- /server/server.h: -------------------------------------------------------------------------------- 1 | #ifndef SERVER_H 2 | #define SERVER_H 3 | 4 | #include 5 | #include "io_uring.h" 6 | #include "utils.h" 7 | #include 8 | #include 9 | #include 10 | #include "task.h" 11 | #include "stream.h" 12 | #include "http_conn.h" 13 | 14 | constexpr size_t ENTRIES = 2048; 15 | 16 | class server 17 | { 18 | public: 19 | server(int port); 20 | ~server(); 21 | void start(); 22 | private: 23 | std::unique_ptr uring; 24 | int sock_fd; 25 | 26 | void setup_listening_socket(int port); 27 | }; 28 | 29 | void server::setup_listening_socket(int port) 30 | { 31 | struct sockaddr_in srv_addr = {0}; 32 | 33 | sock_fd = socket(PF_INET, SOCK_STREAM, 0); 34 | if (sock_fd == -1) 35 | fatal_error("socket()"); 36 | 37 | int enable = 1; 38 | if (setsockopt(sock_fd, 39 | SOL_SOCKET, SO_REUSEADDR, 40 | &enable, sizeof(int)) < 0) 41 | fatal_error("setsockopt(SO_REUSEADDR)"); 42 | 43 | memset(&srv_addr, 0, sizeof(srv_addr)); 44 | srv_addr.sin_family = AF_INET; 45 | srv_addr.sin_port = htons(port); 46 | srv_addr.sin_addr.s_addr = htonl(INADDR_ANY); 47 | 48 | /* We bind to a port and turn this socket into a listening 49 | * socket. 
50 | * */ 51 | if (bind(sock_fd, 52 | (const struct sockaddr *)&srv_addr, 53 | sizeof(srv_addr)) < 0) 54 | fatal_error("bind()"); 55 | 56 | if (listen(sock_fd, 10) < 0) 57 | fatal_error("listen()"); 58 | } 59 | 60 | server::server(int port) 61 | { 62 | log("server in port %d", port); 63 | setup_listening_socket(port); 64 | uring.reset(new io_uring_handler(ENTRIES, sock_fd)); 65 | } 66 | 67 | server::~server() 68 | { 69 | } 70 | 71 | task handle_http_request(int fd) { 72 | char* read_buffer; 73 | http_conn conn; 74 | 75 | log("accept request %d", fd); 76 | size_t read_bytes = co_await read_socket(&read_buffer); 77 | read_buffer[read_bytes] = 0; 78 | log("read_buffer %lu %s", read_bytes, read_buffer); 79 | conn.handle_request(read_buffer); 80 | size_t write_bytes = co_await write_socket(read_buffer, conn.get_response_size()); 81 | log("write_buffer %lu %s", write_bytes, read_buffer); 82 | co_await shutdown_socket(fd); 83 | co_return; 84 | } 85 | 86 | 87 | void server::start() { 88 | log("server::start()"); 89 | uring->event_loop(handle_http_request); 90 | } 91 | 92 | #endif 93 | -------------------------------------------------------------------------------- /server/stream.h: -------------------------------------------------------------------------------- 1 | #ifndef STREAM_H 2 | #define STREAM_H 3 | 4 | #include 5 | #include 6 | #include "io_uring.h" 7 | 8 | struct stream_base 9 | { 10 | stream_base(task::promise_type *promise, size_t message_size) 11 | : promise(promise), 12 | message_size(message_size) {} 13 | task::promise_type *promise = NULL; 14 | size_t message_size; 15 | }; 16 | 17 | struct read_awaitable : public stream_base 18 | { 19 | read_awaitable(task::promise_type *promise, size_t message_size, char **buffer_pointer) 20 | : stream_base(promise, message_size), 21 | buffer_pointer(buffer_pointer) {} 22 | bool await_ready() { return false; } 23 | void await_suspend(std::coroutine_handle h) 24 | { 25 | auto &promise = h.promise(); 26 | this->promise = 
&promise; 27 | promise.uring->add_read_request(promise.request_info.client_socket, promise.request_info); 28 | } 29 | size_t await_resume() 30 | { 31 | *buffer_pointer = promise->uring->get_buffer_pointer(promise->request_info.bid); 32 | return promise->res; 33 | } 34 | char **buffer_pointer; 35 | }; 36 | 37 | struct write_awaitable : public stream_base 38 | { 39 | write_awaitable(task::promise_type *promise, size_t message_size, char *buffer) 40 | : stream_base(promise, message_size), 41 | buffer(buffer) {} 42 | bool await_ready() { return false; } 43 | void await_suspend(std::coroutine_handle h) 44 | { 45 | auto &promise = h.promise(); 46 | this->promise = &promise; 47 | promise.request_info.bid = promise.uring->get_buffer_id(buffer); 48 | promise.uring->add_write_request(promise.request_info.client_socket, message_size, promise.request_info); 49 | log("write await_suspend %lu", message_size); 50 | } 51 | size_t await_resume() 52 | { 53 | promise->uring->add_buffer_request(promise->request_info); 54 | return promise->res; 55 | } 56 | char *buffer; 57 | }; 58 | 59 | struct read_file_awaitable : public read_awaitable 60 | { 61 | read_file_awaitable(task::promise_type *promise, size_t message_size, char **buffer_pointer, int read_fd) 62 | : read_awaitable(promise, message_size, buffer_pointer), 63 | read_fd(read_fd) {} 64 | void await_suspend(std::coroutine_handle h) 65 | { 66 | auto &promise = h.promise(); 67 | this->promise = &promise; 68 | promise.uring->add_read_request(read_fd, promise.request_info); 69 | } 70 | int read_fd; 71 | }; 72 | 73 | struct write_file_awaitable : public write_awaitable 74 | { 75 | write_file_awaitable(task::promise_type *promise, size_t message_size, char *buffer, int write_fd) 76 | : write_awaitable(promise, message_size, buffer), 77 | write_fd(write_fd) {} 78 | void await_suspend(std::coroutine_handle h) 79 | { 80 | auto &promise = h.promise(); 81 | this->promise = &promise; 82 | promise.request_info.bid = 
promise.uring->get_buffer_id(buffer); 83 | promise.uring->add_write_request(write_fd, message_size, promise.request_info); 84 | } 85 | int write_fd; 86 | }; 87 | /* Awaitable returned by shutdown_socket(). await_ready() reports "not ready" so that await_suspend() actually runs and hands fd to add_close_request(); await_suspend() then returns false, which resumes the awaiting coroutine immediately instead of leaving it suspended. */ 88 | struct close_awaitable 89 | { 90 | close_awaitable(int fd): fd(fd) {}; 91 | bool await_ready() { return false; } 92 | bool await_suspend(std::coroutine_handle h) 93 | { 94 | auto &promise = h.promise(); 95 | promise.uring->add_close_request(fd); /* false: do not stay suspended, nothing will ever resume us */ return false; 96 | } 97 | void await_resume() 98 | { 99 | } 100 | int fd; 101 | }; 102 | 103 | auto read_socket(char **buffer_pointer) 104 | { 105 | return read_awaitable(nullptr, 0, buffer_pointer); 106 | } 107 | 108 | auto read_fd(int fd, char **buffer_pointer) 109 | { 110 | return read_file_awaitable(nullptr, 0, buffer_pointer, fd); 111 | } 112 | 113 | auto write_fd(int fd, char *buffer, size_t message_size) 114 | { 115 | return write_file_awaitable(nullptr, message_size, buffer, fd); 116 | } 117 | 118 | auto write_socket(char *buffer, size_t message_size) 119 | { 120 | log("write write_socket %lu", message_size); 121 | return write_awaitable(nullptr, message_size, buffer); 122 | } 123 | 124 | auto shutdown_socket(int fd) 125 | { 126 | return close_awaitable(fd); 127 | } 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /server/task.h: -------------------------------------------------------------------------------- 1 | #ifndef TASK_H 2 | #define TASK_H 3 | 4 | // infiniteDataStream.cpp 5 | #include 6 | #include 7 | #include 8 | 9 | enum task_option { 10 | ACCEPT, 11 | READ, 12 | WRITE, 13 | OPEN, 14 | PROV_BUF, 15 | }; 16 | 17 | union request { 18 | struct{ 19 | short event_type; 20 | short bid; 21 | int client_socket; 22 | }; 23 | unsigned long long uring_data; 24 | }; 25 | 26 | static_assert(sizeof(request) == sizeof(unsigned long long)); 27 | 28 | class io_uring_handler; 29 | 30 | struct task { 31 | struct promise_type 32 | { 33 | using Handle = std::coroutine_handle; 34 | task get_return_object() 35 | { 36 | 
return task{Handle::from_promise(*this)}; 37 | } 38 | std::suspend_always initial_suspend() noexcept { 39 | return {}; 40 | } 41 | std::suspend_never final_suspend() noexcept { return {}; } 42 | void return_void() noexcept {} 43 | void unhandled_exception() noexcept {} 44 | 45 | request request_info; 46 | io_uring_handler *uring; 47 | size_t res; 48 | }; 49 | explicit task(promise_type::Handle handler) : handler(handler) {} 50 | void destroy() { handler.destroy(); } 51 | task(const task &) = delete; 52 | task &operator=(const task &) = delete; 53 | task(task &&t) noexcept : handler(t.handler) { t.handler = {}; } 54 | task &operator=(task &&t) noexcept 55 | { 56 | if (this == &t) 57 | return *this; 58 | if (handler) 59 | handler.destroy(); 60 | handler = t.handler; 61 | t.handler = {}; 62 | return *this; 63 | } 64 | promise_type::Handle handler; 65 | }; 66 | 67 | #endif -------------------------------------------------------------------------------- /server/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | constexpr int group_id = 1337; 9 | 10 | /* 11 | * Utility function to convert a string to lower case. 12 | * */ 13 | 14 | void strtolower(char *str) { 15 | for (; *str; ++str) 16 | *str = (char)tolower(*str); 17 | } 18 | /* 19 | One function that prints the system call and the error details 20 | and then exits with error code 1. Non-zero meaning things didn't go well. 21 | */ 22 | void fatal_error(const char *syscall) { 23 | perror(syscall); 24 | exit(1); 25 | } 26 | 27 | #define log(...) 
do { printf(__VA_ARGS__); putchar('\n'); fflush(stdout); } while(0) 28 | 29 | #endif -------------------------------------------------------------------------------- /test/client.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | #define MAXSIZE 1024 17 | #define IPADDRESS "127.0.0.1" 18 | #define SERV_PORT 8000 19 | #define FDSIZE 1024 20 | #define EPOLLEVENTS 20 21 | 22 | int setSocketNonBlocking1(int fd) 23 | { 24 | int flag = fcntl(fd, F_GETFL, 0); 25 | if (flag == -1) 26 | return -1; 27 | 28 | flag |= O_NONBLOCK; 29 | if (fcntl(fd, F_SETFL, flag) == -1) 30 | return -1; 31 | return 0; 32 | } 33 | 34 | int main(int argc, char *argv[]) 35 | { 36 | int sockfd; 37 | struct sockaddr_in servaddr; 38 | sockfd = socket(AF_INET, SOCK_STREAM, 0); 39 | bzero(&servaddr, sizeof(servaddr)); 40 | servaddr.sin_family = AF_INET; 41 | servaddr.sin_port = htons(SERV_PORT); 42 | inet_pton(AF_INET, IPADDRESS, &servaddr.sin_addr); 43 | char buff[4096]; 44 | buff[0] = '\0'; 45 | const char *p = " "; 46 | 47 | p = "GET / HTTP/1.0"; 48 | sockfd = socket(AF_INET, SOCK_STREAM, 0); 49 | if (connect(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) == 0) 50 | { 51 | setSocketNonBlocking1(sockfd); 52 | cout << "2:" << endl; 53 | ssize_t n = write(sockfd, p, strlen(p)); 54 | cout << "strlen(p) = " << strlen(p) << endl; 55 | sleep(1); 56 | n = read(sockfd, buff, 4096); 57 | cout << "n=" << n << endl; 58 | printf("%s", buff); 59 | close(sockfd); 60 | } 61 | else 62 | { 63 | perror("err2"); 64 | } 65 | sleep(1); 66 | 67 | p = "GET / HTTP/1.0\r\nHost: 192.168.52.135:8888\r\nContent-Type: " 68 | "application/x-www-form-urlencoded\r\nConnection: Keep-Alive\r\n\r\n"; 69 | sockfd = socket(AF_INET, SOCK_STREAM, 0); 70 | if (connect(sockfd, (struct sockaddr 
*)&servaddr, sizeof(servaddr)) == 0) 71 | { 72 | setSocketNonBlocking1(sockfd); 73 | cout << "3:" << endl; 74 | ssize_t n = write(sockfd, p, strlen(p)); 75 | cout << "strlen(p) = " << strlen(p) << endl; 76 | sleep(1); 77 | n = read(sockfd, buff, 4096); 78 | cout << "n=" << n << endl; 79 | printf("%s", buff); 80 | close(sockfd); 81 | } 82 | else 83 | { 84 | perror("err3"); 85 | } 86 | return 0; 87 | } --------------------------------------------------------------------------------