├── src ├── net_utils.hpp ├── CMakeLists.txt ├── net_utils.cpp ├── ring_buffer.hpp └── epoll.cpp └── README.md /src/net_utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NET_UTILS_H_ 2 | #define NET_UTILS_H_ 3 | 4 | int make_socket_non_blocking (int sfd); 5 | int create_and_bind(char *port); 6 | 7 | #endif -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.6) 2 | project (epoll-example) 3 | 4 | ############################# 5 | ## Compiler flags. 6 | ############################# 7 | list( APPEND CMAKE_CXX_FLAGS "-pthread -std=c++0x -march=native -O2 -g -Wall ${CMAKE_CXX_FLAGS} ") 8 | 9 | 10 | ########## 11 | # Src 12 | ########## 13 | SET (ring_buffer_SRC ring_buffer.hpp) 14 | 15 | SET (net_utils_LIB net_utils) 16 | SET (net_utils_SRC net_utils.cpp) 17 | 18 | SET (epoll_SRC epoll.cpp) 19 | SET (epoll_EXECUTABLE epoll) 20 | 21 | 22 | ADD_LIBRARY(${net_utils_LIB} SHARED ${net_utils_SRC}) 23 | 24 | ADD_EXECUTABLE(${epoll_EXECUTABLE} ${epoll_SRC}) 25 | 26 | TARGET_LINK_LIBRARIES(${epoll_EXECUTABLE} ${net_utils_LIB}) -------------------------------------------------------------------------------- /src/net_utils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | int make_socket_non_blocking (int sfd) { 11 | int flags, s; 12 | 13 | flags = fcntl(sfd, F_GETFL, 0); 14 | if (flags == -1) 15 | { 16 | perror("fcntl"); 17 | return -1; 18 | } 19 | 20 | flags |= O_NONBLOCK; 21 | s = fcntl(sfd, F_SETFL, flags); 22 | if (s == -1) 23 | { 24 | perror("fcntl"); 25 | return -1; 26 | } 27 | 28 | return 0; 29 | } 30 | 31 | int create_and_bind(char *port) { 32 | struct addrinfo hints; 33 | struct addrinfo *result, *rp; 34 | int sfd, retval; 35 | 36 | memset(&hints, 0, sizeof(struct addrinfo)); 37 | hints.ai_family = AF_UNSPEC; // Return IPv4 and IPv6 choices. 38 | hints.ai_socktype = SOCK_STREAM; // We want a TCP socket. 39 | hints.ai_flags = AI_PASSIVE; // All interfaces. 40 | 41 | retval = getaddrinfo(NULL, port, &hints, &result); 42 | if (retval != 0) { 43 | fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(retval)); 44 | return -1; 45 | } 46 | 47 | for (rp = result; rp != NULL; rp = rp->ai_next) { 48 | sfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); 49 | if (sfd == -1) 50 | continue; 51 | 52 | retval = bind(sfd, rp->ai_addr, rp->ai_addrlen); 53 | if (retval == 0) { 54 | // We managed to bind successfully! 55 | break; 56 | } 57 | 58 | close(sfd); 59 | } 60 | 61 | if (rp == NULL) { 62 | fprintf(stderr, "Could not bind\n"); 63 | return -1; 64 | } 65 | 66 | freeaddrinfo(result); 67 | 68 | return sfd; 69 | } -------------------------------------------------------------------------------- /src/ring_buffer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RING_BUFFER_H_ 2 | #define RING_BUFFER_H_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace processor { 11 | 12 | #define CACHE_LINE_SIZE 64 13 | 14 | // A simple ring buffer for single producers and single consumers. 15 | // Does not support parallel consumers for now. 16 | template 17 | class RingBuffer { 18 | 19 | public: 20 | // Events_size must be a power of two. 21 | explicit RingBuffer(uint64_t event_size) : 22 | event_size_(event_size), 23 | publisher_sequence_(-1), 24 | cached_consumer_sequence_(-1), 25 | events_(new T[event_size]), 26 | consumer_sequence_(-1) { 27 | } 28 | 29 | // No copy constructor. 30 | RingBuffer(const RingBuffer&) = delete; 31 | RingBuffer& operator = (const RingBuffer &) = delete; 32 | 33 | // Used to get an event for a given sequence. 34 | //Can be called by both the producer and consumer. 35 | inline T* get(int64_t sequence) { 36 | return &events_[sequence & (event_size_ - 1)]; // size - 1 is the mask here. 37 | } 38 | 39 | // Can be called by either producer or consumer. 40 | inline uint64_t getBufferSize() const { 41 | return event_size_; 42 | } 43 | 44 | // Called by the producer to get the next publish slot. 45 | // Will block till there is a slot to claim. 46 | int64_t nextProducerSequence() { 47 | int64_t current_producer_sequence = publisher_sequence_.load(std::memory_order::memory_order_relaxed); 48 | int64_t next_producer_sequence = current_producer_sequence + 1; 49 | int64_t wrap_point = next_producer_sequence - event_size_; 50 | // TODO(Rajiv): Combine pausing with backoff + sleep. 51 | if (cached_consumer_sequence_ > wrap_point) { 52 | return next_producer_sequence; 53 | } 54 | cached_consumer_sequence_ = getConsumerSequence(); 55 | while (cached_consumer_sequence_ <= wrap_point) { 56 | _mm_pause(); 57 | cached_consumer_sequence_ = getConsumerSequence(); 58 | } 59 | return next_producer_sequence; 60 | } 61 | 62 | // Called by the producer to see what entries the consumer is done with. 63 | inline int64_t getConsumerSequence() const { 64 | return consumer_sequence_.load(std::memory_order::memory_order_acquire); 65 | } 66 | 67 | // Called by the producer after it has set the correct event entries. 68 | inline void publish(int64_t sequence) { 69 | publisher_sequence_.store(sequence, std::memory_order::memory_order_release); 70 | } 71 | 72 | // Called by the consumer to see where the producer is at. 73 | inline int64_t getProducerSequence() const { 74 | return publisher_sequence_.load(std::memory_order::memory_order_acquire); 75 | } 76 | 77 | // Called by the consumer to set the latest consumed sequence. 78 | inline void markConsumed(int64_t sequence) { 79 | // Assert if sequence is higher than the previous one? 80 | consumer_sequence_.store(sequence, std::memory_order::memory_order_release); 81 | } 82 | 83 | ~RingBuffer() { 84 | printf("Deleted Ring Buffer\n"); 85 | delete[] events_; 86 | } 87 | 88 | private: 89 | 90 | int64_t event_size_; 91 | std::atomic publisher_sequence_; 92 | int64_t cached_consumer_sequence_; 93 | T* events_; 94 | // Ensure that the consumer sequence is on it's own cache line to prevent false sharing. 95 | std::atomic consumer_sequence_ __attribute__ ((aligned (CACHE_LINE_SIZE))); 96 | 97 | } __attribute__ ((aligned(CACHE_LINE_SIZE))); 98 | 99 | } // processor 100 | 101 | #endif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Epoll sample. 2 | 3 | ##Build and test. 4 | 5 | Make sure you have cmake. This will only work on a Linux OS with epoll and accept4 support. 6 | 7 | git clone https://github.com/RajivKurian/epoll-example.git 8 | cd epoll-example 9 | mkdir build 10 | cd build 11 | cmake ../src 12 | make 13 | // To start the server at port 9090 14 | ./epoll 9090 15 | // You can test by using netcat in another terminal 16 | nc localhost 9090 17 | 18 | ##Details. 19 | 20 | A multi-threaded server which accepts bytes from the client and reverses them at arbitrary boundaries and sends it back to the client. 21 | 22 | 1. Most of the code for the epoll part of the tutorial is from a [Banu Systems Blog](https://banu.com/blog/2/how-to-use-epoll-a-complete-example-in-c/). The extension was making it multi threaded. 23 | 2. The code is a mix of C style (since it borrows from the blog) with some C++ thrown in. 24 | 3. There are two threads connected by a ring buffer: 25 | 1. Thread 1: Accepts client connections and reads bytes from them in an event loop using epoll. For every read event it reads the bytes into a ring buffer entry and publishes it with the client FD. 26 | 2. Thread 2: It reads events of the ring buffer. For each entry it prints the bytes, reverses them in place and sends the reversed bytes back to the client. 27 | 28 | ### Limitations. 29 | 30 | The code is very simplistic and does not handle a lot of cases. The limitations and some potential solutions are: 31 | 32 | 1. We check for the string "exit" from the client to stop the server. The "exit" string might be split across two different epoll events and we will not detect this. This is handled by a proper protocol design. 33 | 2. The worker thread writes the reversed string to the client. It checks for an error but does not check if all bytes were written. The worker thread can also have it's own epoll loop where it waits for a write event when all bytes don't go through. One should still first attempt a write and only make an epoll call if all the bytes did not go through. There are a few tricky things here: 34 | 1. Balancing between polling the ring buffer (for more events to process) and making epoll calls is tricky. 35 | 2. The worker thread must copy any of the ring buffer entry data that it still needs before marking the entry processed. In our case that would be the FD for the socket and the data not sent yet. This is important because the producer thread could overwrite the entry before we manage to send the bytes to the client. 36 | 3. If a particular socket is waiting for a writable signal, the worker thread should not write data on that socket after a separate event is processed till the first pending data is successfully written. We can process the event and just append the data to our already pending data. This can be done by having a linked list of buffers or even a realloc + copy. 37 | 3. An important benefit of using a ring buffer is that there are no malloc/free calls in steady state. All the memory is pre-allocated and just re-used. We use static 4096k buffers to read client data. If multiple sizes of buffers are needed this design can be altered for close to zero allocations in steady state. This is easily achieved by using a length prefixed protocol: 38 | 1. Once the first 4/8 bytes (32bit/64 bit length) of a message are read(this can be done on the stack), the length of the buffer needed to hold the entire message is known. 39 | 2. We can pick an appropriately sized buffer from a pool of buffers and use it for the rest of the message. Once the entire message is read we can put a pointer to this buffer on the ring buffer and publish it. Beware of [slowloris](http://en.wikipedia.org/wiki/Slowloris) attacks with such an approach though. Malicious or slow/faulty clients could cause buffer exhaustion on the server. For eg: A bunch of clients could connect and say they have a 10 MB message to send. Once you allocate a 10 MB buffer they never send any data but your buffers get used up. Using timers to reclaim such buffers and disconnect misbehaving clients can help with these problems. 40 | 3. Whenever we are short on buffers we could check all the ring buffer entries that have been processed in this cycle. We can claim all those buffers and put them in our pool. With this technique we must claim buffers at least once per cycle of the ring buffer otherwise we will end up reclaiming buffers that are still in use. This accounting can be a bit tricky. There are other ways of exchanging buffer info without any sharing (false or otherwise). 41 | 4. A modern memory allocator like Jemalloc already uses slab allocation + buddy memory allocation to reuse buffers. We could use Jemalloc and just call malloc and free instead of maintaining our own buffer pool. Note: We still always call malloc and free on the same thread instead of across thread boundaries. 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/epoll.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "net_utils.hpp" 12 | #include "ring_buffer.hpp" 13 | 14 | #define MAXEVENTS 64 15 | 16 | #define DEFAULT_BUFFER_SIZE 4096 17 | #define DEFAULT_RING_BUFFER_SIZE 1024 18 | 19 | // Event data includes the socket FD, 20 | // a pointer to a fixed sized buffer, 21 | // number of bytes written and 22 | // a boolean indicating whether the worker thread should stop. 23 | struct event_data { 24 | int fd; 25 | char* buffer; 26 | int written; 27 | bool stop; 28 | 29 | event_data(): 30 | fd(-1), 31 | buffer(new char[DEFAULT_RING_BUFFER_SIZE]), 32 | written(0), 33 | stop(false) { 34 | 35 | } 36 | 37 | ~event_data() { 38 | delete[] buffer; 39 | } 40 | }; 41 | 42 | // Reverse in place. 43 | void reverse(char* p, int length) { 44 | int length_before_null = length - 1; 45 | int i,j; 46 | 47 | for (i = 0, j = length_before_null; i < j; i++, j--) { 48 | char temp = p[i]; 49 | p[i] = p[j]; 50 | p[j] = temp; 51 | } 52 | } 53 | 54 | int process_messages(processor::RingBuffer* ring_buffer) { 55 | int64_t prev_sequence = -1; 56 | int64_t next_sequence = -1; 57 | int num_events_processed = 0; 58 | int s = -1; 59 | while (true) { 60 | // Spin till a new sequence is available. 61 | while (next_sequence <= prev_sequence) { 62 | _mm_pause(); 63 | next_sequence = ring_buffer->getProducerSequence(); 64 | } 65 | // Once we have a new sequence process all items between the previous sequence and the new one. 66 | for (int64_t index = prev_sequence + 1; index <= next_sequence; index++) { 67 | auto e_data = ring_buffer->get(index); 68 | auto client_fd = e_data->fd; 69 | auto buffer = e_data->buffer; 70 | // Used to signal the server to stop. 71 | //printf("\nConsumer stop value %s\n", e_data->stop ? "true" : "false"); 72 | if (e_data->stop) 73 | goto exit_consumer; 74 | 75 | auto buffer_length = e_data->written; 76 | assert(client_fd != -1); 77 | assert(buffer_length > 0); 78 | 79 | // Write the buffer to standard output first. 80 | s = write (1, buffer, buffer_length); 81 | if (s == -1) { 82 | perror ("write"); 83 | abort (); 84 | } 85 | 86 | // Then reverse it and echo it back. 87 | reverse(buffer, buffer_length); 88 | s = write(client_fd, buffer, buffer_length); 89 | if (s == -1) { 90 | perror ("echo"); 91 | abort (); 92 | } 93 | // We are not checking to see if all the bytes have been written. 94 | // In case they are not written we must use our own epoll loop, express write interest 95 | // and write when the client socket is ready. 96 | ++num_events_processed; 97 | } 98 | // Mark events consumed. 99 | ring_buffer->markConsumed(next_sequence); 100 | prev_sequence = next_sequence; 101 | } 102 | exit_consumer: 103 | printf("Finished processing all events. Server shutting down. Num events processed = %d\n", num_events_processed); 104 | return 1; 105 | } 106 | 107 | void event_loop(int epfd, 108 | int sfd, 109 | processor::RingBuffer* ring_buffer) { 110 | int n, i; 111 | int retval; 112 | 113 | struct epoll_event event, current_event; 114 | // Buffer where events are returned. 115 | struct epoll_event* events = static_cast(calloc(MAXEVENTS, sizeof event)); 116 | 117 | while (true) { 118 | 119 | n = epoll_wait(epfd, events, MAXEVENTS, -1); 120 | 121 | for (i = 0; i < n; i++) { 122 | current_event = events[i]; 123 | 124 | if ((current_event.events & EPOLLERR) || 125 | (current_event.events & EPOLLHUP) || 126 | (!(current_event.events & EPOLLIN))) { 127 | // An error has occured on this fd, or the socket is not ready for reading (why were we notified then?). 128 | fprintf(stderr, "epoll error\n"); 129 | close(current_event.data.fd); 130 | } else if (current_event.events & EPOLLRDHUP) { 131 | // Stream socket peer closed connection, or shut down writing half of connection. 132 | // We still to handle disconnection when read()/recv() return 0 or -1 just to be sure. 133 | printf("Closed connection on descriptor vis EPOLLRDHUP %d\n", current_event.data.fd); 134 | // Closing the descriptor will make epoll remove it from the set of descriptors which are monitored. 135 | close(current_event.data.fd); 136 | } else if (sfd == current_event.data.fd) { 137 | // We have a notification on the listening socket, which means one or more incoming connections. 138 | while (true) { 139 | struct sockaddr in_addr; 140 | socklen_t in_len; 141 | int infd; 142 | char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; 143 | 144 | in_len = sizeof in_addr; 145 | // No need to make these sockets non blocking since accept4() takes care of it. 146 | infd = accept4(sfd, &in_addr, &in_len, SOCK_NONBLOCK | SOCK_CLOEXEC); 147 | if (infd == -1) { 148 | if ((errno == EAGAIN) || 149 | (errno == EWOULDBLOCK)) { 150 | break; // We have processed all incoming connections. 151 | } else { 152 | perror("accept"); 153 | break; 154 | } 155 | } 156 | 157 | // Print host and service info. 158 | retval = getnameinfo(&in_addr, in_len, 159 | hbuf, sizeof hbuf, 160 | sbuf, sizeof sbuf, 161 | NI_NUMERICHOST | NI_NUMERICSERV); 162 | if (retval == 0) { 163 | printf("Accepted connection on descriptor %d (host=%s, port=%s)\n", infd, hbuf, sbuf); 164 | } 165 | 166 | // Register the new FD to be monitored by epoll. 167 | event.data.fd = infd; 168 | // Register for read events, disconnection events and enable edge triggered behavior for the FD. 169 | event.events = EPOLLIN | EPOLLRDHUP | EPOLLET; 170 | retval = epoll_ctl(epfd, EPOLL_CTL_ADD, infd, &event); 171 | if (retval == -1) { 172 | perror("epoll_ctl"); 173 | abort(); 174 | } 175 | } 176 | } else { 177 | // We have data on the fd waiting to be read. Read and display it. 178 | // We must read whatever data is available completely, as we are running in edge-triggered mode 179 | // and won't get a notification again for the same data. 180 | bool should_close = false, done = false; 181 | 182 | while (!done) { 183 | ssize_t count; 184 | // Get the next ring buffer entry. 185 | auto next_write_index = ring_buffer->nextProducerSequence(); 186 | auto entry = ring_buffer->get(next_write_index); 187 | 188 | // Read the socket data into the buffer associated with the ring buffer entry. 189 | // Set the entry's fd field to the current socket fd. 190 | count = read(current_event.data.fd, entry->buffer, DEFAULT_BUFFER_SIZE); 191 | entry->written = count; 192 | entry->fd = current_event.data.fd; 193 | 194 | if (count == -1) { 195 | // EAGAIN or EWOULDBLOCK means we have no more data that can be read. 196 | // Everything else is a real error. 197 | if (!(errno == EAGAIN || errno == EWOULDBLOCK)) { 198 | perror("read"); 199 | should_close = true; 200 | } 201 | done = true; 202 | } else if (count == 0) { 203 | // Technically we don't need to handle this here, since we wait for EPOLLRDHUP. We handle it just to be sure. 204 | // End of file. The remote has closed the connection. 205 | should_close = true; 206 | done = true; 207 | } else { 208 | // Valid data. Process it. 209 | // Check if the client want's the server to exit. 210 | // This might never work out even if the client sends an exit signal because TCP might 211 | // split and rearrange the packets across epoll signal boundaries at the server. 212 | bool stop = (strncmp(entry->buffer, "exit", 4) == 0); 213 | entry->stop = stop; 214 | 215 | // Publish the ring buffer entry since all is well. 216 | ring_buffer->publish(next_write_index); 217 | if (stop) 218 | goto exit_loop; 219 | } 220 | } 221 | 222 | 223 | if (should_close) { 224 | printf("Closed connection on descriptor %d\n", current_event.data.fd); 225 | // Closing the descriptor will make epoll remove it from the set of descriptors which are monitored. 226 | close(current_event.data.fd); 227 | } 228 | } 229 | } 230 | } 231 | exit_loop: 232 | free(events); 233 | } 234 | 235 | int main (int argc, char *argv[]) { 236 | int sfd, epfd, retval; 237 | // Our ring buffer. 238 | auto ring_buffer = new processor::RingBuffer(DEFAULT_RING_BUFFER_SIZE); 239 | 240 | if (argc != 2) { 241 | fprintf(stderr, "Usage: %s [port]\n", argv[0]); 242 | exit (EXIT_FAILURE); 243 | } 244 | 245 | sfd = create_and_bind(argv[1]); 246 | if (sfd == -1) 247 | abort (); 248 | 249 | retval = make_socket_non_blocking(sfd); 250 | if (retval == -1) 251 | abort (); 252 | 253 | retval = listen(sfd, SOMAXCONN); 254 | if (retval == -1) { 255 | perror ("listen"); 256 | abort (); 257 | } 258 | 259 | epfd = epoll_create1(0); 260 | if (epfd == -1) { 261 | perror ("epoll_create"); 262 | abort (); 263 | } 264 | 265 | // Register the listening socket for epoll events. 266 | { 267 | struct epoll_event event; 268 | event.data.fd = sfd; 269 | event.events = EPOLLIN | EPOLLET; 270 | retval = epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &event); 271 | if (retval == -1) { 272 | perror ("epoll_ctl"); 273 | abort (); 274 | } 275 | } 276 | 277 | 278 | // Start the worker thread. 279 | std::thread t{process_messages, ring_buffer}; 280 | 281 | // Start the event loop. 282 | event_loop(epfd, sfd, ring_buffer); 283 | 284 | // Our server is ready to stop. Release all pending resources. 285 | t.join(); 286 | close(sfd); 287 | delete ring_buffer; 288 | 289 | return EXIT_SUCCESS; 290 | } --------------------------------------------------------------------------------