├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile.am ├── README.md ├── autogen.sh ├── bb.c ├── configure.ac ├── fluxcap.c ├── fluxcap.h ├── lib ├── .gitignore └── Makefile.am ├── respan.c ├── respan.h └── util ├── .gitignore ├── Makefile.am ├── ramdisk.c ├── tests ├── do_tests ├── test1 ├── test1.ans ├── test2 ├── test2.ans ├── test3 ├── test3.ans ├── test4 ├── test4.ans ├── test5 ├── test5.ans └── testdir.tar └── watch_copy.c /.gitignore: -------------------------------------------------------------------------------- 1 | fluxtop 2 | fluxcap 3 | ramdisk 4 | fpcap-replay 5 | fprune 6 | watch_copy 7 | *.o 8 | *.a 9 | .dirstamp 10 | *.swp 11 | Makefile.in 12 | aclocal.m4 13 | autom4te.cache 14 | compile 15 | configure 16 | depcomp 17 | install-sh 18 | missing 19 | Makefile 20 | config.log 21 | config.status 22 | .deps 23 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/libut"] 2 | path = lib/libut 3 | url = https://github.com/troydhanson/libut.git 4 | ignore = untracked 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | © 2017 The Johns Hopkins University Applied Physics Laboratory LLC. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | srcdir = @srcdir@ 2 | 3 | SUBDIRS=lib util 4 | 5 | bin_PROGRAMS = fluxcap respan 6 | 7 | fluxcap_SOURCES = fluxcap.c bb.c fluxcap.h 8 | fluxcap_CPPFLAGS = -I$(srcdir)/lib/libut/include 9 | fluxcap_LDADD = -Llib -lut -lshr -lm 10 | 11 | respan_SOURCES = respan.c respan.h 12 | respan_CPPFLAGS = 13 | respan_LDADD = -lshr 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Back to the [fluxcap Github page](http://github.com/troydhanson/fluxcap). 2 | Back to [my other projects](http://troydhanson.github.io/). 
3 | 4 | # About 5 | 6 | fluxcap: a network tap replication and aggregation tool 7 | 8 | A Linux host running fluxcap can: 9 | 10 | * accept taps, on one or more physical network interfaces, 11 | * aggregate them, possibly inserting VLAN tags, 12 | * transmit them, on one or more physical network interfaces, 13 | * send or receive taps encapsulated in GRE or VXLAN over IP. 14 | 15 | Fluxcap implements its features using raw sockets. It is 16 | written in C, MIT licensed, and for Linux only. 17 | 18 | Platforms: Ubuntu and RHEL/CentOS are primary, but it works on others. 19 | 20 | # Build & install 21 | 22 | ## Prereqs 23 | 24 | In order to build and use fluxcap, you need to install a few packages. 25 | 26 | # Ubuntu 27 | sudo apt-get install git gcc automake autoconf libtool ethtool 28 | 29 | # RHEL/CentOS 30 | sudo yum install git gcc automake autoconf libtool ethtool 31 | 32 | Last, libshr must be built and installed prior to building fluxcap. 33 | 34 | git clone https://github.com/troydhanson/shr.git 35 | cd shr 36 | autoreconf -ivf 37 | ./configure 38 | make 39 | sudo make install 40 | sudo ldconfig 41 | 42 | The libshr libraries and header files are now in `/usr/local/lib` 43 | and `/usr/local/include`. 44 | 45 | Note that on many systems, `/usr/local/lib` is not in the default 46 | library search path. On such systems, it is usually possible 47 | to add `/usr/local/lib` to the library search path by running: 48 | 49 | echo /usr/local/lib | sudo tee /etc/ld.so.conf.d/local.conf 50 | sudo ldconfig 51 | 52 | ## fluxcap 53 | 54 | In the top-level directory of fluxcap, run: 55 | 56 | git submodule update --init --recursive 57 | ./autogen.sh 58 | ./configure 59 | make 60 | sudo make install 61 | 62 | This installs `fluxcap`, typically into `/usr/local/bin`. 63 | 64 | # Preparing the host 65 | 66 | These things need to be done *at each system boot*: 67 | 68 | * disable hardware offloading on the receive/transmit NIC's 69 | * set up iptables to prevent accidental traffic on NIC's 70 | * make sure NIC's are up and not assigned IP addresses 71 | * mount a ramdisk for fluxcap's ring buffers 72 | 73 | The script below can be used. Save the script somewhere, make it executable, 74 | and execute it at startup via `/etc/rc.local` or similar. 75 | 76 | #!/bin/bash 77 | INTERFACES="ens33 enp4s0" # replace with YOUR NIC names! 78 | for IF in $INTERFACES 79 | do 80 | ethtool -K $IF tso off # TCP segmentation offload (output) 81 | ethtool -K $IF ufo off # UDP segmentation offload (output) 82 | ethtool -K $IF gso off # generic segmentation offload (output) 83 | ethtool -K $IF gro off # generic receive offload (input) 84 | ethtool -K $IF lro off # large receive offload (input) 85 | 86 | /sbin/iptables -A INPUT -i $IF -j DROP 87 | /sbin/iptables -A OUTPUT -o $IF -j DROP 88 | 89 | /sbin/ip link set dev $IF up 90 | done 91 | 92 | Last, we mount a ramdisk at each boot. You can name it anything; in 93 | this document, we use /ram as the mountpoint. Add to /etc/fstab: 94 | 95 | none /ram ramfs auto,noatime 0 0 96 | 97 | Then make the mountpoint and mount it: 98 | 99 | mkdir /ram 100 | mount /ram 101 | 102 | ### why disable offloads? 103 | 104 | When hardware offloading is left on, the NIC presents artificially large 105 | packets to the Linux host, by merging together IP packets in valid ways 106 | to reduce work the kernel would have to do in software.
However, this is 107 | _undesirable_ for tap replication (and for any kind of packet analysis) 108 | because the larger, conglomerated packets fail re-transmission, and may 109 | vastly exceed the MTU. Analysis tools want the original packets in any case. 110 | For the curious, an explanation of some offload parameters can be found 111 | [here](https://red.ht/2e608Oo). The usual symptom of skipping this step 112 | is to see fluxcap emit errors like `sendto: message too long`. 113 | 114 | ### why use iptables? 115 | 116 | It is just an added layer of protection from the host generating traffic 117 | of its own on the NIC. It is optional. 118 | 119 | ### why use ramfs? 120 | 121 | While tmpfs is newer, it can swap, and that is undesirable for this program. 122 | Use of ramfs is considered safe because we only create a few fixed-sized 123 | memory buffers in it. 124 | 125 | # Configuring fluxcap 126 | 127 | Here, we show how to run fluxcap by hand to set up tap replication or 128 | aggregation. In order to persist, these commands have to run at each 129 | system boot. We can use a process supervisor for that, but we show it 130 | by hand first. Everything needs to be run as root. 131 | 132 | ## Tap replication 133 | 134 | Suppose we have three available NIC's on a host and we want to replicate 135 | a tap coming into eth1 and re-transmit it on eth2 and eth3. We'll assume 136 | that eth0 is a management NIC and, obviously, we leave that one alone. 137 | 138 | eth0: management (leave alone) 139 | eth1: tap from Cisco switch 140 | eth2: tap output (copy #1) 141 | eth3: tap output (copy #2) 142 | 143 | Remember the ramfs we mounted earlier? We mounted it at /ram. That is 144 | where we will create a fluxcap ring buffer. In this setup we have one 145 | input NIC, so we only need one ring buffer. 146 | 147 | cd /ram 148 | fluxcap -cr -s 100m cisco 149 | 150 | Now if you run `ls /ram/cisco` you see a 100 MB file there. It's a 151 | memory buffer. It's a file too. Everything in unix is a file, right? 152 | The name "cisco" could be anything, but when you start working with 153 | a dozen taps coming into one host, it helps to name things clearly. 154 | 155 | Why did we choose a 100 MB size? (This uses real RAM by the way, so 156 | beware of making it too large; consider what RAM your host has free). 157 | The idea is we want the ring to be "large enough" that it can buffer 158 | data from the incoming tap long enough to get read by the transmit 159 | processes that will send it out the output NIC's. Here "long enough" 160 | means "before the data in the ring buffer gets written over". One 161 | could contemplate how to size the buffer, but we will just pick 162 | a number. You can use 1G (that is, a gigabyte) for the buffer if 163 | you have a lot of RAM, and just forget about it. A gigabyte can 164 | buffer about ten seconds of traffic from a fully loaded gigabit NIC. 165 | In any case, we can eyeball the I/O rates, and watch for drops, but 166 | first we have to start up the receive and transmit processes. 167 | 168 | cd /ram 169 | fluxcap -rx -i eth1 cisco & 170 | fluxcap -tx -i eth2 cisco & 171 | fluxcap -tx -i eth3 cisco & 172 | 173 | At this point it is up and running. The first process captures on 174 | eth1 into the ring /ram/cisco. The second process transmits on eth2, 175 | and the last on eth3. (If you see errors like `sendto: message too long`, 176 | you should review the section on disabling NIC offloads above). 177 | 178 | We used `&` to put them in the background.
You could run them in 179 | three separate terminals instead. In real life, we put them under 180 | a process supervisor, discussed further below. 181 | 182 | We can watch the I/O rates this way: 183 | 184 | fluxcap -io cisco 185 | 186 | We could add a VLAN tag on the data when it comes in from eth1. That 187 | helps distinguish things if we merge several taps down the road. Run 188 | `fluxcap -h` to see the VLAN tag injection and other options. 189 | 190 | ## Tap aggregation 191 | 192 | Suppose we have two input taps. We want to aggregate them. We want 193 | to transmit the aggregate tap on a third interface. 194 | 195 | enp0: management (leave alone) 196 | enp1: tap from Cisco switch 197 | enp2: tap from Dell switch 198 | enp3: aggregate (Cisco+Dell) output 199 | 200 | We want to create a ring buffer for each input NIC, so we need two. 201 | Doing this by hand at the shell prompt, we'd run: 202 | 203 | cd /ram 204 | fluxcap -cr -s 100m cisco dell 205 | 206 | Last, we run two receive processes and two transmit processes: 207 | 208 | fluxcap -rx -i enp1 cisco & 209 | fluxcap -rx -i enp2 dell & 210 | fluxcap -tx -i enp3 cisco & 211 | fluxcap -tx -i enp3 dell & 212 | 213 | We can run `fluxcap -io cisco dell` at this point to see the I/O rates. 214 | In an aggregation scenario it may be helpful to synthesize VLAN tags on 215 | the input taps so they can still be distinguished after aggregation. We 216 | could have used `-V 100` on the cisco receiver, and `-V 200` for dell, 217 | for example. 218 | 219 | ## Under a process supervisor 220 | 221 | Running things by hand is good for testing. If persistence is needed, and 222 | resilience in the face of things like NIC's going up and down when someone 223 | unplugs the cable and plugs it back in, then use a process supervisor. An 224 | example using [pmtr](http://github.com/troydhanson/pmtr) is shown here. 225 | 226 | # pmtr.conf 227 | 228 | job { 229 | dir /ram 230 | cmd /usr/local/bin/fluxcap -cr -s 100m cisco dell 231 | wait 232 | once 233 | } 234 | 235 | job { 236 | dir /ram 237 | cmd /usr/local/bin/fluxcap -rx -i enp1 cisco 238 | } 239 | 240 | ... 241 | 242 | This way, pmtr starts things at boot, and restarts processes that exit. 243 | The NIC offload script could be run from here too, instead of rc.local. 244 | 245 | ## Encapsulation modes 246 | 247 | Fluxcap can also transmit and receive taps over a regular IP network. The 248 | packets travel inside a layer of GRE or VXLAN encapsulation. 249 | 250 | VXLAN encapsulates the original packet in a new UDP packet to port 4789, and 251 | prepends an 8-byte header carrying a network identifier (VNI). Currently, 252 | fluxcap supports sending VXLAN but not receiving it. 253 | 254 | The GRE tunnel encapsulation modes are GRETAP and regular GRE. GRETAP (also 255 | called TEB for "transparent ethernet bridging") is preferred over GRE. 256 | GRETAP preserves the MAC addresses in the encapsulation, whereas GRE does not.
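
To make the GRETAP framing concrete, here is a small illustrative sketch of the 4-byte GRE header (8 bytes when a key is present) that precedes the copied Ethernet frame. The helper function is invented for this README and is not part of fluxcap; its field layout mirrors the gretap case that `encapsulate_tx()` in fluxcap.c writes inline:

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* illustrative only: write a GRETAP header at g.
     * returns the header length: 4 bytes, or 8 when a GRE key is present.
     * layout follows the gretap case of encapsulate_tx() in fluxcap.c. */
    static int fill_gretap_header(uint8_t *g, uint32_t key_net_order, int have_key) {
      uint16_t teb = htons(0x6558);          /* transparent ethernet bridging */
      memset(g, 0, 2);                       /* GRE flags/version: all zero...    */
      if (have_key) g[0] |= 1U << 5;         /* ...except the K (key present) bit */
      memcpy(g + 2, &teb, 2);                /* GRE protocol type */
      if (have_key) memcpy(g + 4, &key_net_order, 4); /* optional GRE key */
      return have_key ? 8 : 4;
    }

The full Ethernet frame, MAC addresses included, follows this header; that is why GRETAP, unlike plain GRE, preserves the original MACs.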
257 | 258 | ### Transmitter 259 | 260 | In this example, the GRETAP recipient tunnel endpoint is 192.168.102.100: 261 | 262 | fluxcap -tx -E gretap:192.168.102.100 ring 263 | 264 | A VXLAN transmitter example that sets the VNI to 1234 is: 265 | 266 | fluxcap -tx -E vxlan:192.168.102.100 -K 1234 ring 267 | 268 | ### Receiver 269 | 270 | fluxcap -rx -E gretap ring 271 | 272 | To limit the interface and/or the IP address on which GRE is received, use: 273 | 274 | fluxcap -rx -E gretap:127.0.0.1 -i lo ring 275 | 276 | replacing 127.0.0.1 with the local IP address or replacing lo with an interface. 277 | 278 | ### GRE keys 279 | 280 | It is possible to set the GRE key on a transmitted GRE/GRETAP tunnel 281 | using the `-K <key>` option. For example, `-K 500` sets the key to 500. 282 | This is useful when aggregating multiple taps over GRE, when there is a 283 | need to differentiate them on the receiving end. 284 | 285 | On the receiver, `-K <key>` specifies the key that should be accepted. 286 | 287 | The key can be specified as a 32-bit unsigned integer, or as a dotted 288 | quad IP whose meaning is up to the user. 289 | 290 | ### VNI 291 | 292 | When using VXLAN encapsulation, the `-K <key>` value is interpreted as a VNI. 293 | 294 | #### Receiver alternative: Linux OS decapsulation 295 | 296 | If the recipient host is Linux, it can decapsulate the tunnel for us. 297 | This creates a synthetic NIC on the host that behaves as if the remote 298 | tap cable were plugged into it, ready for use with a packet analysis tool. 299 | 300 | First, confirm the recipient is getting the tunneled packets: 301 | 302 | tcpdump -n "proto gre" 303 | 304 | Then, to have Linux decapsulate for us, modify these commands by 305 | replacing 192.168.102.100 with the recipient IP address, and replacing 306 | 192.168.102.1 with the transmitter's IP address. 307 | 308 | # gretap 309 | 310 | modprobe ip_gre 311 | ip link add gretap1 type gretap local 192.168.102.100 remote 192.168.102.1 312 | ip link set gretap1 up 313 | 314 | Now we can use gretap1 as if it were plugged into the remote tap. Try running 315 | `tcpdump -i gretap1 -nne` for example. If we had used gre instead of gretap: 316 | 317 | # gre 318 | 319 | modprobe ip_gre 320 | ip tunnel add gre1 type gre remote 192.168.102.1 local 192.168.102.100 ttl 255 321 | ip link set gre1 up 322 | 323 | ##### firewalld 324 | 325 | You may need to ensure that iptables/firewalld allow the traffic. On a CentOS 7 326 | system, `sudo systemctl stop firewalld` permits the data to arrive on gretap1. 327 | 328 | ##### MTU consideration 329 | 330 | When encapsulating packets, they grow. If the original packet was at the MTU of 331 | its network, and GRETAP encapsulation adds 24 bytes (28 if a GRE key is used), 332 | then each packet may fragment into two packets when sent over the tunnel. This 333 | IP fragmentation is reversed invisibly on the remote end. 334 | 335 | Fragmentation can be eliminated by raising the MTU on the tunnel network, or by 336 | truncating the packets (`-s`) to a max length when encapsulating. 337 | 338 | fluxcap -tx -s 1476 -E gretap:192.168.102.100 ring 339 | 340 | The syntax above truncates at 1476 bytes, so that adding the 24-byte GRETAP 341 | header fills a 1500-byte MTU exactly, without fragmentation. (A small worked example of this arithmetic appears at the end of this README.) 342 | 343 | ## Other features 344 | 345 | Run `fluxcap -h` to see other options.
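
As a footnote to the MTU consideration above, the snaplen arithmetic can be checked with a few lines of C. This is illustrative only and not part of fluxcap; it uses the overhead figures quoted in that section, which break down as a 20-byte outer IP header plus a 4-byte GRETAP header, and 4 more bytes when a GRE key is set with `-K`:

    #include <stdio.h>

    /* illustrative only: largest -s snaplen that avoids fragmentation,
     * given the tunnel network MTU and the GRETAP overhead (20-byte outer
     * IP header + 4-byte GRE header, + 4 bytes when a GRE key is used) */
    static int gretap_snaplen(int tunnel_mtu, int with_key) {
      return tunnel_mtu - (20 + 4 + (with_key ? 4 : 0));
    }

    int main(void) {
      printf("-s %d\n", gretap_snaplen(1500, 0));  /* 1476, as shown above */
      printf("-s %d\n", gretap_snaplen(1500, 1));  /* 1472 when -K is used  */
      return 0;
    }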
346 | 347 | 348 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | autoreconf -ifv 2 | -------------------------------------------------------------------------------- /bb.c: -------------------------------------------------------------------------------- 1 | #include "fluxcap.h" 2 | 3 | /* 4 | * support to vectorize struct iovec and struct bb 5 | */ 6 | UT_mm iov_mm = { . sz = sizeof(struct iovec) }; 7 | 8 | void bb_init(void *_b) { 9 | struct bb *b = (struct bb*)_b; 10 | memset(b,0,sizeof(*b)); 11 | b->n = BATCH_SIZE; 12 | int mode = MAP_PRIVATE | MAP_ANONYMOUS /* | MAP_LOCKED */; 13 | b->d = mmap(0, b->n, PROT_READ|PROT_WRITE, mode, -1, 0); 14 | if (b->d == MAP_FAILED) { 15 | fprintf(stderr, "mmap: %s\n", strerror(errno)); 16 | abort(); 17 | } 18 | b->iov = utvector_new(&iov_mm); 19 | utvector_reserve(b->iov, BATCH_PKTS); 20 | } 21 | 22 | void bb_fini(void *_b) { 23 | struct bb *b = (struct bb*)_b; 24 | assert (b->d && (b->d != MAP_FAILED)); 25 | munmap(b->d, b->n); 26 | utvector_free(b->iov); 27 | } 28 | 29 | void bb_clear(void *_b) { 30 | struct bb *b = (struct bb*)_b; 31 | b->u = 0; 32 | utvector_clear(b->iov); 33 | } 34 | 35 | UT_mm bb_mm = { 36 | .sz = sizeof(struct bb), 37 | .init = bb_init, 38 | .fini = bb_fini, 39 | .clear = bb_clear, 40 | }; 41 | 42 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([fluxcap], 3.1) 2 | AM_INIT_AUTOMAKE([foreign subdir-objects]) 3 | m4_ifdef([AM_SILENT_RULES], 4 | [AM_SILENT_RULES([yes]) 5 | ]) 6 | AC_PROG_CC 7 | AC_PROG_RANLIB 8 | 9 | have_shr_header=n 10 | have_shr_lib=n 11 | AC_CHECK_HEADERS([shr.h],[have_shr_header=y]) 12 | AC_CHECK_LIB(shr,shr_ctl,[have_shr_lib=y]) 13 | if test "x${have_shr_header}${have_shr_lib}" != xyy 14 | then 15 | AC_MSG_ERROR([ 16 | ----------------------------------------------------- 17 | The libshr build prerequisite was not found. Please 18 | see the build instructions, install libshr and retry. 
19 | ----------------------------------------------------- 20 | ]) 21 | fi 22 | 23 | AC_CONFIG_FILES([Makefile 24 | util/Makefile 25 | lib/Makefile 26 | ]) 27 | AC_OUTPUT 28 | -------------------------------------------------------------------------------- /fluxcap.c: -------------------------------------------------------------------------------- 1 | #include "fluxcap.h" 2 | 3 | /* 4 | * fluxcap: a network tap replication and aggregation tool 5 | * 6 | */ 7 | 8 | struct mmsghdr bss_msgv[BATCH_PKTS]; 9 | 10 | struct { 11 | int verbose; 12 | char *prog; 13 | enum {mode_none, mode_transmit, mode_receive, mode_create, mode_watch} mode; 14 | char *file; 15 | char dev[MAX_NIC]; 16 | unsigned long ticks; 17 | int vlan; 18 | int pass_vlan; 19 | int tail; 20 | int fd; 21 | int tx_fd; 22 | int rx_fd; 23 | int signal_fd; 24 | int timer_fd; 25 | int epoll_fd; 26 | char pkt[MAX_PKT]; 27 | struct shr *ring; 28 | size_t size; /* ring create size (-cr), or snaplen (-rx/-tx) */ 29 | struct encap encap; 30 | struct itimerspec timer; 31 | uint16_t ip_id; /* for implementing IP fragmentation when */ 32 | int mtu; /* using gre encapsulation */ 33 | UT_vector /* of ptr */ *watch_rings; 34 | UT_vector /* of utstring */ *watch_names; 35 | UT_vector /* of struct ww */ *watch_win; 36 | UT_string *tmp; 37 | struct timeval now; 38 | struct bb bb; /* output shr ring batch buffer; accumulates til shr_writev */ 39 | struct bb rb; /* input shr ring batch buffer; accepts many via shr_readv */ 40 | struct bb pb; /* packet buffer (Special); faux bb wrapping kernel ring */ 41 | /* fields below are for packet input from AF_PACKET socket */ 42 | struct tpacket_req req; /* linux/if_packet.h */ 43 | unsigned ring_block_sz; /* see comments in initialization below */ 44 | unsigned ring_block_nr; /* number of blocks of sz above */ 45 | unsigned ring_frame_sz; /* snaplen */ 46 | unsigned ring_curr_idx; /* slot index in ring buffer */ 47 | unsigned ring_frame_nr; /* redundant, total frame count */ 48 | int strip_vlan; /* strip VLAN on rx if present (boolean) */ 49 | int drop_pct; /* sampling % 0 (keep all)-100(drop all) */ 50 | int use_tx_ring; /* 0 = sendto-based tx; 1=packet mmap ring-based tx */ 51 | int bypass_qdisc_on_tx; /* bypass kernel qdisc layer, more risk of loss */ 52 | struct fluxcap_stats stats; /* used to periodically update rx/rd stats */ 53 | int keep; /* in mode_create, keep existing ring if present */ 54 | int losing; 55 | struct bb gb; /* used in gre rx for recvmmsg */ 56 | struct mmsghdr *msgv; /* used in gre rx for recvmmsg */ 57 | } cfg = { 58 | .fd = -1, 59 | .tx_fd = -1, 60 | .rx_fd = -1, 61 | .signal_fd = -1, 62 | .timer_fd = -1, 63 | .epoll_fd = -1, 64 | .ring_block_sz = 1 << 22, /*4 mb; want powers of two due to kernel allocator*/ 65 | .ring_block_nr = 64, 66 | .ring_frame_sz = 1 << 11, /* 2048 for MTU & header, divisor of ring_block_sz*/ 67 | .timer = { 68 | .it_value = { .tv_sec = 0, .tv_nsec = 1 }, 69 | .it_interval = { .tv_sec = 0, .tv_nsec = 1000000000UL / TIMER_HZ }, 70 | }, 71 | .msgv = bss_msgv, 72 | }; 73 | 74 | extern UT_mm bb_mm; 75 | UT_mm ww_mm = { .sz = sizeof(struct ww), }; 76 | UT_mm _utmm_ptr = {.sz = sizeof(void*)}; 77 | UT_mm* utmm_ptr = &_utmm_ptr; 78 | 79 | /* signals that we'll accept via signalfd in epoll */ 80 | int sigs[] = {SIGHUP,SIGTERM,SIGINT,SIGQUIT,SIGALRM}; 81 | 82 | void usage() { 83 | fprintf(stderr, 84 | "usage: %s [-cr|-tx|-rx|-io] [options] \n" 85 | "\n" 86 | " create ring(s): -cr -s [k|m|g|t] ...\n" 87 | " transmit: -tx -i \n" 88 | " receive: -rx -i \n" 89 | " i/o 
view: -io ...\n" 90 | "\n" 91 | "Encapsulation modes:\n" 92 | " -tx -E gretap: [-K ] (GRETAP send)\n" 93 | " -rx -E gretap[:]> [-K ] [-i ] (GRETAP recv)\n" 94 | " -tx -E gre: [-K ] (GRE send)\n" 95 | " -tx -E vxlan: [-K ] (VXLAN send)\n" 96 | " where:\n" 97 | " GRE key/dotted quad (optional) [rx/tx]\n" 98 | " binds a local IP (optional) [rx]\n" 99 | " binds a local NIC (optional) [rx]\n" 100 | "\n" 101 | "Other options:\n" 102 | " -f 'vlan n' (accept packets tagged VLAN n) [tx]\n" 103 | " -V (inject VLAN tag) [rx/tx]\n" 104 | " -Q (remove VLAN tag) [rx]\n" 105 | " -d (downsample to <0-99>%% [rx/tx]\n" 106 | " -s (truncate at length) [rx/tx]\n" 107 | " -D (trim n tail bytes) [rx/tx]\n" 108 | " -R (tpacket-based tx) [tx]\n" 109 | " -q (bypass qdisc layer) [tx]\n" 110 | " -v (verbose)\n" 111 | "\n" 112 | " Kernel buffer options (TPACKET_V2) [rx/tx]\n" 113 | " Defaults apply if left unspecified. To use these options\n" 114 | " the block size must be a multiple of the system page size,\n" 115 | " and be small since it consumes physically contiguous pages.\n" 116 | " The number of blocks can be large. Their product is the buffer\n" 117 | " capacity. The frame size must evenly divide the block size.\n" 118 | " The parameters are checked to satisfy these constraints.\n" 119 | " The frame size is for one packet (with overhead) so it should\n" 120 | " exceed the MTU for full packet handling without truncation.\n" 121 | " -Z (max frame size) [2048]\n" 122 | " -B (number of blocks) [64])\n" 123 | " -S (block size log2) [22] (4mb)\n" 124 | "\n", 125 | cfg.prog); 126 | fprintf(stderr, "fluxcap version: %s\n", FLUXCAP_VERSION); 127 | exit(-1); 128 | } 129 | 130 | void hexdump(char *buf, size_t len) { 131 | size_t i,n=0; 132 | unsigned char c; 133 | while(n < len) { 134 | fprintf(stderr,"%08x ", (int)n); 135 | for(i=0; i < 16; i++) { 136 | c = (n+i < len) ? buf[n+i] : 0; 137 | if (n+i < len) fprintf(stderr,"%.2x ", c); 138 | else fprintf(stderr, " "); 139 | } 140 | for(i=0; i < 16; i++) { 141 | c = (n+i < len) ? buf[n+i] : ' '; 142 | if (c < 0x20 || c > 0x7e) c = '.'; 143 | fprintf(stderr,"%c",c); 144 | } 145 | fprintf(stderr,"\n"); 146 | n += 16; 147 | } 148 | } 149 | 150 | int new_epoll(int events, int fd) { 151 | int rc; 152 | struct epoll_event ev; 153 | memset(&ev,0,sizeof(ev)); // placate valgrind 154 | ev.events = events; 155 | ev.data.fd= fd; 156 | rc = epoll_ctl(cfg.epoll_fd, EPOLL_CTL_ADD, fd, &ev); 157 | if (rc == -1) { 158 | fprintf(stderr,"epoll_ctl: %s\n", strerror(errno)); 159 | } 160 | return rc; 161 | } 162 | 163 | /* 164 | * read_proc 165 | * 166 | * read a complete file from the /proc filesystem 167 | * this is special because its size is not known a priori 168 | * so a read/realloc loop is needed 169 | * 170 | * size into len, returning buffer or NULL on error. 171 | * caller should free the buffer eventually. 
172 | */ 173 | char *read_proc(char *file, size_t *len) { 174 | char *buf=NULL, *b, *tmp; 175 | int fd = -1, rc = -1, eof=0; 176 | size_t sz, br=0, l; 177 | ssize_t nr; 178 | 179 | /* initial guess at a sufficient buffer size */ 180 | sz = 1000; 181 | 182 | fd = open(file, O_RDONLY); 183 | if (fd < 0) { 184 | fprintf(stderr,"open: %s\n", strerror(errno)); 185 | goto done; 186 | } 187 | 188 | while(!eof) { 189 | 190 | tmp = realloc(buf, sz); 191 | if (tmp == NULL) { 192 | fprintf(stderr, "out of memory\n"); 193 | goto done; 194 | } 195 | 196 | buf = tmp; 197 | b = buf + br; 198 | l = sz - br; 199 | 200 | do { 201 | nr = read(fd, b, l); 202 | if (nr < 0) { 203 | fprintf(stderr,"read: %s\n", strerror(errno)); 204 | goto done; 205 | } 206 | 207 | b += nr; 208 | l -= nr; 209 | br += nr; 210 | 211 | /* out of space? double buffer size */ 212 | if (l == 0) { 213 | sz *= 2; 214 | break; 215 | } 216 | 217 | if (nr == 0) eof = 1; 218 | 219 | } while (nr > 0); 220 | } 221 | 222 | *len = br; 223 | rc = 0; 224 | 225 | done: 226 | if (fd != -1) close(fd); 227 | if (rc && buf) { free(buf); buf = NULL; } 228 | return buf; 229 | } 230 | 231 | /* 232 | * find start and length of column N (one-based) 233 | * in input buffer buf of length buflen 234 | * 235 | * columns must be space-or-tab delimited 236 | * returns NULL if column not found 237 | * 238 | * the final column may end in newline or eob 239 | * 240 | * col: column index (1-based) 241 | * len: OUTPUT parameter (column length) 242 | * buf: buffer to find columns in 243 | * buflen: length of buf 244 | * 245 | * returns: 246 | * pointer to column N, or NULL 247 | */ 248 | #define ws(x) (((x) == ' ') || ((x) == '\t')) 249 | char *get_col(int col, size_t *len, char *buf, size_t buflen) { 250 | char *b, *start=NULL, *eob; 251 | int num; 252 | 253 | eob = buf + buflen; 254 | 255 | b = buf; 256 | num = 0; /* column number */ 257 | *len = 0; /* column length */ 258 | 259 | while (b < eob) { 260 | 261 | if (ws(*b) && (num == col)) break; /* end of sought column */ 262 | if (*b == '\n') break; /* end of line */ 263 | 264 | if (ws(*b)) *len = 0; /* skip over whitespace */ 265 | if ((!ws(*b)) && (*len == 0)) { /* record start of column */ 266 | num++; 267 | start = b; 268 | } 269 | if (!ws(*b)) (*len)++; /* increment column length */ 270 | b++; 271 | } 272 | 273 | if ((*len) && (num == col)) return start; 274 | return NULL; 275 | } 276 | 277 | /* 278 | * find route for a given destination IP address 279 | * 280 | * parameters: 281 | * dest_ip: the destination IP address in network order 282 | * interface: char[] to receive the output NIC interface name 283 | * must be at least IF_NAMESIZE bytes long; 284 | * see IF_NAMESIZE in /usr/include/net/if.h 285 | * returns: 286 | * 0 success 287 | * -1 error parsing routing table 288 | * -2 no route found 289 | * 290 | */ 291 | int find_route(uint32_t dest_ip, 292 | char *interface) { 293 | 294 | int rc = -1, sc; 295 | char *buf=NULL, *line, *b, *iface, *s_dest, *s_gw, *s_mask; 296 | unsigned mask, dest, gw, best_mask=0, nroutes=0; 297 | size_t len, sz=0, to_eob, iface_len; 298 | 299 | buf = read_proc("/proc/net/route", &sz); 300 | if (buf == NULL) goto done; 301 | 302 | /* find initial newline; discard header row */ 303 | b = buf; 304 | while ((b < buf+sz) && (*b != '\n')) b++; 305 | line = b+1; 306 | 307 | while (line < buf+sz) { 308 | 309 | to_eob = sz-(line-buf); 310 | 311 | s_dest = get_col(2, &len, line, to_eob); 312 | if (s_dest == NULL) goto done; 313 | sc = sscanf(s_dest, "%x", &dest); 314 | if (sc != 1) goto 
done; 315 | 316 | s_mask = get_col(8, &len, line, to_eob); 317 | if (s_mask == NULL) goto done; 318 | sc = sscanf(s_mask, "%x", &mask); 319 | if (sc != 1) goto done; 320 | 321 | iface = get_col(1, &iface_len, line, to_eob); 322 | if (iface == NULL) goto done; 323 | 324 | /* advance to next line */ 325 | b = line; 326 | while ((b < buf+sz) && (*b != '\n')) b++; 327 | line = b+1; 328 | 329 | /* does the route apply? */ 330 | if ((dest_ip & mask) != dest) continue; 331 | 332 | /* know a more specific route? */ 333 | if (mask < best_mask) continue; 334 | 335 | /* this is the best route so far */ 336 | best_mask = mask; 337 | 338 | /* copy details of this route */ 339 | if (iface_len + 1 > IF_NAMESIZE) goto done; 340 | memcpy(interface, iface, iface_len); 341 | interface[iface_len] = '\0'; 342 | nroutes++; 343 | } 344 | 345 | rc = nroutes ? 0 : -2; 346 | 347 | done: 348 | if (buf) free(buf); 349 | return rc; 350 | } 351 | 352 | /* get the MTU for the interface, or -1 on error */ 353 | int get_if_mtu(char *eth) { 354 | int fd = -1, sc, rc = -1; 355 | struct ifreq ifr; 356 | 357 | fd = socket(AF_INET, SOCK_DGRAM, 0); 358 | if (fd == -1) { 359 | fprintf(stderr, "socket: %s\n", strerror(errno)); 360 | goto done; 361 | } 362 | 363 | strncpy(ifr.ifr_name, eth, sizeof(ifr.ifr_name)); 364 | sc = ioctl(fd, SIOCGIFMTU, &ifr); 365 | if (sc < 0) { 366 | fprintf(stderr, "ioctl: %s\n", strerror(errno)); 367 | goto done; 368 | } 369 | 370 | rc = ifr.ifr_mtu; 371 | 372 | done: 373 | if (fd != -1) close(fd); 374 | return rc; 375 | } 376 | 377 | int check_ring_parameters(void) { 378 | int rc=-1; 379 | unsigned page_sz; 380 | 381 | if (cfg.ring_block_sz % cfg.ring_frame_sz) { 382 | fprintf(stderr,"-S block_sz must be multiple of -F frame_sz\n"); 383 | goto done; 384 | } 385 | 386 | page_sz = (unsigned)sysconf(_SC_PAGESIZE); 387 | 388 | if (cfg.ring_block_sz % page_sz) { 389 | fprintf(stderr,"-S block_sz must be multiple of page_sz %u\n", page_sz); 390 | goto done; 391 | } 392 | 393 | if (cfg.ring_frame_sz <= TPACKET2_HDRLEN) { 394 | fprintf(stderr,"-Z frame_sz must exceed %lu\n", TPACKET2_HDRLEN); 395 | goto done; 396 | } 397 | 398 | if (cfg.ring_frame_sz % TPACKET_ALIGNMENT) { 399 | fprintf(stderr,"-Z frame_sz must be a multiple of %u\n", TPACKET_ALIGNMENT); 400 | goto done; 401 | } 402 | 403 | cfg.ring_frame_nr = (cfg.ring_block_sz / cfg.ring_frame_sz) * cfg.ring_block_nr; 404 | 405 | rc = 0; 406 | 407 | done: 408 | return rc; 409 | 410 | } 411 | 412 | /* print the ring capacity in MB and packets 413 | * 414 | * here in userspace, the ring is nothing but a regular flat buffer. 415 | * it is comprised of contiguous slots - all of which have the same size. 416 | * 417 | * in kernel space, the ring is a set of blocks; each block is a number of 418 | * physically contiguous pages. since physically contiguous pages are 419 | * limited, the kernel only gets small allocations of them. it forms the 420 | * blocks into a virtually contiguous buffer for our benefit in user space. 421 | * 422 | * these kernel memory considerations are why the ring is specified as 423 | * a number of blocks (cfg.ring_block_nr) of a given size (cfg.ring_block_sz). 424 | * the other parameter (cfg.ring_frame_sz) is the max size of a packet structure 425 | * (struct tpacket_hdr, struct sockaddr_ll, packet itself, and padding). so 426 | * to deal with full packet data it needs to be the MTU plus all that overhead. 
427 | * 428 | * we require block size to be a multiple of frame size, so there are no gaps 429 | * in the userspace view of the packet ring. it is a simple array of slots. 430 | * 431 | */ 432 | void describe_ring(char *label) { 433 | 434 | double block_size_mb = cfg.ring_block_sz / (1024.0 * 1024); 435 | double mb = cfg.ring_block_nr * block_size_mb; 436 | 437 | fprintf(stderr, "%s: %.1f megabytes (max %u packets)\n", 438 | label, mb, cfg.ring_frame_nr); 439 | 440 | if (cfg.verbose) { 441 | 442 | double bps = 10000000000.0; /* 10 gigabit/sec network */ 443 | double mbytes_per_sec = bps / ( 8 * 1024 * 1024); 444 | double sec = mb / mbytes_per_sec; 445 | 446 | fprintf(stderr, 447 | " RING: (%u blocks * %u bytes per block) = %.1f megabytes\n" 448 | " PACKETS: @(%u bytes/packet) = %u packets\n" 449 | " TIME TO QUENCH @ 10Gigabit/s: %.1f seconds\n", 450 | cfg.ring_block_nr, cfg.ring_block_sz, mb, 451 | cfg.ring_frame_sz, cfg.ring_frame_nr, sec); 452 | } 453 | } 454 | 455 | /* set up as a GRE receiver */ 456 | int setup_rx_encap(void) { 457 | struct sockaddr *sa; 458 | int i, sc, rc = -1; 459 | struct iovec *iov; 460 | socklen_t sz; 461 | 462 | cfg.rx_fd = socket(AF_INET, SOCK_RAW, IPPROTO_GRE); 463 | if (cfg.rx_fd == -1) { 464 | fprintf(stderr,"socket: %s\n", strerror(errno)); 465 | goto done; 466 | } 467 | 468 | /* bind local IP; defaults to INADDR_ANY */ 469 | struct sockaddr_in in; 470 | memset(&in, 0, sizeof(in)); 471 | in.sin_addr = cfg.encap.dst; 472 | sa = (struct sockaddr*)&in; 473 | sz = sizeof(in); 474 | 475 | sc = bind(cfg.rx_fd, sa, sz); 476 | if (sc < 0) { 477 | fprintf(stderr, "bind: %s\n", strerror(errno)); 478 | goto done; 479 | } 480 | 481 | /* bind specific RX NIC if requested */ 482 | sz = strlen(cfg.dev); 483 | sc = sz ? setsockopt(cfg.rx_fd, SOL_SOCKET, SO_BINDTODEVICE, cfg.dev, sz) : 0; 484 | if (sc < 0) { 485 | fprintf(stderr, "setsockopt: %s\n", strerror(errno)); 486 | goto done; 487 | } 488 | 489 | /* set up recvmmsg buffers */ 490 | assert(BATCH_SIZE == BATCH_PKTS * MAX_PKT); 491 | assert(cfg.gb.n == BATCH_PKTS * MAX_PKT); 492 | assert(cfg.gb.iov && (cfg.gb.iov->n == BATCH_PKTS)); 493 | cfg.gb.iov->i = cfg.gb.iov->n; /* mark slots used */ 494 | iov = (struct iovec*)utvector_head(cfg.gb.iov); 495 | for(i=0; i < BATCH_PKTS; i++) { 496 | iov[i].iov_base = cfg.gb.d + i * MAX_PKT; 497 | iov[i].iov_len = MAX_PKT; 498 | cfg.msgv[i].msg_hdr.msg_iov = &iov[i]; 499 | cfg.msgv[i].msg_hdr.msg_iovlen = 1; 500 | } 501 | 502 | rc = 0; 503 | 504 | done: 505 | return rc; 506 | } 507 | 508 | /* 509 | * Prepare to read packets using an AF_PACKET socket with PACKET_RX_RING 510 | * 511 | * see packet(7) 512 | * 513 | * also see 514 | * sudo apt-get install linux-doc 515 | * zless /usr/share/doc/linux-doc/networking/packet_mmap.txt.gz 516 | * 517 | * With PACKET_RX_RING (in TPACKET_V2) 518 | * the ring buffer consists of an array of packet slots. 519 | * 520 | * Each packet is preceded by a metadata structure in the slot. 521 | * The application and kernel communicate the head and tail of 522 | * the ring through the tp_status field (TP_STATUS_[USER|KERNEL]).
523 | * 524 | */ 525 | 526 | int setup_rx(void) { 527 | int rc=-1, ec; 528 | 529 | if (check_ring_parameters() < 0) goto done; 530 | 531 | /* any link layer protocol packets (linux/if_ether.h) */ 532 | int protocol = htons(ETH_P_ALL); 533 | 534 | /* create the packet socket */ 535 | cfg.fd = socket(AF_PACKET, SOCK_RAW, protocol); 536 | if (cfg.fd == -1) { 537 | fprintf(stderr,"socket: %s\n", strerror(errno)); 538 | goto done; 539 | } 540 | 541 | /* convert interface name to index (in ifr.ifr_ifindex) */ 542 | struct ifreq ifr; 543 | strncpy(ifr.ifr_name, cfg.dev, sizeof(ifr.ifr_name)); 544 | ec = ioctl(cfg.fd, SIOCGIFINDEX, &ifr); 545 | if (ec < 0) { 546 | fprintf(stderr,"failed to find interface %s\n", cfg.dev); 547 | goto done; 548 | } 549 | 550 | /* PACKET_RX_RING comes in multiple versions. TPACKET_V2 is used here */ 551 | int v = TPACKET_V2; 552 | ec = setsockopt(cfg.fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v)); 553 | if (ec < 0) { 554 | fprintf(stderr,"setsockopt PACKET_VERSION: %s\n", strerror(errno)); 555 | goto done; 556 | } 557 | 558 | /* fill out the struct tpacket_req describing the ring buffer */ 559 | memset(&cfg.req, 0, sizeof(cfg.req)); 560 | cfg.req.tp_block_size = cfg.ring_block_sz; /* Min sz of contig block */ 561 | cfg.req.tp_frame_size = cfg.ring_frame_sz; /* Size of frame/snaplen */ 562 | cfg.req.tp_block_nr = cfg.ring_block_nr; /* Number of blocks */ 563 | cfg.req.tp_frame_nr = cfg.ring_frame_nr; /* Total number of frames */ 564 | describe_ring("PACKET_RX_RING"); 565 | ec = setsockopt(cfg.fd, SOL_PACKET, PACKET_RX_RING, &cfg.req, sizeof(cfg.req)); 566 | if (ec < 0) { 567 | fprintf(stderr,"setsockopt PACKET_RX_RING: %s\n", strerror(errno)); 568 | goto done; 569 | } 570 | 571 | /* now map the ring buffer we described above. lock in unswappable memory */ 572 | cfg.pb.n = cfg.req.tp_block_size * cfg.req.tp_block_nr; 573 | cfg.pb.d = mmap(NULL, cfg.pb.n, PROT_READ|PROT_WRITE, 574 | MAP_SHARED|MAP_LOCKED, cfg.fd, 0); 575 | if (cfg.pb.d == MAP_FAILED) { 576 | fprintf(stderr,"mmap: %s\n", strerror(errno)); 577 | goto done; 578 | } 579 | 580 | /* bind to receive the packets from just one interface */ 581 | struct sockaddr_ll sl; 582 | memset(&sl, 0, sizeof(sl)); 583 | sl.sll_family = AF_PACKET; 584 | sl.sll_protocol = protocol; 585 | sl.sll_ifindex = ifr.ifr_ifindex; 586 | ec = bind(cfg.fd, (struct sockaddr*)&sl, sizeof(sl)); 587 | if (ec < 0) { 588 | fprintf(stderr,"socket: %s\n", strerror(errno)); 589 | goto done; 590 | } 591 | 592 | /* set promiscuous mode to get all packets. */ 593 | struct packet_mreq m; 594 | memset(&m, 0, sizeof(m)); 595 | m.mr_ifindex = ifr.ifr_ifindex; 596 | m.mr_type = PACKET_MR_PROMISC; 597 | ec = setsockopt(cfg.fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &m, sizeof(m)); 598 | if (ec < 0) { 599 | fprintf(stderr,"setsockopt PACKET_ADD_MEMBERSHIP: %s\n", strerror(errno)); 600 | goto done; 601 | } 602 | 603 | rc = 0; 604 | 605 | done: 606 | return rc; 607 | } 608 | 609 | /* 610 | * create the transmit socket 611 | * 612 | * There are two fundamentally different types of sockets here, only one 613 | * of which is created, based on whether we are doing *encapsulated* transmit 614 | * (of the packet into a GRE tunnel that then rides over regular IP); or 615 | * "regular" packet transmission where we inject the packet to the NIC. 
616 | * 617 | * MODE SOCKET TYPE SEE ALSO 618 | * -------- ---------------- --------------- 619 | * ENCAPSULATE RAW IP ip(7) and raw(7) 620 | * REGULAR RAW PACKET packet(7) 621 | * 622 | * Within REGULAR mode we further distinguish between sendto()-based 623 | * transmit, versus packet tx ring mode. The latter uses the kernel ring 624 | * buffer mechanism described in packet_mmap.txt. 625 | * 626 | */ 627 | int setup_tx(void) { 628 | char interface[IF_NAMESIZE], *ip; 629 | int rc=-1, ec, one = 1; 630 | 631 | if (cfg.encap.enable) { 632 | 633 | /* in encapsulation mode, use raw IP socket. */ 634 | cfg.tx_fd = socket(AF_INET, SOCK_RAW, IPPROTO_GRE); 635 | if (cfg.tx_fd == -1) { 636 | fprintf(stderr,"socket: %s\n", strerror(errno)); 637 | goto done; 638 | } 639 | 640 | /* IP_HDRINCL means WE form the IP headers.. with some help; see raw(7) */ 641 | ec = setsockopt(cfg.tx_fd, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one)); 642 | if (ec < 0) { 643 | fprintf(stderr,"setsockopt IP_HDRINCL: %s\n", strerror(errno)); 644 | goto done; 645 | } 646 | 647 | /* we need the mtu of the egress NIC to implement IP fragmentation, 648 | * if needed, since raw sockets do not do that for us. to get the 649 | * interface mtu, we need the egress interface, based on routing */ 650 | ec = find_route( cfg.encap.dst.s_addr, interface); 651 | if (ec < 0) { 652 | ip = inet_ntoa(cfg.encap.dst); 653 | fprintf(stderr, "can't determine route to %s\n", ip); 654 | goto done; 655 | } 656 | 657 | cfg.mtu = get_if_mtu(interface); 658 | if (cfg.mtu < 0) { 659 | fprintf(stderr, "mtu lookup failed: %s\n", interface); 660 | goto done; 661 | } 662 | 663 | if (cfg.verbose) { 664 | ip = inet_ntoa(cfg.encap.dst); 665 | fprintf(stderr, "encapsulating to %s on interface %s mtu %d\n", 666 | ip, interface, cfg.mtu); 667 | } 668 | 669 | rc = 0; 670 | goto done; 671 | } 672 | 673 | /* 674 | * standard tx mode 675 | */ 676 | 677 | /* use a raw PACKET (link-level) socket */ 678 | cfg.tx_fd = socket(AF_PACKET, SOCK_RAW, 0 /* tx only */); 679 | if (cfg.tx_fd == -1) { 680 | fprintf(stderr,"socket: %s\n", strerror(errno)); 681 | goto done; 682 | } 683 | 684 | /* convert interface name to index (in ifr.ifr_ifindex) */ 685 | struct ifreq ifr; 686 | strncpy(ifr.ifr_name, cfg.dev, sizeof(ifr.ifr_name)); 687 | ec = ioctl(cfg.tx_fd, SIOCGIFINDEX, &ifr); 688 | if (ec < 0) { 689 | fprintf(stderr,"failed to find interface %s\n", cfg.dev); 690 | goto done; 691 | } 692 | 693 | /* bind interface for tx */ 694 | struct sockaddr_ll sl; 695 | memset(&sl, 0, sizeof(sl)); 696 | sl.sll_family = AF_PACKET; 697 | sl.sll_ifindex = ifr.ifr_ifindex; 698 | ec = bind(cfg.tx_fd, (struct sockaddr*)&sl, sizeof(sl)); 699 | if (ec < 0) { 700 | fprintf(stderr,"socket: %s\n", strerror(errno)); 701 | goto done; 702 | } 703 | 704 | /* when qdisc bypass is enabled, to quote packet_mmap.txt, "packets sent 705 | * through PF_PACKET will bypass the kernel's qdisc layer and are ... 706 | * pushed to the driver directly. Meaning, packet are not buffered, tc 707 | * disciplines are ignored, increased loss can occur and such packets are 708 | * not visible to other PF_PACKET sockets anymore." 709 | */ 710 | #ifdef PACKET_QDISC_BYPASS 711 | ec = cfg.bypass_qdisc_on_tx ? 
712 | setsockopt(cfg.tx_fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)) : 0; 713 | if (ec < 0) { 714 | fprintf(stderr,"setsockopt PACKET_QDISC_BYPASS: %s\n", strerror(errno)); 715 | goto done; 716 | } 717 | #else 718 | if (cfg.bypass_qdisc_on_tx) { 719 | fprintf(stderr,"setsockopt PACKET_QDISC_BYPASS: unsupported\n"); 720 | goto done; 721 | } 722 | #endif 723 | 724 | /* if we are using standard, sendto-based transmit, we are done */ 725 | if (cfg.use_tx_ring == 0) { 726 | rc = 0; 727 | goto done; 728 | } 729 | 730 | /************************************************************* 731 | * packet tx ring setup 732 | ************************************************************/ 733 | if (check_ring_parameters() < 0) goto done; 734 | 735 | int v = TPACKET_V2; 736 | ec = setsockopt(cfg.tx_fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v)); 737 | if (ec < 0) { 738 | fprintf(stderr,"setsockopt PACKET_VERSION: %s\n", strerror(errno)); 739 | goto done; 740 | } 741 | 742 | /* fill out the struct tpacket_req describing the ring buffer */ 743 | memset(&cfg.req, 0, sizeof(cfg.req)); 744 | cfg.req.tp_block_size = cfg.ring_block_sz; /* Min sz of contig block */ 745 | cfg.req.tp_frame_size = cfg.ring_frame_sz; /* Size of frame/snaplen */ 746 | cfg.req.tp_block_nr = cfg.ring_block_nr; /* Number of blocks */ 747 | cfg.req.tp_frame_nr = cfg.ring_frame_nr; /* Total number of frames */ 748 | describe_ring("PACKET_TX_RING"); 749 | ec = setsockopt(cfg.tx_fd, SOL_PACKET, PACKET_TX_RING, &cfg.req, sizeof(cfg.req)); 750 | if (ec < 0) { 751 | fprintf(stderr,"setsockopt PACKET_TX_RING: %s\n", strerror(errno)); 752 | goto done; 753 | } 754 | 755 | /* map the tx ring buffer into unswappable memory */ 756 | cfg.pb.n = cfg.req.tp_block_size * cfg.req.tp_block_nr; 757 | cfg.pb.d = mmap(NULL, cfg.pb.n, PROT_READ|PROT_WRITE, 758 | MAP_SHARED|MAP_LOCKED, cfg.tx_fd, 0); 759 | if (cfg.pb.d == MAP_FAILED) { 760 | fprintf(stderr,"mmap: %s\n", strerror(errno)); 761 | goto done; 762 | } 763 | 764 | rc = 0; 765 | 766 | done: 767 | return rc; 768 | } 769 | 770 | int bb_flush(struct shr *s, struct bb *b) { 771 | int rc = -1; 772 | struct iovec *iov; 773 | size_t n; 774 | ssize_t wr; 775 | 776 | n = utvector_len(b->iov); 777 | if (n == 0) { rc = 0; goto done; } 778 | iov = (struct iovec*)utvector_head(b->iov); 779 | 780 | wr = shr_writev(s, iov, n); 781 | if (wr < 0) { 782 | fprintf(stderr,"shr_write: error code %ld\n", (long)wr); 783 | goto done; 784 | } 785 | b->u = 0; 786 | utvector_clear(b->iov); 787 | 788 | rc = 0; 789 | 790 | done: 791 | return rc; 792 | } 793 | 794 | /* store the message into the batch buffer */ 795 | ssize_t bb_write(struct shr *s, struct bb *b, char *buf, size_t len) { 796 | struct iovec io; 797 | int rc = -1; 798 | 799 | if (b->n - b->u < len) { 800 | if (bb_flush(s,b) < 0) goto done; 801 | } 802 | 803 | assert((b->n - b->u) >= len); 804 | 805 | io.iov_base = &b->d[b->u]; 806 | io.iov_len = len; 807 | memcpy(io.iov_base, buf, len); 808 | utvector_push(b->iov, &io); 809 | b->u += len; 810 | 811 | rc = 0; 812 | 813 | done: 814 | return (rc < 0) ? (ssize_t)-1 : len; 815 | } 816 | 817 | /* add rx drops to the counter in the ring app data 818 | * 819 | * see /usr/include/linux/if_packet.h 820 | * see packet(7) 821 | * "Receiving statistics resets the internal counters." 
822 | * 823 | */ 824 | int update_rx_drops(void) { 825 | struct tpacket_stats stats; 826 | struct fluxcap_stats st; 827 | size_t st_sz; 828 | void *stp; 829 | int sc, rc = -1; 830 | 831 | assert(cfg.mode == mode_receive); 832 | if (cfg.losing == 0) return 0; 833 | 834 | /* packet(7): "Receiving statistics resets the internal counters." */ 835 | socklen_t len = sizeof(stats); 836 | sc = getsockopt(cfg.fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len); 837 | if (sc < 0) { 838 | fprintf(stderr,"getsockopt: %s\n", strerror(errno)); 839 | return -1; 840 | } 841 | 842 | if (cfg.verbose) { 843 | fprintf(stderr, "Received packets: %u\n", stats.tp_packets); 844 | fprintf(stderr, "Dropped packets: %u\n", stats.tp_drops); 845 | } 846 | 847 | stp = &st; 848 | st_sz = sizeof(st); 849 | 850 | sc = shr_appdata(cfg.ring, &stp, NULL, &st_sz); /* "get" */ 851 | if (sc < 0) { 852 | fprintf(stderr, "shr_appdata: error %d\n", sc); 853 | goto done; 854 | } 855 | 856 | st.rx_drops += stats.tp_drops; 857 | 858 | sc = shr_appdata(cfg.ring, NULL, stp, &st_sz); /* "set" */ 859 | if (sc < 0) { 860 | fprintf(stderr, "shr_appdata: error %d\n", sc); 861 | goto done; 862 | } 863 | 864 | cfg.losing = 0; 865 | rc = 0; 866 | 867 | done: 868 | return rc; 869 | } 870 | 871 | /* add ring read drops to the counter in the ring app data */ 872 | int update_rd_drops(void) { 873 | struct fluxcap_stats st; 874 | size_t st_sz; 875 | void *stp; 876 | int sc, rc = -1; 877 | 878 | stp = &st; 879 | st_sz = sizeof(st); 880 | 881 | sc = shr_appdata(cfg.ring, &stp, NULL, &st_sz); /* "get" */ 882 | if (sc < 0) { 883 | fprintf(stderr, "shr_appdata: error %d\n", sc); 884 | goto done; 885 | } 886 | 887 | st.rd_drops += shr_farm_stat(cfg.ring, 1); 888 | 889 | sc = shr_appdata(cfg.ring, NULL, stp, &st_sz); /* "set" */ 890 | if (sc < 0) { 891 | fprintf(stderr, "shr_appdata: error %d\n", sc); 892 | goto done; 893 | } 894 | 895 | rc = 0; 896 | 897 | done: 898 | return rc; 899 | } 900 | 901 | /* returns volatile memory - use immediately or copy. 902 | * takes bits-per-second as input, returns like "20 Mbit/s" 903 | * where "bit" is the unit, can also be "pkt" etc. 
904 | * using whatever SI unit is most readable (K,M,G,T) 905 | */ 906 | char *format_rate(unsigned long bps, char *unit) { 907 | double b = bps; 908 | char *c = ""; 909 | if (b > 1024) { b /= 1024; c = "K"; } 910 | if (b > 1024) { b /= 1024; c = "M"; } 911 | if (b > 1024) { b /= 1024; c = "G"; } 912 | if (b > 1024) { b /= 1024; c = "T"; } 913 | utstring_clear(cfg.tmp); 914 | utstring_printf(cfg.tmp, "%.0f %s%s/s", b, c, unit); 915 | return utstring_body(cfg.tmp); 916 | } 917 | 918 | /* 919 | * status_rings 920 | * 921 | * update i/o metrics for each ring 922 | * 923 | */ 924 | int status_rings(void) { 925 | unsigned long start_tick, st, ct; 926 | struct shr_stat *ss; 927 | double elapsed_sec, lg10_b; 928 | size_t sz; 929 | int rc = -1, sc, i; 930 | char *name, *c; 931 | struct shr **r; 932 | struct ww *w; 933 | UT_string *s; 934 | ssize_t nr; 935 | void *fs; 936 | 937 | /* unicode 1/8 width box progression */ 938 | char *blocks[] = { "", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"}; 939 | 940 | printf("\033[1;1H"); /* position at line 0, col 0 */ 941 | printf("\033[1m"); /* bold */ 942 | printf(" %-20s | %-12s | %-12s | %-12s \n\n", 943 | "name", "rx-rate", "rx-drop", "tx-drop"); 944 | printf("\033[m"); /* reset attributes */ 945 | 946 | /* go through the rings to obtain their in/out counters */ 947 | s = NULL; 948 | r = NULL; 949 | w = NULL; 950 | while ( (r = (struct shr**)utvector_next(cfg.watch_rings, r))) { 951 | s = (UT_string*)utvector_next(cfg.watch_names, s); 952 | w = (struct ww*)utvector_next(cfg.watch_win, w); 953 | assert(s); 954 | assert(w); 955 | 956 | name = utstring_body(s); 957 | 958 | ss = &w->win[ cfg.ticks % NWIN ].ss; 959 | sc = shr_stat(*r, ss, NULL); 960 | if (sc < 0) goto done; 961 | 962 | fs = &w->win[ cfg.ticks % NWIN ].fs; 963 | sz = sizeof(struct fluxcap_stats); 964 | sc = shr_appdata(*r, &fs, NULL, &sz); 965 | if (sc < 0) { 966 | fprintf(stderr, "shr_appdata: error %d\n", sc); 967 | goto done; 968 | } 969 | 970 | /* for this ring, compute intake & drops over the windows */ 971 | start_tick = (cfg.ticks < NWIN) ? 0 : (cfg.ticks - (NWIN - 1)); 972 | st = start_tick % NWIN; 973 | ct = cfg.ticks % NWIN; 974 | w->bw = w->win[ ct ].ss.bw - 975 | w->win[ st ].ss.bw; 976 | w->mw = w->win[ ct ].ss.mw - 977 | w->win[ st ].ss.mw; 978 | w->rx = w->win[ ct ].fs.rx_drops - 979 | w->win[ st ].fs.rx_drops; 980 | w->rd = w->win[ ct ].fs.rd_drops - 981 | w->win[ st ].fs.rd_drops; 982 | 983 | /* compute per second rates, log and strings */ 984 | elapsed_sec = (cfg.ticks - start_tick) * 1.0 / TIMER_HZ; 985 | memset( &w->ps, 0, sizeof(w->ps) ); 986 | if (elapsed_sec > 0) { 987 | w->ps.B = w->bw / elapsed_sec; 988 | w->ps.b = w->ps.B * 8; 989 | lg10_b = w->ps.b ? 
log10(w->ps.b) : 0; 990 | w->ps.lg10_b = (unsigned)floor(lg10_b); /* integer part */ 991 | w->ps.lg10_bf = (lg10_b - w->ps.lg10_b) * 8; /* fraction n/8 */ 992 | w->ps.rx = w->rx / elapsed_sec; 993 | w->ps.rd = w->rd / elapsed_sec; 994 | } 995 | 996 | /* render strings */ 997 | strncpy(w->name, name, NAME_MAX); 998 | w->name[NAME_MAX - 1] = '\0'; 999 | snprintf(w->ps.str.b, RATE_MAX, "%lu", w->ps.b); 1000 | snprintf(w->ps.str.rx, RATE_MAX, "%lu", w->ps.rx); 1001 | snprintf(w->ps.str.rd, RATE_MAX, "%lu", w->ps.rd); 1002 | 1003 | /* bits/s in */ 1004 | c = format_rate(w->ps.b, "bit"); 1005 | assert(strlen(c)+1 <= RATE_MAX); 1006 | strncpy(w->ps.str.E, c, RATE_MAX); 1007 | 1008 | /* rx (ingest) drops/s */ 1009 | c = format_rate(w->ps.rx, "bit"); 1010 | assert(strlen(c)+1 <= RATE_MAX); 1011 | strncpy(w->ps.str.X, c, RATE_MAX); 1012 | 1013 | /* rd (reader) drops/s */ 1014 | c = format_rate(w->ps.rd, "bit"); 1015 | assert(strlen(c)+1 <= RATE_MAX); 1016 | strncpy(w->ps.str.D, c, RATE_MAX); 1017 | 1018 | /* render to terminal */ 1019 | printf(" %-20.20s | %-12s | %-12s | %-12s ", 1020 | w->name, w->ps.str.E, w->ps.str.X, w->ps.str.D); 1021 | for(i=0; i < w->ps.lg10_b; i++) printf("%s", blocks[8]); 1022 | printf("%s", blocks[ w->ps.lg10_bf ]); 1023 | printf("\033[0K"); /* erase to end of line */ 1024 | printf("\n"); 1025 | } 1026 | 1027 | rc = 0; 1028 | 1029 | done: 1030 | return rc; 1031 | } 1032 | 1033 | /* work we do at 10hz 1034 | * 1035 | * normally nexp (number of expirations) is 1. 1036 | * in a busy process expirations may coalesce. 1037 | * 1038 | * we do "rainy day" cache flushes below 1039 | * so that time, like capacity, induce flush 1040 | */ 1041 | int timer_work(unsigned long nexp) { 1042 | int rc = -1, sc; 1043 | struct shr **r; 1044 | struct bb *b; 1045 | 1046 | switch(cfg.mode) { 1047 | 1048 | case mode_transmit: 1049 | sc = update_rd_drops(); 1050 | if (sc < 0) goto done; 1051 | break; 1052 | 1053 | case mode_receive: 1054 | sc = bb_flush(cfg.ring, &cfg.bb); 1055 | if (sc < 0) goto done; 1056 | sc = update_rx_drops(); 1057 | if (sc < 0) goto done; 1058 | break; 1059 | 1060 | case mode_watch: 1061 | sc = status_rings(); 1062 | if (sc < 0) goto done; 1063 | break; 1064 | 1065 | default: 1066 | break; 1067 | } 1068 | 1069 | rc = 0; 1070 | 1071 | done: 1072 | return rc; 1073 | } 1074 | 1075 | int show_stats(void) { 1076 | 1077 | return 0; 1078 | } 1079 | 1080 | int handle_signal(void) { 1081 | struct signalfd_siginfo info; 1082 | ssize_t nr; 1083 | int rc=-1; 1084 | 1085 | nr = read(cfg.signal_fd, &info, sizeof(info)); 1086 | if (nr != sizeof(info)) { 1087 | fprintf(stderr,"failed to read signal fd buffer\n"); 1088 | goto done; 1089 | } 1090 | 1091 | switch(info.ssi_signo) { 1092 | case SIGALRM: 1093 | gettimeofday(&cfg.now, NULL); 1094 | if (cfg.verbose) show_stats(); 1095 | alarm(1); 1096 | break; 1097 | default: 1098 | fprintf(stderr,"got signal %d\n", info.ssi_signo); 1099 | goto done; 1100 | break; 1101 | } 1102 | 1103 | rc = 0; 1104 | 1105 | done: 1106 | return rc; 1107 | } 1108 | 1109 | /* 1110 | * handle_timer 1111 | * 1112 | * triggered when our timerfd periodically expires. 1113 | * number of expirations is usually 1, but in a very 1114 | * busy process multiple expirations can coalesce. 
1115 | * 1116 | */ 1117 | int handle_timer(void) { 1118 | unsigned long nexp; 1119 | int rc=-1, sc; 1120 | 1121 | sc = read(cfg.timer_fd, &nexp, sizeof(nexp)); 1122 | if (sc < 0) { 1123 | fprintf(stderr,"read: %s\n", strerror(errno)); 1124 | goto done; 1125 | } 1126 | 1127 | sc = timer_work(nexp); 1128 | if (sc < 0) goto done; 1129 | 1130 | cfg.ticks++; 1131 | 1132 | rc = 0; 1133 | 1134 | done: 1135 | return rc; 1136 | } 1137 | 1138 | /* 1139 | * encapsulate_tx 1140 | * 1141 | * using a raw IP socket, transmit GRE-or-VXLAN encapsulated packets. 1142 | * if necessary, perform IP fragmentation ourselves, as this 1143 | * is not done by the OS when using raw sockets. 1144 | */ 1145 | char gbuf[MAX_PKT]; 1146 | int encapsulate_tx(char *tx, ssize_t nx) { 1147 | uint16_t encap_ethertype, more_fragments=1, fo=0, fn=0; 1148 | uint32_t ip_src, ip_dst, seqno, off; 1149 | char *g, *ethertype, ipproto; 1150 | struct sockaddr_in sin; 1151 | struct sockaddr *dst; 1152 | ssize_t nr, fl; 1153 | socklen_t sz; 1154 | 1155 | uint16_t vxlan_src_port; 1156 | uint16_t vxlan_dst_port; 1157 | uint16_t vxlan_udp_len; 1158 | uint16_t vxlan_udp_cksum; 1159 | uint8_t vxlan_flags; 1160 | uint8_t *vni_big_endian; 1161 | 1162 | assert(nx >= 14); 1163 | 1164 | ip_src = 0; 1165 | ip_dst = cfg.encap.dst.s_addr; 1166 | 1167 | sin.sin_family = AF_INET; 1168 | sin.sin_port = 0; 1169 | sin.sin_addr = cfg.encap.dst; 1170 | dst = (struct sockaddr*)&sin; 1171 | sz = sizeof(sin); 1172 | 1173 | cfg.ip_id++; 1174 | g = gbuf; 1175 | off = 0; 1176 | 1177 | /* use IPPROTO_GRE (47) for gre/gretap or IPPROTO_UDP (17) for vxlan */ 1178 | ipproto = (cfg.encap.mode == mode_vxlan) ? IPPROTO_UDP : IPPROTO_GRE; 1179 | 1180 | /* construct 20-byte IP header. 1181 | * NOTE: some zeroed header fields are filled out for us, when we send this 1182 | * packet; particularly, checksum, src IP; ID and total length. see raw(7). 1183 | */ 1184 | g[0] = 4 << 4; /* IP version goes in MSB (upper 4 bits) of the first byte */ 1185 | g[0] |= 5; /* IP header length (5 * 4 = 20 bytes) in lower 4 bits */ 1186 | g[1] = 0; /* DSCP / ECN */ 1187 | g[2] = 0; /* total length (upper byte) (see NOTE) */ 1188 | g[3] = 0; /* total length (lower byte) (see NOTE) */ 1189 | g[4] = (cfg.ip_id & 0xff00) >> 8; /* id (upper byte); for frag reassembly */ 1190 | g[5] = (cfg.ip_id & 0x00ff); /* id (lower byte); for frag reassembly */ 1191 | g[6] = 0; /* 0 DF MF flags and upper bits of frag offset */ 1192 | g[7] = 0; /* lower bits of frag offset */ 1193 | g[8] = 255; /* TTL */ 1194 | g[9] = ipproto; /* IPPROTO_GRE or IPPROTO_UDP (VXLAN) */ 1195 | g[10] = 0; /* IP checksum (high byte) (see NOTE) */ 1196 | g[11] = 0; /* IP checksum (low byte) (see NOTE) */ 1197 | memcpy(&g[12], &ip_src, sizeof(ip_src)); /* IP source (see NOTE) */ 1198 | memcpy(&g[16], &ip_dst, sizeof(ip_dst)); /* IP destination */ 1199 | 1200 | g += 20; 1201 | 1202 | /* GRE or UDP header starts */ 1203 | 1204 | switch(cfg.encap.mode) { 1205 | case mode_gre: 1206 | memset(g, 0, 2); /* zero first two bytes of GRE header */ 1207 | g[0] |= (cfg.encap.key ? (1U << 5) : 0); /* key bit */ 1208 | g += 2; 1209 | ethertype = &tx[12]; /* copy ethertype from packet into GRE header */ 1210 | memcpy(g, ethertype, sizeof(uint16_t)); 1211 | g += 2; 1212 | if (cfg.encap.key) { 1213 | memcpy(g, &cfg.encap.key, 4); 1214 | g += 4; 1215 | } 1216 | nx -= 14; tx += 14; // elide original MACs and ethertype! 
1217 | assert(nx <= sizeof(gbuf)-(g-gbuf)); 1218 | memcpy(g, tx, nx); 1219 | g += nx; 1220 | nx = g-gbuf; 1221 | break; 1222 | case mode_gretap: 1223 | memset(g, 0, 2); /* zero first two bytes of GRE header */ 1224 | g[0] |= (cfg.encap.key ? (1U << 5) : 0); /* key bit */ 1225 | g += 2; 1226 | encap_ethertype = htons(0x6558); /* transparent ethernet bridging */ 1227 | memcpy(g, &encap_ethertype, sizeof(uint16_t)); 1228 | g += 2; 1229 | if (cfg.encap.key) { 1230 | memcpy(g, &cfg.encap.key, 4); 1231 | g += 4; 1232 | } 1233 | assert(nx <= sizeof(gbuf)-(g-gbuf)); 1234 | memcpy(g, tx, nx); 1235 | g += nx; 1236 | nx = g-gbuf; 1237 | break; 1238 | case mode_vxlan: 1239 | /* 8 byte UDP header */ 1240 | vxlan_src_port = htons(9999); /* arbitrary */ 1241 | vxlan_dst_port = htons(4789); /* IANA assigned VXLAN dest port */ 1242 | vxlan_udp_len = htons(nx+8+8); /* payload + VXLAN header + UDP header */ 1243 | vxlan_udp_cksum = htons(0); 1244 | memcpy(g+0, &vxlan_src_port, 2); 1245 | memcpy(g+2, &vxlan_dst_port, 2); 1246 | memcpy(g+4, &vxlan_udp_len, 2); 1247 | memcpy(g+6, &vxlan_udp_cksum, 2); 1248 | g += 8; 1249 | /* 8 byte VXLAN header */ 1250 | vxlan_flags = 0x8; /* set I flag only */ 1251 | memcpy(g+0, &vxlan_flags, 1); 1252 | memset(g+1, 0, 7); /* clear reserved bits */ 1253 | /* vxlan VNI is 24 bit. copy the three LS bytes 1254 | * of cfg.encap.key. it's already in net order */ 1255 | vni_big_endian = ((uint8_t*)&cfg.encap.key) + 1; 1256 | memcpy(g+4, vni_big_endian, 3); 1257 | g += 8; 1258 | assert(nx <= sizeof(gbuf)-(g-gbuf)); 1259 | memcpy(g, tx, nx); 1260 | g += nx; 1261 | nx = g-gbuf; 1262 | break; 1263 | default: 1264 | assert(0); 1265 | break; 1266 | } 1267 | 1268 | /* 1269 | * send IP packet, performing fragmentation if greater than mtu 1270 | */ 1271 | do { 1272 | 1273 | more_fragments = (nx > cfg.mtu) ? 1 : 0; 1274 | assert((off & 0x7) == 0); 1275 | fo = off / 8; 1276 | 1277 | gbuf[6] = more_fragments ? (1 << 5) : 0; /* 0 DF [MF] flag */ 1278 | gbuf[6] |= (fo & 0x1f00) >> 8; /* upper bits of frag offset */ 1279 | gbuf[7] = fo & 0x00ff; /* lower bits of frag offset */ 1280 | 1281 | /* choose fragment length so it's below MTU and so the payload 1282 | * length after 20 byte header is a multiple of 8 as required */ 1283 | if (more_fragments) 1284 | fl = ((cfg.mtu - 20) & ~7U) + 20; 1285 | else 1286 | fl = nx; 1287 | 1288 | nr = sendto(cfg.tx_fd, gbuf, fl, 0, dst, sz); 1289 | if (nr != fl) { 1290 | fprintf(stderr,"sendto: %s\n", (nr<0) ? 1291 | strerror(errno) : "incomplete"); 1292 | return -1; 1293 | } 1294 | 1295 | /* keeping 20-byte IP header, slide next fragment payload */ 1296 | if (more_fragments) { 1297 | assert(fl > 20); 1298 | memmove(&gbuf[20], &gbuf[fl], nx - fl); 1299 | off += (fl - 20); 1300 | nx -= (fl - 20); 1301 | } 1302 | 1303 | } while (more_fragments); 1304 | 1305 | return 0; 1306 | } 1307 | 1308 | /* inject four bytes to the ethernet frame with an 802.1q vlan tag. 
1309 | * note if this makes MTU exceeded it may result in sendto error */ 1310 | char buf[MAX_PKT]; 1311 | char vlan_tag[VLAN_LEN] = {0x81, 0x00, 0x00, 0x00}; 1312 | char *inject_vlan(char *tx, ssize_t *nx, uint16_t vlan) { 1313 | if (((*nx) + 4) > MAX_PKT) return NULL; 1314 | if ((*nx) <= MACS_LEN) return NULL; 1315 | /* prepare 802.1q tag vlan portion in network order */ 1316 | uint16_t v = htons(vlan); 1317 | memcpy(&vlan_tag[2], &v, sizeof(v)); 1318 | /* copy MAC's from original packet, inject 802.1q, copy packet */ 1319 | memcpy(buf, tx, MACS_LEN); 1320 | memcpy(buf+MACS_LEN, vlan_tag, VLAN_LEN); 1321 | memcpy(buf+MACS_LEN+VLAN_LEN, tx + MACS_LEN, (*nx) - MACS_LEN); 1322 | *nx += 4; 1323 | return buf; 1324 | } 1325 | 1326 | /* apply filtering to a rx or tx packet */ 1327 | int keep_packet(char *tx, size_t nx) { 1328 | uint16_t vlan; 1329 | int r; 1330 | 1331 | /* apply vlan test, if enabled */ 1332 | if (cfg.pass_vlan) { 1333 | 1334 | if (nx < MACS_LEN + VLAN_LEN) 1335 | return 0; 1336 | 1337 | if (memcmp(&tx[MACS_LEN], "\x81\x00", 2)) 1338 | return 0; 1339 | 1340 | memcpy(&vlan, &tx[MACS_LEN+2], sizeof(vlan)); 1341 | vlan = ntohs(vlan); 1342 | vlan &= 0x0fff; 1343 | 1344 | if (vlan != cfg.pass_vlan) 1345 | return 0; 1346 | } 1347 | 1348 | /* apply random drop, if enabled */ 1349 | if (cfg.drop_pct != 0) { 1350 | r = rand(); 1351 | if ((r * 100.0 / RAND_MAX) < cfg.drop_pct) 1352 | return 0; 1353 | } 1354 | 1355 | return 1; 1356 | } 1357 | 1358 | /* tx-ring mode only: start transmission from the ring */ 1359 | int initiate_transmit(void) { 1360 | 1361 | assert(cfg.use_tx_ring); 1362 | 1363 | /* initiate transmit, without waiting for completion */ 1364 | if (send(cfg.tx_fd, NULL, 0, MSG_DONTWAIT) < 0) { 1365 | 1366 | /* if tx is underway or the kernel can't sink any more data we can get 1367 | * "resource temporarily unavailable". solution: start a blocking tx */ 1368 | if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { 1369 | 1370 | if (send(cfg.tx_fd, NULL, 0, 0) < 0) { 1371 | fprintf(stderr,"blocking transmit failed: %s\n", strerror(errno)); 1372 | return -1; 1373 | } 1374 | 1375 | } else { 1376 | 1377 | /* any other kind of send error is fatal */ 1378 | fprintf(stderr,"failed to initiate transmit: %s\n", strerror(errno)); 1379 | return -1; 1380 | } 1381 | } 1382 | 1383 | return 0; 1384 | } 1385 | 1386 | /* tx-ring mode only: poll kernel for space availability in tx ring */ 1387 | int wait_for_tx_space(void) { 1388 | int rc, timeout = 1000; /* milliseconds */ 1389 | 1390 | assert(cfg.use_tx_ring); 1391 | 1392 | struct pollfd pfd; 1393 | pfd.fd = cfg.tx_fd; 1394 | pfd.revents = 0; 1395 | pfd.events = POLLOUT; 1396 | 1397 | rc = poll(&pfd, 1, timeout); 1398 | if (rc <= 0) { 1399 | fprintf(stderr, "poll for tx space: %s\n", rc ? 
strerror(errno) : "timeout");
1400 | return -1;
1401 | }
1402 |
1403 | return 0;
1404 | }
1405 |
1406 | int transmit_packets(void) {
1407 | int rc=-1, n, len, nq=0, failsafe=0;
1408 | struct sockaddr *dst = NULL;
1409 | struct sockaddr_in sin;
1410 | ssize_t nr,nt,nx;
1411 | struct iovec *io;
1412 | socklen_t sz = 0;
1413 | uint8_t *mac;
1414 | size_t nio;
1415 |
1416 | /* get pointer to iov array to be populated */
1417 | utvector_clear(cfg.rb.iov);
1418 | nio = cfg.rb.iov->n;
1419 | io = (struct iovec*)cfg.rb.iov->d;
1420 |
1421 | /* read packets, up to BATCH_PKTS or BATCH_SIZE bytes */
1422 | nr = shr_readv(cfg.ring, cfg.rb.d, cfg.rb.n, io, &nio);
1423 | if (nr < 0) {
1424 | fprintf(stderr, "shr_readv error: %ld\n", (long)nr);
1425 | goto done;
1426 | }
1427 |
1428 | /* set number of used iov slots */
1429 | assert(nio <= cfg.rb.iov->n);
1430 | cfg.rb.iov->i = nio;
1431 |
1432 | /* iterate over packets obtained in shr_readv */
1433 | io = NULL;
1434 | while ( (io = utvector_next(cfg.rb.iov, io))) {
1435 |
1436 | char *tx = io->iov_base; /* packet */
1437 | nx = io->iov_len; /* length */
1438 | if (keep_packet(tx, nx) == 0) continue;
1439 |
1440 | /* inject 802.1q tag if requested */
1441 | if (cfg.vlan) tx = inject_vlan(tx,&nx,cfg.vlan);
1442 | if (tx == NULL) {
1443 | fprintf(stderr, "vlan tag injection failed\n");
1444 | goto done;
1445 | }
1446 |
1447 | /* truncate outgoing packet if requested */
1448 | if (cfg.size && (nx > cfg.size)) nx = cfg.size;
1449 |
1450 | /* trim N bytes from frame end if requested. */
1451 | if (cfg.tail && (nx > cfg.tail)) nx -= cfg.tail;
1452 |
1453 | /* wrap encapsulation around it, if enabled */
1454 | if (cfg.encap.enable) {
1455 |
1456 | if (encapsulate_tx(tx, nx)) goto done;
1457 | continue;
1458 |
1459 | } else if (cfg.use_tx_ring == 0) {
1460 |
1461 | nt = sendto(cfg.tx_fd, tx, nx, 0, dst, sz);
1462 | if (nt != nx) {
1463 | fprintf(stderr,"sendto: %s\n", (nt<0) ?
1464 | strerror(errno) : "incomplete");
1465 | goto done;
1466 | }
1467 |
1468 | continue;
1469 | }
1470 |
1471 | /*************************************************************
1472 | * packet tx ring mode below
1473 | ************************************************************/
1474 |
1475 | assert(cfg.encap.enable == 0);
1476 | assert(cfg.use_tx_ring);
1477 |
1478 | /* copy packet into kernel tx ring
1479 | *
1480 | * each packet occupies a slot. a tpacket2_hdr precedes the packet.
1481 | * once we initiate transmission from the ring, the tx progresses
1482 | * in kernel space. later, when we come round to the slot again,
1483 | * we can check its transmission status or outcome.
1484 | *
1485 | * a tx error, due to a malformed packet, causes the kernel to stop
1486 | * transmitting from the ring. it sets TP_STATUS_WRONG_FORMAT on the
1487 | * packet. normally, we treat this condition fatally. if the "keep
1488 | * going" option is enabled, tx errors are suppressed and ignored.
1489 |
1490 | * when we are about to write a packet into the slot, we may find
1491 | * the slot is in this tx error state due to the previous packet.
1492 | * or, we may find that the slot is still in-use. due to our
1493 | * independence from the actual transmission process, we only learn
1494 | * of these states when we come round to the slot. it is normal to
1495 | * encounter uninitiated or in-progress transmission, and we wait
1496 | * for availability in the ring in that case.
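 * slot status lifecycle, as handled below: TP_STATUS_AVAILABLE (slot free for
 * us to fill) -> TP_STATUS_SEND_REQUEST (set by us) -> TP_STATUS_SENDING (set
 * by the kernel) -> TP_STATUS_AVAILABLE again, or TP_STATUS_WRONG_FORMAT on a
 * tx error.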
1497 | *
1498 | * for all its sophistication, the ring-based transmitter had
1499 | * lower performance in my tests than the sendto-based transmitter.
1500 | * this may be due to the extra copying we do to populate the ring.
1501 | * this is why the sendto-transmitter is used by default.
1502 | *
1503 | */
1504 |
1505 | /* get address of the current slot (metadata header, pad, packet) */
1506 | uint8_t *cur = cfg.pb.d + cfg.ring_curr_idx * cfg.ring_frame_sz;
1507 |
1508 | /* struct tpacket2_hdr is defined in /usr/include/linux/if_packet.h */
1509 | struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)cur;
1510 |
1511 | retry_slot:
1512 |
1513 | if (failsafe++ > 1) {
1514 | fprintf(stderr, "internal error awaiting tx ring availability\n");
1515 | goto done;
1516 | }
1517 |
1518 | /* did the slot have a previous error? */
1519 | if (hdr->tp_status == TP_STATUS_WRONG_FORMAT) {
1520 | fprintf(stderr,"tx error- frame dump follows; exiting.\n");
1521 | hexdump(cur, cfg.ring_frame_sz);
1522 | goto done;
1523 | }
1524 |
1525 | /* is the slot in-use, in the midst of transmission? */
1526 | if (hdr->tp_status == TP_STATUS_SENDING) {
1527 | if (wait_for_tx_space() < 0) goto done;
1528 | goto retry_slot;
1529 | }
1530 |
1531 | /* is the slot in-use, awaiting transmit to begin? this can happen if
1532 | * we loop around the ring, before initiating transmit (say, if the batch
1533 | * size exceeds the ring size). it can also happen if we did initiate tx,
1534 | * if the kernel has yet to get to this packet and flag it sending.
1535 | */
1536 | if (hdr->tp_status == TP_STATUS_SEND_REQUEST) {
1537 | if (initiate_transmit() < 0) goto done;
1538 | if (wait_for_tx_space() < 0) goto done;
1539 | goto retry_slot;
1540 | }
1541 |
1542 | /* if we got here, the slot _must_ be available. right? */
1543 | if (hdr->tp_status != TP_STATUS_AVAILABLE) {
1544 | fprintf(stderr,"tx slot: unexpected flag %d\n", hdr->tp_status);
1545 | goto done;
1546 | }
1547 |
1548 | failsafe = 0; /* reset loop safeguard */
1549 |
1550 | /* put packet's link level header (first MAC) after the tpacket2_hdr plus
1551 | * alignment gap. (struct sockaddr_ll is not in the slot during tx).
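 * slot layout: [struct tpacket2_hdr][pad to TPACKET_ALIGNMENT][frame bytes],
 * which is why the packet start and usable length are computed with
 * TPACKET_ALIGN() below.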
*/ 1552 | mac = (uint8_t*)TPACKET_ALIGN(((unsigned long)cur) + 1553 | sizeof(struct tpacket2_hdr)); 1554 | len = cfg.ring_frame_sz - (mac - cur); 1555 | if (nx > len) { 1556 | fprintf(stderr, "packet length %ld exceeds effective frame_size %d\n", 1557 | (long)nx, len); 1558 | goto done; 1559 | } 1560 | 1561 | /* populate packet proper */ 1562 | memcpy(mac, tx, nx); 1563 | hdr->tp_len = nx; 1564 | hdr->tp_status = TP_STATUS_SEND_REQUEST; 1565 | nq++; 1566 | 1567 | /* point to next slot */ 1568 | cfg.ring_curr_idx = (cfg.ring_curr_idx + 1) % cfg.ring_frame_nr; 1569 | } 1570 | 1571 | /* if packets were queued in to kernel tx ring, initiate transmit */ 1572 | if (nq && (initiate_transmit() < 0)) goto done; 1573 | 1574 | rc = 0; 1575 | 1576 | done: 1577 | return rc; 1578 | } 1579 | 1580 | int receive_packets(void) { 1581 | int rc=-1, sw, wire_vlan, form_vlan, keep; 1582 | ssize_t nr,nt,nx; 1583 | struct iovec iov; 1584 | char *tx; 1585 | 1586 | while (1) { 1587 | 1588 | /* get address of the current slot (metadata header, pad, packet) */ 1589 | uint8_t *cur = cfg.pb.d + cfg.ring_curr_idx * cfg.ring_frame_sz; 1590 | 1591 | /* these structs start the frame, see /usr/include/linux/if_packet.h */ 1592 | struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)cur; 1593 | struct sockaddr_ll *sll = (struct sockaddr_ll *)(cur + TPACKET2_HDRLEN); 1594 | 1595 | /* check if the packet is ready. this is how we break the loop */ 1596 | if ((hdr->tp_status & TP_STATUS_USER) == 0) break; 1597 | 1598 | /* note packet drop condition */ 1599 | if (hdr->tp_status & TP_STATUS_LOSING) cfg.losing = 1; 1600 | 1601 | tx = cur + hdr->tp_mac; 1602 | nx = hdr->tp_snaplen; 1603 | 1604 | /* upon receipt the wire vlan (if any) has been pulled out for us */ 1605 | wire_vlan = (hdr->tp_status & TP_STATUS_VLAN_VALID) ? 1606 | (hdr->tp_vlan_tci & 0xfff) : 0; 1607 | form_vlan = cfg.vlan ? cfg.vlan : wire_vlan; 1608 | if (cfg.strip_vlan) form_vlan = 0; 1609 | 1610 | /* inject 802.1q tag if requested */ 1611 | if (form_vlan) tx = inject_vlan(tx,&nx,form_vlan); 1612 | if (tx == NULL) { 1613 | fprintf(stderr, "vlan tag injection failed\n"); 1614 | goto done; 1615 | } 1616 | 1617 | /* truncate packet if requested */ 1618 | if (cfg.size && (nx > cfg.size)) nx = cfg.size; 1619 | 1620 | /* trim N bytes from frame end if requested. */ 1621 | if (cfg.tail && (nx > cfg.tail)) nx -= cfg.tail; 1622 | 1623 | keep = keep_packet(tx,nx); 1624 | 1625 | /* push into batch buffer */ 1626 | sw = keep ? bb_write(cfg.ring, &cfg.bb, tx, nx) : 0; 1627 | if (sw < 0) { 1628 | fprintf(stderr, "bb_write (%lu bytes): error code %d\n", (long)nx, sw); 1629 | goto done; 1630 | } 1631 | 1632 | /* return the packet by assigning status word TP_STATUS_KERNEL (0) */ 1633 | hdr->tp_status = TP_STATUS_KERNEL; 1634 | 1635 | /* next packet */ 1636 | cfg.ring_curr_idx = (cfg.ring_curr_idx + 1) % cfg.ring_frame_nr; 1637 | } 1638 | 1639 | rc = 0; 1640 | 1641 | done: 1642 | return rc; 1643 | } 1644 | 1645 | /* decode the gre packet into its fields. 1646 | * input pkt starts with outer IP header. 1647 | * fields are returned in network order! 1648 | * fields are zeroed if not present 1649 | * on decoding failure, returns -1. 
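 * GRE header layout (RFC 2784/2890): 2 flag/version bytes and a 2-byte protocol
 * type, followed by optional checksum+reserved (4 bytes), key (4 bytes) and
 * sequence number (4 bytes) fields, present when the C (0x80), K (0x20) and
 * S (0x10) bits of the first byte are set.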
1650 | * returns 0 on success 1651 | */ 1652 | #define GRE_MIN_HDR 4 1653 | #define GRE_CHECKSUM_LEN 2 1654 | #define GRE_RESERVED1_LEN 2 1655 | #define GRE_KEY_LEN 4 1656 | #define GRE_SEQNO_LEN 4 1657 | int decode_gre(char *pkt, ssize_t nr, uint16_t *csum, uint32_t *key, 1658 | uint32_t *seqno, char **payload, size_t *plen) { 1659 | int has_key, has_checksum, has_seqno, ko, co, so, po, ip_hdr_len; 1660 | uint8_t ip_proto; 1661 | 1662 | *key = 0; 1663 | *seqno = 0; 1664 | *csum = 0; 1665 | *payload = NULL; 1666 | *plen = 0; 1667 | 1668 | assert(nr > 0); 1669 | ip_hdr_len = (pkt[0] & 0x0f) * 4; 1670 | 1671 | if (nr < ip_hdr_len + GRE_MIN_HDR) 1672 | return -1; 1673 | 1674 | ip_proto = pkt[9]; 1675 | if (ip_proto != IPPROTO_GRE) 1676 | return -1; 1677 | 1678 | has_key = pkt[ip_hdr_len] & (1U << 5); 1679 | has_checksum = pkt[ip_hdr_len] & (1U << 7); 1680 | has_seqno = pkt[ip_hdr_len] & (1U << 4); 1681 | 1682 | if (has_checksum) { 1683 | co = ip_hdr_len + GRE_MIN_HDR; 1684 | if (co + GRE_CHECKSUM_LEN > nr) 1685 | return -1; 1686 | memcpy(csum, pkt + co, GRE_CHECKSUM_LEN); 1687 | } 1688 | 1689 | if (has_key) { 1690 | ko = ip_hdr_len + GRE_MIN_HDR 1691 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0); 1692 | if (ko + GRE_KEY_LEN > nr) 1693 | return -1; 1694 | memcpy(key, pkt + ko, GRE_KEY_LEN); 1695 | } 1696 | 1697 | if (has_seqno) { 1698 | so = ip_hdr_len + GRE_MIN_HDR + 1699 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 1700 | + (has_key ? GRE_KEY_LEN : 0); 1701 | if (so + GRE_SEQNO_LEN > nr) 1702 | return -1; 1703 | memcpy(seqno, pkt + so, GRE_SEQNO_LEN); 1704 | } 1705 | 1706 | po = ip_hdr_len + GRE_MIN_HDR + 1707 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 1708 | + (has_key ? GRE_KEY_LEN : 0) 1709 | + (has_seqno ? GRE_SEQNO_LEN : 0); 1710 | 1711 | *plen = nr - po; 1712 | *payload = pkt + po; 1713 | return 0; 1714 | } 1715 | 1716 | int handle_grerx(void) { 1717 | int i, rc=-1, sc, keep, nmsgs; 1718 | char *data, *tx, *pkt; 1719 | size_t dlen, nx, len; 1720 | uint32_t seqno, key; 1721 | uint16_t csum; 1722 | ssize_t nr; 1723 | 1724 | nmsgs = recvmmsg(cfg.rx_fd, cfg.msgv, BATCH_PKTS, MSG_WAITFORONE, NULL); 1725 | if (nmsgs < 0) { 1726 | fprintf(stderr, "recvmmsg: %s\n", strerror(errno)); 1727 | goto done; 1728 | } 1729 | 1730 | if (cfg.verbose) 1731 | fprintf(stderr, "recvmmsg: %d messages received\n", nmsgs); 1732 | 1733 | for(i=0; i < nmsgs; i++) { 1734 | 1735 | pkt = cfg.msgv[i].msg_hdr.msg_iov[0].iov_base; 1736 | len = cfg.msgv[i].msg_len; 1737 | sc = decode_gre(pkt, len, &csum, &key, &seqno, &data, &dlen); 1738 | if (sc < 0) { 1739 | rc = 0; /* ignore and drop bad packets */ 1740 | goto done; 1741 | } 1742 | 1743 | /* test key matches desired key */ 1744 | if (cfg.encap.key != key) { 1745 | rc = 0; 1746 | goto done; 1747 | } 1748 | 1749 | /* decapsulate packet, advance over GRE header */ 1750 | tx = data; 1751 | nx = dlen; 1752 | if (nx == 0) { 1753 | rc = 0; 1754 | goto done; 1755 | } 1756 | 1757 | /* inject 802.1q tag if requested */ 1758 | if (cfg.vlan) tx = inject_vlan(tx,&nx,cfg.vlan); 1759 | if (tx == NULL) { 1760 | fprintf(stderr, "vlan tag injection failed\n"); 1761 | goto done; 1762 | } 1763 | 1764 | /* truncate packet if requested */ 1765 | if (cfg.size && (nx > cfg.size)) nx = cfg.size; 1766 | 1767 | /* trim N bytes from frame end if requested. */ 1768 | if (cfg.tail && (nx > cfg.tail)) nx -= cfg.tail; 1769 | 1770 | keep = keep_packet(tx,nx); 1771 | 1772 | /* push into batch buffer */ 1773 | sc = keep ? 
bb_write(cfg.ring, &cfg.bb, tx, nx) : 0; 1774 | if (sc < 0) { 1775 | fprintf(stderr, "bb_write (%lu bytes): error code %d\n", (long)nx, sc); 1776 | goto done; 1777 | } 1778 | } 1779 | 1780 | rc = 0; 1781 | 1782 | done: 1783 | return rc; 1784 | } 1785 | 1786 | int handle_io(void) { 1787 | int rc = -1; 1788 | 1789 | switch(cfg.mode) { 1790 | case mode_receive: 1791 | rc = receive_packets(); 1792 | break; 1793 | case mode_transmit: 1794 | rc = transmit_packets(); 1795 | break; 1796 | default: 1797 | assert(0); 1798 | break; 1799 | } 1800 | 1801 | return rc; 1802 | } 1803 | 1804 | size_t kmgt(char *optarg) { 1805 | size_t size=0; 1806 | char unit; 1807 | 1808 | int n = sscanf(optarg, "%lu%c", &size, &unit); 1809 | if (n == 0) usage(); 1810 | if (n == 2) { 1811 | switch (unit) { 1812 | case 't': case 'T': size *= 1024; /* fall through */ 1813 | case 'g': case 'G': size *= 1024; /* fall through */ 1814 | case 'm': case 'M': size *= 1024; /* fall through */ 1815 | case 'k': case 'K': size *= 1024; break; 1816 | default: usage(); break; 1817 | } 1818 | } 1819 | 1820 | return size; 1821 | } 1822 | 1823 | int parse_encap(char *opt) { 1824 | int rc = -1, len; 1825 | char *mode=opt,*name=opt, *colon; 1826 | struct hostent *e; 1827 | 1828 | colon = strchr(mode,':'); 1829 | if (colon) *colon = '\0'; 1830 | else if (cfg.mode == mode_transmit) { 1831 | fprintf(stderr,"encapsulation syntax error\n"); 1832 | goto done; 1833 | } 1834 | 1835 | if (!strcmp(mode,"gre")) cfg.encap.mode = mode_gre; 1836 | else if (!strcmp(mode,"gretap")) cfg.encap.mode = mode_gretap; 1837 | else if (!strcmp(mode,"vxlan")) cfg.encap.mode = mode_vxlan; 1838 | else { 1839 | fprintf(stderr,"invalid encapsulation mode\n"); 1840 | goto done; 1841 | } 1842 | 1843 | /* name is destination hostname (GRE tx mode), 1844 | or local IP to bind (GRE rx mode) */ 1845 | if (colon) { 1846 | name = colon+1; 1847 | e = gethostbyname(name); 1848 | if (e == NULL) { 1849 | fprintf(stderr, "gethostbyname: %s: %s\n", name, hstrerror(h_errno)); 1850 | goto done; 1851 | } 1852 | 1853 | if (e->h_length != sizeof(cfg.encap.dst)) { 1854 | fprintf(stderr, "DNS result size mismatch\n"); 1855 | goto done; 1856 | } 1857 | 1858 | memcpy(&cfg.encap.dst.s_addr, e->h_addr, e->h_length); 1859 | } 1860 | 1861 | rc = 0; 1862 | 1863 | done: 1864 | return rc; 1865 | } 1866 | 1867 | int main(int argc, char *argv[]) { 1868 | int rc = -1, n, opt, ring_mode, init_mode, pos, sc; 1869 | struct epoll_event ev; 1870 | cfg.prog = argv[0]; 1871 | struct shr *r; 1872 | struct bb *b; 1873 | char *file; 1874 | void **p; 1875 | 1876 | cfg.watch_rings = utvector_new(utmm_ptr); 1877 | cfg.watch_names = utvector_new(utstring_mm); 1878 | cfg.watch_win = utvector_new(&ww_mm); 1879 | utstring_new(cfg.tmp); 1880 | utmm_init(&bb_mm, &cfg.bb, 1); 1881 | utmm_init(&bb_mm, &cfg.rb, 1); 1882 | utmm_init(&bb_mm, &cfg.gb, 1); 1883 | 1884 | while ( (opt=getopt(argc,argv,"t:r:c:vi:hV:s:D:E:B:S:Z:Qd:K:Rqkf:")) != -1) { 1885 | switch(opt) { 1886 | case 't': cfg.mode = mode_transmit; if (*optarg != 'x') usage(); break; 1887 | case 'r': cfg.mode = mode_receive; if (*optarg != 'x') usage(); break; 1888 | case 'c': cfg.mode = mode_create; if (*optarg != 'r') usage(); break; 1889 | case 'E': cfg.encap.enable=1; if (parse_encap(optarg)) usage(); break; 1890 | case 'v': cfg.verbose++; break; 1891 | case 'k': cfg.keep=1; break; 1892 | case 'V': cfg.vlan=atoi(optarg); break; 1893 | case 'D': cfg.tail=atoi(optarg); break; 1894 | case 's': cfg.size = kmgt(optarg); break; 1895 | case 'B': 
cfg.ring_block_nr=atoi(optarg); break; 1896 | case 'S': cfg.ring_block_sz = 1 << (unsigned)atoi(optarg); break; 1897 | case 'Z': cfg.ring_frame_sz=atoi(optarg); break; 1898 | case 'q': cfg.bypass_qdisc_on_tx = 1; break; 1899 | case 'Q': cfg.strip_vlan = 1; break; 1900 | case 'd': cfg.drop_pct=100-atoi(optarg); break; 1901 | case 'K': cfg.encap.key = strchr(optarg, '.') ? 1902 | inet_addr(optarg) : htonl(atoi(optarg)); 1903 | break; 1904 | case 'R': cfg.use_tx_ring = 1; break; 1905 | case 'i': if (!strcmp(optarg, "o")) cfg.mode = mode_watch; /* -io */ 1906 | else { /* -i */ 1907 | if (strlen(optarg)+1 > MAX_NIC) goto done; 1908 | strncpy(cfg.dev, optarg, MAX_NIC); 1909 | } 1910 | break; 1911 | case 'f': sc = sscanf(optarg, "vlan %d", &cfg.pass_vlan); 1912 | if (sc != 1) usage(); 1913 | break; 1914 | case 'h': default: usage(); break; 1915 | } 1916 | } 1917 | 1918 | if ((cfg.drop_pct < 0) || (cfg.drop_pct > 100)) usage(); 1919 | 1920 | /* block all signals. we take signals synchronously via signalfd */ 1921 | sigset_t all; 1922 | sigfillset(&all); 1923 | sigprocmask(SIG_SETMASK,&all,NULL); 1924 | 1925 | /* a few signals we'll accept via our signalfd */ 1926 | sigset_t sw; 1927 | sigemptyset(&sw); 1928 | for(n=0; n < sizeof(sigs)/sizeof(*sigs); n++) sigaddset(&sw, sigs[n]); 1929 | 1930 | /* create the signalfd for receiving signals */ 1931 | cfg.signal_fd = signalfd(-1, &sw, 0); 1932 | if (cfg.signal_fd == -1) { 1933 | fprintf(stderr,"signalfd: %s\n", strerror(errno)); 1934 | goto done; 1935 | } 1936 | 1937 | /* create the timerfd for receiving clock events */ 1938 | cfg.timer_fd = timerfd_create(CLOCK_MONOTONIC, 0); 1939 | if (cfg.timer_fd == -1) { 1940 | fprintf(stderr,"timerfd_create: %s\n", strerror(errno)); 1941 | goto done; 1942 | } 1943 | 1944 | /* set up for periodic timer expiration */ 1945 | sc = timerfd_settime(cfg.timer_fd, 0, &cfg.timer, NULL); 1946 | if (sc < 0) { 1947 | fprintf(stderr, "timerfd_settime: %s\n", strerror(errno)); 1948 | goto done; 1949 | } 1950 | 1951 | /* set up the epoll instance */ 1952 | cfg.epoll_fd = epoll_create(1); 1953 | if (cfg.epoll_fd == -1) { 1954 | fprintf(stderr,"epoll: %s\n", strerror(errno)); 1955 | goto done; 1956 | } 1957 | 1958 | /* add descriptors of interest */ 1959 | if (new_epoll(EPOLLIN, cfg.signal_fd)) goto done; 1960 | if (new_epoll(EPOLLIN, cfg.timer_fd)) goto done; 1961 | if (cfg.mode == mode_watch && isatty(STDIN_FILENO)) { 1962 | if (new_epoll(EPOLLIN, STDIN_FILENO)) goto done; 1963 | } 1964 | 1965 | /* in transmit mode, epoll on the ring descriptor. 1966 | * in receive mode, epoll on the raw socket. 1967 | */ 1968 | switch (cfg.mode) { 1969 | case mode_receive: 1970 | if (cfg.dev == NULL) usage(); 1971 | ring_mode = SHR_WRONLY; 1972 | cfg.file = (optind < argc) ? argv[optind++] : NULL; 1973 | cfg.ring = shr_open(cfg.file, ring_mode); 1974 | if (cfg.ring == NULL) goto done; 1975 | sc = cfg.encap.enable ? setup_rx_encap() : setup_rx(); 1976 | if (sc < 0) goto done; 1977 | sc = cfg.encap.enable ? new_epoll(EPOLLIN, cfg.rx_fd) : 1978 | new_epoll(EPOLLIN, cfg.fd); 1979 | if (sc < 0) goto done; 1980 | break; 1981 | case mode_transmit: 1982 | if ((cfg.dev == NULL) && (cfg.encap.enable == 0)) usage(); 1983 | ring_mode = SHR_RDONLY|SHR_NONBLOCK; 1984 | cfg.file = (optind < argc) ? 
argv[optind++] : NULL;
1985 | cfg.ring = shr_open(cfg.file, ring_mode);
1986 | if (cfg.ring == NULL) goto done;
1987 | cfg.fd = shr_get_selectable_fd(cfg.ring);
1988 | if (cfg.fd < 0) goto done;
1989 | if (new_epoll(EPOLLIN, cfg.fd)) goto done;
1990 | if (setup_tx() < 0) goto done;
1991 | break;
1992 | case mode_create:
1993 | if (cfg.size == 0) usage();
1994 | while (optind < argc) {
1995 | file = argv[optind++];
1996 | init_mode = SHR_DROP|SHR_FARM|SHR_MLOCK|SHR_APPDATA;
1997 | if (cfg.keep) init_mode |= SHR_KEEPEXIST;
1998 | if (cfg.verbose) fprintf(stderr,"creating %s\n", file);
1999 | sc = shr_init(file, cfg.size, init_mode, &cfg.stats, sizeof(cfg.stats));
2000 | if (sc < 0) goto done;
2001 | }
2002 | rc = 0;
2003 | goto done;
2004 | break;
2005 | case mode_watch:
2006 | while (optind < argc) {
2007 | file = argv[optind++];
2008 | utstring_clear(cfg.tmp);
2009 | utstring_printf(cfg.tmp, "%s", file);
2010 | utvector_push(cfg.watch_names, cfg.tmp);
2011 | r = shr_open(file, SHR_RDONLY);
2012 | if (r == NULL) goto done;
2013 | utvector_push(cfg.watch_rings, &r);
2014 | utvector_extend(cfg.watch_win);
2015 | }
2016 | /* clear screen, move to 0,0 */
2017 | printf("\033[2J\n");
2018 | break;
2019 | default:
2020 | usage();
2021 | }
2022 |
2023 | alarm(1);
2024 |
2025 | while (1) {
2026 | sc = epoll_wait(cfg.epoll_fd, &ev, 1, -1);
2027 | if (sc < 0) {
2028 | fprintf(stderr, "epoll: %s\n", strerror(errno));
2029 | goto done;
2030 | }
2031 |
2032 | if (sc == 0) { assert(0); goto done; }
2033 | else if (ev.data.fd == cfg.signal_fd) { if (handle_signal() < 0) goto done;}
2034 | else if (ev.data.fd == cfg.timer_fd) { if (handle_timer() < 0) goto done;}
2035 | else if (ev.data.fd == cfg.rx_fd) { if (handle_grerx() < 0) goto done;}
2036 | else if (ev.data.fd == cfg.fd) { if (handle_io() < 0) goto done; }
2037 | else if (ev.data.fd == STDIN_FILENO) { goto done; }
2038 | else {
2039 | fprintf(stderr, "error: unknown descriptor\n");
2040 | goto done;
2041 | }
2042 | }
2043 |
2044 | rc = 0;
2045 |
2046 | done:
2047 | /* in these modes, fd is internal to shr and closed by it */
2048 | if (cfg.mode != mode_transmit) {
2049 | if (cfg.fd != -1) close(cfg.fd);
2050 | }
2051 | if (cfg.tx_fd != -1) close(cfg.tx_fd);
2052 | if (cfg.rx_fd != -1) close(cfg.rx_fd);
2053 | if (cfg.signal_fd != -1) close(cfg.signal_fd);
2054 | if (cfg.timer_fd != -1) close(cfg.timer_fd);
2055 | if (cfg.epoll_fd != -1) close(cfg.epoll_fd);
2056 | utmm_fini(&bb_mm, &cfg.bb, 1);
2057 | utmm_fini(&bb_mm, &cfg.rb, 1);
2058 | utmm_fini(&bb_mm, &cfg.gb, 1);
2059 | if ((cfg.pb.n != 0) && (cfg.pb.d != MAP_FAILED)) {
2060 | munmap(cfg.pb.d, cfg.pb.n); /* cfg.pb is mode specific */
2061 | assert(cfg.pb.iov == NULL); /* iov part of pb unused */
2062 | }
2063 | if (cfg.ring) shr_close(cfg.ring);
2064 | p = NULL;
2065 | while ( (p = utvector_next(cfg.watch_rings, p))) shr_close(*p);
2066 | utvector_free(cfg.watch_rings);
2067 | utvector_free(cfg.watch_names);
2068 | utstring_free(cfg.tmp);
2069 | utvector_free(cfg.watch_win);
2070 | return rc;
2071 | }
2072 |
--------------------------------------------------------------------------------
/fluxcap.h:
--------------------------------------------------------------------------------
1 | #ifndef _FLUXCAP_H_
2 | #define _FLUXCAP_H_
3 |
4 | #define _GNU_SOURCE
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "shr.h" 29 | #include "libut.h" 30 | 31 | #define FLUXCAP_VERSION "3.2" 32 | #define MAX_NIC 64 /* longest NIC name we accept */ 33 | #define MAX_PKT 10000 /* max length of packet */ 34 | #define BATCH_PKTS 10000 /* max pkts to read in one shr_readv */ 35 | #define BATCH_SIZE (BATCH_PKTS*MAX_PKT) /* bytes buffered before shr_writev */ 36 | #define TIMER_HZ 10 /* rainy day flush/stats timer freq */ 37 | 38 | #define VLAN_LEN 4 39 | #define MACS_LEN (2*6) 40 | 41 | struct bb { 42 | size_t n; /* batch buffer size */ 43 | size_t u; /* batch buffer used */ 44 | char *d; /* batch buffer */ 45 | UT_vector /* of struct iovec */ *iov; 46 | }; 47 | 48 | struct encap { /* this is used in GRE encapsulation mode */ 49 | int enable; 50 | enum {mode_gre=0, mode_gretap, mode_vxlan} mode; 51 | struct in_addr dst; /* used as GRE TX dest IP, or GRE RX local IP */ 52 | uint32_t key; /* if non-zero, indicates RX/TX GRE key, or VXLAN VNI */ 53 | }; 54 | 55 | struct fluxcap_stats { 56 | size_t rx_drops; /* mode_receive drops in rx/pre-ring reported from kernel */ 57 | size_t rd_drops; /* mode_transmit/tee drops due to reader lag on shr ring */ 58 | }; 59 | 60 | /* watch window - for tracking rates over NWIN observations */ 61 | #define NWIN 100 62 | #define RATE_MAX 20 63 | #define NAME_MAX 80 64 | struct ww { 65 | char name[NAME_MAX]; 66 | 67 | struct { 68 | struct fluxcap_stats fs; 69 | struct shr_stat ss; 70 | } win[NWIN]; 71 | 72 | /* resulting delta from newest to oldest window */ 73 | unsigned long mw; /* packets in */ 74 | unsigned long bw; /* bytes in */ 75 | unsigned long rx; /* packet drops (tpacket rx) */ 76 | unsigned long rd; /* packet drops (reader lag) */ 77 | 78 | /* per second rates */ 79 | struct { 80 | unsigned long B; /* bytes in */ 81 | unsigned long b; /* bits in */ 82 | unsigned lg10_b; /* integer floor(base-10-log) of b */ 83 | unsigned lg10_bf;/* fraction part of ^ scaled to [0-8) */ 84 | unsigned long rx; /* packet drops (tpacket rx) */ 85 | unsigned long rd; /* packet drops (reader lag) */ 86 | 87 | /* per second rates as strings */ 88 | struct { 89 | char b[ RATE_MAX ]; /* bits per second */ 90 | char rx[RATE_MAX ]; /* drop-rx per second */ 91 | char rd[RATE_MAX ]; /* drop-rd per second */ 92 | char E[ RATE_MAX ]; /* bits per second (human units e.g. Mbit/s) */ 93 | char X[ RATE_MAX ]; /* drop-rx per second (human units e.g. Mbit/s) */ 94 | char D[ RATE_MAX ]; /* drop-rd per second (human units e.g. 
Mbit/s) */ 95 | } str; 96 | } ps; 97 | }; 98 | 99 | 100 | #endif 101 | -------------------------------------------------------------------------------- /lib/.gitignore: -------------------------------------------------------------------------------- 1 | libtpl.a 2 | libut.a 3 | -------------------------------------------------------------------------------- /lib/Makefile.am: -------------------------------------------------------------------------------- 1 | srcdir = @srcdir@ 2 | 3 | SUBDIRS= 4 | 5 | # build these external libraries as convenience libs 6 | 7 | noinst_LIBRARIES = libut.a 8 | 9 | libut_a_CFLAGS = -Wall -Wextra 10 | libut_a_CPPFLAGS = -I$(srcdir)/libut/include 11 | libut_a_SOURCES = libut/src/libut.c \ 12 | libut/src/utvector.c \ 13 | libut/src/utmm.c \ 14 | libut/src/ringbuf.c 15 | 16 | -------------------------------------------------------------------------------- /respan.c: -------------------------------------------------------------------------------- 1 | /* 2 | * respan: a tool to receive or retransmit a network tap 3 | * 4 | * © 2019 The Johns Hopkins University Applied Physics Laboratory LLC. 5 | * All Rights Reserved. 6 | * 7 | * AUTHOR: Troy D. Hanson 8 | * LICENSE: MIT 9 | * PACKAGE: fluxcap 10 | * 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "respan.h" 34 | 35 | struct { 36 | char *prog; 37 | int verbose; 38 | char *dir; 39 | time_t now; 40 | int rotate_sec; 41 | int maxsz_mb; 42 | int epoll_fd; 43 | int signal_fd; 44 | int rx_fd; 45 | io_mode from; 46 | io_mode to; 47 | char *file_pat; 48 | char pkt[MAX_PKT]; 49 | /* savefile mapping */ 50 | char *sv_addr; 51 | size_t sv_len; 52 | int sv_fd; 53 | time_t sv_ts; /* time reflected in name of savefile */ 54 | int sv_seq; /* sequence number of save file within ts second */ 55 | off_t sv_cur; /* next write offset within save file */ 56 | } cfg = { 57 | .rx_fd = -1, 58 | .epoll_fd = -1, 59 | .signal_fd = -1, 60 | .rotate_sec = 5, 61 | .maxsz_mb = 10, 62 | .dir = ".", 63 | .file_pat = FILE_PATTERN, 64 | }; 65 | 66 | /* signals that we'll accept via signalfd in epoll */ 67 | int sigs[] = {SIGHUP,SIGTERM,SIGINT,SIGQUIT,SIGALRM}; 68 | 69 | #define x(a) #a, 70 | char *mode_strings[] = { MODES NULL }; 71 | #undef x 72 | 73 | struct option options[] = { 74 | { 75 | .name = "from", 76 | .has_arg = 1, 77 | .val = 'F', 78 | }, 79 | { 80 | .name = "to", 81 | .has_arg = 1, 82 | .val = 'T', 83 | }, 84 | { 85 | .name = "help", 86 | .has_arg = 0, 87 | .val = 'h', 88 | }, 89 | { 90 | .name = NULL, /* terminal element */ 91 | }, 92 | }; 93 | 94 | void usage() { 95 | fprintf(stderr, 96 | "usage: %s [-v] --from erspan --to pcap:\n" 97 | " pcap options\n" 98 | " -G (in sec)\n" 99 | " -C (in mb)\n" 100 | " -w (eg. 
%s)\n" 101 | "\n", 102 | cfg.prog, 103 | FILE_PATTERN); 104 | exit(-1); 105 | } 106 | 107 | int new_epoll(int events, int fd) { 108 | int rc; 109 | struct epoll_event ev; 110 | memset(&ev,0,sizeof(ev)); // placate valgrind 111 | ev.events = events; 112 | ev.data.fd= fd; 113 | rc = epoll_ctl(cfg.epoll_fd, EPOLL_CTL_ADD, fd, &ev); 114 | if (rc == -1) { 115 | fprintf(stderr,"epoll_ctl: %s\n", strerror(errno)); 116 | } 117 | return rc; 118 | } 119 | 120 | const uint8_t pcap_glb_hdr[] = { 121 | 0xd4, 0xc3, 0xb2, 0xa1, /* magic number */ 122 | 0x02, 0x00, 0x04, 0x00, /* version major, version minor */ 123 | 0x00, 0x00, 0x00, 0x00, /* this zone */ 124 | 0x00, 0x00, 0x00, 0x00, /* sigfigs */ 125 | 0xff, 0xff, 0x00, 0x00, /* snaplen */ 126 | 0x01, 0x00, 0x00, 0x00 /* network */ 127 | }; 128 | 129 | int close_savefile() { 130 | int rc=-1, sc; 131 | 132 | sc = munmap(cfg.sv_addr, cfg.sv_len); 133 | if (sc < 0) { 134 | fprintf(stderr,"munmap: %s\n", strerror(errno)); 135 | goto done; 136 | } 137 | 138 | sc = ftruncate(cfg.sv_fd, cfg.sv_cur); 139 | if (sc < 0) { 140 | fprintf(stderr,"ftruncate: %s\n", strerror(errno)); 141 | goto done; 142 | } 143 | 144 | sc = close(cfg.sv_fd); 145 | if (sc < 0) { 146 | fprintf(stderr,"close: %s\n", strerror(errno)); 147 | goto done; 148 | } 149 | 150 | rc = 0; 151 | 152 | done: 153 | return rc; 154 | } 155 | 156 | int reopen_savefile() { 157 | char base[FILE_MAX]; 158 | char path[FILE_MAX]; 159 | int rc=-1, sc; 160 | 161 | /* close out current savefile, if we have one */ 162 | sc = cfg.sv_addr ? close_savefile() : 0; 163 | if (sc < 0) goto done; 164 | 165 | cfg.sv_addr= NULL; 166 | cfg.sv_len = 0; 167 | cfg.sv_cur = 0; 168 | cfg.sv_fd =-1; 169 | if (cfg.sv_ts == cfg.now) 170 | cfg.sv_seq++; 171 | else 172 | cfg.sv_seq = 0; 173 | 174 | /* format filename with strftime */ 175 | cfg.sv_ts = cfg.now; 176 | sc = strftime(base, sizeof(base), cfg.file_pat, localtime(&cfg.now)); 177 | if (sc == 0) { 178 | fprintf(stderr,"strftime: error in file pattern\n"); 179 | goto done; 180 | } 181 | 182 | /* form full path to open */ 183 | snprintf(path, sizeof(path), "%s/%s%.2u.pcap", cfg.dir, base, cfg.sv_seq); 184 | 185 | /* map file into memory */ 186 | cfg.sv_fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0644); 187 | if (cfg.sv_fd < 0) { 188 | fprintf(stderr, "open %s: %s\n", path, strerror(errno)); 189 | goto done; 190 | } 191 | 192 | /* set its initial length; we fill it in memory to this size */ 193 | cfg.sv_len = cfg.maxsz_mb*(1024*1024); 194 | sc = ftruncate(cfg.sv_fd, cfg.sv_len); 195 | if (sc < 0) { 196 | fprintf(stderr, "ftruncate %s: %s\n", path, strerror(errno)); 197 | goto done; 198 | } 199 | 200 | int mode = PROT_READ|PROT_WRITE; 201 | cfg.sv_addr = mmap(0, cfg.sv_len, mode, MAP_SHARED, cfg.sv_fd, 0); 202 | if (cfg.sv_addr == MAP_FAILED) { 203 | fprintf(stderr, "mmap %s: %s\n", path, strerror(errno)); 204 | cfg.sv_addr = NULL; 205 | goto done; 206 | } 207 | 208 | /* set up global header. 
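 * pcap_glb_hdr above encodes the classic little-endian pcap file header:
 * magic 0xa1b2c3d4, format version 2.4, snaplen 65535, linktype 1 (Ethernet).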
*/ 209 | memcpy(&cfg.sv_addr[cfg.sv_cur], pcap_glb_hdr, sizeof(pcap_glb_hdr)); 210 | cfg.sv_cur += sizeof(pcap_glb_hdr); 211 | 212 | rc = 0; 213 | 214 | done: 215 | return rc; 216 | } 217 | 218 | 219 | int periodic_work(void) { 220 | int rc = -1, sc; 221 | 222 | /* test rotation interval */ 223 | if (cfg.sv_addr == NULL) { 224 | rc = 0; 225 | goto done; 226 | } 227 | 228 | if (cfg.sv_ts + cfg.rotate_sec > cfg.now) { 229 | rc = 0; 230 | goto done; 231 | } 232 | 233 | sc = reopen_savefile(); 234 | if (sc < 0) goto done; 235 | 236 | rc = 0; 237 | 238 | done: 239 | return rc; 240 | } 241 | 242 | int handle_signal() { 243 | struct signalfd_siginfo info; 244 | int sc, rc=-1; 245 | ssize_t nr; 246 | char *s; 247 | 248 | nr = read(cfg.signal_fd, &info, sizeof(info)); 249 | if (nr != sizeof(info)) { 250 | fprintf(stderr,"failed to read signal fd buffer\n"); 251 | goto done; 252 | } 253 | 254 | switch(info.ssi_signo) { 255 | case SIGALRM: 256 | cfg.now = time(NULL); 257 | sc = periodic_work(); 258 | if (sc < 0) goto done; 259 | alarm(1); 260 | break; 261 | default: 262 | s = strsignal(info.ssi_signo); 263 | fprintf(stderr,"got signal %d (%s)\n", info.ssi_signo, s); 264 | goto done; 265 | break; 266 | } 267 | 268 | rc = 0; 269 | 270 | done: 271 | return rc; 272 | } 273 | 274 | 275 | int parse_mode(char *in) { 276 | char *colon, **m; 277 | int n, i=0; 278 | 279 | colon = strchr(in, ':'); 280 | n = colon ? colon-in : strlen(in); 281 | 282 | m = mode_strings; 283 | while (*m) { 284 | if (!strncmp(*m, in, n)) { 285 | 286 | /* found match */ 287 | 288 | /* parse dir from pcap: */ 289 | if (colon && (i == mode_pcap)) 290 | cfg.dir = strdup(colon+1); 291 | 292 | return i; 293 | } 294 | m++; 295 | i++; 296 | } 297 | 298 | return mode_none; 299 | 300 | } 301 | 302 | int record_packet(char *pkt, size_t len) { 303 | uint32_t sec, usec, caplen, origlen; 304 | int sc, rc = -1; 305 | size_t fl; 306 | 307 | if (cfg.sv_addr == NULL) { 308 | rc = 0; 309 | goto done; 310 | } 311 | 312 | /* does enough space remain in the output area? */ 313 | fl = (sizeof(uint32_t) * 4) + len; 314 | if (cfg.sv_cur + fl >= cfg.maxsz_mb*(1024*1024)) { 315 | sc = reopen_savefile(); 316 | if (sc < 0) goto done; 317 | } 318 | 319 | /* conjure timestamp from our clock */ 320 | sec = (uint32_t)cfg.now; 321 | usec = 0; 322 | caplen = len; 323 | origlen = len; 324 | 325 | /* write packet header and packet. */ 326 | memcpy(&cfg.sv_addr[cfg.sv_cur], &sec, sizeof(uint32_t)); 327 | cfg.sv_cur += sizeof(uint32_t); 328 | memcpy(&cfg.sv_addr[cfg.sv_cur], &usec, sizeof(uint32_t)); 329 | cfg.sv_cur += sizeof(uint32_t); 330 | memcpy(&cfg.sv_addr[cfg.sv_cur], &caplen, sizeof(uint32_t)); 331 | cfg.sv_cur += sizeof(uint32_t); 332 | memcpy(&cfg.sv_addr[cfg.sv_cur], &origlen, sizeof(uint32_t)); 333 | cfg.sv_cur += sizeof(uint32_t); 334 | memcpy(&cfg.sv_addr[cfg.sv_cur], pkt, len); 335 | cfg.sv_cur += len; 336 | 337 | rc = 0; 338 | 339 | done: 340 | return rc; 341 | } 342 | 343 | /* set up as a GRE receiver */ 344 | int setup_rx_encap(void) { 345 | struct sockaddr *sa; 346 | int i, sc, rc = -1; 347 | struct iovec *iov; 348 | socklen_t sz; 349 | 350 | cfg.rx_fd = socket(AF_INET, SOCK_RAW, IPPROTO_GRE); 351 | if (cfg.rx_fd == -1) { 352 | fprintf(stderr,"socket: %s\n", strerror(errno)); 353 | goto done; 354 | } 355 | 356 | rc = 0; 357 | 358 | done: 359 | return rc; 360 | } 361 | 362 | /* decode the gre packet into its fields. 363 | * input pkt starts with outer IP header. 364 | * fields are returned in network order! 
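 * unlike the decoder in fluxcap.c, this variant also returns the 16-bit GRE
 * protocol type, which decode_erspan() below uses to tell ERSPAN versions apart.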
365 | * fields are zeroed if not present 366 | * on decoding failure, returns -1. 367 | * returns 0 on success 368 | */ 369 | #define GRE_MIN_HDR 4 370 | #define GRE_CHECKSUM_LEN 2 371 | #define GRE_RESERVED1_LEN 2 372 | #define GRE_KEY_LEN 4 373 | #define GRE_SEQNO_LEN 4 374 | int decode_gre(char *pkt, ssize_t nr, uint16_t *type, uint16_t *csum, 375 | uint32_t *key, uint32_t *seqno, char **payload, size_t *plen) { 376 | int has_key, has_checksum, has_seqno, ko, co, so, po, ip_hdr_len; 377 | uint8_t ip_proto; 378 | 379 | *key = 0; 380 | *seqno = 0; 381 | *csum = 0; 382 | *payload = NULL; 383 | *plen = 0; 384 | *type = 0; 385 | 386 | assert(nr > 0); 387 | ip_hdr_len = (pkt[0] & 0x0f) * 4; 388 | 389 | if (nr < ip_hdr_len + GRE_MIN_HDR) 390 | return -1; 391 | 392 | ip_proto = pkt[9]; 393 | if (ip_proto != IPPROTO_GRE) 394 | return -1; 395 | 396 | memcpy(type, &pkt[ip_hdr_len + 2], sizeof(uint16_t)); 397 | 398 | has_key = pkt[ip_hdr_len] & (1U << 5); 399 | has_checksum = pkt[ip_hdr_len] & (1U << 7); 400 | has_seqno = pkt[ip_hdr_len] & (1U << 4); 401 | 402 | if (has_checksum) { 403 | co = ip_hdr_len + GRE_MIN_HDR; 404 | if (co + GRE_CHECKSUM_LEN > nr) 405 | return -1; 406 | memcpy(csum, pkt + co, GRE_CHECKSUM_LEN); 407 | } 408 | 409 | if (has_key) { 410 | ko = ip_hdr_len + GRE_MIN_HDR 411 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0); 412 | if (ko + GRE_KEY_LEN > nr) 413 | return -1; 414 | memcpy(key, pkt + ko, GRE_KEY_LEN); 415 | } 416 | 417 | if (has_seqno) { 418 | so = ip_hdr_len + GRE_MIN_HDR + 419 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 420 | + (has_key ? GRE_KEY_LEN : 0); 421 | if (so + GRE_SEQNO_LEN > nr) 422 | return -1; 423 | memcpy(seqno, pkt + so, GRE_SEQNO_LEN); 424 | } 425 | 426 | po = ip_hdr_len + GRE_MIN_HDR + 427 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 428 | + (has_key ? GRE_KEY_LEN : 0) 429 | + (has_seqno ? GRE_SEQNO_LEN : 0); 430 | 431 | *plen = nr - po; 432 | *payload = pkt + po; 433 | return 0; 434 | } 435 | 436 | /* see ovs-fields(7) */ 437 | #define ERSPAN_V1_GRETYPE 0x88be 438 | #define ERSPAN_V1_HDR 8 439 | #define ERSPAN_V2_GRETYPE 0x22eb 440 | #define ERSPAN_V2_HDR 12 441 | int decode_erspan(uint16_t gre_type, uint8_t *in, size_t in_len, 442 | char **out, size_t *out_len) { 443 | int has_subhdr, rc = -1; 444 | 445 | gre_type = ntohs(gre_type); 446 | 447 | switch(gre_type) { 448 | case ERSPAN_V1_GRETYPE: /* erspan version 1 aka Type II */ 449 | if (in_len < ERSPAN_V1_HDR) goto done; 450 | *out = in + ERSPAN_V1_HDR; 451 | *out_len = in_len - ERSPAN_V1_HDR; 452 | if (cfg.verbose) fprintf(stderr, " erspan v1\n"); 453 | break; 454 | case ERSPAN_V2_GRETYPE: /* erspan version 2 aka Type III */ 455 | if (in_len < ERSPAN_V2_HDR) goto done; 456 | /* test if ERSPAN "Optional subheader" flag is set */ 457 | has_subhdr = (in[11] & 0x1) ? 1 : 0; 458 | *out = in + ERSPAN_V2_HDR + (has_subhdr ? 8 : 0); 459 | *out_len = in_len - ERSPAN_V2_HDR - (has_subhdr ? 
8 : 0); 460 | if (cfg.verbose) 461 | fprintf(stderr, " erspan v2 (sub_hdr: %d)\n", has_subhdr); 462 | break; 463 | default: 464 | fprintf(stderr, "unknown gre erspan type 0x%x\n", gre_type); 465 | goto done; 466 | } 467 | 468 | rc = 0; 469 | 470 | done: 471 | return rc; 472 | } 473 | 474 | int handle_grerx(void) { 475 | uint32_t seqno, key; 476 | uint16_t csum, type; 477 | char *data, *out; 478 | size_t dlen, sz; 479 | int rc=-1, sc; 480 | ssize_t nr; 481 | 482 | nr = read(cfg.rx_fd, cfg.pkt, sizeof(cfg.pkt)); 483 | if (nr < 0) { 484 | fprintf(stderr, "read: %s\n", strerror(errno)); 485 | goto done; 486 | } 487 | 488 | if (cfg.verbose) 489 | fprintf(stderr, "received GRE packet of %zd bytes\n", nr); 490 | 491 | sc = decode_gre(cfg.pkt, nr, &type, &csum, &key, &seqno, &data, &dlen); 492 | if (sc < 0) { 493 | rc = 0; 494 | goto done; 495 | } 496 | 497 | /* decapsulate packet, advance over GRE header */ 498 | if (dlen == 0) { 499 | rc = 0; 500 | goto done; 501 | } 502 | 503 | /* expect ERSPAN header at this point - discard */ 504 | sc = decode_erspan(type, data, dlen, &out, &sz); 505 | if (sc < 0) { 506 | rc = 0; 507 | goto done; 508 | } 509 | 510 | /* save the packet */ 511 | sc = record_packet(out, sz); 512 | if (sc < 0) goto done; 513 | 514 | rc = 0; 515 | 516 | done: 517 | return rc; 518 | } 519 | 520 | int main(int argc, char *argv[]) { 521 | struct epoll_event ev; 522 | int opt, rc=-1, sc, n; 523 | 524 | cfg.now = time(NULL); 525 | cfg.prog = argv[0]; 526 | 527 | do { 528 | opt = getopt_long_only(argc, argv, "vhF:T:G:C:w:", options, NULL); 529 | switch (opt) { 530 | case 'F': cfg.from = parse_mode(optarg); break; 531 | case 'T': cfg.to = parse_mode(optarg); break; 532 | case 'G': cfg.rotate_sec = atoi(optarg); break; 533 | case 'C': cfg.maxsz_mb = atoi(optarg); break; 534 | case 'w': cfg.file_pat = strdup(optarg); break; 535 | case 'v': cfg.verbose++; break; 536 | case 'h': usage(); 537 | case -1: break; 538 | } 539 | } while (opt > 0); 540 | 541 | if (cfg.from == mode_none) usage(); 542 | if (cfg.to == mode_none) usage(); 543 | 544 | /* right now we only support this mode */ 545 | assert(cfg.to == mode_pcap); 546 | assert(cfg.from == mode_erspan); 547 | 548 | /* block all signals. 
we take signals synchronously via signalfd */ 549 | sigset_t all; 550 | sigfillset(&all); 551 | sigprocmask(SIG_SETMASK,&all,NULL); 552 | 553 | /* a few signals we'll accept via our signalfd */ 554 | sigset_t sw; 555 | sigemptyset(&sw); 556 | for(n=0; n < sizeof(sigs)/sizeof(*sigs); n++) 557 | sigaddset(&sw, sigs[n]); 558 | 559 | /* create the signalfd for receiving signals */ 560 | cfg.signal_fd = signalfd(-1, &sw, 0); 561 | if (cfg.signal_fd == -1) { 562 | fprintf(stderr,"signalfd: %s\n", strerror(errno)); 563 | goto done; 564 | } 565 | 566 | /* set up the epoll instance */ 567 | cfg.epoll_fd = epoll_create(1); 568 | if (cfg.epoll_fd == -1) { 569 | fprintf(stderr,"epoll: %s\n", strerror(errno)); 570 | goto done; 571 | } 572 | 573 | /* set up the encapsulation receiver */ 574 | sc = setup_rx_encap(); 575 | if (sc < 0) goto done; 576 | 577 | /* add descriptors of interest */ 578 | sc = new_epoll(EPOLLIN, cfg.signal_fd); 579 | if (sc < 0) goto done; 580 | sc = new_epoll(EPOLLIN, cfg.rx_fd); 581 | if (sc < 0) goto done; 582 | 583 | /* open the initial savefile */ 584 | sc = reopen_savefile(); 585 | if (sc < 0) goto done; 586 | 587 | alarm(1); 588 | for (;;) { 589 | 590 | sc = epoll_wait(cfg.epoll_fd, &ev, 1, -1); 591 | if (sc < 0) { 592 | fprintf(stderr,"epoll: %s\n", strerror(errno)); 593 | break; 594 | } 595 | 596 | if (ev.data.fd == cfg.signal_fd) { 597 | sc = handle_signal(); 598 | if (sc < 0) goto done; 599 | } 600 | else if (ev.data.fd == cfg.rx_fd) { 601 | sc = handle_grerx(); 602 | if (sc < 0) goto done; 603 | } 604 | else { 605 | fprintf(stderr, "unknown fd\n"); 606 | assert(0); 607 | } 608 | 609 | } 610 | 611 | rc = 0; 612 | 613 | done: 614 | if (cfg.sv_addr) close_savefile(); 615 | if (cfg.rx_fd != -1) close(cfg.rx_fd); 616 | if (cfg.epoll_fd != -1) close(cfg.epoll_fd); 617 | if (cfg.signal_fd != -1) close(cfg.signal_fd); 618 | return rc; 619 | } 620 | -------------------------------------------------------------------------------- /respan.h: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2019 The Johns Hopkins University Applied Physics Laboratory LLC. 3 | * All Rights Reserved. 4 | * 5 | * AUTHOR: Troy D. 
Hanson 6 | * LICENSE: MIT 7 | * PACKAGE: fluxcap 8 | */ 9 | 10 | #ifndef RESPAN_H 11 | #define RESPAN_H 12 | 13 | #if defined __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #define MODES x(none) x(erspan) x(pcap) 18 | #define x(a) mode_ ## a, 19 | typedef enum { MODES } io_mode; 20 | #undef x 21 | 22 | #define RESPAN_VERSION "0.1" 23 | #define FILE_MAX 250 /* instead of FILENAME_MAX or PATH_MAX */ 24 | #define FILE_PATTERN "%Y%m%d%H%M%S" 25 | #define MAX_PKT 65536 26 | 27 | #if defined __cplusplus 28 | } 29 | #endif 30 | 31 | #endif /* RESPAN_H */ 32 | -------------------------------------------------------------------------------- /util/.gitignore: -------------------------------------------------------------------------------- 1 | ffcp 2 | fwalk 3 | fpcap-push 4 | fprune 5 | *.db 6 | -------------------------------------------------------------------------------- /util/Makefile.am: -------------------------------------------------------------------------------- 1 | srcdir = @srcdir@ 2 | 3 | bin_PROGRAMS = ramdisk watch_copy 4 | ramdisk_SOURCES = ramdisk.c 5 | ramdisk_CPPFLAGS = -I$(srcdir)/../lib/libut/include 6 | watch_copy_SOURCES = watch_copy.c 7 | 8 | -------------------------------------------------------------------------------- /util/ramdisk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "utarray.h" 15 | #include 16 | 17 | #define TMPFS_MAGIC 0x01021994 18 | #define RAMFS_MAGIC 0x858458F6 19 | 20 | /****************************************************************************** 21 | * ramdisk 22 | * 23 | * a utility with modes to: 24 | * - create a ramdisk, 25 | * - query a ramdisk (see its size and percent full) 26 | * - unmount a ramdisk 27 | * 28 | * The ramdisk used here is the 'tmpfs' filesystem which is not strictly a 29 | * pure RAM device; it can swap under the kernel's discretion. I have also 30 | * noticed that a large ramdisk (say, 6gb on a system with 8gb ram) might 31 | * exhibit 'no space left on device' even when only 50% full. A better 32 | * query mode would show the status (resident, paged, etc) of ramdisk pages. 
33 | *****************************************************************************/
34 |
35 | /* command line configuration parameters */
36 | int verbose;
37 | int ramfs;
38 | enum {MODE_NONE,MODE_QUERY,MODE_CREATE,MODE_UNMOUNT} mode = MODE_NONE;
39 | char *sz="50%";
40 | char *ramdisk;
41 | UT_array *dirs;
42 |
43 | void usage(char *prog) {
44 | fprintf(stderr, "This utility creates a tmpfs ramdisk on a given mountpoint.\n");
45 | fprintf(stderr, "It does nothing if a tmpfs is already mounted on that point.\n");
46 | fprintf(stderr, "\n");
47 | fprintf(stderr,"usage:\n\n");
48 | fprintf(stderr, "-c (create mode):\n");
49 | fprintf(stderr, " %s -c [-s <size>] [-d <dir>] [-r] <mountpoint>\n", prog);
50 | fprintf(stderr, " -s <size> suffixed with k|m|g|%% [default: 50%%]\n");
51 | fprintf(stderr, " -d <dir> directory to post-create inside ramdisk (repeatable)\n");
52 | fprintf(stderr, " -r use ramfs instead of tmpfs (grows unbounded, no swap)\n");
53 | fprintf(stderr, "\n");
54 | fprintf(stderr, "-q (query mode):\n");
55 | fprintf(stderr, " %s -q <mountpoint>\n", prog);
56 | fprintf(stderr, "\n");
57 | fprintf(stderr, "-u (unmount mode):\n");
58 | fprintf(stderr, " %s -u <mountpoint>\n", prog);
59 | fprintf(stderr, "\n");
60 | fprintf(stderr, "Examples of creating a ramdisk:\n");
61 | fprintf(stderr, " %s -c -s 1g /mnt/ramdisk\n", prog);
62 | fprintf(stderr, " %s -c -s 1g -d /mnt/ramdisk/in -d /mnt/ramdisk/out /mnt/ramdisk\n", prog);
63 | fprintf(stderr, "\n");
64 | fprintf(stderr, "Note: 'cat /proc/mounts' to see mounted tmpfs ramdisks.\n");
65 | exit(-1);
66 | }
67 |
68 | /* Prevent a ramdisk from being mounted at the mount-point of an
69 | * existing ramdisk. This prevents people from accidentally stacking tmpfs.
70 | * However it is OK to mount a ramdisk on a subdirectory of another ramdisk. */
71 | int suitable_mountpoint(char *dir, struct stat *sb, struct statfs *sf) {
72 | size_t dlen = strlen(dir);
73 | char pdir[PATH_MAX];
74 | struct stat psb;
75 |
76 | if (dlen+4 > PATH_MAX) {
77 | syslog(LOG_ERR, "path too long\n");
78 | return -1;
79 | }
80 |
81 | if (stat(ramdisk, sb) == -1) { /* does mount point exist? */
82 | syslog(LOG_ERR, "no mount point %s: %s\n", ramdisk, strerror(errno));
83 | return -1;
84 | }
85 | if (S_ISDIR(sb->st_mode) == 0) { /* has to be a directory */
86 | syslog(LOG_ERR, "mount point %s: not a directory\n", ramdisk);
87 | return -1;
88 | }
89 | if (statfs(ramdisk, sf) == -1) { /* what kinda filesystem is it on? */
90 | syslog(LOG_ERR, "can't statfs %s: %s\n", ramdisk, strerror(errno));
91 | return -1;
92 | }
93 |
94 | /* is it already a tmpfs mountpoint? */
95 | memcpy(pdir,dir,dlen+1); strcat(pdir,"/..");
96 | if (stat(pdir, &psb) == -1) {
97 | syslog(LOG_ERR, "can't stat %s: %s\n", pdir, strerror(errno));
98 | return -1;
99 | }
100 | int is_mountpoint = (psb.st_dev == sb->st_dev) ?
0 : 1; 101 | int is_tmpfs = (sf->f_type == TMPFS_MAGIC); 102 | int is_ramfs = (sf->f_type == RAMFS_MAGIC); 103 | if (is_mountpoint && (is_tmpfs || is_ramfs)) { 104 | //syslog(LOG_INFO, "already a tmpfs mountpoint: %s\n", dir, strerror(errno)); 105 | return -2; 106 | } 107 | 108 | return 0; 109 | } 110 | 111 | #define KB 1024L 112 | #define MB (1024*1024L) 113 | #define GB (1024*1024*1024L) 114 | int query_ramdisk(void) { 115 | struct stat sb; struct statfs sf; 116 | if (suitable_mountpoint(ramdisk, &sb, &sf) != -2) { 117 | printf("%s: not a ramdisk\n", ramdisk); 118 | return -1; 119 | } 120 | if (sf.f_type == RAMFS_MAGIC) { 121 | printf("%s: ramfs ramdisk (unbounded size)\n", ramdisk); 122 | return 0; 123 | } 124 | char szb[100]; 125 | long bytes = sf.f_bsize*sf.f_blocks; 126 | if (bytes < KB) snprintf(szb, sizeof(szb), "%ld bytes", bytes); 127 | else if (bytes < MB) snprintf(szb, sizeof(szb), "%ld kb", bytes/KB); 128 | else if (bytes < GB) snprintf(szb, sizeof(szb), "%ld mb", bytes/MB); 129 | else snprintf(szb, sizeof(szb), "%ld gb", bytes/GB); 130 | int used_pct = 100 - (sf.f_bfree * 100.0 / sf.f_blocks); 131 | printf("%s: ramdisk of size %s (%d%% used)\n", ramdisk, szb, used_pct); 132 | return 0; 133 | } 134 | 135 | int unmount_ramdisk(void) { 136 | struct stat sb; struct statfs sf; 137 | if (suitable_mountpoint(ramdisk, &sb, &sf) != -2) { 138 | syslog(LOG_ERR,"%s: not a ramdisk\n", ramdisk); 139 | return -1; 140 | } 141 | if (umount(ramdisk) == -1) { 142 | syslog(LOG_ERR,"%s: cannot unmount\n", ramdisk); 143 | return -1; 144 | } 145 | return 0; 146 | } 147 | 148 | int create_ramdisk(void) { 149 | int rc; 150 | char opts[100], *kind; 151 | 152 | struct stat sb; struct statfs sf; 153 | rc = suitable_mountpoint(ramdisk, &sb, &sf); 154 | if (rc) return rc; 155 | 156 | kind = "tmpfs"; 157 | if (ramfs) kind = "ramfs"; 158 | 159 | /* ok, mount a ramdisk on this point */ 160 | snprintf(opts,sizeof(opts),"size=%s",sz); 161 | rc=mount("none", ramdisk, kind, MS_NOATIME|MS_NODEV, opts); 162 | if (rc) syslog(LOG_ERR, "can't make ramdisk %s: %s\n", ramdisk, strerror(errno)); 163 | return rc; 164 | } 165 | 166 | void make_dirs(UT_array *dirs) { 167 | char **d, *dir; 168 | d=NULL; 169 | while ( (d=(char**)utarray_next(dirs,d))) { 170 | dir = *d; 171 | /* fprintf(stderr,"dir is %s\n",dir); */ 172 | if (mkdir(dir, 0777) == -1) { 173 | fprintf(stderr,"failed to make %s: %s\n",dir,strerror(errno)); 174 | } 175 | } 176 | } 177 | 178 | int main(int argc, char * argv[]) { 179 | int opt, rc; 180 | utarray_new(dirs,&ut_str_icd); 181 | 182 | while ( (opt = getopt(argc, argv, "v+cqus:hd:r")) != -1) { 183 | switch (opt) { 184 | case 'v': verbose++; break; 185 | case 'r': ramfs=1; break; 186 | case 'q': if (mode) usage(argv[0]); mode=MODE_QUERY; break; 187 | case 'c': if (mode) usage(argv[0]); mode=MODE_CREATE; break; 188 | case 'u': if (mode) usage(argv[0]); mode=MODE_UNMOUNT; break; 189 | case 's': sz=strdup(optarg); break; 190 | case 'd': utarray_push_back(dirs,&optarg); break; 191 | case 'h': default: usage(argv[0]); break; 192 | } 193 | } 194 | if (optind < argc) ramdisk=argv[optind++]; 195 | if (!ramdisk) usage(argv[0]); 196 | openlog("ramdisk", LOG_PID|LOG_PERROR, LOG_LOCAL0); 197 | 198 | switch(mode) { 199 | case MODE_CREATE: rc=create_ramdisk(); make_dirs(dirs); break; 200 | case MODE_UNMOUNT: rc=unmount_ramdisk(); break; 201 | case MODE_QUERY: rc=query_ramdisk(); break; 202 | default: usage(argv[0]); break; 203 | } 204 | utarray_free(dirs); 205 | return rc; 206 | } 207 | 
-------------------------------------------------------------------------------- /util/tests/do_tests: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my @tests; 7 | for (glob "test*[0-9]") { 8 | push @tests, $_ if -e "$_.ans"; 9 | } 10 | 11 | my $num_failed=0; 12 | 13 | for my $test (@tests) { 14 | print "$test\n"; 15 | `./$test > $test.out 2>/dev/null`; 16 | `diff $test.out $test.ans`; 17 | print "$test failed\n" if $?; 18 | $num_failed++ if $?; 19 | } 20 | 21 | print scalar @tests . " tests conducted, $num_failed failed.\n"; 22 | exit $num_failed; 23 | -------------------------------------------------------------------------------- /util/tests/test1: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p testdir 4 | tar xf testdir.tar -C testdir 5 | shr-tool -c -s 1m ring 6 | 7 | ../fwalk -d testdir -r ring 8 | shr-tool -r ring 9 | rm -rf testdir ring 10 | -------------------------------------------------------------------------------- /util/tests/test1.ans: -------------------------------------------------------------------------------- 1 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180101/ABC/m 2 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180101/DEF/m 3 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180102/ABC/m 4 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180102/DEF/m 5 | -------------------------------------------------------------------------------- /util/tests/test2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # this test requires sudo to mount a ramdisk 3 | 4 | # create a ring 5 | shr-tool -c -s 1m ring 6 | 7 | # mount a ramdisk (need a real filesystem for fprune) 8 | mkdir -p ramdisk 9 | sudo ../ramdisk -c -s 1m ramdisk 10 | 11 | # we will make a directory to test pruning of empties 12 | mkdir ramdisk/empty 13 | 14 | # put content into ramdisk 15 | tar xf testdir.tar -C ramdisk 16 | dd if=/dev/zero of=ramdisk/0.dat bs=100k count=1 17 | dd if=/dev/zero of=ramdisk/1.dat bs=100k count=1 18 | 19 | echo pre-prune 20 | 21 | # dump directory contents to ring 22 | ../fwalk -d ramdisk -r ring 23 | shr-tool -r ring 24 | 25 | # confirm empty directory is there 26 | find ramdisk -name empty 27 | 28 | # prune ramdisk to 10% of 1m (100k) 29 | ../fprune -d ramdisk -p 10 -r ring -N 10 -u -W -P & 30 | PID1=$! 
31 | sleep 5 32 | kill $PID1 33 | wait $PID1 34 | 35 | echo post-prune 36 | 37 | # dump directory contents to ring 38 | ../fwalk -d ramdisk -r ring 39 | shr-tool -r ring 40 | 41 | # confirm empty directory is gone 42 | find ramdisk -name empty 43 | 44 | # clean up 45 | sudo ../ramdisk -u ramdisk 46 | rm -rf ramdisk ring 47 | -------------------------------------------------------------------------------- /util/tests/test2.ans: -------------------------------------------------------------------------------- 1 | pre-prune 2 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/0.dat 3 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/1.dat 4 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/ABC/m 5 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/DEF/m 6 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/ABC/m 7 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/DEF/m 8 | ramdisk/empty 9 | files.map: 10 slots (0 GB) 10 | post-prune 11 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/1.dat 12 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/ABC/m 13 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/DEF/m 14 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/ABC/m 15 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/DEF/m 16 | -------------------------------------------------------------------------------- /util/tests/test3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a ring 4 | shr-tool -c -s 1m ring oring 5 | 6 | # make test directories 7 | mkdir -p testdir outdir 8 | 9 | # put content into testdir 10 | tar xf testdir.tar -C testdir 11 | 12 | # dump directory contents to ring 13 | ../fwalk -d testdir -r ring 14 | 15 | # copy from ring files using naming template 16 | ../ffcp -i ring -o oring -r '(\d{8})/(\w{3})/(.*)$' -t 'outdir/$1/$2.$3' -m -z & 17 | PID1=$! 18 | sleep 5 19 | kill $PID1 20 | wait $PID1 21 | 22 | echo "listing output directory" 23 | find outdir 24 | 25 | echo "listing output ring" 26 | shr-tool -r oring 27 | 28 | # clean up 29 | rm -rf testdir outdir ring oring 30 | -------------------------------------------------------------------------------- /util/tests/test3.ans: -------------------------------------------------------------------------------- 1 | listing output directory 2 | outdir 3 | outdir/20180101 4 | outdir/20180101/ABC.m.gz 5 | outdir/20180101/DEF.m.gz 6 | outdir/20180102 7 | outdir/20180102/ABC.m.gz 8 | outdir/20180102/DEF.m.gz 9 | listing output ring 10 | outdir/20180101/ABC.m.gz 11 | outdir/20180101/DEF.m.gz 12 | outdir/20180102/ABC.m.gz 13 | outdir/20180102/DEF.m.gz 14 | -------------------------------------------------------------------------------- /util/tests/test4: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a ring 4 | shr-tool -c -s 1m ring oring 5 | 6 | # make test directories 7 | mkdir -p testdir outdir 8 | 9 | # put content into testdir 10 | echo "hello, world!" > testdir/hello 11 | 12 | # dump directory contents to ring 13 | ../fwalk -d testdir -r ring 14 | 15 | # copy from ring files to outdir/basename.gz 16 | ../ffcp -i ring -o oring -t 'outdir/$0' -z & 17 | PID1=$! 
18 | sleep 5 19 | kill $PID1 20 | wait $PID1 21 | 22 | echo "listing output directory" 23 | find outdir 24 | 25 | echo "listing output ring" 26 | shr-tool -r oring 27 | 28 | # testing gunzip compatibility 29 | gunzip -c outdir/hello.gz 30 | 31 | # clean up 32 | rm -rf testdir outdir ring oring 33 | -------------------------------------------------------------------------------- /util/tests/test4.ans: -------------------------------------------------------------------------------- 1 | listing output directory 2 | outdir 3 | outdir/hello.gz 4 | listing output ring 5 | outdir/hello.gz 6 | hello, world! 7 | -------------------------------------------------------------------------------- /util/tests/test5: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a tiny ring 4 | # so we block in write 5 | shr-tool -c -s 1k ring 6 | 7 | # make test directories 8 | mkdir -p testdir 9 | 10 | # put content into testdir 11 | tar xf testdir.tar -C testdir 12 | 13 | # dump directory contents to ring 14 | echo "filling ring" 15 | ../fwalk -d testdir -r ring # ok 16 | ../fwalk -d testdir -r ring # ok 17 | ../fwalk -d testdir -r ring # ok 18 | 19 | # we know from manual testing that 20 | # the next write would block for 21 | # space availability in the ring 22 | echo "final write, should block" 23 | ../fwalk -d testdir -r ring & 24 | PID1=$! 25 | START_TIME=`date +%s` 26 | sleep 1 27 | 28 | # confirm that it is blocked. this 29 | # does not actually send a signal 30 | kill -0 $PID1 31 | if [ $? -eq 0 ] 32 | then 33 | echo "ok, blocked in ring write" 34 | else 35 | echo "failed to block in write!" 36 | exit -1 37 | fi 38 | 39 | # schedule failsafe unmaskable signal 40 | echo "scheduling sigkill in 10s" 41 | (sleep 10; kill -9 $PID1) & 42 | 43 | # send more benign signal 44 | # to wake fwalk if it lets 45 | # in signals in shr-write 46 | # in bw_wait_ul -> select 47 | echo "sending sigterm in 1s" 48 | sleep 1 49 | kill -TERM $PID1 50 | 51 | # one or the other signal should 52 | # make fwalk exit so wait for it 53 | wait $PID1 54 | echo "writer terminated (status $?)" 55 | END_TIME=`date +%s` 56 | let ELAPSED=$END_TIME-$START_TIME 57 | if [ $ELAPSED -gt 5 ] 58 | then 59 | echo "terminated by failsafe signal," 60 | echo "this means standard signal was" 61 | echo "insufficient to unblock writer" 62 | else 63 | echo "good, standard signal suffices" 64 | fi 65 | 66 | # clean up 67 | rm -rf testdir ring 68 | -------------------------------------------------------------------------------- /util/tests/test5.ans: -------------------------------------------------------------------------------- 1 | filling ring 2 | final write, should block 3 | ok, blocked in ring write 4 | scheduling sigkill in 10s 5 | sending sigterm in 1s 6 | writer terminated (status 0) 7 | good, standard signal suffices 8 | -------------------------------------------------------------------------------- /util/tests/testdir.tar: -------------------------------------------------------------------------------- 1 | 20180101/0000775000175000017500000000000013237267555011701 5ustar thansonthanson20180101/DEF/0000775000175000017500000000000013237267604012272 5ustar thansonthanson20180101/DEF/m0000664000175000017500000000000013237267604012437 0ustar thansonthanson20180101/ABC/0000775000175000017500000000000013237267604012261 5ustar thansonthanson20180101/ABC/m0000664000175000017500000000000013237267604012426 0ustar thansonthanson20180102/0000775000175000017500000000000013237267564011702 5ustar 
thansonthanson20180102/DEF/0000775000175000017500000000000013237267604012273 5ustar thansonthanson20180102/DEF/m0000664000175000017500000000000013237267604012440 0ustar thansonthanson20180102/ABC/0000775000175000017500000000000013237267604012262 5ustar thansonthanson20180102/ABC/m0000664000175000017500000000000013237267604012427 0ustar thansonthanson -------------------------------------------------------------------------------- /util/watch_copy.c: -------------------------------------------------------------------------------- 1 | #include <sys/types.h> 2 | #include <sys/stat.h> 3 | #include <sys/mman.h> 4 | #include <sys/inotify.h> 5 | #include <fcntl.h> 6 | #include <unistd.h> 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <errno.h> 11 | #include <libgen.h> 12 | #include <limits.h> 13 | 14 | struct { 15 | int verbose; 16 | int pattern_mode; 17 | int mkdir_mode; 18 | char *prog; 19 | } CF; 20 | 21 | /* usage: watch_copy [options] <watch-dir> <dest-dir|dest-pattern> 22 | * 23 | * whenever a file in watch-dir is closed (if it was open for writing), 24 | * it is copied to the dest-pattern. It does not recurse into watch-dir. 25 | * 26 | * This implementation mmaps the source and dest files. 27 | * 28 | */ 29 | 30 | void usage() { 31 | fprintf(stderr,"usage: %s [-v] [-p] [-m] <watch-dir> <dest>\n", CF.prog); 32 | fprintf(stderr,"\n"); 33 | fprintf(stderr," -v (verbose)\n"); 34 | fprintf(stderr," -p (pattern mode)\n"); 35 | fprintf(stderr," -m (mkdir mode; makes destination directory if needed,\n"); 36 | fprintf(stderr," supports only one level of directory creation)\n"); 37 | fprintf(stderr,"\n"); 38 | fprintf(stderr," <dest> can be a directory, or a pattern (if -p)\n"); 39 | fprintf(stderr,"\n"); 40 | fprintf(stderr,"pattern syntax: $1 = first character of file basename\n"); 41 | fprintf(stderr," $2 = second character of file basename\n"); 42 | fprintf(stderr," $3 = third character (likewise $4, ...)\n"); 43 | fprintf(stderr," $A = tenth character (likewise $B, ...)\n"); 44 | fprintf(stderr," $Z = 36th character\n"); 45 | fprintf(stderr," $0 = entire original file basename\n"); 46 | fprintf(stderr," $$ = literal $\n"); 47 | fprintf(stderr," everything else is literal\n"); 48 | fprintf(stderr,"\n"); 49 | fprintf(stderr,"note: quote pattern expressions to protect from shell!\n"); 50 | fprintf(stderr,"\n"); 51 | fprintf(stderr,"examples:\n"); 52 | fprintf(stderr," %s /tmp /data\n", CF.prog); 53 | fprintf(stderr," (/tmp/abc123.pcap -> /data/abc123.pcap)\n"); 54 | fprintf(stderr,"\n"); 55 | fprintf(stderr," %s -p /tmp '/data/$1$2$3/$0'\n", CF.prog); 56 | fprintf(stderr," (/tmp/abc123.pcap -> /data/abc/abc123.pcap)\n"); 57 | fprintf(stderr,"\n"); 58 | fprintf(stderr," %s -mp /tmp '/data/$A$B/$0'\n", CF.prog); 59 | fprintf(stderr," (/tmp/fw-20170921.pcap -> /data/21/fw-20170921.pcap)\n"); 60 | fprintf(stderr,"\n"); 61 | 62 | exit(-1); 63 | } 64 | 65 | #define append(c) do { \ 66 | if (olen == 0) goto done; \ 67 | *(o++) = (c); \ 68 | olen--; \ 69 | } while(0) 70 | 71 | /* make a pathname from pattern applied to src.
literals are copied, $0 is src, 72 | * and $1 through $9 and $A through $Z refer to positions 1 through 36 in src */ 73 | int pat2path(char *out, size_t olen, char *src, char *pat) { 74 | char *p = pat; 75 | char *o = out; 76 | size_t l = strlen(src); 77 | int rc = -1; 78 | unsigned char x; 79 | 80 | while (*p != '\0') { 81 | if (*p == '$') { /* translate next pattern character */ 82 | p++; 83 | if (*p == '$') append(*p); /* special case: $$ */ 84 | else { 85 | 86 | /* here if we had $x where x must be [0-9A-Z] */ 87 | if ((*p >= '0') && (*p <= '9')) x = *p - '0'; 88 | else if ((*p >= 'A') && (*p <= 'Z')) x = *p - 'A' + 10; 89 | else { 90 | fprintf(stderr,"invalid position %c\n", *p); 91 | goto done; 92 | } 93 | 94 | if (x == 0) { 95 | /* $0 means whole src */ 96 | if (olen < l) goto done; 97 | memcpy(o, src, l); 98 | o += l; olen -= l; 99 | } else { 100 | /* copy from 1-based offset to 0-based */ 101 | if (l < x) {fprintf(stderr,"position %c > %s\n", *p, src); goto done;} 102 | append(src[x-1]); 103 | } 104 | } 105 | } else append(*p); /* copy literal character */ 106 | p++; 107 | } 108 | 109 | append('\0'); 110 | rc = 0; 111 | 112 | done: 113 | return rc; 114 | } 115 | 116 | int map_copy(char *file, char *dest) { 117 | struct stat s; 118 | char *src=NULL,*dst=NULL; 119 | int fd=-1,dd=-1,rc=-1; 120 | 121 | /* source file */ 122 | if ( (fd = open(file, O_RDONLY)) == -1) { 123 | fprintf(stderr,"can't open %s: %s\n", file, strerror(errno)); 124 | goto done; 125 | } 126 | if (fstat(fd, &s) == -1) { 127 | fprintf(stderr,"can't stat %s: %s\n", file, strerror(errno)); 128 | goto done; 129 | } 130 | if (!S_ISREG(s.st_mode)) { 131 | fprintf(stderr,"not a regular file: %s\n", file); 132 | goto done; 133 | } 134 | src = mmap(0, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 135 | if (src == MAP_FAILED) { 136 | fprintf(stderr, "mmap %s: %s\n", file, strerror(errno)); 137 | goto done; 138 | } 139 | 140 | /* dest file */ 141 | if ( (dd = open(dest, O_RDWR|O_TRUNC|O_CREAT, 0644)) == -1) { 142 | fprintf(stderr,"can't open %s: %s\n", dest, strerror(errno)); 143 | goto done; 144 | } 145 | if (ftruncate(dd, s.st_size) == -1) { 146 | fprintf(stderr,"ftruncate: %s\n", strerror(errno)); 147 | goto done; 148 | } 149 | dst = mmap(0, s.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, dd, 0); 150 | if (dst == MAP_FAILED) { 151 | fprintf(stderr, "mmap %s: %s\n", dest, strerror(errno)); 152 | goto done; 153 | } 154 | memcpy(dst,src,s.st_size); 155 | 156 | rc = 0; 157 | 158 | done: 159 | if (src && (src != MAP_FAILED)) { 160 | if (munmap(src, s.st_size)) fprintf(stderr,"munmap: %s\n",strerror(errno)); 161 | } 162 | if (dst && (dst != MAP_FAILED)) { 163 | if (munmap(dst, s.st_size)) fprintf(stderr,"munmap: %s\n",strerror(errno)); 164 | } 165 | if (fd != -1) close(fd); 166 | if (dd != -1) close(dd); 167 | return rc; 168 | } 169 | 170 | /* this implementation only supports making one level of directory */ 171 | int do_mkdir(char *path) { 172 | int rc = -1, sc; 173 | char dir[PATH_MAX], *d; 174 | size_t l = strlen(path); 175 | struct stat s; 176 | 177 | /* dirname may modify its input, so pass a copy in */ 178 | if (l+1 > sizeof(dir)) goto done; 179 | memcpy(dir, path, l+1); 180 | d = dirname(dir); 181 | 182 | sc = stat(d, &s); 183 | if (sc < 0) { 184 | /* try to make the path */ 185 | if (mkdir(d, 0755) == 0) { rc = 0; goto done; } 186 | fprintf(stderr, "mkdir failed: %s %s\n", d, strerror(errno)); 187 | goto done; 188 | } else { 189 | /* path exists. is it a directory?
*/ 190 | if (S_ISDIR(s.st_mode)) { rc = 0; goto done; } /* yes */ 191 | fprintf(stderr, "path exists as non-directory: %s\n", d); 192 | goto done; 193 | } 194 | 195 | rc = 0; 196 | 197 | done: 198 | return rc; 199 | } 200 | 201 | int main(int argc, char *argv[]) { 202 | char *src=NULL, *dst=NULL, *name, oldpath[PATH_MAX],newpath[PATH_MAX]; 203 | int fd=-1, wd, mask, opt, rc=-1, slen, dlen; 204 | struct inotify_event *eb=NULL, *ev, *nx; 205 | size_t eb_sz = sizeof(*eb) + PATH_MAX, sz; 206 | ssize_t nr; 207 | 208 | CF.prog = argv[0]; 209 | 210 | while ( (opt = getopt(argc,argv,"pmvh")) > 0) { 211 | switch(opt) { 212 | case 'v': CF.verbose++; break; 213 | case 'p': CF.pattern_mode=1; break; 214 | case 'm': CF.mkdir_mode=1; break; 215 | case 'h': default: usage(); break; 216 | } 217 | } 218 | 219 | /* expect two more arguments - source and destination */ 220 | if (argc > optind) src = argv[optind++]; 221 | if (argc > optind) dst = argv[optind++]; 222 | if ((src == NULL) || (dst == NULL)) usage(); 223 | 224 | /* initialize source path buffer as /srcdir/... */ 225 | slen = strlen(src); 226 | memcpy(oldpath, src, slen); oldpath[slen]='/'; 227 | 228 | /* initialize dest path as /dstdir/... (regular mode) */ 229 | dlen = strlen(dst); 230 | memcpy(newpath, dst, dlen); newpath[dlen]='/'; 231 | 232 | /* setup inotify watch on src dir */ 233 | if ( (fd = inotify_init()) == -1) { 234 | fprintf(stderr, "inotify_init failed: %s\n", strerror(errno)); 235 | goto done; 236 | } 237 | 238 | mask = IN_CLOSE_WRITE; 239 | if ( (wd = inotify_add_watch(fd, src, mask)) == -1) { 240 | fprintf(stderr, "inotify_add_watch failed: %s\n", strerror(errno)); 241 | goto done; 242 | } 243 | 244 | /* see inotify(7) as inotify_event has a trailing name 245 | * field allocated beyond the fixed structure; we must 246 | * allocate enough room for the kernel to populate it */ 247 | if ( (eb = malloc(eb_sz)) == NULL) { 248 | fprintf(stderr, "out of memory\n"); 249 | goto done; 250 | } 251 | 252 | /* one read will produce one or more event structures */ 253 | while ( (nr=read(fd,eb,eb_sz)) > 0) { 254 | for(ev = eb; nr > 0; ev = nx) { 255 | 256 | sz = sizeof(*ev) + ev->len; 257 | nx = (struct inotify_event*)((char*)ev + sz); 258 | nr -= sz; 259 | 260 | name = (ev->len ? ev->name : src); 261 | memcpy(&oldpath[slen+1],name,strlen(name)+1); 262 | if (CF.pattern_mode == 0) memcpy(&newpath[dlen+1],name,strlen(name)+1); 263 | else if (pat2path(newpath, sizeof(newpath), name, dst) < 0) goto done; 264 | 265 | if (CF.mkdir_mode) { 266 | if (do_mkdir(newpath) < 0) goto done; 267 | } 268 | 269 | if (CF.verbose) fprintf(stderr, "%s --> %s\n", oldpath, newpath); 270 | if (map_copy(oldpath, newpath)) goto done; 271 | } 272 | } 273 | 274 | done: 275 | if (fd != -1) close(fd); 276 | if (eb) free(eb); 277 | return rc; 278 | } 279 | --------------------------------------------------------------------------------
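
The usage() text in watch_copy.c defines the `$` pattern language that pattern mode (-p) uses to map a source basename to a destination path. As a reading aid, here is a minimal standalone sketch of those documented rules. The function name `expand_pattern` and the sample file names are illustrative only; in the tool itself this expansion is performed by pat2path(), shown above.

    /* sketch: the "$" destination-pattern rules described in watch_copy's usage() */
    #include <stdio.h>
    #include <string.h>

    /* expand pat against basename src into out (capacity olen).
     * $1-$9 and $A-$Z select characters 1..36 of src, $0 is the whole
     * basename, $$ is a literal '$', anything else is copied literally.
     * returns 0 on success, -1 on error or truncation. */
    static int expand_pattern(char *out, size_t olen, const char *src, const char *pat) {
      size_t used = 0, l = strlen(src);
      for (const char *p = pat; *p; p++) {
        if (*p != '$') {                        /* literal character */
          if (used + 1 >= olen) return -1;
          out[used++] = *p;
          continue;
        }
        p++;
        if (*p == '$') {                        /* $$ -> literal $ */
          if (used + 1 >= olen) return -1;
          out[used++] = '$';
        } else if (*p == '0') {                 /* $0 -> whole basename */
          if (used + l >= olen) return -1;
          memcpy(out + used, src, l);
          used += l;
        } else {                                /* $1..$9, $A..$Z -> position 1..36 */
          int pos;
          if (*p >= '1' && *p <= '9') pos = *p - '0';
          else if (*p >= 'A' && *p <= 'Z') pos = *p - 'A' + 10;
          else return -1;
          if ((size_t)pos > l || used + 1 >= olen) return -1;
          out[used++] = src[pos - 1];
        }
      }
      out[used] = '\0';
      return 0;
    }

    int main(void) {
      char dest[256];
      /* mirrors the examples printed by watch_copy's usage() */
      if (expand_pattern(dest, sizeof(dest), "abc123.pcap", "/data/$1$2$3/$0") == 0)
        printf("%s\n", dest);   /* /data/abc/abc123.pcap */
      if (expand_pattern(dest, sizeof(dest), "fw-20170921.pcap", "/data/$A$B/$0") == 0)
        printf("%s\n", dest);   /* /data/21/fw-20170921.pcap */
      return 0;
    }

Note the same mapping also drives the `-t` templates used by ffcp in test3 and test4 above, where the regex capture groups play the role of the `$` positions.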