├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile.am ├── README.md ├── autogen.sh ├── bb.c ├── configure.ac ├── fluxcap.c ├── fluxcap.h ├── lib ├── .gitignore └── Makefile.am ├── respan.c ├── respan.h └── util ├── .gitignore ├── Makefile.am ├── ramdisk.c ├── tests ├── do_tests ├── test1 ├── test1.ans ├── test2 ├── test2.ans ├── test3 ├── test3.ans ├── test4 ├── test4.ans ├── test5 ├── test5.ans └── testdir.tar └── watch_copy.c /.gitignore: -------------------------------------------------------------------------------- 1 | fluxtop 2 | fluxcap 3 | ramdisk 4 | fpcap-replay 5 | fprune 6 | watch_copy 7 | *.o 8 | *.a 9 | .dirstamp 10 | *.swp 11 | Makefile.in 12 | aclocal.m4 13 | autom4te.cache 14 | compile 15 | configure 16 | depcomp 17 | install-sh 18 | missing 19 | Makefile 20 | config.log 21 | config.status 22 | .deps 23 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/libut"] 2 | path = lib/libut 3 | url = https://github.com/troydhanson/libut.git 4 | ignore = untracked 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | © 2017 The Johns Hopkins University Applied Physics Laboratory LLC. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | srcdir = @srcdir@ 2 | 3 | SUBDIRS=lib util 4 | 5 | bin_PROGRAMS = fluxcap respan 6 | 7 | fluxcap_SOURCES = fluxcap.c bb.c fluxcap.h 8 | fluxcap_CPPFLAGS = -I$(srcdir)/lib/libut/include 9 | fluxcap_LDADD = -Llib -lut -lshr -lm 10 | 11 | respan_SOURCES = respan.c respan.h 12 | respan_CPPFLAGS = 13 | respan_LDADD = -lshr 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Back to the [fluxcap Github page](http://github.com/troydhanson/fluxcap). 2 | Back to [my other projects](http://troydhanson.github.io/). 
3 | 4 | # About 5 | 6 | fluxcap: a network tap replication and aggregation tool 7 | 8 | A Linux host running fluxcap can: 9 | 10 | * accept taps, on one or more physical network interfaces, 11 | * aggregate them, possibly inserting VLAN tags, 12 | * transmit them, on one or more physical network interfaces, 13 | * send or receive taps encapsulated in GRE or VXLAN over IP. 14 | 15 | Fluxcap implements its features using raw sockets. It is 16 | written in C, MIT licensed, and for Linux only. 17 | 18 | Platforms: Ubuntu and RHEL/CentOS are primary, but it works on others. 19 | 20 | # Build & install 21 | 22 | ## Prereqs 23 | 24 | In order to build and use fluxcap, you need to install a few packages. 25 | 26 | # Ubuntu 27 | sudo apt-get install git gcc automake autoconf libtool ethtool 28 | 29 | # RHEL/CentOS 30 | sudo yum install git gcc automake autoconf libtool ethtool 31 | 32 | Last, libshr must be built and installed prior to building fluxcap. 33 | 34 | git clone https://github.com/troydhanson/shr.git 35 | cd shr 36 | autoreconf -ivf 37 | ./configure 38 | make 39 | sudo make install 40 | sudo ldconfig 41 | 42 | The libshr libraries and header files are now in `/usr/local/lib` 43 | and `/usr/local/include`. 44 | 45 | Note that on many systems, `/usr/local/lib` is not in the default 46 | library search path. On such systems, it is usually possible 47 | to add `/usr/local/lib` to the library search path by running: 48 | 49 | echo /usr/local/lib | sudo tee /etc/ld.so.conf.d/local.conf 50 | sudo ldconfig 51 | 52 | ## fluxcap 53 | 54 | In the top-level directory of fluxcap, run: 55 | 56 | git submodule update --init --recursive 57 | ./autogen.sh 58 | ./configure 59 | make 60 | sudo make install 61 | 62 | This installs `fluxcap`, typically into `/usr/local/bin`. 63 | 64 | # Preparing the host 65 | 66 | These things need to be done *at each system boot*: 67 | 68 | * disable hardware offloading on the receive/transmit NIC's 69 | * set up iptables to prevent accidental traffic on NIC's 70 | * make sure NIC's are up and not assigned IP addresses 71 | * mount a ramdisk for fluxcap's ring buffers 72 | 73 | The script below can be used. Save the script somewhere, make it executable, 74 | and execute it at startup via `/etc/rc.local` or similar. 75 | 76 | #!/bin/bash 77 | INTERFACES="ens33 enp4s0" # replace with YOUR NIC names! 78 | for IF in $INTERFACES 79 | do 80 | ethtool -K $IF tso off # TCP segmentation offload (output) 81 | ethtool -K $IF ufo off # UDP segmentation offload (output) 82 | ethtool -K $IF gso off # generic segmentation offload (output) 83 | ethtool -K $IF gro off # generic receive offload (input) 84 | ethtool -K $IF lro off # large receive offload (input) 85 | 86 | /sbin/iptables -A INPUT -i $IF -j DROP 87 | /sbin/iptables -A OUTPUT -o $IF -j DROP 88 | 89 | /sbin/ip link set dev $IF up 90 | done 91 | 92 | Last, we mount a ramdisk at each boot. You can name it anything; in 93 | this document, we use /ram as the mountpoint. Add to /etc/fstab: 94 | 95 | none /ram ramfs auto,noatime 0 0 96 | 97 | Then make the mountpoint and mount it: 98 | 99 | mkdir /ram 100 | mount /ram 101 | 102 | ### why disable offloads? 103 | 104 | When hardware offloading is left on, the NIC presents artificially large 105 | packets to the Linux host, by merging together IP packets in valid ways 106 | to reduce work the kernel would have to do in software.
However, this is 107 | _undesirable_ for tap replication (and for any kind of packet analysis) 108 | because the larger, conglomerated packets fail re-transmission, and may 109 | vastly exceed the MTU. Analysis tools want the original packets in any case. 110 | For the curious, an explanation of some offload parameters can be found 111 | [here](https://red.ht/2e608Oo). The usual symptom of skipping this step 112 | is to see fluxcap emit errors like `sendto: message too long`. 113 | 114 | ### why use iptables? 115 | 116 | It is just an added layer of protection from the host generating traffic 117 | of its own on the NIC. It is optional. 118 | 119 | ### why use ramfs? 120 | 121 | While tmpfs is newer, it can swap, and that is undesirable for this program. 122 | Use of ramfs is considered safe because we only create a few fixed-sized 123 | memory buffers in it. 124 | 125 | # Configuring fluxcap 126 | 127 | Here, we show how to run fluxcap by hand to set up tap replication or 128 | aggregation. In order to persist, these commands have to run at each 129 | system boot. We can use a process supervisor for that, but we show it 130 | by hand first. Everything needs to be run as root. 131 | 132 | ## Tap replication 133 | 134 | Suppose we have three available NIC's on a host and we want to replicate 135 | a tap coming into eth1 and re-transmit it on eth2 and eth3. We'll assume 136 | that eth0 is a management NIC and, obviously, we leave that one alone. 137 | 138 | eth0: management (leave alone) 139 | eth1: tap from Cisco switch 140 | eth2: tap output (copy #1) 141 | eth3: tap output (copy #2) 142 | 143 | Remember the ramfs we mounted earlier? We mounted it at /ram. That is 144 | where we will create a fluxcap ring buffer. In this setup we have one 145 | input NIC, so we only need one ring buffer. 146 | 147 | cd /ram 148 | fluxcap -cr -s 100m cisco 149 | 150 | Now if you run `ls /ram/cisco` you see a 100 MB file there. It's a 151 | memory buffer. It's a file too. Everything in unix is a file, right? 152 | The name "cisco" could be anything, but when you start working with 153 | a dozen taps coming into one host, it helps to name things clearly. 154 | 155 | Why did we choose a 100 MB size? (This uses real RAM by the way, so 156 | beware of making it too large; consider what RAM your host has free). 157 | The idea is we want the ring to be "large enough" that it can buffer 158 | data from the incoming tap long enough to get read by the transmit 159 | processes that will send it out the output NIC's. Here "long enough" 160 | means "before the data in the ring buffer gets written over". One 161 | could contemplate how to size the buffer, but we will just pick 162 | a number. You can use 1G (that is, a gigabyte) for the buffer if 163 | you have a lot of RAM, and just forget about it. A gigabyte can 164 | buffer about ten seconds of traffic from a fully loaded gigabit NIC. 165 | In any case, we can eyeball the I/O rates, and watch for drops, but 166 | first we have to start up the receive and transmit processes. 167 | 168 | cd /ram 169 | fluxcap -rx -i eth1 cisco & 170 | fluxcap -tx -i eth2 cisco & 171 | fluxcap -tx -i eth3 cisco & 172 | 173 | At this point it is up and running. The first process captures on 174 | eth1 into the ring /ram/cisco. The second process transmits on eth2, 175 | and the last on eth3. (If you see errors like `sendto: message too long`, 176 | you should review the section on disabling NIC offloads above). 177 | 178 | We used `&` to put them in the background.
You could run them in 179 | three separate terminals instead. In real life, we put them under 180 | a process supervisor, discussed further below. 181 | 182 | We can watch the I/O rates this way: 183 | 184 | fluxcap -io cisco 185 | 186 | We could add a VLAN tag on the data when it comes in from eth1. That 187 | helps distinguish things if we merge several taps down the road. Run 188 | `fluxcap -h` to see the VLAN tag injection and other options. 189 | 190 | ## Tap aggregation 191 | 192 | Suppose we have two input taps. We want to aggregate them. We want 193 | to transmit the aggregate tap on a third interface. 194 | 195 | enp0: management (leave alone) 196 | enp1: tap from Cisco switch 197 | enp2: tap from Dell switch 198 | enp3: aggregate (Cisco+Dell) output 199 | 200 | We want to create a ring buffer for each input NIC, so we need two. 201 | Doing this by hand at the shell prompt, we'd run: 202 | 203 | cd /ram 204 | fluxcap -cr -s 100m cisco dell 205 | 206 | Last, we run two receive processes and two transmit processes: 207 | 208 | fluxcap -rx -i enp1 cisco & 209 | fluxcap -rx -i enp2 dell & 210 | fluxcap -tx -i enp3 cisco & 211 | fluxcap -tx -i enp3 dell & 212 | 213 | We can run `fluxcap -io cisco dell` at this point to see the I/O rates. 214 | In an aggregation scenario it may be helpful to synthesize VLAN tags on 215 | the input taps so they can still be distinguished after aggregation. We 216 | could have used `-V 100` on the cisco receiver, and `-V 200` for dell, 217 | for example. 218 | 219 | ## Under a process supervisor 220 | 221 | Running things by hand is good for testing. If persistence is needed, and 222 | resilience in the face of things like NIC's going up and down when someone 223 | unplugs the cable and plugs it back in, then use a process supervisor. An 224 | example using [pmtr](http://github.com/troydhanson/pmtr) is shown here. 225 | 226 | # pmtr.conf 227 | 228 | job { 229 | dir /ram 230 | cmd /usr/local/bin/fluxcap -cr -s 100m cisco dell 231 | wait 232 | once 233 | } 234 | 235 | job { 236 | dir /ram 237 | cmd /usr/local/bin/fluxcap -rx -i enp1 cisco 238 | } 239 | 240 | ... 241 | 242 | This way, pmtr starts things at boot, and restarts processes that exit. 243 | The NIC offload script could be run from here too, instead of rc.local. 244 | 245 | ## Encapsulation modes 246 | 247 | Fluxcap can also transmit and receive taps over a regular IP network. The 248 | packets travel inside a layer of GRE or VXLAN encapsulation. 249 | 250 | VXLAN encapsulates the original packet in a new UDP packet to port 4789, and 251 | prepends an 8-byte header carrying a network identifier (VNI). Currently, 252 | fluxcap supports sending VXLAN but not receiving it. 253 | 254 | The GRE tunnel encapsulation modes are GRETAP and regular GRE. GRETAP (also 255 | called TEB for "transparent ethernet bridging") is preferred over GRE. 256 | GRETAP preserves the MAC addresses in the encapsulation, whereas GRE does not.
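
To make the GRETAP framing concrete, here is a small illustrative sketch of the 4-byte GRE header (8 bytes when a key is present) that precedes the copied Ethernet frame. The helper function is invented for this README and is not part of fluxcap; its field layout mirrors the gretap case that `encapsulate_tx()` in fluxcap.c writes inline:

    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* illustrative only: write a GRETAP header at g.
     * returns the header length: 4 bytes, or 8 when a GRE key is present.
     * layout follows the gretap case of encapsulate_tx() in fluxcap.c. */
    static int fill_gretap_header(uint8_t *g, uint32_t key_net_order, int have_key) {
      uint16_t teb = htons(0x6558);          /* transparent ethernet bridging */
      memset(g, 0, 2);                       /* GRE flags/version: all zero...    */
      if (have_key) g[0] |= 1U << 5;         /* ...except the K (key present) bit */
      memcpy(g + 2, &teb, 2);                /* GRE protocol type */
      if (have_key) memcpy(g + 4, &key_net_order, 4); /* optional GRE key */
      return have_key ? 8 : 4;
    }

The full Ethernet frame, MAC addresses included, follows this header; that is why GRETAP, unlike plain GRE, preserves the original MACs.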
257 | 258 | ### Transmitter 259 | 260 | In this example, the GRETAP recipient tunnel endpoint is 192.168.102.100: 261 | 262 | fluxcap -tx -E gretap:192.168.102.100 ring 263 | 264 | A VXLAN transmitter example that sets the VNI to 1234 is: 265 | 266 | fluxcap -tx -E vxlan:192.168.102.100 -K 1234 ring 267 | 268 | ### Receiver 269 | 270 | fluxcap -rx -E gretap ring 271 | 272 | To limit the interface and/or the IP address on which GRE is received, use: 273 | 274 | fluxcap -rx -E gretap:127.0.0.1 -i lo ring 275 | 276 | replacing 127.0.0.1 with the local IP address or replacing lo with an interface. 277 | 278 | ### GRE keys 279 | 280 | It is possible to set the GRE key on a transmitted GRE/GRETAP tunnel 281 | using the `-K <key>` option. For example, `-K 500` sets the key to 500. 282 | This is useful when aggregating multiple taps over GRE, when there is a 283 | need to differentiate them on the receiving end. 284 | 285 | On the receiver, `-K <key>` specifies the key that should be accepted. 286 | 287 | The key can be specified as a 32-bit unsigned integer, or as a dotted 288 | quad IP whose meaning is up to the user. 289 | 290 | ### VNI 291 | 292 | When using VXLAN encapsulation, the `-K <key>` value is interpreted as a VNI. 293 | 294 | #### Receiver alternative: Linux OS decapsulation 295 | 296 | If the recipient host is Linux, it can decapsulate the tunnel for us. 297 | This creates a synthetic NIC on the host that behaves as if the remote 298 | tap cable were plugged into it, ready for use with a packet analysis tool. 299 | 300 | First, confirm the recipient is getting the tunneled packets: 301 | 302 | tcpdump -n "proto gre" 303 | 304 | Then, to have Linux decapsulate for us, modify these commands by 305 | replacing 192.168.102.100 with the recipient IP address, and replacing 306 | 192.168.102.1 with the transmitter's IP address. 307 | 308 | # gretap 309 | 310 | modprobe ip_gre 311 | ip link add gretap1 type gretap local 192.168.102.100 remote 192.168.102.1 312 | ip link set gretap1 up 313 | 314 | Now we can use gretap1 as if it were plugged into the remote tap. Try running 315 | `tcpdump -i gretap1 -nne` for example. If we had used gre instead of gretap: 316 | 317 | # gre 318 | 319 | modprobe ip_gre 320 | ip tunnel add gre1 type gre remote 192.168.102.1 local 192.168.102.100 ttl 255 321 | ip link set gre1 up 322 | 323 | ##### firewalld 324 | 325 | You may need to ensure that iptables/firewalld allow the traffic. On a CentOS 7 326 | system, `sudo systemctl stop firewalld` permits the data to arrive on gretap1. 327 | 328 | ##### MTU consideration 329 | 330 | When encapsulating packets, they grow. If the original packet was at the MTU of 331 | its network, and GRETAP encapsulation adds 24 bytes (28 if a GRE key is used), 332 | then each packet may fragment into two packets when sent over the tunnel. This 333 | IP fragmentation is reversed invisibly on the remote end. 334 | 335 | Fragmentation can be eliminated by raising the MTU on the tunnel network, or by 336 | truncating the packets (`-s`) to a max length when encapsulating. 337 | 338 | fluxcap -tx -s 1476 -E gretap:192.168.102.100 ring 339 | 340 | The syntax above truncates at 1476 bytes, so that adding the 24-byte GRETAP 341 | header fills a 1500-byte MTU exactly, without fragmentation. (A small worked example of this arithmetic appears at the end of this README.) 342 | 343 | ## Other features 344 | 345 | Run `fluxcap -h` to see other options.
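
As a footnote to the MTU consideration above, the snaplen arithmetic can be checked with a few lines of C. This is illustrative only and not part of fluxcap; it uses the overhead figures quoted in that section, which break down as a 20-byte outer IP header plus a 4-byte GRETAP header, and 4 more bytes when a GRE key is set with `-K`:

    #include <stdio.h>

    /* illustrative only: largest -s snaplen that avoids fragmentation,
     * given the tunnel network MTU and the GRETAP overhead (20-byte outer
     * IP header + 4-byte GRE header, + 4 bytes when a GRE key is used) */
    static int gretap_snaplen(int tunnel_mtu, int with_key) {
      return tunnel_mtu - (20 + 4 + (with_key ? 4 : 0));
    }

    int main(void) {
      printf("-s %d\n", gretap_snaplen(1500, 0));  /* 1476, as shown above */
      printf("-s %d\n", gretap_snaplen(1500, 1));  /* 1472 when -K is used  */
      return 0;
    }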
346 | 347 | 348 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | autoreconf -ifv 2 | -------------------------------------------------------------------------------- /bb.c: -------------------------------------------------------------------------------- 1 | #include "fluxcap.h" 2 | 3 | /* 4 | * support to vectorize struct iovec and struct bb 5 | */ 6 | UT_mm iov_mm = { . sz = sizeof(struct iovec) }; 7 | 8 | void bb_init(void *_b) { 9 | struct bb *b = (struct bb*)_b; 10 | memset(b,0,sizeof(*b)); 11 | b->n = BATCH_SIZE; 12 | int mode = MAP_PRIVATE | MAP_ANONYMOUS /* | MAP_LOCKED */; 13 | b->d = mmap(0, b->n, PROT_READ|PROT_WRITE, mode, -1, 0); 14 | if (b->d == MAP_FAILED) { 15 | fprintf(stderr, "mmap: %s\n", strerror(errno)); 16 | abort(); 17 | } 18 | b->iov = utvector_new(&iov_mm); 19 | utvector_reserve(b->iov, BATCH_PKTS); 20 | } 21 | 22 | void bb_fini(void *_b) { 23 | struct bb *b = (struct bb*)_b; 24 | assert (b->d && (b->d != MAP_FAILED)); 25 | munmap(b->d, b->n); 26 | utvector_free(b->iov); 27 | } 28 | 29 | void bb_clear(void *_b) { 30 | struct bb *b = (struct bb*)_b; 31 | b->u = 0; 32 | utvector_clear(b->iov); 33 | } 34 | 35 | UT_mm bb_mm = { 36 | .sz = sizeof(struct bb), 37 | .init = bb_init, 38 | .fini = bb_fini, 39 | .clear = bb_clear, 40 | }; 41 | 42 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([fluxcap], 3.1) 2 | AM_INIT_AUTOMAKE([foreign subdir-objects]) 3 | m4_ifdef([AM_SILENT_RULES], 4 | [AM_SILENT_RULES([yes]) 5 | ]) 6 | AC_PROG_CC 7 | AC_PROG_RANLIB 8 | 9 | have_shr_header=n 10 | have_shr_lib=n 11 | AC_CHECK_HEADERS([shr.h],[have_shr_header=y]) 12 | AC_CHECK_LIB(shr,shr_ctl,[have_shr_lib=y]) 13 | if test "x${have_shr_header}${have_shr_lib}" != xyy 14 | then 15 | AC_MSG_ERROR([ 16 | ----------------------------------------------------- 17 | The libshr build prerequisite was not found. Please 18 | see the build instructions, install libshr and retry. 
19 | ----------------------------------------------------- 20 | ]) 21 | fi 22 | 23 | AC_CONFIG_FILES([Makefile 24 | util/Makefile 25 | lib/Makefile 26 | ]) 27 | AC_OUTPUT 28 | -------------------------------------------------------------------------------- /fluxcap.c: -------------------------------------------------------------------------------- 1 | #include "fluxcap.h" 2 | 3 | /* 4 | * fluxcap: a network tap replication and aggregation tool 5 | * 6 | */ 7 | 8 | struct mmsghdr bss_msgv[BATCH_PKTS]; 9 | 10 | struct { 11 | int verbose; 12 | char *prog; 13 | enum {mode_none, mode_transmit, mode_receive, mode_create, mode_watch} mode; 14 | char *file; 15 | char dev[MAX_NIC]; 16 | unsigned long ticks; 17 | int vlan; 18 | int pass_vlan; 19 | int tail; 20 | int fd; 21 | int tx_fd; 22 | int rx_fd; 23 | int signal_fd; 24 | int timer_fd; 25 | int epoll_fd; 26 | char pkt[MAX_PKT]; 27 | struct shr *ring; 28 | size_t size; /* ring create size (-cr), or snaplen (-rx/-tx) */ 29 | struct encap encap; 30 | struct itimerspec timer; 31 | uint16_t ip_id; /* for implementing IP fragmentation when */ 32 | int mtu; /* using gre encapsulation */ 33 | UT_vector /* of ptr */ *watch_rings; 34 | UT_vector /* of utstring */ *watch_names; 35 | UT_vector /* of struct ww */ *watch_win; 36 | UT_string *tmp; 37 | struct timeval now; 38 | struct bb bb; /* output shr ring batch buffer; accumulates til shr_writev */ 39 | struct bb rb; /* input shr ring batch buffer; accepts many via shr_readv */ 40 | struct bb pb; /* packet buffer (Special); faux bb wrapping kernel ring */ 41 | /* fields below are for packet input from AF_PACKET socket */ 42 | struct tpacket_req req; /* linux/if_packet.h */ 43 | unsigned ring_block_sz; /* see comments in initialization below */ 44 | unsigned ring_block_nr; /* number of blocks of sz above */ 45 | unsigned ring_frame_sz; /* snaplen */ 46 | unsigned ring_curr_idx; /* slot index in ring buffer */ 47 | unsigned ring_frame_nr; /* redundant, total frame count */ 48 | int strip_vlan; /* strip VLAN on rx if present (boolean) */ 49 | int drop_pct; /* sampling % 0 (keep all)-100(drop all) */ 50 | int use_tx_ring; /* 0 = sendto-based tx; 1=packet mmap ring-based tx */ 51 | int bypass_qdisc_on_tx; /* bypass kernel qdisc layer, more risk of loss */ 52 | struct fluxcap_stats stats; /* used to periodically update rx/rd stats */ 53 | int keep; /* in mode_create, keep existing ring if present */ 54 | int losing; 55 | struct bb gb; /* used in gre rx for recvmmsg */ 56 | struct mmsghdr *msgv; /* used in gre rx for recvmmsg */ 57 | } cfg = { 58 | .fd = -1, 59 | .tx_fd = -1, 60 | .rx_fd = -1, 61 | .signal_fd = -1, 62 | .timer_fd = -1, 63 | .epoll_fd = -1, 64 | .ring_block_sz = 1 << 22, /*4 mb; want powers of two due to kernel allocator*/ 65 | .ring_block_nr = 64, 66 | .ring_frame_sz = 1 << 11, /* 2048 for MTU & header, divisor of ring_block_sz*/ 67 | .timer = { 68 | .it_value = { .tv_sec = 0, .tv_nsec = 1 }, 69 | .it_interval = { .tv_sec = 0, .tv_nsec = 1000000000UL / TIMER_HZ }, 70 | }, 71 | .msgv = bss_msgv, 72 | }; 73 | 74 | extern UT_mm bb_mm; 75 | UT_mm ww_mm = { .sz = sizeof(struct ww), }; 76 | UT_mm _utmm_ptr = {.sz = sizeof(void*)}; 77 | UT_mm* utmm_ptr = &_utmm_ptr; 78 | 79 | /* signals that we'll accept via signalfd in epoll */ 80 | int sigs[] = {SIGHUP,SIGTERM,SIGINT,SIGQUIT,SIGALRM}; 81 | 82 | void usage() { 83 | fprintf(stderr, 84 | "usage: %s [-cr|-tx|-rx|-io] [options] \n" 85 | "\n" 86 | " create ring(s): -cr -s [k|m|g|t] ...\n" 87 | " transmit: -tx -i \n" 88 | " receive: -rx -i \n" 89 | " i/o 
view: -io ...\n" 90 | "\n" 91 | "Encapsulation modes:\n" 92 | " -tx -E gretap: [-K ] (GRETAP send)\n" 93 | " -rx -E gretap[:]> [-K ] [-i ] (GRETAP recv)\n" 94 | " -tx -E gre: [-K ] (GRE send)\n" 95 | " -tx -E vxlan: [-K ] (VXLAN send)\n" 96 | " where:\n" 97 | " GRE key/dotted quad (optional) [rx/tx]\n" 98 | " binds a local IP (optional) [rx]\n" 99 | " binds a local NIC (optional) [rx]\n" 100 | "\n" 101 | "Other options:\n" 102 | " -f 'vlan n' (accept packets tagged VLAN n) [tx]\n" 103 | " -V (inject VLAN tag) [rx/tx]\n" 104 | " -Q (remove VLAN tag) [rx]\n" 105 | " -d (downsample to <0-99>%% [rx/tx]\n" 106 | " -s (truncate at length) [rx/tx]\n" 107 | " -D (trim n tail bytes) [rx/tx]\n" 108 | " -R (tpacket-based tx) [tx]\n" 109 | " -q (bypass qdisc layer) [tx]\n" 110 | " -v (verbose)\n" 111 | "\n" 112 | " Kernel buffer options (TPACKET_V2) [rx/tx]\n" 113 | " Defaults apply if left unspecified. To use these options\n" 114 | " the block size must be a multiple of the system page size,\n" 115 | " and be small since it consumes physically contiguous pages.\n" 116 | " The number of blocks can be large. Their product is the buffer\n" 117 | " capacity. The frame size must evenly divide the block size.\n" 118 | " The parameters are checked to satisfy these constraints.\n" 119 | " The frame size is for one packet (with overhead) so it should\n" 120 | " exceed the MTU for full packet handling without truncation.\n" 121 | " -Z (max frame size) [2048]\n" 122 | " -B (number of blocks) [64])\n" 123 | " -S (block size log2) [22] (4mb)\n" 124 | "\n", 125 | cfg.prog); 126 | fprintf(stderr, "fluxcap version: %s\n", FLUXCAP_VERSION); 127 | exit(-1); 128 | } 129 | 130 | void hexdump(char *buf, size_t len) { 131 | size_t i,n=0; 132 | unsigned char c; 133 | while(n < len) { 134 | fprintf(stderr,"%08x ", (int)n); 135 | for(i=0; i < 16; i++) { 136 | c = (n+i < len) ? buf[n+i] : 0; 137 | if (n+i < len) fprintf(stderr,"%.2x ", c); 138 | else fprintf(stderr, " "); 139 | } 140 | for(i=0; i < 16; i++) { 141 | c = (n+i < len) ? buf[n+i] : ' '; 142 | if (c < 0x20 || c > 0x7e) c = '.'; 143 | fprintf(stderr,"%c",c); 144 | } 145 | fprintf(stderr,"\n"); 146 | n += 16; 147 | } 148 | } 149 | 150 | int new_epoll(int events, int fd) { 151 | int rc; 152 | struct epoll_event ev; 153 | memset(&ev,0,sizeof(ev)); // placate valgrind 154 | ev.events = events; 155 | ev.data.fd= fd; 156 | rc = epoll_ctl(cfg.epoll_fd, EPOLL_CTL_ADD, fd, &ev); 157 | if (rc == -1) { 158 | fprintf(stderr,"epoll_ctl: %s\n", strerror(errno)); 159 | } 160 | return rc; 161 | } 162 | 163 | /* 164 | * read_proc 165 | * 166 | * read a complete file from the /proc filesystem 167 | * this is special because its size is not known a priori 168 | * so a read/realloc loop is needed 169 | * 170 | * size into len, returning buffer or NULL on error. 171 | * caller should free the buffer eventually. 
172 | */ 173 | char *read_proc(char *file, size_t *len) { 174 | char *buf=NULL, *b, *tmp; 175 | int fd = -1, rc = -1, eof=0; 176 | size_t sz, br=0, l; 177 | ssize_t nr; 178 | 179 | /* initial guess at a sufficient buffer size */ 180 | sz = 1000; 181 | 182 | fd = open(file, O_RDONLY); 183 | if (fd < 0) { 184 | fprintf(stderr,"open: %s\n", strerror(errno)); 185 | goto done; 186 | } 187 | 188 | while(!eof) { 189 | 190 | tmp = realloc(buf, sz); 191 | if (tmp == NULL) { 192 | fprintf(stderr, "out of memory\n"); 193 | goto done; 194 | } 195 | 196 | buf = tmp; 197 | b = buf + br; 198 | l = sz - br; 199 | 200 | do { 201 | nr = read(fd, b, l); 202 | if (nr < 0) { 203 | fprintf(stderr,"read: %s\n", strerror(errno)); 204 | goto done; 205 | } 206 | 207 | b += nr; 208 | l -= nr; 209 | br += nr; 210 | 211 | /* out of space? double buffer size */ 212 | if (l == 0) { 213 | sz *= 2; 214 | break; 215 | } 216 | 217 | if (nr == 0) eof = 1; 218 | 219 | } while (nr > 0); 220 | } 221 | 222 | *len = br; 223 | rc = 0; 224 | 225 | done: 226 | if (fd != -1) close(fd); 227 | if (rc && buf) { free(buf); buf = NULL; } 228 | return buf; 229 | } 230 | 231 | /* 232 | * find start and length of column N (one-based) 233 | * in input buffer buf of length buflen 234 | * 235 | * columns must be space-or-tab delimited 236 | * returns NULL if column not found 237 | * 238 | * the final column may end in newline or eob 239 | * 240 | * col: column index (1-based) 241 | * len: OUTPUT parameter (column length) 242 | * buf: buffer to find columns in 243 | * buflen: length of buf 244 | * 245 | * returns: 246 | * pointer to column N, or NULL 247 | */ 248 | #define ws(x) (((x) == ' ') || ((x) == '\t')) 249 | char *get_col(int col, size_t *len, char *buf, size_t buflen) { 250 | char *b, *start=NULL, *eob; 251 | int num; 252 | 253 | eob = buf + buflen; 254 | 255 | b = buf; 256 | num = 0; /* column number */ 257 | *len = 0; /* column length */ 258 | 259 | while (b < eob) { 260 | 261 | if (ws(*b) && (num == col)) break; /* end of sought column */ 262 | if (*b == '\n') break; /* end of line */ 263 | 264 | if (ws(*b)) *len = 0; /* skip over whitespace */ 265 | if ((!ws(*b)) && (*len == 0)) { /* record start of column */ 266 | num++; 267 | start = b; 268 | } 269 | if (!ws(*b)) (*len)++; /* increment column length */ 270 | b++; 271 | } 272 | 273 | if ((*len) && (num == col)) return start; 274 | return NULL; 275 | } 276 | 277 | /* 278 | * find route for a given destination IP address 279 | * 280 | * parameters: 281 | * dest_ip: the destination IP address in network order 282 | * interface: char[] to receive the output NIC interface name 283 | * must be at least IF_NAMESIZE bytes long; 284 | * see IF_NAMESIZE in /usr/include/net/if.h 285 | * returns: 286 | * 0 success 287 | * -1 error parsing routing table 288 | * -2 no route found 289 | * 290 | */ 291 | int find_route(uint32_t dest_ip, 292 | char *interface) { 293 | 294 | int rc = -1, sc; 295 | char *buf=NULL, *line, *b, *iface, *s_dest, *s_gw, *s_mask; 296 | unsigned mask, dest, gw, best_mask=0, nroutes=0; 297 | size_t len, sz=0, to_eob, iface_len; 298 | 299 | buf = read_proc("/proc/net/route", &sz); 300 | if (buf == NULL) goto done; 301 | 302 | /* find initial newline; discard header row */ 303 | b = buf; 304 | while ((b < buf+sz) && (*b != '\n')) b++; 305 | line = b+1; 306 | 307 | while (line < buf+sz) { 308 | 309 | to_eob = sz-(line-buf); 310 | 311 | s_dest = get_col(2, &len, line, to_eob); 312 | if (s_dest == NULL) goto done; 313 | sc = sscanf(s_dest, "%x", &dest); 314 | if (sc != 1) goto 
done; 315 | 316 | s_mask = get_col(8, &len, line, to_eob); 317 | if (s_mask == NULL) goto done; 318 | sc = sscanf(s_mask, "%x", &mask); 319 | if (sc != 1) goto done; 320 | 321 | iface = get_col(1, &iface_len, line, to_eob); 322 | if (iface == NULL) goto done; 323 | 324 | /* advance to next line */ 325 | b = line; 326 | while ((b < buf+sz) && (*b != '\n')) b++; 327 | line = b+1; 328 | 329 | /* does the route apply? */ 330 | if ((dest_ip & mask) != dest) continue; 331 | 332 | /* know a more specific route? */ 333 | if (mask < best_mask) continue; 334 | 335 | /* this is the best route so far */ 336 | best_mask = mask; 337 | 338 | /* copy details of this route */ 339 | if (iface_len + 1 > IF_NAMESIZE) goto done; 340 | memcpy(interface, iface, iface_len); 341 | interface[iface_len] = '\0'; 342 | nroutes++; 343 | } 344 | 345 | rc = nroutes ? 0 : -2; 346 | 347 | done: 348 | if (buf) free(buf); 349 | return rc; 350 | } 351 | 352 | /* get the MTU for the interface, or -1 on error */ 353 | int get_if_mtu(char *eth) { 354 | int fd = -1, sc, rc = -1; 355 | struct ifreq ifr; 356 | 357 | fd = socket(AF_INET, SOCK_DGRAM, 0); 358 | if (fd == -1) { 359 | fprintf(stderr, "socket: %s\n", strerror(errno)); 360 | goto done; 361 | } 362 | 363 | strncpy(ifr.ifr_name, eth, sizeof(ifr.ifr_name)); 364 | sc = ioctl(fd, SIOCGIFMTU, &ifr); 365 | if (sc < 0) { 366 | fprintf(stderr, "ioctl: %s\n", strerror(errno)); 367 | goto done; 368 | } 369 | 370 | rc = ifr.ifr_mtu; 371 | 372 | done: 373 | if (fd != -1) close(fd); 374 | return rc; 375 | } 376 | 377 | int check_ring_parameters(void) { 378 | int rc=-1; 379 | unsigned page_sz; 380 | 381 | if (cfg.ring_block_sz % cfg.ring_frame_sz) { 382 | fprintf(stderr,"-S block_sz must be multiple of -F frame_sz\n"); 383 | goto done; 384 | } 385 | 386 | page_sz = (unsigned)sysconf(_SC_PAGESIZE); 387 | 388 | if (cfg.ring_block_sz % page_sz) { 389 | fprintf(stderr,"-S block_sz must be multiple of page_sz %u\n", page_sz); 390 | goto done; 391 | } 392 | 393 | if (cfg.ring_frame_sz <= TPACKET2_HDRLEN) { 394 | fprintf(stderr,"-Z frame_sz must exceed %lu\n", TPACKET2_HDRLEN); 395 | goto done; 396 | } 397 | 398 | if (cfg.ring_frame_sz % TPACKET_ALIGNMENT) { 399 | fprintf(stderr,"-Z frame_sz must be a multiple of %u\n", TPACKET_ALIGNMENT); 400 | goto done; 401 | } 402 | 403 | cfg.ring_frame_nr = (cfg.ring_block_sz / cfg.ring_frame_sz) * cfg.ring_block_nr; 404 | 405 | rc = 0; 406 | 407 | done: 408 | return rc; 409 | 410 | } 411 | 412 | /* print the ring capacity in MB and packets 413 | * 414 | * here in userspace, the ring is nothing but a regular flat buffer. 415 | * it is comprised of contiguous slots - all of which have the same size. 416 | * 417 | * in kernel space, the ring is a set of blocks; each block is a number of 418 | * physically contiguous pages. since physically contiguous pages are 419 | * limited, the kernel only gets small allocations of them. it forms the 420 | * blocks into a virtually contiguous buffer for our benefit in user space. 421 | * 422 | * these kernel memory considerations are why the ring is specified as 423 | * a number of blocks (cfg.ring_block_nr) of a given size (cfg.ring_block_sz). 424 | * the other parameter (cfg.ring_frame_sz) is the max size of a packet structure 425 | * (struct tpacket_hdr, struct sockaddr_ll, packet itself, and padding). so 426 | * to deal with full packet data it needs to be the MTU plus all that overhead. 
427 | * 428 | * we require block size to be a multiple of frame size, so there are no gaps 429 | * in the userspace view of the packet ring. it is a simple array of slots. 430 | * 431 | */ 432 | void describe_ring(char *label) { 433 | 434 | double block_size_mb = cfg.ring_block_sz / (1024.0 * 1024); 435 | double mb = cfg.ring_block_nr * block_size_mb; 436 | 437 | fprintf(stderr, "%s: %.1f megabytes (max %u packets)\n", 438 | label, mb, cfg.ring_frame_nr); 439 | 440 | if (cfg.verbose) { 441 | 442 | double bps = 10000000000.0; /* 10 gigabit/sec network */ 443 | double mbytes_per_sec = bps / ( 8 * 1024 * 1024); 444 | double sec = mb / mbytes_per_sec; 445 | 446 | fprintf(stderr, 447 | " RING: (%u blocks * %u bytes per block) = %.1f megabytes\n" 448 | " PACKETS: @(%u bytes/packet) = %u packets\n" 449 | " TIME TO QUENCH @ 10Gigabit/s: %.1f seconds\n", 450 | cfg.ring_block_nr, cfg.ring_block_sz, mb, 451 | cfg.ring_frame_sz, cfg.ring_frame_nr, sec); 452 | } 453 | } 454 | 455 | /* set up as a GRE receiver */ 456 | int setup_rx_encap(void) { 457 | struct sockaddr *sa; 458 | int i, sc, rc = -1; 459 | struct iovec *iov; 460 | socklen_t sz; 461 | 462 | cfg.rx_fd = socket(AF_INET, SOCK_RAW, IPPROTO_GRE); 463 | if (cfg.rx_fd == -1) { 464 | fprintf(stderr,"socket: %s\n", strerror(errno)); 465 | goto done; 466 | } 467 | 468 | /* bind local IP; defaults to INADDR_ANY */ 469 | struct sockaddr_in in; 470 | memset(&in, 0, sizeof(in)); 471 | in.sin_addr = cfg.encap.dst; 472 | sa = (struct sockaddr*)&in; 473 | sz = sizeof(in); 474 | 475 | sc = bind(cfg.rx_fd, sa, sz); 476 | if (sc < 0) { 477 | fprintf(stderr, "bind: %s\n", strerror(errno)); 478 | goto done; 479 | } 480 | 481 | /* bind specific RX NIC if requested */ 482 | sz = strlen(cfg.dev); 483 | sc = sz ? setsockopt(cfg.rx_fd, SOL_SOCKET, SO_BINDTODEVICE, cfg.dev, sz) : 0; 484 | if (sc < 0) { 485 | fprintf(stderr, "setsockopt: %s\n", strerror(errno)); 486 | goto done; 487 | } 488 | 489 | /* set up recvmmsg buffers */ 490 | assert(BATCH_SIZE == BATCH_PKTS * MAX_PKT); 491 | assert(cfg.gb.n == BATCH_PKTS * MAX_PKT); 492 | assert(cfg.gb.iov && (cfg.gb.iov->n == BATCH_PKTS)); 493 | cfg.gb.iov->i = cfg.gb.iov->n; /* mark slots used */ 494 | iov = (struct iovec*)utvector_head(cfg.gb.iov); 495 | for(i=0; i < BATCH_PKTS; i++) { 496 | iov[i].iov_base = cfg.gb.d + i * MAX_PKT; 497 | iov[i].iov_len = MAX_PKT; 498 | cfg.msgv[i].msg_hdr.msg_iov = &iov[i]; 499 | cfg.msgv[i].msg_hdr.msg_iovlen = 1; 500 | } 501 | 502 | rc = 0; 503 | 504 | done: 505 | return rc; 506 | } 507 | 508 | /* 509 | * Prepare to read packets using an AF_PACKET socket with PACKET_RX_RING 510 | * 511 | * see packet(7) 512 | * 513 | * also see 514 | * sudo apt-get install linux-doc 515 | * zless /usr/share/doc/linux-doc/networking/packet_mmap.txt.gz 516 | * 517 | * With PACKET_RX_RING (in TPACKET_V2) 518 | * the ring buffer consists of an array of packet slots. 519 | * 520 | * Each packet is preceded by a metadata structure in the slot. 521 | * The application and kernel communicate the head and tail of 522 | * the ring through the tp_status field (TP_STATUS_[USER|KERNEL]).
523 | * 524 | */ 525 | 526 | int setup_rx(void) { 527 | int rc=-1, ec; 528 | 529 | if (check_ring_parameters() < 0) goto done; 530 | 531 | /* any link layer protocol packets (linux/if_ether.h) */ 532 | int protocol = htons(ETH_P_ALL); 533 | 534 | /* create the packet socket */ 535 | cfg.fd = socket(AF_PACKET, SOCK_RAW, protocol); 536 | if (cfg.fd == -1) { 537 | fprintf(stderr,"socket: %s\n", strerror(errno)); 538 | goto done; 539 | } 540 | 541 | /* convert interface name to index (in ifr.ifr_ifindex) */ 542 | struct ifreq ifr; 543 | strncpy(ifr.ifr_name, cfg.dev, sizeof(ifr.ifr_name)); 544 | ec = ioctl(cfg.fd, SIOCGIFINDEX, &ifr); 545 | if (ec < 0) { 546 | fprintf(stderr,"failed to find interface %s\n", cfg.dev); 547 | goto done; 548 | } 549 | 550 | /* PACKET_RX_RING comes in multiple versions. TPACKET_V2 is used here */ 551 | int v = TPACKET_V2; 552 | ec = setsockopt(cfg.fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v)); 553 | if (ec < 0) { 554 | fprintf(stderr,"setsockopt PACKET_VERSION: %s\n", strerror(errno)); 555 | goto done; 556 | } 557 | 558 | /* fill out the struct tpacket_req describing the ring buffer */ 559 | memset(&cfg.req, 0, sizeof(cfg.req)); 560 | cfg.req.tp_block_size = cfg.ring_block_sz; /* Min sz of contig block */ 561 | cfg.req.tp_frame_size = cfg.ring_frame_sz; /* Size of frame/snaplen */ 562 | cfg.req.tp_block_nr = cfg.ring_block_nr; /* Number of blocks */ 563 | cfg.req.tp_frame_nr = cfg.ring_frame_nr; /* Total number of frames */ 564 | describe_ring("PACKET_RX_RING"); 565 | ec = setsockopt(cfg.fd, SOL_PACKET, PACKET_RX_RING, &cfg.req, sizeof(cfg.req)); 566 | if (ec < 0) { 567 | fprintf(stderr,"setsockopt PACKET_RX_RING: %s\n", strerror(errno)); 568 | goto done; 569 | } 570 | 571 | /* now map the ring buffer we described above. lock in unswappable memory */ 572 | cfg.pb.n = cfg.req.tp_block_size * cfg.req.tp_block_nr; 573 | cfg.pb.d = mmap(NULL, cfg.pb.n, PROT_READ|PROT_WRITE, 574 | MAP_SHARED|MAP_LOCKED, cfg.fd, 0); 575 | if (cfg.pb.d == MAP_FAILED) { 576 | fprintf(stderr,"mmap: %s\n", strerror(errno)); 577 | goto done; 578 | } 579 | 580 | /* bind to receive the packets from just one interface */ 581 | struct sockaddr_ll sl; 582 | memset(&sl, 0, sizeof(sl)); 583 | sl.sll_family = AF_PACKET; 584 | sl.sll_protocol = protocol; 585 | sl.sll_ifindex = ifr.ifr_ifindex; 586 | ec = bind(cfg.fd, (struct sockaddr*)&sl, sizeof(sl)); 587 | if (ec < 0) { 588 | fprintf(stderr,"socket: %s\n", strerror(errno)); 589 | goto done; 590 | } 591 | 592 | /* set promiscuous mode to get all packets. */ 593 | struct packet_mreq m; 594 | memset(&m, 0, sizeof(m)); 595 | m.mr_ifindex = ifr.ifr_ifindex; 596 | m.mr_type = PACKET_MR_PROMISC; 597 | ec = setsockopt(cfg.fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &m, sizeof(m)); 598 | if (ec < 0) { 599 | fprintf(stderr,"setsockopt PACKET_ADD_MEMBERSHIP: %s\n", strerror(errno)); 600 | goto done; 601 | } 602 | 603 | rc = 0; 604 | 605 | done: 606 | return rc; 607 | } 608 | 609 | /* 610 | * create the transmit socket 611 | * 612 | * There are two fundamentally different types of sockets here, only one 613 | * of which is created, based on whether we are doing *encapsulated* transmit 614 | * (of the packet into a GRE tunnel that then rides over regular IP); or 615 | * "regular" packet transmission where we inject the packet to the NIC. 
616 | * 617 | * MODE SOCKET TYPE SEE ALSO 618 | * -------- ---------------- --------------- 619 | * ENCAPSULATE RAW IP ip(7) and raw(7) 620 | * REGULAR RAW PACKET packet(7) 621 | * 622 | * Within REGULAR mode we further distinguish between sendto()-based 623 | * transmit, versus packet tx ring mode. The latter uses the kernel ring 624 | * buffer mechanism described in packet_mmap.txt. 625 | * 626 | */ 627 | int setup_tx(void) { 628 | char interface[IF_NAMESIZE], *ip; 629 | int rc=-1, ec, one = 1; 630 | 631 | if (cfg.encap.enable) { 632 | 633 | /* in encapsulation mode, use raw IP socket. */ 634 | cfg.tx_fd = socket(AF_INET, SOCK_RAW, IPPROTO_GRE); 635 | if (cfg.tx_fd == -1) { 636 | fprintf(stderr,"socket: %s\n", strerror(errno)); 637 | goto done; 638 | } 639 | 640 | /* IP_HDRINCL means WE form the IP headers.. with some help; see raw(7) */ 641 | ec = setsockopt(cfg.tx_fd, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one)); 642 | if (ec < 0) { 643 | fprintf(stderr,"setsockopt IP_HDRINCL: %s\n", strerror(errno)); 644 | goto done; 645 | } 646 | 647 | /* we need the mtu of the egress NIC to implement IP fragmentation, 648 | * if needed, since raw sockets do not do that for us. to get the 649 | * interface mtu, we need the egress interface, based on routing */ 650 | ec = find_route( cfg.encap.dst.s_addr, interface); 651 | if (ec < 0) { 652 | ip = inet_ntoa(cfg.encap.dst); 653 | fprintf(stderr, "can't determine route to %s\n", ip); 654 | goto done; 655 | } 656 | 657 | cfg.mtu = get_if_mtu(interface); 658 | if (cfg.mtu < 0) { 659 | fprintf(stderr, "mtu lookup failed: %s\n", interface); 660 | goto done; 661 | } 662 | 663 | if (cfg.verbose) { 664 | ip = inet_ntoa(cfg.encap.dst); 665 | fprintf(stderr, "encapsulating to %s on interface %s mtu %d\n", 666 | ip, interface, cfg.mtu); 667 | } 668 | 669 | rc = 0; 670 | goto done; 671 | } 672 | 673 | /* 674 | * standard tx mode 675 | */ 676 | 677 | /* use a raw PACKET (link-level) socket */ 678 | cfg.tx_fd = socket(AF_PACKET, SOCK_RAW, 0 /* tx only */); 679 | if (cfg.tx_fd == -1) { 680 | fprintf(stderr,"socket: %s\n", strerror(errno)); 681 | goto done; 682 | } 683 | 684 | /* convert interface name to index (in ifr.ifr_ifindex) */ 685 | struct ifreq ifr; 686 | strncpy(ifr.ifr_name, cfg.dev, sizeof(ifr.ifr_name)); 687 | ec = ioctl(cfg.tx_fd, SIOCGIFINDEX, &ifr); 688 | if (ec < 0) { 689 | fprintf(stderr,"failed to find interface %s\n", cfg.dev); 690 | goto done; 691 | } 692 | 693 | /* bind interface for tx */ 694 | struct sockaddr_ll sl; 695 | memset(&sl, 0, sizeof(sl)); 696 | sl.sll_family = AF_PACKET; 697 | sl.sll_ifindex = ifr.ifr_ifindex; 698 | ec = bind(cfg.tx_fd, (struct sockaddr*)&sl, sizeof(sl)); 699 | if (ec < 0) { 700 | fprintf(stderr,"socket: %s\n", strerror(errno)); 701 | goto done; 702 | } 703 | 704 | /* when qdisc bypass is enabled, to quote packet_mmap.txt, "packets sent 705 | * through PF_PACKET will bypass the kernel's qdisc layer and are ... 706 | * pushed to the driver directly. Meaning, packet are not buffered, tc 707 | * disciplines are ignored, increased loss can occur and such packets are 708 | * not visible to other PF_PACKET sockets anymore." 709 | */ 710 | #ifdef PACKET_QDISC_BYPASS 711 | ec = cfg.bypass_qdisc_on_tx ? 
712 | setsockopt(cfg.tx_fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)) : 0; 713 | if (ec < 0) { 714 | fprintf(stderr,"setsockopt PACKET_QDISC_BYPASS: %s\n", strerror(errno)); 715 | goto done; 716 | } 717 | #else 718 | if (cfg.bypass_qdisc_on_tx) { 719 | fprintf(stderr,"setsockopt PACKET_QDISC_BYPASS: unsupported\n"); 720 | goto done; 721 | } 722 | #endif 723 | 724 | /* if we are using standard, sendto-based transmit, we are done */ 725 | if (cfg.use_tx_ring == 0) { 726 | rc = 0; 727 | goto done; 728 | } 729 | 730 | /************************************************************* 731 | * packet tx ring setup 732 | ************************************************************/ 733 | if (check_ring_parameters() < 0) goto done; 734 | 735 | int v = TPACKET_V2; 736 | ec = setsockopt(cfg.tx_fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v)); 737 | if (ec < 0) { 738 | fprintf(stderr,"setsockopt PACKET_VERSION: %s\n", strerror(errno)); 739 | goto done; 740 | } 741 | 742 | /* fill out the struct tpacket_req describing the ring buffer */ 743 | memset(&cfg.req, 0, sizeof(cfg.req)); 744 | cfg.req.tp_block_size = cfg.ring_block_sz; /* Min sz of contig block */ 745 | cfg.req.tp_frame_size = cfg.ring_frame_sz; /* Size of frame/snaplen */ 746 | cfg.req.tp_block_nr = cfg.ring_block_nr; /* Number of blocks */ 747 | cfg.req.tp_frame_nr = cfg.ring_frame_nr; /* Total number of frames */ 748 | describe_ring("PACKET_TX_RING"); 749 | ec = setsockopt(cfg.tx_fd, SOL_PACKET, PACKET_TX_RING, &cfg.req, sizeof(cfg.req)); 750 | if (ec < 0) { 751 | fprintf(stderr,"setsockopt PACKET_TX_RING: %s\n", strerror(errno)); 752 | goto done; 753 | } 754 | 755 | /* map the tx ring buffer into unswappable memory */ 756 | cfg.pb.n = cfg.req.tp_block_size * cfg.req.tp_block_nr; 757 | cfg.pb.d = mmap(NULL, cfg.pb.n, PROT_READ|PROT_WRITE, 758 | MAP_SHARED|MAP_LOCKED, cfg.tx_fd, 0); 759 | if (cfg.pb.d == MAP_FAILED) { 760 | fprintf(stderr,"mmap: %s\n", strerror(errno)); 761 | goto done; 762 | } 763 | 764 | rc = 0; 765 | 766 | done: 767 | return rc; 768 | } 769 | 770 | int bb_flush(struct shr *s, struct bb *b) { 771 | int rc = -1; 772 | struct iovec *iov; 773 | size_t n; 774 | ssize_t wr; 775 | 776 | n = utvector_len(b->iov); 777 | if (n == 0) { rc = 0; goto done; } 778 | iov = (struct iovec*)utvector_head(b->iov); 779 | 780 | wr = shr_writev(s, iov, n); 781 | if (wr < 0) { 782 | fprintf(stderr,"shr_write: error code %ld\n", (long)wr); 783 | goto done; 784 | } 785 | b->u = 0; 786 | utvector_clear(b->iov); 787 | 788 | rc = 0; 789 | 790 | done: 791 | return rc; 792 | } 793 | 794 | /* store the message into the batch buffer */ 795 | ssize_t bb_write(struct shr *s, struct bb *b, char *buf, size_t len) { 796 | struct iovec io; 797 | int rc = -1; 798 | 799 | if (b->n - b->u < len) { 800 | if (bb_flush(s,b) < 0) goto done; 801 | } 802 | 803 | assert((b->n - b->u) >= len); 804 | 805 | io.iov_base = &b->d[b->u]; 806 | io.iov_len = len; 807 | memcpy(io.iov_base, buf, len); 808 | utvector_push(b->iov, &io); 809 | b->u += len; 810 | 811 | rc = 0; 812 | 813 | done: 814 | return (rc < 0) ? (ssize_t)-1 : len; 815 | } 816 | 817 | /* add rx drops to the counter in the ring app data 818 | * 819 | * see /usr/include/linux/if_packet.h 820 | * see packet(7) 821 | * "Receiving statistics resets the internal counters." 
822 | * 823 | */ 824 | int update_rx_drops(void) { 825 | struct tpacket_stats stats; 826 | struct fluxcap_stats st; 827 | size_t st_sz; 828 | void *stp; 829 | int sc, rc = -1; 830 | 831 | assert(cfg.mode == mode_receive); 832 | if (cfg.losing == 0) return 0; 833 | 834 | /* packet(7): "Receiving statistics resets the internal counters." */ 835 | socklen_t len = sizeof(stats); 836 | sc = getsockopt(cfg.fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len); 837 | if (sc < 0) { 838 | fprintf(stderr,"getsockopt: %s\n", strerror(errno)); 839 | return -1; 840 | } 841 | 842 | if (cfg.verbose) { 843 | fprintf(stderr, "Received packets: %u\n", stats.tp_packets); 844 | fprintf(stderr, "Dropped packets: %u\n", stats.tp_drops); 845 | } 846 | 847 | stp = &st; 848 | st_sz = sizeof(st); 849 | 850 | sc = shr_appdata(cfg.ring, &stp, NULL, &st_sz); /* "get" */ 851 | if (sc < 0) { 852 | fprintf(stderr, "shr_appdata: error %d\n", sc); 853 | goto done; 854 | } 855 | 856 | st.rx_drops += stats.tp_drops; 857 | 858 | sc = shr_appdata(cfg.ring, NULL, stp, &st_sz); /* "set" */ 859 | if (sc < 0) { 860 | fprintf(stderr, "shr_appdata: error %d\n", sc); 861 | goto done; 862 | } 863 | 864 | cfg.losing = 0; 865 | rc = 0; 866 | 867 | done: 868 | return rc; 869 | } 870 | 871 | /* add ring read drops to the counter in the ring app data */ 872 | int update_rd_drops(void) { 873 | struct fluxcap_stats st; 874 | size_t st_sz; 875 | void *stp; 876 | int sc, rc = -1; 877 | 878 | stp = &st; 879 | st_sz = sizeof(st); 880 | 881 | sc = shr_appdata(cfg.ring, &stp, NULL, &st_sz); /* "get" */ 882 | if (sc < 0) { 883 | fprintf(stderr, "shr_appdata: error %d\n", sc); 884 | goto done; 885 | } 886 | 887 | st.rd_drops += shr_farm_stat(cfg.ring, 1); 888 | 889 | sc = shr_appdata(cfg.ring, NULL, stp, &st_sz); /* "set" */ 890 | if (sc < 0) { 891 | fprintf(stderr, "shr_appdata: error %d\n", sc); 892 | goto done; 893 | } 894 | 895 | rc = 0; 896 | 897 | done: 898 | return rc; 899 | } 900 | 901 | /* returns volatile memory - use immediately or copy. 902 | * takes bits-per-second as input, returns like "20 Mbit/s" 903 | * where "bit" is the unit, can also be "pkt" etc. 
904 | * using whatever SI unit is most readable (K,M,G,T) 905 | */ 906 | char *format_rate(unsigned long bps, char *unit) { 907 | double b = bps; 908 | char *c = ""; 909 | if (b > 1024) { b /= 1024; c = "K"; } 910 | if (b > 1024) { b /= 1024; c = "M"; } 911 | if (b > 1024) { b /= 1024; c = "G"; } 912 | if (b > 1024) { b /= 1024; c = "T"; } 913 | utstring_clear(cfg.tmp); 914 | utstring_printf(cfg.tmp, "%.0f %s%s/s", b, c, unit); 915 | return utstring_body(cfg.tmp); 916 | } 917 | 918 | /* 919 | * status_rings 920 | * 921 | * update i/o metrics for each ring 922 | * 923 | */ 924 | int status_rings(void) { 925 | unsigned long start_tick, st, ct; 926 | struct shr_stat *ss; 927 | double elapsed_sec, lg10_b; 928 | size_t sz; 929 | int rc = -1, sc, i; 930 | char *name, *c; 931 | struct shr **r; 932 | struct ww *w; 933 | UT_string *s; 934 | ssize_t nr; 935 | void *fs; 936 | 937 | /* unicode 1/8 width box progression */ 938 | char *blocks[] = { "", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"}; 939 | 940 | printf("\033[1;1H"); /* position at line 0, col 0 */ 941 | printf("\033[1m"); /* bold */ 942 | printf(" %-20s | %-12s | %-12s | %-12s \n\n", 943 | "name", "rx-rate", "rx-drop", "tx-drop"); 944 | printf("\033[m"); /* reset attributes */ 945 | 946 | /* go through the rings to obtain their in/out counters */ 947 | s = NULL; 948 | r = NULL; 949 | w = NULL; 950 | while ( (r = (struct shr**)utvector_next(cfg.watch_rings, r))) { 951 | s = (UT_string*)utvector_next(cfg.watch_names, s); 952 | w = (struct ww*)utvector_next(cfg.watch_win, w); 953 | assert(s); 954 | assert(w); 955 | 956 | name = utstring_body(s); 957 | 958 | ss = &w->win[ cfg.ticks % NWIN ].ss; 959 | sc = shr_stat(*r, ss, NULL); 960 | if (sc < 0) goto done; 961 | 962 | fs = &w->win[ cfg.ticks % NWIN ].fs; 963 | sz = sizeof(struct fluxcap_stats); 964 | sc = shr_appdata(*r, &fs, NULL, &sz); 965 | if (sc < 0) { 966 | fprintf(stderr, "shr_appdata: error %d\n", sc); 967 | goto done; 968 | } 969 | 970 | /* for this ring, compute intake & drops over the windows */ 971 | start_tick = (cfg.ticks < NWIN) ? 0 : (cfg.ticks - (NWIN - 1)); 972 | st = start_tick % NWIN; 973 | ct = cfg.ticks % NWIN; 974 | w->bw = w->win[ ct ].ss.bw - 975 | w->win[ st ].ss.bw; 976 | w->mw = w->win[ ct ].ss.mw - 977 | w->win[ st ].ss.mw; 978 | w->rx = w->win[ ct ].fs.rx_drops - 979 | w->win[ st ].fs.rx_drops; 980 | w->rd = w->win[ ct ].fs.rd_drops - 981 | w->win[ st ].fs.rd_drops; 982 | 983 | /* compute per second rates, log and strings */ 984 | elapsed_sec = (cfg.ticks - start_tick) * 1.0 / TIMER_HZ; 985 | memset( &w->ps, 0, sizeof(w->ps) ); 986 | if (elapsed_sec > 0) { 987 | w->ps.B = w->bw / elapsed_sec; 988 | w->ps.b = w->ps.B * 8; 989 | lg10_b = w->ps.b ? 
log10(w->ps.b) : 0; 990 | w->ps.lg10_b = (unsigned)floor(lg10_b); /* integer part */ 991 | w->ps.lg10_bf = (lg10_b - w->ps.lg10_b) * 8; /* fraction n/8 */ 992 | w->ps.rx = w->rx / elapsed_sec; 993 | w->ps.rd = w->rd / elapsed_sec; 994 | } 995 | 996 | /* render strings */ 997 | strncpy(w->name, name, NAME_MAX); 998 | w->name[NAME_MAX - 1] = '\0'; 999 | snprintf(w->ps.str.b, RATE_MAX, "%lu", w->ps.b); 1000 | snprintf(w->ps.str.rx, RATE_MAX, "%lu", w->ps.rx); 1001 | snprintf(w->ps.str.rd, RATE_MAX, "%lu", w->ps.rd); 1002 | 1003 | /* bits/s in */ 1004 | c = format_rate(w->ps.b, "bit"); 1005 | assert(strlen(c)+1 <= RATE_MAX); 1006 | strncpy(w->ps.str.E, c, RATE_MAX); 1007 | 1008 | /* rx (ingest) drops/s */ 1009 | c = format_rate(w->ps.rx, "bit"); 1010 | assert(strlen(c)+1 <= RATE_MAX); 1011 | strncpy(w->ps.str.X, c, RATE_MAX); 1012 | 1013 | /* rd (reader) drops/s */ 1014 | c = format_rate(w->ps.rd, "bit"); 1015 | assert(strlen(c)+1 <= RATE_MAX); 1016 | strncpy(w->ps.str.D, c, RATE_MAX); 1017 | 1018 | /* render to terminal */ 1019 | printf(" %-20.20s | %-12s | %-12s | %-12s ", 1020 | w->name, w->ps.str.E, w->ps.str.X, w->ps.str.D); 1021 | for(i=0; i < w->ps.lg10_b; i++) printf("%s", blocks[8]); 1022 | printf("%s", blocks[ w->ps.lg10_bf ]); 1023 | printf("\033[0K"); /* erase to end of line */ 1024 | printf("\n"); 1025 | } 1026 | 1027 | rc = 0; 1028 | 1029 | done: 1030 | return rc; 1031 | } 1032 | 1033 | /* work we do at 10hz 1034 | * 1035 | * normally nexp (number of expirations) is 1. 1036 | * in a busy process expirations may coalesce. 1037 | * 1038 | * we do "rainy day" cache flushes below 1039 | * so that time, like capacity, induce flush 1040 | */ 1041 | int timer_work(unsigned long nexp) { 1042 | int rc = -1, sc; 1043 | struct shr **r; 1044 | struct bb *b; 1045 | 1046 | switch(cfg.mode) { 1047 | 1048 | case mode_transmit: 1049 | sc = update_rd_drops(); 1050 | if (sc < 0) goto done; 1051 | break; 1052 | 1053 | case mode_receive: 1054 | sc = bb_flush(cfg.ring, &cfg.bb); 1055 | if (sc < 0) goto done; 1056 | sc = update_rx_drops(); 1057 | if (sc < 0) goto done; 1058 | break; 1059 | 1060 | case mode_watch: 1061 | sc = status_rings(); 1062 | if (sc < 0) goto done; 1063 | break; 1064 | 1065 | default: 1066 | break; 1067 | } 1068 | 1069 | rc = 0; 1070 | 1071 | done: 1072 | return rc; 1073 | } 1074 | 1075 | int show_stats(void) { 1076 | 1077 | return 0; 1078 | } 1079 | 1080 | int handle_signal(void) { 1081 | struct signalfd_siginfo info; 1082 | ssize_t nr; 1083 | int rc=-1; 1084 | 1085 | nr = read(cfg.signal_fd, &info, sizeof(info)); 1086 | if (nr != sizeof(info)) { 1087 | fprintf(stderr,"failed to read signal fd buffer\n"); 1088 | goto done; 1089 | } 1090 | 1091 | switch(info.ssi_signo) { 1092 | case SIGALRM: 1093 | gettimeofday(&cfg.now, NULL); 1094 | if (cfg.verbose) show_stats(); 1095 | alarm(1); 1096 | break; 1097 | default: 1098 | fprintf(stderr,"got signal %d\n", info.ssi_signo); 1099 | goto done; 1100 | break; 1101 | } 1102 | 1103 | rc = 0; 1104 | 1105 | done: 1106 | return rc; 1107 | } 1108 | 1109 | /* 1110 | * handle_timer 1111 | * 1112 | * triggered when our timerfd periodically expires. 1113 | * number of expirations is usually 1, but in a very 1114 | * busy process multiple expirations can coalesce. 
1115 | * 1116 | */ 1117 | int handle_timer(void) { 1118 | unsigned long nexp; 1119 | int rc=-1, sc; 1120 | 1121 | sc = read(cfg.timer_fd, &nexp, sizeof(nexp)); 1122 | if (sc < 0) { 1123 | fprintf(stderr,"read: %s\n", strerror(errno)); 1124 | goto done; 1125 | } 1126 | 1127 | sc = timer_work(nexp); 1128 | if (sc < 0) goto done; 1129 | 1130 | cfg.ticks++; 1131 | 1132 | rc = 0; 1133 | 1134 | done: 1135 | return rc; 1136 | } 1137 | 1138 | /* 1139 | * encapsulate_tx 1140 | * 1141 | * using a raw IP socket, transmit GRE-or-VXLAN encapsulated packets. 1142 | * if necessary, perform IP fragmentation ourselves, as this 1143 | * is not done by the OS when using raw sockets. 1144 | */ 1145 | char gbuf[MAX_PKT]; 1146 | int encapsulate_tx(char *tx, ssize_t nx) { 1147 | uint16_t encap_ethertype, more_fragments=1, fo=0, fn=0; 1148 | uint32_t ip_src, ip_dst, seqno, off; 1149 | char *g, *ethertype, ipproto; 1150 | struct sockaddr_in sin; 1151 | struct sockaddr *dst; 1152 | ssize_t nr, fl; 1153 | socklen_t sz; 1154 | 1155 | uint16_t vxlan_src_port; 1156 | uint16_t vxlan_dst_port; 1157 | uint16_t vxlan_udp_len; 1158 | uint16_t vxlan_udp_cksum; 1159 | uint8_t vxlan_flags; 1160 | uint8_t *vni_big_endian; 1161 | 1162 | assert(nx >= 14); 1163 | 1164 | ip_src = 0; 1165 | ip_dst = cfg.encap.dst.s_addr; 1166 | 1167 | sin.sin_family = AF_INET; 1168 | sin.sin_port = 0; 1169 | sin.sin_addr = cfg.encap.dst; 1170 | dst = (struct sockaddr*)&sin; 1171 | sz = sizeof(sin); 1172 | 1173 | cfg.ip_id++; 1174 | g = gbuf; 1175 | off = 0; 1176 | 1177 | /* use IPPROTO_GRE (47) for gre/gretap or IPPROTO_UDP (17) for vxlan */ 1178 | ipproto = (cfg.encap.mode == mode_vxlan) ? IPPROTO_UDP : IPPROTO_GRE; 1179 | 1180 | /* construct 20-byte IP header. 1181 | * NOTE: some zeroed header fields are filled out for us, when we send this 1182 | * packet; particularly, checksum, src IP; ID and total length. see raw(7). 1183 | */ 1184 | g[0] = 4 << 4; /* IP version goes in MSB (upper 4 bits) of the first byte */ 1185 | g[0] |= 5; /* IP header length (5 * 4 = 20 bytes) in lower 4 bits */ 1186 | g[1] = 0; /* DSCP / ECN */ 1187 | g[2] = 0; /* total length (upper byte) (see NOTE) */ 1188 | g[3] = 0; /* total length (lower byte) (see NOTE) */ 1189 | g[4] = (cfg.ip_id & 0xff00) >> 8; /* id (upper byte); for frag reassembly */ 1190 | g[5] = (cfg.ip_id & 0x00ff); /* id (lower byte); for frag reassembly */ 1191 | g[6] = 0; /* 0 DF MF flags and upper bits of frag offset */ 1192 | g[7] = 0; /* lower bits of frag offset */ 1193 | g[8] = 255; /* TTL */ 1194 | g[9] = ipproto; /* IPPROTO_GRE or IPPROTO_UDP (VXLAN) */ 1195 | g[10] = 0; /* IP checksum (high byte) (see NOTE) */ 1196 | g[11] = 0; /* IP checksum (low byte) (see NOTE) */ 1197 | memcpy(&g[12], &ip_src, sizeof(ip_src)); /* IP source (see NOTE) */ 1198 | memcpy(&g[16], &ip_dst, sizeof(ip_dst)); /* IP destination */ 1199 | 1200 | g += 20; 1201 | 1202 | /* GRE or UDP header starts */ 1203 | 1204 | switch(cfg.encap.mode) { 1205 | case mode_gre: 1206 | memset(g, 0, 2); /* zero first two bytes of GRE header */ 1207 | g[0] |= (cfg.encap.key ? (1U << 5) : 0); /* key bit */ 1208 | g += 2; 1209 | ethertype = &tx[12]; /* copy ethertype from packet into GRE header */ 1210 | memcpy(g, ethertype, sizeof(uint16_t)); 1211 | g += 2; 1212 | if (cfg.encap.key) { 1213 | memcpy(g, &cfg.encap.key, 4); 1214 | g += 4; 1215 | } 1216 | nx -= 14; tx += 14; // elide original MACs and ethertype! 
1217 | assert(nx <= sizeof(gbuf)-(g-gbuf)); 1218 | memcpy(g, tx, nx); 1219 | g += nx; 1220 | nx = g-gbuf; 1221 | break; 1222 | case mode_gretap: 1223 | memset(g, 0, 2); /* zero first two bytes of GRE header */ 1224 | g[0] |= (cfg.encap.key ? (1U << 5) : 0); /* key bit */ 1225 | g += 2; 1226 | encap_ethertype = htons(0x6558); /* transparent ethernet bridging */ 1227 | memcpy(g, &encap_ethertype, sizeof(uint16_t)); 1228 | g += 2; 1229 | if (cfg.encap.key) { 1230 | memcpy(g, &cfg.encap.key, 4); 1231 | g += 4; 1232 | } 1233 | assert(nx <= sizeof(gbuf)-(g-gbuf)); 1234 | memcpy(g, tx, nx); 1235 | g += nx; 1236 | nx = g-gbuf; 1237 | break; 1238 | case mode_vxlan: 1239 | /* 8 byte UDP header */ 1240 | vxlan_src_port = htons(9999); /* arbitrary */ 1241 | vxlan_dst_port = htons(4789); /* IANA assigned VXLAN dest port */ 1242 | vxlan_udp_len = htons(nx+8+8); /* payload + VXLAN header + UDP header */ 1243 | vxlan_udp_cksum = htons(0); 1244 | memcpy(g+0, &vxlan_src_port, 2); 1245 | memcpy(g+2, &vxlan_dst_port, 2); 1246 | memcpy(g+4, &vxlan_udp_len, 2); 1247 | memcpy(g+6, &vxlan_udp_cksum, 2); 1248 | g += 8; 1249 | /* 8 byte VXLAN header */ 1250 | vxlan_flags = 0x8; /* set I flag only */ 1251 | memcpy(g+0, &vxlan_flags, 1); 1252 | memset(g+1, 0, 7); /* clear reserved bits */ 1253 | /* vxlan VNI is 24 bit. copy the three LS bytes 1254 | * of cfg.encap.key. it's already in net order */ 1255 | vni_big_endian = ((uint8_t*)&cfg.encap.key) + 1; 1256 | memcpy(g+4, vni_big_endian, 3); 1257 | g += 8; 1258 | assert(nx <= sizeof(gbuf)-(g-gbuf)); 1259 | memcpy(g, tx, nx); 1260 | g += nx; 1261 | nx = g-gbuf; 1262 | break; 1263 | default: 1264 | assert(0); 1265 | break; 1266 | } 1267 | 1268 | /* 1269 | * send IP packet, performing fragmentation if greater than mtu 1270 | */ 1271 | do { 1272 | 1273 | more_fragments = (nx > cfg.mtu) ? 1 : 0; 1274 | assert((off & 0x7) == 0); 1275 | fo = off / 8; 1276 | 1277 | gbuf[6] = more_fragments ? (1 << 5) : 0; /* 0 DF [MF] flag */ 1278 | gbuf[6] |= (fo & 0x1f00) >> 8; /* upper bits of frag offset */ 1279 | gbuf[7] = fo & 0x00ff; /* lower bits of frag offset */ 1280 | 1281 | /* choose fragment length so it's below MTU and so the payload 1282 | * length after 20 byte header is a multiple of 8 as required */ 1283 | if (more_fragments) 1284 | fl = ((cfg.mtu - 20) & ~7U) + 20; 1285 | else 1286 | fl = nx; 1287 | 1288 | nr = sendto(cfg.tx_fd, gbuf, fl, 0, dst, sz); 1289 | if (nr != fl) { 1290 | fprintf(stderr,"sendto: %s\n", (nr<0) ? 1291 | strerror(errno) : "incomplete"); 1292 | return -1; 1293 | } 1294 | 1295 | /* keeping 20-byte IP header, slide next fragment payload */ 1296 | if (more_fragments) { 1297 | assert(fl > 20); 1298 | memmove(&gbuf[20], &gbuf[fl], nx - fl); 1299 | off += (fl - 20); 1300 | nx -= (fl - 20); 1301 | } 1302 | 1303 | } while (more_fragments); 1304 | 1305 | return 0; 1306 | } 1307 | 1308 | /* inject four bytes to the ethernet frame with an 802.1q vlan tag. 
1309 | * note if this makes MTU exceeded it may result in sendto error */ 1310 | char buf[MAX_PKT]; 1311 | char vlan_tag[VLAN_LEN] = {0x81, 0x00, 0x00, 0x00}; 1312 | char *inject_vlan(char *tx, ssize_t *nx, uint16_t vlan) { 1313 | if (((*nx) + 4) > MAX_PKT) return NULL; 1314 | if ((*nx) <= MACS_LEN) return NULL; 1315 | /* prepare 802.1q tag vlan portion in network order */ 1316 | uint16_t v = htons(vlan); 1317 | memcpy(&vlan_tag[2], &v, sizeof(v)); 1318 | /* copy MAC's from original packet, inject 802.1q, copy packet */ 1319 | memcpy(buf, tx, MACS_LEN); 1320 | memcpy(buf+MACS_LEN, vlan_tag, VLAN_LEN); 1321 | memcpy(buf+MACS_LEN+VLAN_LEN, tx + MACS_LEN, (*nx) - MACS_LEN); 1322 | *nx += 4; 1323 | return buf; 1324 | } 1325 | 1326 | /* apply filtering to a rx or tx packet */ 1327 | int keep_packet(char *tx, size_t nx) { 1328 | uint16_t vlan; 1329 | int r; 1330 | 1331 | /* apply vlan test, if enabled */ 1332 | if (cfg.pass_vlan) { 1333 | 1334 | if (nx < MACS_LEN + VLAN_LEN) 1335 | return 0; 1336 | 1337 | if (memcmp(&tx[MACS_LEN], "\x81\x00", 2)) 1338 | return 0; 1339 | 1340 | memcpy(&vlan, &tx[MACS_LEN+2], sizeof(vlan)); 1341 | vlan = ntohs(vlan); 1342 | vlan &= 0x0fff; 1343 | 1344 | if (vlan != cfg.pass_vlan) 1345 | return 0; 1346 | } 1347 | 1348 | /* apply random drop, if enabled */ 1349 | if (cfg.drop_pct != 0) { 1350 | r = rand(); 1351 | if ((r * 100.0 / RAND_MAX) < cfg.drop_pct) 1352 | return 0; 1353 | } 1354 | 1355 | return 1; 1356 | } 1357 | 1358 | /* tx-ring mode only: start transmission from the ring */ 1359 | int initiate_transmit(void) { 1360 | 1361 | assert(cfg.use_tx_ring); 1362 | 1363 | /* initiate transmit, without waiting for completion */ 1364 | if (send(cfg.tx_fd, NULL, 0, MSG_DONTWAIT) < 0) { 1365 | 1366 | /* if tx is underway or the kernel can't sink any more data we can get 1367 | * "resource temporarily unavailable". solution: start a blocking tx */ 1368 | if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { 1369 | 1370 | if (send(cfg.tx_fd, NULL, 0, 0) < 0) { 1371 | fprintf(stderr,"blocking transmit failed: %s\n", strerror(errno)); 1372 | return -1; 1373 | } 1374 | 1375 | } else { 1376 | 1377 | /* any other kind of send error is fatal */ 1378 | fprintf(stderr,"failed to initiate transmit: %s\n", strerror(errno)); 1379 | return -1; 1380 | } 1381 | } 1382 | 1383 | return 0; 1384 | } 1385 | 1386 | /* tx-ring mode only: poll kernel for space availability in tx ring */ 1387 | int wait_for_tx_space(void) { 1388 | int rc, timeout = 1000; /* milliseconds */ 1389 | 1390 | assert(cfg.use_tx_ring); 1391 | 1392 | struct pollfd pfd; 1393 | pfd.fd = cfg.tx_fd; 1394 | pfd.revents = 0; 1395 | pfd.events = POLLOUT; 1396 | 1397 | rc = poll(&pfd, 1, timeout); 1398 | if (rc <= 0) { 1399 | fprintf(stderr, "poll for tx space: %s\n", rc ? 
strerror(errno) : "timeout");
1400 | return -1;
1401 | }
1402 |
1403 | return 0;
1404 | }
1405 |
1406 | int transmit_packets(void) {
1407 | int rc=-1, n, len, nq=0, failsafe=0;
1408 | struct sockaddr *dst = NULL;
1409 | struct sockaddr_in sin;
1410 | ssize_t nr,nt,nx;
1411 | struct iovec *io;
1412 | socklen_t sz = 0;
1413 | uint8_t *mac;
1414 | size_t nio;
1415 |
1416 | /* get pointer to iov array to be populated */
1417 | utvector_clear(cfg.rb.iov);
1418 | nio = cfg.rb.iov->n;
1419 | io = (struct iovec*)cfg.rb.iov->d;
1420 |
1421 | /* read packets, up to BATCH_PKTS or BATCH_SIZE bytes */
1422 | nr = shr_readv(cfg.ring, cfg.rb.d, cfg.rb.n, io, &nio);
1423 | if (nr < 0) {
1424 | fprintf(stderr, "shr_readv error: %ld\n", (long)nr);
1425 | goto done;
1426 | }
1427 |
1428 | /* set number of used iov slots */
1429 | assert(nio <= cfg.rb.iov->n);
1430 | cfg.rb.iov->i = nio;
1431 |
1432 | /* iterate over packets obtained in shr_readv */
1433 | io = NULL;
1434 | while ( (io = utvector_next(cfg.rb.iov, io))) {
1435 |
1436 | char *tx = io->iov_base; /* packet */
1437 | nx = io->iov_len; /* length */
1438 | if (keep_packet(tx, nx) == 0) continue;
1439 |
1440 | /* inject 802.1q tag if requested */
1441 | if (cfg.vlan) tx = inject_vlan(tx,&nx,cfg.vlan);
1442 | if (tx == NULL) {
1443 | fprintf(stderr, "vlan tag injection failed\n");
1444 | goto done;
1445 | }
1446 |
1447 | /* truncate outgoing packet if requested */
1448 | if (cfg.size && (nx > cfg.size)) nx = cfg.size;
1449 |
1450 | /* trim N bytes from frame end if requested. */
1451 | if (cfg.tail && (nx > cfg.tail)) nx -= cfg.tail;
1452 |
1453 | /* wrap encapsulation around it, if enabled */
1454 | if (cfg.encap.enable) {
1455 |
1456 | if (encapsulate_tx(tx, nx)) goto done;
1457 | continue;
1458 |
1459 | } else if (cfg.use_tx_ring == 0) {
1460 |
1461 | nt = sendto(cfg.tx_fd, tx, nx, 0, dst, sz);
1462 | if (nt != nx) {
1463 | fprintf(stderr,"sendto: %s\n", (nt<0) ?
1464 | strerror(errno) : "incomplete");
1465 | goto done;
1466 | }
1467 |
1468 | continue;
1469 | }
1470 |
1471 | /*************************************************************
1472 | * packet tx ring mode below
1473 | ************************************************************/
1474 |
1475 | assert(cfg.encap.enable == 0);
1476 | assert(cfg.use_tx_ring);
1477 |
1478 | /* copy packet into kernel tx ring
1479 | *
1480 | * each packet occupies a slot. a tpacket2_hdr precedes the packet.
1481 | * once we initiate transmission from the ring, the tx progresses
1482 | * in kernel space. later, when we come round to the slot again,
1483 | * we can check its transmission status or outcome.
1484 | *
1485 | * a tx error, due to a malformed packet, causes the kernel to stop
1486 | * transmitting from the ring. it sets TP_STATUS_WRONG_FORMAT on the
1487 | * packet. normally, we treat this condition fatally. if the "keep
1488 | * going" option is enabled, tx errors are suppressed and ignored.
1489 |
1490 | * when we are about to write a packet into the slot, we may find
1491 | * the slot is in this tx error state due to the previous packet.
1492 | * or, we may find that the slot is still in-use. due to our
1493 | * independence from the actual transmission process, we only learn
1494 | * of these states when we come round to the slot. it is normal to
1495 | * encounter uninitiated or in-progress transmission, and we wait
1496 | * for availability in the ring in that case.
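 * slot status lifecycle, as handled below: TP_STATUS_AVAILABLE (slot free for
 * us to fill) -> TP_STATUS_SEND_REQUEST (set by us) -> TP_STATUS_SENDING (set
 * by the kernel) -> TP_STATUS_AVAILABLE again, or TP_STATUS_WRONG_FORMAT on a
 * tx error.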
1497 | *
1498 | * for all its sophistication, the ring-based transmitter had
1499 | * lower performance in my tests than the sendto-based transmitter.
1500 | * this may be due to the extra copying we do to populate the ring.
1501 | * this is why the sendto-transmitter is used by default.
1502 | *
1503 | */
1504 |
1505 | /* get address of the current slot (metadata header, pad, packet) */
1506 | uint8_t *cur = cfg.pb.d + cfg.ring_curr_idx * cfg.ring_frame_sz;
1507 |
1508 | /* struct tpacket2_hdr is defined in /usr/include/linux/if_packet.h */
1509 | struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)cur;
1510 |
1511 | retry_slot:
1512 |
1513 | if (failsafe++ > 1) {
1514 | fprintf(stderr, "internal error awaiting tx ring availability\n");
1515 | goto done;
1516 | }
1517 |
1518 | /* did the slot have a previous error? */
1519 | if (hdr->tp_status == TP_STATUS_WRONG_FORMAT) {
1520 | fprintf(stderr,"tx error- frame dump follows; exiting.\n");
1521 | hexdump(cur, cfg.ring_frame_sz);
1522 | goto done;
1523 | }
1524 |
1525 | /* is the slot in-use, in the midst of transmission? */
1526 | if (hdr->tp_status == TP_STATUS_SENDING) {
1527 | if (wait_for_tx_space() < 0) goto done;
1528 | goto retry_slot;
1529 | }
1530 |
1531 | /* is the slot in-use, awaiting transmit to begin? this can happen if
1532 | * we loop around the ring, before initiating transmit (say, if the batch
1533 | * size exceeds the ring size). it can also happen if we did initiate tx,
1534 | * if the kernel has yet to get to this packet and flag it sending.
1535 | */
1536 | if (hdr->tp_status == TP_STATUS_SEND_REQUEST) {
1537 | if (initiate_transmit() < 0) goto done;
1538 | if (wait_for_tx_space() < 0) goto done;
1539 | goto retry_slot;
1540 | }
1541 |
1542 | /* if we got here, the slot _must_ be available. right? */
1543 | if (hdr->tp_status != TP_STATUS_AVAILABLE) {
1544 | fprintf(stderr,"tx slot: unexpected flag %d\n", hdr->tp_status);
1545 | goto done;
1546 | }
1547 |
1548 | failsafe = 0; /* reset loop safeguard */
1549 |
1550 | /* put packet's link level header (first MAC) after the tpacket2_hdr plus
1551 | * alignment gap. (struct sockaddr_ll is not in the slot during tx).
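 * slot layout: [struct tpacket2_hdr][pad to TPACKET_ALIGNMENT][frame bytes],
 * which is why the packet start and usable length are computed with
 * TPACKET_ALIGN() below.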
*/ 1552 | mac = (uint8_t*)TPACKET_ALIGN(((unsigned long)cur) + 1553 | sizeof(struct tpacket2_hdr)); 1554 | len = cfg.ring_frame_sz - (mac - cur); 1555 | if (nx > len) { 1556 | fprintf(stderr, "packet length %ld exceeds effective frame_size %d\n", 1557 | (long)nx, len); 1558 | goto done; 1559 | } 1560 | 1561 | /* populate packet proper */ 1562 | memcpy(mac, tx, nx); 1563 | hdr->tp_len = nx; 1564 | hdr->tp_status = TP_STATUS_SEND_REQUEST; 1565 | nq++; 1566 | 1567 | /* point to next slot */ 1568 | cfg.ring_curr_idx = (cfg.ring_curr_idx + 1) % cfg.ring_frame_nr; 1569 | } 1570 | 1571 | /* if packets were queued in to kernel tx ring, initiate transmit */ 1572 | if (nq && (initiate_transmit() < 0)) goto done; 1573 | 1574 | rc = 0; 1575 | 1576 | done: 1577 | return rc; 1578 | } 1579 | 1580 | int receive_packets(void) { 1581 | int rc=-1, sw, wire_vlan, form_vlan, keep; 1582 | ssize_t nr,nt,nx; 1583 | struct iovec iov; 1584 | char *tx; 1585 | 1586 | while (1) { 1587 | 1588 | /* get address of the current slot (metadata header, pad, packet) */ 1589 | uint8_t *cur = cfg.pb.d + cfg.ring_curr_idx * cfg.ring_frame_sz; 1590 | 1591 | /* these structs start the frame, see /usr/include/linux/if_packet.h */ 1592 | struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)cur; 1593 | struct sockaddr_ll *sll = (struct sockaddr_ll *)(cur + TPACKET2_HDRLEN); 1594 | 1595 | /* check if the packet is ready. this is how we break the loop */ 1596 | if ((hdr->tp_status & TP_STATUS_USER) == 0) break; 1597 | 1598 | /* note packet drop condition */ 1599 | if (hdr->tp_status & TP_STATUS_LOSING) cfg.losing = 1; 1600 | 1601 | tx = cur + hdr->tp_mac; 1602 | nx = hdr->tp_snaplen; 1603 | 1604 | /* upon receipt the wire vlan (if any) has been pulled out for us */ 1605 | wire_vlan = (hdr->tp_status & TP_STATUS_VLAN_VALID) ? 1606 | (hdr->tp_vlan_tci & 0xfff) : 0; 1607 | form_vlan = cfg.vlan ? cfg.vlan : wire_vlan; 1608 | if (cfg.strip_vlan) form_vlan = 0; 1609 | 1610 | /* inject 802.1q tag if requested */ 1611 | if (form_vlan) tx = inject_vlan(tx,&nx,form_vlan); 1612 | if (tx == NULL) { 1613 | fprintf(stderr, "vlan tag injection failed\n"); 1614 | goto done; 1615 | } 1616 | 1617 | /* truncate packet if requested */ 1618 | if (cfg.size && (nx > cfg.size)) nx = cfg.size; 1619 | 1620 | /* trim N bytes from frame end if requested. */ 1621 | if (cfg.tail && (nx > cfg.tail)) nx -= cfg.tail; 1622 | 1623 | keep = keep_packet(tx,nx); 1624 | 1625 | /* push into batch buffer */ 1626 | sw = keep ? bb_write(cfg.ring, &cfg.bb, tx, nx) : 0; 1627 | if (sw < 0) { 1628 | fprintf(stderr, "bb_write (%lu bytes): error code %d\n", (long)nx, sw); 1629 | goto done; 1630 | } 1631 | 1632 | /* return the packet by assigning status word TP_STATUS_KERNEL (0) */ 1633 | hdr->tp_status = TP_STATUS_KERNEL; 1634 | 1635 | /* next packet */ 1636 | cfg.ring_curr_idx = (cfg.ring_curr_idx + 1) % cfg.ring_frame_nr; 1637 | } 1638 | 1639 | rc = 0; 1640 | 1641 | done: 1642 | return rc; 1643 | } 1644 | 1645 | /* decode the gre packet into its fields. 1646 | * input pkt starts with outer IP header. 1647 | * fields are returned in network order! 1648 | * fields are zeroed if not present 1649 | * on decoding failure, returns -1. 
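 * GRE header layout (RFC 2784/2890): 2 flag/version bytes and a 2-byte protocol
 * type, followed by optional checksum+reserved (4 bytes), key (4 bytes) and
 * sequence number (4 bytes) fields, present when the C (0x80), K (0x20) and
 * S (0x10) bits of the first byte are set.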
1650 | * returns 0 on success 1651 | */ 1652 | #define GRE_MIN_HDR 4 1653 | #define GRE_CHECKSUM_LEN 2 1654 | #define GRE_RESERVED1_LEN 2 1655 | #define GRE_KEY_LEN 4 1656 | #define GRE_SEQNO_LEN 4 1657 | int decode_gre(char *pkt, ssize_t nr, uint16_t *csum, uint32_t *key, 1658 | uint32_t *seqno, char **payload, size_t *plen) { 1659 | int has_key, has_checksum, has_seqno, ko, co, so, po, ip_hdr_len; 1660 | uint8_t ip_proto; 1661 | 1662 | *key = 0; 1663 | *seqno = 0; 1664 | *csum = 0; 1665 | *payload = NULL; 1666 | *plen = 0; 1667 | 1668 | assert(nr > 0); 1669 | ip_hdr_len = (pkt[0] & 0x0f) * 4; 1670 | 1671 | if (nr < ip_hdr_len + GRE_MIN_HDR) 1672 | return -1; 1673 | 1674 | ip_proto = pkt[9]; 1675 | if (ip_proto != IPPROTO_GRE) 1676 | return -1; 1677 | 1678 | has_key = pkt[ip_hdr_len] & (1U << 5); 1679 | has_checksum = pkt[ip_hdr_len] & (1U << 7); 1680 | has_seqno = pkt[ip_hdr_len] & (1U << 4); 1681 | 1682 | if (has_checksum) { 1683 | co = ip_hdr_len + GRE_MIN_HDR; 1684 | if (co + GRE_CHECKSUM_LEN > nr) 1685 | return -1; 1686 | memcpy(csum, pkt + co, GRE_CHECKSUM_LEN); 1687 | } 1688 | 1689 | if (has_key) { 1690 | ko = ip_hdr_len + GRE_MIN_HDR 1691 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0); 1692 | if (ko + GRE_KEY_LEN > nr) 1693 | return -1; 1694 | memcpy(key, pkt + ko, GRE_KEY_LEN); 1695 | } 1696 | 1697 | if (has_seqno) { 1698 | so = ip_hdr_len + GRE_MIN_HDR + 1699 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 1700 | + (has_key ? GRE_KEY_LEN : 0); 1701 | if (so + GRE_SEQNO_LEN > nr) 1702 | return -1; 1703 | memcpy(seqno, pkt + so, GRE_SEQNO_LEN); 1704 | } 1705 | 1706 | po = ip_hdr_len + GRE_MIN_HDR + 1707 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 1708 | + (has_key ? GRE_KEY_LEN : 0) 1709 | + (has_seqno ? GRE_SEQNO_LEN : 0); 1710 | 1711 | *plen = nr - po; 1712 | *payload = pkt + po; 1713 | return 0; 1714 | } 1715 | 1716 | int handle_grerx(void) { 1717 | int i, rc=-1, sc, keep, nmsgs; 1718 | char *data, *tx, *pkt; 1719 | size_t dlen, nx, len; 1720 | uint32_t seqno, key; 1721 | uint16_t csum; 1722 | ssize_t nr; 1723 | 1724 | nmsgs = recvmmsg(cfg.rx_fd, cfg.msgv, BATCH_PKTS, MSG_WAITFORONE, NULL); 1725 | if (nmsgs < 0) { 1726 | fprintf(stderr, "recvmmsg: %s\n", strerror(errno)); 1727 | goto done; 1728 | } 1729 | 1730 | if (cfg.verbose) 1731 | fprintf(stderr, "recvmmsg: %d messages received\n", nmsgs); 1732 | 1733 | for(i=0; i < nmsgs; i++) { 1734 | 1735 | pkt = cfg.msgv[i].msg_hdr.msg_iov[0].iov_base; 1736 | len = cfg.msgv[i].msg_len; 1737 | sc = decode_gre(pkt, len, &csum, &key, &seqno, &data, &dlen); 1738 | if (sc < 0) { 1739 | rc = 0; /* ignore and drop bad packets */ 1740 | goto done; 1741 | } 1742 | 1743 | /* test key matches desired key */ 1744 | if (cfg.encap.key != key) { 1745 | rc = 0; 1746 | goto done; 1747 | } 1748 | 1749 | /* decapsulate packet, advance over GRE header */ 1750 | tx = data; 1751 | nx = dlen; 1752 | if (nx == 0) { 1753 | rc = 0; 1754 | goto done; 1755 | } 1756 | 1757 | /* inject 802.1q tag if requested */ 1758 | if (cfg.vlan) tx = inject_vlan(tx,&nx,cfg.vlan); 1759 | if (tx == NULL) { 1760 | fprintf(stderr, "vlan tag injection failed\n"); 1761 | goto done; 1762 | } 1763 | 1764 | /* truncate packet if requested */ 1765 | if (cfg.size && (nx > cfg.size)) nx = cfg.size; 1766 | 1767 | /* trim N bytes from frame end if requested. */ 1768 | if (cfg.tail && (nx > cfg.tail)) nx -= cfg.tail; 1769 | 1770 | keep = keep_packet(tx,nx); 1771 | 1772 | /* push into batch buffer */ 1773 | sc = keep ? 
bb_write(cfg.ring, &cfg.bb, tx, nx) : 0; 1774 | if (sc < 0) { 1775 | fprintf(stderr, "bb_write (%lu bytes): error code %d\n", (long)nx, sc); 1776 | goto done; 1777 | } 1778 | } 1779 | 1780 | rc = 0; 1781 | 1782 | done: 1783 | return rc; 1784 | } 1785 | 1786 | int handle_io(void) { 1787 | int rc = -1; 1788 | 1789 | switch(cfg.mode) { 1790 | case mode_receive: 1791 | rc = receive_packets(); 1792 | break; 1793 | case mode_transmit: 1794 | rc = transmit_packets(); 1795 | break; 1796 | default: 1797 | assert(0); 1798 | break; 1799 | } 1800 | 1801 | return rc; 1802 | } 1803 | 1804 | size_t kmgt(char *optarg) { 1805 | size_t size=0; 1806 | char unit; 1807 | 1808 | int n = sscanf(optarg, "%lu%c", &size, &unit); 1809 | if (n == 0) usage(); 1810 | if (n == 2) { 1811 | switch (unit) { 1812 | case 't': case 'T': size *= 1024; /* fall through */ 1813 | case 'g': case 'G': size *= 1024; /* fall through */ 1814 | case 'm': case 'M': size *= 1024; /* fall through */ 1815 | case 'k': case 'K': size *= 1024; break; 1816 | default: usage(); break; 1817 | } 1818 | } 1819 | 1820 | return size; 1821 | } 1822 | 1823 | int parse_encap(char *opt) { 1824 | int rc = -1, len; 1825 | char *mode=opt,*name=opt, *colon; 1826 | struct hostent *e; 1827 | 1828 | colon = strchr(mode,':'); 1829 | if (colon) *colon = '\0'; 1830 | else if (cfg.mode == mode_transmit) { 1831 | fprintf(stderr,"encapsulation syntax error\n"); 1832 | goto done; 1833 | } 1834 | 1835 | if (!strcmp(mode,"gre")) cfg.encap.mode = mode_gre; 1836 | else if (!strcmp(mode,"gretap")) cfg.encap.mode = mode_gretap; 1837 | else if (!strcmp(mode,"vxlan")) cfg.encap.mode = mode_vxlan; 1838 | else { 1839 | fprintf(stderr,"invalid encapsulation mode\n"); 1840 | goto done; 1841 | } 1842 | 1843 | /* name is destination hostname (GRE tx mode), 1844 | or local IP to bind (GRE rx mode) */ 1845 | if (colon) { 1846 | name = colon+1; 1847 | e = gethostbyname(name); 1848 | if (e == NULL) { 1849 | fprintf(stderr, "gethostbyname: %s: %s\n", name, hstrerror(h_errno)); 1850 | goto done; 1851 | } 1852 | 1853 | if (e->h_length != sizeof(cfg.encap.dst)) { 1854 | fprintf(stderr, "DNS result size mismatch\n"); 1855 | goto done; 1856 | } 1857 | 1858 | memcpy(&cfg.encap.dst.s_addr, e->h_addr, e->h_length); 1859 | } 1860 | 1861 | rc = 0; 1862 | 1863 | done: 1864 | return rc; 1865 | } 1866 | 1867 | int main(int argc, char *argv[]) { 1868 | int rc = -1, n, opt, ring_mode, init_mode, pos, sc; 1869 | struct epoll_event ev; 1870 | cfg.prog = argv[0]; 1871 | struct shr *r; 1872 | struct bb *b; 1873 | char *file; 1874 | void **p; 1875 | 1876 | cfg.watch_rings = utvector_new(utmm_ptr); 1877 | cfg.watch_names = utvector_new(utstring_mm); 1878 | cfg.watch_win = utvector_new(&ww_mm); 1879 | utstring_new(cfg.tmp); 1880 | utmm_init(&bb_mm, &cfg.bb, 1); 1881 | utmm_init(&bb_mm, &cfg.rb, 1); 1882 | utmm_init(&bb_mm, &cfg.gb, 1); 1883 | 1884 | while ( (opt=getopt(argc,argv,"t:r:c:vi:hV:s:D:E:B:S:Z:Qd:K:Rqkf:")) != -1) { 1885 | switch(opt) { 1886 | case 't': cfg.mode = mode_transmit; if (*optarg != 'x') usage(); break; 1887 | case 'r': cfg.mode = mode_receive; if (*optarg != 'x') usage(); break; 1888 | case 'c': cfg.mode = mode_create; if (*optarg != 'r') usage(); break; 1889 | case 'E': cfg.encap.enable=1; if (parse_encap(optarg)) usage(); break; 1890 | case 'v': cfg.verbose++; break; 1891 | case 'k': cfg.keep=1; break; 1892 | case 'V': cfg.vlan=atoi(optarg); break; 1893 | case 'D': cfg.tail=atoi(optarg); break; 1894 | case 's': cfg.size = kmgt(optarg); break; 1895 | case 'B': 
cfg.ring_block_nr=atoi(optarg); break; 1896 | case 'S': cfg.ring_block_sz = 1 << (unsigned)atoi(optarg); break; 1897 | case 'Z': cfg.ring_frame_sz=atoi(optarg); break; 1898 | case 'q': cfg.bypass_qdisc_on_tx = 1; break; 1899 | case 'Q': cfg.strip_vlan = 1; break; 1900 | case 'd': cfg.drop_pct=100-atoi(optarg); break; 1901 | case 'K': cfg.encap.key = strchr(optarg, '.') ? 1902 | inet_addr(optarg) : htonl(atoi(optarg)); 1903 | break; 1904 | case 'R': cfg.use_tx_ring = 1; break; 1905 | case 'i': if (!strcmp(optarg, "o")) cfg.mode = mode_watch; /* -io */ 1906 | else { /* -i */ 1907 | if (strlen(optarg)+1 > MAX_NIC) goto done; 1908 | strncpy(cfg.dev, optarg, MAX_NIC); 1909 | } 1910 | break; 1911 | case 'f': sc = sscanf(optarg, "vlan %d", &cfg.pass_vlan); 1912 | if (sc != 1) usage(); 1913 | break; 1914 | case 'h': default: usage(); break; 1915 | } 1916 | } 1917 | 1918 | if ((cfg.drop_pct < 0) || (cfg.drop_pct > 100)) usage(); 1919 | 1920 | /* block all signals. we take signals synchronously via signalfd */ 1921 | sigset_t all; 1922 | sigfillset(&all); 1923 | sigprocmask(SIG_SETMASK,&all,NULL); 1924 | 1925 | /* a few signals we'll accept via our signalfd */ 1926 | sigset_t sw; 1927 | sigemptyset(&sw); 1928 | for(n=0; n < sizeof(sigs)/sizeof(*sigs); n++) sigaddset(&sw, sigs[n]); 1929 | 1930 | /* create the signalfd for receiving signals */ 1931 | cfg.signal_fd = signalfd(-1, &sw, 0); 1932 | if (cfg.signal_fd == -1) { 1933 | fprintf(stderr,"signalfd: %s\n", strerror(errno)); 1934 | goto done; 1935 | } 1936 | 1937 | /* create the timerfd for receiving clock events */ 1938 | cfg.timer_fd = timerfd_create(CLOCK_MONOTONIC, 0); 1939 | if (cfg.timer_fd == -1) { 1940 | fprintf(stderr,"timerfd_create: %s\n", strerror(errno)); 1941 | goto done; 1942 | } 1943 | 1944 | /* set up for periodic timer expiration */ 1945 | sc = timerfd_settime(cfg.timer_fd, 0, &cfg.timer, NULL); 1946 | if (sc < 0) { 1947 | fprintf(stderr, "timerfd_settime: %s\n", strerror(errno)); 1948 | goto done; 1949 | } 1950 | 1951 | /* set up the epoll instance */ 1952 | cfg.epoll_fd = epoll_create(1); 1953 | if (cfg.epoll_fd == -1) { 1954 | fprintf(stderr,"epoll: %s\n", strerror(errno)); 1955 | goto done; 1956 | } 1957 | 1958 | /* add descriptors of interest */ 1959 | if (new_epoll(EPOLLIN, cfg.signal_fd)) goto done; 1960 | if (new_epoll(EPOLLIN, cfg.timer_fd)) goto done; 1961 | if (cfg.mode == mode_watch && isatty(STDIN_FILENO)) { 1962 | if (new_epoll(EPOLLIN, STDIN_FILENO)) goto done; 1963 | } 1964 | 1965 | /* in transmit mode, epoll on the ring descriptor. 1966 | * in receive mode, epoll on the raw socket. 1967 | */ 1968 | switch (cfg.mode) { 1969 | case mode_receive: 1970 | if (cfg.dev == NULL) usage(); 1971 | ring_mode = SHR_WRONLY; 1972 | cfg.file = (optind < argc) ? argv[optind++] : NULL; 1973 | cfg.ring = shr_open(cfg.file, ring_mode); 1974 | if (cfg.ring == NULL) goto done; 1975 | sc = cfg.encap.enable ? setup_rx_encap() : setup_rx(); 1976 | if (sc < 0) goto done; 1977 | sc = cfg.encap.enable ? new_epoll(EPOLLIN, cfg.rx_fd) : 1978 | new_epoll(EPOLLIN, cfg.fd); 1979 | if (sc < 0) goto done; 1980 | break; 1981 | case mode_transmit: 1982 | if ((cfg.dev == NULL) && (cfg.encap.enable == 0)) usage(); 1983 | ring_mode = SHR_RDONLY|SHR_NONBLOCK; 1984 | cfg.file = (optind < argc) ? 
argv[optind++] : NULL;
1985 | cfg.ring = shr_open(cfg.file, ring_mode);
1986 | if (cfg.ring == NULL) goto done;
1987 | cfg.fd = shr_get_selectable_fd(cfg.ring);
1988 | if (cfg.fd < 0) goto done;
1989 | if (new_epoll(EPOLLIN, cfg.fd)) goto done;
1990 | if (setup_tx() < 0) goto done;
1991 | break;
1992 | case mode_create:
1993 | if (cfg.size == 0) usage();
1994 | while (optind < argc) {
1995 | file = argv[optind++];
1996 | init_mode = SHR_DROP|SHR_FARM|SHR_MLOCK|SHR_APPDATA;
1997 | if (cfg.keep) init_mode |= SHR_KEEPEXIST;
1998 | if (cfg.verbose) fprintf(stderr,"creating %s\n", file);
1999 | sc = shr_init(file, cfg.size, init_mode, &cfg.stats, sizeof(cfg.stats));
2000 | if (sc < 0) goto done;
2001 | }
2002 | rc = 0;
2003 | goto done;
2004 | break;
2005 | case mode_watch:
2006 | while (optind < argc) {
2007 | file = argv[optind++];
2008 | utstring_clear(cfg.tmp);
2009 | utstring_printf(cfg.tmp, "%s", file);
2010 | utvector_push(cfg.watch_names, cfg.tmp);
2011 | r = shr_open(file, SHR_RDONLY);
2012 | if (r == NULL) goto done;
2013 | utvector_push(cfg.watch_rings, &r);
2014 | utvector_extend(cfg.watch_win);
2015 | }
2016 | /* clear screen, move to 0,0 */
2017 | printf("\033[2J\n");
2018 | break;
2019 | default:
2020 | usage();
2021 | }
2022 |
2023 | alarm(1);
2024 |
2025 | while (1) {
2026 | sc = epoll_wait(cfg.epoll_fd, &ev, 1, -1);
2027 | if (sc < 0) {
2028 | fprintf(stderr, "epoll: %s\n", strerror(errno));
2029 | goto done;
2030 | }
2031 |
2032 | if (sc == 0) { assert(0); goto done; }
2033 | else if (ev.data.fd == cfg.signal_fd) { if (handle_signal() < 0) goto done;}
2034 | else if (ev.data.fd == cfg.timer_fd) { if (handle_timer() < 0) goto done;}
2035 | else if (ev.data.fd == cfg.rx_fd) { if (handle_grerx() < 0) goto done;}
2036 | else if (ev.data.fd == cfg.fd) { if (handle_io() < 0) goto done; }
2037 | else if (ev.data.fd == STDIN_FILENO) { goto done; }
2038 | else {
2039 | fprintf(stderr, "error: unknown descriptor\n");
2040 | goto done;
2041 | }
2042 | }
2043 |
2044 | rc = 0;
2045 |
2046 | done:
2047 | /* in these modes, fd is internal to shr and closed by it */
2048 | if (cfg.mode != mode_transmit) {
2049 | if (cfg.fd != -1) close(cfg.fd);
2050 | }
2051 | if (cfg.tx_fd != -1) close(cfg.tx_fd);
2052 | if (cfg.rx_fd != -1) close(cfg.rx_fd);
2053 | if (cfg.signal_fd != -1) close(cfg.signal_fd);
2054 | if (cfg.timer_fd != -1) close(cfg.timer_fd);
2055 | if (cfg.epoll_fd != -1) close(cfg.epoll_fd);
2056 | utmm_fini(&bb_mm, &cfg.bb, 1);
2057 | utmm_fini(&bb_mm, &cfg.rb, 1);
2058 | utmm_fini(&bb_mm, &cfg.gb, 1);
2059 | if ((cfg.pb.n != 0) && (cfg.pb.d != MAP_FAILED)) {
2060 | munmap(cfg.pb.d, cfg.pb.n); /* cfg.pb is mode specific */
2061 | assert(cfg.pb.iov == NULL); /* iov part of pb unused */
2062 | }
2063 | if (cfg.ring) shr_close(cfg.ring);
2064 | p = NULL;
2065 | while ( (p = utvector_next(cfg.watch_rings, p))) shr_close(*p);
2066 | utvector_free(cfg.watch_rings);
2067 | utvector_free(cfg.watch_names);
2068 | utstring_free(cfg.tmp);
2069 | utvector_free(cfg.watch_win);
2070 | return rc;
2071 | }
2072 |
--------------------------------------------------------------------------------
/fluxcap.h:
--------------------------------------------------------------------------------
1 | #ifndef _FLUXCAP_H_
2 | #define _FLUXCAP_H_
3 |
4 | #define _GNU_SOURCE
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "shr.h" 29 | #include "libut.h" 30 | 31 | #define FLUXCAP_VERSION "3.2" 32 | #define MAX_NIC 64 /* longest NIC name we accept */ 33 | #define MAX_PKT 10000 /* max length of packet */ 34 | #define BATCH_PKTS 10000 /* max pkts to read in one shr_readv */ 35 | #define BATCH_SIZE (BATCH_PKTS*MAX_PKT) /* bytes buffered before shr_writev */ 36 | #define TIMER_HZ 10 /* rainy day flush/stats timer freq */ 37 | 38 | #define VLAN_LEN 4 39 | #define MACS_LEN (2*6) 40 | 41 | struct bb { 42 | size_t n; /* batch buffer size */ 43 | size_t u; /* batch buffer used */ 44 | char *d; /* batch buffer */ 45 | UT_vector /* of struct iovec */ *iov; 46 | }; 47 | 48 | struct encap { /* this is used in GRE encapsulation mode */ 49 | int enable; 50 | enum {mode_gre=0, mode_gretap, mode_vxlan} mode; 51 | struct in_addr dst; /* used as GRE TX dest IP, or GRE RX local IP */ 52 | uint32_t key; /* if non-zero, indicates RX/TX GRE key, or VXLAN VNI */ 53 | }; 54 | 55 | struct fluxcap_stats { 56 | size_t rx_drops; /* mode_receive drops in rx/pre-ring reported from kernel */ 57 | size_t rd_drops; /* mode_transmit/tee drops due to reader lag on shr ring */ 58 | }; 59 | 60 | /* watch window - for tracking rates over NWIN observations */ 61 | #define NWIN 100 62 | #define RATE_MAX 20 63 | #define NAME_MAX 80 64 | struct ww { 65 | char name[NAME_MAX]; 66 | 67 | struct { 68 | struct fluxcap_stats fs; 69 | struct shr_stat ss; 70 | } win[NWIN]; 71 | 72 | /* resulting delta from newest to oldest window */ 73 | unsigned long mw; /* packets in */ 74 | unsigned long bw; /* bytes in */ 75 | unsigned long rx; /* packet drops (tpacket rx) */ 76 | unsigned long rd; /* packet drops (reader lag) */ 77 | 78 | /* per second rates */ 79 | struct { 80 | unsigned long B; /* bytes in */ 81 | unsigned long b; /* bits in */ 82 | unsigned lg10_b; /* integer floor(base-10-log) of b */ 83 | unsigned lg10_bf;/* fraction part of ^ scaled to [0-8) */ 84 | unsigned long rx; /* packet drops (tpacket rx) */ 85 | unsigned long rd; /* packet drops (reader lag) */ 86 | 87 | /* per second rates as strings */ 88 | struct { 89 | char b[ RATE_MAX ]; /* bits per second */ 90 | char rx[RATE_MAX ]; /* drop-rx per second */ 91 | char rd[RATE_MAX ]; /* drop-rd per second */ 92 | char E[ RATE_MAX ]; /* bits per second (human units e.g. Mbit/s) */ 93 | char X[ RATE_MAX ]; /* drop-rx per second (human units e.g. Mbit/s) */ 94 | char D[ RATE_MAX ]; /* drop-rd per second (human units e.g. 
Mbit/s) */ 95 | } str; 96 | } ps; 97 | }; 98 | 99 | 100 | #endif 101 | -------------------------------------------------------------------------------- /lib/.gitignore: -------------------------------------------------------------------------------- 1 | libtpl.a 2 | libut.a 3 | -------------------------------------------------------------------------------- /lib/Makefile.am: -------------------------------------------------------------------------------- 1 | srcdir = @srcdir@ 2 | 3 | SUBDIRS= 4 | 5 | # build these external libraries as convenience libs 6 | 7 | noinst_LIBRARIES = libut.a 8 | 9 | libut_a_CFLAGS = -Wall -Wextra 10 | libut_a_CPPFLAGS = -I$(srcdir)/libut/include 11 | libut_a_SOURCES = libut/src/libut.c \ 12 | libut/src/utvector.c \ 13 | libut/src/utmm.c \ 14 | libut/src/ringbuf.c 15 | 16 | -------------------------------------------------------------------------------- /respan.c: -------------------------------------------------------------------------------- 1 | /* 2 | * respan: a tool to receive or retransmit a network tap 3 | * 4 | * © 2019 The Johns Hopkins University Applied Physics Laboratory LLC. 5 | * All Rights Reserved. 6 | * 7 | * AUTHOR: Troy D. Hanson 8 | * LICENSE: MIT 9 | * PACKAGE: fluxcap 10 | * 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "respan.h" 34 | 35 | struct { 36 | char *prog; 37 | int verbose; 38 | char *dir; 39 | time_t now; 40 | int rotate_sec; 41 | int maxsz_mb; 42 | int epoll_fd; 43 | int signal_fd; 44 | int rx_fd; 45 | io_mode from; 46 | io_mode to; 47 | char *file_pat; 48 | char pkt[MAX_PKT]; 49 | /* savefile mapping */ 50 | char *sv_addr; 51 | size_t sv_len; 52 | int sv_fd; 53 | time_t sv_ts; /* time reflected in name of savefile */ 54 | int sv_seq; /* sequence number of save file within ts second */ 55 | off_t sv_cur; /* next write offset within save file */ 56 | } cfg = { 57 | .rx_fd = -1, 58 | .epoll_fd = -1, 59 | .signal_fd = -1, 60 | .rotate_sec = 5, 61 | .maxsz_mb = 10, 62 | .dir = ".", 63 | .file_pat = FILE_PATTERN, 64 | }; 65 | 66 | /* signals that we'll accept via signalfd in epoll */ 67 | int sigs[] = {SIGHUP,SIGTERM,SIGINT,SIGQUIT,SIGALRM}; 68 | 69 | #define x(a) #a, 70 | char *mode_strings[] = { MODES NULL }; 71 | #undef x 72 | 73 | struct option options[] = { 74 | { 75 | .name = "from", 76 | .has_arg = 1, 77 | .val = 'F', 78 | }, 79 | { 80 | .name = "to", 81 | .has_arg = 1, 82 | .val = 'T', 83 | }, 84 | { 85 | .name = "help", 86 | .has_arg = 0, 87 | .val = 'h', 88 | }, 89 | { 90 | .name = NULL, /* terminal element */ 91 | }, 92 | }; 93 | 94 | void usage() { 95 | fprintf(stderr, 96 | "usage: %s [-v] --from erspan --to pcap:\n" 97 | " pcap options\n" 98 | " -G (in sec)\n" 99 | " -C (in mb)\n" 100 | " -w (eg. 
%s)\n" 101 | "\n", 102 | cfg.prog, 103 | FILE_PATTERN); 104 | exit(-1); 105 | } 106 | 107 | int new_epoll(int events, int fd) { 108 | int rc; 109 | struct epoll_event ev; 110 | memset(&ev,0,sizeof(ev)); // placate valgrind 111 | ev.events = events; 112 | ev.data.fd= fd; 113 | rc = epoll_ctl(cfg.epoll_fd, EPOLL_CTL_ADD, fd, &ev); 114 | if (rc == -1) { 115 | fprintf(stderr,"epoll_ctl: %s\n", strerror(errno)); 116 | } 117 | return rc; 118 | } 119 | 120 | const uint8_t pcap_glb_hdr[] = { 121 | 0xd4, 0xc3, 0xb2, 0xa1, /* magic number */ 122 | 0x02, 0x00, 0x04, 0x00, /* version major, version minor */ 123 | 0x00, 0x00, 0x00, 0x00, /* this zone */ 124 | 0x00, 0x00, 0x00, 0x00, /* sigfigs */ 125 | 0xff, 0xff, 0x00, 0x00, /* snaplen */ 126 | 0x01, 0x00, 0x00, 0x00 /* network */ 127 | }; 128 | 129 | int close_savefile() { 130 | int rc=-1, sc; 131 | 132 | sc = munmap(cfg.sv_addr, cfg.sv_len); 133 | if (sc < 0) { 134 | fprintf(stderr,"munmap: %s\n", strerror(errno)); 135 | goto done; 136 | } 137 | 138 | sc = ftruncate(cfg.sv_fd, cfg.sv_cur); 139 | if (sc < 0) { 140 | fprintf(stderr,"ftruncate: %s\n", strerror(errno)); 141 | goto done; 142 | } 143 | 144 | sc = close(cfg.sv_fd); 145 | if (sc < 0) { 146 | fprintf(stderr,"close: %s\n", strerror(errno)); 147 | goto done; 148 | } 149 | 150 | rc = 0; 151 | 152 | done: 153 | return rc; 154 | } 155 | 156 | int reopen_savefile() { 157 | char base[FILE_MAX]; 158 | char path[FILE_MAX]; 159 | int rc=-1, sc; 160 | 161 | /* close out current savefile, if we have one */ 162 | sc = cfg.sv_addr ? close_savefile() : 0; 163 | if (sc < 0) goto done; 164 | 165 | cfg.sv_addr= NULL; 166 | cfg.sv_len = 0; 167 | cfg.sv_cur = 0; 168 | cfg.sv_fd =-1; 169 | if (cfg.sv_ts == cfg.now) 170 | cfg.sv_seq++; 171 | else 172 | cfg.sv_seq = 0; 173 | 174 | /* format filename with strftime */ 175 | cfg.sv_ts = cfg.now; 176 | sc = strftime(base, sizeof(base), cfg.file_pat, localtime(&cfg.now)); 177 | if (sc == 0) { 178 | fprintf(stderr,"strftime: error in file pattern\n"); 179 | goto done; 180 | } 181 | 182 | /* form full path to open */ 183 | snprintf(path, sizeof(path), "%s/%s%.2u.pcap", cfg.dir, base, cfg.sv_seq); 184 | 185 | /* map file into memory */ 186 | cfg.sv_fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0644); 187 | if (cfg.sv_fd < 0) { 188 | fprintf(stderr, "open %s: %s\n", path, strerror(errno)); 189 | goto done; 190 | } 191 | 192 | /* set its initial length; we fill it in memory to this size */ 193 | cfg.sv_len = cfg.maxsz_mb*(1024*1024); 194 | sc = ftruncate(cfg.sv_fd, cfg.sv_len); 195 | if (sc < 0) { 196 | fprintf(stderr, "ftruncate %s: %s\n", path, strerror(errno)); 197 | goto done; 198 | } 199 | 200 | int mode = PROT_READ|PROT_WRITE; 201 | cfg.sv_addr = mmap(0, cfg.sv_len, mode, MAP_SHARED, cfg.sv_fd, 0); 202 | if (cfg.sv_addr == MAP_FAILED) { 203 | fprintf(stderr, "mmap %s: %s\n", path, strerror(errno)); 204 | cfg.sv_addr = NULL; 205 | goto done; 206 | } 207 | 208 | /* set up global header. 
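 * pcap_glb_hdr above encodes the classic little-endian pcap file header:
 * magic 0xa1b2c3d4, format version 2.4, snaplen 65535, linktype 1 (Ethernet).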
*/ 209 | memcpy(&cfg.sv_addr[cfg.sv_cur], pcap_glb_hdr, sizeof(pcap_glb_hdr)); 210 | cfg.sv_cur += sizeof(pcap_glb_hdr); 211 | 212 | rc = 0; 213 | 214 | done: 215 | return rc; 216 | } 217 | 218 | 219 | int periodic_work(void) { 220 | int rc = -1, sc; 221 | 222 | /* test rotation interval */ 223 | if (cfg.sv_addr == NULL) { 224 | rc = 0; 225 | goto done; 226 | } 227 | 228 | if (cfg.sv_ts + cfg.rotate_sec > cfg.now) { 229 | rc = 0; 230 | goto done; 231 | } 232 | 233 | sc = reopen_savefile(); 234 | if (sc < 0) goto done; 235 | 236 | rc = 0; 237 | 238 | done: 239 | return rc; 240 | } 241 | 242 | int handle_signal() { 243 | struct signalfd_siginfo info; 244 | int sc, rc=-1; 245 | ssize_t nr; 246 | char *s; 247 | 248 | nr = read(cfg.signal_fd, &info, sizeof(info)); 249 | if (nr != sizeof(info)) { 250 | fprintf(stderr,"failed to read signal fd buffer\n"); 251 | goto done; 252 | } 253 | 254 | switch(info.ssi_signo) { 255 | case SIGALRM: 256 | cfg.now = time(NULL); 257 | sc = periodic_work(); 258 | if (sc < 0) goto done; 259 | alarm(1); 260 | break; 261 | default: 262 | s = strsignal(info.ssi_signo); 263 | fprintf(stderr,"got signal %d (%s)\n", info.ssi_signo, s); 264 | goto done; 265 | break; 266 | } 267 | 268 | rc = 0; 269 | 270 | done: 271 | return rc; 272 | } 273 | 274 | 275 | int parse_mode(char *in) { 276 | char *colon, **m; 277 | int n, i=0; 278 | 279 | colon = strchr(in, ':'); 280 | n = colon ? colon-in : strlen(in); 281 | 282 | m = mode_strings; 283 | while (*m) { 284 | if (!strncmp(*m, in, n)) { 285 | 286 | /* found match */ 287 | 288 | /* parse dir from pcap: */ 289 | if (colon && (i == mode_pcap)) 290 | cfg.dir = strdup(colon+1); 291 | 292 | return i; 293 | } 294 | m++; 295 | i++; 296 | } 297 | 298 | return mode_none; 299 | 300 | } 301 | 302 | int record_packet(char *pkt, size_t len) { 303 | uint32_t sec, usec, caplen, origlen; 304 | int sc, rc = -1; 305 | size_t fl; 306 | 307 | if (cfg.sv_addr == NULL) { 308 | rc = 0; 309 | goto done; 310 | } 311 | 312 | /* does enough space remain in the output area? */ 313 | fl = (sizeof(uint32_t) * 4) + len; 314 | if (cfg.sv_cur + fl >= cfg.maxsz_mb*(1024*1024)) { 315 | sc = reopen_savefile(); 316 | if (sc < 0) goto done; 317 | } 318 | 319 | /* conjure timestamp from our clock */ 320 | sec = (uint32_t)cfg.now; 321 | usec = 0; 322 | caplen = len; 323 | origlen = len; 324 | 325 | /* write packet header and packet. */ 326 | memcpy(&cfg.sv_addr[cfg.sv_cur], &sec, sizeof(uint32_t)); 327 | cfg.sv_cur += sizeof(uint32_t); 328 | memcpy(&cfg.sv_addr[cfg.sv_cur], &usec, sizeof(uint32_t)); 329 | cfg.sv_cur += sizeof(uint32_t); 330 | memcpy(&cfg.sv_addr[cfg.sv_cur], &caplen, sizeof(uint32_t)); 331 | cfg.sv_cur += sizeof(uint32_t); 332 | memcpy(&cfg.sv_addr[cfg.sv_cur], &origlen, sizeof(uint32_t)); 333 | cfg.sv_cur += sizeof(uint32_t); 334 | memcpy(&cfg.sv_addr[cfg.sv_cur], pkt, len); 335 | cfg.sv_cur += len; 336 | 337 | rc = 0; 338 | 339 | done: 340 | return rc; 341 | } 342 | 343 | /* set up as a GRE receiver */ 344 | int setup_rx_encap(void) { 345 | struct sockaddr *sa; 346 | int i, sc, rc = -1; 347 | struct iovec *iov; 348 | socklen_t sz; 349 | 350 | cfg.rx_fd = socket(AF_INET, SOCK_RAW, IPPROTO_GRE); 351 | if (cfg.rx_fd == -1) { 352 | fprintf(stderr,"socket: %s\n", strerror(errno)); 353 | goto done; 354 | } 355 | 356 | rc = 0; 357 | 358 | done: 359 | return rc; 360 | } 361 | 362 | /* decode the gre packet into its fields. 363 | * input pkt starts with outer IP header. 364 | * fields are returned in network order! 
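 * unlike the decoder in fluxcap.c, this variant also returns the 16-bit GRE
 * protocol type, which decode_erspan() below uses to tell ERSPAN versions apart.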
365 | * fields are zeroed if not present 366 | * on decoding failure, returns -1. 367 | * returns 0 on success 368 | */ 369 | #define GRE_MIN_HDR 4 370 | #define GRE_CHECKSUM_LEN 2 371 | #define GRE_RESERVED1_LEN 2 372 | #define GRE_KEY_LEN 4 373 | #define GRE_SEQNO_LEN 4 374 | int decode_gre(char *pkt, ssize_t nr, uint16_t *type, uint16_t *csum, 375 | uint32_t *key, uint32_t *seqno, char **payload, size_t *plen) { 376 | int has_key, has_checksum, has_seqno, ko, co, so, po, ip_hdr_len; 377 | uint8_t ip_proto; 378 | 379 | *key = 0; 380 | *seqno = 0; 381 | *csum = 0; 382 | *payload = NULL; 383 | *plen = 0; 384 | *type = 0; 385 | 386 | assert(nr > 0); 387 | ip_hdr_len = (pkt[0] & 0x0f) * 4; 388 | 389 | if (nr < ip_hdr_len + GRE_MIN_HDR) 390 | return -1; 391 | 392 | ip_proto = pkt[9]; 393 | if (ip_proto != IPPROTO_GRE) 394 | return -1; 395 | 396 | memcpy(type, &pkt[ip_hdr_len + 2], sizeof(uint16_t)); 397 | 398 | has_key = pkt[ip_hdr_len] & (1U << 5); 399 | has_checksum = pkt[ip_hdr_len] & (1U << 7); 400 | has_seqno = pkt[ip_hdr_len] & (1U << 4); 401 | 402 | if (has_checksum) { 403 | co = ip_hdr_len + GRE_MIN_HDR; 404 | if (co + GRE_CHECKSUM_LEN > nr) 405 | return -1; 406 | memcpy(csum, pkt + co, GRE_CHECKSUM_LEN); 407 | } 408 | 409 | if (has_key) { 410 | ko = ip_hdr_len + GRE_MIN_HDR 411 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0); 412 | if (ko + GRE_KEY_LEN > nr) 413 | return -1; 414 | memcpy(key, pkt + ko, GRE_KEY_LEN); 415 | } 416 | 417 | if (has_seqno) { 418 | so = ip_hdr_len + GRE_MIN_HDR + 419 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 420 | + (has_key ? GRE_KEY_LEN : 0); 421 | if (so + GRE_SEQNO_LEN > nr) 422 | return -1; 423 | memcpy(seqno, pkt + so, GRE_SEQNO_LEN); 424 | } 425 | 426 | po = ip_hdr_len + GRE_MIN_HDR + 427 | + (has_checksum ? GRE_CHECKSUM_LEN + GRE_RESERVED1_LEN : 0) 428 | + (has_key ? GRE_KEY_LEN : 0) 429 | + (has_seqno ? GRE_SEQNO_LEN : 0); 430 | 431 | *plen = nr - po; 432 | *payload = pkt + po; 433 | return 0; 434 | } 435 | 436 | /* see ovs-fields(7) */ 437 | #define ERSPAN_V1_GRETYPE 0x88be 438 | #define ERSPAN_V1_HDR 8 439 | #define ERSPAN_V2_GRETYPE 0x22eb 440 | #define ERSPAN_V2_HDR 12 441 | int decode_erspan(uint16_t gre_type, uint8_t *in, size_t in_len, 442 | char **out, size_t *out_len) { 443 | int has_subhdr, rc = -1; 444 | 445 | gre_type = ntohs(gre_type); 446 | 447 | switch(gre_type) { 448 | case ERSPAN_V1_GRETYPE: /* erspan version 1 aka Type II */ 449 | if (in_len < ERSPAN_V1_HDR) goto done; 450 | *out = in + ERSPAN_V1_HDR; 451 | *out_len = in_len - ERSPAN_V1_HDR; 452 | if (cfg.verbose) fprintf(stderr, " erspan v1\n"); 453 | break; 454 | case ERSPAN_V2_GRETYPE: /* erspan version 2 aka Type III */ 455 | if (in_len < ERSPAN_V2_HDR) goto done; 456 | /* test if ERSPAN "Optional subheader" flag is set */ 457 | has_subhdr = (in[11] & 0x1) ? 1 : 0; 458 | *out = in + ERSPAN_V2_HDR + (has_subhdr ? 8 : 0); 459 | *out_len = in_len - ERSPAN_V2_HDR - (has_subhdr ? 
8 : 0); 460 | if (cfg.verbose) 461 | fprintf(stderr, " erspan v2 (sub_hdr: %d)\n", has_subhdr); 462 | break; 463 | default: 464 | fprintf(stderr, "unknown gre erspan type 0x%x\n", gre_type); 465 | goto done; 466 | } 467 | 468 | rc = 0; 469 | 470 | done: 471 | return rc; 472 | } 473 | 474 | int handle_grerx(void) { 475 | uint32_t seqno, key; 476 | uint16_t csum, type; 477 | char *data, *out; 478 | size_t dlen, sz; 479 | int rc=-1, sc; 480 | ssize_t nr; 481 | 482 | nr = read(cfg.rx_fd, cfg.pkt, sizeof(cfg.pkt)); 483 | if (nr < 0) { 484 | fprintf(stderr, "read: %s\n", strerror(errno)); 485 | goto done; 486 | } 487 | 488 | if (cfg.verbose) 489 | fprintf(stderr, "received GRE packet of %zd bytes\n", nr); 490 | 491 | sc = decode_gre(cfg.pkt, nr, &type, &csum, &key, &seqno, &data, &dlen); 492 | if (sc < 0) { 493 | rc = 0; 494 | goto done; 495 | } 496 | 497 | /* decapsulate packet, advance over GRE header */ 498 | if (dlen == 0) { 499 | rc = 0; 500 | goto done; 501 | } 502 | 503 | /* expect ERSPAN header at this point - discard */ 504 | sc = decode_erspan(type, data, dlen, &out, &sz); 505 | if (sc < 0) { 506 | rc = 0; 507 | goto done; 508 | } 509 | 510 | /* save the packet */ 511 | sc = record_packet(out, sz); 512 | if (sc < 0) goto done; 513 | 514 | rc = 0; 515 | 516 | done: 517 | return rc; 518 | } 519 | 520 | int main(int argc, char *argv[]) { 521 | struct epoll_event ev; 522 | int opt, rc=-1, sc, n; 523 | 524 | cfg.now = time(NULL); 525 | cfg.prog = argv[0]; 526 | 527 | do { 528 | opt = getopt_long_only(argc, argv, "vhF:T:G:C:w:", options, NULL); 529 | switch (opt) { 530 | case 'F': cfg.from = parse_mode(optarg); break; 531 | case 'T': cfg.to = parse_mode(optarg); break; 532 | case 'G': cfg.rotate_sec = atoi(optarg); break; 533 | case 'C': cfg.maxsz_mb = atoi(optarg); break; 534 | case 'w': cfg.file_pat = strdup(optarg); break; 535 | case 'v': cfg.verbose++; break; 536 | case 'h': usage(); 537 | case -1: break; 538 | } 539 | } while (opt > 0); 540 | 541 | if (cfg.from == mode_none) usage(); 542 | if (cfg.to == mode_none) usage(); 543 | 544 | /* right now we only support this mode */ 545 | assert(cfg.to == mode_pcap); 546 | assert(cfg.from == mode_erspan); 547 | 548 | /* block all signals. 
we take signals synchronously via signalfd */ 549 | sigset_t all; 550 | sigfillset(&all); 551 | sigprocmask(SIG_SETMASK,&all,NULL); 552 | 553 | /* a few signals we'll accept via our signalfd */ 554 | sigset_t sw; 555 | sigemptyset(&sw); 556 | for(n=0; n < sizeof(sigs)/sizeof(*sigs); n++) 557 | sigaddset(&sw, sigs[n]); 558 | 559 | /* create the signalfd for receiving signals */ 560 | cfg.signal_fd = signalfd(-1, &sw, 0); 561 | if (cfg.signal_fd == -1) { 562 | fprintf(stderr,"signalfd: %s\n", strerror(errno)); 563 | goto done; 564 | } 565 | 566 | /* set up the epoll instance */ 567 | cfg.epoll_fd = epoll_create(1); 568 | if (cfg.epoll_fd == -1) { 569 | fprintf(stderr,"epoll: %s\n", strerror(errno)); 570 | goto done; 571 | } 572 | 573 | /* set up the encapsulation receiver */ 574 | sc = setup_rx_encap(); 575 | if (sc < 0) goto done; 576 | 577 | /* add descriptors of interest */ 578 | sc = new_epoll(EPOLLIN, cfg.signal_fd); 579 | if (sc < 0) goto done; 580 | sc = new_epoll(EPOLLIN, cfg.rx_fd); 581 | if (sc < 0) goto done; 582 | 583 | /* open the initial savefile */ 584 | sc = reopen_savefile(); 585 | if (sc < 0) goto done; 586 | 587 | alarm(1); 588 | for (;;) { 589 | 590 | sc = epoll_wait(cfg.epoll_fd, &ev, 1, -1); 591 | if (sc < 0) { 592 | fprintf(stderr,"epoll: %s\n", strerror(errno)); 593 | break; 594 | } 595 | 596 | if (ev.data.fd == cfg.signal_fd) { 597 | sc = handle_signal(); 598 | if (sc < 0) goto done; 599 | } 600 | else if (ev.data.fd == cfg.rx_fd) { 601 | sc = handle_grerx(); 602 | if (sc < 0) goto done; 603 | } 604 | else { 605 | fprintf(stderr, "unknown fd\n"); 606 | assert(0); 607 | } 608 | 609 | } 610 | 611 | rc = 0; 612 | 613 | done: 614 | if (cfg.sv_addr) close_savefile(); 615 | if (cfg.rx_fd != -1) close(cfg.rx_fd); 616 | if (cfg.epoll_fd != -1) close(cfg.epoll_fd); 617 | if (cfg.signal_fd != -1) close(cfg.signal_fd); 618 | return rc; 619 | } 620 | -------------------------------------------------------------------------------- /respan.h: -------------------------------------------------------------------------------- 1 | /* 2 | * © 2019 The Johns Hopkins University Applied Physics Laboratory LLC. 3 | * All Rights Reserved. 4 | * 5 | * AUTHOR: Troy D. 
Hanson 6 | * LICENSE: MIT 7 | * PACKAGE: fluxcap 8 | */ 9 | 10 | #ifndef RESPAN_H 11 | #define RESPAN_H 12 | 13 | #if defined __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #define MODES x(none) x(erspan) x(pcap) 18 | #define x(a) mode_ ## a, 19 | typedef enum { MODES } io_mode; 20 | #undef x 21 | 22 | #define RESPAN_VERSION "0.1" 23 | #define FILE_MAX 250 /* instead of FILENAME_MAX or PATH_MAX */ 24 | #define FILE_PATTERN "%Y%m%d%H%M%S" 25 | #define MAX_PKT 65536 26 | 27 | #if defined __cplusplus 28 | } 29 | #endif 30 | 31 | #endif /* RESPAN_H */ 32 | -------------------------------------------------------------------------------- /util/.gitignore: -------------------------------------------------------------------------------- 1 | ffcp 2 | fwalk 3 | fpcap-push 4 | fprune 5 | *.db 6 | -------------------------------------------------------------------------------- /util/Makefile.am: -------------------------------------------------------------------------------- 1 | srcdir = @srcdir@ 2 | 3 | bin_PROGRAMS = ramdisk watch_copy 4 | ramdisk_SOURCES = ramdisk.c 5 | ramdisk_CPPFLAGS = -I$(srcdir)/../lib/libut/include 6 | watch_copy_SOURCES = watch_copy.c 7 | 8 | -------------------------------------------------------------------------------- /util/ramdisk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "utarray.h" 15 | #include 16 | 17 | #define TMPFS_MAGIC 0x01021994 18 | #define RAMFS_MAGIC 0x858458F6 19 | 20 | /****************************************************************************** 21 | * ramdisk 22 | * 23 | * a utility with modes to: 24 | * - create a ramdisk, 25 | * - query a ramdisk (see its size and percent full) 26 | * - unmount a ramdisk 27 | * 28 | * The ramdisk used here is the 'tmpfs' filesystem which is not strictly a 29 | * pure RAM device; it can swap under the kernel's discretion. I have also 30 | * noticed that a large ramdisk (say, 6gb on a system with 8gb ram) might 31 | * exhibit 'no space left on device' even when only 50% full. A better 32 | * query mode would show the status (resident, paged, etc) of ramdisk pages. 
33 | *****************************************************************************/
34 |
35 | /* command line configuration parameters */
36 | int verbose;
37 | int ramfs;
38 | enum {MODE_NONE,MODE_QUERY,MODE_CREATE,MODE_UNMOUNT} mode = MODE_NONE;
39 | char *sz="50%";
40 | char *ramdisk;
41 | UT_array *dirs;
42 |
43 | void usage(char *prog) {
44 | fprintf(stderr, "This utility creates a tmpfs ramdisk on a given mountpoint.\n");
45 | fprintf(stderr, "It does nothing if a tmpfs is already mounted on that point.\n");
46 | fprintf(stderr, "\n");
47 | fprintf(stderr,"usage:\n\n");
48 | fprintf(stderr, "-c (create mode):\n");
49 | fprintf(stderr, " %s -c [-s <size>] [-d <dir>] [-r] <mountpoint>\n", prog);
50 | fprintf(stderr, " -s <size> suffixed with k|m|g|%% [default: 50%%]\n");
51 | fprintf(stderr, " -d <dir> directory to post-create inside ramdisk (repeatable)\n");
52 | fprintf(stderr, " -r use ramfs instead of tmpfs (grows unbounded, no swap)\n");
53 | fprintf(stderr, "\n");
54 | fprintf(stderr, "-q (query mode):\n");
55 | fprintf(stderr, " %s -q <mountpoint>\n", prog);
56 | fprintf(stderr, "\n");
57 | fprintf(stderr, "-u (unmount mode):\n");
58 | fprintf(stderr, " %s -u <mountpoint>\n", prog);
59 | fprintf(stderr, "\n");
60 | fprintf(stderr, "Examples of creating a ramdisk:\n");
61 | fprintf(stderr, " %s -c -s 1g /mnt/ramdisk\n", prog);
62 | fprintf(stderr, " %s -c -s 1g -d /mnt/ramdisk/in -d /mnt/ramdisk/out /mnt/ramdisk\n", prog);
63 | fprintf(stderr, "\n");
64 | fprintf(stderr, "Note: 'cat /proc/mounts' to see mounted tmpfs ramdisks.\n");
65 | exit(-1);
66 | }
67 |
68 | /* Prevent a ramdisk from being mounted at the mount-point of an
69 | * existing ramdisk. This prevents people from accidentally stacking tmpfs.
70 | * However it is OK to mount a ramdisk on a subdirectory of another ramdisk. */
71 | int suitable_mountpoint(char *dir, struct stat *sb, struct statfs *sf) {
72 | size_t dlen = strlen(dir);
73 | char pdir[PATH_MAX];
74 | struct stat psb;
75 |
76 | if (dlen+4 > PATH_MAX) {
77 | syslog(LOG_ERR, "path too long\n");
78 | return -1;
79 | }
80 |
81 | if (stat(ramdisk, sb) == -1) { /* does mount point exist? */
82 | syslog(LOG_ERR, "no mount point %s: %s\n", ramdisk, strerror(errno));
83 | return -1;
84 | }
85 | if (S_ISDIR(sb->st_mode) == 0) { /* has to be a directory */
86 | syslog(LOG_ERR, "mount point %s: not a directory\n", ramdisk);
87 | return -1;
88 | }
89 | if (statfs(ramdisk, sf) == -1) { /* what kinda filesystem is it on? */
90 | syslog(LOG_ERR, "can't statfs %s: %s\n", ramdisk, strerror(errno));
91 | return -1;
92 | }
93 |
94 | /* is it already a tmpfs mountpoint? */
95 | memcpy(pdir,dir,dlen+1); strcat(pdir,"/..");
96 | if (stat(pdir, &psb) == -1) {
97 | syslog(LOG_ERR, "can't stat %s: %s\n", pdir, strerror(errno));
98 | return -1;
99 | }
100 | int is_mountpoint = (psb.st_dev == sb->st_dev) ?
0 : 1; 101 | int is_tmpfs = (sf->f_type == TMPFS_MAGIC); 102 | int is_ramfs = (sf->f_type == RAMFS_MAGIC); 103 | if (is_mountpoint && (is_tmpfs || is_ramfs)) { 104 | //syslog(LOG_INFO, "already a tmpfs mountpoint: %s\n", dir, strerror(errno)); 105 | return -2; 106 | } 107 | 108 | return 0; 109 | } 110 | 111 | #define KB 1024L 112 | #define MB (1024*1024L) 113 | #define GB (1024*1024*1024L) 114 | int query_ramdisk(void) { 115 | struct stat sb; struct statfs sf; 116 | if (suitable_mountpoint(ramdisk, &sb, &sf) != -2) { 117 | printf("%s: not a ramdisk\n", ramdisk); 118 | return -1; 119 | } 120 | if (sf.f_type == RAMFS_MAGIC) { 121 | printf("%s: ramfs ramdisk (unbounded size)\n", ramdisk); 122 | return 0; 123 | } 124 | char szb[100]; 125 | long bytes = sf.f_bsize*sf.f_blocks; 126 | if (bytes < KB) snprintf(szb, sizeof(szb), "%ld bytes", bytes); 127 | else if (bytes < MB) snprintf(szb, sizeof(szb), "%ld kb", bytes/KB); 128 | else if (bytes < GB) snprintf(szb, sizeof(szb), "%ld mb", bytes/MB); 129 | else snprintf(szb, sizeof(szb), "%ld gb", bytes/GB); 130 | int used_pct = 100 - (sf.f_bfree * 100.0 / sf.f_blocks); 131 | printf("%s: ramdisk of size %s (%d%% used)\n", ramdisk, szb, used_pct); 132 | return 0; 133 | } 134 | 135 | int unmount_ramdisk(void) { 136 | struct stat sb; struct statfs sf; 137 | if (suitable_mountpoint(ramdisk, &sb, &sf) != -2) { 138 | syslog(LOG_ERR,"%s: not a ramdisk\n", ramdisk); 139 | return -1; 140 | } 141 | if (umount(ramdisk) == -1) { 142 | syslog(LOG_ERR,"%s: cannot unmount\n", ramdisk); 143 | return -1; 144 | } 145 | return 0; 146 | } 147 | 148 | int create_ramdisk(void) { 149 | int rc; 150 | char opts[100], *kind; 151 | 152 | struct stat sb; struct statfs sf; 153 | rc = suitable_mountpoint(ramdisk, &sb, &sf); 154 | if (rc) return rc; 155 | 156 | kind = "tmpfs"; 157 | if (ramfs) kind = "ramfs"; 158 | 159 | /* ok, mount a ramdisk on this point */ 160 | snprintf(opts,sizeof(opts),"size=%s",sz); 161 | rc=mount("none", ramdisk, kind, MS_NOATIME|MS_NODEV, opts); 162 | if (rc) syslog(LOG_ERR, "can't make ramdisk %s: %s\n", ramdisk, strerror(errno)); 163 | return rc; 164 | } 165 | 166 | void make_dirs(UT_array *dirs) { 167 | char **d, *dir; 168 | d=NULL; 169 | while ( (d=(char**)utarray_next(dirs,d))) { 170 | dir = *d; 171 | /* fprintf(stderr,"dir is %s\n",dir); */ 172 | if (mkdir(dir, 0777) == -1) { 173 | fprintf(stderr,"failed to make %s: %s\n",dir,strerror(errno)); 174 | } 175 | } 176 | } 177 | 178 | int main(int argc, char * argv[]) { 179 | int opt, rc; 180 | utarray_new(dirs,&ut_str_icd); 181 | 182 | while ( (opt = getopt(argc, argv, "v+cqus:hd:r")) != -1) { 183 | switch (opt) { 184 | case 'v': verbose++; break; 185 | case 'r': ramfs=1; break; 186 | case 'q': if (mode) usage(argv[0]); mode=MODE_QUERY; break; 187 | case 'c': if (mode) usage(argv[0]); mode=MODE_CREATE; break; 188 | case 'u': if (mode) usage(argv[0]); mode=MODE_UNMOUNT; break; 189 | case 's': sz=strdup(optarg); break; 190 | case 'd': utarray_push_back(dirs,&optarg); break; 191 | case 'h': default: usage(argv[0]); break; 192 | } 193 | } 194 | if (optind < argc) ramdisk=argv[optind++]; 195 | if (!ramdisk) usage(argv[0]); 196 | openlog("ramdisk", LOG_PID|LOG_PERROR, LOG_LOCAL0); 197 | 198 | switch(mode) { 199 | case MODE_CREATE: rc=create_ramdisk(); make_dirs(dirs); break; 200 | case MODE_UNMOUNT: rc=unmount_ramdisk(); break; 201 | case MODE_QUERY: rc=query_ramdisk(); break; 202 | default: usage(argv[0]); break; 203 | } 204 | utarray_free(dirs); 205 | return rc; 206 | } 207 | 
-------------------------------------------------------------------------------- /util/tests/do_tests: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my @tests; 7 | for (glob "test*[0-9]") { 8 | push @tests, $_ if -e "$_.ans"; 9 | } 10 | 11 | my $num_failed=0; 12 | 13 | for my $test (@tests) { 14 | print "$test\n"; 15 | `./$test > $test.out 2>/dev/null`; 16 | `diff $test.out $test.ans`; 17 | print "$test failed\n" if $?; 18 | $num_failed++ if $?; 19 | } 20 | 21 | print scalar @tests . " tests conducted, $num_failed failed.\n"; 22 | exit $num_failed; 23 | -------------------------------------------------------------------------------- /util/tests/test1: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p testdir 4 | tar xf testdir.tar -C testdir 5 | shr-tool -c -s 1m ring 6 | 7 | ../fwalk -d testdir -r ring 8 | shr-tool -r ring 9 | rm -rf testdir ring 10 | -------------------------------------------------------------------------------- /util/tests/test1.ans: -------------------------------------------------------------------------------- 1 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180101/ABC/m 2 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180101/DEF/m 3 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180102/ABC/m 4 | /home/thanson/checkouts/public/fluxcap/util/tests/testdir/20180102/DEF/m 5 | -------------------------------------------------------------------------------- /util/tests/test2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # this test requires sudo to mount a ramdisk 3 | 4 | # create a ring 5 | shr-tool -c -s 1m ring 6 | 7 | # mount a ramdisk (need a real filesystem for fprune) 8 | mkdir -p ramdisk 9 | sudo ../ramdisk -c -s 1m ramdisk 10 | 11 | # we will make a directory to test pruning of empties 12 | mkdir ramdisk/empty 13 | 14 | # put content into ramdisk 15 | tar xf testdir.tar -C ramdisk 16 | dd if=/dev/zero of=ramdisk/0.dat bs=100k count=1 17 | dd if=/dev/zero of=ramdisk/1.dat bs=100k count=1 18 | 19 | echo pre-prune 20 | 21 | # dump directory contents to ring 22 | ../fwalk -d ramdisk -r ring 23 | shr-tool -r ring 24 | 25 | # confirm empty directory is there 26 | find ramdisk -name empty 27 | 28 | # prune ramdisk to 10% of 1m (100k) 29 | ../fprune -d ramdisk -p 10 -r ring -N 10 -u -W -P & 30 | PID1=$! 
31 | sleep 5 32 | kill $PID1 33 | wait $PID1 34 | 35 | echo post-prune 36 | 37 | # dump directory contents to ring 38 | ../fwalk -d ramdisk -r ring 39 | shr-tool -r ring 40 | 41 | # confirm empty directory is gone 42 | find ramdisk -name empty 43 | 44 | # clean up 45 | sudo ../ramdisk -u ramdisk 46 | rm -rf ramdisk ring 47 | -------------------------------------------------------------------------------- /util/tests/test2.ans: -------------------------------------------------------------------------------- 1 | pre-prune 2 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/0.dat 3 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/1.dat 4 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/ABC/m 5 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/DEF/m 6 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/ABC/m 7 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/DEF/m 8 | ramdisk/empty 9 | files.map: 10 slots (0 GB) 10 | post-prune 11 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/1.dat 12 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/ABC/m 13 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180101/DEF/m 14 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/ABC/m 15 | /home/thanson/checkouts/public/fluxcap/util/tests/ramdisk/20180102/DEF/m 16 | -------------------------------------------------------------------------------- /util/tests/test3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a ring 4 | shr-tool -c -s 1m ring oring 5 | 6 | # make test directories 7 | mkdir -p testdir outdir 8 | 9 | # put content into testdir 10 | tar xf testdir.tar -C testdir 11 | 12 | # dump directory contents to ring 13 | ../fwalk -d testdir -r ring 14 | 15 | # copy from ring files using naming template 16 | ../ffcp -i ring -o oring -r '(\d{8})/(\w{3})/(.*)$' -t 'outdir/$1/$2.$3' -m -z & 17 | PID1=$! 18 | sleep 5 19 | kill $PID1 20 | wait $PID1 21 | 22 | echo "listing output directory" 23 | find outdir 24 | 25 | echo "listing output ring" 26 | shr-tool -r oring 27 | 28 | # clean up 29 | rm -rf testdir outdir ring oring 30 | -------------------------------------------------------------------------------- /util/tests/test3.ans: -------------------------------------------------------------------------------- 1 | listing output directory 2 | outdir 3 | outdir/20180101 4 | outdir/20180101/ABC.m.gz 5 | outdir/20180101/DEF.m.gz 6 | outdir/20180102 7 | outdir/20180102/ABC.m.gz 8 | outdir/20180102/DEF.m.gz 9 | listing output ring 10 | outdir/20180101/ABC.m.gz 11 | outdir/20180101/DEF.m.gz 12 | outdir/20180102/ABC.m.gz 13 | outdir/20180102/DEF.m.gz 14 | -------------------------------------------------------------------------------- /util/tests/test4: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a ring 4 | shr-tool -c -s 1m ring oring 5 | 6 | # make test directories 7 | mkdir -p testdir outdir 8 | 9 | # put content into testdir 10 | echo "hello, world!" > testdir/hello 11 | 12 | # dump directory contents to ring 13 | ../fwalk -d testdir -r ring 14 | 15 | # copy from ring files to outdir/basename.gz 16 | ../ffcp -i ring -o oring -t 'outdir/$0' -z & 17 | PID1=$! 
18 | sleep 5 19 | kill $PID1 20 | wait $PID1 21 | 22 | echo "listing output directory" 23 | find outdir 24 | 25 | echo "listing output ring" 26 | shr-tool -r oring 27 | 28 | # testing gunzip compatibility 29 | gunzip -c outdir/hello.gz 30 | 31 | # clean up 32 | rm -rf testdir outdir ring oring 33 | -------------------------------------------------------------------------------- /util/tests/test4.ans: -------------------------------------------------------------------------------- 1 | listing output directory 2 | outdir 3 | outdir/hello.gz 4 | listing output ring 5 | outdir/hello.gz 6 | hello, world! 7 | -------------------------------------------------------------------------------- /util/tests/test5: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a tiny ring 4 | # so we block in write 5 | shr-tool -c -s 1k ring 6 | 7 | # make test directories 8 | mkdir -p testdir 9 | 10 | # put content into testdir 11 | tar xf testdir.tar -C testdir 12 | 13 | # dump directory contents to ring 14 | echo "filling ring" 15 | ../fwalk -d testdir -r ring # ok 16 | ../fwalk -d testdir -r ring # ok 17 | ../fwalk -d testdir -r ring # ok 18 | 19 | # we know from manual testing that 20 | # the next write would block for 21 | # space availability in the ring 22 | echo "final write, should block" 23 | ../fwalk -d testdir -r ring & 24 | PID1=$! 25 | START_TIME=`date +%s` 26 | sleep 1 27 | 28 | # confirm that it is blocked. this 29 | # does not actually send a signal 30 | kill -0 $PID1 31 | if [ $? -eq 0 ] 32 | then 33 | echo "ok, blocked in ring write" 34 | else 35 | echo "failed to block in write!" 36 | exit -1 37 | fi 38 | 39 | # schedule failsafe unmaskable signal 40 | echo "scheduling sigkill in 10s" 41 | (sleep 10; kill -9 $PID1) & 42 | 43 | # send more benign signal 44 | # to wake fwalk if it lets 45 | # in signals in shr-write 46 | # in bw_wait_ul -> select 47 | echo "sending sigterm in 1s" 48 | sleep 1 49 | kill -TERM $PID1 50 | 51 | # one or the other signal should 52 | # make fwalk exit so wait for it 53 | wait $PID1 54 | echo "writer terminated (status $?)" 55 | END_TIME=`date +%s` 56 | let ELAPSED=$END_TIME-$START_TIME 57 | if [ $ELAPSED -gt 5 ] 58 | then 59 | echo "terminated by failsafe signal," 60 | echo "this means standard signal was" 61 | echo "insufficient to unblock writer" 62 | else 63 | echo "good, standard signal suffices" 64 | fi 65 | 66 | # clean up 67 | rm -rf testdir ring 68 | -------------------------------------------------------------------------------- /util/tests/test5.ans: -------------------------------------------------------------------------------- 1 | filling ring 2 | final write, should block 3 | ok, blocked in ring write 4 | scheduling sigkill in 10s 5 | sending sigterm in 1s 6 | writer terminated (status 0) 7 | good, standard signal suffices 8 | -------------------------------------------------------------------------------- /util/tests/testdir.tar: -------------------------------------------------------------------------------- 1 | 20180101/0000775000175000017500000000000013237267555011701 5ustar thansonthanson20180101/DEF/0000775000175000017500000000000013237267604012272 5ustar thansonthanson20180101/DEF/m0000664000175000017500000000000013237267604012437 0ustar thansonthanson20180101/ABC/0000775000175000017500000000000013237267604012261 5ustar thansonthanson20180101/ABC/m0000664000175000017500000000000013237267604012426 0ustar thansonthanson20180102/0000775000175000017500000000000013237267564011702 5ustar 
thansonthanson20180102/DEF/0000775000175000017500000000000013237267604012273 5ustar thansonthanson20180102/DEF/m0000664000175000017500000000000013237267604012440 0ustar thansonthanson20180102/ABC/0000775000175000017500000000000013237267604012262 5ustar thansonthanson20180102/ABC/m0000664000175000017500000000000013237267604012427 0ustar thansonthanson -------------------------------------------------------------------------------- /util/watch_copy.c: -------------------------------------------------------------------------------- 1 | #include <sys/types.h> 2 | #include <sys/stat.h> 3 | #include <sys/mman.h> 4 | #include <sys/inotify.h> 5 | #include <fcntl.h> 6 | #include <unistd.h> 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <errno.h> 11 | #include <libgen.h> 12 | #include <limits.h> 13 | 14 | struct { 15 | int verbose; 16 | int pattern_mode; 17 | int mkdir_mode; 18 | char *prog; 19 | } CF; 20 | 21 | /* usage: watch_copy [options] <watch-dir> <dest-dir|dest-pattern> 22 | * 23 | * whenever a file in watch-dir is closed (if it was open for writing), 24 | * it is copied to the dest-pattern. It does not recurse into watch-dir. 25 | * 26 | * This implementation mmaps the source and dest files. 27 | * 28 | */ 29 | 30 | void usage() { 31 | fprintf(stderr,"usage: %s [-v] [-p] [-m] <watch-dir> <dest>\n", CF.prog); 32 | fprintf(stderr,"\n"); 33 | fprintf(stderr," -v (verbose)\n"); 34 | fprintf(stderr," -p (pattern mode)\n"); 35 | fprintf(stderr," -m (mkdir mode; makes destination directory if needed,\n"); 36 | fprintf(stderr," supports only one level of directory creation)\n"); 37 | fprintf(stderr,"\n"); 38 | fprintf(stderr," <dest> can be a directory, or a pattern (if -p)\n"); 39 | fprintf(stderr,"\n"); 40 | fprintf(stderr,"pattern syntax: $1 = first character of file basename\n"); 41 | fprintf(stderr," $2 = second character of file basename\n"); 42 | fprintf(stderr," $3 = third character (likewise $4, ...)\n"); 43 | fprintf(stderr," $A = tenth character (likewise $B, ...)\n"); 44 | fprintf(stderr," $Z = 36th character\n"); 45 | fprintf(stderr," $0 = entire original file basename\n"); 46 | fprintf(stderr," $$ = literal $\n"); 47 | fprintf(stderr," everything else is literal\n"); 48 | fprintf(stderr,"\n"); 49 | fprintf(stderr,"note: quote pattern expressions to protect from shell!\n"); 50 | fprintf(stderr,"\n"); 51 | fprintf(stderr,"examples:\n"); 52 | fprintf(stderr," %s /tmp /data\n", CF.prog); 53 | fprintf(stderr," (/tmp/abc123.pcap -> /data/abc123.pcap)\n"); 54 | fprintf(stderr,"\n"); 55 | fprintf(stderr," %s -p /tmp '/data/$1$2$3/$0'\n", CF.prog); 56 | fprintf(stderr," (/tmp/abc123.pcap -> /data/abc/abc123.pcap)\n"); 57 | fprintf(stderr,"\n"); 58 | fprintf(stderr," %s -mp /tmp '/data/$A$B/$0'\n", CF.prog); 59 | fprintf(stderr," (/tmp/fw-20170921.pcap -> /data/21/fw-20170921.pcap)\n"); 60 | fprintf(stderr,"\n"); 61 | 62 | exit(-1); 63 | } 64 | 65 | #define append(c) do { \ 66 | if (olen == 0) goto done; \ 67 | *(o++) = (c); \ 68 | olen--; \ 69 | } while(0) 70 | 71 | /* make a pathname from pattern applied to src.
literals are copied, $0 is src, 72 | * and $1 through $9 and $A through $Z refer to positions 1 through 36 in src */ 73 | int pat2path(char *out, size_t olen, char *src, char *pat) { 74 | char *p = pat; 75 | char *o = out; 76 | size_t l = strlen(src); 77 | int rc = -1; 78 | unsigned char x; 79 | 80 | while (*p != '\0') { 81 | if (*p == '$') { /* translate next pattern character */ 82 | p++; 83 | if (*p == '$') append(*p); /* special case: $$ */ 84 | else { 85 | 86 | /* here if we had $x where x must be [0-9A-Z] */ 87 | if ((*p >= '0') && (*p <= '9')) x = *p - '0'; 88 | else if ((*p >= 'A') && (*p <= 'Z')) x = *p - 'A' + 10; 89 | else { 90 | fprintf(stderr,"invalid position %c\n", *p); 91 | goto done; 92 | } 93 | 94 | if (x == 0) { 95 | /* $0 means whole src */ 96 | if (olen < l) goto done; 97 | memcpy(o, src, l); 98 | o += l; olen -= l; 99 | } else { 100 | /* copy from 1-based offset to 0-based */ 101 | if (l < x) {fprintf(stderr,"position %c > %s\n", *p, src); goto done;} 102 | append(src[x-1]); 103 | } 104 | } 105 | } else append(*p); /* copy literal character */ 106 | p++; 107 | } 108 | 109 | append('\0'); 110 | rc = 0; 111 | 112 | done: 113 | return rc; 114 | } 115 | 116 | int map_copy(char *file, char *dest) { 117 | struct stat s; 118 | char *src=NULL,*dst=NULL; 119 | int fd=-1,dd=-1,rc=-1; 120 | 121 | /* source file */ 122 | if ( (fd = open(file, O_RDONLY)) == -1) { 123 | fprintf(stderr,"can't open %s: %s\n", file, strerror(errno)); 124 | goto done; 125 | } 126 | if (fstat(fd, &s) == -1) { 127 | fprintf(stderr,"can't stat %s: %s\n", file, strerror(errno)); 128 | goto done; 129 | } 130 | if (!S_ISREG(s.st_mode)) { 131 | fprintf(stderr,"not a regular file: %s\n", file); 132 | goto done; 133 | } 134 | src = mmap(0, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 135 | if (src == MAP_FAILED) { 136 | fprintf(stderr, "mmap %s: %s\n", file, strerror(errno)); 137 | goto done; 138 | } 139 | 140 | /* dest file */ 141 | if ( (dd = open(dest, O_RDWR|O_TRUNC|O_CREAT, 0644)) == -1) { 142 | fprintf(stderr,"can't open %s: %s\n", dest, strerror(errno)); 143 | goto done; 144 | } 145 | if (ftruncate(dd, s.st_size) == -1) { 146 | fprintf(stderr,"ftruncate: %s\n", strerror(errno)); 147 | goto done; 148 | } 149 | dst = mmap(0, s.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, dd, 0); 150 | if (dst == MAP_FAILED) { 151 | fprintf(stderr, "mmap %s: %s\n", dest, strerror(errno)); 152 | goto done; 153 | } 154 | memcpy(dst,src,s.st_size); 155 | 156 | rc = 0; 157 | 158 | done: 159 | if (src && (src != MAP_FAILED)) { 160 | if (munmap(src, s.st_size)) fprintf(stderr,"munmap: %s\n",strerror(errno)); 161 | } 162 | if (dst && (dst != MAP_FAILED)) { 163 | if (munmap(dst, s.st_size)) fprintf(stderr,"munmap: %s\n",strerror(errno)); 164 | } 165 | if (fd != -1) close(fd); 166 | if (dd != -1) close(dd); 167 | return rc; 168 | } 169 | 170 | /* this implementation only supports making one level of directory */ 171 | int do_mkdir(char *path) { 172 | int rc = -1, sc; 173 | char dir[PATH_MAX], *d; 174 | size_t l = strlen(path); 175 | struct stat s; 176 | 177 | /* dirname may modify its input, so pass a copy in */ 178 | if (l+1 > sizeof(dir)) goto done; 179 | memcpy(dir, path, l+1); 180 | d = dirname(dir); 181 | 182 | sc = stat(d, &s); 183 | if (sc < 0) { 184 | /* try to make the path */ 185 | if (mkdir(d, 0755) == 0) { rc = 0; goto done; } 186 | fprintf(stderr, "mkdir failed: %s %s\n", d, strerror(errno)); 187 | goto done; 188 | } else { 189 | /* path exists. is it a directory?
*/ 190 | if (S_ISDIR(s.st_mode)) { rc = 0; goto done; } /* yes */ 191 | fprintf(stderr, "path exists as non-directory: %s\n", d); 192 | goto done; 193 | } 194 | 195 | rc = 0; 196 | 197 | done: 198 | return rc; 199 | } 200 | 201 | int main(int argc, char *argv[]) { 202 | char *src=NULL, *dst=NULL, *name, oldpath[PATH_MAX],newpath[PATH_MAX]; 203 | int fd=-1, wd, mask, opt, rc=-1, slen, dlen; 204 | struct inotify_event *eb=NULL, *ev, *nx; 205 | size_t eb_sz = sizeof(*eb) + PATH_MAX, sz; 206 | ssize_t nr; 207 | 208 | CF.prog = argv[0]; 209 | 210 | while ( (opt = getopt(argc,argv,"pmvh")) > 0) { 211 | switch(opt) { 212 | case 'v': CF.verbose++; break; 213 | case 'p': CF.pattern_mode=1; break; 214 | case 'm': CF.mkdir_mode=1; break; 215 | case 'h': default: usage(); break; 216 | } 217 | } 218 | 219 | /* expect two more arguments - source and destination */ 220 | if (argc > optind) src = argv[optind++]; 221 | if (argc > optind) dst = argv[optind++]; 222 | if ((src == NULL) || (dst == NULL)) usage(); 223 | 224 | /* initialize source path buffer as /srcdir/... */ 225 | slen = strlen(src); 226 | memcpy(oldpath, src, slen); oldpath[slen]='/'; 227 | 228 | /* initialize dest path as /dstdir/... (regular mode) */ 229 | dlen = strlen(dst); 230 | memcpy(newpath, dst, dlen); newpath[dlen]='/'; 231 | 232 | /* setup inotify watch on src dir */ 233 | if ( (fd = inotify_init()) == -1) { 234 | fprintf(stderr, "inotify_init failed: %s\n", strerror(errno)); 235 | goto done; 236 | } 237 | 238 | mask = IN_CLOSE_WRITE; 239 | if ( (wd = inotify_add_watch(fd, src, mask)) == -1) { 240 | fprintf(stderr, "inotify_add_watch failed: %s\n", strerror(errno)); 241 | goto done; 242 | } 243 | 244 | /* see inotify(7) as inotify_event has a trailing name 245 | * field allocated beyond the fixed structure; we must 246 | * allocate enough room for the kernel to populate it */ 247 | if ( (eb = malloc(eb_sz)) == NULL) { 248 | fprintf(stderr, "out of memory\n"); 249 | goto done; 250 | } 251 | 252 | /* one read will produce one or more event structures */ 253 | while ( (nr=read(fd,eb,eb_sz)) > 0) { 254 | for(ev = eb; nr > 0; ev = nx) { 255 | 256 | sz = sizeof(*ev) + ev->len; 257 | nx = (struct inotify_event*)((char*)ev + sz); 258 | nr -= sz; 259 | 260 | name = (ev->len ? ev->name : src); 261 | memcpy(&oldpath[slen+1],name,strlen(name)+1); 262 | if (CF.pattern_mode == 0) memcpy(&newpath[dlen+1],name,strlen(name)+1); 263 | else if (pat2path(newpath, sizeof(newpath), name, dst) < 0) goto done; 264 | 265 | if (CF.mkdir_mode) { 266 | if (do_mkdir(newpath) < 0) goto done; 267 | } 268 | 269 | if (CF.verbose) fprintf(stderr, "%s --> %s\n", oldpath, newpath); 270 | if (map_copy(oldpath, newpath)) goto done; 271 | } 272 | } 273 | 274 | done: 275 | if (fd != -1) close(fd); 276 | if (eb) free(eb); 277 | return rc; 278 | } 279 | --------------------------------------------------------------------------------
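
The usage() text in watch_copy.c defines the `$` pattern language that pattern mode (-p) uses to map a source basename to a destination path. As a reading aid, here is a minimal standalone sketch of those documented rules. The function name `expand_pattern` and the sample file names are illustrative only; in the tool itself this expansion is performed by pat2path(), shown above.

    /* sketch: the "$" destination-pattern rules described in watch_copy's usage() */
    #include <stdio.h>
    #include <string.h>

    /* expand pat against basename src into out (capacity olen).
     * $1-$9 and $A-$Z select characters 1..36 of src, $0 is the whole
     * basename, $$ is a literal '$', anything else is copied literally.
     * returns 0 on success, -1 on error or truncation. */
    static int expand_pattern(char *out, size_t olen, const char *src, const char *pat) {
      size_t used = 0, l = strlen(src);
      for (const char *p = pat; *p; p++) {
        if (*p != '$') {                        /* literal character */
          if (used + 1 >= olen) return -1;
          out[used++] = *p;
          continue;
        }
        p++;
        if (*p == '$') {                        /* $$ -> literal $ */
          if (used + 1 >= olen) return -1;
          out[used++] = '$';
        } else if (*p == '0') {                 /* $0 -> whole basename */
          if (used + l >= olen) return -1;
          memcpy(out + used, src, l);
          used += l;
        } else {                                /* $1..$9, $A..$Z -> position 1..36 */
          int pos;
          if (*p >= '1' && *p <= '9') pos = *p - '0';
          else if (*p >= 'A' && *p <= 'Z') pos = *p - 'A' + 10;
          else return -1;
          if ((size_t)pos > l || used + 1 >= olen) return -1;
          out[used++] = src[pos - 1];
        }
      }
      out[used] = '\0';
      return 0;
    }

    int main(void) {
      char dest[256];
      /* mirrors the examples printed by watch_copy's usage() */
      if (expand_pattern(dest, sizeof(dest), "abc123.pcap", "/data/$1$2$3/$0") == 0)
        printf("%s\n", dest);   /* /data/abc/abc123.pcap */
      if (expand_pattern(dest, sizeof(dest), "fw-20170921.pcap", "/data/$A$B/$0") == 0)
        printf("%s\n", dest);   /* /data/21/fw-20170921.pcap */
      return 0;
    }

Note the same mapping also drives the `-t` templates used by ffcp in test3 and test4 above, where the regex capture groups play the role of the `$` positions.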