├── .gitignore ├── .gitmodules ├── Makefile ├── log.h ├── mpis-ebpf.c ├── mpis-routectl.c ├── mpis-table.c ├── mpis-table.h ├── mpis-table.l ├── mpis-table.y ├── mpis.h ├── readme.md └── test.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.dev 3 | *.tab.h 4 | *.tab.c 5 | *.yy.c 6 | *.o 7 | *.ll 8 | mpis-routectl -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libbpf"] 2 | path = libbpf 3 | url = https://github.com/libbpf/libbpf/ 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | LLC ?= llc 2 | CLANG ?= clang 3 | CC ?= gcc 4 | FLEX = flex 5 | BISON = bison 6 | 7 | LIBBPF_DIR = libbpf/src 8 | LIBBPF = $(LIBBPF_DIR)/libbpf.a 9 | 10 | CFLAGS += -I$(LIBBPF_DIR)/build/usr/include/ -Wall -Wextra -march=native 11 | LDFLAGS += -L$(LIBBPF_DIR) 12 | LIBS = -l:libbpf.a -lelf -lz 13 | BPF_CFLAGS += -I$(LIBBPF_DIR)/build/usr/include/ -Wall -Wextra 14 | 15 | all: CFLAGS+=-O3 -g 16 | all: BPF_CFLAGS+=-O3 -g 17 | all: mpis-routectl mpis-ebpf.o 18 | 19 | debug: CFLAGS+=-O0 -g 20 | debug: BPF_CFLAGS+=-O0 -g 21 | debug: mpis-routectl mpis-ebpf.o 22 | 23 | MPIS_ROUTECTL_OBJS=mpis-routectl.o mpis-table.o mpis-table.tab.o mpis-table.yy.o 24 | 25 | .PHONY: all debug clean 26 | 27 | mpis-routectl: $(MPIS_ROUTECTL_OBJS) 28 | $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) $(LIBS) 29 | 30 | %.c: %.y 31 | # pass 32 | 33 | %.tab.c: %.y 34 | $(BISON) -d $< 35 | 36 | %.yy.c: %.l 37 | $(FLEX) -o $@ $< 38 | 39 | %.o: %.c 40 | $(CC) -c -o $@ $< $(CFLAGS) 41 | 42 | $(LIBBPF): 43 | @if [ ! 
-d $(LIBBPF_DIR) ]; then \ 44 | echo "libbpf not found, try \`git submodule update --init'"; \ 45 | exit 1; \ 46 | else \ 47 | cd $(LIBBPF_DIR) && $(MAKE) all && \ 48 | mkdir -p build && DESTDIR=build $(MAKE) install_headers; \ 49 | fi 50 | 51 | mpis-ebpf.o: mpis-ebpf.c $(LIBBPF) 52 | $(CLANG) -S -target bpf $(BPF_CFLAGS) -emit-llvm -c -o ${@:.o=.ll} $< 53 | $(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll} 54 | 55 | mpis-routectl.c: $(LIBBPF) 56 | 57 | clean: 58 | rm -fr *.o *.tab.c *.tab.h *.yy.c *.ll mpis-routectl $(LIBBPF_DIR)/build 59 | $(MAKE) -C $(LIBBPF_DIR) clean -------------------------------------------------------------------------------- /log.h: -------------------------------------------------------------------------------- 1 | #ifndef MPIS_LOG_H 2 | #define MPIS_LOG_H 3 | #include 4 | #define log_debug(fmt, ...) log("DEBUG", fmt, ## __VA_ARGS__) 5 | #define log_info(fmt, ...) log("INFO ", fmt, ## __VA_ARGS__) 6 | #define log_notice(fmt, ...) log("NOTE ", fmt, ## __VA_ARGS__) 7 | #define log_warn(fmt, ...) log("WARN ", fmt, ## __VA_ARGS__) 8 | #define log_error(fmt, ...) log("ERROR", fmt, ## __VA_ARGS__) 9 | #define log_fatal(fmt, ...) log("FATAL", fmt, ## __VA_ARGS__) 10 | #define log(log_level, fmt, ...) 
fprintf(stderr, "[" log_level "] %s:%d %s: " fmt, __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) 11 | #endif // MPIS_LOG_H -------------------------------------------------------------------------------- /mpis-ebpf.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "mpis.h" 11 | #include "mpis-table.h" 12 | 13 | #define likely(x) __builtin_expect(!!(x), 1) 14 | #define unlikely(x) __builtin_expect(!!(x), 0) 15 | 16 | #define IPHDR_MAXLEN 60 17 | 18 | char _license[] SEC("license") = "GPL"; 19 | 20 | struct { 21 | __uint(type, BPF_MAP_TYPE_LPM_TRIE); 22 | __uint(key_size, 8); 23 | __type(value, mpis_table); 24 | __uint(max_entries, MAX_ENTRIES); 25 | __uint(map_flags, BPF_F_NO_PREALLOC); 26 | } encap_map SEC(".maps"); 27 | 28 | struct { 29 | __uint(type, BPF_MAP_TYPE_HASH); 30 | __type(key, __u32); 31 | __type(value, mpis_table); 32 | __uint(max_entries, MAX_ENTRIES); 33 | } decap_swap_map SEC(".maps"); 34 | 35 | struct vlan_hdr { 36 | __be16 vlan_id; 37 | __be16 inner_ether_proto; 38 | }; 39 | 40 | static __always_inline void put16(__u16 *to, __u16 new, __u32 *diff) { 41 | __u32 tmp32 = *to - new; 42 | *diff += (tmp32 - (tmp32 >= 0xffff)) & 0xffff; // add and carry 43 | *to = new; 44 | } 45 | 46 | static __always_inline void put32(__u32 *to, __u32 new, __u32 *diff) { 47 | __u16 *to16 = (__u16 *) to; 48 | __u16 *new16 = (__u16 *) &new; 49 | 50 | put16(to16, new16[0], diff); 51 | put16(to16 + 1, new16[1], diff); 52 | } 53 | 54 | static __always_inline void end_put(__u32 *diff, __u16 *cksum) { 55 | *diff = *cksum + *diff; 56 | *cksum = *diff + (*diff >> 16); 57 | } 58 | 59 | static __always_inline void do_encap_frag(struct iphdr *ip, mpis_table *entry) { 60 | __u32 diff = 0; 61 | 62 | if (entry->target_data >= ip->ttl) { 63 | put32(&ip->daddr, entry->target, &diff); 64 | return end_put(&diff, 
&ip->check); 65 | } 66 | 67 | put32((__u32 *) &ip->id, ip->daddr, &diff); 68 | put32(&ip->daddr, entry->target, &diff); 69 | end_put(&diff, &ip->check); 70 | } 71 | 72 | static __always_inline void do_decap_or_swap_frag(struct iphdr *ip, mpis_table *entry) { 73 | __u32 diff = 0; 74 | 75 | if (entry->target_type == TTYPE_DECAP) { 76 | if ((ip->saddr & bpf_htonl(entry->mask)) != entry->target) { 77 | return; 78 | } 79 | 80 | put32(&ip->daddr, *(__u32 *) &ip->id, &diff); 81 | put32((__u32 *) &ip->id, 0, &diff); 82 | return end_put(&diff, &ip->check); 83 | } else if (entry->target_type == TTYPE_SWAP) { 84 | if (entry->target_data >= ip->ttl) { 85 | return; 86 | } 87 | 88 | put32(&ip->daddr, entry->target, &diff); 89 | end_put(&diff, &ip->check); 90 | } 91 | } 92 | 93 | static __always_inline void do_encap(struct iphdr *ip, mpis_table *entry) { 94 | __u32 diff = 0; 95 | 96 | if (entry->target_data >= ip->ttl) { 97 | put32(&ip->daddr, entry->target, &diff); 98 | return end_put(&diff, &ip->check); 99 | } 100 | 101 | put16(&ip->id, (((__u16 *) &ip->saddr)[1] & entry->mask) | (ip->id & ~entry->mask), &diff); 102 | put32(&ip->saddr, ip->daddr, &diff); 103 | put32(&ip->daddr, entry->target, &diff); 104 | end_put(&diff, &ip->check); 105 | } 106 | 107 | static __always_inline void do_decap_or_swap(struct iphdr *ip, mpis_table *entry) { 108 | __u32 diff = 0; 109 | 110 | if (entry->target_type == TTYPE_DECAP) { 111 | put32(&ip->daddr, ip->saddr, &diff); 112 | put32(&ip->saddr, bpf_htonl(bpf_ntohl(entry->target) | bpf_ntohs((ip->id & entry->mask))), &diff); 113 | return end_put(&diff, &ip->check); 114 | } else if (entry->target_type == TTYPE_SWAP) { 115 | if (entry->target_data >= ip->ttl) { 116 | return; 117 | } 118 | 119 | put32(&ip->daddr, entry->target, &diff); 120 | end_put(&diff, &ip->check); 121 | } 122 | } 123 | 124 | SEC("xdp") int mpis(struct xdp_md *ctx) { 125 | void *data_end = (void *) (long) ctx->data_end; 126 | void *data = (void *) (long) ctx->data; 127 | struct 
ethhdr *eth = data; 128 | struct vlan_hdr *vhdr; 129 | struct iphdr *ip; 130 | struct bpf_fib_lookup fib_params = {}; 131 | void *l3hdr; 132 | mpis_table *entry = NULL; 133 | __u16 ether_proto; 134 | __u32 lpm_key[2]; 135 | int matched = 0; 136 | long ret; 137 | 138 | if (unlikely(data + sizeof(struct ethhdr) > data_end)) { 139 | return XDP_DROP; 140 | } 141 | 142 | l3hdr = data + sizeof(struct ethhdr); 143 | ether_proto = bpf_ntohs(eth->h_proto); 144 | 145 | // vlan? just skip the header. the inner ethertype is big-endian on the wire, so convert it the same way as h_proto above. 146 | if (ether_proto == ETH_P_8021Q || ether_proto == ETH_P_8021AD) { 147 | if (l3hdr + sizeof(struct vlan_hdr) > data_end) { 148 | return XDP_DROP; 149 | } 150 | 151 | vhdr = l3hdr; 152 | l3hdr += sizeof(struct vlan_hdr); 153 | ether_proto = bpf_ntohs(vhdr->inner_ether_proto); 154 | } 155 | 156 | // qinq? just skip again (ether_proto is host order here, so the comparison below is valid). 157 | if (ether_proto == ETH_P_8021Q || ether_proto == ETH_P_8021AD) { 158 | if (l3hdr + sizeof(struct vlan_hdr) > data_end) { 159 | return XDP_DROP; 160 | } 161 | 162 | vhdr = l3hdr; 163 | l3hdr += sizeof(struct vlan_hdr); 164 | ether_proto = bpf_ntohs(vhdr->inner_ether_proto); 165 | } 166 | 167 | if (ether_proto != ETH_P_IP) { 168 | return XDP_PASS; // don't care 169 | } 170 | 171 | if (unlikely(l3hdr + sizeof(struct iphdr) > data_end)) { 172 | return XDP_DROP; 173 | } 174 | 175 | ip = l3hdr; 176 | 177 | if (unlikely(l3hdr + ip->ihl * sizeof(__u32) > data_end)) { 178 | return XDP_DROP; 179 | } 180 | 181 | lpm_key[0] = 32; 182 | lpm_key[1] = ip->saddr; 183 | entry = bpf_map_lookup_elem(&encap_map, &lpm_key); 184 | if (entry != NULL && entry->iif == ctx->ingress_ifindex) { 185 | if (entry->target_flags & TFLAG_OVERRIDE_FRAG) { 186 | do_encap_frag(ip, entry); 187 | } else { 188 | do_encap(ip, entry); 189 | } 190 | matched = 1; 191 | } 192 | 193 | if (!matched) { 194 | entry = bpf_map_lookup_elem(&decap_swap_map, &ip->daddr); 195 | if (entry != NULL && entry->iif == ctx->ingress_ifindex) { 196 | matched = 1; 197 | if (entry->target_flags & TFLAG_OVERRIDE_FRAG) { 198 | 
do_decap_or_swap_frag(ip, entry); 199 | } else { 200 | do_decap_or_swap(ip, entry); 201 | } 202 | } 203 | } 204 | 205 | if (matched && entry->target_flags & TFLAG_BYPASS_LINUX) { 206 | fib_params.family = AF_INET; 207 | fib_params.tos = ip->tos; 208 | fib_params.l4_protocol = ip->protocol; 209 | fib_params.sport = 0; 210 | fib_params.dport = 0; 211 | fib_params.tot_len = bpf_ntohs(ip->tot_len); 212 | fib_params.ipv4_src = ip->saddr; 213 | fib_params.ipv4_dst = ip->daddr; 214 | fib_params.ifindex = ctx->ingress_ifindex; 215 | 216 | ret = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), 0); 217 | 218 | if (likely(ret == BPF_FIB_LKUP_RET_SUCCESS)) { 219 | __builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); 220 | __builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN); 221 | return bpf_redirect(fib_params.ifindex, 0); 222 | } 223 | } 224 | 225 | return XDP_PASS; 226 | } -------------------------------------------------------------------------------- /mpis-routectl.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "log.h" 10 | #include "mpis-table.h" 11 | 12 | void usage(const char *me) { 13 | fprintf(stderr, "usage: %s [-adhrs] -t mpis-table-file -e epbf-object [interfaces ...]\n", me); 14 | fprintf(stderr, " -a: attach (default)\n"); 15 | fprintf(stderr, " -d: detach\n"); 16 | fprintf(stderr, " -r: re-attach\n"); 17 | fprintf(stderr, " -s: xdp skb mode\n"); 18 | fprintf(stderr, " -h: help\n"); 19 | } 20 | 21 | int ebpf_loadprog(const char* progname, struct bpf_object** to, int *fd); 22 | int populate_table(struct bpf_object *obj, const mpis_table *table, size_t n_entries, int *dfd, int *efd); 23 | 24 | int attach(int prog_fd, unsigned int ifindex, int flags); 25 | int detach(unsigned int ifindex, int flags); 26 | 27 | int main(int argc, char **argv) { 28 | int ret, prog_fd, dmap_fd, emap_fd, i, skb_mode = 
0, xdp_flags = 0; 29 | unsigned int ifindex; 30 | mpis_table *table = NULL; 31 | struct bpf_object *obj; 32 | char op = 'a'; int opt; // opt must be int: getopt() returns int, and a plain char never equals -1 where char is unsigned 33 | const char *ebpf_name = NULL, *table_file_name = NULL; 34 | ssize_t n_entries; 35 | 36 | while ((opt = getopt(argc, argv, "hadrst:e:")) != -1) { 37 | switch (opt) { 38 | case 'a': 39 | case 'd': 40 | case 'r': 41 | op = opt; 42 | continue; 43 | case 'h': 44 | usage(argv[0]); 45 | return 0; 46 | case 't': 47 | table_file_name = optarg; 48 | continue; 49 | case 'e': 50 | ebpf_name = optarg; 51 | continue; 52 | case 's': 53 | skb_mode = 1; 54 | continue; 55 | default: 56 | usage(argv[0]); 57 | return 1; 58 | } 59 | } 60 | 61 | if (ebpf_name == NULL || table_file_name == NULL) { 62 | usage(argv[0]); 63 | return 1; 64 | } 65 | 66 | n_entries = parse_routes(table_file_name, &table); 67 | 68 | if (n_entries < 0) { 69 | usage(argv[0]); 70 | return 1; 71 | } 72 | 73 | ret = ebpf_loadprog(ebpf_name, &obj, &prog_fd); 74 | if (ret < 0) { 75 | return 1; 76 | } 77 | 78 | ret = populate_table(obj, table, (size_t) n_entries, &dmap_fd, &emap_fd); 79 | if (ret < 0) { 80 | return 1; 81 | } 82 | 83 | xdp_flags = skb_mode ? 
XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; 84 | 85 | log_debug("loaded mpis table.\n"); 86 | 87 | argc -= optind; 88 | argv += optind; 89 | 90 | for (i = 0; i < argc; i++) { 91 | ifindex = if_nametoindex(argv[i]); 92 | if (ifindex == 0) { 93 | log_error("if_nametoindex(%s): %s\n", argv[i], strerror(errno)); 94 | return 1; 95 | } 96 | 97 | if (op == 'a') { 98 | ret = attach(prog_fd, ifindex, xdp_flags); 99 | if (ret < 0) { 100 | return 1; 101 | } 102 | } else if (op == 'd') { 103 | ret = detach(ifindex, xdp_flags); 104 | if (ret < 0) { 105 | return 1; 106 | } 107 | } else if (op == 'r') { 108 | ret = detach(ifindex, xdp_flags); 109 | if (ret < 0) { 110 | return 1; 111 | } 112 | ret = attach(prog_fd, ifindex, xdp_flags); 113 | if (ret < 0) { 114 | return 1; 115 | } 116 | } 117 | } 118 | 119 | log_info("mpis loaded and running.\n"); 120 | 121 | return 0; 122 | } 123 | 124 | int ebpf_loadprog(const char* progname, struct bpf_object** to, int *fd) { 125 | int ret, prog_fd; 126 | struct bpf_object *obj; 127 | struct bpf_program *prog; 128 | 129 | obj = bpf_object__open_file(progname, NULL); 130 | if (libbpf_get_error(obj)) { 131 | log_error("failed to open object file '%s'\n", progname); 132 | return -1; 133 | } 134 | 135 | prog = bpf_object__next_program(obj, NULL); 136 | if (!prog) { 137 | log_error("failed to find program in object file '%s'\n", progname); 138 | return -1; 139 | } 140 | 141 | bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); 142 | 143 | ret = bpf_object__load(obj); 144 | if (ret) { 145 | log_error("failed to load object file '%s': %s\n", progname, strerror(errno)); 146 | return -1; 147 | } 148 | 149 | prog_fd = bpf_program__fd(prog); 150 | if (prog_fd < 0) { 151 | log_error("failed to retrieve ebpf program fd.\n"); 152 | return -1; 153 | } 154 | 155 | *to = obj; 156 | *fd = prog_fd; 157 | 158 | return 0; 159 | } 160 | 161 | int populate_table(struct bpf_object *obj, const mpis_table *table, size_t n_entries, int *dfd, int *efd) { 162 | int ret, emap_fd, 
dmap_fd; 163 | const mpis_table *tptr; 164 | struct bpf_lpm_trie_key *key; 165 | size_t i; 166 | 167 | emap_fd = bpf_object__find_map_fd_by_name(obj, "encap_map"); 168 | dmap_fd = bpf_object__find_map_fd_by_name(obj, "decap_swap_map"); 169 | 170 | if (emap_fd < 0 || dmap_fd < 0) { 171 | log_error("failed to retrieve mpis map fd.\n"); 172 | return -1; 173 | } 174 | 175 | key = malloc(sizeof(struct bpf_lpm_trie_key) + 4); 176 | 177 | for (i = 0; i < n_entries; ++i) { 178 | tptr = &table[i]; 179 | 180 | if (tptr->target_type == TTYPE_ENCAP) { 181 | key->prefixlen = tptr->cidr; 182 | memcpy(key->data, &tptr->selector, sizeof(uint32_t)); 183 | 184 | ret = bpf_map_update_elem(emap_fd, key, tptr, 0); 185 | 186 | if (ret < 0) { 187 | log_error("failed to update encap map: %s\n", strerror(errno)); 188 | return -1; 189 | } 190 | } else if (tptr->target_type == TTYPE_DECAP || tptr->target_type == TTYPE_SWAP) { 191 | ret = bpf_map_update_elem(dmap_fd, &tptr->selector, tptr, BPF_ANY); 192 | 193 | if (ret < 0) { 194 | log_error("failed to update decap map: %s\n", strerror(errno)); 195 | return -1; 196 | } 197 | } 198 | } 199 | 200 | *dfd = dmap_fd; 201 | *efd = emap_fd; 202 | 203 | return 0; 204 | } 205 | 206 | int attach(int prog_fd, unsigned int ifindex, int flags) { 207 | return bpf_xdp_attach(ifindex, prog_fd, flags, NULL); 208 | } 209 | 210 | int detach(unsigned int ifindex, int flags) { 211 | return bpf_xdp_detach(ifindex, flags, NULL); 212 | } -------------------------------------------------------------------------------- /mpis-table.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "mpis-table.h" 7 | #include "log.h" 8 | #include "mpis.h" 9 | 10 | static mpis_table table[MAX_ENTRIES]; 11 | static int retval; 12 | static size_t n_entries; 13 | 14 | void new_table() { 15 | retval = 0; 16 | n_entries = 0; 17 | } 18 | 19 | void end_table() { 20 | // nothing to do 
for now 21 | } 22 | 23 | mpis_table *get_table(size_t *table_sz) { 24 | *table_sz = n_entries; 25 | return table; 26 | } 27 | 28 | void add_entry(uint8_t target_type, const char *ifname, uint32_t selector, uint32_t target, uint8_t cidr, uint32_t target_data, uint8_t flags) { 29 | mpis_table *current_entry = &table[n_entries++]; 30 | memset(current_entry, 0, sizeof(mpis_table)); 31 | 32 | current_entry->iif = if_nametoindex(ifname); 33 | if (current_entry->iif == 0) { 34 | log_error("invalid interface name '%s': %s\n", ifname, strerror(errno)); 35 | retval = -1; 36 | return; 37 | } 38 | 39 | if ((cidr > 32 || cidr < 16) && !(flags & TFLAG_OVERRIDE_FRAG)) { 40 | log_error("invalid prefix length - must be between 16 and 32 if not using override-frag\n"); 41 | retval = -1; 42 | return; 43 | } 44 | 45 | current_entry->selector = selector; 46 | current_entry->cidr = cidr; 47 | current_entry->mask = ~((1 << (32 - cidr)) - 1); 48 | current_entry->target_type = target_type; 49 | current_entry->target = target; 50 | current_entry->target_data = target_data; 51 | current_entry->target_flags = flags; 52 | } 53 | 54 | void store_retval(int val) { 55 | retval = val; 56 | } 57 | 58 | int get_retval() { 59 | return retval; 60 | } -------------------------------------------------------------------------------- /mpis-table.h: -------------------------------------------------------------------------------- 1 | #ifndef MPIS_TABLE_H 2 | #define MPIS_TABLE_H 3 | #include 4 | #include 5 | 6 | #define TTYPE_ENCAP 0x01 7 | #define TTYPE_DECAP 0x02 8 | #define TTYPE_SWAP 0x03 9 | 10 | #define TFLAG_BYPASS_LINUX 0b00000001 11 | #define TFLAG_OVERRIDE_FRAG 0b00000010 12 | 13 | typedef struct _mpis_table { 14 | uint32_t iif; 15 | uint32_t selector; 16 | uint32_t target; 17 | 18 | // mask & cidr: valid if action is not swap 19 | uint32_t mask; 20 | uint8_t cidr; 21 | 22 | // valid if type is not decap. data is cutoff-ttl value. 
23 | uint8_t target_data; 24 | 25 | // TTYPE_* 26 | uint8_t target_type; 27 | 28 | // TFLAG_* 29 | uint8_t target_flags; 30 | } mpis_table; 31 | 32 | void new_table(); 33 | void end_table(); 34 | mpis_table *get_table(); 35 | 36 | void add_entry(uint8_t target_type, const char *ifname, uint32_t selector, uint32_t target, uint8_t cidr, uint32_t target_data, uint8_t flags); 37 | 38 | void store_retval(int retval); 39 | int get_retval(); 40 | 41 | ssize_t parse_routes(const char *filename, mpis_table **table); 42 | 43 | #endif // MPIS_TABLE_H -------------------------------------------------------------------------------- /mpis-table.l: -------------------------------------------------------------------------------- 1 | %{ 2 | #include 3 | #include "mpis-table.tab.h" 4 | %} 5 | 6 | %option noyywrap 7 | %option nounput 8 | %option noinput 9 | %option yylineno 10 | 11 | %% 12 | [ \t\n] {} 13 | 14 | \#[^\n]* {} 15 | 16 | src { 17 | return SRC; 18 | } 19 | 20 | dst { 21 | return DST; 22 | } 23 | 24 | iif { 25 | return IIF; 26 | } 27 | 28 | encap { 29 | return ENCAP; 30 | } 31 | 32 | decap { 33 | return DECAP; 34 | } 35 | 36 | swap { 37 | return SWAP; 38 | } 39 | 40 | cutoff-ttl { 41 | return CUTOFF_TTL; 42 | } 43 | 44 | bypass-linux { 45 | return BYPASS_LINUX; 46 | } 47 | 48 | override-frag { 49 | return OVERRIDE_FRAG; 50 | } 51 | 52 | \/ { 53 | return SLASH; 54 | } 55 | 56 | ([0-9]{1,3}\.){3}[0-9]{1,3} { 57 | inet_pton(AF_INET, yytext, &yylval.u32); 58 | return IP; 59 | } 60 | 61 | [0-9]+ { 62 | yylval.u32 = atoi(yytext); 63 | return NUMBER; 64 | } 65 | 66 | [a-zA-Z]+[a-zA-Z0-9_\-]* { 67 | yylval.str = strdup(yytext); 68 | return IDENT; 69 | } 70 | 71 | %% -------------------------------------------------------------------------------- /mpis-table.y: -------------------------------------------------------------------------------- 1 | %{ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "mpis-table.h" 7 | #include "log.h" 8 | 9 | extern int yylineno; 10 | 
extern int yylex(); 11 | extern FILE *yyin; 12 | 13 | static const char *_filename; 14 | 15 | void yyerror(const char *s); 16 | 17 | #define ERR_IF_NULL(x) if ((x) == NULL) { \ 18 | store_retval(-1);\ 19 | log_error("internal error while parsing mpis routing table.\n");\ 20 | YYERROR;\ 21 | } 22 | %} 23 | 24 | %locations 25 | %define parse.error verbose 26 | 27 | %union { 28 | uint32_t u32; 29 | uint8_t u8; 30 | char *str; 31 | } 32 | 33 | %token IDENT 34 | %token IP NUMBER 35 | %token SRC DST IIF 36 | %token ENCAP DECAP SWAP CUTOFF_TTL 37 | %token SLASH 38 | %token BYPASS_LINUX OVERRIDE_FRAG 39 | 40 | %type entry_flags entry_flag 41 | 42 | %% 43 | mpis_table 44 | : mpis_table mpis_entry 45 | | mpis_entry 46 | 47 | mpis_entry 48 | : IIF IDENT SRC IP SLASH NUMBER ENCAP IP CUTOFF_TTL NUMBER entry_flags { 49 | add_entry(TTYPE_ENCAP, $2, $4, $8, $6, $10, $11); 50 | free($2); 51 | } 52 | | IIF IDENT DST IP SWAP IP CUTOFF_TTL NUMBER entry_flags { 53 | add_entry(TTYPE_SWAP, $2, $4, $6, 0, $8, $9); 54 | free($2); 55 | } 56 | | IIF IDENT DST IP DECAP IP SLASH NUMBER entry_flags { 57 | add_entry(TTYPE_DECAP, $2, $4, $6, $8, 0, $9); 58 | free($2); 59 | } 60 | 61 | entry_flags 62 | : entry_flags entry_flag { 63 | $$ = $1 | $2; 64 | } 65 | | entry_flag 66 | 67 | entry_flag 68 | : %empty { 69 | $$ = 0; 70 | } 71 | | BYPASS_LINUX { 72 | $$ = TFLAG_BYPASS_LINUX; 73 | } 74 | | OVERRIDE_FRAG { 75 | $$ = TFLAG_OVERRIDE_FRAG; 76 | } 77 | 78 | %% 79 | 80 | ssize_t parse_routes(const char *filename, mpis_table **table) { 81 | size_t sz = 0; 82 | _filename = filename; 83 | 84 | 85 | FILE *f = fopen(filename, "r"); 86 | if (!f) { 87 | log_fatal("failed to open config file %s\n", filename); 88 | return -1; 89 | } 90 | 91 | new_table(); 92 | 93 | yyin = f; 94 | yyparse(); 95 | fclose(f); 96 | 97 | end_table(); 98 | 99 | *table = get_table(&sz); 100 | 101 | if (get_retval() < 0) { 102 | return -1; 103 | } 104 | 105 | return sz; 106 | } 107 | 108 | void yyerror(const char *s) { 109 | 
log_fatal("%s:%d - %s\n", _filename, yylineno, s); 110 | store_retval(-1); 111 | } -------------------------------------------------------------------------------- /mpis.h: -------------------------------------------------------------------------------- 1 | #ifndef MPIS_H 2 | #define MPIS_H 3 | 4 | #define MAX_ENTRIES 1000 5 | 6 | #endif // MPIS_H 7 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | MPIS (Multiprotocol IP Switching) 2 | --- 3 | 4 | MPIS is an eBPF-based "tunneling" technique. The word "tunnel" is stated in quotes since this is not actually a tunnel. It also has the benefit of not losing any MTU during "tunneling." 5 | 6 | It does come with some costs. Namely: 7 | 8 | - To use this tunnel, it's best to have a connection that does not enforce source address filtering (a.k.a. reverse path filtering); 9 | - For each tunnel endpoint, it is only possible to pass traffic for hosts within the same subnet. The size of the source subnet can be up to `/16`; 10 | - IP-layer fragmentation may be affected, depending on the size of the subnet you decided to tunnel. 11 | 12 | ...or, if your connection enforces source address filtering, but you do not care about IP-layer fragmentation, it can also work. 13 | 14 | ### How it works 15 | 16 | Reading the limitations above, you might have guessed how it works. MPIS uses the IP identification field to save the tunnel information. The receiver then restores the info from the ID field upon reception. 
17 | 18 | #### Using only part of the ID field 19 | 20 | The first mode of operation, where the IP-layer fragmentation ability is kept, works like this: 21 | 22 | The sender does the following: 23 | 24 | ```c 25 | ip->id = (((__u16 *) &ip->saddr)[1] & entry->mask) | (ip->id & ~entry->mask); 26 | ip->saddr = ip->daddr; 27 | ip->daddr = entry->target; 28 | ``` 29 | 30 | ...basically, if you are trying to tunnel a `/24` subnet, the least significant 8 bits of the IP ID field will be overridden by the least significant 8 bits of the source IP address. It then swaps the source and destination address and sets the new destination address to `target` - this `target` is the IP address of the tunnel receiver. Since IP ID is a 16-bit field, this works for `/32` to `/16`. 31 | 32 | And on the receiver side: 33 | 34 | ```c 35 | ip->daddr = ip->saddr; 36 | ip->saddr = bpf_htonl(bpf_ntohl(entry->target) | bpf_ntohs((ip->id & entry->mask))); 37 | ``` 38 | 39 | ...it also swaps the source and destination address. It then uses the pre-configured prefix and the bits restored from the IP ID field to recover the sender IP. At this point, we have recovered the original IP datagram (except that 8 bits of the ID field are lost, but that should not have too big of an impact). 40 | 41 | Let's consider this more realistic example: say, you have two sites - one at SJC and the other one at LAX. You want a tunnel between the sites. Let's assume that: 42 | 43 | - You want to tunnel traffic from SJC subnet `192.0.2.0/24` to the premium China transit you purchased at LAX. 44 | - You have a Linux router at `203.0.113.1` at LAX as tunnel receiver. 45 | - A host, `192.0.2.123`, is trying to reach `120.232.0.1` over the "tunnel." 46 | 47 | This is what will happen on the tunnel sender: 48 | 49 | 1. The tunnel sender receives a packet from `192.0.2.123`: 50 | ```` 51 | 192.0.2.123 -> 120.232.0.1 [IP_ID = 0x1145] 52 | ```` 53 | 2. 
Tunnel sender rewrites the `IP_ID` field with the last 8 bits of the source IP address: 54 | ``` 55 | 192.0.2.123 -> 120.232.0.1 [IP_ID = 0x117b] 56 | ``` 57 | 3. Tunnel sender swaps the src/dst address: 58 | ``` 59 | 120.232.0.1 -> 192.0.2.123 [IP_ID = 0x117b] 60 | ``` 61 | 4. Tunnel sender rewrites the dst address to the tunnel receiver's address: 62 | ``` 63 | 120.232.0.1 -> 203.0.113.1 [IP_ID = 0x117b] 64 | ``` 65 | 66 | Now, this packet will be delivered to `203.0.113.1`. This is your tunnel receiver. Upon packet reception, the tunnel receiver does the following: 67 | 68 | 1. The tunnel receiver extracts the last 8 bits of the IP ID field: 69 | ``` 70 | 120.232.0.1 -> 203.0.113.1 [IP_ID = 0x117b, IP_ID_LAST8 = 0x7b] 71 | ``` 72 | 2. Tunnel receiver swaps the src/dst address: 73 | ``` 74 | 203.0.113.1 -> 120.232.0.1 [IP_ID_LAST8 = 0x7b] 75 | ``` 76 | 3. Tunnel receiver sets the source address to the network address of the tunneled prefix: 77 | ``` 78 | 192.0.2.0 -> 120.232.0.1 [IP_ID_LAST8 = 0x7b] 79 | ``` 80 | 4. Tunnel receiver does a bitwise OR between `IP_ID_LAST8` and the source address: 81 | ``` 82 | (192.0.2.0 | 0x7b) = 192.0.2.123 -> 120.232.0.1 83 | ``` 84 | 85 | At this point, we can forward this packet as we normally would. 86 | 87 | #### Using the entire ID field, fragmentation flags and fragmentation offset 88 | 89 | MPIS can also operate in another mode - where it simply overrides bits 32 to 64 (ID field, frag flags, and frag offset) of the IP header with the actual destination address. It then changes the destination address to the tunnel receiver's address. The receiver can easily recover the address by copying bits 32 to 64 to the dst address field and clearing the frag-related bits. Since now we have full 32 bits, the `/16` source subnet size limit is also gone. 90 | 91 | This mode keeps the original sender address, so it can pass through reverse path filtering if there's one. 
However, since the entire bit 32 to 64 is nuked, IP-layer fragmentation is not going to work anymore. 92 | 93 | ### Usage 94 | 95 | To build MPIS: 96 | 97 | ``` 98 | $ sudo apt install build-essential clang llvm libelf-dev gcc-multilib linux-headers-`dpkg --print-architecture` flex bison 99 | $ git clone https://github.com/apernet/mpis 100 | $ cd mpis 101 | $ git submodule update --init 102 | $ make 103 | ``` 104 | 105 | To configure MPIS, you will need to define a MPIS route table. Syntax: 106 | 107 | ``` 108 | iif src / encap cutoff-ttl [flags ...] 109 | iif dst swap cutoff-ttl [flags ...] 110 | iif dst decap / [flags ...] 111 | ``` 112 | 113 | There are three types of actions: `encap`, `swap`, and `decap`. And possible `flags` are: 114 | 115 | - `bypass-linux`: Bypass Linux network stack: perform routing table lookup directly in XDP and do IP forwarding directly in XDP. Note that with this enabled, Linux will not be able to see the packet at all, including tools like `tcpdump`. 116 | - `override-frag`: Use bit 32 to 64 in the IP header (frag-related bits) to store the destination instead of using the source address field. This allows MPIS to function in networks with reverse path filtering but will nuke the IP-layer fragmentation ability. 117 | 118 | #### encap 119 | 120 | `encap` action "encapsulates" the traffic either by: 121 | 122 | - partially overriding the ID field, swapping src/dst, and setting dst to the given `receiver`. (default mode) 123 | - overriding the entire bit 32 to 64 of the IP header with the actual destination, then setting the dst IP to the given `receiver`. (`override-frag` mode) 124 | 125 | `cutoff-ttl` allows you to define a TTL value, where if the TTL of the packet is lower than the given value, MPIS will not change the ID field and source IP. This means that if users were to do a traceroute, until the given TTL, users were actually tracing to the tunnel receiver. 
Hops on the path to the tunnel receiver will reply with the TTL expired message. Since that is the same path tunneled packet will actually travel, it can be useful for troubleshooting. 126 | 127 | #### swap 128 | 129 | `swap` action changes the destination address again, potentially relaying the tunneled traffic to another receiver. `cutoff-ttl` is not working correctly for `swap` yet, and `override-frag` flag does nothing for `swap`. 130 | 131 | #### decap 132 | 133 | `decap` action "decapsulates" the traffic that was previously `encap`-ed by the sender. Encap-mode related flags, if any, must be the same with the `encap` side. 134 | 135 | ### Configuration example 136 | 137 | Putting the example in "how it works" as configuration files will look something like this: 138 | 139 | SJC site: 140 | 141 | ``` 142 | iif eth1 src 192.0.2.0/24 encap 203.0.113.1 cutoff-ttl 10 143 | ``` 144 | 145 | LAX site: 146 | 147 | ``` 148 | iif eth0 dst 203.0.113.1 decap 192.0.2.0/24 149 | ``` 150 | 151 | Note that this only creates a one-side path (SJC -> LAX). 152 | 153 | ### Running 154 | 155 | To run `mpis`, use `mpis-routectl`: 156 | 157 | ``` 158 | usage: ./mpis-routectl [-adhrs] -t mpis-table-file -e epbf-object [interfaces ...] 159 | -a: attach (default) 160 | -d: detach 161 | -r: re-attach 162 | -s: xdp skb mode 163 | -h: help 164 | ``` 165 | 166 | For example, to run on `eth0` and `eth1`, using `routes.conf` as route configuration: 167 | 168 | ``` 169 | $ sudo ./mpis-routectl -t routes.conf -e mpis-ebpf.o eth0 eth1 170 | ``` 171 | 172 | If it fails, try running in SKB mode (`-s`). Note that the error `libbpf: Error in bpf_create_map_xattr(encap_map):Invalid argument(-22). Retrying without BTF.` can be safely ignored. 173 | 174 | ### Misc 175 | 176 | #### Why "MPIS"? 177 | 178 | Well, that just sounded right to me. MPLS swaps labels, and MPIS swaps IP addresses. MPLS operates at layer 2, so it can carry "MP" layer 2 traffics. 
IP operates at layer 3, so MPIS carries layer 3 "MP" traffic (i.e., UDP, TCP, ICMP, GRE, etc.) 179 | 180 | #### About IP fragmentation 181 | 182 | When tunneling a `/24`, we are taking 8 bits away from the ID field. This means there can only be 256 unique IDs for actual fragmentation. In most workloads, this really shouldn't matter. IP-layer fragmentation is quite rare; at least, you won't see any impact on your normal TCP connections. 183 | 184 | If fragmentation is really that important - we can always reduce the subnet size. Tunneling `/28` each receiver IP will give 12 bits for ID (4,096 unique IDs). I can't really think of any normal workflow that would cause this many unique fragmented flows. 185 | 186 | #### Security consideration 187 | 188 | Potentially, this allows others to have your network send spoof packets against their source IP address. Not sure how useful that will be. -------------------------------------------------------------------------------- /test.sh: --------------------------------------------------------------------------------
#!/bin/bash
#
# End-to-end smoke test for mpis.
#
# Builds four network namespaces chained as
#     node1 -- send -- recv -- node2
# with veth pairs, attaches mpis (via ./mpis-routectl and mpis-ebpf.o from the
# current directory) to the two middle namespaces, then checks the rewritten
# packets with tcpdump, ping and iperf3. The whole suite is run twice: once
# with the plain configuration and once with `override-frag` on every rule.
#
# Managing network namespaces normally requires root. The generated
# test-send.cfg / test-recv.cfg files are left in the current directory.

# -m: job control, so background pings/iperf3 servers are tracked as jobs.
# -e: abort on the first failing setup command.
set -me

send="send"
recv="recv"

node1="node1"
node2="node2"

# Namespaces this run actually created; only these are removed on exit.
created_ns=()

# Create namespace $1 and remember it for cleanup.
add_ns() {
    ip netns add "$1"
    created_ns+=("$1")
}

# EXIT handler: stop leftover background jobs and delete our namespaces.
# Every step tolerates failure because `set -e` is still active here and a
# single failing command would otherwise skip the remaining cleanup.
cleanup() {
    local ns pids
    pids=$(jobs -p)
    if [ -n "$pids" ]; then
        # shellcheck disable=SC2086 # word splitting of the pid list is intended
        kill $pids 2> /dev/null || true
    fi
    for ns in "${created_ns[@]}"; do
        ip netns del "$ns" || true
    done
}

# Run "$@" with all output discarded and print 'ok' or 'failed'.
check() {
    if "$@" > /dev/null 2>&1; then
        echo 'ok'
    else
        echo 'failed'
    fi
}

# Wait up to 5s for one packet matching pcap filter $3 on interface $2 inside
# namespace $1; prints 'ok' if one was captured, 'failed' otherwise.
expect_packet() {
    check timeout 5s ip netns exec "$1" tcpdump -i "$2" -n "$3" -c1
}

# Kill the given background pids; a pid that already exited is not an error.
stop_bg() {
    local pid
    for pid in "$@"; do
        kill "$pid" 2> /dev/null || true
    done
}

# Run iperf3 between node1 (client, bound to 1.2.3.4) and node2 (server) in
# both directions. $1 is inserted verbatim into the banner, e.g. '' or
# 'override-frag, '.
speedtest() {
    local tag="$1" srv_reverse srv_forward

    echo "running speedtest (${tag}node1 <- node2)... "
    ip netns exec "${node2}" iperf3 -s -1 > /dev/null 2>&1 &
    srv_reverse=$!

    # give the server a moment to start listening
    sleep 1

    ip netns exec "${node1}" iperf3 -B 1.2.3.4 -Rc 5.6.7.8 || echo 'iperf failed'

    echo "running speedtest (${tag}node2 <- node1)... "
    ip netns exec "${node2}" iperf3 -s -1 > /dev/null 2>&1 &
    srv_forward=$!

    sleep 1

    ip netns exec "${node1}" iperf3 -B 1.2.3.4 -c 5.6.7.8 || echo 'iperf failed'

    stop_bg "$srv_reverse" "$srv_forward"
}

# Install the handler before creating anything so that a failure halfway
# through setup does not leak the namespaces created so far.
trap cleanup EXIT

add_ns "${send}"
add_ns "${recv}"
add_ns "${node1}"
add_ns "${node2}"

# node1 -- send -- recv -- node2
ip link add dev link-sr mtu 1500 netns "${send}" type veth peer name link-rs mtu 1500 netns "${recv}"
ip link add dev link-1s mtu 1500 netns "${send}" type veth peer name link-s1 mtu 1500 netns "${node1}"
ip link add dev link-r2 mtu 1500 netns "${recv}" type veth peer name link-2r mtu 1500 netns "${node2}"

ip -netns "${send}" link set link-sr up
ip -netns "${recv}" link set link-rs up
ip -netns "${send}" link set link-1s up
ip -netns "${node1}" link set link-s1 up
ip -netns "${recv}" link set link-r2 up
ip -netns "${node2}" link set link-2r up

# send (10.0.0.1) ---- recv (10.0.0.254)
ip -netns "${send}" -4 addr add "10.0.0.1/24" dev link-sr
ip -netns "${recv}" -4 addr add "10.0.0.254/24" dev link-rs

# node1 (10.1.0.1) -- send (10.1.0.254)
ip -netns "${node1}" -4 addr add "10.1.0.1/24" dev link-s1
ip -netns "${send}" -4 addr add "10.1.0.254/24" dev link-1s

# recv (10.2.0.254) -- node2 (10.2.0.1)
ip -netns "${recv}" -4 addr add "10.2.0.254/24" dev link-r2
ip -netns "${node2}" -4 addr add "10.2.0.1/24" dev link-2r

# node1 default via send
ip -netns "${node1}" -4 route add default via 10.1.0.254

# node2 default via recv
ip -netns "${node2}" -4 route add default via 10.2.0.254

# send default via recv
ip -netns "${send}" -4 route add default via 10.0.0.254

# recv default via node2
ip -netns "${recv}" -4 route add default via 10.2.0.1

# node1 tunnel ip on loopback
ip -netns "${node1}" -4 addr add 1.2.3.4/32 dev lo
ip -netns "${node1}" -4 addr add 11.22.33.44/32 dev lo
ip -netns "${node1}" link set lo up

# route lo traffic to node1
ip -netns "${send}" -4 route add 1.2.3.4/32 via 10.1.0.1
ip -netns "${send}" -4 route add 11.22.33.44/32 via 10.1.0.1

# node2 tunnel ip on loopback
ip -netns "${node2}" -4 addr add 5.6.7.8/32 dev lo
ip -netns "${node2}" -4 addr add 55.66.77.88/32 dev lo
ip -netns "${node2}" link set lo up

# ---- round 1: plain configuration ----

cat > test-send.cfg << EOF
iif link-1s src 1.2.3.0/24 encap 10.0.0.254 cutoff-ttl 10
iif link-sr dst 10.0.0.1 decap 5.6.7.0/24
EOF

cat > test-recv.cfg << EOF
iif link-rs dst 10.0.0.254 decap 1.2.3.0/24
iif link-r2 src 5.6.7.0/24 encap 10.0.0.1 cutoff-ttl 10
EOF

# mpis on sender
ip netns exec "${send}" ./mpis-routectl -t test-send.cfg -e mpis-ebpf.o link-1s link-sr

# mpis on receiver
ip netns exec "${recv}" ./mpis-routectl -t test-recv.cfg -e mpis-ebpf.o link-rs link-r2

read -r -p 'env ready, press enter to proceed with test.'

echo '==== tests started ===='

# tunneled flow: 1.2.3.4 (node1) <-> 5.6.7.8 (node2)
ip netns exec "${node1}" ping -I 1.2.3.4 5.6.7.8 -c20 > /dev/null 2>&1 &
ping_pid_1=$!

echo -n 'testing encap... '
expect_packet "${send}" link-sr 'icmp and src 5.6.7.8 and dst 10.0.0.254'

echo -n 'testing decap... '
expect_packet "${recv}" link-rs 'icmp and src 1.2.3.4 and dst 5.6.7.8'

echo -n 'testing encap (inverse)... '
expect_packet "${recv}" link-rs 'icmp and src 1.2.3.4 and dst 10.0.0.1'

echo -n 'testing decap (inverse)... '
expect_packet "${send}" link-sr 'icmp and dst 1.2.3.4 and src 5.6.7.8'

# flow that matches no rule in either table: must pass through unmodified
ip netns exec "${node1}" ping -I 11.22.33.44 55.66.77.88 -c10 > /dev/null 2>&1 &
ping_pid_2=$!

echo -n 'testing sender-passthrough... '
expect_packet "${send}" link-sr 'icmp and src 11.22.33.44 and dst 55.66.77.88'

echo -n 'testing receiver-passthrough... '
expect_packet "${recv}" link-rs 'icmp and src 11.22.33.44 and dst 55.66.77.88'

# TTL 5 is below the configured cutoff-ttl of 10
ip netns exec "${node1}" ping -t5 -I 1.2.3.4 55.66.77.88 -c10 > /dev/null 2>&1 &
ping_pid_3=$!

echo -n 'testing ttl-cutoff... '
expect_packet "${send}" link-sr 'icmp and src 1.2.3.4 and dst 10.0.0.254'

# 2000-byte payload exceeds the 1500 MTU of the veth links
echo -n 'testing fragmentation... '
check timeout 5s ip netns exec "${node1}" ping -I 1.2.3.4 5.6.7.8 -c1 -s2000

# -R adds the record-route IP option
echo -n 'testing ip options... '
check timeout 5s ip netns exec "${node1}" ping -R -I 1.2.3.4 5.6.7.8 -c1

stop_bg "$ping_pid_1" "$ping_pid_2" "$ping_pid_3"

speedtest ''

# ---- round 2: same rules with override-frag ----

cat > test-send.cfg << EOF
iif link-1s src 1.2.3.0/24 encap 10.0.0.254 cutoff-ttl 10 override-frag
iif link-sr dst 10.0.0.1 decap 5.6.7.0/24 override-frag
EOF

cat > test-recv.cfg << EOF
iif link-rs dst 10.0.0.254 decap 1.2.3.0/24 override-frag
iif link-r2 src 5.6.7.0/24 encap 10.0.0.1 cutoff-ttl 10 override-frag
EOF

ip netns exec "${send}" ./mpis-routectl -t test-send.cfg -e mpis-ebpf.o link-1s link-sr
ip netns exec "${recv}" ./mpis-routectl -t test-recv.cfg -e mpis-ebpf.o link-rs link-r2

ip netns exec "${node1}" ping -I 1.2.3.4 5.6.7.8 -c20 > /dev/null 2>&1 &
ping_pid_1=$!

# NOTE(review): the two encap filters below use the opposite src address from
# their round-1 counterparts (1.2.3.4 vs 5.6.7.8 and vice versa). Confirm
# whether override-frag really changes the encapsulated src, or whether one of
# the two rounds carries a copy/paste slip.
echo -n 'testing encap (override-frag)... '
expect_packet "${send}" link-sr 'icmp and src 1.2.3.4 and dst 10.0.0.254'

echo -n 'testing decap (override-frag)... '
expect_packet "${recv}" link-rs 'icmp and src 1.2.3.4 and dst 5.6.7.8'

echo -n 'testing encap (inverse, override-frag)... '
expect_packet "${recv}" link-rs 'icmp and src 5.6.7.8 and dst 10.0.0.1'

echo -n 'testing decap (inverse, override-frag)... '
expect_packet "${send}" link-sr 'icmp and dst 1.2.3.4 and src 5.6.7.8'

ip netns exec "${node1}" ping -I 11.22.33.44 55.66.77.88 -c10 > /dev/null 2>&1 &
ping_pid_2=$!

echo -n 'testing sender-passthrough (override-frag)... '
expect_packet "${send}" link-sr 'icmp and src 11.22.33.44 and dst 55.66.77.88'

echo -n 'testing receiver-passthrough (override-frag)... '
expect_packet "${recv}" link-rs 'icmp and src 11.22.33.44 and dst 55.66.77.88'

ip netns exec "${node1}" ping -t5 -I 1.2.3.4 55.66.77.88 -c10 > /dev/null 2>&1 &
ping_pid_3=$!

echo -n 'testing ttl-cutoff (override-frag)... '
expect_packet "${send}" link-sr 'icmp and src 1.2.3.4 and dst 10.0.0.254'

# Expected to print 'failed' in this round, per the label.
echo -n 'testing fragmentation (override-frag - should fail)... '
check timeout 5s ip netns exec "${node1}" ping -I 1.2.3.4 5.6.7.8 -c1 -s2000

echo -n 'testing ip options (override-frag)... '
check timeout 5s ip netns exec "${node1}" ping -R -I 1.2.3.4 5.6.7.8 -c1

stop_bg "$ping_pid_1" "$ping_pid_2" "$ping_pid_3"

speedtest 'override-frag, '

read -r -p 'all tests completed; you may continue to play with the netns, or press enter to cleanup and exit. '

# reap any background jobs that are still around before the EXIT trap fires
wait

--------------------------------------------------------------------------------