├── .gitignore ├── Makefile ├── Readme.md ├── TestCodes ├── compile.sh ├── icmp.c ├── mac.c ├── rmi.c ├── rmi.h ├── route.c ├── route_dump.c ├── routemac └── xlb_test.c ├── bpf_helpers.h ├── bpf_load.c ├── bpf_load.h ├── bpf_util.h ├── icmp.c ├── libbpf.h ├── mac.c ├── perf-sys.h ├── rmi.h ├── route.c ├── tools ├── include │ ├── linux │ │ └── bpf.h │ └── uapi │ │ └── linux │ │ ├── bpf.h │ │ └── bpf_common.h └── lib │ └── bpf │ ├── bpf.c │ ├── bpf.h │ ├── libbpf.c │ └── libbpf.h ├── xlb.sh ├── xlb.sh2 ├── xlb.sh3 ├── xlb.sh4 ├── xlb_cmdline.c ├── xlb_common.h ├── xlb_kern.c ├── xlb_user.c ├── xlb_util.c ├── xlb_util.h ├── xlbd.c └── xlbd.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *ll 4 | map 5 | map_cmdline 6 | xlb 7 | xlb_cmdline 8 | xdp_ddos01_blacklist 9 | xdp_ddos01_blacklist_cmdline 10 | xdp_tx_iptunnel 11 | xlb_test 12 | rmi 13 | a.out 14 | xlbd 15 | 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | TARGETS := xlb 3 | 4 | MANAGEMENT_DAEMON := xlbd 5 | 6 | CMDLINE_TOOLS := xlb_cmdline 7 | COMMON_H = ${CMDLINE_TOOLS:_cmdline=_common.h} 8 | 9 | RMI_SOURCES := route.c 10 | RMI_SOURCES += icmp.c 11 | RMI_SOURCES += mac.c 12 | RMI_SOURCES += xlb_util.c 13 | RMI_OBJECTS = ${RMI_SOURCES:.c=.o} 14 | 15 | # Generate file name-scheme based on TARGETS 16 | KERN_SOURCES = ${TARGETS:=_kern.c} 17 | USER_SOURCES = ${TARGETS:=_user.c} 18 | KERN_OBJECTS = ${KERN_SOURCES:.c=.o} 19 | USER_OBJECTS = ${USER_SOURCES:.c=.o} 20 | 21 | # Notice: the kbuilddir can be redefined on make cmdline 22 | kbuilddir ?= /lib/modules/$(shell uname -r)/build/ 23 | KERNEL=$(kbuilddir) 24 | 25 | CFLAGS := -g -O2 -Wall 26 | 27 | # Local copy of kernel/tools/lib/ 28 | #CFLAGS += -I./tools/lib 29 | CFLAGS += -I$(KERNEL)/tools/lib 30 | # 31 | # Local copy of uapi/linux/bpf.h kept under 
./tools/include 32 | # needed due to enum dependency in bpf_helpers.h 33 | #CFLAGS += -I./tools/include 34 | # For building libbpf there is a lot of kernel includes in tools/include/ 35 | CFLAGS += -I$(KERNEL)/tools/include 36 | CFLAGS += -I$(KERNEL)/tools/perf 37 | CFLAGS += -I$(KERNEL)/usr/include 38 | # Strange dependency to "selftests" due to "bpf_util.h" 39 | #CFLAGS += -I$(KERNEL)/tools/testing/selftests/bpf/ 40 | 41 | LDFLAGS= -lelf 42 | 43 | # Objects that xxx_user program is linked with: 44 | OBJECT_BPF_SYSCALLS = tools/lib/bpf/bpf.o 45 | OBJECT_LOADBPF = bpf_load.o 46 | OBJECTS = $(OBJECT_BPF_SYSCALLS) $(OBJECT_LOADBPF) 47 | # 48 | # The tools/lib/bpf/libbpf is avail via a library 49 | OBJECT_BPF_LIBBPF = tools/lib/bpf/libbpf.o 50 | 51 | # Allows pointing LLC/CLANG to another LLVM backend, redefine on cmdline: 52 | # make LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang 53 | LLC ?= llc 54 | CLANG ?= clang 55 | 56 | CC = gcc 57 | 58 | NOSTDINC_FLAGS := -nostdinc -isystem $(shell $(CC) -print-file-name=include) 59 | 60 | # Copy of uapi/linux/bpf.h stored here: 61 | 62 | LINUXINCLUDE := -I$(KERNEL)/arch/x86/include 63 | LINUXINCLUDE += -I$(KERNEL)/arch/x86/include/generated/uapi 64 | LINUXINCLUDE += -I$(KERNEL)/arch/x86/include/generated 65 | LINUXINCLUDE += -I$(KERNEL)/include 66 | LINUXINCLUDE += -I$(KERNEL)/arch/x86/include/uapi 67 | LINUXINCLUDE += -I$(KERNEL)/include/uapi 68 | LINUXINCLUDE += -I$(KERNEL)/include/generated/uapi 69 | LINUXINCLUDE += -include $(KERNEL)/include/linux/kconfig.h 70 | LINUXINCLUDE += -I$(KERNEL)/tools/lib 71 | 72 | #LINUXINCLUDE += -I./tools/include/ 73 | 74 | #EXTRA_CFLAGS=-Werror 75 | EXTRA_CFLAGS= -D__BPF_TRACING__ 76 | 77 | all: dependencies $(TARGETS) $(KERN_OBJECTS) $(CMDLINE_TOOLS) $(MANAGEMENT_DAEMON) 78 | 79 | .PHONY: dependencies clean verify_cmds verify_llvm_target_bpf $(CLANG) $(LLC) 80 | 81 | # Manually define dependencies to e.g. 
include files 82 | napi_monitor: napi_monitor.h 83 | napi_monitor_kern.o: napi_monitor.h 84 | 85 | clean: 86 | @find . -type f \ 87 | \( -name '*~' \ 88 | -o -name '*.ll' \ 89 | -o -name '*.bc' \ 90 | -o -name 'core' \) \ 91 | -exec rm -vf '{}' \; 92 | rm -f $(OBJECTS) 93 | rm -f $(TARGETS) 94 | rm -f $(KERN_OBJECTS) 95 | rm -f $(USER_OBJECTS) 96 | rm -f $(RMI_OBJECTS) 97 | rm -f $(OBJECT_BPF_LIBBPF) libbpf.a 98 | 99 | dependencies: verify_llvm_target_bpf linux-src-devel-headers 100 | 101 | linux-src: 102 | @if ! test -d $(KERNEL)/; then \ 103 | echo "ERROR: Need kernel source code to compile against" ;\ 104 | echo "(Cannot open directory: $(KERNEL))" ;\ 105 | exit 1; \ 106 | else true; fi 107 | 108 | linux-src-libbpf: linux-src 109 | @if ! test -d $(KERNEL)/tools/lib/bpf/; then \ 110 | echo "ERROR: Need kernel source code to compile against" ;\ 111 | echo " and specifically tools/lib/bpf/ "; \ 112 | exit 1; \ 113 | else true; fi 114 | 115 | linux-src-devel-headers: linux-src-libbpf 116 | @if ! test -d $(KERNEL)/usr/include/ ; then \ 117 | echo -n "WARNING: Need kernel source devel headers"; \ 118 | echo " likely need to run:"; \ 119 | echo " (in kernel source dir: $(KERNEL))"; \ 120 | echo -e "\n make headers_install\n"; \ 121 | true ; \ 122 | else true; fi 123 | 124 | verify_cmds: $(CLANG) $(LLC) 125 | @for TOOL in $^ ; do \ 126 | if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ 127 | echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ 128 | exit 1; \ 129 | else true; fi; \ 130 | done 131 | 132 | verify_llvm_target_bpf: verify_cmds 133 | @if ! 
(${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ 134 | echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ 135 | echo " NOTICE: LLVM version >= 3.7.1 required" ;\ 136 | exit 2; \ 137 | else true; fi 138 | 139 | # Helpers for bpf syscalls (from tools/lib/bpf/bpf.c) 140 | $(OBJECT_BPF_SYSCALLS): %.o: %.c 141 | $(CC) $(CFLAGS) -o $@ -c $< 142 | 143 | $(OBJECT_LOADBPF): bpf_load.c bpf_load.h 144 | $(CC) $(CFLAGS) -o $@ -c $< 145 | 146 | # ISSUE: The libbpf.a library creates a kernel source dependency, for 147 | # include files from tools/include/ 148 | $(OBJECT_BPF_LIBBPF): %.o: %.c 149 | $(CC) $(CFLAGS) -o $@ -c $< 150 | # 151 | libbpf.a: $(OBJECT_BPF_LIBBPF) $(OBJECT_BPF_SYSCALLS) 152 | $(RM) $@; $(AR) rcs $@ $^ 153 | 154 | # Compiling of eBPF restricted-C code with LLVM 155 | # clang option -S generated output file with suffix .ll 156 | # which is the non-binary LLVM assembly language format 157 | # (normally LLVM bitcode format .bc is generated) 158 | # 159 | # Use -Wno-address-of-packed-member as eBPF verifier enforces 160 | # unaligned access checks where necessary 161 | # 162 | $(KERN_OBJECTS): %.o: %.c bpf_helpers.h 163 | $(CLANG) -S $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ 164 | -D__KERNEL__ -D__ASM_SYSREG_H \ 165 | -Wall \ 166 | -Wno-unused-value -Wno-pointer-sign \ 167 | -D__TARGET_ARCH_$(ARCH) \ 168 | -Wno-compare-distinct-pointer-types \ 169 | -Wno-gnu-variable-sized-type-not-at-end \ 170 | -Wno-tautological-compare \ 171 | -Wno-unknown-warning-option \ 172 | -Wno-address-of-packed-member \ 173 | -O2 -emit-llvm -c $< 174 | $(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll} 175 | 176 | $(TARGETS): %: %_user.c $(OBJECTS) Makefile 177 | $(CC) $(CFLAGS) $(OBJECTS) $(LDFLAGS) -o $@ $< 178 | 179 | $(CMDLINE_TOOLS): %: %.c $(OBJECTS) Makefile $(COMMON_H) $(RMI_OBJECTS) rmi.h 180 | $(CC) -g $(CFLAGS) $(OBJECTS) $(RMI_OBJECTS) $(LDFLAGS) -o $@ $< 181 | 182 | $(MANAGEMENT_DAEMON): %: %.c $(OBJECTS) Makefile $(COMMON_H) $(RMI_OBJECTS) 
rmi.h 183 | $(CC) -g $(CFLAGS) $(OBJECTS) $(RMI_OBJECTS) $(LDFLAGS) -o $@ $< -lyaml 184 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # XDP Loadbalancer 2 | 3 | ## To load xdp bytecode 4 | 5 | Load: 6 | ``` 7 | ./xlb -i eth0 -v 8 | ``` 9 | 10 | Unload: 11 | ``` 12 | ./xlb -i eth0 -r 13 | ``` 14 | 15 | Check to see the xdp binary is loaded 16 | ``` 17 | # ip link show dev eth0 18 | 2: eth0: mtu 1500 xdp qdisc mq state UP mode DEFAULT group default qlen 1000 19 | link/ether 52:54:00:11:00:1b brd ff:ff:ff:ff:ff:ff 20 | prog/xdp id 94 tag e09d47c63a72ab36 jited 21 | ``` 22 | The line, "prog/xdp id 94 tag e09d47c63a72ab36 jited" indicate that the xdp byte code is correctly hooked to the eth0. 23 | 24 | ## Setup loadbalancer 25 | 26 | Basic usage of the xlb_cmdline 27 | ``` 28 | Usage: ./xlb_cmdline [...] 29 | -i Interface name(eg. eth0) 30 | -A ServiceIP(a.k.a. VIP) 31 | -t (for TCP, optional, default) 32 | -u (for UDP, optional) 33 | -r WorkerIP 34 | -v verbose 35 | -L list lb table 36 | -l list lbcache 37 | -h Display this help 38 | ``` 39 | 40 | Create service 41 | ``` 42 | ./xlb_cmdline -i eth0 -A 10.1.4.1 -p 80 43 | ``` 44 | 45 | Add real servers 46 | ``` 47 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.24 48 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.23 49 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.22 50 | ``` 51 | 52 | Show registered services. 
53 | ``` 54 | ./xlb_cmdline -i eth0 -L 55 | service: 10.1.4.1:80(6) { 56 | src: 10.0.0.27, dst: 10.0.0.22 (52:54:0:11:0:16) 57 | src: 10.0.0.27, dst: 10.0.0.23 (52:54:0:11:0:17) 58 | src: 10.0.0.27, dst: 10.0.0.24 (52:54:0:11:0:18) 59 | } 60 | ``` 61 | 62 | Delete real servers 63 | ``` 64 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.22 65 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.23 66 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.24 67 | ``` 68 | 69 | Delete service 70 | ``` 71 | ./xlb_cmdline -i eth0 -D 10.1.4.1 -p 80 72 | ``` 73 | 74 | 75 | -------------------------------------------------------------------------------- /TestCodes/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcc -c -o icmp.o icmp.c 4 | gcc -c -o mac.o mac.c 5 | gcc -c -o route.o route.c 6 | gcc -c -o rmi.o rmi.c 7 | gcc -o rmi rmi.o icmp.o mac.o route.o 8 | 9 | -------------------------------------------------------------------------------- /TestCodes/icmp.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | #define PACKETSIZE 64 4 | struct packet 5 | { 6 | struct icmphdr hdr; 7 | char msg[PACKETSIZE-sizeof(struct icmphdr)]; 8 | }; 9 | 10 | int pid=-1; 11 | struct protoent *proto=NULL; 12 | 13 | unsigned short checksum(void *b, int len) 14 | { unsigned short *buf = b; 15 | unsigned int sum=0; 16 | unsigned short result; 17 | 18 | for ( sum = 0; len > 1; len -= 2 ) 19 | sum += *buf++; 20 | if ( len == 1 ) 21 | sum += *(unsigned char*)buf; 22 | sum = (sum >> 16) + (sum & 0xFFFF); 23 | sum += (sum >> 16); 24 | result = ~sum; 25 | return result; 26 | } 27 | 28 | void ping(struct sockaddr_in *addr) 29 | { const int val=255; 30 | int i, sd, cnt=1; 31 | struct packet pckt; 32 | 33 | sd = socket(PF_INET, SOCK_RAW, proto->p_proto); 34 | if ( sd < 0 ) 35 | { 36 | perror("socket"); 37 | return; 38 | } 39 | if ( setsockopt(sd, SOL_IP, IP_TTL, &val, 
sizeof(val)) != 0) 40 | perror("Set TTL option"); 41 | if ( fcntl(sd, F_SETFL, O_NONBLOCK) != 0 ) 42 | perror("Request nonblocking I/O"); 43 | 44 | if (DEBUG) printf("ICMP #%d sent.\n", cnt); 45 | bzero(&pckt, sizeof(pckt)); 46 | pckt.hdr.type = ICMP_ECHO; 47 | pckt.hdr.un.echo.id = pid; 48 | for ( i = 0; i < sizeof(pckt.msg)-1; i++ ) 49 | pckt.msg[i] = i+'0'; 50 | pckt.msg[i] = 0; 51 | pckt.hdr.un.echo.sequence = cnt++; 52 | pckt.hdr.checksum = checksum(&pckt, sizeof(pckt)); 53 | if ( sendto(sd, &pckt, sizeof(pckt), 0, (struct sockaddr*)addr, sizeof(*addr)) <= 0 ) 54 | perror("sendto"); 55 | } 56 | 57 | int icmp_send_1pkt(in_addr_t *dst_ip) 58 | { 59 | struct sockaddr_in addr; 60 | 61 | proto = getprotobyname("ICMP"); 62 | bzero(&addr, sizeof(addr)); 63 | addr.sin_family = AF_INET; 64 | addr.sin_port = 0; 65 | addr.sin_addr.s_addr = *dst_ip; 66 | ping(&addr); 67 | 68 | return 0; 69 | } 70 | 71 | -------------------------------------------------------------------------------- /TestCodes/mac.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | int xlb_get_mac(in_addr_t *host, char *mac, int *dev){ 4 | 5 | int s; 6 | 7 | struct arpreq req; 8 | struct sockaddr_in *sin; 9 | static char buf[256]; 10 | 11 | bzero((caddr_t)&req, sizeof(req)); 12 | 13 | sin = (struct sockaddr_in *)&req.arp_pa; 14 | sin->sin_family = AF_INET; 15 | sin->sin_addr.s_addr = *host; 16 | 17 | if((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0){ 18 | perror("socket() failed."); 19 | exit(-1); 20 | } 21 | 22 | if_indextoname(*dev, req.arp_dev); 23 | if (DEBUG) printf("ifname= %s\n", req.arp_dev); 24 | 25 | if(ioctl(s, SIOCGARP, (caddr_t)&req) <0){ 26 | if(errno == ENXIO){ 27 | 28 | icmp_send_1pkt(&sin->sin_addr.s_addr); 29 | usleep(100000); 30 | 31 | if(ioctl(s, SIOCGARP, (caddr_t)&req) <0){ 32 | if(errno == ENXIO){ 33 | printf("%s - no entry.\n", inet_ntop(AF_INET, host, buf, 256)); 34 | // printf("%lu - no entry.\n", *host); 35 | exit(-1); 36 | 
} else { 37 | perror("SIOCGARP"); 38 | exit(-1); 39 | } 40 | } 41 | 42 | } else { 43 | perror("SIOCGARP"); 44 | exit(-1); 45 | } 46 | } 47 | 48 | if(!(req.arp_flags & ATF_COM)){ 49 | printf("Could not get workers Mac address from arp cache.\n"); 50 | exit(-1); 51 | } 52 | 53 | memcpy(mac, req.arp_ha.sa_data, 6); 54 | 55 | return(0); 56 | } 57 | 58 | -------------------------------------------------------------------------------- /TestCodes/rmi.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | int main(int argc, char *argv[]) 4 | { 5 | char ipaddr[16]; 6 | strcpy(ipaddr, argv[1]); 7 | 8 | char mac[6]; 9 | int dev=0; 10 | 11 | // struct in_addr src_ip, nh_ip, dst_ip; 12 | in_addr_t src_ip, nh_ip, dst_ip; 13 | 14 | inet_pton(AF_INET, argv[1], &dst_ip); 15 | xlb_iproute_get(&dst_ip,&src_ip,&nh_ip, &dev); 16 | 17 | static char buf[256]; 18 | printf("src: %s \n", inet_ntop(AF_INET, &src_ip, buf, 256)); 19 | printf("nexthop: %s \n", inet_ntop(AF_INET, &nh_ip, buf, 256)); 20 | printf("dev: %d \n", dev); 21 | 22 | xlb_get_mac(&nh_ip, mac , &dev); 23 | 24 | char mac_txt[6] = {0}; 25 | ether_ntoa_r((struct ether_addr *)mac, mac_txt); 26 | printf("mac: %s\n", mac_txt ); 27 | 28 | } 29 | -------------------------------------------------------------------------------- /TestCodes/rmi.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | //#include 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | 24 | /// icmp 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | /// 35 | 36 | #define IFLIST_REPLY_BUFFER 8192 37 | 38 | #ifndef DEBUG 39 | #define DEBUG 0 40 | #endif 41 | 42 | #define NLMSG_TAIL(nmsg) 
\ 43 | ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) 44 | 45 | int xlb_parse_route(struct nlmsghdr *nlh, in_addr_t *src_ip, in_addr_t *nh_ip, int *dev); 46 | int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, 47 | int alen); 48 | int xlb_iproute_get(in_addr_t *dst_ip, in_addr_t *src_ip , in_addr_t *nh_ip, int *dev); 49 | int xlb_get_mac(in_addr_t *host, char *mac, int *dev); 50 | 51 | unsigned short checksum(void *b, int len); 52 | void ping(struct sockaddr_in *addr); 53 | int icmp_send_1pkt(in_addr_t *dst_ip); 54 | 55 | -------------------------------------------------------------------------------- /TestCodes/route.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | int xlb_parse_route(struct nlmsghdr *nlh, in_addr_t *src_ip, in_addr_t *nh_ip, int *dev) 4 | { 5 | struct rtmsg *route_entry; 6 | struct rtattr *route_attribute; 7 | int route_attribute_len = 0; 8 | unsigned char route_netmask = 0; 9 | unsigned char route_protocol = 0; 10 | int via = 0; 11 | 12 | route_entry = (struct rtmsg *) NLMSG_DATA(nlh); 13 | 14 | if (route_entry->rtm_table != RT_TABLE_MAIN) 15 | return 1; 16 | 17 | route_netmask = route_entry->rtm_dst_len; 18 | route_protocol = route_entry->rtm_protocol; 19 | route_attribute = (struct rtattr *) RTM_RTA(route_entry); 20 | route_attribute_len = RTM_PAYLOAD(nlh); 21 | 22 | for ( ; RTA_OK(route_attribute, route_attribute_len); \ 23 | route_attribute = RTA_NEXT(route_attribute, route_attribute_len)) 24 | { 25 | 26 | if (route_attribute->rta_type == RTA_DST) 27 | if (via == 0) 28 | memcpy(nh_ip, RTA_DATA(route_attribute), 4); 29 | 30 | if (route_attribute->rta_type == RTA_GATEWAY) 31 | { 32 | memcpy(nh_ip, RTA_DATA(route_attribute), 4); 33 | via = 1; 34 | } 35 | 36 | if (route_attribute->rta_type == RTA_PREFSRC) 37 | memcpy(src_ip, RTA_DATA(route_attribute), 4); 38 | 39 | if (route_attribute->rta_type == RTA_OIF) 40 | memcpy(dev, 
RTA_DATA(route_attribute), sizeof(int)); 41 | } 42 | 43 | return 0; 44 | } 45 | 46 | int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, 47 | int alen) 48 | { 49 | int len = RTA_LENGTH(alen); 50 | struct rtattr *rta; 51 | 52 | if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { 53 | fprintf(stderr, 54 | "addattr_l ERROR: message exceeded bound of %d\n", 55 | maxlen); 56 | return -1; 57 | } 58 | rta = NLMSG_TAIL(n); 59 | rta->rta_type = type; 60 | rta->rta_len = len; 61 | if (alen) 62 | memcpy(RTA_DATA(rta), data, alen); 63 | n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); 64 | return 0; 65 | } 66 | 67 | int xlb_iproute_get(in_addr_t *dst_ip, in_addr_t *src_ip , in_addr_t *nh_ip, int *dev) 68 | { 69 | struct msghdr rtnl_msg; 70 | struct iovec io; 71 | int fd; 72 | 73 | struct { 74 | struct nlmsghdr n; 75 | struct rtmsg r; 76 | char buf[1024]; 77 | } req; 78 | 79 | memset(&rtnl_msg, 0, sizeof(rtnl_msg)); 80 | memset(&req, 0, sizeof(req)); 81 | 82 | req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); 83 | req.n.nlmsg_flags = NLM_F_REQUEST; 84 | req.n.nlmsg_type = RTM_GETROUTE; 85 | req.r.rtm_family = AF_INET; 86 | 87 | 88 | addattr_l(&req.n, sizeof(req), RTA_DST, dst_ip, 4); 89 | 90 | io.iov_base = &req; 91 | io.iov_len = req.n.nlmsg_len; 92 | rtnl_msg.msg_iov = &io; 93 | rtnl_msg.msg_iovlen = 1; 94 | 95 | fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); 96 | sendmsg(fd, (struct msghdr *) &rtnl_msg, 0); 97 | 98 | /* parse reply */ 99 | 100 | { 101 | struct nlmsghdr *answer; 102 | struct msghdr rtnl_reply; 103 | struct iovec io_reply; 104 | char reply[IFLIST_REPLY_BUFFER]; 105 | 106 | 107 | memset(&io_reply, 0, sizeof(io_reply)); 108 | memset(&rtnl_reply, 0, sizeof(rtnl_reply)); 109 | 110 | io.iov_base = reply; 111 | io.iov_len = IFLIST_REPLY_BUFFER; 112 | rtnl_reply.msg_iov = &io; 113 | rtnl_reply.msg_iovlen = 1; 114 | 115 | recvmsg(fd, &rtnl_reply, 0); 116 | answer = (struct nlmsghdr *) reply; 117 | 118 | 
xlb_parse_route(answer, src_ip, nh_ip, dev); 119 | } 120 | 121 | close(fd); 122 | 123 | return 0; 124 | } 125 | 126 | -------------------------------------------------------------------------------- /TestCodes/route_dump.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #define IFLIST_REPLY_BUFFER 8192 16 | 17 | typedef struct nl_req_s nl_req_t; 18 | 19 | struct nl_req_s { 20 | struct nlmsghdr hdr; 21 | struct rtmsg r; 22 | char buf[1025]; 23 | }; 24 | 25 | void rtnl_print_route(struct nlmsghdr *nlh) 26 | { 27 | struct rtmsg *route_entry; 28 | struct rtattr *route_attribute; 29 | int route_attribute_len = 0; 30 | unsigned char route_netmask = 0; 31 | unsigned char route_protocol = 0; 32 | char dst_ip[32]; 33 | char gw_ip[32]; 34 | char src_ip[32]; 35 | int via = 0; 36 | 37 | route_entry = (struct rtmsg *) NLMSG_DATA(nlh); 38 | 39 | if (route_entry->rtm_table != RT_TABLE_MAIN) 40 | return; 41 | 42 | route_netmask = route_entry->rtm_dst_len; 43 | route_protocol = route_entry->rtm_protocol; 44 | route_attribute = (struct rtattr *) RTM_RTA(route_entry); 45 | route_attribute_len = RTM_PAYLOAD(nlh); 46 | 47 | for ( ; RTA_OK(route_attribute, route_attribute_len); \ 48 | route_attribute = RTA_NEXT(route_attribute, route_attribute_len)) 49 | { 50 | printf("hello\n"); 51 | if (route_attribute->rta_type == RTA_DST) 52 | { 53 | inet_ntop(AF_INET, RTA_DATA(route_attribute), \ 54 | dst_ip, sizeof(dst_ip)); 55 | } 56 | if (route_attribute->rta_type == RTA_GATEWAY) 57 | { 58 | inet_ntop(AF_INET, RTA_DATA(route_attribute), \ 59 | gw_ip, sizeof(gw_ip)); 60 | via = 1; 61 | } 62 | if (route_attribute->rta_type == RTA_PREFSRC) 63 | { 64 | inet_ntop(AF_INET, RTA_DATA(route_attribute), \ 65 | src_ip, sizeof(src_ip)); 66 | } 67 | } 68 | printf("route to destination --> %s/%d proto %d and gateway %s\n src=%s, 
via=%d\n", \ 69 | dst_ip, route_netmask, route_protocol, gw_ip,src_ip, via); 70 | 71 | } 72 | 73 | #define NLMSG_TAIL(nmsg) \ 74 | ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) 75 | 76 | int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, 77 | int alen) 78 | { 79 | int len = RTA_LENGTH(alen); 80 | struct rtattr *rta; 81 | 82 | if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { 83 | fprintf(stderr, 84 | "addattr_l ERROR: message exceeded bound of %d\n", 85 | maxlen); 86 | return -1; 87 | } 88 | rta = NLMSG_TAIL(n); 89 | rta->rta_type = type; 90 | rta->rta_len = len; 91 | if (alen) 92 | memcpy(RTA_DATA(rta), data, alen); 93 | n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); 94 | return 0; 95 | } 96 | 97 | int main(int argc, char **argv) 98 | { 99 | int fd; 100 | 101 | struct msghdr rtnl_msg; /* generic msghdr struct for use with sendmsg */ 102 | struct iovec io; /* IO vector for sendmsg */ 103 | 104 | nl_req_t req; /* structure that describes the rtnetlink packet itself */ 105 | char reply[IFLIST_REPLY_BUFFER]; /* a large buffer to receive lots of link information */ 106 | 107 | pid_t pid = getpid(); /* our process ID to build the correct netlink address */ 108 | int end = 0; /* some flag to end loop parsing */ 109 | 110 | fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); 111 | 112 | memset(&rtnl_msg, 0, sizeof(rtnl_msg)); 113 | memset(&req, 0, sizeof(req)); 114 | 115 | req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); 116 | req.hdr.nlmsg_type = RTM_GETROUTE; 117 | req.hdr.nlmsg_flags = NLM_F_REQUEST; 118 | req.r.rtm_family = AF_INET; 119 | 120 | // char ipaddr[16]; 121 | // strcpy(ipaddr, argv[1]); 122 | // strcpy(ipaddr, "10.0.0.22"); 123 | __u8 cp[]={10,1,0,22}; 124 | __u8 *ap; 125 | int i; 126 | 127 | addattr_l(&req.hdr, sizeof(req), RTA_DST, cp, 4); 128 | 129 | io.iov_base = &req; 130 | io.iov_len = req.hdr.nlmsg_len; 131 | rtnl_msg.msg_iov = &io; 132 | rtnl_msg.msg_iovlen = 1; 133 | 134 | 
sendmsg(fd, (struct msghdr *) &rtnl_msg, 0); 135 | 136 | /* parse reply */ 137 | 138 | { 139 | int len; 140 | struct nlmsghdr *msg_ptr; /* pointer to current message part */ 141 | 142 | struct msghdr rtnl_reply; /* generic msghdr structure for use with recvmsg */ 143 | struct iovec io_reply; 144 | 145 | memset(&io_reply, 0, sizeof(io_reply)); 146 | memset(&rtnl_reply, 0, sizeof(rtnl_reply)); 147 | 148 | io.iov_base = reply; 149 | io.iov_len = IFLIST_REPLY_BUFFER; 150 | rtnl_reply.msg_iov = &io; 151 | rtnl_reply.msg_iovlen = 1; 152 | 153 | len = recvmsg(fd, &rtnl_reply, 0); /* read as much data as fits in the receive buffer */ 154 | msg_ptr = (struct nlmsghdr *) reply; 155 | rtnl_print_route(msg_ptr); 156 | } 157 | 158 | close(fd); 159 | 160 | return 0; 161 | } 162 | -------------------------------------------------------------------------------- /TestCodes/routemac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ktaka-ccmp/xdp-loadbalancer/2008d0dd7c42cf5ccec53fcfb9a797f5c70042f6/TestCodes/routemac -------------------------------------------------------------------------------- /TestCodes/xlb_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | 24 | #define IFLIST_REPLY_BUFFER 8192 25 | #define DEBUG 0 26 | 27 | int xlb_parse_route(struct nlmsghdr *nlh, __u8 *src, __u8 *next, int *dev) 28 | { 29 | struct rtmsg *route_entry; 30 | struct rtattr *route_attribute; 31 | int route_attribute_len = 0; 32 | unsigned char route_netmask = 0; 33 | unsigned char route_protocol = 0; 34 | char dst_ip[32]; 35 | char gw_ip[32]; 36 | char src_ip[32]; 37 | int i, via = 0; 38 | __u8 *addr; 39 | 40 | route_entry 
= (struct rtmsg *) NLMSG_DATA(nlh); 41 | 42 | if (route_entry->rtm_table != RT_TABLE_MAIN) 43 | return 1; 44 | 45 | route_netmask = route_entry->rtm_dst_len; 46 | route_protocol = route_entry->rtm_protocol; 47 | route_attribute = (struct rtattr *) RTM_RTA(route_entry); 48 | route_attribute_len = RTM_PAYLOAD(nlh); 49 | 50 | for ( ; RTA_OK(route_attribute, route_attribute_len); \ 51 | route_attribute = RTA_NEXT(route_attribute, route_attribute_len)) 52 | { 53 | 54 | if (route_attribute->rta_type == RTA_DST) 55 | { 56 | if(DEBUG) inet_ntop(AF_INET, RTA_DATA(route_attribute), dst_ip, sizeof(dst_ip)); 57 | if (via == 0) 58 | memcpy(next, RTA_DATA(route_attribute), 4); 59 | } 60 | 61 | if (route_attribute->rta_type == RTA_GATEWAY) 62 | { 63 | if(DEBUG) inet_ntop(AF_INET, RTA_DATA(route_attribute), gw_ip, sizeof(gw_ip)); 64 | memcpy(next, RTA_DATA(route_attribute), 4); 65 | via = 1; 66 | } 67 | 68 | if (route_attribute->rta_type == RTA_PREFSRC) 69 | { 70 | if(DEBUG) inet_ntop(AF_INET, RTA_DATA(route_attribute), src_ip, sizeof(src_ip)); 71 | memcpy(src, RTA_DATA(route_attribute), 4); 72 | } 73 | 74 | if (route_attribute->rta_type == RTA_OIF) 75 | { 76 | memcpy(dev, RTA_DATA(route_attribute), sizeof(int)); 77 | } 78 | } 79 | 80 | if(DEBUG) 81 | printf("route to destination --> %s/%d proto %d and gateway %s\n src=%s\n", \ 82 | dst_ip, route_netmask, route_protocol, gw_ip,src_ip); 83 | 84 | return 0; 85 | } 86 | 87 | #define NLMSG_TAIL(nmsg) \ 88 | ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) 89 | 90 | int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, 91 | int alen) 92 | { 93 | int len = RTA_LENGTH(alen); 94 | struct rtattr *rta; 95 | 96 | if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { 97 | fprintf(stderr, 98 | "addattr_l ERROR: message exceeded bound of %d\n", 99 | maxlen); 100 | return -1; 101 | } 102 | rta = NLMSG_TAIL(n); 103 | rta->rta_type = type; 104 | rta->rta_len = len; 105 | if (alen) 106 | 
memcpy(RTA_DATA(rta), data, alen); 107 | n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); 108 | return 0; 109 | } 110 | 111 | static int xlb_iproute_get(char *dst_ip, __u8 *src , __u8 *next, int *dev) 112 | { 113 | struct msghdr rtnl_msg; 114 | struct iovec io; 115 | int fd; 116 | __u32 addr; 117 | 118 | struct { 119 | struct nlmsghdr n; 120 | struct rtmsg r; 121 | char buf[1024]; 122 | } req; 123 | 124 | memset(&rtnl_msg, 0, sizeof(rtnl_msg)); 125 | memset(&req, 0, sizeof(req)); 126 | 127 | req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); 128 | req.n.nlmsg_flags = NLM_F_REQUEST; 129 | req.n.nlmsg_type = RTM_GETROUTE; 130 | req.r.rtm_family = AF_INET; 131 | 132 | inet_pton(AF_INET, dst_ip , &addr); 133 | 134 | addattr_l(&req.n, sizeof(req), RTA_DST, &addr, 4); 135 | 136 | io.iov_base = &req; 137 | io.iov_len = req.n.nlmsg_len; 138 | rtnl_msg.msg_iov = &io; 139 | rtnl_msg.msg_iovlen = 1; 140 | 141 | fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); 142 | sendmsg(fd, (struct msghdr *) &rtnl_msg, 0); 143 | 144 | /* parse reply */ 145 | 146 | { 147 | int len; 148 | struct nlmsghdr *answer; 149 | struct msghdr rtnl_reply; 150 | struct iovec io_reply; 151 | char reply[IFLIST_REPLY_BUFFER]; 152 | 153 | 154 | memset(&io_reply, 0, sizeof(io_reply)); 155 | memset(&rtnl_reply, 0, sizeof(rtnl_reply)); 156 | 157 | io.iov_base = reply; 158 | io.iov_len = IFLIST_REPLY_BUFFER; 159 | rtnl_reply.msg_iov = &io; 160 | rtnl_reply.msg_iovlen = 1; 161 | 162 | len = recvmsg(fd, &rtnl_reply, 0); 163 | answer = (struct nlmsghdr *) reply; 164 | // rtnl_print_route(msg_ptr); 165 | 166 | xlb_parse_route(answer, src, next, dev); 167 | } 168 | 169 | close(fd); 170 | 171 | return 0; 172 | } 173 | 174 | static int xlb_get_mac(__u8 *host, __u8 *mac, int *dev){ 175 | int s; 176 | 177 | struct arpreq req; 178 | struct sockaddr_in *sin; 179 | static char buf[256]; 180 | 181 | // char *host = argv[1]; 182 | 183 | bzero((caddr_t)&req, sizeof(req)); 184 | 185 | sin = (struct sockaddr_in 
*)&req.arp_pa; 186 | sin->sin_family = AF_INET; /* Address Family: Internet */ 187 | sin->sin_addr.s_addr = inet_addr(inet_ntop(AF_INET, host, buf, 256)); 188 | // sin->sin_addr.s_addr = host; 189 | 190 | if((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0){ 191 | perror("socket() failed."); 192 | exit(-1); 193 | } /* Socket is opened.*/ 194 | 195 | strcpy(req.arp_dev, "eth0"); 196 | 197 | 198 | if(ioctl(s, SIOCGARP, (caddr_t)&req) <0){ 199 | if(errno == ENXIO){ 200 | 201 | icmp_send_1pkt(&sin->sin_addr.s_addr); 202 | usleep(100000); 203 | 204 | if(ioctl(s, SIOCGARP, (caddr_t)&req) <0){ 205 | if(errno == ENXIO){ 206 | printf("%s - no entry.\n", inet_ntop(AF_INET, host, buf, 256)); 207 | printf("%lu - no entry.\n", *host); 208 | exit(-1); 209 | } else { 210 | perror("SIOCGARP"); 211 | exit(-1); 212 | } 213 | } 214 | 215 | } else { 216 | perror("SIOCGARP"); 217 | exit(-1); 218 | } 219 | } 220 | 221 | memcpy(mac, req.arp_ha.sa_data, 6); 222 | 223 | 224 | return(0); 225 | } 226 | 227 | 228 | int main(int argc, char *argv[]) 229 | { 230 | char ipaddr[16]; 231 | strcpy(ipaddr, argv[1]); 232 | // strcpy(ipaddr, "10.0.0.22"); 233 | 234 | __u8 src[4], nexthop[4], mac[6]; 235 | int dev=0; 236 | 237 | xlb_iproute_get(ipaddr,src,nexthop, &dev); 238 | 239 | xlb_get_mac(nexthop, mac , &dev); 240 | 241 | static char buf[256]; 242 | printf("src: %s \n", inet_ntop(AF_INET, src, buf, 256)); 243 | printf("nexthop: %s \n", inet_ntop(AF_INET, nexthop, buf, 256)); 244 | printf("dev: %d \n", dev); 245 | 246 | char mac_txt[6] = {0}; 247 | ether_ntoa_r((struct ether_addr *)mac, mac_txt); 248 | printf("mac: %s\n", mac_txt ); 249 | 250 | } 251 | -------------------------------------------------------------------------------- /bpf_helpers.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 */ 2 | /* Copied from $(KERNEL)/tools/testing/selftests/bpf/bpf_helpers.h 3 | * Needed by xxx_kern.c files 4 | * 5 | * Pulling this in creates 
a dependency to uapi/linux/bpf.h 6 | * this is maintained under tools/include/uapi/linux/bpf.h 7 | */ 8 | #ifndef __BPF_HELPERS_H 9 | #define __BPF_HELPERS_H 10 | 11 | /* helper macro to place programs, maps, license in 12 | * different sections in elf_bpf file. Section names 13 | * are interpreted by elf_bpf loader 14 | */ 15 | #define SEC(NAME) __attribute__((section(NAME), used)) 16 | 17 | /* helper functions called from eBPF programs written in C */ 18 | static void *(*bpf_map_lookup_elem)(void *map, void *key) = 19 | (void *) BPF_FUNC_map_lookup_elem; 20 | static int (*bpf_map_update_elem)(void *map, void *key, void *value, 21 | unsigned long long flags) = 22 | (void *) BPF_FUNC_map_update_elem; 23 | static int (*bpf_map_delete_elem)(void *map, void *key) = 24 | (void *) BPF_FUNC_map_delete_elem; 25 | static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = 26 | (void *) BPF_FUNC_probe_read; 27 | static unsigned long long (*bpf_ktime_get_ns)(void) = 28 | (void *) BPF_FUNC_ktime_get_ns; 29 | static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= 30 | (void *) BPF_FUNC_trace_printk; 31 | static void (*bpf_tail_call)(void *ctx, void *map, int index) = 32 | (void *) BPF_FUNC_tail_call; 33 | static unsigned long long (*bpf_get_smp_processor_id)(void) = 34 | (void *) BPF_FUNC_get_smp_processor_id; 35 | static unsigned long long (*bpf_get_current_pid_tgid)(void) = 36 | (void *) BPF_FUNC_get_current_pid_tgid; 37 | static unsigned long long (*bpf_get_current_uid_gid)(void) = 38 | (void *) BPF_FUNC_get_current_uid_gid; 39 | static int (*bpf_get_current_comm)(void *buf, int buf_size) = 40 | (void *) BPF_FUNC_get_current_comm; 41 | static unsigned long long (*bpf_perf_event_read)(void *map, 42 | unsigned long long flags) = 43 | (void *) BPF_FUNC_perf_event_read; 44 | static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = 45 | (void *) BPF_FUNC_clone_redirect; 46 | static int (*bpf_redirect)(int ifindex, int flags) = 47 | (void *) BPF_FUNC_redirect; 48 | static int (*bpf_redirect_map)(void *map, int key, int flags) = 49 | (void *) BPF_FUNC_redirect_map; 50 | static int (*bpf_perf_event_output)(void *ctx, void *map, 51 | unsigned long long flags, void *data, 52 | int size) = 53 | (void *) BPF_FUNC_perf_event_output; 54 | static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = 55 | (void *) BPF_FUNC_get_stackid; 56 | static int (*bpf_probe_write_user)(void *dst, void *src, int size) = 57 | (void *) BPF_FUNC_probe_write_user; 58 | static int (*bpf_current_task_under_cgroup)(void *map, int index) = 59 | (void *) BPF_FUNC_current_task_under_cgroup; 60 | static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = 61 | (void *) BPF_FUNC_skb_get_tunnel_key; 62 | static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = 63 | (void *) BPF_FUNC_skb_set_tunnel_key; 64 | static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = 65 | (void *) BPF_FUNC_skb_get_tunnel_opt; 66 | static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = 67 | 
(void *) BPF_FUNC_skb_set_tunnel_opt; 68 | static unsigned long long (*bpf_get_prandom_u32)(void) = 69 | (void *) BPF_FUNC_get_prandom_u32; 70 | static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = 71 | (void *) BPF_FUNC_xdp_adjust_head; 72 | static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = 73 | (void *) BPF_FUNC_xdp_adjust_meta; 74 | static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, 75 | int optlen) = 76 | (void *) BPF_FUNC_setsockopt; 77 | static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, 78 | int optlen) = 79 | (void *) BPF_FUNC_getsockopt; 80 | static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = 81 | (void *) BPF_FUNC_sk_redirect_map; 82 | static int (*bpf_sock_map_update)(void *map, void *key, void *value, 83 | unsigned long long flags) = 84 | (void *) BPF_FUNC_sock_map_update; 85 | static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, 86 | void *buf, unsigned int buf_size) = 87 | (void *) BPF_FUNC_perf_event_read_value; 88 | static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, 89 | unsigned int buf_size) = 90 | (void *) BPF_FUNC_perf_prog_read_value; 91 | static int (*bpf_override_return)(void *ctx, unsigned long rc) = 92 | (void *) BPF_FUNC_override_return; 93 | 94 | /* llvm builtin functions that eBPF C program may use to 95 | * emit BPF_LD_ABS and BPF_LD_IND instructions 96 | */ 97 | struct sk_buff; 98 | unsigned long long load_byte(void *skb, 99 | unsigned long long off) asm("llvm.bpf.load.byte"); 100 | unsigned long long load_half(void *skb, 101 | unsigned long long off) asm("llvm.bpf.load.half"); 102 | unsigned long long load_word(void *skb, 103 | unsigned long long off) asm("llvm.bpf.load.word"); 104 | 105 | /* a helper structure used by eBPF C program 106 | * to describe map attributes to elf_bpf loader 107 | */ 108 | struct bpf_map_def { 109 | unsigned int type; 110 | unsigned int key_size; 111 | unsigned int value_size; 
112 | unsigned int max_entries; 113 | unsigned int map_flags; 114 | unsigned int inner_map_idx; 115 | unsigned int numa_node; 116 | }; 117 | 118 | static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = 119 | (void *) BPF_FUNC_skb_load_bytes; 120 | static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = 121 | (void *) BPF_FUNC_skb_store_bytes; 122 | static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = 123 | (void *) BPF_FUNC_l3_csum_replace; 124 | static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = 125 | (void *) BPF_FUNC_l4_csum_replace; 126 | static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = 127 | (void *) BPF_FUNC_skb_under_cgroup; 128 | static int (*bpf_skb_change_head)(void *, int len, int flags) = 129 | (void *) BPF_FUNC_skb_change_head; 130 | 131 | /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ 132 | #if defined(__TARGET_ARCH_x86) 133 | #define bpf_target_x86 134 | #define bpf_target_defined 135 | #elif defined(__TARGET_ARCH_s930x) 136 | #define bpf_target_s930x 137 | #define bpf_target_defined 138 | #elif defined(__TARGET_ARCH_arm64) 139 | #define bpf_target_arm64 140 | #define bpf_target_defined 141 | #elif defined(__TARGET_ARCH_mips) 142 | #define bpf_target_mips 143 | #define bpf_target_defined 144 | #elif defined(__TARGET_ARCH_powerpc) 145 | #define bpf_target_powerpc 146 | #define bpf_target_defined 147 | #elif defined(__TARGET_ARCH_sparc) 148 | #define bpf_target_sparc 149 | #define bpf_target_defined 150 | #else 151 | #undef bpf_target_defined 152 | #endif 153 | 154 | /* Fall back to what the compiler says */ 155 | #ifndef bpf_target_defined 156 | #if defined(__x86_64__) 157 | #define bpf_target_x86 158 | #elif defined(__s390x__) 159 | #define bpf_target_s930x 160 | #elif defined(__aarch64__) 161 | #define bpf_target_arm64 162 | #elif defined(__mips__) 163 | #define bpf_target_mips 164 | #elif 
defined(__powerpc__) 165 | #define bpf_target_powerpc 166 | #elif defined(__sparc__) 167 | #define bpf_target_sparc 168 | #endif 169 | #endif 170 | 171 | #if defined(bpf_target_x86) 172 | 173 | #define PT_REGS_PARM1(x) ((x)->di) 174 | #define PT_REGS_PARM2(x) ((x)->si) 175 | #define PT_REGS_PARM3(x) ((x)->dx) 176 | #define PT_REGS_PARM4(x) ((x)->cx) 177 | #define PT_REGS_PARM5(x) ((x)->r8) 178 | #define PT_REGS_RET(x) ((x)->sp) 179 | #define PT_REGS_FP(x) ((x)->bp) 180 | #define PT_REGS_RC(x) ((x)->ax) 181 | #define PT_REGS_SP(x) ((x)->sp) 182 | #define PT_REGS_IP(x) ((x)->ip) 183 | 184 | #elif defined(bpf_target_s390x) 185 | 186 | #define PT_REGS_PARM1(x) ((x)->gprs[2]) 187 | #define PT_REGS_PARM2(x) ((x)->gprs[3]) 188 | #define PT_REGS_PARM3(x) ((x)->gprs[4]) 189 | #define PT_REGS_PARM4(x) ((x)->gprs[5]) 190 | #define PT_REGS_PARM5(x) ((x)->gprs[6]) 191 | #define PT_REGS_RET(x) ((x)->gprs[14]) 192 | #define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ 193 | #define PT_REGS_RC(x) ((x)->gprs[2]) 194 | #define PT_REGS_SP(x) ((x)->gprs[15]) 195 | #define PT_REGS_IP(x) ((x)->psw.addr) 196 | 197 | #elif defined(bpf_target_arm64) 198 | 199 | #define PT_REGS_PARM1(x) ((x)->regs[0]) 200 | #define PT_REGS_PARM2(x) ((x)->regs[1]) 201 | #define PT_REGS_PARM3(x) ((x)->regs[2]) 202 | #define PT_REGS_PARM4(x) ((x)->regs[3]) 203 | #define PT_REGS_PARM5(x) ((x)->regs[4]) 204 | #define PT_REGS_RET(x) ((x)->regs[30]) 205 | #define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ 206 | #define PT_REGS_RC(x) ((x)->regs[0]) 207 | #define PT_REGS_SP(x) ((x)->sp) 208 | #define PT_REGS_IP(x) ((x)->pc) 209 | 210 | #elif defined(bpf_target_mips) 211 | 212 | #define PT_REGS_PARM1(x) ((x)->regs[4]) 213 | #define PT_REGS_PARM2(x) ((x)->regs[5]) 214 | #define PT_REGS_PARM3(x) ((x)->regs[6]) 215 | #define PT_REGS_PARM4(x) ((x)->regs[7]) 216 | #define PT_REGS_PARM5(x) ((x)->regs[8]) 217 | #define PT_REGS_RET(x) ((x)->regs[31]) 218 | #define 
PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ 219 | #define PT_REGS_RC(x) ((x)->regs[1]) 220 | #define PT_REGS_SP(x) ((x)->regs[29]) 221 | #define PT_REGS_IP(x) ((x)->cp0_epc) 222 | 223 | #elif defined(bpf_target_powerpc) 224 | 225 | #define PT_REGS_PARM1(x) ((x)->gpr[3]) 226 | #define PT_REGS_PARM2(x) ((x)->gpr[4]) 227 | #define PT_REGS_PARM3(x) ((x)->gpr[5]) 228 | #define PT_REGS_PARM4(x) ((x)->gpr[6]) 229 | #define PT_REGS_PARM5(x) ((x)->gpr[7]) 230 | #define PT_REGS_RC(x) ((x)->gpr[3]) 231 | #define PT_REGS_SP(x) ((x)->sp) 232 | #define PT_REGS_IP(x) ((x)->nip) 233 | 234 | #elif defined(bpf_target_sparc) 235 | 236 | #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) 237 | #define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) 238 | #define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) 239 | #define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) 240 | #define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) 241 | #define PT_REGS_RET(x) ((x)->u_regs[UREG_I7]) 242 | #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) 243 | #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) 244 | 245 | /* Should this also be a bpf_target check for the sparc case? 
*/ 246 | #if defined(__arch64__) 247 | #define PT_REGS_IP(x) ((x)->tpc) 248 | #else 249 | #define PT_REGS_IP(x) ((x)->pc) 250 | #endif 251 | 252 | #endif 253 | 254 | #ifdef bpf_target_powerpc 255 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) 256 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 257 | #elif bpf_target_sparc 258 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) 259 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 260 | #else 261 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ 262 | bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) 263 | #define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ 264 | bpf_probe_read(&(ip), sizeof(ip), \ 265 | (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) 266 | #endif 267 | 268 | #endif 269 | -------------------------------------------------------------------------------- /bpf_load.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | /* 3 | * Notice: Modified copy of kernel/samples/bpf/bpf_load.c 4 | * - Up-to-date with kernel v4.14-rc8 5 | * 6 | * Added features: 7 | * - Fixed load order of prog_fd[] program sections 8 | */ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include "libbpf.h" 35 | #include "bpf_load.h" 36 | #include "perf-sys.h" 37 | 38 | #define DEBUGFS "/sys/kernel/debug/tracing/" 39 | 40 | static char license[128]; 41 | static int kern_version; 42 | static bool processed_sec[128]; 43 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; 44 | int map_fd[MAX_MAPS]; 45 | int prog_fd[MAX_PROGS]; 46 | int event_fd[MAX_PROGS]; 47 | int prog_cnt; 48 | int prog_array_fd = -1; 49 | 50 | 
struct bpf_map_data map_data[MAX_MAPS]; 51 | int map_data_count = 0; 52 | 53 | static int populate_prog_array(const char *event, int prog_fd) 54 | { 55 | int ind = atoi(event), err; 56 | 57 | err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); 58 | if (err < 0) { 59 | printf("failed to store prog_fd in prog_array\n"); 60 | return -1; 61 | } 62 | return 0; 63 | } 64 | 65 | static int load_and_attach(const char *event, struct bpf_insn *prog, int size) 66 | { 67 | bool is_socket = strncmp(event, "socket", 6) == 0; 68 | bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; 69 | bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; 70 | bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; 71 | bool is_xdp = strncmp(event, "xdp", 3) == 0; 72 | bool is_perf_event = strncmp(event, "perf_event", 10) == 0; 73 | bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; 74 | bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; 75 | bool is_sockops = strncmp(event, "sockops", 7) == 0; 76 | bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; 77 | size_t insns_cnt = size / sizeof(struct bpf_insn); 78 | enum bpf_prog_type prog_type; 79 | char buf[256]; 80 | int fd, efd, err, id; 81 | struct perf_event_attr attr = {}; 82 | 83 | attr.type = PERF_TYPE_TRACEPOINT; 84 | attr.sample_type = PERF_SAMPLE_RAW; 85 | attr.sample_period = 1; 86 | attr.wakeup_events = 1; 87 | 88 | if (is_socket) { 89 | prog_type = BPF_PROG_TYPE_SOCKET_FILTER; 90 | } else if (is_kprobe || is_kretprobe) { 91 | prog_type = BPF_PROG_TYPE_KPROBE; 92 | } else if (is_tracepoint) { 93 | prog_type = BPF_PROG_TYPE_TRACEPOINT; 94 | } else if (is_xdp) { 95 | prog_type = BPF_PROG_TYPE_XDP; 96 | } else if (is_perf_event) { 97 | prog_type = BPF_PROG_TYPE_PERF_EVENT; 98 | } else if (is_cgroup_skb) { 99 | prog_type = BPF_PROG_TYPE_CGROUP_SKB; 100 | } else if (is_cgroup_sk) { 101 | prog_type = BPF_PROG_TYPE_CGROUP_SOCK; 102 | } else if (is_sockops) { 103 | prog_type = BPF_PROG_TYPE_SOCK_OPS; 104 
| } else if (is_sk_skb) { 105 | prog_type = BPF_PROG_TYPE_SK_SKB; 106 | } else { 107 | printf("Unknown event '%s'\n", event); 108 | return -1; 109 | } 110 | 111 | fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, 112 | bpf_log_buf, BPF_LOG_BUF_SIZE); 113 | if (fd < 0) { 114 | printf("bpf_load_program(prog_cnt=%d) err=%d\n%s", 115 | prog_cnt, errno, bpf_log_buf); 116 | return -1; 117 | } 118 | 119 | prog_fd[prog_cnt++] = fd; 120 | 121 | if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) 122 | return 0; 123 | 124 | if (is_socket || is_sockops || is_sk_skb) { 125 | if (is_socket) 126 | event += 6; 127 | else 128 | event += 7; 129 | if (*event != '/') 130 | return 0; 131 | event++; 132 | if (!isdigit(*event)) { 133 | printf("invalid prog number\n"); 134 | return -1; 135 | } 136 | return populate_prog_array(event, fd); 137 | } 138 | 139 | if (is_kprobe || is_kretprobe) { 140 | if (is_kprobe) 141 | event += 7; 142 | else 143 | event += 10; 144 | 145 | if (*event == 0) { 146 | printf("event name cannot be empty\n"); 147 | return -1; 148 | } 149 | 150 | if (isdigit(*event)) 151 | return populate_prog_array(event, fd); 152 | 153 | snprintf(buf, sizeof(buf), 154 | "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", 155 | is_kprobe ? 
'p' : 'r', event, event); 156 | err = system(buf); 157 | if (err < 0) { 158 | printf("failed to create kprobe '%s' error '%s'\n", 159 | event, strerror(errno)); 160 | return -1; 161 | } 162 | 163 | strcpy(buf, DEBUGFS); 164 | strcat(buf, "events/kprobes/"); 165 | strcat(buf, event); 166 | strcat(buf, "/id"); 167 | } else if (is_tracepoint) { 168 | event += 11; 169 | 170 | if (*event == 0) { 171 | printf("event name cannot be empty\n"); 172 | return -1; 173 | } 174 | strcpy(buf, DEBUGFS); 175 | strcat(buf, "events/"); 176 | strcat(buf, event); 177 | strcat(buf, "/id"); 178 | } 179 | 180 | efd = open(buf, O_RDONLY, 0); 181 | if (efd < 0) { 182 | printf("failed to open event %s\n", event); 183 | return -1; 184 | } 185 | 186 | err = read(efd, buf, sizeof(buf)); 187 | if (err < 0 || err >= sizeof(buf)) { 188 | printf("read from '%s' failed '%s'\n", event, strerror(errno)); 189 | return -1; 190 | } 191 | 192 | close(efd); 193 | 194 | buf[err] = 0; 195 | id = atoi(buf); 196 | attr.config = id; 197 | 198 | efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); 199 | if (efd < 0) { 200 | printf("event %d fd %d err %s\n", id, efd, strerror(errno)); 201 | return -1; 202 | } 203 | event_fd[prog_cnt - 1] = efd; 204 | err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); 205 | if (err < 0) { 206 | printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", 207 | strerror(errno)); 208 | return -1; 209 | } 210 | err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); 211 | if (err < 0) { 212 | printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", 213 | strerror(errno)); 214 | return -1; 215 | } 216 | 217 | return 0; 218 | } 219 | 220 | static int load_maps(struct bpf_map_data *maps, int nr_maps, 221 | fixup_map_cb fixup_map) 222 | { 223 | int i, numa_node; 224 | 225 | for (i = 0; i < nr_maps; i++) { 226 | if (fixup_map) { 227 | fixup_map(&maps[i], i); 228 | /* Allow userspace to assign map FD prior to creation */ 229 | if (maps[i].fd != -1) { 230 | map_fd[i] = maps[i].fd; 231 | 
continue; 232 | } 233 | } 234 | 235 | numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? 236 | maps[i].def.numa_node : -1; 237 | 238 | if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 239 | maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { 240 | int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; 241 | 242 | map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, 243 | maps[i].name, 244 | maps[i].def.key_size, 245 | inner_map_fd, 246 | maps[i].def.max_entries, 247 | maps[i].def.map_flags, 248 | numa_node); 249 | } else { 250 | map_fd[i] = bpf_create_map_node(maps[i].def.type, 251 | maps[i].name, 252 | maps[i].def.key_size, 253 | maps[i].def.value_size, 254 | maps[i].def.max_entries, 255 | maps[i].def.map_flags, 256 | numa_node); 257 | 258 | /* DEBUG 259 | printf("MapType: %d \n", maps[i].def.type); 260 | printf("MapName: %s \n", maps[i].name); 261 | printf("MapKeysize: %d \n", maps[i].def.key_size); 262 | printf("MapValusize: %d \n", maps[i].def.value_size); 263 | printf("MapMaxEnt: %d \n", maps[i].def.max_entries); 264 | printf("MapMapFlags: %d \n", maps[i].def.map_flags); 265 | printf("mapfd: %d \n", map_fd[i]); 266 | 267 | printf("Hello World2\n"); 268 | */ 269 | 270 | } 271 | if (map_fd[i] < 0) { 272 | printf("failed to create a map: %d %s\n", 273 | errno, strerror(errno)); 274 | return 1; 275 | } 276 | maps[i].fd = map_fd[i]; 277 | 278 | if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) 279 | prog_array_fd = map_fd[i]; 280 | } 281 | return 0; 282 | } 283 | 284 | static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, 285 | GElf_Shdr *shdr, Elf_Data **data) 286 | { 287 | Elf_Scn *scn; 288 | 289 | scn = elf_getscn(elf, i); 290 | if (!scn) 291 | return 1; 292 | 293 | if (gelf_getshdr(scn, shdr) != shdr) 294 | return 2; 295 | 296 | *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); 297 | if (!*shname || !shdr->sh_size) 298 | return 3; 299 | 300 | *data = elf_getdata(scn, 0); 301 | if (!*data || elf_getdata(scn, *data) != NULL) 302 | 
return 4; 303 | 304 | return 0; 305 | } 306 | 307 | static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, 308 | GElf_Shdr *shdr, struct bpf_insn *insn, 309 | struct bpf_map_data *maps, int nr_maps) 310 | { 311 | int i, nrels; 312 | 313 | nrels = shdr->sh_size / shdr->sh_entsize; 314 | 315 | for (i = 0; i < nrels; i++) { 316 | GElf_Sym sym; 317 | GElf_Rel rel; 318 | unsigned int insn_idx; 319 | bool match = false; 320 | int map_idx; 321 | 322 | gelf_getrel(data, i, &rel); 323 | 324 | insn_idx = rel.r_offset / sizeof(struct bpf_insn); 325 | 326 | gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); 327 | 328 | if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 329 | printf("invalid relo for insn[%d].code 0x%x\n", 330 | insn_idx, insn[insn_idx].code); 331 | return 1; 332 | } 333 | insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 334 | 335 | /* Match FD relocation against recorded map_data[] offset */ 336 | for (map_idx = 0; map_idx < nr_maps; map_idx++) { 337 | if (maps[map_idx].elf_offset == sym.st_value) { 338 | match = true; 339 | break; 340 | } 341 | } 342 | if (match) { 343 | insn[insn_idx].imm = maps[map_idx].fd; 344 | } else { 345 | printf("invalid relo for insn[%d] no map_data match\n", 346 | insn_idx); 347 | return 1; 348 | } 349 | } 350 | 351 | return 0; 352 | } 353 | 354 | static int cmp_symbols(const void *l, const void *r) 355 | { 356 | const GElf_Sym *lsym = (const GElf_Sym *)l; 357 | const GElf_Sym *rsym = (const GElf_Sym *)r; 358 | 359 | if (lsym->st_value < rsym->st_value) 360 | return -1; 361 | else if (lsym->st_value > rsym->st_value) 362 | return 1; 363 | else 364 | return 0; 365 | } 366 | 367 | static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, 368 | Elf *elf, Elf_Data *symbols, int strtabidx) 369 | { 370 | int map_sz_elf, map_sz_copy; 371 | bool validate_zero = false; 372 | Elf_Data *data_maps; 373 | int i, nr_maps; 374 | GElf_Sym *sym; 375 | Elf_Scn *scn; 376 | 377 | if (maps_shndx < 0) 378 | return 
-EINVAL; 379 | if (!symbols) 380 | return -EINVAL; 381 | 382 | /* Get data for maps section via elf index */ 383 | scn = elf_getscn(elf, maps_shndx); 384 | if (scn) 385 | data_maps = elf_getdata(scn, NULL); 386 | if (!scn || !data_maps) { 387 | printf("Failed to get Elf_Data from maps section %d\n", 388 | maps_shndx); 389 | return -EINVAL; 390 | } 391 | 392 | /* For each map get corrosponding symbol table entry */ 393 | sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); 394 | for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { 395 | assert(nr_maps < MAX_MAPS+1); 396 | if (!gelf_getsym(symbols, i, &sym[nr_maps])) 397 | continue; 398 | if (sym[nr_maps].st_shndx != maps_shndx) 399 | continue; 400 | /* Only increment iif maps section */ 401 | nr_maps++; 402 | } 403 | 404 | /* Align to map_fd[] order, via sort on offset in sym.st_value */ 405 | qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); 406 | 407 | /* Keeping compatible with ELF maps section changes 408 | * ------------------------------------------------ 409 | * The program size of struct bpf_map_def is known by loader 410 | * code, but struct stored in ELF file can be different. 411 | * 412 | * Unfortunately sym[i].st_size is zero. To calculate the 413 | * struct size stored in the ELF file, assume all struct have 414 | * the same size, and simply divide with number of map 415 | * symbols. 416 | */ 417 | map_sz_elf = data_maps->d_size / nr_maps; 418 | map_sz_copy = sizeof(struct bpf_map_def); 419 | if (map_sz_elf < map_sz_copy) { 420 | /* 421 | * Backward compat, loading older ELF file with 422 | * smaller struct, keeping remaining bytes zero. 423 | */ 424 | map_sz_copy = map_sz_elf; 425 | } else if (map_sz_elf > map_sz_copy) { 426 | /* 427 | * Forward compat, loading newer ELF file with larger 428 | * struct with unknown features. Assume zero means 429 | * feature not used. Thus, validate rest of struct 430 | * data is zero. 
431 | */ 432 | validate_zero = true; 433 | } 434 | 435 | /* Memcpy relevant part of ELF maps data to loader maps */ 436 | for (i = 0; i < nr_maps; i++) { 437 | unsigned char *addr, *end; 438 | struct bpf_map_def *def; 439 | const char *map_name; 440 | size_t offset; 441 | 442 | map_name = elf_strptr(elf, strtabidx, sym[i].st_name); 443 | maps[i].name = strdup(map_name); 444 | if (!maps[i].name) { 445 | printf("strdup(%s): %s(%d)\n", map_name, 446 | strerror(errno), errno); 447 | free(sym); 448 | return -errno; 449 | } 450 | 451 | /* Symbol value is offset into ELF maps section data area */ 452 | offset = sym[i].st_value; 453 | def = (struct bpf_map_def *)(data_maps->d_buf + offset); 454 | maps[i].elf_offset = offset; 455 | memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); 456 | memcpy(&maps[i].def, def, map_sz_copy); 457 | 458 | /* Verify no newer features were requested */ 459 | if (validate_zero) { 460 | addr = (unsigned char*) def + map_sz_copy; 461 | end = (unsigned char*) def + map_sz_elf; 462 | for (; addr < end; addr++) { 463 | if (*addr != 0) { 464 | free(sym); 465 | return -EFBIG; 466 | } 467 | } 468 | } 469 | } 470 | 471 | free(sym); 472 | return nr_maps; 473 | } 474 | 475 | static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) 476 | { 477 | int fd, i, ret, maps_shndx = -1, strtabidx = -1; 478 | Elf *elf; 479 | GElf_Ehdr ehdr; 480 | GElf_Shdr shdr, shdr_prog; 481 | Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; 482 | char *shname, *shname_prog; 483 | int nr_maps = 0; 484 | 485 | /* reset global variables */ 486 | kern_version = 0; 487 | memset(license, 0, sizeof(license)); 488 | memset(processed_sec, 0, sizeof(processed_sec)); 489 | 490 | if (elf_version(EV_CURRENT) == EV_NONE) 491 | return 1; 492 | 493 | fd = open(path, O_RDONLY, 0); 494 | if (fd < 0) 495 | return 1; 496 | 497 | elf = elf_begin(fd, ELF_C_READ, NULL); 498 | 499 | if (!elf) 500 | return 1; 501 | 502 | if (gelf_getehdr(elf, &ehdr) != &ehdr) 503 | return 
1; 504 | 505 | /* clear all kprobes */ 506 | i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); 507 | 508 | /* scan over all elf sections to get license and map info */ 509 | for (i = 1; i < ehdr.e_shnum; i++) { 510 | 511 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 512 | continue; 513 | 514 | if (0) /* helpful for llvm debugging */ 515 | printf("section %d:%s data %p size %zd link %d flags %d\n", 516 | i, shname, data->d_buf, data->d_size, 517 | shdr.sh_link, (int) shdr.sh_flags); 518 | 519 | if (strcmp(shname, "license") == 0) { 520 | processed_sec[i] = true; 521 | memcpy(license, data->d_buf, data->d_size); 522 | } else if (strcmp(shname, "version") == 0) { 523 | processed_sec[i] = true; 524 | if (data->d_size != sizeof(int)) { 525 | printf("invalid size of version section %zd\n", 526 | data->d_size); 527 | return 1; 528 | } 529 | memcpy(&kern_version, data->d_buf, sizeof(int)); 530 | } else if (strcmp(shname, "maps") == 0) { 531 | int j; 532 | 533 | maps_shndx = i; 534 | data_maps = data; 535 | for (j = 0; j < MAX_MAPS; j++) 536 | map_data[j].fd = -1; 537 | } else if (shdr.sh_type == SHT_SYMTAB) { 538 | strtabidx = shdr.sh_link; 539 | symbols = data; 540 | } 541 | } 542 | 543 | ret = 1; 544 | 545 | if (!symbols) { 546 | printf("missing SHT_SYMTAB section\n"); 547 | goto done; 548 | } 549 | 550 | if (data_maps) { 551 | nr_maps = load_elf_maps_section(map_data, maps_shndx, 552 | elf, symbols, strtabidx); 553 | if (nr_maps < 0) { 554 | printf("Error: Failed loading ELF maps (errno:%d):%s\n", 555 | nr_maps, strerror(-nr_maps)); 556 | ret = 1; 557 | goto done; 558 | } 559 | if (load_maps(map_data, nr_maps, fixup_map)) 560 | goto done; 561 | map_data_count = nr_maps; 562 | 563 | processed_sec[maps_shndx] = true; 564 | } 565 | 566 | /* process all relo sections, and rewrite bpf insns for maps */ 567 | for (i = 1; i < ehdr.e_shnum; i++) { 568 | if (processed_sec[i]) 569 | continue; 570 | 571 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 
572 | continue; 573 | 574 | if (shdr.sh_type == SHT_REL) { 575 | struct bpf_insn *insns; 576 | 577 | /* locate prog sec that need map fixup (relocations) */ 578 | if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, 579 | &shdr_prog, &data_prog)) 580 | continue; 581 | 582 | if (shdr_prog.sh_type != SHT_PROGBITS || 583 | !(shdr_prog.sh_flags & SHF_EXECINSTR)) 584 | continue; 585 | 586 | insns = (struct bpf_insn *) data_prog->d_buf; 587 | processed_sec[i] = true; /* relo section */ 588 | 589 | if (parse_relo_and_apply(data, symbols, &shdr, insns, 590 | map_data, nr_maps)) 591 | continue; 592 | } 593 | } 594 | 595 | /* load programs */ 596 | for (i = 1; i < ehdr.e_shnum; i++) { 597 | 598 | if (processed_sec[i]) 599 | continue; 600 | 601 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 602 | continue; 603 | 604 | if (memcmp(shname, "kprobe/", 7) == 0 || 605 | memcmp(shname, "kretprobe/", 10) == 0 || 606 | memcmp(shname, "tracepoint/", 11) == 0 || 607 | memcmp(shname, "xdp", 3) == 0 || 608 | memcmp(shname, "perf_event", 10) == 0 || 609 | memcmp(shname, "socket", 6) == 0 || 610 | memcmp(shname, "cgroup/", 7) == 0 || 611 | memcmp(shname, "sockops", 7) == 0 || 612 | memcmp(shname, "sk_skb", 6) == 0) { 613 | ret = load_and_attach(shname, data->d_buf, 614 | data->d_size); 615 | if (ret != 0) 616 | goto done; 617 | } 618 | } 619 | 620 | ret = 0; 621 | done: 622 | close(fd); 623 | return ret; 624 | } 625 | 626 | int load_bpf_file(char *path) 627 | { 628 | return do_load_bpf_file(path, NULL); 629 | } 630 | 631 | int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) 632 | { 633 | return do_load_bpf_file(path, fixup_map); 634 | } 635 | 636 | void read_trace_pipe(void) 637 | { 638 | int trace_fd; 639 | 640 | trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); 641 | if (trace_fd < 0) 642 | return; 643 | 644 | while (1) { 645 | static char buf[4096]; 646 | ssize_t sz; 647 | 648 | sz = read(trace_fd, buf, sizeof(buf)); 649 | if (sz > 0) { 650 | buf[sz] = 0; 651 | 
puts(buf); 652 | } 653 | } 654 | } 655 | 656 | #define MAX_SYMS 300000 657 | static struct ksym syms[MAX_SYMS]; 658 | static int sym_cnt; 659 | 660 | static int ksym_cmp(const void *p1, const void *p2) 661 | { 662 | return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; 663 | } 664 | 665 | int load_kallsyms(void) 666 | { 667 | FILE *f = fopen("/proc/kallsyms", "r"); 668 | char func[256], buf[256]; 669 | char symbol; 670 | void *addr; 671 | int i = 0; 672 | 673 | if (!f) 674 | return -ENOENT; 675 | 676 | while (!feof(f)) { 677 | if (!fgets(buf, sizeof(buf), f)) 678 | break; 679 | if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) 680 | break; 681 | if (!addr) 682 | continue; 683 | syms[i].addr = (long) addr; 684 | syms[i].name = strdup(func); 685 | i++; 686 | } 687 | sym_cnt = i; 688 | qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); 689 | return 0; 690 | } 691 | 692 | struct ksym *ksym_search(long key) 693 | { 694 | int start = 0, end = sym_cnt; 695 | int result; 696 | 697 | while (start < end) { 698 | size_t mid = start + (end - start) / 2; 699 | 700 | result = key - syms[mid].addr; 701 | if (result < 0) 702 | end = mid; 703 | else if (result > 0) 704 | start = mid + 1; 705 | else 706 | return &syms[mid]; 707 | } 708 | 709 | if (start >= 1 && syms[start - 1].addr < key && 710 | key < syms[start].addr) 711 | /* valid ksym */ 712 | return &syms[start - 1]; 713 | 714 | /* out of range. 
return _stext */ 715 | return &syms[0]; 716 | } 717 | 718 | int set_link_xdp_fd(int ifindex, int fd, __u32 flags) 719 | { 720 | struct sockaddr_nl sa; 721 | int sock, seq = 0, len, ret = -1; 722 | char buf[4096]; 723 | struct nlattr *nla, *nla_xdp; 724 | struct { 725 | struct nlmsghdr nh; 726 | struct ifinfomsg ifinfo; 727 | char attrbuf[64]; 728 | } req; 729 | struct nlmsghdr *nh; 730 | struct nlmsgerr *err; 731 | 732 | memset(&sa, 0, sizeof(sa)); 733 | sa.nl_family = AF_NETLINK; 734 | 735 | sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); 736 | if (sock < 0) { 737 | printf("open netlink socket: %s\n", strerror(errno)); 738 | return -1; 739 | } 740 | 741 | if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { 742 | printf("bind to netlink: %s\n", strerror(errno)); 743 | goto cleanup; 744 | } 745 | 746 | memset(&req, 0, sizeof(req)); 747 | req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); 748 | req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; 749 | req.nh.nlmsg_type = RTM_SETLINK; 750 | req.nh.nlmsg_pid = 0; 751 | req.nh.nlmsg_seq = ++seq; 752 | req.ifinfo.ifi_family = AF_UNSPEC; 753 | req.ifinfo.ifi_index = ifindex; 754 | 755 | /* started nested attribute for XDP */ 756 | nla = (struct nlattr *)(((char *)&req) 757 | + NLMSG_ALIGN(req.nh.nlmsg_len)); 758 | nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; 759 | nla->nla_len = NLA_HDRLEN; 760 | 761 | /* add XDP fd */ 762 | nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); 763 | nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; 764 | nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); 765 | memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); 766 | nla->nla_len += nla_xdp->nla_len; 767 | 768 | /* if user passed in any flags, add those too */ 769 | if (flags) { 770 | nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); 771 | nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/; 772 | nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); 773 | memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); 774 | nla->nla_len += 
nla_xdp->nla_len; 775 | } 776 | 777 | req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); 778 | 779 | if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { 780 | printf("send to netlink: %s\n", strerror(errno)); 781 | goto cleanup; 782 | } 783 | 784 | len = recv(sock, buf, sizeof(buf), 0); 785 | if (len < 0) { 786 | printf("recv from netlink: %s\n", strerror(errno)); 787 | goto cleanup; 788 | } 789 | 790 | for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); 791 | nh = NLMSG_NEXT(nh, len)) { 792 | if (nh->nlmsg_pid != getpid()) { 793 | printf("Wrong pid %d, expected %d\n", 794 | nh->nlmsg_pid, getpid()); 795 | goto cleanup; 796 | } 797 | if (nh->nlmsg_seq != seq) { 798 | printf("Wrong seq %d, expected %d\n", 799 | nh->nlmsg_seq, seq); 800 | goto cleanup; 801 | } 802 | switch (nh->nlmsg_type) { 803 | case NLMSG_ERROR: 804 | err = (struct nlmsgerr *)NLMSG_DATA(nh); 805 | if (!err->error) 806 | continue; 807 | printf("nlmsg error %s\n", strerror(-err->error)); 808 | goto cleanup; 809 | case NLMSG_DONE: 810 | break; 811 | } 812 | } 813 | 814 | ret = 0; 815 | 816 | cleanup: 817 | close(sock); 818 | return ret; 819 | } 820 | -------------------------------------------------------------------------------- /bpf_load.h: -------------------------------------------------------------------------------- 1 | /* Copy of samples/bpf/bpf_load.h */ 2 | #ifndef __BPF_LOAD_H 3 | #define __BPF_LOAD_H 4 | 5 | #include "libbpf.h" 6 | 7 | #define MAX_MAPS 32 8 | #define MAX_PROGS 32 9 | 10 | struct bpf_map_def { 11 | unsigned int type; 12 | unsigned int key_size; 13 | unsigned int value_size; 14 | unsigned int max_entries; 15 | unsigned int map_flags; 16 | unsigned int inner_map_idx; 17 | unsigned int numa_node; 18 | }; 19 | 20 | struct bpf_map_data { 21 | int fd; 22 | char *name; 23 | size_t elf_offset; 24 | struct bpf_map_def def; 25 | }; 26 | 27 | typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); 28 | 29 | extern int prog_fd[MAX_PROGS]; 30 | extern int event_fd[MAX_PROGS]; 31 | 
extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; 32 | extern int prog_cnt; 33 | 34 | /* There is a one-to-one mapping between map_fd[] and map_data[]. 35 | * The map_data[] just contains more rich info on the given map. 36 | */ 37 | extern int map_fd[MAX_MAPS]; 38 | extern struct bpf_map_data map_data[MAX_MAPS]; 39 | extern int map_data_count; 40 | 41 | /* parses elf file compiled by llvm .c->.o 42 | * . parses 'maps' section and creates maps via BPF syscall 43 | * . parses 'license' section and passes it to syscall 44 | * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by 45 | * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD 46 | * . loads eBPF programs via BPF syscall 47 | * 48 | * One ELF file can contain multiple BPF programs which will be loaded 49 | * and their FDs stored stored in prog_fd array 50 | * 51 | * returns zero on success 52 | */ 53 | int load_bpf_file(char *path); 54 | int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); 55 | 56 | void read_trace_pipe(void); 57 | struct ksym { 58 | long addr; 59 | char *name; 60 | }; 61 | 62 | int load_kallsyms(void); 63 | struct ksym *ksym_search(long key); 64 | 65 | /* UAPI XDP_FLAGS avail in include/linux/if_link.h, but distro are 66 | * lacking behind. 
67 | */ 68 | #ifndef XDP_FLAGS_UPDATE_IF_NOEXIST 69 | #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) 70 | #endif 71 | /* Since v4.12-rc1 : b5cdae3291f7 ("net: Generic XDP") */ 72 | #ifndef XDP_FLAGS_SKB_MODE 73 | #define XDP_FLAGS_SKB_MODE (1U << 1) 74 | #endif 75 | /* Since: v4.12-rc2 : 0489df9a430e ("xdp: add flag to enforce driver mode") */ 76 | #ifndef XDP_FLAGS_DRV_MODE 77 | #define XDP_FLAGS_DRV_MODE (1U << 2) 78 | #endif 79 | /* Since: v4.13-rc1 / ee5d032f7d03 ("xdp: add HW offload mode flag for installing programs")*/ 80 | #ifndef XDP_FLAGS_HW_MODE 81 | #define XDP_FLAGS_HW_MODE (1U << 3) 82 | #undef XDP_FLAGS_MODES 83 | #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ 84 | XDP_FLAGS_DRV_MODE | \ 85 | XDP_FLAGS_HW_MODE) 86 | #undef XDP_FLAGS_MASK 87 | #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ 88 | XDP_FLAGS_MODES) 89 | #endif 90 | 91 | int set_link_xdp_fd(int ifindex, int fd, __u32 flags); 92 | #endif 93 | -------------------------------------------------------------------------------- /bpf_util.h: -------------------------------------------------------------------------------- 1 | #ifndef __BPF_UTIL__ 2 | #define __BPF_UTIL__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | static inline unsigned int bpf_num_possible_cpus(void) 10 | { 11 | static const char *fcpu = "/sys/devices/system/cpu/possible"; 12 | unsigned int start, end, possible_cpus = 0; 13 | char buff[128]; 14 | FILE *fp; 15 | 16 | fp = fopen(fcpu, "r"); 17 | if (!fp) { 18 | printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno)); 19 | exit(1); 20 | } 21 | 22 | while (fgets(buff, sizeof(buff), fp)) { 23 | if (sscanf(buff, "%u-%u", &start, &end) == 2) { 24 | possible_cpus = start == 0 ? 
end + 1 : 0; 25 | break; 26 | } 27 | } 28 | 29 | fclose(fp); 30 | if (!possible_cpus) { 31 | printf("Failed to retrieve # possible CPUs!\n"); 32 | exit(1); 33 | } 34 | 35 | return possible_cpus; 36 | } 37 | 38 | #endif /* __BPF_UTIL__ */ 39 | -------------------------------------------------------------------------------- /icmp.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | #define PACKETSIZE 64 4 | struct packet 5 | { 6 | struct icmphdr hdr; 7 | char msg[PACKETSIZE-sizeof(struct icmphdr)]; 8 | }; 9 | 10 | int pid=-1; 11 | struct protoent *proto=NULL; 12 | 13 | unsigned short checksum(void *b, int len) 14 | { unsigned short *buf = b; 15 | unsigned int sum=0; 16 | unsigned short result; 17 | 18 | for ( sum = 0; len > 1; len -= 2 ) 19 | sum += *buf++; 20 | if ( len == 1 ) 21 | sum += *(unsigned char*)buf; 22 | sum = (sum >> 16) + (sum & 0xFFFF); 23 | sum += (sum >> 16); 24 | result = ~sum; 25 | return result; 26 | } 27 | 28 | void ping(struct sockaddr_in *addr) 29 | { const int val=255; 30 | int i, sd, cnt=1; 31 | struct packet pckt; 32 | 33 | sd = socket(PF_INET, SOCK_RAW, proto->p_proto); 34 | if ( sd < 0 ) 35 | { 36 | perror("socket"); 37 | return; 38 | } 39 | if ( setsockopt(sd, SOL_IP, IP_TTL, &val, sizeof(val)) != 0) 40 | perror("Set TTL option"); 41 | if ( fcntl(sd, F_SETFL, O_NONBLOCK) != 0 ) 42 | perror("Request nonblocking I/O"); 43 | 44 | if (DEBUG) printf("ICMP #%d sent.\n", cnt); 45 | bzero(&pckt, sizeof(pckt)); 46 | pckt.hdr.type = ICMP_ECHO; 47 | pckt.hdr.un.echo.id = pid; 48 | for ( i = 0; i < sizeof(pckt.msg)-1; i++ ) 49 | pckt.msg[i] = i+'0'; 50 | pckt.msg[i] = 0; 51 | pckt.hdr.un.echo.sequence = cnt++; 52 | pckt.hdr.checksum = checksum(&pckt, sizeof(pckt)); 53 | if ( sendto(sd, &pckt, sizeof(pckt), 0, (struct sockaddr*)addr, sizeof(*addr)) <= 0 ) 54 | perror("sendto"); 55 | } 56 | 57 | int icmp_send_1pkt(in_addr_t *dst_ip) 58 | { 59 | struct sockaddr_in addr; 60 | 61 | proto = 
getprotobyname("ICMP"); 62 | bzero(&addr, sizeof(addr)); 63 | addr.sin_family = AF_INET; 64 | addr.sin_port = 0; 65 | addr.sin_addr.s_addr = *dst_ip; 66 | ping(&addr); 67 | 68 | return 0; 69 | } 70 | 71 | -------------------------------------------------------------------------------- /libbpf.h: -------------------------------------------------------------------------------- 1 | /* Copied from $(KERNEL)/samples/bpf/libbpf.h 2 | * WARNING: Don't confuse this with tools/lib/bpf/libbpf.h 3 | */ 4 | /* eBPF mini library */ 5 | #ifndef __LIBBPF_H 6 | #define __LIBBPF_H 7 | 8 | /* Notice: This include is tricky because, due to Makefile 9 | * construct of -I$(KERNEL)/tools/lib/ this include find 10 | * tools/lib/bpf/bpf.h which defines the userspace API 11 | */ 12 | #include 13 | 14 | struct bpf_insn; 15 | 16 | /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ 17 | 18 | #define BPF_ALU64_REG(OP, DST, SRC) \ 19 | ((struct bpf_insn) { \ 20 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ 21 | .dst_reg = DST, \ 22 | .src_reg = SRC, \ 23 | .off = 0, \ 24 | .imm = 0 }) 25 | 26 | #define BPF_ALU32_REG(OP, DST, SRC) \ 27 | ((struct bpf_insn) { \ 28 | .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ 29 | .dst_reg = DST, \ 30 | .src_reg = SRC, \ 31 | .off = 0, \ 32 | .imm = 0 }) 33 | 34 | /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ 35 | 36 | #define BPF_ALU64_IMM(OP, DST, IMM) \ 37 | ((struct bpf_insn) { \ 38 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ 39 | .dst_reg = DST, \ 40 | .src_reg = 0, \ 41 | .off = 0, \ 42 | .imm = IMM }) 43 | 44 | #define BPF_ALU32_IMM(OP, DST, IMM) \ 45 | ((struct bpf_insn) { \ 46 | .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ 47 | .dst_reg = DST, \ 48 | .src_reg = 0, \ 49 | .off = 0, \ 50 | .imm = IMM }) 51 | 52 | /* Short form of mov, dst_reg = src_reg */ 53 | 54 | #define BPF_MOV64_REG(DST, SRC) \ 55 | ((struct bpf_insn) { \ 56 | .code = BPF_ALU64 | BPF_MOV | BPF_X, \ 57 | .dst_reg = DST, \ 58 | .src_reg = SRC, \ 59 | .off = 0, \ 60 | 
.imm = 0 }) 61 | 62 | #define BPF_MOV32_REG(DST, SRC) \ 63 | ((struct bpf_insn) { \ 64 | .code = BPF_ALU | BPF_MOV | BPF_X, \ 65 | .dst_reg = DST, \ 66 | .src_reg = SRC, \ 67 | .off = 0, \ 68 | .imm = 0 }) 69 | 70 | /* Short form of mov, dst_reg = imm32 */ 71 | 72 | #define BPF_MOV64_IMM(DST, IMM) \ 73 | ((struct bpf_insn) { \ 74 | .code = BPF_ALU64 | BPF_MOV | BPF_K, \ 75 | .dst_reg = DST, \ 76 | .src_reg = 0, \ 77 | .off = 0, \ 78 | .imm = IMM }) 79 | 80 | #define BPF_MOV32_IMM(DST, IMM) \ 81 | ((struct bpf_insn) { \ 82 | .code = BPF_ALU | BPF_MOV | BPF_K, \ 83 | .dst_reg = DST, \ 84 | .src_reg = 0, \ 85 | .off = 0, \ 86 | .imm = IMM }) 87 | 88 | /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ 89 | #define BPF_LD_IMM64(DST, IMM) \ 90 | BPF_LD_IMM64_RAW(DST, 0, IMM) 91 | 92 | #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ 93 | ((struct bpf_insn) { \ 94 | .code = BPF_LD | BPF_DW | BPF_IMM, \ 95 | .dst_reg = DST, \ 96 | .src_reg = SRC, \ 97 | .off = 0, \ 98 | .imm = (__u32) (IMM) }), \ 99 | ((struct bpf_insn) { \ 100 | .code = 0, /* zero is reserved opcode */ \ 101 | .dst_reg = 0, \ 102 | .src_reg = 0, \ 103 | .off = 0, \ 104 | .imm = ((__u64) (IMM)) >> 32 }) 105 | 106 | #ifndef BPF_PSEUDO_MAP_FD 107 | # define BPF_PSEUDO_MAP_FD 1 108 | #endif 109 | 110 | /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ 111 | #define BPF_LD_MAP_FD(DST, MAP_FD) \ 112 | BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) 113 | 114 | 115 | /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ 116 | 117 | #define BPF_LD_ABS(SIZE, IMM) \ 118 | ((struct bpf_insn) { \ 119 | .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ 120 | .dst_reg = 0, \ 121 | .src_reg = 0, \ 122 | .off = 0, \ 123 | .imm = IMM }) 124 | 125 | /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ 126 | 127 | #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ 128 | ((struct bpf_insn) { \ 129 | .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ 130 | .dst_reg = DST, \ 131 | .src_reg = SRC, \ 
132 | .off = OFF, \ 133 | .imm = 0 }) 134 | 135 | /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ 136 | 137 | #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ 138 | ((struct bpf_insn) { \ 139 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ 140 | .dst_reg = DST, \ 141 | .src_reg = SRC, \ 142 | .off = OFF, \ 143 | .imm = 0 }) 144 | 145 | /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ 146 | 147 | #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ 148 | ((struct bpf_insn) { \ 149 | .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ 150 | .dst_reg = DST, \ 151 | .src_reg = 0, \ 152 | .off = OFF, \ 153 | .imm = IMM }) 154 | 155 | /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ 156 | 157 | #define BPF_JMP_REG(OP, DST, SRC, OFF) \ 158 | ((struct bpf_insn) { \ 159 | .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ 160 | .dst_reg = DST, \ 161 | .src_reg = SRC, \ 162 | .off = OFF, \ 163 | .imm = 0 }) 164 | 165 | /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ 166 | 167 | #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ 168 | ((struct bpf_insn) { \ 169 | .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ 170 | .dst_reg = DST, \ 171 | .src_reg = 0, \ 172 | .off = OFF, \ 173 | .imm = IMM }) 174 | 175 | /* Raw code statement block */ 176 | 177 | #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ 178 | ((struct bpf_insn) { \ 179 | .code = CODE, \ 180 | .dst_reg = DST, \ 181 | .src_reg = SRC, \ 182 | .off = OFF, \ 183 | .imm = IMM }) 184 | 185 | /* Program exit */ 186 | 187 | #define BPF_EXIT_INSN() \ 188 | ((struct bpf_insn) { \ 189 | .code = BPF_JMP | BPF_EXIT, \ 190 | .dst_reg = 0, \ 191 | .src_reg = 0, \ 192 | .off = 0, \ 193 | .imm = 0 }) 194 | 195 | #endif 196 | -------------------------------------------------------------------------------- /mac.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | int xlb_get_mac(in_addr_t *host, char *mac, int *dev){ 4 | 5 | int s; 6 | 7 | 
struct arpreq req; 8 | struct sockaddr_in *sin; 9 | static char buf[256]; 10 | 11 | bzero((caddr_t)&req, sizeof(req)); 12 | 13 | sin = (struct sockaddr_in *)&req.arp_pa; 14 | sin->sin_family = AF_INET; 15 | sin->sin_addr.s_addr = *host; 16 | 17 | if((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0){ 18 | perror("socket() failed."); 19 | exit(-1); 20 | } 21 | 22 | if_indextoname(*dev, req.arp_dev); 23 | if (DEBUG) printf("ifname= %s\n", req.arp_dev); 24 | 25 | if(ioctl(s, SIOCGARP, (caddr_t)&req) <0){ 26 | if(errno == ENXIO){ 27 | 28 | icmp_send_1pkt(&sin->sin_addr.s_addr); 29 | usleep(100000); 30 | 31 | if(ioctl(s, SIOCGARP, (caddr_t)&req) <0){ 32 | if(errno == ENXIO){ 33 | printf("%s - no entry.\n", inet_ntop(AF_INET, host, buf, 256)); 34 | // printf("%lu - no entry.\n", *host); 35 | exit(-1); 36 | } else { 37 | perror("SIOCGARP"); 38 | exit(-1); 39 | } 40 | } 41 | 42 | } else { 43 | perror("SIOCGARP"); 44 | exit(-1); 45 | } 46 | } 47 | 48 | if(!(req.arp_flags & ATF_COM)){ 49 | printf("Could not get workers Mac address from arp cache.\n"); 50 | exit(-1); 51 | } 52 | 53 | memcpy(mac, req.arp_ha.sa_data, 6); 54 | 55 | return(0); 56 | } 57 | 58 | -------------------------------------------------------------------------------- /perf-sys.h: -------------------------------------------------------------------------------- 1 | /* Notice: copy of kernel/tools/perf/perf-sys.h */ 2 | #ifndef _PERF_SYS_H 3 | #define _PERF_SYS_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | /* 11 | #include 12 | #include 13 | #include 14 | 15 | #if defined(__i386__) 16 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); 17 | #define CPUINFO_PROC {"model name"} 18 | #endif 19 | 20 | #if defined(__x86_64__) 21 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); 22 | #define CPUINFO_PROC {"model name"} 23 | #endif 24 | 25 | #ifdef __powerpc__ 26 | #define CPUINFO_PROC {"cpu"} 27 | #endif 28 | 29 | #ifdef __s390__ 30 | #define CPUINFO_PROC {"vendor_id"} 31 | #endif 32 
| 33 | #ifdef __sh__ 34 | #define CPUINFO_PROC {"cpu type"} 35 | #endif 36 | 37 | #ifdef __hppa__ 38 | #define CPUINFO_PROC {"cpu"} 39 | #endif 40 | 41 | #ifdef __sparc__ 42 | #define CPUINFO_PROC {"cpu"} 43 | #endif 44 | 45 | #ifdef __alpha__ 46 | #define CPUINFO_PROC {"cpu model"} 47 | #endif 48 | 49 | #ifdef __ia64__ 50 | #define cpu_relax() asm volatile ("hint @pause" ::: "memory") 51 | #define CPUINFO_PROC {"model name"} 52 | #endif 53 | 54 | #ifdef __arm__ 55 | #define CPUINFO_PROC {"model name", "Processor"} 56 | #endif 57 | 58 | #ifdef __aarch64__ 59 | #define cpu_relax() asm volatile("yield" ::: "memory") 60 | #endif 61 | 62 | #ifdef __mips__ 63 | #define CPUINFO_PROC {"cpu model"} 64 | #endif 65 | 66 | #ifdef __arc__ 67 | #define CPUINFO_PROC {"Processor"} 68 | #endif 69 | 70 | #ifdef __metag__ 71 | #define CPUINFO_PROC {"CPU"} 72 | #endif 73 | 74 | #ifdef __xtensa__ 75 | #define CPUINFO_PROC {"core ID"} 76 | #endif 77 | 78 | #ifdef __tile__ 79 | #define cpu_relax() asm volatile ("mfspr zero, PASS" ::: "memory") 80 | #define CPUINFO_PROC {"model name"} 81 | #endif 82 | 83 | #ifndef cpu_relax 84 | #define cpu_relax() barrier() 85 | #endif 86 | */ 87 | 88 | static inline int 89 | sys_perf_event_open(struct perf_event_attr *attr, 90 | pid_t pid, int cpu, int group_fd, 91 | unsigned long flags) 92 | { 93 | int fd; 94 | 95 | fd = syscall(__NR_perf_event_open, attr, pid, cpu, 96 | group_fd, flags); 97 | 98 | #ifdef HAVE_ATTR_TEST 99 | if (unlikely(test_attr__enabled)) 100 | test_attr__open(attr, pid, cpu, fd, group_fd, flags); 101 | #endif 102 | return fd; 103 | } 104 | 105 | #endif /* _PERF_SYS_H */ 106 | -------------------------------------------------------------------------------- /rmi.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | //#include 14 | #include 15 
| 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | 24 | /// icmp 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | /// 35 | 36 | #define IFLIST_REPLY_BUFFER 8192 37 | 38 | #ifndef DEBUG 39 | #define DEBUG 0 40 | #endif 41 | 42 | #define NLMSG_TAIL(nmsg) \ 43 | ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) 44 | 45 | int xlb_parse_route(struct nlmsghdr *nlh, in_addr_t *src_ip, in_addr_t *nh_ip, int *dev); 46 | int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, 47 | int alen); 48 | int xlb_iproute_get(in_addr_t *dst_ip, in_addr_t *src_ip , in_addr_t *nh_ip, int *dev); 49 | int xlb_get_mac(in_addr_t *host, char *mac, int *dev); 50 | 51 | unsigned short checksum(void *b, int len); 52 | void ping(struct sockaddr_in *addr); 53 | int icmp_send_1pkt(in_addr_t *dst_ip); 54 | 55 | -------------------------------------------------------------------------------- /route.c: -------------------------------------------------------------------------------- 1 | #include "rmi.h" 2 | 3 | int xlb_parse_route(struct nlmsghdr *nlh, in_addr_t *src_ip, in_addr_t *nh_ip, int *dev) 4 | { 5 | struct rtmsg *route_entry; 6 | struct rtattr *route_attribute; 7 | int route_attribute_len = 0; 8 | // unsigned char route_netmask = 0; 9 | // unsigned char route_protocol = 0; 10 | int via = 0; 11 | 12 | route_entry = (struct rtmsg *) NLMSG_DATA(nlh); 13 | 14 | if (route_entry->rtm_table != RT_TABLE_MAIN) 15 | return 1; 16 | 17 | // route_netmask = route_entry->rtm_dst_len; 18 | // route_protocol = route_entry->rtm_protocol; 19 | route_attribute = (struct rtattr *) RTM_RTA(route_entry); 20 | route_attribute_len = RTM_PAYLOAD(nlh); 21 | 22 | for ( ; RTA_OK(route_attribute, route_attribute_len); \ 23 | route_attribute = RTA_NEXT(route_attribute, route_attribute_len)) 24 | { 25 | 26 | if (route_attribute->rta_type == RTA_DST) 27 | if (via == 
0) 28 | memcpy(nh_ip, RTA_DATA(route_attribute), 4); 29 | 30 | if (route_attribute->rta_type == RTA_GATEWAY) 31 | { 32 | memcpy(nh_ip, RTA_DATA(route_attribute), 4); 33 | via = 1; 34 | } 35 | 36 | if (route_attribute->rta_type == RTA_PREFSRC) 37 | memcpy(src_ip, RTA_DATA(route_attribute), 4); 38 | 39 | if (route_attribute->rta_type == RTA_OIF) 40 | memcpy(dev, RTA_DATA(route_attribute), sizeof(int)); 41 | } 42 | 43 | return 0; 44 | } 45 | 46 | int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data, 47 | int alen) 48 | { 49 | int len = RTA_LENGTH(alen); 50 | struct rtattr *rta; 51 | 52 | if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) { 53 | fprintf(stderr, 54 | "addattr_l ERROR: message exceeded bound of %d\n", 55 | maxlen); 56 | return -1; 57 | } 58 | rta = NLMSG_TAIL(n); 59 | rta->rta_type = type; 60 | rta->rta_len = len; 61 | if (alen) 62 | memcpy(RTA_DATA(rta), data, alen); 63 | n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); 64 | return 0; 65 | } 66 | 67 | int xlb_iproute_get(in_addr_t *dst_ip, in_addr_t *src_ip , in_addr_t *nh_ip, int *dev) 68 | { 69 | struct msghdr rtnl_msg; 70 | struct iovec io; 71 | int fd; 72 | 73 | struct { 74 | struct nlmsghdr n; 75 | struct rtmsg r; 76 | char buf[1024]; 77 | } req; 78 | 79 | memset(&rtnl_msg, 0, sizeof(rtnl_msg)); 80 | memset(&req, 0, sizeof(req)); 81 | 82 | req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); 83 | req.n.nlmsg_flags = NLM_F_REQUEST; 84 | req.n.nlmsg_type = RTM_GETROUTE; 85 | req.r.rtm_family = AF_INET; 86 | 87 | 88 | addattr_l(&req.n, sizeof(req), RTA_DST, dst_ip, 4); 89 | 90 | io.iov_base = &req; 91 | io.iov_len = req.n.nlmsg_len; 92 | rtnl_msg.msg_iov = &io; 93 | rtnl_msg.msg_iovlen = 1; 94 | 95 | fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); 96 | sendmsg(fd, (struct msghdr *) &rtnl_msg, 0); 97 | 98 | /* parse reply */ 99 | 100 | { 101 | struct nlmsghdr *answer; 102 | struct msghdr rtnl_reply; 103 | struct iovec io_reply; 104 | char 
reply[IFLIST_REPLY_BUFFER]; 105 | 106 | 107 | memset(&io_reply, 0, sizeof(io_reply)); 108 | memset(&rtnl_reply, 0, sizeof(rtnl_reply)); 109 | 110 | io.iov_base = reply; 111 | io.iov_len = IFLIST_REPLY_BUFFER; 112 | rtnl_reply.msg_iov = &io; 113 | rtnl_reply.msg_iovlen = 1; 114 | 115 | recvmsg(fd, &rtnl_reply, 0); 116 | answer = (struct nlmsghdr *) reply; 117 | 118 | xlb_parse_route(answer, src_ip, nh_ip, dev); 119 | } 120 | 121 | close(fd); 122 | 123 | return 0; 124 | } 125 | 126 | -------------------------------------------------------------------------------- /tools/include/linux/bpf.h: -------------------------------------------------------------------------------- 1 | ../uapi/linux/bpf.h -------------------------------------------------------------------------------- /tools/include/uapi/linux/bpf.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 3 | * 4 | * Copy of kernel tools/include/uapi/linux/bpf.h 5 | * 6 | * This program is free software; you can redistribute it and/or 7 | * modify it under the terms of version 2 of the GNU General Public 8 | * License as published by the Free Software Foundation. 9 | */ 10 | 11 | /* NOTICE: Need to keep a more updated copy of bpf.h UAPI definitions 12 | * when developing new bpf features. Unfortunately there is an 13 | * include mess, I haven't solved 100% yet. 
Thus, this trick warns 14 | * when another (presumably) older version of this bpf.h UAPI got 15 | * included (likely from distro or kernel source) 16 | */ 17 | #ifdef __LINUX_BPF_H__ 18 | # ifndef __LINUX_BPF_H__git_repo_copy 19 | # warning "This bpf.h git-repo-copy not getting used" 20 | //# else 21 | //# warning "Double include of this bpf.h (consider cleanup)" 22 | # endif 23 | #endif 24 | 25 | #ifndef __LINUX_BPF_H__ 26 | #define __LINUX_BPF_H__ 27 | 28 | #define __LINUX_BPF_H__git_repo_copy 29 | 30 | #include 31 | #include 32 | 33 | /* Extended instruction set based on top of classic BPF */ 34 | 35 | /* instruction classes */ 36 | #define BPF_ALU64 0x07 /* alu mode in double word width */ 37 | 38 | /* ld/ldx fields */ 39 | #define BPF_DW 0x18 /* double word (64-bit) */ 40 | #define BPF_XADD 0xc0 /* exclusive add */ 41 | 42 | /* alu/jmp fields */ 43 | #define BPF_MOV 0xb0 /* mov reg to reg */ 44 | #define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ 45 | 46 | /* change endianness of a register */ 47 | #define BPF_END 0xd0 /* flags for endianness conversion: */ 48 | #define BPF_TO_LE 0x00 /* convert to little-endian */ 49 | #define BPF_TO_BE 0x08 /* convert to big-endian */ 50 | #define BPF_FROM_LE BPF_TO_LE 51 | #define BPF_FROM_BE BPF_TO_BE 52 | 53 | /* jmp encodings */ 54 | #define BPF_JNE 0x50 /* jump != */ 55 | #define BPF_JLT 0xa0 /* LT is unsigned, '<' */ 56 | #define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ 57 | #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ 58 | #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ 59 | #define BPF_JSLT 0xc0 /* SLT is signed, '<' */ 60 | #define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ 61 | #define BPF_CALL 0x80 /* function call */ 62 | #define BPF_EXIT 0x90 /* function return */ 63 | 64 | /* Register numbers */ 65 | enum { 66 | BPF_REG_0 = 0, 67 | BPF_REG_1, 68 | BPF_REG_2, 69 | BPF_REG_3, 70 | BPF_REG_4, 71 | BPF_REG_5, 72 | BPF_REG_6, 73 | BPF_REG_7, 74 | BPF_REG_8, 75 | BPF_REG_9, 76 | 
BPF_REG_10, 77 | __MAX_BPF_REG, 78 | }; 79 | 80 | /* BPF has 10 general purpose 64-bit registers and stack frame. */ 81 | #define MAX_BPF_REG __MAX_BPF_REG 82 | 83 | struct bpf_insn { 84 | __u8 code; /* opcode */ 85 | __u8 dst_reg:4; /* dest register */ 86 | __u8 src_reg:4; /* source register */ 87 | __s16 off; /* signed offset */ 88 | __s32 imm; /* signed immediate constant */ 89 | }; 90 | 91 | /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ 92 | struct bpf_lpm_trie_key { 93 | __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ 94 | __u8 data[0]; /* Arbitrary size */ 95 | }; 96 | 97 | /* BPF syscall commands, see bpf(2) man-page for details. */ 98 | enum bpf_cmd { 99 | BPF_MAP_CREATE, 100 | BPF_MAP_LOOKUP_ELEM, 101 | BPF_MAP_UPDATE_ELEM, 102 | BPF_MAP_DELETE_ELEM, 103 | BPF_MAP_GET_NEXT_KEY, 104 | BPF_PROG_LOAD, 105 | BPF_OBJ_PIN, 106 | BPF_OBJ_GET, 107 | BPF_PROG_ATTACH, 108 | BPF_PROG_DETACH, 109 | BPF_PROG_TEST_RUN, 110 | BPF_PROG_GET_NEXT_ID, 111 | BPF_MAP_GET_NEXT_ID, 112 | BPF_PROG_GET_FD_BY_ID, 113 | BPF_MAP_GET_FD_BY_ID, 114 | BPF_OBJ_GET_INFO_BY_FD, 115 | BPF_PROG_QUERY, 116 | }; 117 | 118 | enum bpf_map_type { 119 | BPF_MAP_TYPE_UNSPEC, 120 | BPF_MAP_TYPE_HASH, 121 | BPF_MAP_TYPE_ARRAY, 122 | BPF_MAP_TYPE_PROG_ARRAY, 123 | BPF_MAP_TYPE_PERF_EVENT_ARRAY, 124 | BPF_MAP_TYPE_PERCPU_HASH, 125 | BPF_MAP_TYPE_PERCPU_ARRAY, 126 | BPF_MAP_TYPE_STACK_TRACE, 127 | BPF_MAP_TYPE_CGROUP_ARRAY, 128 | BPF_MAP_TYPE_LRU_HASH, 129 | BPF_MAP_TYPE_LRU_PERCPU_HASH, 130 | BPF_MAP_TYPE_LPM_TRIE, 131 | BPF_MAP_TYPE_ARRAY_OF_MAPS, 132 | BPF_MAP_TYPE_HASH_OF_MAPS, 133 | BPF_MAP_TYPE_DEVMAP, 134 | BPF_MAP_TYPE_SOCKMAP, 135 | BPF_MAP_TYPE_CPUMAP, 136 | }; 137 | 138 | enum bpf_prog_type { 139 | BPF_PROG_TYPE_UNSPEC, 140 | BPF_PROG_TYPE_SOCKET_FILTER, 141 | BPF_PROG_TYPE_KPROBE, 142 | BPF_PROG_TYPE_SCHED_CLS, 143 | BPF_PROG_TYPE_SCHED_ACT, 144 | BPF_PROG_TYPE_TRACEPOINT, 145 | BPF_PROG_TYPE_XDP, 146 | BPF_PROG_TYPE_PERF_EVENT, 147 | BPF_PROG_TYPE_CGROUP_SKB, 148 | 
BPF_PROG_TYPE_CGROUP_SOCK, 149 | BPF_PROG_TYPE_LWT_IN, 150 | BPF_PROG_TYPE_LWT_OUT, 151 | BPF_PROG_TYPE_LWT_XMIT, 152 | BPF_PROG_TYPE_SOCK_OPS, 153 | BPF_PROG_TYPE_SK_SKB, 154 | BPF_PROG_TYPE_CGROUP_DEVICE, 155 | }; 156 | 157 | enum bpf_attach_type { 158 | BPF_CGROUP_INET_INGRESS, 159 | BPF_CGROUP_INET_EGRESS, 160 | BPF_CGROUP_INET_SOCK_CREATE, 161 | BPF_CGROUP_SOCK_OPS, 162 | BPF_SK_SKB_STREAM_PARSER, 163 | BPF_SK_SKB_STREAM_VERDICT, 164 | BPF_CGROUP_DEVICE, 165 | __MAX_BPF_ATTACH_TYPE 166 | }; 167 | 168 | #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE 169 | 170 | /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command 171 | * 172 | * NONE(default): No further bpf programs allowed in the subtree. 173 | * 174 | * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, 175 | * the program in this cgroup yields to sub-cgroup program. 176 | * 177 | * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, 178 | * that cgroup program gets run in addition to the program in this cgroup. 179 | * 180 | * Only one program is allowed to be attached to a cgroup with 181 | * NONE or BPF_F_ALLOW_OVERRIDE flag. 182 | * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will 183 | * release old program and attach the new one. Attach flags has to match. 184 | * 185 | * Multiple programs are allowed to be attached to a cgroup with 186 | * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order 187 | * (those that were attached first, run first) 188 | * The programs of sub-cgroup are executed first, then programs of 189 | * this cgroup and then programs of parent cgroup. 190 | * When children program makes decision (like picking TCP CA or sock bind) 191 | * parent program has a chance to override it. 192 | * 193 | * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. 194 | * A cgroup with NONE doesn't allow any programs in sub-cgroups. 
195 | * Ex1: 196 | * cgrp1 (MULTI progs A, B) -> 197 | * cgrp2 (OVERRIDE prog C) -> 198 | * cgrp3 (MULTI prog D) -> 199 | * cgrp4 (OVERRIDE prog E) -> 200 | * cgrp5 (NONE prog F) 201 | * the event in cgrp5 triggers execution of F,D,A,B in that order. 202 | * if prog F is detached, the execution is E,D,A,B 203 | * if prog F and D are detached, the execution is E,A,B 204 | * if prog F, E and D are detached, the execution is C,A,B 205 | * 206 | * All eligible programs are executed regardless of return code from 207 | * earlier programs. 208 | */ 209 | #define BPF_F_ALLOW_OVERRIDE (1U << 0) 210 | #define BPF_F_ALLOW_MULTI (1U << 1) 211 | 212 | /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the 213 | * verifier will perform strict alignment checking as if the kernel 214 | * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, 215 | * and NET_IP_ALIGN defined to 2. 216 | */ 217 | #define BPF_F_STRICT_ALIGNMENT (1U << 0) 218 | 219 | /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ 220 | #define BPF_PSEUDO_MAP_FD 1 221 | 222 | /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative 223 | * offset to another bpf function 224 | */ 225 | #define BPF_PSEUDO_CALL 1 226 | 227 | /* flags for BPF_MAP_UPDATE_ELEM command */ 228 | #define BPF_ANY 0 /* create new element or update existing */ 229 | #define BPF_NOEXIST 1 /* create new element if it didn't exist */ 230 | #define BPF_EXIST 2 /* update existing element */ 231 | 232 | /* flags for BPF_MAP_CREATE command */ 233 | #define BPF_F_NO_PREALLOC (1U << 0) 234 | /* Instead of having one common LRU list in the 235 | * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list 236 | * which can scale and perform better. 237 | * Note, the LRU nodes (including free nodes) cannot be moved 238 | * across different LRU lists. 
239 | */ 240 | #define BPF_F_NO_COMMON_LRU (1U << 1) 241 | /* Specify numa node during map creation */ 242 | #define BPF_F_NUMA_NODE (1U << 2) 243 | 244 | /* flags for BPF_PROG_QUERY */ 245 | #define BPF_F_QUERY_EFFECTIVE (1U << 0) 246 | 247 | #define BPF_OBJ_NAME_LEN 16U 248 | 249 | /* Flags for accessing BPF object */ 250 | #define BPF_F_RDONLY (1U << 3) 251 | #define BPF_F_WRONLY (1U << 4) 252 | 253 | union bpf_attr { 254 | struct { /* anonymous struct used by BPF_MAP_CREATE command */ 255 | __u32 map_type; /* one of enum bpf_map_type */ 256 | __u32 key_size; /* size of key in bytes */ 257 | __u32 value_size; /* size of value in bytes */ 258 | __u32 max_entries; /* max number of entries in a map */ 259 | __u32 map_flags; /* BPF_MAP_CREATE related 260 | * flags defined above. 261 | */ 262 | __u32 inner_map_fd; /* fd pointing to the inner map */ 263 | __u32 numa_node; /* numa node (effective only if 264 | * BPF_F_NUMA_NODE is set). 265 | */ 266 | char map_name[BPF_OBJ_NAME_LEN]; 267 | __u32 map_ifindex; /* ifindex of netdev to create on */ 268 | }; 269 | 270 | struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ 271 | __u32 map_fd; 272 | __aligned_u64 key; 273 | union { 274 | __aligned_u64 value; 275 | __aligned_u64 next_key; 276 | }; 277 | __u64 flags; 278 | }; 279 | 280 | struct { /* anonymous struct used by BPF_PROG_LOAD command */ 281 | __u32 prog_type; /* one of enum bpf_prog_type */ 282 | __u32 insn_cnt; 283 | __aligned_u64 insns; 284 | __aligned_u64 license; 285 | __u32 log_level; /* verbosity level of verifier */ 286 | __u32 log_size; /* size of user buffer */ 287 | __aligned_u64 log_buf; /* user supplied buffer */ 288 | __u32 kern_version; /* checked when prog_type=kprobe */ 289 | __u32 prog_flags; 290 | char prog_name[BPF_OBJ_NAME_LEN]; 291 | __u32 prog_ifindex; /* ifindex of netdev to prep for */ 292 | }; 293 | 294 | struct { /* anonymous struct used by BPF_OBJ_* commands */ 295 | __aligned_u64 pathname; 296 | __u32 bpf_fd; 297 | __u32 
file_flags; 298 | }; 299 | 300 | struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ 301 | __u32 target_fd; /* container object to attach to */ 302 | __u32 attach_bpf_fd; /* eBPF program to attach */ 303 | __u32 attach_type; 304 | __u32 attach_flags; 305 | }; 306 | 307 | struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ 308 | __u32 prog_fd; 309 | __u32 retval; 310 | __u32 data_size_in; 311 | __u32 data_size_out; 312 | __aligned_u64 data_in; 313 | __aligned_u64 data_out; 314 | __u32 repeat; 315 | __u32 duration; 316 | } test; 317 | 318 | struct { /* anonymous struct used by BPF_*_GET_*_ID */ 319 | union { 320 | __u32 start_id; 321 | __u32 prog_id; 322 | __u32 map_id; 323 | }; 324 | __u32 next_id; 325 | __u32 open_flags; 326 | }; 327 | 328 | struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ 329 | __u32 bpf_fd; 330 | __u32 info_len; 331 | __aligned_u64 info; 332 | } info; 333 | 334 | struct { /* anonymous struct used by BPF_PROG_QUERY command */ 335 | __u32 target_fd; /* container object to query */ 336 | __u32 attach_type; 337 | __u32 query_flags; 338 | __u32 attach_flags; 339 | __aligned_u64 prog_ids; 340 | __u32 prog_cnt; 341 | } query; 342 | } __attribute__((aligned(8))); 343 | 344 | /* BPF helper function descriptions: 345 | * 346 | * void *bpf_map_lookup_elem(&map, &key) 347 | * Return: Map value or NULL 348 | * 349 | * int bpf_map_update_elem(&map, &key, &value, flags) 350 | * Return: 0 on success or negative error 351 | * 352 | * int bpf_map_delete_elem(&map, &key) 353 | * Return: 0 on success or negative error 354 | * 355 | * int bpf_probe_read(void *dst, int size, void *src) 356 | * Return: 0 on success or negative error 357 | * 358 | * u64 bpf_ktime_get_ns(void) 359 | * Return: current ktime 360 | * 361 | * int bpf_trace_printk(const char *fmt, int fmt_size, ...) 
362 | * Return: length of buffer written or negative error 363 | * 364 | * u32 bpf_prandom_u32(void) 365 | * Return: random value 366 | * 367 | * u32 bpf_raw_smp_processor_id(void) 368 | * Return: SMP processor ID 369 | * 370 | * int bpf_skb_store_bytes(skb, offset, from, len, flags) 371 | * store bytes into packet 372 | * @skb: pointer to skb 373 | * @offset: offset within packet from skb->mac_header 374 | * @from: pointer where to copy bytes from 375 | * @len: number of bytes to store into packet 376 | * @flags: bit 0 - if true, recompute skb->csum 377 | * other bits - reserved 378 | * Return: 0 on success or negative error 379 | * 380 | * int bpf_l3_csum_replace(skb, offset, from, to, flags) 381 | * recompute IP checksum 382 | * @skb: pointer to skb 383 | * @offset: offset within packet where IP checksum is located 384 | * @from: old value of header field 385 | * @to: new value of header field 386 | * @flags: bits 0-3 - size of header field 387 | * other bits - reserved 388 | * Return: 0 on success or negative error 389 | * 390 | * int bpf_l4_csum_replace(skb, offset, from, to, flags) 391 | * recompute TCP/UDP checksum 392 | * @skb: pointer to skb 393 | * @offset: offset within packet where TCP/UDP checksum is located 394 | * @from: old value of header field 395 | * @to: new value of header field 396 | * @flags: bits 0-3 - size of header field 397 | * bit 4 - is pseudo header 398 | * other bits - reserved 399 | * Return: 0 on success or negative error 400 | * 401 | * int bpf_tail_call(ctx, prog_array_map, index) 402 | * jump into another BPF program 403 | * @ctx: context pointer passed to next program 404 | * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY 405 | * @index: 32-bit index inside array that selects specific program to run 406 | * Return: 0 on success or negative error 407 | * 408 | * int bpf_clone_redirect(skb, ifindex, flags) 409 | * redirect to another netdev 410 | * @skb: pointer to skb 411 | * @ifindex: ifindex of the net 
device 412 | * @flags: bit 0 - if set, redirect to ingress instead of egress 413 | * other bits - reserved 414 | * Return: 0 on success or negative error 415 | * 416 | * u64 bpf_get_current_pid_tgid(void) 417 | * Return: current->tgid << 32 | current->pid 418 | * 419 | * u64 bpf_get_current_uid_gid(void) 420 | * Return: current_gid << 32 | current_uid 421 | * 422 | * int bpf_get_current_comm(char *buf, int size_of_buf) 423 | * stores current->comm into buf 424 | * Return: 0 on success or negative error 425 | * 426 | * u32 bpf_get_cgroup_classid(skb) 427 | * retrieve a proc's classid 428 | * @skb: pointer to skb 429 | * Return: classid if != 0 430 | * 431 | * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) 432 | * Return: 0 on success or negative error 433 | * 434 | * int bpf_skb_vlan_pop(skb) 435 | * Return: 0 on success or negative error 436 | * 437 | * int bpf_skb_get_tunnel_key(skb, key, size, flags) 438 | * int bpf_skb_set_tunnel_key(skb, key, size, flags) 439 | * retrieve or populate tunnel metadata 440 | * @skb: pointer to skb 441 | * @key: pointer to 'struct bpf_tunnel_key' 442 | * @size: size of 'struct bpf_tunnel_key' 443 | * @flags: room for future extensions 444 | * Return: 0 on success or negative error 445 | * 446 | * u64 bpf_perf_event_read(map, flags) 447 | * read perf event counter value 448 | * @map: pointer to perf_event_array map 449 | * @flags: index of event in the map or bitmask flags 450 | * Return: value of perf event counter read or error code 451 | * 452 | * int bpf_redirect(ifindex, flags) 453 | * redirect to another netdev 454 | * @ifindex: ifindex of the net device 455 | * @flags: 456 | * cls_bpf: 457 | * bit 0 - if set, redirect to ingress instead of egress 458 | * other bits - reserved 459 | * xdp_bpf: 460 | * all bits - reserved 461 | * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error 462 | * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error 463 | * int bpf_redirect_map(map, key, flags) 464 | * redirect to 
endpoint in map 465 | * @map: pointer to dev map 466 | * @key: index in map to lookup 467 | * @flags: -- 468 | * Return: XDP_REDIRECT on success or XDP_ABORT on error 469 | * 470 | * u32 bpf_get_route_realm(skb) 471 | * retrieve a dst's tclassid 472 | * @skb: pointer to skb 473 | * Return: realm if != 0 474 | * 475 | * int bpf_perf_event_output(ctx, map, flags, data, size) 476 | * output perf raw sample 477 | * @ctx: struct pt_regs* 478 | * @map: pointer to perf_event_array map 479 | * @flags: index of event in the map or bitmask flags 480 | * @data: data on stack to be output as raw data 481 | * @size: size of data 482 | * Return: 0 on success or negative error 483 | * 484 | * int bpf_get_stackid(ctx, map, flags) 485 | * walk user or kernel stack and return id 486 | * @ctx: struct pt_regs* 487 | * @map: pointer to stack_trace map 488 | * @flags: bits 0-7 - numer of stack frames to skip 489 | * bit 8 - collect user stack instead of kernel 490 | * bit 9 - compare stacks by hash only 491 | * bit 10 - if two different stacks hash into the same stackid 492 | * discard old 493 | * other bits - reserved 494 | * Return: >= 0 stackid on success or negative error 495 | * 496 | * s64 bpf_csum_diff(from, from_size, to, to_size, seed) 497 | * calculate csum diff 498 | * @from: raw from buffer 499 | * @from_size: length of from buffer 500 | * @to: raw to buffer 501 | * @to_size: length of to buffer 502 | * @seed: optional seed 503 | * Return: csum result or negative error code 504 | * 505 | * int bpf_skb_get_tunnel_opt(skb, opt, size) 506 | * retrieve tunnel options metadata 507 | * @skb: pointer to skb 508 | * @opt: pointer to raw tunnel option data 509 | * @size: size of @opt 510 | * Return: option size 511 | * 512 | * int bpf_skb_set_tunnel_opt(skb, opt, size) 513 | * populate tunnel options metadata 514 | * @skb: pointer to skb 515 | * @opt: pointer to raw tunnel option data 516 | * @size: size of @opt 517 | * Return: 0 on success or negative error 518 | * 519 | * int 
bpf_skb_change_proto(skb, proto, flags) 520 | * Change protocol of the skb. Currently supported is v4 -> v6, 521 | * v6 -> v4 transitions. The helper will also resize the skb. eBPF 522 | * program is expected to fill the new headers via skb_store_bytes 523 | * and lX_csum_replace. 524 | * @skb: pointer to skb 525 | * @proto: new skb->protocol type 526 | * @flags: reserved 527 | * Return: 0 on success or negative error 528 | * 529 | * int bpf_skb_change_type(skb, type) 530 | * Change packet type of skb. 531 | * @skb: pointer to skb 532 | * @type: new skb->pkt_type type 533 | * Return: 0 on success or negative error 534 | * 535 | * int bpf_skb_under_cgroup(skb, map, index) 536 | * Check cgroup2 membership of skb 537 | * @skb: pointer to skb 538 | * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type 539 | * @index: index of the cgroup in the bpf_map 540 | * Return: 541 | * == 0 skb failed the cgroup2 descendant test 542 | * == 1 skb succeeded the cgroup2 descendant test 543 | * < 0 error 544 | * 545 | * u32 bpf_get_hash_recalc(skb) 546 | * Retrieve and possibly recalculate skb->hash. 
547 | * @skb: pointer to skb 548 | * Return: hash 549 | * 550 | * u64 bpf_get_current_task(void) 551 | * Returns current task_struct 552 | * Return: current 553 | * 554 | * int bpf_probe_write_user(void *dst, void *src, int len) 555 | * safely attempt to write to a location 556 | * @dst: destination address in userspace 557 | * @src: source address on stack 558 | * @len: number of bytes to copy 559 | * Return: 0 on success or negative error 560 | * 561 | * int bpf_current_task_under_cgroup(map, index) 562 | * Check cgroup2 membership of current task 563 | * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type 564 | * @index: index of the cgroup in the bpf_map 565 | * Return: 566 | * == 0 current failed the cgroup2 descendant test 567 | * == 1 current succeeded the cgroup2 descendant test 568 | * < 0 error 569 | * 570 | * int bpf_skb_change_tail(skb, len, flags) 571 | * The helper will resize the skb to the given new size, to be used f.e. 572 | * with control messages. 573 | * @skb: pointer to skb 574 | * @len: new skb length 575 | * @flags: reserved 576 | * Return: 0 on success or negative error 577 | * 578 | * int bpf_skb_pull_data(skb, len) 579 | * The helper will pull in non-linear data in case the skb is non-linear 580 | * and not all of len are part of the linear section. Only needed for 581 | * read/write with direct packet access. 582 | * @skb: pointer to skb 583 | * @len: len to make read/writeable 584 | * Return: 0 on success or negative error 585 | * 586 | * s64 bpf_csum_update(skb, csum) 587 | * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. 588 | * @skb: pointer to skb 589 | * @csum: csum to add 590 | * Return: csum on success or negative error 591 | * 592 | * void bpf_set_hash_invalid(skb) 593 | * Invalidate current skb->hash. 594 | * @skb: pointer to skb 595 | * 596 | * int bpf_get_numa_node_id() 597 | * Return: Id of current NUMA node. 
598 | * 599 | * int bpf_skb_change_head() 600 | * Grows headroom of skb and adjusts MAC header offset accordingly. 601 | * Will extends/reallocae as required automatically. 602 | * May change skb data pointer and will thus invalidate any check 603 | * performed for direct packet access. 604 | * @skb: pointer to skb 605 | * @len: length of header to be pushed in front 606 | * @flags: Flags (unused for now) 607 | * Return: 0 on success or negative error 608 | * 609 | * int bpf_xdp_adjust_head(xdp_md, delta) 610 | * Adjust the xdp_md.data by delta 611 | * @xdp_md: pointer to xdp_md 612 | * @delta: An positive/negative integer to be added to xdp_md.data 613 | * Return: 0 on success or negative on error 614 | * 615 | * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) 616 | * Copy a NUL terminated string from unsafe address. In case the string 617 | * length is smaller than size, the target is not padded with further NUL 618 | * bytes. In case the string length is larger than size, just count-1 619 | * bytes are copied and the last byte is set to NUL. 620 | * @dst: destination address 621 | * @size: maximum number of bytes to copy, including the trailing NUL 622 | * @unsafe_ptr: unsafe address 623 | * Return: 624 | * > 0 length of the string including the trailing NUL on success 625 | * < 0 error 626 | * 627 | * u64 bpf_get_socket_cookie(skb) 628 | * Get the cookie for the socket stored inside sk_buff. 629 | * @skb: pointer to skb 630 | * Return: 8 Bytes non-decreasing number on success or 0 if the socket 631 | * field is missing inside sk_buff 632 | * 633 | * u32 bpf_get_socket_uid(skb) 634 | * Get the owner uid of the socket stored inside sk_buff. 635 | * @skb: pointer to skb 636 | * Return: uid of the socket owner on success or overflowuid if failed. 637 | * 638 | * u32 bpf_set_hash(skb, hash) 639 | * Set full skb->hash. 
640 | * @skb: pointer to skb 641 | * @hash: hash to set 642 | * 643 | * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) 644 | * Calls setsockopt. Not all opts are available, only those with 645 | * integer optvals plus TCP_CONGESTION. 646 | * Supported levels: SOL_SOCKET and IPPROTO_TCP 647 | * @bpf_socket: pointer to bpf_socket 648 | * @level: SOL_SOCKET or IPPROTO_TCP 649 | * @optname: option name 650 | * @optval: pointer to option value 651 | * @optlen: length of optval in bytes 652 | * Return: 0 or negative error 653 | * 654 | * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) 655 | * Calls getsockopt. Not all opts are available. 656 | * Supported levels: IPPROTO_TCP 657 | * @bpf_socket: pointer to bpf_socket 658 | * @level: IPPROTO_TCP 659 | * @optname: option name 660 | * @optval: pointer to option value 661 | * @optlen: length of optval in bytes 662 | * Return: 0 or negative error 663 | * 664 | * int bpf_skb_adjust_room(skb, len_diff, mode, flags) 665 | * Grow or shrink room in sk_buff. 666 | * @skb: pointer to skb 667 | * @len_diff: (signed) amount of room to grow/shrink 668 | * @mode: operation mode (enum bpf_adj_room_mode) 669 | * @flags: reserved for future use 670 | * Return: 0 on success or negative error code 671 | * 672 | * int bpf_sk_redirect_map(map, key, flags) 673 | * Redirect skb to a sock in map using key as a lookup key for the 674 | * sock in map. 
675 | * @map: pointer to sockmap 676 | * @key: key to lookup sock in map 677 | * @flags: reserved for future use 678 | * Return: SK_PASS 679 | * 680 | * int bpf_sock_map_update(skops, map, key, flags) 681 | * @skops: pointer to bpf_sock_ops 682 | * @map: pointer to sockmap to update 683 | * @key: key to insert/update sock in map 684 | * @flags: same flags as map update elem 685 | * 686 | * int bpf_xdp_adjust_meta(xdp_md, delta) 687 | * Adjust the xdp_md.data_meta by delta 688 | * @xdp_md: pointer to xdp_md 689 | * @delta: An positive/negative integer to be added to xdp_md.data_meta 690 | * Return: 0 on success or negative on error 691 | * 692 | * int bpf_perf_event_read_value(map, flags, buf, buf_size) 693 | * read perf event counter value and perf event enabled/running time 694 | * @map: pointer to perf_event_array map 695 | * @flags: index of event in the map or bitmask flags 696 | * @buf: buf to fill 697 | * @buf_size: size of the buf 698 | * Return: 0 on success or negative error code 699 | * 700 | * int bpf_perf_prog_read_value(ctx, buf, buf_size) 701 | * read perf prog attached perf event counter and enabled/running time 702 | * @ctx: pointer to ctx 703 | * @buf: buf to fill 704 | * @buf_size: size of the buf 705 | * Return : 0 on success or negative error code 706 | * 707 | * int bpf_override_return(pt_regs, rc) 708 | * @pt_regs: pointer to struct pt_regs 709 | * @rc: the return value to set 710 | */ 711 | #define __BPF_FUNC_MAPPER(FN) \ 712 | FN(unspec), \ 713 | FN(map_lookup_elem), \ 714 | FN(map_update_elem), \ 715 | FN(map_delete_elem), \ 716 | FN(probe_read), \ 717 | FN(ktime_get_ns), \ 718 | FN(trace_printk), \ 719 | FN(get_prandom_u32), \ 720 | FN(get_smp_processor_id), \ 721 | FN(skb_store_bytes), \ 722 | FN(l3_csum_replace), \ 723 | FN(l4_csum_replace), \ 724 | FN(tail_call), \ 725 | FN(clone_redirect), \ 726 | FN(get_current_pid_tgid), \ 727 | FN(get_current_uid_gid), \ 728 | FN(get_current_comm), \ 729 | FN(get_cgroup_classid), \ 730 | 
FN(skb_vlan_push), \ 731 | FN(skb_vlan_pop), \ 732 | FN(skb_get_tunnel_key), \ 733 | FN(skb_set_tunnel_key), \ 734 | FN(perf_event_read), \ 735 | FN(redirect), \ 736 | FN(get_route_realm), \ 737 | FN(perf_event_output), \ 738 | FN(skb_load_bytes), \ 739 | FN(get_stackid), \ 740 | FN(csum_diff), \ 741 | FN(skb_get_tunnel_opt), \ 742 | FN(skb_set_tunnel_opt), \ 743 | FN(skb_change_proto), \ 744 | FN(skb_change_type), \ 745 | FN(skb_under_cgroup), \ 746 | FN(get_hash_recalc), \ 747 | FN(get_current_task), \ 748 | FN(probe_write_user), \ 749 | FN(current_task_under_cgroup), \ 750 | FN(skb_change_tail), \ 751 | FN(skb_pull_data), \ 752 | FN(csum_update), \ 753 | FN(set_hash_invalid), \ 754 | FN(get_numa_node_id), \ 755 | FN(skb_change_head), \ 756 | FN(xdp_adjust_head), \ 757 | FN(probe_read_str), \ 758 | FN(get_socket_cookie), \ 759 | FN(get_socket_uid), \ 760 | FN(set_hash), \ 761 | FN(setsockopt), \ 762 | FN(skb_adjust_room), \ 763 | FN(redirect_map), \ 764 | FN(sk_redirect_map), \ 765 | FN(sock_map_update), \ 766 | FN(xdp_adjust_meta), \ 767 | FN(perf_event_read_value), \ 768 | FN(perf_prog_read_value), \ 769 | FN(getsockopt), \ 770 | FN(override_return), 771 | 772 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper 773 | * function eBPF program intends to call 774 | */ 775 | #define __BPF_ENUM_FN(x) BPF_FUNC_ ## x 776 | enum bpf_func_id { 777 | __BPF_FUNC_MAPPER(__BPF_ENUM_FN) 778 | __BPF_FUNC_MAX_ID, 779 | }; 780 | #undef __BPF_ENUM_FN 781 | 782 | /* All flags used by eBPF helper functions, placed here. */ 783 | 784 | /* BPF_FUNC_skb_store_bytes flags. */ 785 | #define BPF_F_RECOMPUTE_CSUM (1ULL << 0) 786 | #define BPF_F_INVALIDATE_HASH (1ULL << 1) 787 | 788 | /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. 789 | * First 4 bits are for passing the header field size. 790 | */ 791 | #define BPF_F_HDR_FIELD_MASK 0xfULL 792 | 793 | /* BPF_FUNC_l4_csum_replace flags. 
*/ 794 | #define BPF_F_PSEUDO_HDR (1ULL << 4) 795 | #define BPF_F_MARK_MANGLED_0 (1ULL << 5) 796 | #define BPF_F_MARK_ENFORCE (1ULL << 6) 797 | 798 | /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ 799 | #define BPF_F_INGRESS (1ULL << 0) 800 | 801 | /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ 802 | #define BPF_F_TUNINFO_IPV6 (1ULL << 0) 803 | 804 | /* BPF_FUNC_get_stackid flags. */ 805 | #define BPF_F_SKIP_FIELD_MASK 0xffULL 806 | #define BPF_F_USER_STACK (1ULL << 8) 807 | #define BPF_F_FAST_STACK_CMP (1ULL << 9) 808 | #define BPF_F_REUSE_STACKID (1ULL << 10) 809 | 810 | /* BPF_FUNC_skb_set_tunnel_key flags. */ 811 | #define BPF_F_ZERO_CSUM_TX (1ULL << 1) 812 | #define BPF_F_DONT_FRAGMENT (1ULL << 2) 813 | 814 | /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and 815 | * BPF_FUNC_perf_event_read_value flags. 816 | */ 817 | #define BPF_F_INDEX_MASK 0xffffffffULL 818 | #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK 819 | /* BPF_FUNC_perf_event_output for sk_buff input context. */ 820 | #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) 821 | 822 | /* Mode for BPF_FUNC_skb_adjust_room helper. */ 823 | enum bpf_adj_room_mode { 824 | BPF_ADJ_ROOM_NET, 825 | }; 826 | 827 | /* user accessible mirror of in-kernel sk_buff. 828 | * new fields can only be added to the end of this structure 829 | */ 830 | struct __sk_buff { 831 | __u32 len; 832 | __u32 pkt_type; 833 | __u32 mark; 834 | __u32 queue_mapping; 835 | __u32 protocol; 836 | __u32 vlan_present; 837 | __u32 vlan_tci; 838 | __u32 vlan_proto; 839 | __u32 priority; 840 | __u32 ingress_ifindex; 841 | __u32 ifindex; 842 | __u32 tc_index; 843 | __u32 cb[5]; 844 | __u32 hash; 845 | __u32 tc_classid; 846 | __u32 data; 847 | __u32 data_end; 848 | __u32 napi_id; 849 | 850 | /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... 
*/ 851 | __u32 family; 852 | __u32 remote_ip4; /* Stored in network byte order */ 853 | __u32 local_ip4; /* Stored in network byte order */ 854 | __u32 remote_ip6[4]; /* Stored in network byte order */ 855 | __u32 local_ip6[4]; /* Stored in network byte order */ 856 | __u32 remote_port; /* Stored in network byte order */ 857 | __u32 local_port; /* stored in host byte order */ 858 | /* ... here. */ 859 | 860 | __u32 data_meta; 861 | }; 862 | 863 | struct bpf_tunnel_key { 864 | __u32 tunnel_id; 865 | union { 866 | __u32 remote_ipv4; 867 | __u32 remote_ipv6[4]; 868 | }; 869 | __u8 tunnel_tos; 870 | __u8 tunnel_ttl; 871 | __u16 tunnel_ext; 872 | __u32 tunnel_label; 873 | }; 874 | 875 | /* Generic BPF return codes which all BPF program types may support. 876 | * The values are binary compatible with their TC_ACT_* counter-part to 877 | * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT 878 | * programs. 879 | * 880 | * XDP is handled seprately, see XDP_*. 881 | */ 882 | enum bpf_ret_code { 883 | BPF_OK = 0, 884 | /* 1 reserved */ 885 | BPF_DROP = 2, 886 | /* 3-6 reserved */ 887 | BPF_REDIRECT = 7, 888 | /* >127 are reserved for prog type specific return codes */ 889 | }; 890 | 891 | struct bpf_sock { 892 | __u32 bound_dev_if; 893 | __u32 family; 894 | __u32 type; 895 | __u32 protocol; 896 | __u32 mark; 897 | __u32 priority; 898 | }; 899 | 900 | #define XDP_PACKET_HEADROOM 256 901 | 902 | /* User return codes for XDP prog type. 903 | * A valid XDP program must return one of these defined values. All other 904 | * return codes are reserved for future use. Unknown return codes will 905 | * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). 
906 | */ 907 | enum xdp_action { 908 | XDP_ABORTED = 0, 909 | XDP_DROP, 910 | XDP_PASS, 911 | XDP_TX, 912 | XDP_REDIRECT, 913 | }; 914 | 915 | /* user accessible metadata for XDP packet hook 916 | * new fields must be added to the end of this structure 917 | */ 918 | struct xdp_md { 919 | __u32 data; 920 | __u32 data_end; 921 | __u32 data_meta; 922 | /* Below access go through struct xdp_rxq_info */ 923 | __u32 ingress_ifindex; /* rxq->dev->ifindex */ 924 | __u32 rx_queue_index; /* rxq->queue_index */ 925 | }; 926 | 927 | enum sk_action { 928 | SK_DROP = 0, 929 | SK_PASS, 930 | }; 931 | 932 | #define BPF_TAG_SIZE 8 933 | 934 | struct bpf_prog_info { 935 | __u32 type; 936 | __u32 id; 937 | __u8 tag[BPF_TAG_SIZE]; 938 | __u32 jited_prog_len; 939 | __u32 xlated_prog_len; 940 | __aligned_u64 jited_prog_insns; 941 | __aligned_u64 xlated_prog_insns; 942 | __u64 load_time; /* ns since boottime */ 943 | __u32 created_by_uid; 944 | __u32 nr_map_ids; 945 | __aligned_u64 map_ids; 946 | char name[BPF_OBJ_NAME_LEN]; 947 | __u32 ifindex; 948 | __u64 netns_dev; 949 | __u64 netns_ino; 950 | } __attribute__((aligned(8))); 951 | 952 | struct bpf_map_info { 953 | __u32 type; 954 | __u32 id; 955 | __u32 key_size; 956 | __u32 value_size; 957 | __u32 max_entries; 958 | __u32 map_flags; 959 | char name[BPF_OBJ_NAME_LEN]; 960 | __u32 ifindex; 961 | __u64 netns_dev; 962 | __u64 netns_ino; 963 | } __attribute__((aligned(8))); 964 | 965 | /* User bpf_sock_ops struct to access socket values and specify request ops 966 | * and their replies. 967 | * Some of this fields are in network (bigendian) byte order and may need 968 | * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). 
969 | * New fields can only be added at the end of this structure 970 | */ 971 | struct bpf_sock_ops { 972 | __u32 op; 973 | union { 974 | __u32 reply; 975 | __u32 replylong[4]; 976 | }; 977 | __u32 family; 978 | __u32 remote_ip4; /* Stored in network byte order */ 979 | __u32 local_ip4; /* Stored in network byte order */ 980 | __u32 remote_ip6[4]; /* Stored in network byte order */ 981 | __u32 local_ip6[4]; /* Stored in network byte order */ 982 | __u32 remote_port; /* Stored in network byte order */ 983 | __u32 local_port; /* stored in host byte order */ 984 | __u32 is_fullsock; /* Some TCP fields are only valid if 985 | * there is a full socket. If not, the 986 | * fields read as zero. 987 | */ 988 | __u32 snd_cwnd; 989 | __u32 srtt_us; /* Averaged RTT << 3 in usecs */ 990 | }; 991 | 992 | /* List of known BPF sock_ops operators. 993 | * New entries can only be added at the end 994 | */ 995 | enum { 996 | BPF_SOCK_OPS_VOID, 997 | BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or 998 | * -1 if default value should be used 999 | */ 1000 | BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized 1001 | * window (in packets) or -1 if default 1002 | * value should be used 1003 | */ 1004 | BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an 1005 | * active connection is initialized 1006 | */ 1007 | BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an 1008 | * active connection is 1009 | * established 1010 | */ 1011 | BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a 1012 | * passive connection is 1013 | * established 1014 | */ 1015 | BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control 1016 | * needs ECN 1017 | */ 1018 | BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is 1019 | * based on the path and may be 1020 | * dependent on the congestion control 1021 | * algorithm. In general it indicates 1022 | * a congestion threshold. 
RTTs above 1023 | * this indicate congestion 1024 | */ 1025 | }; 1026 | 1027 | #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ 1028 | #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ 1029 | 1030 | struct bpf_perf_event_value { 1031 | __u64 counter; 1032 | __u64 enabled; 1033 | __u64 running; 1034 | }; 1035 | 1036 | #define BPF_DEVCG_ACC_MKNOD (1ULL << 0) 1037 | #define BPF_DEVCG_ACC_READ (1ULL << 1) 1038 | #define BPF_DEVCG_ACC_WRITE (1ULL << 2) 1039 | 1040 | #define BPF_DEVCG_DEV_BLOCK (1ULL << 0) 1041 | #define BPF_DEVCG_DEV_CHAR (1ULL << 1) 1042 | 1043 | struct bpf_cgroup_dev_ctx { 1044 | /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ 1045 | __u32 access_type; 1046 | __u32 major; 1047 | __u32 minor; 1048 | }; 1049 | 1050 | #endif /* __LINUX_BPF_H__ */ 1051 | -------------------------------------------------------------------------------- /tools/include/uapi/linux/bpf_common.h: -------------------------------------------------------------------------------- 1 | /* Copy of kernel tools/include/uapi/linux/bpf_common.h 2 | */ 3 | #ifndef _UAPI__LINUX_BPF_COMMON_H__ 4 | #define _UAPI__LINUX_BPF_COMMON_H__ 5 | 6 | /* Instruction classes */ 7 | #define BPF_CLASS(code) ((code) & 0x07) 8 | #define BPF_LD 0x00 9 | #define BPF_LDX 0x01 10 | #define BPF_ST 0x02 11 | #define BPF_STX 0x03 12 | #define BPF_ALU 0x04 13 | #define BPF_JMP 0x05 14 | #define BPF_RET 0x06 15 | #define BPF_MISC 0x07 16 | 17 | /* ld/ldx fields */ 18 | #define BPF_SIZE(code) ((code) & 0x18) 19 | #define BPF_W 0x00 20 | #define BPF_H 0x08 21 | #define BPF_B 0x10 22 | #define BPF_MODE(code) ((code) & 0xe0) 23 | #define BPF_IMM 0x00 24 | #define BPF_ABS 0x20 25 | #define BPF_IND 0x40 26 | #define BPF_MEM 0x60 27 | #define BPF_LEN 0x80 28 | #define BPF_MSH 0xa0 29 | 30 | /* alu/jmp fields */ 31 | #define BPF_OP(code) ((code) & 0xf0) 32 | #define BPF_ADD 0x00 33 | #define BPF_SUB 0x10 34 | #define BPF_MUL 0x20 35 | #define BPF_DIV 0x30 36 | #define BPF_OR 
0x40 37 | #define BPF_AND 0x50 38 | #define BPF_LSH 0x60 39 | #define BPF_RSH 0x70 40 | #define BPF_NEG 0x80 41 | #define BPF_MOD 0x90 42 | #define BPF_XOR 0xa0 43 | 44 | #define BPF_JA 0x00 45 | #define BPF_JEQ 0x10 46 | #define BPF_JGT 0x20 47 | #define BPF_JGE 0x30 48 | #define BPF_JSET 0x40 49 | #define BPF_SRC(code) ((code) & 0x08) 50 | #define BPF_K 0x00 51 | #define BPF_X 0x08 52 | 53 | #ifndef BPF_MAXINSNS 54 | #define BPF_MAXINSNS 4096 55 | #endif 56 | 57 | #endif /* _UAPI__LINUX_BPF_COMMON_H__ */ 58 | -------------------------------------------------------------------------------- /tools/lib/bpf/bpf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * common eBPF ELF operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "bpf.h" 28 | 29 | /* 30 | * When building perf, unistd.h is overridden. __NR_bpf is 31 | * required to be defined explicitly. 
32 | */ 33 | #ifndef __NR_bpf 34 | # if defined(__i386__) 35 | # define __NR_bpf 357 36 | # elif defined(__x86_64__) 37 | # define __NR_bpf 321 38 | # elif defined(__aarch64__) 39 | # define __NR_bpf 280 40 | # elif defined(__sparc__) 41 | # define __NR_bpf 349 42 | # elif defined(__s390__) 43 | # define __NR_bpf 351 44 | # else 45 | # error __NR_bpf not defined. libbpf does not support your arch. 46 | # endif 47 | #endif 48 | 49 | #define min(x, y) ((x) < (y) ? (x) : (y)) 50 | 51 | static inline __u64 ptr_to_u64(const void *ptr) 52 | { 53 | return (__u64) (unsigned long) ptr; 54 | } 55 | 56 | static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, 57 | unsigned int size) 58 | { 59 | return syscall(__NR_bpf, cmd, attr, size); 60 | } 61 | 62 | int bpf_create_map_node(enum bpf_map_type map_type, const char *name, 63 | int key_size, int value_size, int max_entries, 64 | __u32 map_flags, int node) 65 | { 66 | __u32 name_len = name ? strlen(name) : 0; 67 | union bpf_attr attr; 68 | 69 | memset(&attr, '\0', sizeof(attr)); 70 | 71 | attr.map_type = map_type; 72 | attr.key_size = key_size; 73 | attr.value_size = value_size; 74 | attr.max_entries = max_entries; 75 | attr.map_flags = map_flags; 76 | memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); 77 | 78 | if (node >= 0) { 79 | attr.map_flags |= BPF_F_NUMA_NODE; 80 | attr.numa_node = node; 81 | } 82 | 83 | return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 84 | } 85 | 86 | int bpf_create_map(enum bpf_map_type map_type, int key_size, 87 | int value_size, int max_entries, __u32 map_flags) 88 | { 89 | return bpf_create_map_node(map_type, NULL, key_size, value_size, 90 | max_entries, map_flags, -1); 91 | } 92 | 93 | int bpf_create_map_name(enum bpf_map_type map_type, const char *name, 94 | int key_size, int value_size, int max_entries, 95 | __u32 map_flags) 96 | { 97 | return bpf_create_map_node(map_type, name, key_size, value_size, 98 | max_entries, map_flags, -1); 99 | } 100 | 101 | int 
bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, 102 | int key_size, int inner_map_fd, int max_entries, 103 | __u32 map_flags, int node) 104 | { 105 | __u32 name_len = name ? strlen(name) : 0; 106 | union bpf_attr attr; 107 | 108 | memset(&attr, '\0', sizeof(attr)); 109 | 110 | attr.map_type = map_type; 111 | attr.key_size = key_size; 112 | attr.value_size = 4; 113 | attr.inner_map_fd = inner_map_fd; 114 | attr.max_entries = max_entries; 115 | attr.map_flags = map_flags; 116 | memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); 117 | 118 | if (node >= 0) { 119 | attr.map_flags |= BPF_F_NUMA_NODE; 120 | attr.numa_node = node; 121 | } 122 | 123 | return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 124 | } 125 | 126 | int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, 127 | int key_size, int inner_map_fd, int max_entries, 128 | __u32 map_flags) 129 | { 130 | return bpf_create_map_in_map_node(map_type, name, key_size, 131 | inner_map_fd, max_entries, map_flags, 132 | -1); 133 | } 134 | 135 | int bpf_load_program_name(enum bpf_prog_type type, const char *name, 136 | const struct bpf_insn *insns, 137 | size_t insns_cnt, const char *license, 138 | __u32 kern_version, char *log_buf, 139 | size_t log_buf_sz) 140 | { 141 | int fd; 142 | union bpf_attr attr; 143 | __u32 name_len = name ? 
strlen(name) : 0; 144 | 145 | bzero(&attr, sizeof(attr)); 146 | attr.prog_type = type; 147 | attr.insn_cnt = (__u32)insns_cnt; 148 | attr.insns = ptr_to_u64(insns); 149 | attr.license = ptr_to_u64(license); 150 | attr.log_buf = ptr_to_u64(NULL); 151 | attr.log_size = 0; 152 | attr.log_level = 0; 153 | attr.kern_version = kern_version; 154 | memcpy(attr.prog_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); 155 | 156 | fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 157 | if (fd >= 0 || !log_buf || !log_buf_sz) 158 | return fd; 159 | 160 | /* Try again with log */ 161 | attr.log_buf = ptr_to_u64(log_buf); 162 | attr.log_size = log_buf_sz; 163 | attr.log_level = 1; 164 | log_buf[0] = 0; 165 | return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 166 | } 167 | 168 | int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 169 | size_t insns_cnt, const char *license, 170 | __u32 kern_version, char *log_buf, 171 | size_t log_buf_sz) 172 | { 173 | return bpf_load_program_name(type, NULL, insns, insns_cnt, license, 174 | kern_version, log_buf, log_buf_sz); 175 | } 176 | 177 | int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, 178 | size_t insns_cnt, int strict_alignment, 179 | const char *license, __u32 kern_version, 180 | char *log_buf, size_t log_buf_sz, int log_level) 181 | { 182 | union bpf_attr attr; 183 | 184 | bzero(&attr, sizeof(attr)); 185 | attr.prog_type = type; 186 | attr.insn_cnt = (__u32)insns_cnt; 187 | attr.insns = ptr_to_u64(insns); 188 | attr.license = ptr_to_u64(license); 189 | attr.log_buf = ptr_to_u64(log_buf); 190 | attr.log_size = log_buf_sz; 191 | attr.log_level = log_level; 192 | log_buf[0] = 0; 193 | attr.kern_version = kern_version; 194 | attr.prog_flags = strict_alignment ? 
BPF_F_STRICT_ALIGNMENT : 0; 195 | 196 | return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 197 | } 198 | 199 | int bpf_map_update_elem(int fd, const void *key, const void *value, 200 | __u64 flags) 201 | { 202 | union bpf_attr attr; 203 | 204 | bzero(&attr, sizeof(attr)); 205 | attr.map_fd = fd; 206 | attr.key = ptr_to_u64(key); 207 | attr.value = ptr_to_u64(value); 208 | attr.flags = flags; 209 | 210 | return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); 211 | } 212 | 213 | int bpf_map_lookup_elem(int fd, const void *key, void *value) 214 | { 215 | union bpf_attr attr; 216 | 217 | bzero(&attr, sizeof(attr)); 218 | attr.map_fd = fd; 219 | attr.key = ptr_to_u64(key); 220 | attr.value = ptr_to_u64(value); 221 | 222 | return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); 223 | } 224 | 225 | int bpf_map_delete_elem(int fd, const void *key) 226 | { 227 | union bpf_attr attr; 228 | 229 | bzero(&attr, sizeof(attr)); 230 | attr.map_fd = fd; 231 | attr.key = ptr_to_u64(key); 232 | 233 | return sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); 234 | } 235 | 236 | int bpf_map_get_next_key(int fd, const void *key, void *next_key) 237 | { 238 | union bpf_attr attr; 239 | 240 | bzero(&attr, sizeof(attr)); 241 | attr.map_fd = fd; 242 | attr.key = ptr_to_u64(key); 243 | attr.next_key = ptr_to_u64(next_key); 244 | 245 | return sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); 246 | } 247 | 248 | int bpf_obj_pin(int fd, const char *pathname) 249 | { 250 | union bpf_attr attr; 251 | 252 | bzero(&attr, sizeof(attr)); 253 | attr.pathname = ptr_to_u64((void *)pathname); 254 | attr.bpf_fd = fd; 255 | 256 | return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); 257 | } 258 | 259 | int bpf_obj_get(const char *pathname) 260 | { 261 | union bpf_attr attr; 262 | 263 | bzero(&attr, sizeof(attr)); 264 | attr.pathname = ptr_to_u64((void *)pathname); 265 | 266 | return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); 267 | } 268 | 269 | int bpf_prog_attach(int prog_fd, int target_fd, enum 
bpf_attach_type type, 270 | unsigned int flags) 271 | { 272 | union bpf_attr attr; 273 | 274 | bzero(&attr, sizeof(attr)); 275 | attr.target_fd = target_fd; 276 | attr.attach_bpf_fd = prog_fd; 277 | attr.attach_type = type; 278 | attr.attach_flags = flags; 279 | 280 | return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); 281 | } 282 | 283 | int bpf_prog_detach(int target_fd, enum bpf_attach_type type) 284 | { 285 | union bpf_attr attr; 286 | 287 | bzero(&attr, sizeof(attr)); 288 | attr.target_fd = target_fd; 289 | attr.attach_type = type; 290 | 291 | return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); 292 | } 293 | 294 | int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) 295 | { 296 | union bpf_attr attr; 297 | 298 | bzero(&attr, sizeof(attr)); 299 | attr.target_fd = target_fd; 300 | attr.attach_bpf_fd = prog_fd; 301 | attr.attach_type = type; 302 | 303 | return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); 304 | } 305 | 306 | int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, 307 | __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) 308 | { 309 | union bpf_attr attr; 310 | int ret; 311 | 312 | bzero(&attr, sizeof(attr)); 313 | attr.query.target_fd = target_fd; 314 | attr.query.attach_type = type; 315 | attr.query.query_flags = query_flags; 316 | attr.query.prog_cnt = *prog_cnt; 317 | attr.query.prog_ids = ptr_to_u64(prog_ids); 318 | 319 | ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr)); 320 | if (attach_flags) 321 | *attach_flags = attr.query.attach_flags; 322 | *prog_cnt = attr.query.prog_cnt; 323 | return ret; 324 | } 325 | 326 | int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, 327 | void *data_out, __u32 *size_out, __u32 *retval, 328 | __u32 *duration) 329 | { 330 | union bpf_attr attr; 331 | int ret; 332 | 333 | bzero(&attr, sizeof(attr)); 334 | attr.test.prog_fd = prog_fd; 335 | attr.test.data_in = ptr_to_u64(data); 336 | attr.test.data_out = ptr_to_u64(data_out); 337 | 
attr.test.data_size_in = size; 338 | attr.test.repeat = repeat; 339 | 340 | ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); 341 | if (size_out) 342 | *size_out = attr.test.data_size_out; 343 | if (retval) 344 | *retval = attr.test.retval; 345 | if (duration) 346 | *duration = attr.test.duration; 347 | return ret; 348 | } 349 | 350 | int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id) 351 | { 352 | union bpf_attr attr; 353 | int err; 354 | 355 | bzero(&attr, sizeof(attr)); 356 | attr.start_id = start_id; 357 | 358 | err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)); 359 | if (!err) 360 | *next_id = attr.next_id; 361 | 362 | return err; 363 | } 364 | 365 | int bpf_map_get_next_id(__u32 start_id, __u32 *next_id) 366 | { 367 | union bpf_attr attr; 368 | int err; 369 | 370 | bzero(&attr, sizeof(attr)); 371 | attr.start_id = start_id; 372 | 373 | err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr)); 374 | if (!err) 375 | *next_id = attr.next_id; 376 | 377 | return err; 378 | } 379 | 380 | int bpf_prog_get_fd_by_id(__u32 id) 381 | { 382 | union bpf_attr attr; 383 | 384 | bzero(&attr, sizeof(attr)); 385 | attr.prog_id = id; 386 | 387 | return sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); 388 | } 389 | 390 | int bpf_map_get_fd_by_id(__u32 id) 391 | { 392 | union bpf_attr attr; 393 | 394 | bzero(&attr, sizeof(attr)); 395 | attr.map_id = id; 396 | 397 | return sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); 398 | } 399 | 400 | int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) 401 | { 402 | union bpf_attr attr; 403 | int err; 404 | 405 | bzero(&attr, sizeof(attr)); 406 | attr.info.bpf_fd = prog_fd; 407 | attr.info.info_len = *info_len; 408 | attr.info.info = ptr_to_u64(info); 409 | 410 | err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); 411 | if (!err) 412 | *info_len = attr.info.info_len; 413 | 414 | return err; 415 | } 416 | -------------------------------------------------------------------------------- 
/tools/lib/bpf/bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * common eBPF ELF operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | #ifndef __BPF_BPF_H 22 | #define __BPF_BPF_H 23 | 24 | #include 25 | #include 26 | 27 | int bpf_create_map_node(enum bpf_map_type map_type, const char *name, 28 | int key_size, int value_size, int max_entries, 29 | __u32 map_flags, int node); 30 | int bpf_create_map_name(enum bpf_map_type map_type, const char *name, 31 | int key_size, int value_size, int max_entries, 32 | __u32 map_flags); 33 | int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, 34 | int max_entries, __u32 map_flags); 35 | int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, 36 | int key_size, int inner_map_fd, int max_entries, 37 | __u32 map_flags, int node); 38 | int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, 39 | int key_size, int inner_map_fd, int max_entries, 40 | __u32 map_flags); 41 | 42 | /* Recommend log buffer size */ 43 | #define BPF_LOG_BUF_SIZE (256 * 1024) 44 | int bpf_load_program_name(enum bpf_prog_type type, const char *name, 45 | const struct bpf_insn *insns, 46 | size_t insns_cnt, const 
char *license, 47 | __u32 kern_version, char *log_buf, 48 | size_t log_buf_sz); 49 | int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 50 | size_t insns_cnt, const char *license, 51 | __u32 kern_version, char *log_buf, 52 | size_t log_buf_sz); 53 | int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, 54 | size_t insns_cnt, int strict_alignment, 55 | const char *license, __u32 kern_version, 56 | char *log_buf, size_t log_buf_sz, int log_level); 57 | 58 | int bpf_map_update_elem(int fd, const void *key, const void *value, 59 | __u64 flags); 60 | 61 | int bpf_map_lookup_elem(int fd, const void *key, void *value); 62 | int bpf_map_delete_elem(int fd, const void *key); 63 | int bpf_map_get_next_key(int fd, const void *key, void *next_key); 64 | int bpf_obj_pin(int fd, const char *pathname); 65 | int bpf_obj_get(const char *pathname); 66 | int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, 67 | unsigned int flags); 68 | int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); 69 | int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); 70 | int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, 71 | void *data_out, __u32 *size_out, __u32 *retval, 72 | __u32 *duration); 73 | int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); 74 | int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); 75 | int bpf_prog_get_fd_by_id(__u32 id); 76 | int bpf_map_get_fd_by_id(__u32 id); 77 | int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); 78 | int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, 79 | __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); 80 | #endif 81 | -------------------------------------------------------------------------------- /tools/lib/bpf/libbpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Common eBPF ELF object loading 
operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | #ifndef __BPF_LIBBPF_H 22 | #define __BPF_LIBBPF_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include // for size_t 28 | #include 29 | 30 | enum libbpf_errno { 31 | __LIBBPF_ERRNO__START = 4000, 32 | 33 | /* Something wrong in libelf */ 34 | LIBBPF_ERRNO__LIBELF = __LIBBPF_ERRNO__START, 35 | LIBBPF_ERRNO__FORMAT, /* BPF object format invalid */ 36 | LIBBPF_ERRNO__KVERSION, /* Incorrect or no 'version' section */ 37 | LIBBPF_ERRNO__ENDIAN, /* Endian mismatch */ 38 | LIBBPF_ERRNO__INTERNAL, /* Internal error in libbpf */ 39 | LIBBPF_ERRNO__RELOC, /* Relocation failed */ 40 | LIBBPF_ERRNO__LOAD, /* Load program failure for unknown reason */ 41 | LIBBPF_ERRNO__VERIFY, /* Kernel verifier blocks program loading */ 42 | LIBBPF_ERRNO__PROG2BIG, /* Program too big */ 43 | LIBBPF_ERRNO__KVER, /* Incorrect kernel version */ 44 | LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ 45 | __LIBBPF_ERRNO__END, 46 | }; 47 | 48 | int libbpf_strerror(int err, char *buf, size_t size); 49 | 50 | /* 51 | * In include/linux/compiler-gcc.h, __printf is defined. However 52 | * it should be better if libbpf.h doesn't depend on Linux header file. 
53 | * So instead of __printf, here we use gcc attribute directly. 54 | */ 55 | typedef int (*libbpf_print_fn_t)(const char *, ...) 56 | __attribute__((format(printf, 1, 2))); 57 | 58 | void libbpf_set_print(libbpf_print_fn_t warn, 59 | libbpf_print_fn_t info, 60 | libbpf_print_fn_t debug); 61 | 62 | /* Hide internal to user */ 63 | struct bpf_object; 64 | 65 | struct bpf_object *bpf_object__open(const char *path); 66 | struct bpf_object *bpf_object__open_buffer(void *obj_buf, 67 | size_t obj_buf_sz, 68 | const char *name); 69 | int bpf_object__pin(struct bpf_object *object, const char *path); 70 | void bpf_object__close(struct bpf_object *object); 71 | 72 | /* Load/unload object into/from kernel */ 73 | int bpf_object__load(struct bpf_object *obj); 74 | int bpf_object__unload(struct bpf_object *obj); 75 | const char *bpf_object__name(struct bpf_object *obj); 76 | unsigned int bpf_object__kversion(struct bpf_object *obj); 77 | 78 | struct bpf_object *bpf_object__next(struct bpf_object *prev); 79 | #define bpf_object__for_each_safe(pos, tmp) \ 80 | for ((pos) = bpf_object__next(NULL), \ 81 | (tmp) = bpf_object__next(pos); \ 82 | (pos) != NULL; \ 83 | (pos) = (tmp), (tmp) = bpf_object__next(tmp)) 84 | 85 | typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); 86 | int bpf_object__set_priv(struct bpf_object *obj, void *priv, 87 | bpf_object_clear_priv_t clear_priv); 88 | void *bpf_object__priv(struct bpf_object *prog); 89 | 90 | /* Accessors of bpf_program. 
*/ 91 | struct bpf_program; 92 | struct bpf_program *bpf_program__next(struct bpf_program *prog, 93 | struct bpf_object *obj); 94 | 95 | #define bpf_object__for_each_program(pos, obj) \ 96 | for ((pos) = bpf_program__next(NULL, (obj)); \ 97 | (pos) != NULL; \ 98 | (pos) = bpf_program__next((pos), (obj))) 99 | 100 | typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, 101 | void *); 102 | 103 | int bpf_program__set_priv(struct bpf_program *prog, void *priv, 104 | bpf_program_clear_priv_t clear_priv); 105 | 106 | void *bpf_program__priv(struct bpf_program *prog); 107 | 108 | const char *bpf_program__title(struct bpf_program *prog, bool needs_copy); 109 | 110 | int bpf_program__fd(struct bpf_program *prog); 111 | int bpf_program__pin_instance(struct bpf_program *prog, const char *path, 112 | int instance); 113 | int bpf_program__pin(struct bpf_program *prog, const char *path); 114 | 115 | struct bpf_insn; 116 | 117 | /* 118 | * Libbpf allows callers to adjust BPF programs before being loaded 119 | * into kernel. One program in an object file can be transformed into 120 | * multiple variants to be attached to different code. 121 | * 122 | * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd 123 | * are APIs for this purpose. 124 | * 125 | * - bpf_program_prep_t: 126 | * It defines 'preprocessor', which is a caller-defined function 127 | * passed to libbpf through bpf_program__set_prep(), and will be 128 | * called before the program is loaded. The preprocessor should adjust 129 | * the program once for each instance according to the number 130 | * passed to it. 131 | * 132 | * - bpf_program__set_prep: 133 | * Attaches a preprocessor to a BPF program. The number of instances 134 | * that should be created is also passed through this function. 135 | * 136 | * - bpf_program__nth_fd: 137 | * After the program is loaded, get the resulting fds from the bpf program for 138 | * each instance.
139 | * 140 | * If bpf_program__set_prep() is not used, the program would be loaded 141 | * without adjustment during bpf_object__load(). The program has only 142 | * one instance. In this case bpf_program__fd(prog) is equal to 143 | * bpf_program__nth_fd(prog, 0). 144 | */ 145 | 146 | struct bpf_prog_prep_result { 147 | /* 148 | * If not NULL, load new instruction array. 149 | * If set to NULL, don't load this instance. 150 | */ 151 | struct bpf_insn *new_insn_ptr; 152 | int new_insn_cnt; 153 | 154 | /* If not NULL, the resulting fd is stored to it */ 155 | int *pfd; 156 | }; 157 | 158 | /* 159 | * Parameters of bpf_program_prep_t: 160 | * - prog: The bpf_program being loaded. 161 | * - n: Index of instance being generated. 162 | * - insns: BPF instructions array. 163 | * - insns_cnt: Number of instructions in insns. 164 | * - res: Output parameter, result of transformation. 165 | * 166 | * Return value: 167 | * - Zero: pre-processing succeeded. 168 | * - Non-zero: pre-processing failed, stop loading. 169 | */ 170 | typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, 171 | struct bpf_insn *insns, int insns_cnt, 172 | struct bpf_prog_prep_result *res); 173 | 174 | int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, 175 | bpf_program_prep_t prep); 176 | 177 | int bpf_program__nth_fd(struct bpf_program *prog, int n); 178 | 179 | /* 180 | * Adjust type of bpf program. Default is kprobe.
181 | */ 182 | int bpf_program__set_socket_filter(struct bpf_program *prog); 183 | int bpf_program__set_tracepoint(struct bpf_program *prog); 184 | int bpf_program__set_kprobe(struct bpf_program *prog); 185 | int bpf_program__set_sched_cls(struct bpf_program *prog); 186 | int bpf_program__set_sched_act(struct bpf_program *prog); 187 | int bpf_program__set_xdp(struct bpf_program *prog); 188 | int bpf_program__set_perf_event(struct bpf_program *prog); 189 | void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type); 190 | 191 | bool bpf_program__is_socket_filter(struct bpf_program *prog); 192 | bool bpf_program__is_tracepoint(struct bpf_program *prog); 193 | bool bpf_program__is_kprobe(struct bpf_program *prog); 194 | bool bpf_program__is_sched_cls(struct bpf_program *prog); 195 | bool bpf_program__is_sched_act(struct bpf_program *prog); 196 | bool bpf_program__is_xdp(struct bpf_program *prog); 197 | bool bpf_program__is_perf_event(struct bpf_program *prog); 198 | 199 | /* 200 | * We don't need __attribute__((packed)) now since it is 201 | * unnecessary for 'bpf_map_def' because they are all aligned. 202 | * In addition, using it will trigger -Wpacked warning message, 203 | * and will be treated as an error due to -Werror. 204 | */ 205 | struct bpf_map_def { 206 | unsigned int type; 207 | unsigned int key_size; 208 | unsigned int value_size; 209 | unsigned int max_entries; 210 | }; 211 | 212 | /* 213 | * There is another 'struct bpf_map' in include/linux/map.h. However, 214 | * it is not a uapi header so no need to consider name clash. 215 | */ 216 | struct bpf_map; 217 | struct bpf_map * 218 | bpf_object__find_map_by_name(struct bpf_object *obj, const char *name); 219 | 220 | /* 221 | * Get bpf_map through the offset of corresponding struct bpf_map_def 222 | * in the bpf object file. 
223 | */ 224 | struct bpf_map * 225 | bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); 226 | 227 | struct bpf_map * 228 | bpf_map__next(struct bpf_map *map, struct bpf_object *obj); 229 | #define bpf_map__for_each(pos, obj) \ 230 | for ((pos) = bpf_map__next(NULL, (obj)); \ 231 | (pos) != NULL; \ 232 | (pos) = bpf_map__next((pos), (obj))) 233 | 234 | int bpf_map__fd(struct bpf_map *map); 235 | const struct bpf_map_def *bpf_map__def(struct bpf_map *map); 236 | const char *bpf_map__name(struct bpf_map *map); 237 | 238 | typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); 239 | int bpf_map__set_priv(struct bpf_map *map, void *priv, 240 | bpf_map_clear_priv_t clear_priv); 241 | void *bpf_map__priv(struct bpf_map *map); 242 | int bpf_map__pin(struct bpf_map *map, const char *path); 243 | 244 | long libbpf_get_error(const void *ptr); 245 | 246 | int bpf_prog_load(const char *file, enum bpf_prog_type type, 247 | struct bpf_object **pobj, int *prog_fd); 248 | #endif 249 | -------------------------------------------------------------------------------- /xlb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./xlb -i eth0 -r ;./xlb -i eth0 -v 4 | ./xlb_cmdline -i eth0 -A 10.1.2.1 -p 80 5 | ./xlb_cmdline -i eth0 -a 10.1.2.1 -p 80 -r 10.0.0.24 6 | ./xlb_cmdline -i eth0 -a 10.1.2.1 -p 80 -r 10.0.0.23 7 | ./xlb_cmdline -i eth0 -a 10.1.2.1 -p 80 -r 10.0.0.22 8 | 9 | ./xlb_cmdline -i eth0 -L 10 | 11 | -------------------------------------------------------------------------------- /xlb.sh2: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./xlb -i eth0 -r ;./xlb -i eth0 -v 4 | ./xlb_cmdline -i eth0 -A 10.1.4.1 -p 80 5 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.24 6 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.23 7 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.22 8 | 9 | ./xlb_cmdline -i eth0 -A 10.1.4.2 -p 80 10 | 
./xlb_cmdline -i eth0 -a 10.1.4.2 -p 80 -r 10.0.0.22 11 | 12 | ./xlb_cmdline -i eth0 -A 10.1.4.3 -p 80 13 | ./xlb_cmdline -i eth0 -a 10.1.4.3 -p 80 -r 10.0.0.22 14 | ./xlb_cmdline -i eth0 -a 10.1.4.3 -p 80 -r 10.0.0.23 15 | 16 | ./xlb_cmdline -i eth0 -L 17 | 18 | -------------------------------------------------------------------------------- /xlb.sh3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./xlb -i eth0 -r ;./xlb -i eth0 -v 4 | ./xlb_cmdline -i eth0 -A 10.1.4.1 -p 80 5 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.22 6 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.23 7 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.24 8 | 9 | ./xlb_cmdline -i eth0 -A 10.1.4.2 -p 80 10 | ./xlb_cmdline -i eth0 -a 10.1.4.2 -p 80 -r 10.0.0.22 11 | ./xlb_cmdline -i eth0 -a 10.1.4.2 -p 80 -r 10.0.0.23 12 | ./xlb_cmdline -i eth0 -a 10.1.4.2 -p 80 -r 10.0.0.24 13 | 14 | ./xlb_cmdline -i eth0 -L 15 | 16 | echo 17 | read -p "Type enter to continue ..." choice 18 | echo 19 | 20 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.22 21 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.23 22 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.24 23 | ./xlb_cmdline -i eth0 -D 10.1.4.1 -p 80 24 | ./xlb_cmdline -i eth0 -L -v 25 | 26 | echo 27 | read -p "Type enter to continue ..." 
choice 28 | echo 29 | 30 | ./xlb_cmdline -i eth0 -d 10.1.4.2 -p 80 -r 10.0.0.23 31 | ./xlb_cmdline -i eth0 -d 10.1.4.2 -p 80 -r 10.0.0.24 32 | ./xlb_cmdline -i eth0 -D 10.1.4.2 -p 80 33 | ./xlb_cmdline -i eth0 -L -v 34 | 35 | -------------------------------------------------------------------------------- /xlb.sh4: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | createsvc(){ 4 | ./xlb -i eth0 -r ;./xlb -i eth0 5 | ./xlb_cmdline -i eth0 -A 10.1.4.1 -p 80 6 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.22 7 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.23 8 | ./xlb_cmdline -i eth0 -a 10.1.4.1 -p 80 -r 10.0.0.24 9 | 10 | 11 | ./xlb_cmdline -i eth0 -A 10.1.4.2 -p 80 12 | ./xlb_cmdline -i eth0 -a 10.1.4.2 -p 80 -r 10.0.0.24 13 | ./xlb_cmdline -i eth0 -a 10.1.4.2 -p 80 -r 10.0.0.23 14 | 15 | } 16 | 17 | deletesvc(){ 18 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.22 19 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.23 20 | ./xlb_cmdline -i eth0 -d 10.1.4.1 -p 80 -r 10.0.0.24 21 | ./xlb_cmdline -i eth0 -D 10.1.4.1 -p 80 22 | 23 | ./xlb_cmdline -i eth0 -d 10.1.4.2 -p 80 -r 10.0.0.23 24 | ./xlb_cmdline -i eth0 -d 10.1.4.2 -p 80 -r 10.0.0.24 25 | ./xlb_cmdline -i eth0 -D 10.1.4.2 -p 80 26 | } 27 | 28 | listsvc(){ 29 | ./xlb_cmdline -i eth0 -L 30 | } 31 | 32 | waitkey(){ 33 | echo 34 | #read -p "Type enter to continue ..." choice 35 | read -p "Type enter... " choice 36 | echo 37 | } 38 | 39 | createsvc 40 | 41 | waitkey 42 | 43 | listsvc 44 | 45 | waitkey 46 | 47 | deletesvc 48 | listsvc 49 | 50 | waitkey 51 | 52 | createsvc 53 | 54 | waitkey 55 | 56 | listsvc 57 | 58 | waitkey 59 | 60 | cat < in an IPv4 header and XDP_TX it out.\n" 25 | "The workers are selected by a round robin manner.\n\n"); 26 | printf("Usage: %s [...]\n", cmd); 27 | printf(" -i Interface name(eg. eth0)\n"); 28 | printf(" -A ServiceIP(a.k.a. 
VIP)\n"); 29 | printf(" -t (for TCP, optional, default)\n"); 30 | printf(" -u (for UDP, optional)\n"); 31 | printf(" -r WorkerIP\n"); 32 | printf(" -v verbose\n"); 33 | printf(" -L list lb table\n"); 34 | printf(" -l list lbcache\n"); 35 | printf(" -h Display this help\n"); 36 | } 37 | 38 | int main(int argc, char **argv) 39 | { 40 | const char *optstr = "i:A:D:a:d:r:p:SLlvhut"; 41 | int port = 0; 42 | struct iptnl_info tnl = {}; 43 | struct vip vip = {}; 44 | int opt; 45 | 46 | int fd_service, fd_linklist, fd_worker, fd_svcid; 47 | 48 | bool do_list = false; 49 | bool monitor = false; 50 | 51 | enum action action = ACTION_LIST; 52 | 53 | tnl.family = AF_UNSPEC; 54 | vip.protocol = IPPROTO_TCP; 55 | 56 | 57 | while ((opt = getopt(argc, argv, optstr)) != -1) { 58 | unsigned short family; 59 | unsigned int *v6; 60 | 61 | switch (opt) { 62 | case 'v': 63 | verbose = 1; 64 | break; 65 | case 'i': 66 | if (strlen(optarg) >= IF_NAMESIZE) { 67 | fprintf(stderr, "ERR: Intereface name too long\n"); 68 | goto error; 69 | } 70 | ifname = (char *)&ifname_buf; 71 | strncpy(ifname, optarg, IF_NAMESIZE); 72 | ifindex = if_nametoindex(ifname); 73 | if (ifindex == 0) { 74 | fprintf(stderr, 75 | "ERR: Interface name unknown err(%d):%s\n", 76 | errno, strerror(errno)); 77 | goto error; 78 | } 79 | break; 80 | case 'A': 81 | action = ACTION_ADD_SVC; 82 | vip.family = parse_ipstr(optarg, vip.daddr.v6); 83 | if (vip.family == AF_UNSPEC) 84 | return 1; 85 | break; 86 | case 'D': 87 | action = ACTION_DEL_SVC; 88 | vip.family = parse_ipstr(optarg, vip.daddr.v6); 89 | if (vip.family == AF_UNSPEC) 90 | return 1; 91 | break; 92 | case 'a': 93 | action = ACTION_ADD_REAL; 94 | vip.family = parse_ipstr(optarg, vip.daddr.v6); 95 | if (vip.family == AF_UNSPEC) 96 | return 1; 97 | break; 98 | case 'd': 99 | action = ACTION_DEL_REAL; 100 | vip.family = parse_ipstr(optarg, vip.daddr.v6); 101 | if (vip.family == AF_UNSPEC) 102 | return 1; 103 | break; 104 | case 'L': 105 | do_list = true; 106 | break; 
107 | case 'l': 108 | monitor = true; 109 | break; 110 | case 'u': 111 | vip.protocol = IPPROTO_UDP; 112 | break; 113 | case 't': 114 | vip.protocol = IPPROTO_TCP; 115 | break; 116 | case 'p': 117 | if (parse_port(optarg, &port)) 118 | return 1; 119 | break; 120 | case 'r': 121 | v6 = tnl.daddr.v6; 122 | 123 | family = parse_ipstr(optarg, v6); 124 | if (family == AF_UNSPEC) 125 | return 1; 126 | if (tnl.family == AF_UNSPEC) { 127 | tnl.family = family; 128 | } else if (tnl.family != family) { 129 | fprintf(stderr, 130 | "The IP version of the src and dst addresses used in the IP encapsulation does not match\n"); 131 | return 1; 132 | } 133 | break; 134 | case 'S': 135 | xdp_flags |= XDP_FLAGS_SKB_MODE; 136 | break; 137 | error: 138 | default: 139 | usage(argv[0]); 140 | return 1; 141 | } 142 | // opt_flags[opt] = 0; 143 | } 144 | 145 | 146 | if (ifindex == -1) { 147 | printf("ERR: required option -i missing"); 148 | usage(argv[0]); 149 | return EXIT_FAIL_OPTION; 150 | } 151 | 152 | vip.dport = htons(port); 153 | 154 | if (action == ACTION_ADD_SVC) { 155 | xlb_add_svc(&vip); 156 | } else if (action == ACTION_DEL_SVC) { 157 | xlb_del_svc(&vip); 158 | } else if (action == ACTION_ADD_REAL) { 159 | xlb_add_real(&vip, &tnl); 160 | } else if (action == ACTION_DEL_REAL) { 161 | xlb_del_real(&vip, &tnl); 162 | } 163 | 164 | if (DEBUG||verbose||do_list) { 165 | list_all(); 166 | } 167 | 168 | if (verbose) { 169 | service_list_all(); 170 | linklist_list_all(); 171 | worker_list_all(); 172 | svcid_list_all(); 173 | } 174 | 175 | if (monitor) { 176 | list_lbcache(); 177 | } 178 | 179 | return 0; 180 | } 181 | -------------------------------------------------------------------------------- /xlb_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016 Facebook 3 | * Copyright (c) 2018 Cluster Computing Inc. 
4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of version 2 of the GNU General Public 7 | * License as published by the Free Software Foundation. 8 | */ 9 | 10 | #ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H 11 | #define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H 12 | 13 | #include 14 | 15 | #define EXIT_OK 0 16 | #define EXIT_FAIL 1 17 | #define EXIT_FAIL_OPTION 2 18 | #define EXIT_FAIL_XDP 3 19 | #define EXIT_FAIL_MAP 20 20 | #define EXIT_FAIL_MAP_KEY 21 21 | #define EXIT_FAIL_MAP_FILE 22 22 | #define EXIT_FAIL_MAP_FS 23 23 | #define EXIT_FAIL_IP 30 24 | #define EXIT_FAIL_PORT 31 25 | #define EXIT_FAIL_BPF 40 26 | #define EXIT_FAIL_BPF_ELF 41 27 | #define EXIT_FAIL_BPF_RELOCATE 42 28 | 29 | #define MAX_IPTNL_ENTRIES 256U 30 | #define MAX_SVC_ENTRIES 256U 31 | 32 | //#define ACTION_ADD (1<<0) 33 | //#define ACTION_DEL (1<<1) 34 | 35 | enum action { 36 | ACTION_LIST, 37 | ACTION_ADD_SVC, 38 | ACTION_DEL_SVC, 39 | ACTION_ADD_REAL, 40 | ACTION_DEL_REAL 41 | }; 42 | 43 | static int verbose = 0; 44 | 45 | //#define DEBUG true 46 | #ifndef DEBUG 47 | #define DEBUG false 48 | #endif 49 | 50 | static const char *file_service = "/sys/fs/bpf/service"; 51 | static const char *file_linklist = "/sys/fs/bpf/linklist"; 52 | static const char *file_worker = "/sys/fs/bpf/worker"; 53 | static const char *file_svcid = "/sys/fs/bpf/svcid"; 54 | static const char *file_lbcache = "/sys/fs/bpf/lbcache"; 55 | 56 | struct vip { 57 | union { 58 | __u32 v6[4]; 59 | __u32 v4; 60 | } daddr; 61 | __u16 dport; 62 | __u16 family; 63 | __u8 protocol; 64 | }; 65 | 66 | struct iptnl_info { 67 | union { 68 | __u32 v6[4]; 69 | __u32 v4; 70 | } saddr; 71 | union { 72 | __u32 v6[4]; 73 | __u32 v4; 74 | } daddr; 75 | __u16 family; 76 | // __u8 dmac[6]; 77 | char dmac[6]; 78 | // struct ether_addr* dmac; 79 | }; 80 | 81 | struct sip { 82 | union { 83 | __u32 v6[4]; 84 | __u32 v4; 85 | } saddr; 86 | __u16 sport; 87 | __u16 family; 88 | __u8 protocol; 89 | }; 
90 | 91 | struct flow { 92 | struct vip vip; 93 | struct sip sip; 94 | }; 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /xlb_kern.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016 Facebook 3 | * Copyright (c) 2018 Cluster Computing Inc. 4 | * 5 | * 6 | * This program is free software; you can redistribute it and/or 7 | * modify it under the terms of version 2 of the GNU General Public 8 | * License as published by the Free Software Foundation. 9 | * 10 | */ 11 | #define KBUILD_MODNAME "foo" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "bpf_helpers.h" 20 | #include "xlb_common.h" 21 | 22 | struct bpf_map_def SEC("maps") service = { 23 | .type = BPF_MAP_TYPE_HASH, 24 | .key_size = sizeof(struct vip), 25 | .value_size = sizeof(__u64), 26 | .max_entries = MAX_IPTNL_ENTRIES, 27 | }; 28 | 29 | struct bpf_map_def SEC("maps") linklist = { 30 | .type = BPF_MAP_TYPE_HASH, 31 | .key_size = sizeof(__u64), 32 | .value_size = sizeof(__u64), 33 | .max_entries = MAX_IPTNL_ENTRIES, 34 | }; 35 | 36 | struct bpf_map_def SEC("maps") worker = { 37 | .type = BPF_MAP_TYPE_HASH, 38 | .key_size = sizeof(__u64), 39 | .value_size = sizeof(struct iptnl_info), 40 | .max_entries = 65536, 41 | }; 42 | 43 | struct bpf_map_def SEC("maps") lbcache = { 44 | .type = BPF_MAP_TYPE_LRU_HASH, 45 | // .type = BPF_MAP_TYPE_HASH, 46 | .key_size = sizeof(struct flow), 47 | .value_size = sizeof(__u64), 48 | .max_entries = 200, 49 | // .max_entries = 65536, 50 | }; 51 | 52 | struct bpf_map_def SEC("maps") svcid = { 53 | .type = BPF_MAP_TYPE_HASH, 54 | .key_size = sizeof(__u16), 55 | .value_size = sizeof(struct vip), 56 | .max_entries = 256, 57 | }; 58 | 59 | static __always_inline int get_dport(void *trans_data, void *data_end, 60 | u8 protocol) 61 | { 62 | struct tcphdr *th; 63 | struct udphdr *uh; 64 | 65 | switch 
/* NOTE(review): the opening of get_dport() lies before this chunk; the
 * signature below is reconstructed to mirror get_sport() exactly (the
 * visible tail returns th->dest / uh->dest) — confirm against the full
 * file. */
static __always_inline int get_dport(void *trans_data, void *data_end,
				     u8 protocol)
{
	struct tcphdr *th;
	struct udphdr *uh;

	switch (protocol) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)trans_data;
		if (th + 1 > data_end)	/* bounds check required by BPF verifier */
			return -1;
		return th->dest;	/* network byte order */
	case IPPROTO_UDP:
		uh = (struct udphdr *)trans_data;
		if (uh + 1 > data_end)
			return -1;
		return uh->dest;	/* network byte order */
	default:
		return 0;
	}
}

/* Extract the L4 source port from the transport header at @trans_data.
 * Returns the port in network byte order, -1 if the header would run past
 * @data_end, or 0 for protocols other than TCP/UDP. */
static __always_inline int get_sport(void *trans_data, void *data_end,
				     u8 protocol)
{
	struct tcphdr *th;
	struct udphdr *uh;

	switch (protocol) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)trans_data;
		if (th + 1 > data_end)	/* bounds check required by BPF verifier */
			return -1;
		return th->source;
	case IPPROTO_UDP:
		uh = (struct udphdr *)trans_data;
		if (uh + 1 > data_end)
			return -1;
		return uh->source;
	default:
		return 0;
	}
}

/* Fill in the outer Ethernet header for the encapsulated frame: source MAC
 * is taken from the original frame's destination (our own MAC), destination
 * MAC from the tunnel's next hop. */
static __always_inline void set_ethhdr(struct ethhdr *new_eth,
				       const struct ethhdr *old_eth,
				       const struct iptnl_info *tnl,
				       __be16 h_proto)
{
	memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
	memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
	new_eth->h_proto = h_proto;
}

/* Placeholder — intentionally empty in this version. */
static __always_inline void update_lbcache_v4(struct ethhdr *new_eth)
{
}

/* Load-balance one IPv4 packet: look up (vip, source) in the lbcache flow
 * table, fall back to round-robin selection via the service/linklist maps,
 * then IPIP-encapsulate the packet toward the chosen worker and XDP_TX it. */
static __always_inline int handle_ipv4(struct xdp_md *xdp)
{
	void *data_end = (void *)(long)xdp->data_end;
	void *data = (void *)(long)xdp->data;
	struct iptnl_info *tnl;
	struct ethhdr *new_eth;
	struct ethhdr *old_eth;
	struct iphdr *iph = data + sizeof(struct ethhdr);
	u16 *next_iph_u16;
	u16 payload_len;
	struct vip vip = {};
	int dport;
	u32 csum = 0;
	int i;

	if (iph + 1 > data_end)
		return XDP_DROP;

	dport = get_dport(iph + 1, data_end, iph->protocol);
	if (dport == -1)
		return XDP_DROP;

	/* Key for the service map: protocol/family/daddr/dport of the VIP. */
	vip.protocol = iph->protocol;
	vip.family = AF_INET;
	vip.daddr.v4 = iph->daddr;
	vip.dport = dport;
	payload_len = ntohs(iph->tot_len);

	struct flow flow = {};
	__u64 *wkid_p, wkid;
	__u64 *next_wkid_p, next_wkid;
	struct sip sip = {};
	int sport;

	if (iph + 1 > data_end)
		return XDP_DROP;

	sport = get_sport(iph + 1, data_end, iph->protocol);
	if (sport == -1)
		return XDP_DROP;

	sip.protocol = iph->protocol;
	sip.family = AF_INET;
	sip.saddr.v4 = iph->saddr;
	sip.sport = sport;

	/* Flow key = (vip, sip): sticky per-connection worker assignment. */
	flow.vip = vip;
	flow.sip = sip;

	wkid_p = bpf_map_lookup_elem(&lbcache, &flow);
	if (!wkid_p) {
		/* Cache miss: the service map holds the id of the NEXT
		 * worker to use for this VIP. */
		wkid_p = bpf_map_lookup_elem(&service, &vip);
		if (!wkid_p) return XDP_PASS;	/* not one of our VIPs */

		wkid = *wkid_p;
		bpf_map_update_elem(&lbcache, &flow, &wkid, BPF_ANY);

		/* Advance the round-robin cursor: linklist is a circular
		 * list of worker ids per service. */
		next_wkid_p = bpf_map_lookup_elem(&linklist, &wkid);
		if (!next_wkid_p) return XDP_PASS;
		next_wkid = *next_wkid_p;
		bpf_map_update_elem(&service, &vip, &next_wkid, BPF_ANY);
	}

	wkid = *wkid_p;
	tnl = bpf_map_lookup_elem(&worker, &wkid);
	if (!tnl || tnl->family != AF_INET)
		return XDP_PASS;

	/* Grow headroom by one IPv4 header for the IPIP encapsulation. */
	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
		return XDP_DROP;

	/* Pointers are invalidated by adjust_head; reload and re-check. */
	data = (void *)(long)xdp->data;
	data_end = (void *)(long)xdp->data_end;

	new_eth = data;
	iph = data + sizeof(*new_eth);
	old_eth = data + sizeof(*iph);

	if (new_eth + 1 > data_end ||
	    old_eth + 1 > data_end ||
	    iph + 1 > data_end)
		return XDP_DROP;

	set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));

	/* Build the outer IPv4 header. */
	iph->version = 4;
	iph->ihl = sizeof(*iph) >> 2;
	iph->frag_off = 0;
	iph->protocol = IPPROTO_IPIP;
	iph->check = 0;
	iph->tos = 0;
	iph->tot_len = htons(payload_len + sizeof(*iph));
	iph->daddr = tnl->daddr.v4;
	iph->saddr = tnl->saddr.v4;
	iph->ttl = 8;

	/* 16-bit one's-complement checksum over the 20-byte header;
	 * unrolled so the verifier sees a bounded loop. */
	next_iph_u16 = (u16 *)iph;
#pragma clang loop unroll(full)
	for (i = 0; i < sizeof(*iph) >> 1; i++)
		csum += *next_iph_u16++;

	iph->check = ~((csum & 0xffff) + (csum >> 16));

	return XDP_TX;
}

SEC("xdp_tx_iptunnel")
int _xdp_tx_iptunnel(struct xdp_md *xdp)
{
	void *data_end = (void *)(long)xdp->data_end;
	void *data = (void *)(long)xdp->data;
	struct ethhdr *eth = data;
	__u16 h_proto;

	if (eth + 1 > data_end)
		return XDP_DROP;

	h_proto = eth->h_proto;

	/* Only IPv4 is load-balanced; everything else goes to the stack. */
	if (h_proto == htons(ETH_P_IP))
		return handle_ipv4(xdp);
	else
		return XDP_PASS;
}

char _license[] SEC("license") = "GPL";

/* ---- end of xlb_kern.c. xlb_user.c followed here in the original dump;
 * its GPL header comment and #include list were garbled by extraction. ---- */
8 | * 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "bpf_load.h" 25 | #include "libbpf.h" 26 | #include "bpf_util.h" 27 | #include "xlb_common.h" 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | static char ifname_buf[IF_NAMESIZE]; 34 | static char *ifname = NULL; 35 | 36 | static int ifindex = -1; 37 | static __u32 xdp_flags = 0; 38 | 39 | #define NR_MAPS 5 40 | int maps_marked_for_export[MAX_MAPS] = { 0 }; 41 | 42 | static const char* map_idx_to_export_filename(int idx) 43 | { 44 | const char *file = NULL; 45 | 46 | /* Mapping map_fd[idx] to export filenames */ 47 | switch (idx) { 48 | case 0: 49 | file = file_service; 50 | break; 51 | case 1: 52 | file = file_linklist; 53 | break; 54 | case 2: 55 | file = file_worker; 56 | break; 57 | case 3: 58 | file = file_lbcache; 59 | break; 60 | case 4: 61 | file = file_svcid; 62 | break; 63 | default: 64 | break; 65 | } 66 | 67 | if (DEBUG) printf("FileNAME: %s \n", file); 68 | 69 | return file; 70 | } 71 | 72 | static void remove_xdp_program(int ifindex, const char *ifname, __u32 xdp_flags) 73 | { 74 | int i; 75 | fprintf(stderr, "Removing XDP program on ifindex:%d device:%s\n", 76 | ifindex, ifname); 77 | if (ifindex > -1) 78 | set_link_xdp_fd(ifindex, -1, xdp_flags); 79 | 80 | for (i = 0; i < NR_MAPS; i++) { 81 | const char *file = map_idx_to_export_filename(i); 82 | 83 | if (unlink(file) < 0) { 84 | printf("WARN: cannot rm map(%s) file:%s err(%d):%s\n", 85 | map_data[i].name, file, errno, strerror(errno)); 86 | } 87 | } 88 | } 89 | 90 | static void usage(const char *cmd) 91 | { 92 | printf("Start a XDP prog which encapsulates incoming packets\n"); 93 | printf("Usage: %s [...]\n", cmd); 94 | printf(" -i Interface Index\n"); 95 | printf(" -S use skb-mode\n"); 96 | printf(" -N enforce native mode\n"); 97 | printf(" -v verbose\n"); 98 | printf(" -h 
Display this help\n"); 99 | } 100 | 101 | #ifndef BPF_FS_MAGIC 102 | # define BPF_FS_MAGIC 0xcafe4a11 103 | #endif 104 | 105 | static int bpf_fs_check_path(const char *path) 106 | { 107 | struct statfs st_fs; 108 | char *dname, *dir; 109 | int err = 0; 110 | 111 | if (path == NULL) 112 | return -EINVAL; 113 | 114 | dname = strdup(path); 115 | if (dname == NULL) 116 | return -ENOMEM; 117 | 118 | dir = dirname(dname); 119 | if (statfs(dir, &st_fs)) { 120 | fprintf(stderr, "ERR: failed to statfs %s: (%d)%s\n", 121 | dir, errno, strerror(errno)); 122 | err = -errno; 123 | } 124 | free(dname); 125 | 126 | if (!err && st_fs.f_type != BPF_FS_MAGIC) { 127 | fprintf(stderr, 128 | "ERR: specified path %s is not on BPF FS\n\n" 129 | " You need to mount the BPF filesystem type like:\n" 130 | " mount -t bpf bpf /sys/fs/bpf/\n\n", 131 | path); 132 | err = -EINVAL; 133 | } 134 | 135 | return err; 136 | } 137 | 138 | int load_map_file(const char *file, struct bpf_map_data *map_data) 139 | { 140 | int fd; 141 | 142 | if (bpf_fs_check_path(file) < 0) { 143 | exit(EXIT_FAIL_MAP_FS); 144 | } 145 | 146 | fd = bpf_obj_get(file); 147 | if (fd > 0) { /* Great: map file already existed use it */ 148 | if (verbose) 149 | printf(" - Loaded bpf-map:%-30s from file:%s\n", 150 | map_data->name, file); 151 | return fd; 152 | } 153 | return -1; 154 | } 155 | 156 | void pre_load_maps_via_fs(struct bpf_map_data *map_data, int idx) 157 | { 158 | const char *file; 159 | int fd; 160 | 161 | file = map_idx_to_export_filename(idx); 162 | fd = load_map_file(file, map_data); 163 | 164 | if (fd > 0) { 165 | map_data->fd = fd; 166 | } else { 167 | maps_marked_for_export[idx] = 1; 168 | } 169 | } 170 | 171 | int export_map_idx(int map_idx) 172 | { 173 | const char *file; 174 | 175 | file = map_idx_to_export_filename(map_idx); 176 | 177 | if (bpf_obj_pin(map_fd[map_idx], file) != 0) { 178 | fprintf(stderr, "ERR: Cannot pin map(%s) file:%s err(%d):%s\n", 179 | map_data[map_idx].name, file, errno, 
strerror(errno)); 180 | return EXIT_FAIL_MAP; 181 | } 182 | if (verbose) 183 | printf(" - Export bpf-map:%-30s to file:%s\n", 184 | map_data[map_idx].name, file); 185 | return 0; 186 | } 187 | 188 | void export_maps(void) 189 | { 190 | int i; 191 | 192 | for (i = 0; i < NR_MAPS; i++) { 193 | if (maps_marked_for_export[i] == 1) 194 | export_map_idx(i); 195 | } 196 | } 197 | 198 | void chown_maps(uid_t owner, gid_t group) 199 | { 200 | const char *file; 201 | int i; 202 | 203 | for (i = 0; i < NR_MAPS; i++) { 204 | file = map_idx_to_export_filename(i); 205 | 206 | if (chown(file, owner, group) < 0) 207 | fprintf(stderr, 208 | "WARN: Cannot chown file:%s err(%d):%s\n", 209 | file, errno, strerror(errno)); 210 | } 211 | } 212 | 213 | int main(int argc, char **argv) 214 | { 215 | const char *optstr = "i:Shvr"; 216 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 217 | char filename[256]; 218 | int opt; 219 | 220 | uid_t owner = -1; /* -1 result in no-change of owner */ 221 | gid_t group = -1; 222 | 223 | bool rm_xdp_prog = false; 224 | 225 | while ((opt = getopt(argc, argv, optstr)) != -1) { 226 | switch (opt) { 227 | case 'v': 228 | verbose = 1; 229 | break; 230 | case 'r': 231 | rm_xdp_prog = true; 232 | break; 233 | case 'i': 234 | if (strlen(optarg) >= IF_NAMESIZE) { 235 | fprintf(stderr, "ERR: Intereface name too long\n"); 236 | goto error; 237 | } 238 | ifname = (char *)&ifname_buf; 239 | strncpy(ifname, optarg, IF_NAMESIZE); 240 | ifindex = if_nametoindex(ifname); 241 | if (ifindex == 0) { 242 | fprintf(stderr, 243 | "ERR: Interface name unknown err(%d):%s\n", 244 | errno, strerror(errno)); 245 | goto error; 246 | } 247 | break; 248 | case 'S': 249 | xdp_flags |= XDP_FLAGS_SKB_MODE; 250 | break; 251 | error: 252 | default: 253 | usage(argv[0]); 254 | return 1; 255 | } 256 | } 257 | 258 | if (ifindex == -1) { 259 | printf("ERR: required option -i missing"); 260 | usage(argv[0]); 261 | return EXIT_FAIL_OPTION; 262 | } 263 | 264 | if (rm_xdp_prog) { 265 | 
/* Parse @ipstr into @addr (an array of four 32-bit words, v6-sized).
 * Returns the detected address family: AF_INET6, AF_INET (with the unused
 * upper words zeroed), or AF_UNSPEC with a message on stderr. */
int parse_ipstr(const char *ipstr, unsigned int *addr)
{
	if (inet_pton(AF_INET6, ipstr, addr) == 1) {
		return AF_INET6;
	} else if (inet_pton(AF_INET, ipstr, addr) == 1) {
		addr[1] = addr[2] = addr[3] = 0;	/* zero unused v6 words */
		return AF_INET;
	}

	fprintf(stderr, "%s is an invalid IP\n", ipstr);
	return AF_UNSPEC;
}

/* Parse @port_str as a decimal TCP/UDP port into *@port.
 * Returns 0 on success, 1 on any invalid input (message on stderr).
 * *@port is untouched on failure. */
int parse_port(const char *port_str, int *port)
{
	char *end;
	long tmp_port;

	tmp_port = strtol(port_str, &end, 10);
	/* fix: also reject empty input and trailing garbage ("80abc");
	 * the original only range-checked the value strtol managed to
	 * extract, silently accepting malformed strings. */
	if (end == port_str || *end != '\0' ||
	    tmp_port < 1 || tmp_port > 65535) {
		fprintf(stderr, "Invalid port(s):%s\n", port_str);
		return 1;
	}

	*port = (int)tmp_port;
	return 0;
}
/* Pack a dotted-quad IPv4 string and a service id into a single __u64 map
 * key: the service id occupies the bits above 32, the address the low 32.
 * Example: conv("1.2.3.4", 5) == (5ULL << 32) | 0x01020304. */
__u64 conv(char ipadr[], __u16 svcid)
{
	__u64 num = svcid, val;
	char *tok, *ptr;
	char ip_txt[INET_ADDRSTRLEN] = {0};

	/* fix: copy at most size-1 bytes so ip_txt is always NUL-terminated;
	 * the original strncpy() could leave it unterminated and hand
	 * strtok() an unbounded buffer. */
	strncpy(ip_txt, ipadr, INET_ADDRSTRLEN - 1);

	tok = strtok(ip_txt, ".");
	while (tok != NULL) {
		/* fix: base 10 instead of 0 — inet_ntop never emits leading
		 * zeros, but base 0 would misread "010" as octal. */
		val = strtoul(tok, &ptr, 10);
		num = (num << 8) + val;
		tok = strtok(NULL, ".");
	}
	return num;
}
min : ipint; 109 | 110 | } else if (( min < ipint ) && ( ipint < max )){ 111 | 112 | key = min; 113 | bpf_map_lookup_elem(fd, &key, &next); 114 | 115 | while ( next < ipint ){ // find the key where (key < ipint < next) 116 | key = next; 117 | bpf_map_lookup_elem(fd, &key, &next); 118 | } 119 | assert(bpf_map_update_elem(fd, &key, &ipint, BPF_ANY) == 0); 120 | assert(bpf_map_update_elem(fd, &ipint, &next, BPF_ANY) == 0); 121 | } 122 | 123 | } 124 | } 125 | 126 | void lnklst_del_from_map(int fd, struct iptnl_info *vip , __u64 *head){ 127 | __u64 key = *head , next, min, max, ipint; 128 | char ip_txt[INET_ADDRSTRLEN] = {0}; 129 | 130 | int svcint = *head>>32; 131 | 132 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 133 | ipint = conv(ip_txt, svcint); 134 | 135 | if ( bpf_map_lookup_elem(fd, &ipint, &next) != 0 ){ 136 | printf("Worker does not exist!\n"); 137 | return; 138 | } 139 | 140 | if ( ipint == next ) {// last entry. Delete & update head 141 | 142 | assert(bpf_map_delete_elem(fd, &ipint) == 0 ); 143 | 144 | *head = conv("0.0.0.0", svcint); 145 | 146 | } else { 147 | bpf_map_lookup_elem(fd, &key, &next); 148 | // Find minimum 149 | if (key > next){ // if head is the last entry 150 | min = next; 151 | max = key; 152 | } else { 153 | while (key < next){ 154 | key = next; 155 | bpf_map_lookup_elem(fd, &key, &next); 156 | } 157 | min = next; 158 | max = key; 159 | } 160 | 161 | *head = min; 162 | 163 | if ( ipint == min ){ // new entry is the smallest or the largest 164 | 165 | bpf_map_lookup_elem(fd, &ipint, &next); 166 | 167 | assert(bpf_map_update_elem(fd, &max, &next, BPF_ANY) == 0 ); 168 | assert(bpf_map_delete_elem(fd, &ipint) == 0 ); 169 | 170 | *head = next; 171 | 172 | } else if ( max == ipint ){ // new entry is the smallest or the largest 173 | 174 | key = min; 175 | bpf_map_lookup_elem(fd, &key, &next); 176 | 177 | while ( next < ipint ){ // find the key where (key < ipint = next = max) 178 | key = next; 179 | 
bpf_map_lookup_elem(fd, &key, &next); 180 | } 181 | assert(bpf_map_update_elem(fd, &key, &min, BPF_ANY) == 0); 182 | assert(bpf_map_delete_elem(fd, &ipint) == 0); 183 | 184 | } else if (( min < ipint ) && ( ipint < max ) ){ 185 | 186 | key = min; 187 | bpf_map_lookup_elem(fd, &key, &next); 188 | 189 | while ( next < ipint ){ // find the key where (key < ipint = next) 190 | key = next; 191 | bpf_map_lookup_elem(fd, &key, &next); 192 | } 193 | bpf_map_lookup_elem(fd, &ipint, &next); 194 | assert(bpf_map_update_elem(fd, &key, &next, BPF_ANY) == 0); 195 | assert(bpf_map_delete_elem(fd, &ipint) == 0); 196 | } 197 | 198 | } 199 | } 200 | 201 | void svcid_list_all() 202 | { 203 | 204 | __u64 key = 0, next_key; 205 | __u64 head; 206 | 207 | int fd = open_bpf_map(file_svcid); 208 | 209 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { 210 | key = next_key; 211 | bpf_map_lookup_elem(fd, &key, &head); 212 | 213 | printf("svcid = %llu\n}\n", key); 214 | printf("head = %llu\n}\n", head); 215 | } 216 | 217 | close(fd); 218 | } 219 | 220 | void service_list_all() 221 | { 222 | 223 | struct vip key = {}, next_key; 224 | __u64 head; 225 | char ip_txt[INET_ADDRSTRLEN] = {0}; 226 | 227 | int fd = open_bpf_map(file_service); 228 | 229 | printf("Service List: \n"); 230 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { 231 | key = next_key; 232 | bpf_map_lookup_elem(fd, &key, &head); 233 | 234 | assert(inet_ntop(key.family, &key.daddr.v4, ip_txt, sizeof(ip_txt))); 235 | printf("{\nVIP: %s\n" , ip_txt); 236 | printf("%d\n", key.protocol ); 237 | printf("%d\n", ntohs(key.dport)); 238 | printf("head = %llu\n}\n", head); 239 | } 240 | printf("\n"); 241 | 242 | close(fd); 243 | } 244 | 245 | void worker_list_all() 246 | { 247 | __u64 key = 0, next_key; 248 | struct iptnl_info value; 249 | char ip_txt[INET_ADDRSTRLEN] = {0}; 250 | char mac_txt[] = "00:00:00:00:00:00"; 251 | 252 | int fd = open_bpf_map(file_worker); 253 | 254 | while (bpf_map_get_next_key(fd, &key, &next_key) 
== 0) { 255 | bpf_map_lookup_elem(fd, &next_key, &value); 256 | 257 | printf("{\nkey: %llu\n" , next_key); 258 | printf("{\nsvcid: %d\n" , next_key>>32); 259 | 260 | assert(inet_ntop(value.family, &value.saddr.v4, ip_txt, sizeof(ip_txt))); 261 | printf("src: %s\n", ip_txt ); 262 | assert(inet_ntop(value.family, &value.daddr.v4, ip_txt, sizeof(ip_txt))); 263 | printf("dst: %s\n", ip_txt ); 264 | assert(ether_ntoa_r((struct ether_addr *)value.dmac, mac_txt)); 265 | printf("mac: %s\n}\n", mac_txt ); 266 | 267 | key = next_key; 268 | } 269 | 270 | close(fd); 271 | } 272 | 273 | void linklist_list_all(){ 274 | 275 | __u64 key = 0, next_key; 276 | __u64 value; 277 | 278 | int fd = open_bpf_map(file_linklist); 279 | 280 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { 281 | key = next_key; 282 | bpf_map_lookup_elem(fd, &key, &value); 283 | printf("(key, value) = (%llu,%llu)\n" , key, value); 284 | } 285 | close(fd); 286 | } 287 | 288 | void show_worker( __u64 key){ 289 | 290 | struct iptnl_info value; 291 | char daddr_txt[INET_ADDRSTRLEN] = {0}; 292 | char saddr_txt[INET_ADDRSTRLEN] = {0}; 293 | char mac_txt[] = "00:00:00:00:00:00"; 294 | 295 | int fd = open_bpf_map(file_worker); 296 | 297 | if (bpf_map_lookup_elem(fd, &key, &value) == -1 ) return; 298 | 299 | assert(inet_ntop(value.family, &value.saddr.v4, saddr_txt, sizeof(saddr_txt))); 300 | assert(inet_ntop(value.family, &value.daddr.v4, daddr_txt, sizeof(daddr_txt))); 301 | assert(ether_ntoa_r((struct ether_addr *)value.dmac, mac_txt)); 302 | 303 | if (DEBUG) printf("key: %llu\n", key); 304 | 305 | // printf(" dst: %u\n", value.daddr.v4); 306 | printf(" src: %s, dst: %s (%s)\n", saddr_txt, daddr_txt, mac_txt ); 307 | 308 | close(fd); 309 | } 310 | 311 | void list_worker_from_head( __u64 head){ 312 | 313 | __u64 key = head; 314 | __u64 value=0; 315 | 316 | int fd = open_bpf_map(file_linklist); 317 | 318 | printf("{\n"); 319 | while (value != head){ 320 | show_worker(key); 321 | if (bpf_map_lookup_elem(fd, 
&key, &value) != 0) break; 322 | key = value; 323 | } 324 | printf("}\n"); 325 | 326 | close(fd); 327 | } 328 | 329 | void list_all() 330 | { 331 | int fd, flag=0; 332 | struct vip key = {}, next_key; 333 | __u64 head; 334 | char daddr_txt[INET_ADDRSTRLEN] = {0}; 335 | 336 | fd = open_bpf_map(file_service); 337 | 338 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { 339 | key = next_key; 340 | bpf_map_lookup_elem(fd, &key, &head); 341 | 342 | assert(inet_ntop(key.family, &key.daddr.v4, daddr_txt, sizeof(daddr_txt))); 343 | printf("service(#%d): %s:%d(%d) " , (__u16)(head>>32), daddr_txt, ntohs(key.dport), key.protocol); 344 | 345 | if (DEBUG) printf(", head = %llu ", head); 346 | 347 | list_worker_from_head(head); 348 | flag=1; 349 | } 350 | 351 | if (flag == 0){ 352 | printf("We have no service here.\n"); 353 | } 354 | 355 | close(fd); 356 | } 357 | 358 | void list_lbcache() 359 | { 360 | int fd; 361 | struct flow key = {}, next_key; 362 | __u64 wkid; 363 | 364 | char daddr_txt[INET_ADDRSTRLEN] = {0}; 365 | char saddr_txt[INET_ADDRSTRLEN] = {0}; 366 | 367 | fd = open_bpf_map(file_lbcache); 368 | int fdw = open_bpf_map(file_worker); 369 | 370 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { 371 | 372 | key = next_key; 373 | bpf_map_lookup_elem(fd, &key, &wkid); 374 | 375 | inet_ntop(key.vip.family, &key.vip.daddr.v4, daddr_txt, sizeof(daddr_txt)); 376 | inet_ntop(key.sip.family, &key.sip.saddr.v4, saddr_txt, sizeof(saddr_txt)); 377 | 378 | printf(" %s:%d -> %s:%d (%d) => " 379 | ,saddr_txt,ntohs(key.sip.sport) 380 | ,daddr_txt,ntohs(key.vip.dport) 381 | ,key.vip.protocol 382 | ); 383 | 384 | struct iptnl_info value; 385 | char mac_txt[] = "00:00:00:00:00:00"; 386 | 387 | bpf_map_lookup_elem(fdw, &wkid, &value); 388 | inet_ntop(value.family, &value.daddr.v4, daddr_txt, sizeof(daddr_txt)); 389 | assert(ether_ntoa_r((struct ether_addr *)value.dmac, mac_txt)); 390 | printf("%s (%s)\n", daddr_txt, mac_txt ); 391 | 392 | } 393 | 394 | close(fdw); 395 | 
close(fd); 396 | } 397 | 398 | void xlb_add_svc(struct vip* vip) 399 | { 400 | int i; 401 | struct vip vip_tmp; 402 | char ip_txt[INET_ADDRSTRLEN] = {0}; 403 | __u16 svcid = 0; 404 | __u64 head; 405 | 406 | // printf("vip->daddr.v4 = %u \n", &vip->daddr.v4); 407 | // assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 408 | // printf("Adding service \"%s:%d\".\n", ip_txt, ntohs(vip->dport)); 409 | 410 | int fd_service = open_bpf_map(file_service); 411 | int fd_svcid = open_bpf_map(file_svcid); 412 | 413 | // 0. Check if the service already exists. 414 | if (bpf_map_lookup_elem(fd_service, vip, &head) == 0 ){ 415 | // assert(inet_ntop((*vip).family, &(*vip).daddr.v4, ip_txt, sizeof(ip_txt))); 416 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 417 | printf("%s:%d (#%d)\n",ip_txt,ntohs(vip->dport),head>>32); 418 | return; 419 | } 420 | 421 | // 1. Assign svcid and create head(32+8 bit number). 422 | for (i = 1; i < MAX_SVC_ENTRIES ; i++){ 423 | if (bpf_map_lookup_elem(fd_svcid, &i, &vip_tmp) == -1 ){ 424 | svcid = i ; 425 | bpf_map_update_elem(fd_svcid, &i, vip, BPF_NOEXIST); 426 | break ; 427 | } 428 | } 429 | if (svcid == 0) return; 430 | 431 | // printf("Service id %d\n", svcid); 432 | 433 | head = conv("0.0.0.0", svcid); 434 | 435 | // 2. Add service to the service map. 436 | // bpf_map_update_elem(fd_service, &vip->daddr.v4, &head, BPF_NOEXIST); 437 | bpf_map_update_elem(fd_service, vip, &head, BPF_NOEXIST); 438 | 439 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 440 | printf("+%s:%d (#%d)\n",ip_txt,ntohs(vip->dport),svcid); 441 | 442 | close(fd_service); 443 | close(fd_svcid); 444 | } 445 | 446 | void xlb_del_svc(struct vip* vip) 447 | { 448 | char ip_txt[INET_ADDRSTRLEN] = {0}; 449 | __u16 svcid = 0; 450 | __u64 head; 451 | 452 | int fd_service = open_bpf_map(file_service); 453 | int fd_svcid = open_bpf_map(file_svcid); 454 | 455 | // 0. Check if the service & worker exist. 
456 | if (bpf_map_lookup_elem(fd_service, vip, &head) == -1 ){ 457 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 458 | printf("The service \"%s:%d\" does not exist!\n", ip_txt, ntohs(vip->dport)); 459 | return; 460 | } 461 | svcid = head>>32; 462 | 463 | if (head == conv("0.0.0.0", svcid)) { // If there is no worker then remove service 464 | bpf_map_delete_elem(fd_service, vip); 465 | bpf_map_delete_elem(fd_svcid, &svcid); 466 | 467 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 468 | printf("-%s:%d (#%d)\n",ip_txt,ntohs(vip->dport),svcid); 469 | 470 | } else { 471 | printf("\nWorkers still exist for service(#%d)! Delete them first.\n\n",svcid); 472 | // do_list=1; 473 | // return EXIT_FAIL; 474 | } 475 | close(fd_service); 476 | close(fd_svcid); 477 | } 478 | 479 | void xlb_add_real(struct vip* vip, struct iptnl_info* tnl) 480 | { 481 | char ip_txt[INET_ADDRSTRLEN] = {0}; 482 | struct vip vip_tmp; 483 | struct iptnl_info tnl_tmp = {}; 484 | __u16 svcid = 0; 485 | __u64 head, daddrint; 486 | 487 | 488 | in_addr_t nh_ip; 489 | int dev=0; 490 | 491 | xlb_iproute_get(&tnl->daddr.v4, &tnl->saddr.v4, &nh_ip, &dev); 492 | xlb_get_mac(&nh_ip, tnl->dmac , &dev); 493 | 494 | if (DEBUG){ 495 | char buf[256]; 496 | char mac_txt[] = "00:00:00:00:00:00"; 497 | 498 | printf("src: %s \n", inet_ntop(AF_INET, &tnl->saddr.v4, buf, 256)); 499 | assert(ether_ntoa_r((struct ether_addr *)tnl->dmac, mac_txt)); 500 | printf("nexthop: %s (%s) \n", inet_ntop(AF_INET, &nh_ip, buf, 256), mac_txt); 501 | // printf("mac: %s\n", mac_txt ); 502 | } 503 | 504 | int fd_service = open_bpf_map(file_service); 505 | int fd_linklist = open_bpf_map(file_linklist); 506 | int fd_worker = open_bpf_map(file_worker); 507 | int fd_svcid = open_bpf_map(file_svcid); 508 | 509 | // 0. Check if the service & worker exist. 
510 | if (bpf_map_lookup_elem(fd_service, vip, &head) == -1 ){ 511 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 512 | printf("The service \"%s:%d\" does not exist!\n", ip_txt, ntohs(vip->dport)); 513 | return; 514 | } 515 | svcid = head>>32; 516 | 517 | if (bpf_map_lookup_elem(fd_svcid, &svcid, &vip_tmp) == -1 ){ 518 | // No svcid in the fd_svcid map? Unlikey but just checking. 519 | return; 520 | } 521 | 522 | assert(inet_ntop(tnl->family, &tnl->daddr.v4, ip_txt, sizeof(ip_txt))); 523 | daddrint = conv(ip_txt, svcid); 524 | 525 | // 1. Check if the head is for "0.0.0.0" i.e. there's no worker yet. 526 | // If so, generate new head from worker ip. 527 | 528 | if (head == conv("0.0.0.0",svcid)) { 529 | head = daddrint; 530 | } 531 | 532 | // 2. Check if the worker already exists for the service. 533 | if (bpf_map_lookup_elem(fd_worker, &daddrint, &tnl_tmp) == 0 ){ 534 | // printf("\"%s\" already exists for service(#%d)!\n",ip_txt,svcid); 535 | printf(" %s (#%d)\n",ip_txt,svcid); 536 | return; 537 | } 538 | 539 | if (verbose) printf("head old = %llu\n", head); 540 | 541 | // 3. Insert wkrtag into the linked-list. 542 | // 4. Add worker. 543 | // 5. Update service map entry with new head. 
544 | lnklst_add_to_map(fd_linklist, tnl, &head); 545 | bpf_map_update_elem(fd_worker, &daddrint, tnl, BPF_ANY); 546 | bpf_map_update_elem(fd_service, &vip->daddr.v4, &head, BPF_ANY); 547 | 548 | // printf("+ %s added for #%d\n",ip_txt,svcid); 549 | printf("+ %s (#%d)\n",ip_txt,svcid); 550 | 551 | if (verbose) printf("head new = %llu\n", head); 552 | 553 | close(fd_service); 554 | close(fd_svcid); 555 | close(fd_linklist); 556 | close(fd_worker); 557 | } 558 | 559 | void xlb_del_real(struct vip* vip, struct iptnl_info* tnl) 560 | { 561 | char ip_txt[INET_ADDRSTRLEN] = {0}; 562 | struct iptnl_info tnl_tmp = {}; 563 | __u16 svcid = 0; 564 | __u64 head, daddrint; 565 | 566 | 567 | int fd_service = open_bpf_map(file_service); 568 | int fd_linklist = open_bpf_map(file_linklist); 569 | int fd_worker = open_bpf_map(file_worker); 570 | 571 | // 0. Check if the service & worker exist. 572 | if (bpf_map_lookup_elem(fd_service, vip, &head) == -1 ){ 573 | assert(inet_ntop(vip->family, &vip->daddr.v4, ip_txt, sizeof(ip_txt))); 574 | printf("The service \"%s:%d\" does not exist!\n", ip_txt, ntohs(vip->dport)); 575 | return; 576 | } 577 | svcid = head>>32; 578 | 579 | assert(inet_ntop(tnl->family, &tnl->daddr.v4, ip_txt, sizeof(ip_txt))); 580 | daddrint = conv(ip_txt, svcid); 581 | if (bpf_map_lookup_elem(fd_worker, &daddrint, &tnl_tmp) == -1 ){ 582 | printf("%s does not exist for service(#%d)!\n",ip_txt,svcid); 583 | return; 584 | } 585 | 586 | 587 | // 1. Delete wkrtag from the linked-list. 588 | // lnklst_del_from_map(fd_linklist, &tnl, &daddr); 589 | // 2. Delete worker. 590 | // 3. Update service map entry with new head. 
591 | 592 | lnklst_del_from_map(fd_linklist, tnl, &head); 593 | bpf_map_delete_elem(fd_worker, &daddrint); 594 | bpf_map_update_elem(fd_service, &vip->daddr.v4, &head, BPF_ANY); 595 | 596 | // printf(" %s removed from #%d\n",ip_txt,svcid); 597 | printf("- %s (#%d)\n",ip_txt,svcid); 598 | 599 | close(fd_service); 600 | close(fd_linklist); 601 | close(fd_worker); 602 | } 603 | 604 | -------------------------------------------------------------------------------- /xlb_util.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "bpf_load.h" 15 | #include "libbpf.h" 16 | #include "bpf_util.h" 17 | #include 18 | #include "xlb_common.h" 19 | 20 | int parse_ipstr(const char*, unsigned int*); 21 | int parse_port(const char*, int*); 22 | 23 | int open_bpf_map(const char*); 24 | 25 | void lnklst_add_to_map(int, struct iptnl_info *, __u64*); 26 | void lnklst_del_from_map(int, struct iptnl_info*, __u64*); 27 | 28 | void svcid_list_all(); 29 | void service_list_all(); 30 | void worker_list_all(); 31 | void linklist_list_all(); 32 | void show_worker(__u64); 33 | void list_worker_from_head(__u64); 34 | void list_all(); 35 | void list_lbcache(); 36 | 37 | void xlb_add_svc(struct vip*); 38 | void xlb_del_svc(struct vip*); 39 | void xlb_add_real(struct vip*, struct iptnl_info*); 40 | void xlb_del_real(struct vip*, struct iptnl_info*); 41 | 42 | struct _service { 43 | struct vip svc; 44 | struct iptnl_info wkr[256]; 45 | int wkr_count; 46 | }; 47 | 48 | 49 | -------------------------------------------------------------------------------- /xlbd.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "xlb_util.h" 3 | 4 | char* conf_yaml; 5 | 6 | 7 | enum state_value { 8 | EXPECT_NONE, 9 | EXPECT_MAP, 10 | EXPECT_IPV4, 11 
| EXPECT_PORT, 12 | }; 13 | 14 | enum vip_or_rip { 15 | NONE, 16 | VIP, 17 | RIP, 18 | }; 19 | 20 | struct parser_state { 21 | int rip_nest_level; 22 | int vip_nest_level; 23 | enum state_value state; 24 | enum vip_or_rip vor; 25 | char *vip; 26 | char *rip; 27 | char *port; 28 | }; 29 | 30 | int svc_num; 31 | 32 | struct _service service[256]; 33 | 34 | void prune_workers(){ 35 | __u64 key = 0, next_key; 36 | struct iptnl_info tnl; 37 | char ip_txt[INET_ADDRSTRLEN] = {0}; 38 | 39 | int fd_worker = open_bpf_map(file_worker); 40 | 41 | while (bpf_map_get_next_key(fd_worker, &key, &next_key) == 0) { 42 | bool doomed_worker = true; 43 | bpf_map_lookup_elem(fd_worker, &next_key, &tnl); 44 | 45 | if(DEBUG){ 46 | printf("\nsvcid: %d\n" , next_key>>32); 47 | assert(inet_ntop(tnl.family, &tnl.daddr.v4, ip_txt, sizeof(ip_txt))); 48 | printf("dst: %s\n", ip_txt ); 49 | } 50 | 51 | struct vip vip; 52 | int svcid = next_key>>32; 53 | 54 | int fd_svcid = open_bpf_map(file_svcid); 55 | bpf_map_lookup_elem(fd_svcid, &svcid, &vip); 56 | close(fd_svcid); 57 | 58 | for (int k=1 ; k < svc_num+1;k++){ 59 | if ( vip.daddr.v4 == service[k].svc.daddr.v4 && 60 | vip.dport == service[k].svc.dport && 61 | vip.protocol == service[k].svc.protocol){ 62 | 63 | for (int l=0 ; l < service[k].wkr_count ;l++){ 64 | if (DEBUG) 65 | printf("%d,%d,%d,%d\n",tnl.daddr.v4,service->wkr[l].daddr.v4,l,service->wkr_count); 66 | 67 | if ( tnl.daddr.v4 == service[k].wkr[l].daddr.v4){ 68 | doomed_worker = false; 69 | break; 70 | } 71 | } 72 | 73 | if (doomed_worker == false) 74 | break; 75 | } 76 | } 77 | 78 | if (doomed_worker==true){ 79 | if (DEBUG){ 80 | assert(inet_ntop(tnl.family, &tnl.daddr.v4, ip_txt, sizeof(ip_txt))); 81 | printf("Worker %s for #%d is doomed\n", ip_txt, svcid); 82 | } 83 | xlb_del_real(&vip,&tnl); 84 | } 85 | 86 | key = next_key; 87 | } 88 | 89 | close(fd_worker); 90 | } 91 | 92 | void prune_services() 93 | { 94 | struct vip key = {}, next_key; 95 | __u64 head,value; 96 | 97 | int 
fd_service = open_bpf_map(file_service); 98 | 99 | while (bpf_map_get_next_key(fd_service, &key, &next_key) == 0) { 100 | key = next_key; 101 | bpf_map_lookup_elem(fd_service, &key, &head); 102 | 103 | bool doomed_service = true; 104 | if (DEBUG) 105 | printf("%d, %d, %d\n",key.daddr.v4, key.dport, key.protocol); 106 | 107 | for (int k=1 ; k < svc_num+1;k++){ 108 | if (DEBUG) 109 | printf("....-> %d, %d, %d\n",service[k].svc.daddr.v4, service[k].svc.dport, service[k].svc.protocol); 110 | 111 | if ( key.daddr.v4 == service[k].svc.daddr.v4 && 112 | key.dport == service[k].svc.dport && 113 | key.protocol == service[k].svc.protocol){ 114 | doomed_service = false; 115 | } 116 | } 117 | 118 | if (doomed_service){ 119 | if (DEBUG){ 120 | char ip_txt[INET_ADDRSTRLEN] = {0}; 121 | assert(inet_ntop(key.family, &key.daddr.v4, ip_txt, sizeof(ip_txt))); 122 | printf("Service %s:%d(%d) is doomed\n", ip_txt, ntohs(key.dport), key.protocol); 123 | } 124 | xlb_del_svc(&key); 125 | } 126 | } 127 | 128 | close(fd_service); 129 | } 130 | 131 | int reflect_yaml() 132 | { 133 | for (int k=1 ; k < svc_num+1;k++){ 134 | xlb_add_svc(&service[k].svc); 135 | for (int l=0 ; l < service[k].wkr_count ;l++){ 136 | xlb_add_real(&service[k].svc, &service[k].wkr[l]); 137 | } 138 | } 139 | 140 | printf("\n"); 141 | 142 | prune_workers(); 143 | prune_services(); 144 | 145 | printf("\n"); 146 | 147 | return 0; 148 | } 149 | 150 | int parse_yaml() 151 | { 152 | struct _rs { 153 | char *ipv4; 154 | }; 155 | 156 | struct _vs { 157 | int num_rs; 158 | char *ipv4; 159 | char *port; 160 | struct _rs rs[256]; 161 | }; 162 | 163 | FILE *fh; 164 | yaml_parser_t parser; 165 | yaml_event_t event; 166 | int nest_level = 0 ; 167 | struct parser_state state = {.state=EXPECT_NONE}; 168 | 169 | struct _vs *vs = malloc(sizeof(struct _vs)*256); 170 | int j=0,i=0; 171 | 172 | fh = fopen(conf_yaml, "rb"); 173 | if(fh == NULL) 174 | printf("Failed to open \"%s\"\n", conf_yaml); 175 | assert(fh); 176 | 177 | 
if(!yaml_parser_initialize(&parser)) 178 | fputs("Failed to initialize parser!\n", stderr); 179 | if(fh == NULL) 180 | fputs("Failed to open file!\n", stderr); 181 | 182 | yaml_parser_set_input_file(&parser, fh); 183 | 184 | do { 185 | if (!yaml_parser_parse(&parser, &event)) { 186 | printf("Parser error %d\n", parser.error); 187 | exit(EXIT_FAILURE); 188 | } 189 | 190 | switch(event.type) 191 | { 192 | case YAML_MAPPING_START_EVENT: 193 | nest_level++; 194 | break; 195 | case YAML_MAPPING_END_EVENT: 196 | nest_level--; 197 | if ( state.rip_nest_level == nest_level) { 198 | // printf("(VIP,PORT,RIP) = (%s,%s,%s)\n", state.vip, state.port, state.rip); 199 | vs[i].rs[j].ipv4 = strdup(state.rip); 200 | j++; 201 | vs[i].num_rs=j; 202 | } 203 | break; 204 | case YAML_SCALAR_EVENT: 205 | 206 | if (strcmp(event.data.scalar.value, "virtual_server") == 0) { 207 | state.state = EXPECT_MAP; 208 | state.vor = VIP; 209 | i++;vs[i].num_rs=0; 210 | // vs[i].num_rs=0;i++; 211 | state.vip_nest_level = nest_level; 212 | } else if (strcmp((char*)event.data.scalar.value, "real_servers") == 0 || 213 | strcmp((char*)event.data.scalar.value, "real_servers") == 0) { 214 | // printf("(VIP,PORT) = (%s,%s)\n", state.vip, state.port); 215 | vs[i].ipv4 = strdup(state.vip); 216 | vs[i].port = strdup(state.port); 217 | j=0; 218 | state.state = EXPECT_MAP; 219 | state.vor = RIP; 220 | state.rip_nest_level = nest_level; 221 | } else if (strcmp((char*)event.data.scalar.value, "ipv4") == 0 ){ 222 | state.state = EXPECT_IPV4; 223 | } else if (strcmp(event.data.scalar.value, "port") == 0 ){ 224 | state.state = EXPECT_PORT; 225 | } else { // parse values 226 | 227 | if (state.vor == VIP && state.state == EXPECT_IPV4 ){ 228 | state.vip = strdup(event.data.scalar.value); 229 | } else if (state.vor == VIP && state.state == EXPECT_PORT){ 230 | state.port = strdup(event.data.scalar.value); 231 | } else if (state.vor == RIP && state.state == EXPECT_IPV4){ 232 | state.rip = strdup(event.data.scalar.value); 
233 | } 234 | 235 | state.state = EXPECT_NONE; 236 | } 237 | break; 238 | 239 | case YAML_NO_EVENT: 240 | case YAML_STREAM_START_EVENT: 241 | case YAML_STREAM_END_EVENT: 242 | case YAML_DOCUMENT_START_EVENT: 243 | case YAML_DOCUMENT_END_EVENT: 244 | case YAML_SEQUENCE_START_EVENT: 245 | case YAML_SEQUENCE_END_EVENT: 246 | case YAML_ALIAS_EVENT: 247 | break; 248 | default: 249 | break; 250 | } 251 | if(event.type != YAML_STREAM_END_EVENT) 252 | yaml_event_delete(&event); 253 | } while(event.type != YAML_STREAM_END_EVENT); 254 | 255 | yaml_event_delete(&event); 256 | yaml_parser_delete(&parser); 257 | fclose(fh); 258 | 259 | svc_num=i; 260 | 261 | /* 262 | for (int k=1 ; k < i+1;k++){ 263 | printf("%s:%s\n",vs[k].ipv4,vs[k].port); 264 | for (int l=0 ; l < vs[k].num_rs ;l++){ 265 | printf(" %s\n",vs[k].rs[l].ipv4); 266 | } 267 | } 268 | printf("\n"); 269 | */ 270 | 271 | for (int k=1 ; k < svc_num+1;k++){ 272 | 273 | service[k].svc.protocol = IPPROTO_TCP; 274 | service[k].svc.family= parse_ipstr(vs[k].ipv4, &service[k].svc.daddr.v6); 275 | 276 | int port=0; 277 | parse_port(vs[k].port, &port); 278 | service[k].svc.dport=htons(port); 279 | 280 | for (int l=0 ; l < vs[k].num_rs ;l++){ 281 | service[k].wkr[l].family=parse_ipstr(vs[k].rs[l].ipv4, &service[k].wkr[l].daddr.v6); 282 | } 283 | service[k].wkr_count = vs[k].num_rs; 284 | } 285 | 286 | free(vs); 287 | return 0; 288 | } 289 | 290 | void sig_reader(int signal){ 291 | printf("recved signal = %d\n",signal); 292 | parse_yaml(); 293 | reflect_yaml(); 294 | } 295 | 296 | int main(int argc, const char *argv[]) 297 | { 298 | struct sigaction sa; 299 | if (argc != 2){ 300 | printf("argc = %d\n", argc); 301 | printf("argc must be 2\n"); 302 | exit(1); 303 | } 304 | conf_yaml = strdup(argv[1]); 305 | parse_yaml(); 306 | reflect_yaml(); 307 | 308 | printf("\nMy pid is: %d\n\n", getpid()); 309 | sa.sa_handler = &sig_reader; 310 | sa.sa_flags = SA_RESTART; 311 | sigfillset(&sa.sa_mask); 312 | 313 | sigaction (SIGUSR1, &sa, 
NULL); 314 | sigaction (SIGHUP, &sa, NULL); 315 | 316 | 317 | while(1) { 318 | sleep(1); 319 | } 320 | 321 | } 322 | -------------------------------------------------------------------------------- /xlbd.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | - virtual_server: 3 | ipv4: 10.1.1.1 4 | port: 80 5 | real_servers: 6 | - ipv4: 172.16.51.2 7 | - ipv4: 172.16.57.2 8 | - virtual_server: 9 | ipv4: 10.1.1.2 10 | port: 80 11 | real_servers: 12 | - ipv4: 172.16.51.2 13 | - virtual_server: 14 | ipv4: 10.0.0.1 15 | port: 81 16 | real_servers: 17 | - ipv4: 172.16.51.2 18 | - ipv4: 192.168.51.2 19 | 20 | 21 | --------------------------------------------------------------------------------