├── .gitmodules ├── README.md ├── brc ├── .gitignore ├── Makefile ├── bpf_helpers.h ├── brc.bpf.c ├── brc.c ├── brc_common.h ├── mount.sh ├── parse_ip.c ├── restart.sh ├── trace.bpf.c └── trace.c ├── tools └── bpftool └── vmlinux ├── arm64 ├── vmlinux.h └── vmlinux_516.h ├── vmlinux.h └── x86 ├── vmlinux.h └── vmlinux_508.h /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libbpf"] 2 | path = libbpf 3 | url = https://github.com/libbpf/libbpf.git 4 | [submodule "redis"] 5 | path = redis 6 | url = git@github.com:redis/redis.git 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ebpf-cache-for-redis 2 | 3 | 1. git submodule update --init --recursive 4 | 2. cd brc 5 | 3. make 6 | 7 | 需要修改libbpf/src/libbpf.c 中`SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE),`为`SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX),` 8 | 9 | - ./redis-server 启动一个redis服务器 10 | - ./redis-cli CONFIG SET protected-mode no 使得其可被外网访问 11 | - 在brc/.bin 中执行`sh ../restart.sh` 12 | - 在任意路径执行`sh mount.sh` 13 | - 另一台机器上启动./redis-cli 执行set get 14 | - 执行`./tools/bpftool map dump name map_cache` 查看实际cache中的数据 15 | - 在`/tmp/brc_stats.txt`和`/tmp/brc_stats_interval.txt`中可以查看map_stats中的实时数据 16 | 17 | 18 | 19 | --- 20 | # 手动编译 21 | 22 | 1. clang -g -O2 -target bpf -D__TARGET_ARCH_x86 -I../vmlinux/x86/ -idirafter /usr/local/include -idirafter /usr/lib64/clang/11.0.0/include -idirafter /usr/include -c trace.bpf.c -o trace.bpf.o 23 | 2. /root/exercise/libbpf-bootstrap/tools/bpftool gen skeleton trace.bpf.o > trace.skel.h 24 | 3. clang -g -O2 -Wall -I . -c trace.c -o trace.o 25 | 4. 
clang -Wall -O2 -g trace.o /root/exercise/libbpf-bootstrap/examples/c/.output/libbpf.a -lelf -lz -o trace 26 | 27 | # 执行 28 | 现在在用户态挂载的时候有一个大问题,就是bpf_tc_hook_create不太成熟,资料较少,所以使用object pin先把TC bpf挂载,然后再手动挂载TC程序,我倾向于使用更低级别的接口,但是捣鼓了两天没搞出来,后续看下libbpf源码 29 | 1. make && cd .bin && ./brc 30 | 2. tc qdisc add dev eth0 clsact 31 | 3. tc filter add dev eth0 ingress bpf object-pinned /sys/fs/bpf/tc/brc_rx_filter 32 | 4. tc filter add dev eth0 egress bpf object-pinned /sys/fs/bpf/tc/brc_tx_filter 33 | 5. cat /sys/kernel/debug/tracing/trace_pipe 34 | 35 | 上述三四步不能颠倒,否则map_keys中的数据可能会出现错乱 36 | 37 | 如果bpf文件系统还没挂载就执行: 38 | 1. mount -t bpf none /sys/fs/bpf/ 39 | 40 | # 卸载 41 | 卸载bpf程序和qdisc 42 | 1. tc filter del dev eth0 egress 43 | 2. tc filter del dev eth0 ingress 44 | 3. tc qdisc del dev eth0 clsact 45 | 4. rm /sys/fs/bpf/tc/brc_rx_filter 46 | 5. rm /sys/fs/bpf/tc/brc_tx_filter 47 | 48 | # 对于BPF程序的解释 49 | ## brc_rx_filter 50 | 1. 将6379端口TCP协议的get请求执行解析,解析结果放在pctx中,然后执行尾调用brc_hash_keys 51 | 2. 对于非get数据执行brc_invalidate_cache 52 | 53 | ## brc_hash_keys 54 | 1. 找到这个get请求中key的hash_index 55 | 2. 对应的entry如果是有效的话调用brc_prepare_packet 56 | 3. 对应的entry是无效的话把key放入到invaild_key_data中,在egress中需要用到这个key的数据(queue如何把栈上数据放入其中) 57 | ## brc_invalidate_cache 58 | 1. 使得set操作中key对应的hash entry为invalid 59 | ## brc_tx_filter 60 | 1. 对于6379端口且是批量回复的数据执行解析,解析结果放在pctx中,如果发现是"$-1\r\n"的话需要从invaild_key_data中pop一个数据项,然后执行brc_update_cache 61 | ## brc_update_cache 62 | 1. 从invaild_key_data中拿到此次get实际key的数据 63 | 2. 计算key对应的hash_index 64 | 3. 
如果此entry是invalid的,替换其中所有的值;如果是valid有效,意味着此次get的这个 65 | 66 | 67 | # Redis协议 68 | https://redis.io/topics/protocol 69 | ## 批量回复 70 | 支持二进制安全字符串 71 | > "*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n" 72 | 73 | > "*2\r\n$3\r\nget\r\n$13\r\nusername:1234\r\n" 74 | 75 | > "$6\r\nfoobar\r\n" 76 | 77 | > "$-1\r\n" 78 | 79 | ## 整数回复 80 | `:` 之后就是整数 81 | > :0\r\n 82 | 83 | > :1000\r\n 84 | 85 | 86 | ## 状态回复 87 | 客户端返回`+`之后的消息 88 | > +OK\r\n 89 | 90 | ## 错误回复 91 | `-`之后代表错误类型,ERR 是一个通用错误,而 WRONGTYPE 则是一个更特定的错误,之后为内容 92 | > -Error message\r\n 93 | 94 | > -WRONGTYPE Operation against a key holding the wrong kind of value\r\n 95 | 96 | ## 数组回复 97 | 98 | 99 | # 关于用户态/内核态一致性 100 | 希望cache不会对用户态本身的数据有任何影响,所以set操作是一定会pass到用户态处理的,然后在get的时候再执行cache update的 101 | 102 | 但是这样的话仅仅凭借get返回值的数据没法更新cache(返回值中不存在key,无法自解释),所以现在是否需要在get操作执行的时候做一些数据的保存呢 103 | 104 | 一个方法是在get操作时在ingress中插入key,然后在egress中把这个key拿出来,现在看起来只要get了,除非redis崩了,egress中一定是可以拿到这个消息的,但是用户态可能出现阻塞,也就是说cache中key可能出现堆积,但是一定是先进先出的一个过程,搞一个循环队列? 105 | 106 | 在用户态宕机的时候清空内核cache,然后在get可能使得循环队列超限的时候也清空内核cache 107 | 108 | # pragma clang loop unroll(disable) 109 | 指示编译器不允许展开循环,但是clang官网说展开的话可以增加ILP(指令级别并行)的机会 110 | https://cseweb.ucsd.edu//classes/wi05/cse240a/ilp1.pdf 111 | https://clang.llvm.org/docs/LanguageExtensions.html#loop-unrolling 112 | 113 | # debug 114 | 1. https://stackoverflow.com/questions/53136145/how-to-solve-the-r0-invalid-mem-access-inv-error-when-loading-an-ebpf-file-o 115 | 2. https://mechpen.github.io/posts/2019-08-29-bpf-verifier/index.html 116 | 3. 貌似bpf_printk的中文不能显示? 117 | 4. spin_lock的范围内不允许使用bpf_printk 118 | 5. 
目前queue的push操作需要栈上的对象,导致key的上限比较小 -------------------------------------------------------------------------------- /brc/.gitignore: -------------------------------------------------------------------------------- 1 | /.output 2 | /.bin 3 | /tcp_sock.c -------------------------------------------------------------------------------- /brc/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | OUTPUT := .output 3 | BINARY := .bin 4 | CLANG ?= clang 5 | LLVM_STRIP ?= llvm-strip 6 | BPFTOOL ?= $(abspath ../tools/bpftool) 7 | LIBBPF_SRC := $(abspath ../libbpf/src) 8 | LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) 9 | ARCH := $(shell uname -m | sed 's/x86_64/x86/' | sed 's/aarch64/arm64/' | sed 's/ppc64le/powerpc/' | sed 's/mips.*/mips/') 10 | VMLINUX := ../vmlinux/$(ARCH)/vmlinux.h 11 | # Use our own libbpf API headers and Linux UAPI headers distributed with 12 | # libbpf to avoid dependency on system-wide headers, which could be missing or 13 | # outdated 14 | INCLUDES := -I$(OUTPUT) -I$(BINARY) -I../libbpf/include/uapi -I. -I$(dir $(VMLINUX)) 15 | CFLAGS := -g -Wall -v 16 | 17 | APPS = brc 18 | 19 | # Get Clang's default includes on this system. We'll explicitly add these dirs 20 | # to the includes list when compiling with `-target bpf` because otherwise some 21 | # architecture-specific dirs will be "missing" on some architectures/distros - 22 | # headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, 23 | # sys/cdefs.h etc. might be missing. 24 | # 25 | # Use '-idirafter': Don't interfere with include mechanics except where the 26 | # build would have failed anyways. 
27 | CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - </dev/null 2>&1 \ 28 | | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') 29 | 30 | # Output macros: Q silences recipe echo and msg prints a short progress line; V=1 disables both 31 | ifeq ($(V),1) 32 | Q = 33 | msg = 34 | else 35 | Q = @ 36 | msg = @printf ' %-8s %s%s\n' \ 37 | "$(1)" \ 38 | "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ 39 | "$(if $(3), $(3))"; 40 | MAKEFLAGS += --no-print-directory 41 | endif 42 | 43 | .PHONY: all 44 | all: $(APPS) 45 | 46 | .PHONY: clean 47 | clean: 48 | $(call msg,CLEAN) 49 | $(Q)rm -rf $(OUTPUT) ${BINARY} $(APPS) 50 | 51 | $(OUTPUT) $(OUTPUT)/libbpf $(BINARY): 52 | $(call msg,MKDIR,$@) 53 | $(Q)mkdir -p $@ 54 | 55 | # Build libbpf 56 | $(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf 57 | $(call msg,LIB,$@) 58 | $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ 59 | OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ 60 | INCLUDEDIR= LIBDIR= UAPIDIR= \ 61 | install 62 | 63 | # The standard five build steps follow 64 | # $< is the first prerequisite 65 | # $@ is the target file 66 | # $^ is the list of all prerequisites 67 | # dir extracts the directory part of a file name 68 | # | introduces order-only prerequisites: they are built if missing, but being newer than the target does not force a rebuild 69 | # Build BPF code 70 | $(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) 71 | $(call msg,BPF,$@) 72 | $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) -c $(filter %.c,$^) -o $@ 73 | $(Q)$(LLVM_STRIP) -g $@ # strip useless DWARF info 74 | 75 | # Generate BPF skeletons 76 | $(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) 77 | $(call msg,GEN-SKEL,$@) 78 | $(Q)$(BPFTOOL) gen skeleton $< > $@ 79 | 80 | # Build user-space code 81 | # patsubst rewrites names by pattern: $(patsubst pattern,replacement,list) 82 | $(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h 83 | 84 | # wildcard expands to the existing files that match a pattern 85 | $(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) 86 | $(call msg,CC,$@) 87 | $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ 88 | 89 | # Build application binary 90 | $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(BINARY) 91 | $(call msg,BINARY,$@) 92 | $(Q)$(CC) 
$(CFLAGS) $^ -lelf -lz -o $(BINARY)/$@ 93 | 94 | 95 | # delete failed targets 96 | .DELETE_ON_ERROR: 97 | 98 | # keep intermediate (.skel.h, .bpf.o, etc) targets 99 | .SECONDARY: 100 | 101 | -------------------------------------------------------------------------------- /brc/bpf_helpers.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BPF_HELPERS__ 3 | #define __BPF_HELPERS__ 4 | 5 | #define __uint(name, val) int (*name)[val] 6 | #define __type(name, val) typeof(val) *name 7 | 8 | /* helper macro to print out debug messages */ 9 | #define bpf_printk(fmt, ...) \ 10 | ({ \ 11 | char ____fmt[] = fmt; \ 12 | bpf_trace_printk(____fmt, sizeof(____fmt), \ 13 | ##__VA_ARGS__); \ 14 | }) 15 | 16 | #ifdef __clang__ 17 | 18 | /* helper macro to place programs, maps, license in 19 | * different sections in elf_bpf file. Section names 20 | * are interpreted by elf_bpf loader 21 | */ 22 | #define SEC(NAME) __attribute__((section(NAME), used)) 23 | 24 | /* helper functions called from eBPF programs written in C */ 25 | static void *(*bpf_map_lookup_elem)(void *map, const void *key) = 26 | (void *) BPF_FUNC_map_lookup_elem; 27 | static int (*bpf_map_update_elem)(void *map, const void *key, const void *value, 28 | unsigned long long flags) = 29 | (void *) BPF_FUNC_map_update_elem; 30 | static int (*bpf_map_delete_elem)(void *map, const void *key) = 31 | (void *) BPF_FUNC_map_delete_elem; 32 | static int (*bpf_map_push_elem)(void *map, const void *value, 33 | unsigned long long flags) = 34 | (void *) BPF_FUNC_map_push_elem; 35 | static int (*bpf_map_pop_elem)(void *map, void *value) = 36 | (void *) BPF_FUNC_map_pop_elem; 37 | static int (*bpf_map_peek_elem)(void *map, void *value) = 38 | (void *) BPF_FUNC_map_peek_elem; 39 | static int (*bpf_probe_read)(void *dst, int size, const void *unsafe_ptr) = 40 | (void *) BPF_FUNC_probe_read; 41 | static unsigned long long 
(*bpf_ktime_get_ns)(void) = 42 | (void *) BPF_FUNC_ktime_get_ns; 43 | static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = 44 | (void *) BPF_FUNC_trace_printk; 45 | static void (*bpf_tail_call)(void *ctx, void *map, int index) = 46 | (void *) BPF_FUNC_tail_call; 47 | static unsigned long long (*bpf_get_smp_processor_id)(void) = 48 | (void *) BPF_FUNC_get_smp_processor_id; 49 | static unsigned long long (*bpf_get_current_pid_tgid)(void) = 50 | (void *) BPF_FUNC_get_current_pid_tgid; 51 | static unsigned long long (*bpf_get_current_uid_gid)(void) = 52 | (void *) BPF_FUNC_get_current_uid_gid; 53 | static int (*bpf_get_current_comm)(void *buf, int buf_size) = 54 | (void *) BPF_FUNC_get_current_comm; 55 | static unsigned long long (*bpf_perf_event_read)(void *map, 56 | unsigned long long flags) = 57 | (void *) BPF_FUNC_perf_event_read; 58 | static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = 59 | (void *) BPF_FUNC_clone_redirect; 60 | static int (*bpf_redirect)(int ifindex, int flags) = 61 | (void *) BPF_FUNC_redirect; 62 | static int (*bpf_redirect_map)(void *map, int key, int flags) = 63 | (void *) BPF_FUNC_redirect_map; 64 | static int (*bpf_perf_event_output)(void *ctx, void *map, 65 | unsigned long long flags, void *data, 66 | int size) = 67 | (void *) BPF_FUNC_perf_event_output; 68 | static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = 69 | (void *) BPF_FUNC_get_stackid; 70 | static int (*bpf_probe_write_user)(void *dst, const void *src, int size) = 71 | (void *) BPF_FUNC_probe_write_user; 72 | static int (*bpf_current_task_under_cgroup)(void *map, int index) = 73 | (void *) BPF_FUNC_current_task_under_cgroup; 74 | static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = 75 | (void *) BPF_FUNC_skb_get_tunnel_key; 76 | static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = 77 | (void *) BPF_FUNC_skb_set_tunnel_key; 78 | static int (*bpf_skb_get_tunnel_opt)(void *ctx, 
void *md, int size) = 79 | (void *) BPF_FUNC_skb_get_tunnel_opt; 80 | static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = 81 | (void *) BPF_FUNC_skb_set_tunnel_opt; 82 | static unsigned long long (*bpf_get_prandom_u32)(void) = 83 | (void *) BPF_FUNC_get_prandom_u32; 84 | static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = 85 | (void *) BPF_FUNC_xdp_adjust_head; 86 | static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = 87 | (void *) BPF_FUNC_xdp_adjust_meta; 88 | static int (*bpf_get_socket_cookie)(void *ctx) = 89 | (void *) BPF_FUNC_get_socket_cookie; 90 | static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, 91 | int optlen) = 92 | (void *) BPF_FUNC_setsockopt; 93 | static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, 94 | int optlen) = 95 | (void *) BPF_FUNC_getsockopt; 96 | static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = 97 | (void *) BPF_FUNC_sock_ops_cb_flags_set; 98 | static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = 99 | (void *) BPF_FUNC_sk_redirect_map; 100 | static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, int flags) = 101 | (void *) BPF_FUNC_sk_redirect_hash; 102 | static int (*bpf_sock_map_update)(void *map, void *key, void *value, 103 | unsigned long long flags) = 104 | (void *) BPF_FUNC_sock_map_update; 105 | static int (*bpf_sock_hash_update)(void *map, void *key, void *value, 106 | unsigned long long flags) = 107 | (void *) BPF_FUNC_sock_hash_update; 108 | static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, 109 | void *buf, unsigned int buf_size) = 110 | (void *) BPF_FUNC_perf_event_read_value; 111 | static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, 112 | unsigned int buf_size) = 113 | (void *) BPF_FUNC_perf_prog_read_value; 114 | static int (*bpf_override_return)(void *ctx, unsigned long rc) = 115 | (void *) BPF_FUNC_override_return; 116 | static int 
(*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = 117 | (void *) BPF_FUNC_msg_redirect_map; 118 | static int (*bpf_msg_redirect_hash)(void *ctx, 119 | void *map, void *key, int flags) = 120 | (void *) BPF_FUNC_msg_redirect_hash; 121 | static int (*bpf_msg_apply_bytes)(void *ctx, int len) = 122 | (void *) BPF_FUNC_msg_apply_bytes; 123 | static int (*bpf_msg_cork_bytes)(void *ctx, int len) = 124 | (void *) BPF_FUNC_msg_cork_bytes; 125 | static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) = 126 | (void *) BPF_FUNC_msg_pull_data; 127 | static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) = 128 | (void *) BPF_FUNC_msg_push_data; 129 | static int (*bpf_msg_pop_data)(void *ctx, int start, int cut, int flags) = 130 | (void *) BPF_FUNC_msg_pop_data; 131 | static int (*bpf_bind)(void *ctx, void *addr, int addr_len) = 132 | (void *) BPF_FUNC_bind; 133 | static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = 134 | (void *) BPF_FUNC_xdp_adjust_tail; 135 | static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, 136 | int size, int flags) = 137 | (void *) BPF_FUNC_skb_get_xfrm_state; 138 | static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = 139 | (void *) BPF_FUNC_sk_select_reuseport; 140 | static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = 141 | (void *) BPF_FUNC_get_stack; 142 | static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, 143 | int plen, __u32 flags) = 144 | (void *) BPF_FUNC_fib_lookup; 145 | static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr, 146 | unsigned int len) = 147 | (void *) BPF_FUNC_lwt_push_encap; 148 | static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset, 149 | void *from, unsigned int len) = 150 | (void *) BPF_FUNC_lwt_seg6_store_bytes; 151 | static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param, 152 | unsigned int param_len) = 153 | (void 
*) BPF_FUNC_lwt_seg6_action; 154 | static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset, 155 | unsigned int len) = 156 | (void *) BPF_FUNC_lwt_seg6_adjust_srh; 157 | static int (*bpf_rc_repeat)(void *ctx) = 158 | (void *) BPF_FUNC_rc_repeat; 159 | static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol, 160 | unsigned long long scancode, unsigned int toggle) = 161 | (void *) BPF_FUNC_rc_keydown; 162 | static unsigned long long (*bpf_get_current_cgroup_id)(void) = 163 | (void *) BPF_FUNC_get_current_cgroup_id; 164 | static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = 165 | (void *) BPF_FUNC_get_local_storage; 166 | static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = 167 | (void *) BPF_FUNC_skb_cgroup_id; 168 | static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = 169 | (void *) BPF_FUNC_skb_ancestor_cgroup_id; 170 | static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, 171 | struct bpf_sock_tuple *tuple, 172 | int size, unsigned long long netns_id, 173 | unsigned long long flags) = 174 | (void *) BPF_FUNC_sk_lookup_tcp; 175 | static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, 176 | struct bpf_sock_tuple *tuple, 177 | int size, unsigned long long netns_id, 178 | unsigned long long flags) = 179 | (void *) BPF_FUNC_skc_lookup_tcp; 180 | static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, 181 | struct bpf_sock_tuple *tuple, 182 | int size, unsigned long long netns_id, 183 | unsigned long long flags) = 184 | (void *) BPF_FUNC_sk_lookup_udp; 185 | static int (*bpf_sk_release)(struct bpf_sock *sk) = 186 | (void *) BPF_FUNC_sk_release; 187 | static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) = 188 | (void *) BPF_FUNC_skb_vlan_push; 189 | static int (*bpf_skb_vlan_pop)(void *ctx) = 190 | (void *) BPF_FUNC_skb_vlan_pop; 191 | static int (*bpf_rc_pointer_rel)(void *ctx, int rel_x, int rel_y) = 192 | (void *) BPF_FUNC_rc_pointer_rel; 193 | static void 
(*bpf_spin_lock)(struct bpf_spin_lock *lock) = 194 | (void *) BPF_FUNC_spin_lock; 195 | static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = 196 | (void *) BPF_FUNC_spin_unlock; 197 | static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = 198 | (void *) BPF_FUNC_sk_fullsock; 199 | static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = 200 | (void *) BPF_FUNC_tcp_sock; 201 | static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = 202 | (void *) BPF_FUNC_get_listener_sock; 203 | static int (*bpf_skb_ecn_set_ce)(void *ctx) = 204 | (void *) BPF_FUNC_skb_ecn_set_ce; 205 | static int (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, 206 | void *ip, int ip_len, void *tcp, int tcp_len) = 207 | (void *) BPF_FUNC_tcp_check_syncookie; 208 | static int (*bpf_sysctl_get_name)(void *ctx, char *buf, 209 | unsigned long long buf_len, 210 | unsigned long long flags) = 211 | (void *) BPF_FUNC_sysctl_get_name; 212 | static int (*bpf_sysctl_get_current_value)(void *ctx, char *buf, 213 | unsigned long long buf_len) = 214 | (void *) BPF_FUNC_sysctl_get_current_value; 215 | static int (*bpf_sysctl_get_new_value)(void *ctx, char *buf, 216 | unsigned long long buf_len) = 217 | (void *) BPF_FUNC_sysctl_get_new_value; 218 | static int (*bpf_sysctl_set_new_value)(void *ctx, const char *buf, 219 | unsigned long long buf_len) = 220 | (void *) BPF_FUNC_sysctl_set_new_value; 221 | static int (*bpf_strtol)(const char *buf, unsigned long long buf_len, 222 | unsigned long long flags, long *res) = 223 | (void *) BPF_FUNC_strtol; 224 | static int (*bpf_strtoul)(const char *buf, unsigned long long buf_len, 225 | unsigned long long flags, unsigned long *res) = 226 | (void *) BPF_FUNC_strtoul; 227 | static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, 228 | void *value, __u64 flags) = 229 | (void *) BPF_FUNC_sk_storage_get; 230 | static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = 231 | (void *)BPF_FUNC_sk_storage_delete; 232 | 
static int (*bpf_send_signal)(unsigned sig) = (void *)BPF_FUNC_send_signal; 233 | static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip, 234 | int ip_len, void *tcp, int tcp_len) = 235 | (void *) BPF_FUNC_tcp_gen_syncookie; 236 | 237 | /* llvm builtin functions that eBPF C program may use to 238 | * emit BPF_LD_ABS and BPF_LD_IND instructions 239 | */ 240 | struct sk_buff; 241 | unsigned long long load_byte(void *skb, 242 | unsigned long long off) asm("llvm.bpf.load.byte"); 243 | unsigned long long load_half(void *skb, 244 | unsigned long long off) asm("llvm.bpf.load.half"); 245 | unsigned long long load_word(void *skb, 246 | unsigned long long off) asm("llvm.bpf.load.word"); 247 | 248 | /* a helper structure used by eBPF C program 249 | * to describe map attributes to elf_bpf loader 250 | */ 251 | struct bpf_map_def { 252 | unsigned int type; 253 | unsigned int key_size; 254 | unsigned int value_size; 255 | unsigned int max_entries; 256 | unsigned int map_flags; 257 | unsigned int inner_map_idx; 258 | unsigned int numa_node; 259 | }; 260 | 261 | #else 262 | /* Non-clang compilers get the helper declarations from a compiler-provided header. NOTE(review): the header name was missing here and is restored to match the upstream kernel selftest header (gcc's bpf-helpers.h) - confirm. */ 263 | #include <bpf-helpers.h> 264 | 265 | #endif 266 | 267 | #define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ 268 | struct ____btf_map_##name { \ 269 | type_key key; \ 270 | type_val value; \ 271 | }; \ 272 | struct ____btf_map_##name \ 273 | __attribute__ ((section(".maps." 
#name), used)) \ 274 | ____btf_map_##name = { } 275 | 276 | static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = 277 | (void *) BPF_FUNC_skb_load_bytes; 278 | static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = 279 | (void *) BPF_FUNC_skb_load_bytes_relative; 280 | static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = 281 | (void *) BPF_FUNC_skb_store_bytes; 282 | static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = 283 | (void *) BPF_FUNC_l3_csum_replace; 284 | static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = 285 | (void *) BPF_FUNC_l4_csum_replace; 286 | static int (*bpf_csum_diff)(void *from, int from_size, void *to, int to_size, int seed) = 287 | (void *) BPF_FUNC_csum_diff; 288 | static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = 289 | (void *) BPF_FUNC_skb_under_cgroup; 290 | static int (*bpf_skb_change_head)(void *, int len, int flags) = 291 | (void *) BPF_FUNC_skb_change_head; 292 | static int (*bpf_skb_pull_data)(void *, int len) = 293 | (void *) BPF_FUNC_skb_pull_data; 294 | static unsigned int (*bpf_get_cgroup_classid)(void *ctx) = 295 | (void *) BPF_FUNC_get_cgroup_classid; 296 | static unsigned int (*bpf_get_route_realm)(void *ctx) = 297 | (void *) BPF_FUNC_get_route_realm; 298 | static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) = 299 | (void *) BPF_FUNC_skb_change_proto; 300 | static int (*bpf_skb_change_type)(void *ctx, __u32 type) = 301 | (void *) BPF_FUNC_skb_change_type; 302 | static unsigned int (*bpf_get_hash_recalc)(void *ctx) = 303 | (void *) BPF_FUNC_get_hash_recalc; 304 | static unsigned long long (*bpf_get_current_task)(void) = 305 | (void *) BPF_FUNC_get_current_task; 306 | static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) = 307 | (void *) BPF_FUNC_skb_change_tail; 308 | static long long 
(*bpf_csum_update)(void *ctx, __u32 csum) = 309 | (void *) BPF_FUNC_csum_update; 310 | static void (*bpf_set_hash_invalid)(void *ctx) = 311 | (void *) BPF_FUNC_set_hash_invalid; 312 | static int (*bpf_get_numa_node_id)(void) = 313 | (void *) BPF_FUNC_get_numa_node_id; 314 | static int (*bpf_probe_read_str)(void *ctx, __u32 size, 315 | const void *unsafe_ptr) = 316 | (void *) BPF_FUNC_probe_read_str; 317 | static unsigned int (*bpf_get_socket_uid)(void *ctx) = 318 | (void *) BPF_FUNC_get_socket_uid; 319 | static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = 320 | (void *) BPF_FUNC_set_hash; 321 | static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, 322 | unsigned long long flags) = 323 | (void *) BPF_FUNC_skb_adjust_room; 324 | 325 | /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ 326 | #if defined(__TARGET_ARCH_x86) 327 | #define bpf_target_x86 328 | #define bpf_target_defined 329 | #elif defined(__TARGET_ARCH_s390) 330 | #define bpf_target_s390 331 | #define bpf_target_defined 332 | #elif defined(__TARGET_ARCH_arm) 333 | #define bpf_target_arm 334 | #define bpf_target_defined 335 | #elif defined(__TARGET_ARCH_arm64) 336 | #define bpf_target_arm64 337 | #define bpf_target_defined 338 | #elif defined(__TARGET_ARCH_mips) 339 | #define bpf_target_mips 340 | #define bpf_target_defined 341 | #elif defined(__TARGET_ARCH_powerpc) 342 | #define bpf_target_powerpc 343 | #define bpf_target_defined 344 | #elif defined(__TARGET_ARCH_sparc) 345 | #define bpf_target_sparc 346 | #define bpf_target_defined 347 | #else 348 | #undef bpf_target_defined 349 | #endif 350 | 351 | /* Fall back to what the compiler says */ 352 | #ifndef bpf_target_defined 353 | #if defined(__x86_64__) 354 | #define bpf_target_x86 355 | #elif defined(__s390__) 356 | #define bpf_target_s390 357 | #elif defined(__arm__) 358 | #define bpf_target_arm 359 | #elif defined(__aarch64__) 360 | #define bpf_target_arm64 361 | #elif defined(__mips__) 362 | #define 
bpf_target_mips 363 | #elif defined(__powerpc__) 364 | #define bpf_target_powerpc 365 | #elif defined(__sparc__) 366 | #define bpf_target_sparc 367 | #endif 368 | #endif 369 | 370 | #if defined(bpf_target_x86) 371 | 372 | #ifdef __KERNEL__ 373 | #define PT_REGS_PARM1(x) ((x)->di) 374 | #define PT_REGS_PARM2(x) ((x)->si) 375 | #define PT_REGS_PARM3(x) ((x)->dx) 376 | #define PT_REGS_PARM4(x) ((x)->cx) 377 | #define PT_REGS_PARM5(x) ((x)->r8) 378 | #define PT_REGS_RET(x) ((x)->sp) 379 | #define PT_REGS_FP(x) ((x)->bp) 380 | #define PT_REGS_RC(x) ((x)->ax) 381 | #define PT_REGS_SP(x) ((x)->sp) 382 | #define PT_REGS_IP(x) ((x)->ip) 383 | #else 384 | #ifdef __i386__ 385 | /* i386 kernel is built with -mregparm=3 */ 386 | #define PT_REGS_PARM1(x) ((x)->eax) 387 | #define PT_REGS_PARM2(x) ((x)->edx) 388 | #define PT_REGS_PARM3(x) ((x)->ecx) 389 | #define PT_REGS_PARM4(x) 0 390 | #define PT_REGS_PARM5(x) 0 391 | #define PT_REGS_RET(x) ((x)->esp) 392 | #define PT_REGS_FP(x) ((x)->ebp) 393 | #define PT_REGS_RC(x) ((x)->eax) 394 | #define PT_REGS_SP(x) ((x)->esp) 395 | #define PT_REGS_IP(x) ((x)->eip) 396 | #else 397 | #define PT_REGS_PARM1(x) ((x)->rdi) 398 | #define PT_REGS_PARM2(x) ((x)->rsi) 399 | #define PT_REGS_PARM3(x) ((x)->rdx) 400 | #define PT_REGS_PARM4(x) ((x)->rcx) 401 | #define PT_REGS_PARM5(x) ((x)->r8) 402 | #define PT_REGS_RET(x) ((x)->rsp) 403 | #define PT_REGS_FP(x) ((x)->rbp) 404 | #define PT_REGS_RC(x) ((x)->rax) 405 | #define PT_REGS_SP(x) ((x)->rsp) 406 | #define PT_REGS_IP(x) ((x)->rip) 407 | #endif 408 | #endif 409 | 410 | #elif defined(bpf_target_s390) 411 | 412 | /* s390 provides user_pt_regs instead of struct pt_regs to userspace */ 413 | struct pt_regs; 414 | #define PT_REGS_S390 const volatile user_pt_regs 415 | #define PT_REGS_PARM1(x) (((PT_REGS_S390 *)(x))->gprs[2]) 416 | #define PT_REGS_PARM2(x) (((PT_REGS_S390 *)(x))->gprs[3]) 417 | #define PT_REGS_PARM3(x) (((PT_REGS_S390 *)(x))->gprs[4]) 418 | #define PT_REGS_PARM4(x) (((PT_REGS_S390 
*)(x))->gprs[5]) 419 | #define PT_REGS_PARM5(x) (((PT_REGS_S390 *)(x))->gprs[6]) 420 | #define PT_REGS_RET(x) (((PT_REGS_S390 *)(x))->gprs[14]) 421 | /* Works only with CONFIG_FRAME_POINTER */ 422 | #define PT_REGS_FP(x) (((PT_REGS_S390 *)(x))->gprs[11]) 423 | #define PT_REGS_RC(x) (((PT_REGS_S390 *)(x))->gprs[2]) 424 | #define PT_REGS_SP(x) (((PT_REGS_S390 *)(x))->gprs[15]) 425 | #define PT_REGS_IP(x) (((PT_REGS_S390 *)(x))->psw.addr) 426 | 427 | #elif defined(bpf_target_arm) 428 | 429 | #define PT_REGS_PARM1(x) ((x)->uregs[0]) 430 | #define PT_REGS_PARM2(x) ((x)->uregs[1]) 431 | #define PT_REGS_PARM3(x) ((x)->uregs[2]) 432 | #define PT_REGS_PARM4(x) ((x)->uregs[3]) 433 | #define PT_REGS_PARM5(x) ((x)->uregs[4]) 434 | #define PT_REGS_RET(x) ((x)->uregs[14]) 435 | #define PT_REGS_FP(x) ((x)->uregs[11]) /* Works only with CONFIG_FRAME_POINTER */ 436 | #define PT_REGS_RC(x) ((x)->uregs[0]) 437 | #define PT_REGS_SP(x) ((x)->uregs[13]) 438 | #define PT_REGS_IP(x) ((x)->uregs[12]) 439 | 440 | #elif defined(bpf_target_arm64) 441 | 442 | /* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ 443 | struct pt_regs; 444 | #define PT_REGS_ARM64 const volatile struct user_pt_regs 445 | #define PT_REGS_PARM1(x) (((PT_REGS_ARM64 *)(x))->regs[0]) 446 | #define PT_REGS_PARM2(x) (((PT_REGS_ARM64 *)(x))->regs[1]) 447 | #define PT_REGS_PARM3(x) (((PT_REGS_ARM64 *)(x))->regs[2]) 448 | #define PT_REGS_PARM4(x) (((PT_REGS_ARM64 *)(x))->regs[3]) 449 | #define PT_REGS_PARM5(x) (((PT_REGS_ARM64 *)(x))->regs[4]) 450 | #define PT_REGS_RET(x) (((PT_REGS_ARM64 *)(x))->regs[30]) 451 | /* Works only with CONFIG_FRAME_POINTER */ 452 | #define PT_REGS_FP(x) (((PT_REGS_ARM64 *)(x))->regs[29]) 453 | #define PT_REGS_RC(x) (((PT_REGS_ARM64 *)(x))->regs[0]) 454 | #define PT_REGS_SP(x) (((PT_REGS_ARM64 *)(x))->sp) 455 | #define PT_REGS_IP(x) (((PT_REGS_ARM64 *)(x))->pc) 456 | 457 | #elif defined(bpf_target_mips) 458 | 459 | #define PT_REGS_PARM1(x) ((x)->regs[4]) 460 | #define 
PT_REGS_PARM2(x) ((x)->regs[5]) 461 | #define PT_REGS_PARM3(x) ((x)->regs[6]) 462 | #define PT_REGS_PARM4(x) ((x)->regs[7]) 463 | #define PT_REGS_PARM5(x) ((x)->regs[8]) 464 | #define PT_REGS_RET(x) ((x)->regs[31]) 465 | #define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ 466 | #define PT_REGS_RC(x) ((x)->regs[2]) /* return value lives in $v0 (GPR 2); GPR 1 is the assembler temporary $at */ 467 | #define PT_REGS_SP(x) ((x)->regs[29]) 468 | #define PT_REGS_IP(x) ((x)->cp0_epc) 469 | 470 | #elif defined(bpf_target_powerpc) 471 | 472 | #define PT_REGS_PARM1(x) ((x)->gpr[3]) 473 | #define PT_REGS_PARM2(x) ((x)->gpr[4]) 474 | #define PT_REGS_PARM3(x) ((x)->gpr[5]) 475 | #define PT_REGS_PARM4(x) ((x)->gpr[6]) 476 | #define PT_REGS_PARM5(x) ((x)->gpr[7]) 477 | #define PT_REGS_RC(x) ((x)->gpr[3]) 478 | #define PT_REGS_SP(x) ((x)->sp) 479 | #define PT_REGS_IP(x) ((x)->nip) 480 | 481 | #elif defined(bpf_target_sparc) 482 | 483 | #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) 484 | #define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) 485 | #define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) 486 | #define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) 487 | #define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) 488 | #define PT_REGS_RET(x) ((x)->u_regs[UREG_I7]) 489 | #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) 490 | #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) 491 | 492 | /* Should this also be a bpf_target check for the sparc case? 
*/ 493 | #if defined(__arch64__) 494 | #define PT_REGS_IP(x) ((x)->tpc) 495 | #else 496 | #define PT_REGS_IP(x) ((x)->pc) 497 | #endif 498 | 499 | #endif 500 | 501 | #if defined(bpf_target_powerpc) 502 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) 503 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 504 | #elif defined(bpf_target_sparc) 505 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) 506 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 507 | #else 508 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ 509 | bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) 510 | #define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ 511 | bpf_probe_read(&(ip), sizeof(ip), \ 512 | (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) 513 | #endif 514 | 515 | /* 516 | * BPF_CORE_READ abstracts away bpf_probe_read() call and captures offset 517 | * relocation for source address using __builtin_preserve_access_index() 518 | * built-in, provided by Clang. 519 | * 520 | * __builtin_preserve_access_index() takes as an argument an expression of 521 | * taking an address of a field within struct/union. It makes compiler emit 522 | * a relocation, which records BTF type ID describing root struct/union and an 523 | * accessor string which describes exact embedded field that was used to take 524 | * an address. See detailed description of this relocation format and 525 | * semantics in comments to struct bpf_offset_reloc in libbpf_internal.h. 526 | * 527 | * This relocation allows libbpf to adjust BPF instruction to use correct 528 | * actual field offset, based on target kernel BTF type that matches original 529 | * (local) BTF, used to record relocation. 
530 | */ 531 | #define BPF_CORE_READ(dst, src) \ 532 | bpf_probe_read((dst), sizeof(*(src)), \ 533 | __builtin_preserve_access_index(src)) 534 | 535 | #endif -------------------------------------------------------------------------------- /brc/brc.bpf.c: -------------------------------------------------------------------------------- 1 | #include "vmlinux.h" // 必须放在首位 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | //#include "../libbpf/include/uapi/linux/pkt_cls.h" 8 | 9 | #include "bpf_helpers.h" 10 | #include "brc_common.h" 11 | 12 | char LICENSE[] SEC("license") = "Dual BSD/GPL"; 13 | 14 | #define tcp_hdrlen(tcp) (tcp->doff * 4) 15 | // https://zhangbinalan.gitbooks.io/protocol/content/ipxie_yi_tou_bu.html 16 | #define ipv4_hdrlen(ip) (ip->ihl * 4) 17 | 18 | // https://mechpen.github.io/posts/2019-08-29-bpf-verifier/index.html 19 | #define ensure_header(skb, var_off, const_off, hdr) do{ \ 20 | u32 len = const_off + sizeof(*hdr); \ 21 | void *data = (void *)(long)skb->data + var_off; \ 22 | void *data_end = (void *)(long)skb->data_end; \ 23 | \ 24 | if (data + len > data_end) \ 25 | bpf_skb_pull_data(skb, var_off + len); \ 26 | \ 27 | data = (void *)(long)skb->data + var_off; \ 28 | data_end = (void *)(long)skb->data_end; \ 29 | if (data + len > data_end) \ 30 | return 0; \ 31 | \ 32 | hdr = (void *)(data + const_off); \ 33 | } while(0) 34 | 35 | /* 36 | * @brief: 用于尾调用 37 | * @notes: 尾调用上限目前为33 38 | **/ 39 | struct { 40 | __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 41 | __uint(max_entries, RECURSION_UPPER_LIMIT); 42 | __type(key, u32); 43 | __type(value, u32); 44 | } tc_progs SEC(".maps"); 45 | 46 | /* 47 | * @brief: 实际的hash表{hash->brc_cache_entry} 48 | **/ 49 | struct { 50 | __uint(type, BPF_MAP_TYPE_ARRAY); 51 | __type(key, u32); 52 | __type(value, struct brc_cache_entry); 53 | __uint(max_entries, BRC_CACHE_ENTRY_COUNT); 54 | } map_cache SEC(".maps"); 55 | 56 | /* 57 | * @brief: 用于在ingress中把get的key保存,在egress中 58 | **/ 59 | struct { 60 | __uint(type, 
BPF_MAP_TYPE_PERCPU_ARRAY); 61 | __type(key, unsigned int); 62 | __type(value, struct redis_key); 63 | __uint(max_entries, BRC_MAX_KEY_IN_PACKET); 64 | } map_keys SEC(".maps"); 65 | 66 | /* 67 | * @brief: 用于和用户态之间传递数据 68 | **/ 69 | struct { 70 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 71 | __type(key, unsigned int); 72 | __type(value, struct brc_stats); 73 | __uint(max_entries, MAP_STATS_MAX); 74 | } map_stats SEC(".maps"); 75 | 76 | struct parsing_context; 77 | struct { 78 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 79 | __type(key, unsigned int); 80 | __type(value, struct parsing_context); 81 | __uint(max_entries, PARSING_MAX); 82 | } map_parsing_context SEC(".maps"); 83 | 84 | struct brc_cache_key; 85 | struct { 86 | __uint(type, BPF_MAP_TYPE_QUEUE); 87 | //__type(key, 0); queue这里设置这个在load的时候会报错 88 | __type(value, struct brc_cache_key); 89 | __uint(max_entries, BRC_CACHE_QUEUE_SIZE); 90 | } map_invaild_key SEC(".maps"); 91 | 92 | struct redis_key { 93 | u32 hash; 94 | char key_data[BRC_MAX_KEY_LENGTH + 1]; 95 | unsigned int len; 96 | }; 97 | 98 | // 因为redis协议的返回值无法自解释,但是我们又希望维护内核态和用户态的一致性,所以设置一个BPF_MAP_TYPE_QUEUE 99 | struct brc_cache_key { 100 | struct bpf_spin_lock lock; 101 | unsigned int len; 102 | char key[BRC_MAX_KEY_LENGTH + 1]; // 为了在hash相同的时候判断是否是同一个key 103 | }; 104 | 105 | struct parsing_context { 106 | unsigned int value_size; // 在brc_rx_filter也可以代表key的大小 107 | unsigned short read_pkt_offset; 108 | int hash; // 存储解析过程中的hash,方便后续直接拿到对应的value值 109 | }; 110 | 111 | struct brc_cache_entry { 112 | struct bpf_spin_lock lock; 113 | unsigned int key_len; 114 | unsigned int data_len; 115 | char valid; 116 | int hash; 117 | char key[BRC_MAX_KEY_LENGTH]; // 为了在hash相同的时候判断是否是同一个key 118 | char data[BRC_MAX_CACHE_DATA_SIZE]; 119 | }; 120 | 121 | SEC("tc/brc_rx_filter") 122 | int brc_rx_filter_main(struct __sk_buff *skb) { 123 | void *data_end = (void *)(long)skb->data_end; 124 | void *data = (void *)(long)skb->data; 125 | struct ethhdr *eth = data; 126 | struct 
iphdr *ip = data + sizeof(*eth); 127 | void *transp = data + sizeof(*eth) + sizeof(*ip); 128 | // 这里的解析应该是不规范的,参考ensure_header上面的链接 129 | struct tcphdr *tcp; 130 | char *payload; 131 | __be16 dport; 132 | 133 | if (ip + 1 > data_end) 134 | return 0; 135 | 136 | switch (ip->protocol) { 137 | case IPPROTO_UDP: 138 | return 0; 139 | case IPPROTO_TCP: 140 | tcp = (struct tcphdr *) transp; 141 | if (tcp + 1 > data_end) 142 | return 0; 143 | dport = tcp->dest; 144 | payload = transp + sizeof(*tcp); 145 | break; 146 | default: 147 | return 0; 148 | } 149 | 150 | // 经过上面的循环拿到目的端口和payload相关,payload是真实数据包的其实地址 151 | if (dport == bpf_htons(6379) && payload+13 <= data_end) { 152 | bpf_printk("recive request from port 6379 %s\n", payload); 153 | // 目前只支持get 154 | // "*2\r\n$3\r\nget\r\n$4\r\nkey1\r\n" 155 | // 前八个字节亘古不变 156 | if (payload[8] == 'g' && payload[9] == 'e' && 157 | payload[10] == 't' && payload[11] == '\r' && payload[12] == '\n') { // is this a GET request 158 | bpf_printk("this is a get request\n"); 159 | unsigned int map_stats_index = MAP_STATS; 160 | unsigned int parsing_egress = PARSING_INGRESS; 161 | // 如果一个目标端口的TCP包来了,而且是get请求,就会更新map_stats表中的数据 162 | struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, &map_stats_index); 163 | if (!stats) { 164 | bpf_printk("stats invaild\n"); 165 | return 0; 166 | } 167 | stats->get_recv_count++; 168 | 169 | // 解析上下文 170 | struct parsing_context *pctx = bpf_map_lookup_elem(&map_parsing_context, &parsing_egress); 171 | if (!pctx) { 172 | bpf_printk("pctx invaild\n"); 173 | return 0; 174 | } 175 | // 14这个下标上应该是'$' 176 | pctx->read_pkt_offset = 13; 177 | pctx->value_size = 0; 178 | 179 | // "*2\r\n$3\r\nget\r\n$13\r\nusername:1234\r\n" 180 | // 这里不加 pctx->read_pkt_offset < BRC_MAX_PACKET_LENGTH 就会载入失败 181 | // ebpf如何处理无限循环? 
182 | if (pctx->read_pkt_offset < BRC_MAX_PACKET_LENGTH && payload+pctx->read_pkt_offset+1 <= data_end && payload[pctx->read_pkt_offset] == '$') { 183 | pctx->read_pkt_offset++; // 现在pctx->read_pkt_offset是数字的第一个字符的下标 184 | while (pctx->read_pkt_offset < BRC_MAX_PACKET_LENGTH && payload+pctx->read_pkt_offset+1 <= data_end && payload[pctx->read_pkt_offset] != '\r' && 185 | payload[pctx->read_pkt_offset] >= '0' && payload[pctx->read_pkt_offset] <= '9') { 186 | pctx->value_size *= 10; 187 | pctx->value_size += payload[pctx->read_pkt_offset] - '0'; 188 | pctx->read_pkt_offset++; 189 | } 190 | } else { 191 | bpf_printk("common check\n"); 192 | return 0; 193 | } 194 | 195 | if (payload+pctx->read_pkt_offset+1 > data_end || pctx->value_size > BRC_MAX_KEY_LENGTH) { 196 | stats->big_key_pass_to_user++; 197 | bpf_printk("out of bounds\n"); 198 | return 0; 199 | } 200 | bpf_printk("value size is %d\n", pctx->value_size); 201 | // 目前value_size是key的大小,read_pkt_offset是key的第一个字节 202 | bpf_tail_call(skb, &tc_progs, BRC_PROG_TC_HASH_KEYS); 203 | } else { 204 | // *3\r\n$3\r\nset\r\n$4\r\nkey1\r\n$6\r\nvalue1\r\n 205 | bpf_printk("this is a set request\n"); 206 | // 非get请求就会来这里,set会把标记设置为invaild 207 | bpf_tail_call(skb, &tc_progs, BRC_PROG_TC_INVALIDATE_CACHE); 208 | } 209 | } 210 | 211 | return 0; 212 | } 213 | 214 | // 这里主要做的事情是通过get中的key计算hash值 215 | // cache中是vaild就返回,如果是invaild就放入全局cache,等到get返回的时候获取key的值 216 | // 在egress可能接收到set的返回值和get的返回值,前者我们忽略,那后者一定都是invaild以后去用户态拿数据的请求了 217 | // 因为redis的单线程模型,所以这里使用一个队列来解决redis协议无法自解释的问题看起来是OK的 218 | SEC("tc/brc_hash_keys") 219 | int brc_hash_keys_main(struct __sk_buff *skb) { 220 | void *data_end = (void *)(long)skb->data_end; 221 | void *data = (void *)(long)skb->data; 222 | struct ethhdr *eth = data; 223 | struct iphdr *ip = data + sizeof(*eth); 224 | void *transp = data + sizeof(*eth) + sizeof(*ip); 225 | // 这里的解析应该是不规范的,参考ensure_header上面的链接 226 | struct tcphdr *tcp; 227 | char *payload; 228 | bpf_printk("this is 
brc_hash_keys_main\n"); 229 | 230 | // 这里必须要先验证ip + 1 > data_end,才能执行后面 231 | if (ip + 1 > data_end || ip->protocol != IPPROTO_TCP) 232 | return 0; 233 | 234 | tcp = (struct tcphdr *) transp; 235 | if (tcp + 1 > data_end) 236 | return 0; 237 | payload = transp + sizeof(*tcp); 238 | int off; 239 | 240 | unsigned int map_stats_index = MAP_STATS; 241 | unsigned int parsing_egress = PARSING_INGRESS; 242 | 243 | struct parsing_context *pctx = bpf_map_lookup_elem(&map_parsing_context, &parsing_egress); 244 | if (!pctx) { 245 | return 0; 246 | } 247 | 248 | // *3\r\n$3\r\nset\r\n$4\r\nkey1\r\n$6\r\nvalue1\r\n 249 | bpf_printk("pctx->read_pkt_offset [%d], pctx->value_size [%d]\n", pctx->read_pkt_offset, pctx->value_size); 250 | // pctx->read_pkt_offset > BRC_MAX_PACKET_LENGTH 非常重要,没这个load不了 251 | if (pctx->value_size > BRC_MAX_KEY_LENGTH || pctx->read_pkt_offset > BRC_MAX_PACKET_LENGTH) { 252 | return 1; 253 | } 254 | u32 hash = FNV_OFFSET_BASIS_32; 255 | // 目前payload的第一个字节就是key实际值的第一个字节 256 | // value_size是key的大小 257 | // 循环中一定要显式的限定为有限循环,且需要给payload判断是否有效 258 | if (payload + pctx->read_pkt_offset <= data_end) { 259 | payload = payload + pctx->read_pkt_offset; 260 | } 261 | 262 | // 这个printf一加就load失败 263 | //bpf_printk("this is brc_hash_keys, payload is %s\n", payload); 264 | // "*2\r\n$3\r\nget\r\n$4\r\nkey1\r\n" 265 | if (payload + 2 <= data_end && payload[0] == '\r' && payload[1] == '\n') { 266 | for (off = 2; payload+off+1 <= data_end && off < pctx->value_size+2; ++off) { 267 | hash ^= payload[off]; 268 | hash *= FNV_PRIME_32; 269 | } 270 | } else { 271 | bpf_printk("payload not begin 'CRLF'\n"); 272 | return 0; 273 | } 274 | 275 | u32 cache_idx = hash % BRC_CACHE_ENTRY_COUNT; 276 | bpf_printk("brc_hash_keys: hash is [%d], cache_idx is[%d] \n", hash, cache_idx); 277 | 278 | struct brc_cache_entry *entry = bpf_map_lookup_elem(&map_cache, &cache_idx); 279 | if (!entry) { 280 | return 0; 281 | } 282 | 283 | struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, 
&map_stats_index); 284 | if (!stats) { 285 | return 0; 286 | } 287 | 288 | // 到了这里证明是个get操作,如果发现是invaild的,就把数据放入queue;如果是vaild的话就直接继续执行尾调用 289 | bpf_spin_lock(&entry->lock); 290 | // hash相同且字符串也一样证明找对了;vaild准备返回相关的事务;invaild pass 到用户态处理 291 | if (entry->valid) { 292 | // 这个标识代表在hash相同是判断key是否相同 293 | bool diff = true; 294 | if (pctx->value_size != entry->key_len) { 295 | diff = false; 296 | } 297 | if (diff) { 298 | for (off = 2; off < BRC_MAX_KEY_LENGTH && payload+off+1 <= data_end && off < pctx->value_size; ++off) { 299 | if (payload[off] != entry->key[off - 2]) { 300 | diff = false; 301 | break; 302 | } 303 | } 304 | } 305 | u32 tmp_hash = entry->hash; 306 | int vaild = entry->valid; 307 | // 为了在brc_prepare_packet中拿到对应的data 308 | bpf_spin_unlock(&entry->lock); 309 | // spin_lock的范围内不允许使用bpf_printk 310 | bpf_printk("this entry idx[%d] is vaild\n", cache_idx); 311 | // hash值相同且逐字节比较相同,此时开始准备返回数据包 312 | bpf_printk("tmp_hash[%d] hash[%d] vaild[%d]\n", tmp_hash, hash, vaild); 313 | // 这里做了比较后面的尾调用就不需要了 314 | if (tmp_hash == hash && diff && vaild) { 315 | stats->hit_count++; 316 | pctx->hash = tmp_hash; 317 | return 0; 318 | //bpf_tail_call(skb, &tc_progs, BRC_PROG_TC_PREPARE_PACKET); 319 | } 320 | // 能到这里证明对应entry有效,但是于get中的key不匹配 321 | } else { 322 | // entry是无效的,pass到用户态处理 323 | bpf_spin_unlock(&entry->lock); 324 | bpf_printk("this entry idx[%d] is invaild\n", cache_idx); 325 | // 这里是一个栈变量,限制了key的大小 326 | struct redis_key key_entry = { 327 | .hash = hash, 328 | .len = pctx->value_size 329 | }; 330 | 331 | for (off = 2; off < BRC_MAX_KEY_LENGTH && payload+off+1 <= data_end && off < pctx->value_size + 2; ++off) { 332 | key_entry.key_data[off - 2] = payload[off]; 333 | } 334 | //bpf_printk("key_entry.key_data is [%s]\n", key_entry.key_data); 335 | 336 | if (off >= BRC_MAX_KEY_LENGTH || payload+off+1 > data_end) { 337 | bpf_printk("error : out of bounds\n"); 338 | return 0; 339 | } 340 | // if (pctx->value_size >= BRC_MAX_KEY_LENGTH){ 341 | // return 1; 342 | // } 343 
| // 用于debug 344 | //key_entry.key_data[pctx->value_size] = '\n'; 345 | 346 | bpf_map_push_elem(&map_invaild_key, &key_entry, BPF_ANY); 347 | } 348 | 349 | stats->miss_count++; 350 | bpf_printk("get pass to user\n"); 351 | return 0; 352 | } 353 | 354 | SEC("tc/brc_prepare_packet") 355 | int brc_prepare_packet_main(struct __sk_buff *skb) { 356 | void *data_end = (void *)(long)skb->data_end; 357 | void *data = (void *)(long)skb->data; 358 | struct ethhdr *eth = data; 359 | struct iphdr *ip = data + sizeof(*eth); 360 | void *transp = data + sizeof(*eth) + sizeof(*ip); 361 | struct tcphdr *tcp; 362 | bpf_printk("this is brc_prepare_packet\n"); 363 | 364 | // 这里必须要先验证ip + 1 > data_end,才能执行后面 365 | if (ip + 1 > data_end || ip->protocol != IPPROTO_TCP) 366 | return 0; 367 | 368 | tcp = (struct tcphdr *) transp; 369 | if (tcp + 1 > data_end) 370 | return 0; 371 | 372 | unsigned char tmp_mac[ETH_ALEN]; 373 | __be32 tmp_ip; 374 | __be16 tmp_port; 375 | 376 | // tmp_mac存的是源mac地址 377 | memcpy(tmp_mac, eth->h_source, ETH_ALEN); 378 | // 源地址改成旧包的目的地址 379 | memcpy(eth->h_source, eth->h_dest, ETH_ALEN); 380 | // 目的地址改成以前的源地址 381 | memcpy(eth->h_dest, tmp_mac, ETH_ALEN); 382 | 383 | // 和上面同理,交换IP和port 384 | tmp_ip = ip->saddr; 385 | ip->saddr = ip->daddr; 386 | ip->daddr = tmp_ip; 387 | 388 | tmp_port = tcp->source; 389 | tcp->source = tcp->dest; 390 | tcp->dest = tmp_port; 391 | 392 | // 可以用来grow和shrink,虽然从官网来看这玩意是用来修改mac和L3 layer的 393 | if (bpf_skb_adjust_room(skb, ADJUST_HEAD_LEN, BPF_ADJ_ROOM_NET, BPF_F_ADJ_ROOM_FIXED_GSO)) 394 | return 0; 395 | 396 | bpf_printk("brc_prepare_packet : data_end - data [%d] %d\n", data_end - data, skb->len); 397 | bpf_printk("==========={%s}\n", skb->data); 398 | 399 | bpf_tail_call(skb, &tc_progs, BRC_PROG_TC_WRITE_REPLY); 400 | return 0; 401 | } 402 | 403 | SEC("tc/brc_write_reply") 404 | int brc_write_reply_main(struct __sk_buff *skb) { 405 | // void *data_end = (void *)(long)skb->data_end; 406 | // void *data = (void *)(long)skb->data; 407 | 
// struct ethhdr *eth = data; 408 | // struct iphdr *ip = data + sizeof(*eth); 409 | // void *transp = data + sizeof(*eth) + sizeof(*ip); 410 | // struct tcphdr *tcp; 411 | // bpf_printk("this is brc_prepare_packet\n"); 412 | 413 | // // 这里必须要先验证ip + 1 > data_end,才能执行后面 414 | // if (ip + 1 > data_end || ip->protocol != IPPROTO_TCP) 415 | // return 0; 416 | 417 | // tcp = (struct tcphdr *) transp; 418 | // if (tcp + 1 > data_end) 419 | // return 0; 420 | // char* payload = transp + sizeof(*tcp); 421 | // int off = 0; 422 | 423 | // // 检查一下上一步再grow以后包中的地址设置正确 424 | // bpf_printk("port: %d %d\n", bpf_ntohs(tcp->source), bpf_ntohs(tcp->dest)); 425 | // bpf_printk("addr: %s %s\n", bpf_ntohs(ip->saddr), bpf_ntohs(ip->daddr)); 426 | 427 | // unsigned int parsing_egress = PARSING_INGRESS; 428 | // struct parsing_context *pctx = bpf_map_lookup_elem(&map_parsing_context, &parsing_egress); 429 | // if (!pctx) { 430 | // return 0; 431 | // } 432 | 433 | // unsigned int map_stats_index = MAP_STATS; 434 | // struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, &map_stats_index); 435 | // if (!stats) { 436 | // return 0; 437 | // } 438 | 439 | // int hash = pctx->hash; 440 | // u32 cache_idx = hash % BRC_CACHE_ENTRY_COUNT; 441 | // bpf_printk("brc_hash_keys: hash is [%d], cache_idx is[%d] \n", hash, cache_idx); 442 | 443 | // struct brc_cache_entry *entry = bpf_map_lookup_elem(&map_cache, &cache_idx); 444 | // if (!entry) { 445 | // return 0; 446 | // } 447 | // unsigned int written = 0; 448 | 449 | // if (payload+off+1 <= data_end) { 450 | // payload[0] = '$' 451 | // } 452 | 453 | // bpf_spin_lock(&entry->lock); 454 | // // 前面已经检查过了,这里看起来不需要做检查了,但是先检查下其实也没什么大问题 455 | // if (entry->valid && hash == entry->hash) { 456 | 457 | 458 | // for (off = 0; off < BRC_MAX_CACHE_DATA_SIZE && off < entry->data_len && payload+off+1 <= data_end; off++) { 459 | // payload[off] = entry->data[off]; 460 | // written += 1; 461 | // } 462 | 463 | // } 464 | // bpf_spin_unlock(&entry->lock); 465 
| 466 | // if (payload+written+2 <= data_end) { 467 | // payload[written++] = 'E'; 468 | // payload[written++] = 'N'; 469 | // payload[written++] = 'D'; 470 | // payload[written++] = '\r'; 471 | // payload[written++] = '\n'; 472 | 473 | // // 这是尾调用,不会回到调用方,所以维护一个write_pkt_offset 474 | // if (bpf_xdp_adjust_head(ctx, 0 - (int) (sizeof(struct ethhdr) + sizeof(struct iphdr) + sizeof(struct udphdr) 475 | // + sizeof(struct memcached_udp_header) + pctx->write_pkt_offset))) { // pop headers + previously written data 476 | // return XDP_DROP; 477 | // } 478 | 479 | // void *data_end = (void *)(long)ctx->data_end; 480 | // void *data = (void *)(long)ctx->data; 481 | // struct iphdr *ip = data + sizeof(struct ethhdr); 482 | // struct udphdr *udp = data + sizeof(struct ethhdr) + sizeof(*ip); 483 | // payload = data + sizeof(struct ethhdr) + sizeof(*ip) + sizeof(*udp) + sizeof(struct memcached_udp_header); 484 | 485 | // if (udp + 1 > data_end) 486 | // return XDP_PASS; 487 | 488 | // ip->tot_len = htons((payload+pctx->write_pkt_offset+written) - (char*)ip); 489 | // ip->check = compute_ip_checksum(ip); 490 | // // 为啥不需要 491 | // udp->check = 0; // computing udp checksum is not required 492 | // udp->len = htons((payload+pctx->write_pkt_offset+written) - (char*)udp); 493 | 494 | // bpf_xdp_adjust_tail(ctx, 0 - (int) ((long) data_end - (long) (payload+pctx->write_pkt_offset+written))); // try to strip additional bytes 495 | 496 | // return XDP_TX; 497 | // } 498 | 499 | // 以这种方式扩展数据包看起来是不对的,因为我们希望的是去扩展payload,但是现在实际上扩展的是L3 layer,先试试看能不能跑通 500 | // if (bpf_skb_adjust_room(skb, ADJUST_HEAD_LEN, BPF_ADJ_ROOM_NET, BPF_F_ADJ_ROOM_FIXED_GSO)) 501 | // return 0; 502 | // return XDP_PASS; 503 | return 0; 504 | } 505 | 506 | SEC("tc/brc_maintain_tcp") 507 | int brc_maintain_tcp_main(struct __sk_buff *skb) { 508 | // static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; 509 | // 
libbpf/src/bpf_helper_defs.h xdp/tc 支持 bpf_skc_lookup_tcp 510 | // https://elixir.bootlin.com/linux/v5.17/source/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c#L94 调用bpf_skc_lookup_tcp的例子 511 | // vmlinux/x86/vmlinux.h cls支持 bpf_skc_lookup_tcp 512 | 513 | // static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; 514 | // libbpf/src/bpf_helper_defs.h 支持 bpf_tcp_sock,这里返回的结构体才是需要修改的,但是只支持tc,所以看起来这里也需要使用tc 515 | 516 | // https://elixir.bootlin.com/linux/v5.10.13/source/tools/testing/selftests/bpf/bpf_tcp_helpers.h#L53 bpf_sock定义 517 | 518 | // btf_bpf_tcp_sock 519 | // ============================================= 520 | return XDP_PASS; 521 | } 522 | 523 | // 只做一件事,就是在get的时候让这个hash index上的entry invaild 524 | SEC("tc/brc_invalidate_cache") 525 | int brc_invalidate_cache_main(struct __sk_buff *skb) { 526 | void *data_end = (void *)(long)skb->data_end; 527 | void *data = (void *)(long)skb->data; 528 | struct ethhdr *eth = data; 529 | struct iphdr *ip = data + sizeof(*eth); 530 | void *transp = data + sizeof(*eth) + sizeof(*ip); 531 | // 这里的解析应该是不规范的,参考ensure_header上面的链接 532 | struct tcphdr *tcp; 533 | char *payload; 534 | int key_size = 0; 535 | 536 | // 这里必须要先验证ip + 1 > data_end,才能执行后面 537 | if (ip + 1 > data_end || ip->protocol != IPPROTO_TCP) 538 | return 0; 539 | 540 | tcp = (struct tcphdr *) transp; 541 | if (tcp + 1 > data_end) 542 | return 0; 543 | payload = transp + sizeof(*tcp); 544 | 545 | unsigned int map_stats_index = MAP_STATS; 546 | struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, &map_stats_index); 547 | if (!stats) { 548 | return 0; 549 | } 550 | 551 | u32 hash; 552 | int set_found = 0, interval = 0, key_found = 0, key_index = 0; 553 | // 下面是一个状态机 554 | // *3\r\n$3\r\nset\r\n |(13) $4\r\nkey1\r\n$6\r\nvalue1\r\n 555 | bpf_printk("come on! 
payload is %s\n", payload); 556 | for (unsigned int off = 8; off < BRC_MAX_PACKET_LENGTH && payload+off+1 <= data_end;) { 557 | if (set_found == 0 && payload+off+5 <= data_end && 558 | payload[off] == 's' && payload[off+1] == 'e' && payload[off+2] == 't') { 559 | set_found = 1; 560 | //bpf_printk("find set!!!\n"); 561 | // 把off移动搭配key的长度字段的第一个字符,除了set还跳过了’\r\n‘ 562 | off += 5; 563 | stats->set_recv_count++; 564 | } 565 | else if (interval == 0 && set_found == 1 && payload+off+1 <= data_end && payload[off] == '$') { 566 | off++; 567 | while (off < BRC_MAX_CACHE_DATA_SIZE && payload+off+1 <= data_end && 568 | payload[off] != '\r' && payload[off] >= '0' && payload[off] <= '9') { 569 | key_size *= 10; 570 | key_size += payload[off] - '0'; 571 | off++; 572 | } 573 | if (payload+off+2 <= data_end && payload[off] == '\r' && payload[off+1] == '\n') { 574 | off += 2; 575 | } else { 576 | bpf_printk("may be redis error2.\n"); 577 | return 0; 578 | } 579 | //bpf_printk("find interval!!!\n"); 580 | interval = 1; 581 | } // 在key解析开始时做一些前置准备 582 | else if (key_found == 0 && interval == 1 && payload+off+1 <= data_end && payload[off] != '\r') { 583 | bpf_printk("find key start!!!\n"); 584 | hash = FNV_OFFSET_BASIS_32; 585 | hash ^= payload[off]; 586 | hash *= FNV_PRIME_32; 587 | key_found = 1; 588 | off += 1; 589 | } // 目前解析也不用长度,直接用'\n'判断 590 | else if (key_found == 1 && payload+off+1 <= data_end ) { 591 | if (payload[off] == '\r') { // 找到key的末尾了 592 | u32 cache_idx = hash % BRC_CACHE_ENTRY_COUNT; 593 | bpf_printk("find key end!!! 
hash[%d] cache_idx[%d]\n", hash, cache_idx); 594 | struct brc_cache_entry *entry = bpf_map_lookup_elem(&map_cache, &cache_idx); 595 | if (!entry) { 596 | bpf_printk("entry invaild when key hash end\n"); 597 | return 0; 598 | } 599 | bpf_spin_lock(&entry->lock); 600 | if (entry->valid) { 601 | stats->invalidation_count++; 602 | } 603 | entry->valid = 0; 604 | bpf_spin_unlock(&entry->lock); 605 | break; 606 | } 607 | else { // still processing the key 608 | hash ^= payload[off]; 609 | hash *= FNV_PRIME_32; 610 | off += 1; 611 | } 612 | } else { 613 | // 这个条件必须得有,要么就load不进去 614 | break; 615 | } 616 | } 617 | // 为了看下interval解析到对不对,但是放在那个循环里就会load失败 618 | bpf_printk("key_size is [%d]\n", key_size); 619 | bpf_printk("brc_invalidate_cache finish!!!\n"); 620 | return 0; 621 | } 622 | 623 | SEC("tc/brc_tx_filter") 624 | int brc_tx_filter_main(struct __sk_buff *skb) { 625 | // 从源码来看第二个参数填len这样的用法其实是最正确的,因为pskb_may_pull中在执行实际expend时会使用第二个参数减去线性区大小 626 | bpf_skb_pull_data(skb, skb->len); 627 | void *data_end = (void *)(long)skb->data_end; 628 | void *data = (void *)(long)skb->data; 629 | struct ethhdr *eth = data; 630 | struct iphdr *ip = data + sizeof(*eth); 631 | void *transp = data + sizeof(*eth) + sizeof(*ip); 632 | // 这里的解析应该是不规范的,参考ensure_header上面的链接 633 | struct tcphdr *tcp; 634 | char *payload; 635 | __be16 sport; 636 | struct redis_key key_entry; 637 | unsigned int map_stats_index = MAP_STATS; 638 | unsigned int parsing_egress = PARSING_EGRESS; 639 | 640 | if (ip + 1 > data_end) 641 | return 0; 642 | 643 | switch (ip->protocol) { 644 | case IPPROTO_UDP: 645 | return 0; 646 | case IPPROTO_TCP: 647 | tcp = (struct tcphdr *) transp; 648 | if (tcp + 1 > data_end) 649 | return 0; 650 | sport = tcp->source; 651 | payload = transp + sizeof(*tcp); 652 | break; 653 | default: 654 | return 0; 655 | } 656 | 657 | if (sport == bpf_htons(6379)) { 658 | int tcp_len = tcp_hdrlen(tcp); 659 | int ip_len = ipv4_hdrlen(ip); 660 | bpf_printk("data_end - data -> [%d]\n", data_end - data); 
661 | bpf_printk("brc_tx_filter payload [%s] skb->len[%d]\n", payload, skb->len); 662 | bpf_printk("brc_tx_filter tcp->len[%d] ip->len[%d]\n", tcp_len, tcp_len); 663 | if (payload + 3 <= data_end) { 664 | bpf_printk("payload: [%c] [%c] [%c] \n", payload[0], payload[1], payload[2]); 665 | } else { 666 | bpf_printk("gg\n"); 667 | } 668 | } 669 | 670 | if (payload+2 <= data_end && payload[0] == '$' && payload[1] == '6') { 671 | bpf_printk("brc_tx_filter payload [%s] sport[%d]\n", payload, sport); 672 | } 673 | 674 | // 因为下面先用到了[0],所以需要检查此下标(payload + 1 <= data_end )是否是有效的,这是必要的步骤 675 | // 目前只处理批量回复,只监听6379,先支持set/get操作,后续再说 676 | // redis这部分的解析逻辑在 processBulkItem,我们需要的就是string2ll 677 | // 这个版本不能把尾调用和bpf to bpf结合使用就只能把解析也放在这个尾调用里面了 678 | // "$6\r\nvalue1\r\n" 679 | if (sport == bpf_htons(6379) && payload + 1 <= data_end && payload[0] == '$') { 680 | //bpf_printk("this is brc_tx_filter. payload is [%s]\n", payload); 681 | bpf_printk("this is brc_tx_filter.\n"); 682 | // step1:先解析出数字,然后向后推一个/r/n,然后再执行尾调用 683 | struct parsing_context *pctx = bpf_map_lookup_elem(&map_parsing_context, &parsing_egress); 684 | if (!pctx) { 685 | bpf_map_pop_elem(&map_invaild_key, &key_entry); 686 | return 0; 687 | } 688 | pctx->value_size = 0; 689 | pctx->read_pkt_offset = 1; // '$' 690 | 691 | // "$-1\r\n" 692 | // 一个get请求从客户端没读到自己希望的数据,那在全局cache中也需要一次delete操作 693 | if (payload+pctx->read_pkt_offset+1 <= data_end && payload[pctx->read_pkt_offset] == '-') { 694 | bpf_map_pop_elem(&map_invaild_key, &key_entry); 695 | return 0; 696 | } 697 | // 那剩下的就是invaild的get操作,且确实从用户态获取到值了,这就需要尝试更新内核cache了 698 | while (pctx->read_pkt_offset < BRC_MAX_CACHE_DATA_SIZE && payload+pctx->read_pkt_offset+1 <= data_end && payload[pctx->read_pkt_offset] != '\r' && 699 | payload[pctx->read_pkt_offset] >= '0' && payload[pctx->read_pkt_offset] <= '9') { 700 | pctx->value_size *= 10; 701 | pctx->value_size += payload[pctx->read_pkt_offset] - '0'; 702 | pctx->read_pkt_offset++; 703 | } 704 | 705 | if 
(payload+pctx->read_pkt_offset+1 > data_end || pctx->value_size > BRC_MAX_CACHE_DATA_SIZE) { 706 | bpf_map_pop_elem(&map_invaild_key, &key_entry); 707 | return 0; 708 | } 709 | 710 | if (payload+pctx->read_pkt_offset+2 <= data_end && 711 | payload[pctx->read_pkt_offset] == '\r' && payload[pctx->read_pkt_offset + 1] == '\n') { 712 | pctx->read_pkt_offset+=2; 713 | } 714 | 715 | bpf_printk("brc_tx_filter: pctx->value_size is [%d] pctx->read_pkt_offset is [%d]\n", pctx->value_size, pctx->read_pkt_offset); 716 | // 现在 pctx->read_pkt_offset 的位置就是数据的第一个字节,且value_size是数据的实际大小 717 | 718 | // step2:更新map_stats状态 719 | struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, &map_stats_index); 720 | if (!stats) { 721 | bpf_map_pop_elem(&map_invaild_key, &key_entry); 722 | return 0; 723 | } 724 | stats->try_update++; 725 | 726 | // step3: 尾调用,开始把数据写入hash表 727 | bpf_tail_call(skb, &tc_progs, BRC_PROG_TC_UPDATE_CACHE); 728 | } 729 | 730 | return 0; 731 | } 732 | 733 | SEC("tc/brc_update_cache") 734 | int brc_update_cache_main(struct __sk_buff *skb) { 735 | bpf_skb_pull_data(skb, skb->len); 736 | bpf_printk("this is brc_update_cache_main\n"); 737 | struct redis_key key_entry; 738 | void *data_end = (void *)(long)skb->data_end; 739 | void *data = (void *)(long)skb->data; 740 | struct ethhdr *eth = data; 741 | struct iphdr *ip = data + sizeof(*eth); 742 | void *transp = data + sizeof(*eth) + sizeof(*ip); 743 | // 这里的解析应该是不规范的,参考ensure_header上面的链接 744 | struct tcphdr *tcp; 745 | char *payload; 746 | 747 | // 这里必须要先验证ip + 1 > data_end,才能执行后面 748 | if (ip + 1 > data_end || ip->protocol != IPPROTO_TCP) 749 | return 0; 750 | 751 | tcp = (struct tcphdr *) transp; 752 | if (tcp + 1 > data_end) 753 | return 0; 754 | payload = transp + sizeof(*tcp); 755 | int off; 756 | 757 | unsigned int map_stats_index = MAP_STATS; 758 | unsigned int parsing_egress = PARSING_EGRESS; 759 | 760 | struct parsing_context *pctx = bpf_map_lookup_elem(&map_parsing_context, &parsing_egress); 761 | if (!pctx) { 
762 | return 0; 763 | } 764 | 765 | // pctx->read_pkt_offset > BRC_MAX_PACKET_LENGTH 非常重要,没这个load不了 766 | if (pctx->value_size > BRC_MAX_CACHE_DATA_SIZE || pctx->read_pkt_offset > BRC_MAX_PACKET_LENGTH) { 767 | return 1; 768 | } 769 | // 目前payload的第一个字节就是key实际值的第一个字节 770 | // value_size是key的大小 771 | // 循环中一定要显式的限定为有限循环,且需要给payload判断是否有效 772 | if (payload + pctx->read_pkt_offset <= data_end) { 773 | payload = payload + pctx->read_pkt_offset; 774 | } 775 | 776 | bpf_printk("redis get reply(read_offset=%d)\n", pctx->read_pkt_offset); 777 | 778 | // ========================================== 779 | 780 | // 获取此get返回值对应的key 781 | bpf_map_pop_elem(&map_invaild_key, &key_entry); 782 | // hash算法为FNV-1a 783 | // "$6\r\nfoobar\r\n" 784 | // step1:找到此key对应的hash_index 785 | // off < BRC_MAX_KEY_LENGTH 必须放在循环里,不能放在上面用key_entry.len和BRC_MAX_KEY_LENGTH做 786 | u32 hash = FNV_OFFSET_BASIS_32; 787 | for (unsigned int off = 0; off < BRC_MAX_KEY_LENGTH && off < key_entry.len ; off++) { 788 | hash ^= key_entry.key_data[off]; 789 | hash *= FNV_PRIME_32; 790 | } 791 | u32 cache_idx = hash % BRC_CACHE_ENTRY_COUNT; 792 | 793 | bpf_printk("hash is [%d], cache_idx is [%d]\n", hash, cache_idx); 794 | bpf_printk("key_entry.hash [%d] key_entry.len [%d]\n", key_entry.hash, key_entry.len); 795 | 796 | struct brc_cache_entry *entry = bpf_map_lookup_elem(&map_cache, &cache_idx); 797 | if (!entry) { 798 | return 0; 799 | } 800 | 801 | bpf_spin_lock(&entry->lock); 802 | // step2: 只要vaild是0,我们就会全量的替换 803 | if (!entry->valid) { 804 | entry->valid = 1; 805 | entry->hash = hash; 806 | 807 | entry->key_len = key_entry.len; 808 | for(int i = 0; i < BRC_MAX_KEY_LENGTH && i < key_entry.len; ++i) { 809 | entry->key[i] = key_entry.key_data[i]; 810 | } 811 | 812 | entry->data_len = pctx->value_size; 813 | for(int i = 0; payload+i+1 <= data_end && i < pctx->value_size && i < BRC_CACHE_ENTRY_COUNT; ++i) { 814 | entry->data[i] = payload[i]; 815 | } 816 | 817 | bpf_spin_unlock(&entry->lock); 818 | 
bpf_printk("brc_update_cache entry->key: %s, entry->key_len\n", entry->key, entry->key_len); 819 | bpf_printk("brc_update_cache entry->data: %s, entry->data_len\n", entry->data, entry->data_len); 820 | 821 | struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, &map_stats_index); 822 | if (!stats) { 823 | return 0; 824 | } 825 | stats->update_count++; 826 | } else { 827 | bpf_spin_unlock(&entry->lock); 828 | } 829 | 830 | return 0; 831 | } -------------------------------------------------------------------------------- /brc/brc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "brc_common.h" 13 | #include 14 | #include // bpf_map_update_elem 15 | 16 | #include "brc.skel.h" 17 | 18 | #define BPF_SYSFS_ROOT "/sys/fs/bpf" 19 | #define STATS_PATH "/tmp/brc_stats.txt" 20 | #define STATS_INTERVAL_PATH "/tmp/brc_stats_interval.txt" 21 | 22 | struct bpf_progs_desc { 23 | char name[256]; 24 | enum bpf_prog_type type; 25 | int map_prog_idx; 26 | struct bpf_program *prog; 27 | }; 28 | 29 | static struct bpf_progs_desc progs[] = { 30 | {"tc/brc_rx_filter", BPF_PROG_TYPE_SCHED_CLS, -1, NULL}, 31 | {"tc/brc_hash_keys", BPF_PROG_TYPE_SCHED_CLS, BRC_PROG_TC_HASH_KEYS, NULL}, 32 | {"tc/brc_prepare_packet", BPF_PROG_TYPE_SCHED_CLS, BRC_PROG_TC_PREPARE_PACKET, NULL}, 33 | {"tc/brc_write_reply", BPF_PROG_TYPE_SCHED_CLS, BRC_PROG_TC_WRITE_REPLY, NULL}, 34 | {"tc/brc_maintain_tcp", BPF_PROG_TYPE_SCHED_CLS, BRC_PORG_TC_MAINTAIN_TCP, NULL}, 35 | {"tc/brc_invalidate_cache", BPF_PROG_TYPE_SCHED_CLS, BRC_PROG_TC_INVALIDATE_CACHE, NULL}, 36 | {"tc/brc_tx_filter", BPF_PROG_TYPE_SCHED_CLS, -1, NULL}, 37 | {"tc/brc_update_cache", BPF_PROG_TYPE_SCHED_CLS, BRC_PROG_TC_UPDATE_CACHE, NULL}, 38 | }; 39 | 40 | static int cpu_nums = 0; 41 | 42 | 43 | static int libbpf_print_fn(enum libbpf_print_level level, const char 
*format, va_list args) { 44 | return vfprintf(stderr, format, args); 45 | } 46 | 47 | static volatile bool exiting = false; 48 | 49 | static void sig_handler(int sig) { 50 | exiting = true; 51 | } 52 | 53 | void construct_mount_path(char* pathame, char* prog_name) { 54 | int len = snprintf(pathame, PATH_MAX, "%s/%s", BPF_SYSFS_ROOT, prog_name); 55 | printf("mount path : %s\n", pathame); 56 | if (len < 0) { 57 | fprintf(stderr, "Error: Program name '%s' is invalid\n", prog_name); 58 | exit(1); 59 | } else if (len >= PATH_MAX) { 60 | fprintf(stderr, "Error: Path name '%s' is too long\n", prog_name); 61 | exit(1); 62 | } 63 | return; 64 | } 65 | 66 | int write_stats_to_file(char *filename, int map_fd) { 67 | // map类型是BPF_MAP_TYPE_PERCPU_ARRAY,所以取数据的array应该是定长的 68 | struct brc_stats stats[cpu_nums]; 69 | struct brc_stats aggregate_stats; 70 | __u32 key = 0; 71 | FILE *fp; 72 | 73 | memset(&aggregate_stats, 0, sizeof(struct brc_stats)); 74 | 75 | assert(bpf_map_lookup_elem(map_fd, &key, stats) == 0); 76 | for (int i = 0; i < cpu_nums; i++) { 77 | aggregate_stats.get_recv_count += stats[i].get_recv_count; 78 | aggregate_stats.set_recv_count += stats[i].set_recv_count; 79 | aggregate_stats.hit_misprediction += stats[i].hit_misprediction; 80 | aggregate_stats.hit_count += stats[i].hit_count; 81 | aggregate_stats.miss_count += stats[i].miss_count; 82 | aggregate_stats.try_update += stats[i].try_update; 83 | aggregate_stats.update_count += stats[i].update_count; 84 | aggregate_stats.invalidation_count += stats[i].invalidation_count; 85 | aggregate_stats.big_key_pass_to_user += stats[i].big_key_pass_to_user; 86 | } 87 | // 追加写入 88 | fp = fopen(STATS_PATH, "w+"); 89 | if (fp == NULL) { 90 | fprintf(stderr, "Error: failed to write stats to file '%s'\n", filename); 91 | return -1; 92 | } 93 | 94 | fprintf(fp, "STAT get_recv_count %u\n", aggregate_stats.get_recv_count); 95 | fprintf(fp, "STAT set_recv_count %u\n", aggregate_stats.set_recv_count); 96 | fprintf(fp, "STAT 
hit_misprediction %u\n", aggregate_stats.hit_misprediction); 97 | fprintf(fp, "STAT hit_count %u\n", aggregate_stats.hit_count); 98 | fprintf(fp, "STAT miss_count %u\n", aggregate_stats.miss_count); 99 | fprintf(fp, "STAT try_update %u\n", aggregate_stats.try_update); 100 | fprintf(fp, "STAT update_count %u\n", aggregate_stats.update_count); 101 | fprintf(fp, "STAT invalidation_count %u\n", aggregate_stats.invalidation_count); 102 | fprintf(fp, "STAT big_key_pass_to_user %u\n", aggregate_stats.big_key_pass_to_user); 103 | 104 | fclose(fp); 105 | return 0; 106 | } 107 | 108 | int write_stat_line(FILE *fp, int map_fd) { 109 | struct brc_stats stats[cpu_nums]; 110 | struct brc_stats aggregate_stats; 111 | __u32 key = 0; 112 | 113 | memset(&aggregate_stats, 0, sizeof(struct brc_stats)); 114 | 115 | assert(bpf_map_lookup_elem(map_fd, &key, stats) == 0); 116 | 117 | for (int i = 0; i < cpu_nums; i++) { 118 | aggregate_stats.get_recv_count += stats[i].get_recv_count; 119 | aggregate_stats.set_recv_count += stats[i].set_recv_count; 120 | aggregate_stats.hit_misprediction += stats[i].hit_misprediction; 121 | aggregate_stats.hit_count += stats[i].hit_count; 122 | aggregate_stats.miss_count += stats[i].miss_count; 123 | aggregate_stats.try_update += stats[i].try_update; 124 | aggregate_stats.update_count += stats[i].update_count; 125 | aggregate_stats.invalidation_count += stats[i].invalidation_count; 126 | aggregate_stats.big_key_pass_to_user += stats[i].big_key_pass_to_user; 127 | } 128 | 129 | fprintf(fp, "%lu,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", (unsigned long)time(NULL), aggregate_stats.get_recv_count, 130 | aggregate_stats.set_recv_count, aggregate_stats.hit_misprediction, 131 | aggregate_stats.hit_count, aggregate_stats.miss_count, aggregate_stats.try_update, 132 | aggregate_stats.update_count, aggregate_stats.invalidation_count, aggregate_stats.big_key_pass_to_user); 133 | 134 | return 0; 135 | } 136 | 137 | int main(int argc, char **argv) { 138 | struct rlimit mem_limit = 
{RLIM_INFINITY, RLIM_INFINITY}; 139 | struct brc_bpf *skel; 140 | int map_tc_progs_fd, prog_count, map_stats_fd, tc_main_fd; 141 | // 目前写死,后续可以再修改 142 | int interface_idx; 143 | int stats_poll_count = 5, stats_poll_interval = 5; 144 | int err; 145 | 146 | // 设置一些debug信息的回调 147 | libbpf_set_print(libbpf_print_fn); 148 | 149 | // libbpf不会默认调节锁定的内存 150 | if (setrlimit(RLIMIT_MEMLOCK, &mem_limit)) { 151 | perror("setrlimit failed"); 152 | return 1; 153 | } 154 | 155 | signal(SIGINT, sig_handler); 156 | signal(SIGTERM, sig_handler); 157 | 158 | // Load and verify BPF application 159 | skel = brc_bpf__open(); 160 | if (!skel) { 161 | fprintf(stderr, "Failed to open and load BPF skeleton\n"); 162 | return 1; 163 | } 164 | 165 | // Load and verify BPF programs 166 | err = brc_bpf__load(skel); 167 | if (err) { 168 | fprintf(stderr, "Failed to load and verify BPF skeleton\n"); 169 | goto cleanup; 170 | } 171 | 172 | //======================填充progs数组==================================== 173 | prog_count = sizeof(progs) / sizeof(progs[0]); 174 | for (int i = 0; i < prog_count; i++) { 175 | progs[i].prog = bpf_object__find_program_by_title(skel->obj, progs[i].name); 176 | if (!progs[i].prog) { 177 | fprintf(stderr, "Error: bpf_object__find_program_by_title failed\n"); 178 | return 1; 179 | } 180 | bpf_program__set_type(progs[i].prog, progs[i].type); 181 | } 182 | //=================================================================== 183 | 184 | //======================用于尾调用==================================== 185 | map_tc_progs_fd = bpf_object__find_map_fd_by_name(skel->obj, "tc_progs"); 186 | if (map_tc_progs_fd < 0) { 187 | fprintf(stderr, "Error: bpf_object__find_map_fd_by_name failed\n"); 188 | return 1; 189 | } 190 | //=================================================================== 191 | 192 | //======================用于填充prog_map============================== 193 | for (int i = 0; i < prog_count; i++) { 194 | int prog_fd = bpf_program__fd(progs[i].prog); 195 | if 
(prog_fd < 0) { 196 | fprintf(stderr, "Error: Couldn't get file descriptor for program %s\n", progs[i].name); 197 | return 1; 198 | } 199 | // -1指的是主程序 200 | if (progs[i].map_prog_idx != -1) { 201 | printf("tail call[%s]\n", progs[i].name); 202 | // 给 progs map 的 map_prog_idx 插入 prog_fd 203 | err = bpf_map_update_elem(map_tc_progs_fd, &progs[i].map_prog_idx, &prog_fd, 0); 204 | if (err) { 205 | fprintf(stderr, "Error: bpf_map_update_elem failed for prog array map\n"); 206 | return 1; 207 | } 208 | } else { 209 | // TC相关,bpf_tc_attach的例子太少了,现在也没时间看libbpf的代码,所以pin下,命令行手动挂载 210 | char pathname[PATH_MAX]; 211 | construct_mount_path(pathname, progs[i].name); 212 | printf("main prog[%s] mount path : %s\n", progs[i].name, pathname); 213 | retry: 214 | if (bpf_program__pin(progs[i].prog, pathname)) { 215 | fprintf(stderr, "Error: Failed to pin program '%s' to path %s\n", progs[i].name, pathname); 216 | if (errno == EEXIST) { 217 | fprintf(stdout, "BPF program '%s' already pinned, unpinning it to reload it\n", progs[i].name); 218 | if (bpf_program__unpin(progs[i].prog, pathname)) { 219 | fprintf(stderr, "Error: Fail to unpin program '%s' at %s\n", progs[i].name, pathname); 220 | return -1; 221 | } 222 | printf("Retry mount TC bpf to %s\n", pathname); 223 | goto retry; 224 | } 225 | return -1; 226 | } 227 | } 228 | } 229 | //=========================================================================== 230 | 231 | //============================brc_tx_filter载入================================ 232 | // https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/bpf/prog_tests/tc_bpf.c#L36 233 | // https://lwn.net/Articles/856041/ 234 | // tc_main_fd = bpf_object__find_map_fd_by_name(skel->obj, "tc/brc_rx_filter"); 235 | 236 | // struct bpf_tc_hook tc_main_hook = { 237 | // .attach_point = BPF_TC_EGRESS, 238 | // .ifindex = interface_idx, 239 | // .sz = sizeof(struct bpf_tc_hook)}; 240 | // struct bpf_tc_opts tc_main_opts = { 241 | // .sz = sizeof(struct bpf_tc_opts), 
242 | // .handle = 1, 243 | // .priority = 1, 244 | // .prog_fd = tc_main_fd}; 245 | 246 | // getchar(); // 用于GDB 247 | 248 | // if (bpf_tc_hook_create(&tc_main_hook) != 0) { 249 | // fprintf(stderr, "bpf_tc_hook_create invalid hook ifindex == %d\n", interface_idx); 250 | // goto cleanup; 251 | // } else { 252 | // printf("sucess for create hook\n"); 253 | // } 254 | 255 | // if (bpf_tc_attach(&tc_main_hook, &tc_main_opts) != 0) { 256 | // fprintf(stderr, "bpf_tc_attach invalid hook ifindex == %d\n", interface_idx); 257 | // goto cleanup; 258 | // } 259 | //=================================================================== 260 | 261 | //============================注册对应的信号处理函数================================ 262 | cpu_nums = libbpf_num_possible_cpus(); 263 | 264 | map_stats_fd = bpf_object__find_map_fd_by_name(skel->obj, "map_stats"); 265 | if (map_stats_fd < 0) { 266 | fprintf(stderr, "Error: bpf_object__find_map_fd_by_name failed\n"); 267 | return 1; 268 | } 269 | 270 | sigset_t signal_mask; 271 | sigemptyset(&signal_mask); 272 | sigaddset(&signal_mask, SIGINT); 273 | sigaddset(&signal_mask, SIGTERM); 274 | 275 | err = sigprocmask(SIG_BLOCK, &signal_mask, NULL); 276 | if (err != 0) { 277 | fprintf(stderr, "Error: Failed to set signal mask\n"); 278 | exit(EXIT_FAILURE); 279 | } 280 | 281 | int sig, cur_poll_count = 0, quit = 0; 282 | FILE *fp = NULL; 283 | 284 | if (stats_poll_count > 0 && stats_poll_interval > 0) { 285 | fp = fopen(STATS_INTERVAL_PATH, "w+"); 286 | if (fp == NULL) { 287 | fprintf(stderr, "Error: failed to open file '%s'\n", STATS_INTERVAL_PATH); 288 | return -1; 289 | } 290 | // 隔这么长时间触发一次SIGALRM信号 291 | //alarm(stats_poll_interval); 292 | } 293 | 294 | int ret = 0; 295 | while (!quit) { 296 | // 是否可能出现信号丢失的情况 297 | err = sigwait(&signal_mask, &sig); 298 | if (err != 0) { 299 | fprintf(stderr, "Error: Failed to wait for signal\n"); 300 | exit(EXIT_FAILURE); 301 | } 302 | 303 | switch (sig) { 304 | case SIGINT: 305 | case SIGTERM: 306 | // 按了 
ctrl+c 以后就把map_stats中的数据写到目标文件中 307 | ret = write_stats_to_file(STATS_PATH, map_stats_fd); 308 | //quit = 1; 309 | break; 310 | 311 | case SIGALRM: 312 | ret |= write_stat_line(fp, map_stats_fd); 313 | if (++cur_poll_count < stats_poll_count) { 314 | alarm(stats_poll_interval); 315 | } else { 316 | ret |= write_stats_to_file(STATS_PATH, map_stats_fd); 317 | if (fp != NULL) { 318 | fclose(fp); 319 | } 320 | //quit = 1; 321 | } 322 | break; 323 | default: 324 | fprintf(stderr, "Unknown signal\n"); 325 | break; 326 | } 327 | } 328 | 329 | cleanup: 330 | // Clean up 331 | // if (bpf_tc_hook_destroy(&tc_main_hook) == -EINVAL) { 332 | // fprintf(stderr, "bpf_tc_hook_destroy invalid hook ifindex == 0\n"); 333 | // return 1; 334 | // } 335 | 336 | // struct bpf_tc_opts opts = { 337 | // .handle = 1, 338 | // .priority = 1, 339 | // .sz = sizeof(struct bpf_tc_opts)}; 340 | 341 | // if (bpf_tc_detach(&tc_main_hook, &opts) == -EINVAL) { 342 | // fprintf(stderr, "bpf_tc_detach invalid hook ifindex == 0\n"); 343 | // return 1; 344 | // } 345 | brc_bpf__destroy(skel); 346 | 347 | return err < 0 ? 
-err : 0; 348 | } 349 | -------------------------------------------------------------------------------- /brc/brc_common.h: -------------------------------------------------------------------------------- 1 | #ifndef _BRC_COMMON_H 2 | #define _BRC_COMMON_H 3 | 4 | #define BRC_MAX_CACHE_DATA_SIZE (1024) 5 | #define BRC_CACHE_ENTRY_COUNT (1024) 6 | #define BRC_MAX_PACKET_LENGTH (1518) 7 | #define BRC_MAX_KEY_LENGTH (256) 8 | #define BRC_MAX_KEY_IN_PACKET (10) 9 | #define BRC_CACHE_QUEUE_SIZE (512) 10 | #define RECURSION_UPPER_LIMIT (33) 11 | // https://www.vbforums.com/showthread.php?879417-RESOLVED-restore-my-string-from-a-hash 12 | #define FNV_OFFSET_BASIS_32 (2166136261) 13 | #define FNV_PRIME_32 (16777619) 14 | #define ETH_ALEN (6) 15 | #define ADJUST_HEAD_LEN (128) 16 | 17 | enum { 18 | BRC_PROG_TC_UPDATE_CACHE = 0, 19 | BRC_PROG_TC_HASH_KEYS, 20 | BRC_PROG_TC_PREPARE_PACKET, 21 | BRC_PROG_TC_WRITE_REPLY, 22 | BRC_PORG_TC_MAINTAIN_TCP, 23 | BRC_PROG_TC_INVALIDATE_CACHE, 24 | BRC_PROG_TC_MAX, 25 | }; 26 | 27 | enum { 28 | PARSING_INGRESS = 0, 29 | PARSING_EGRESS, 30 | PARSING_MAX, 31 | }; 32 | 33 | enum { 34 | MAP_STATS = 0, 35 | MAP_STATS_MAX, 36 | }; 37 | 38 | // 用于与客户端交互,所以需要放到common中来 39 | struct brc_stats { 40 | unsigned int get_recv_count; // 接收的get数 41 | unsigned int set_recv_count; // 接收的set数 42 | unsigned int hit_misprediction; // 预期命中但未命中(由于哈希冲突或 invalidation/update 的竞争) 43 | unsigned int hit_count; // hit的次数 44 | unsigned int miss_count; // miss的次数 45 | unsigned int try_update; // 尝试更新缓存的次数 46 | unsigned int update_count; // cache被更新的次数 47 | unsigned int invalidation_count; // cache被设置为invaild的次数 48 | unsigned int big_key_pass_to_user; // key的大小超过BRC_MAX_KEY_LENGTH,pass到用户态处理 49 | }; 50 | 51 | #endif -------------------------------------------------------------------------------- /brc/mount.sh: -------------------------------------------------------------------------------- 1 | tc qdisc add dev eth0 clsact 2 | tc filter add dev eth0 ingress bpf 
object-pinned /sys/fs/bpf/tc/brc_rx_filter 3 | tc filter add dev eth0 egress bpf object-pinned /sys/fs/bpf/tc/brc_tx_filter 4 | cat /sys/kernel/debug/tracing/trace_pipe -------------------------------------------------------------------------------- /brc/parse_ip.c: -------------------------------------------------------------------------------- 1 | SEC("tc/brc_rx_filter") 2 | int brc_rx_filter_main(struct __sk_buff *skb) { 3 | // static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; 4 | // libbpf/src/bpf_helper_defs.h xdp/tc 支持 bpf_skc_lookup_tcp 5 | // https://elixir.bootlin.com/linux/v5.17/source/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c#L94 调用bpf_skc_lookup_tcp的例子 6 | // vmlinux/x86/vmlinux.h cls支持 bpf_skc_lookup_tcp 7 | 8 | // static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; 9 | // libbpf/src/bpf_helper_defs.h 支持 bpf_tcp_sock,这里返回的结构体才是需要修改的,但是只支持tc,所以看起来这里也需要使用tc 10 | 11 | // https://elixir.bootlin.com/linux/v5.10.13/source/tools/testing/selftests/bpf/bpf_tcp_helpers.h#L53 bpf_sock定义 12 | 13 | // btf_bpf_tcp_sock 14 | // ============================================= 15 | u32 hdrlen, var_off, const_off; 16 | __be16 dport = 0; 17 | struct iphdr *ip; 18 | // 这里暂且不考虑IPV6 19 | // https://stackoverflow.com/questions/53136145/how-to-solve-the-r0-invalid-mem-access-inv-error-when-loading-an-ebpf-file-o 20 | struct tcphdr *tcp; 21 | char *payload; 22 | void *data = (void *)(long)skb->data; 23 | void *data_end = (void *)(long)skb->data_end; 24 | 25 | var_off = 0; 26 | const_off = sizeof(struct ethhdr); 27 | ensure_header(skb, var_off, const_off, ip); 28 | 29 | switch (ip->protocol) { 30 | case IPPROTO_UDP: 31 | payload = data; 32 | return 0; 33 | case IPPROTO_TCP: 34 | hdrlen = ipv4_hdrlen(ip); 35 | if (hdrlen < sizeof(struct iphdr)) 36 | return 0; 37 | var_off += hdrlen; 38 | ensure_header(skb, var_off, const_off, tcp); 39 | 40 
| dport = tcp->dest; 41 | hdrlen = tcp_hdrlen(tcp); 42 | if (hdrlen < sizeof(struct tcphdr)) 43 | return 0; 44 | var_off += hdrlen; 45 | payload = data + const_off + var_off; 46 | break; 47 | default: 48 | return XDP_PASS; 49 | } 50 | // 经过上面的循环拿到目的端口和payload相关,payload是真实数据包的起始地址 51 | if (dport == bpf_htons(6379) && payload+14 <= data_end) { 52 | // 目前只支持get 53 | // "*2\r\n$3\r\nget\r\n$13\r\nusername:1234\r\n" 54 | // 前八个字节亘古不变 55 | if (ip->protocol == IPPROTO_TCP && payload[9] == 'g' && payload[10] == 'e' && payload[11] == 't' && payload[12] == '\r' && payload[13] == '\n') { // is this a GET request 56 | unsigned int map_stats_index = MAP_STATS; 57 | unsigned int parsing_egress = PARSING_INGRESS; 58 | // 如果一个目标端口的TCP包来了,而且是get请求,就会更新map_stats表中的数据 59 | struct brc_stats *stats = bpf_map_lookup_elem(&map_stats, &map_stats_index); 60 | if (!stats) { 61 | return 0; 62 | } 63 | stats->get_recv_count++; 64 | 65 | // 解析上下文 66 | struct parsing_context *pctx = bpf_map_lookup_elem(&map_parsing_context, &parsing_egress); 67 | if (!pctx) { 68 | return 0; 69 | } 70 | // 14这个下标上应该是'$' 71 | pctx->read_pkt_offset = 14; 72 | pctx->value_size = 0; 73 | 74 | // "*2\r\n$3\r\nget\r\n$13\r\nusername:1234\r\n" 75 | if (payload+pctx->read_pkt_offset < data_end && payload[pctx->read_pkt_offset] == '$') { 76 | pctx->read_pkt_offset++; // 现在pctx->read_pkt_offset是数字的第一个字符的下标 77 | while (payload+pctx->read_pkt_offset <= data_end && payload[pctx->read_pkt_offset] != '\r' && 78 | payload[pctx->read_pkt_offset] >= '0' && payload[pctx->read_pkt_offset] <= '9') { 79 | pctx->value_size *= 10; 80 | pctx->value_size += payload[pctx->read_pkt_offset] - '0'; 81 | pctx->read_pkt_offset++; 82 | } 83 | } else { 84 | return 0; 85 | } 86 | 87 | if (payload+pctx->read_pkt_offset > data_end || pctx->value_size > BRC_MAX_KEY_LENGTH) { 88 | stats->big_key_pass_to_user++; 89 | return 0; 90 | } 91 | // 目前value_size是key的大小,read_pkt_offset是key的第一个字节 92 | bpf_tail_call(skb, &tc_progs, BRC_PROG_XDP_HASH_KEYS); 93 | 
} else if (ip->protocol == IPPROTO_TCP) { 94 | // 非get请求就会来这里,set会把标记设置为invaild 95 | bpf_tail_call(skb, &tc_progs, BRC_PROG_XDP_INVALIDATE_CACHE); 96 | } 97 | } 98 | return 0; 99 | } -------------------------------------------------------------------------------- /brc/restart.sh: -------------------------------------------------------------------------------- 1 | tc filter del dev eth0 egress 2 | tc filter del dev eth0 ingress 3 | tc qdisc del dev eth0 clsact 4 | rm /sys/fs/bpf/tc/brc_rx_filter 5 | rm /sys/fs/bpf/tc/brc_tx_filter 6 | 7 | cd .. 8 | make 9 | cd .bin 10 | ./brc -------------------------------------------------------------------------------- /brc/trace.bpf.c: -------------------------------------------------------------------------------- 1 | #include "vmlinux.h" 2 | #include 3 | #include 4 | #include 5 | #include "bpf_helpers.h" 6 | 7 | char LICENSE[] SEC("license") = "Dual BSD/GPL"; 8 | 9 | struct { 10 | __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 11 | __uint(max_entries, 1024); 12 | __type(key, u32); 13 | __type(value, u32); 14 | } progs SEC(".maps"); 15 | 16 | 17 | SEC("kprobe/__seccomp_filter") 18 | int BPF_KPROBE(__seccomp_filter, int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) 19 | { 20 | // 这里注意ebpf程序栈空间只有512字节,太大这里会报错的,可以自己调大一点看看 21 | char comm_name[30]; 22 | bpf_get_current_comm(comm_name, sizeof(comm_name)); 23 | // 调用失败以后会直接 fall through 24 | bpf_tail_call(ctx, &progs, this_syscall); 25 | 26 | char fmt[] = "syscall=%d common=%s\n"; 27 | bpf_trace_printk(fmt, sizeof(fmt), this_syscall, comm_name); 28 | return 0; 29 | } 30 | 31 | /* we jump here when syscall number == __NR_write */ 32 | SEC("kprobe/SYS__NR_write") 33 | int bpf_func_SYS__NR_write(struct pt_regs *ctx) 34 | { 35 | struct seccomp_data sd; 36 | bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); 37 | if (sd.args[2] > 0) { 38 | char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; 39 | bpf_trace_printk(fmt, sizeof(fmt), 40 | sd.args[0], sd.args[1], 
sd.args[2]); 41 | } 42 | return 0; 43 | } 44 | 45 | SEC("kprobe/SYS__NR_read") 46 | int bpf_func_SYS__NR_read(struct pt_regs *ctx) 47 | { 48 | struct seccomp_data sd; 49 | bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); 50 | if (sd.args[2] > 0 && sd.args[2] <= 1024) { 51 | char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; 52 | bpf_trace_printk(fmt, sizeof(fmt), 53 | sd.args[0], sd.args[1], sd.args[2]); 54 | } 55 | return 0; 56 | } 57 | 58 | SEC("kprobe/SYS__NR_open") 59 | int bpf_func_SYS__NR_open(struct pt_regs *ctx) 60 | { 61 | struct seccomp_data sd; 62 | bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); 63 | char fmt[] = "open(fd=%d, path=%p)\n"; 64 | bpf_trace_printk(fmt, sizeof(fmt), sd.args[0], sd.args[1]); 65 | return 0; 66 | } -------------------------------------------------------------------------------- /brc/trace.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include "trace.skel.h" 14 | 15 | #define BPF_SYSFS_ROOT "/sys/fs/bpf" 16 | 17 | enum { 18 | SYS__NR_read = 3, 19 | SYS__NR_write = 4, 20 | SYS__NR_open = 5, 21 | }; 22 | 23 | struct bpf_progs_desc { 24 | char name[256]; 25 | enum bpf_prog_type type; 26 | int map_prog_idx; 27 | struct bpf_program *prog; 28 | }; 29 | 30 | static struct bpf_progs_desc progs[] = { 31 | {"kprobe/__seccomp_filter", BPF_PROG_TYPE_KPROBE, -1, NULL}, 32 | {"kprobe/SYS__NR_read", BPF_PROG_TYPE_KPROBE, SYS__NR_read, NULL}, 33 | {"kprobe/SYS__NR_write", BPF_PROG_TYPE_KPROBE, SYS__NR_write, NULL}, 34 | {"kprobe/SYS__NR_open", BPF_PROG_TYPE_KPROBE, SYS__NR_open, NULL}, 35 | }; 36 | 37 | static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 38 | { 39 | return vfprintf(stderr, format, args); 40 | } 41 | 42 | static volatile bool exiting = false; 43 | 44 | static void sig_handler(int 
sig) 45 | { 46 | exiting = true; 47 | } 48 | 49 | int main(int argc, char **argv) 50 | { 51 | struct trace_bpf *skel; 52 | int map_progs_fd, main_prog_fd, prog_count; 53 | int err; 54 | 55 | // 设置一些debug信息的回调 56 | libbpf_set_print(libbpf_print_fn); 57 | 58 | signal(SIGINT, sig_handler); 59 | signal(SIGTERM, sig_handler); 60 | 61 | // Load and verify BPF application 62 | skel = trace_bpf__open(); 63 | if (!skel) { 64 | fprintf(stderr, "Failed to open and load BPF skeleton\n"); 65 | return 1; 66 | } 67 | 68 | // Load and verify BPF programs 69 | err = trace_bpf__load(skel); 70 | if (err) { 71 | fprintf(stderr, "Failed to load and verify BPF skeleton\n"); 72 | goto cleanup; 73 | } 74 | 75 | map_progs_fd = bpf_object__find_map_fd_by_name(skel->obj, "progs"); 76 | prog_count = sizeof(progs) / sizeof(progs[0]); 77 | for (int i = 0; i < prog_count; i++) { 78 | progs[i].prog = bpf_object__find_program_by_title(skel->obj, progs[i].name); 79 | if (!progs[i].prog) { 80 | fprintf(stderr, "Error: bpf_object__find_program_by_title failed\n"); 81 | return 1; 82 | } 83 | bpf_program__set_type(progs[i].prog, progs[i].type); 84 | } 85 | 86 | for (int i = 0; i < prog_count; i++) { 87 | int prog_fd = bpf_program__fd(progs[i].prog); 88 | if (prog_fd < 0) { 89 | fprintf(stderr, "Error: Couldn't get file descriptor for program %s\n", progs[i].name); 90 | return 1; 91 | } 92 | 93 | // -1指的是主程序 94 | if (progs[i].map_prog_idx != -1) { 95 | unsigned int map_prog_idx = progs[i].map_prog_idx; 96 | if (map_prog_idx < 0) { 97 | fprintf(stderr, "Error: Cannot get prog fd for bpf program %s\n", progs[i].name); 98 | return 1; 99 | } 100 | // 给 progs map 的 map_prog_idx 插入 prog_fd 101 | err = bpf_map_update_elem(map_progs_fd, &map_prog_idx, &prog_fd, 0); 102 | if (err) { 103 | fprintf(stderr, "Error: bpf_map_update_elem failed for prog array map\n"); 104 | return 1; 105 | } 106 | } 107 | } 108 | 109 | // 只载入主程序,尾调用不载入,所以不可以调用trace_bpf__attach 110 | struct bpf_link* link = 
bpf_program__attach(skel->progs.__seccomp_filter); 111 | if (link == NULL) { 112 | fprintf(stderr, "Error: bpf_program__attach failed\n"); 113 | return 1; 114 | } 115 | 116 | while(exiting){ 117 | // 写个裸循环会吃巨多CPU的 118 | sleep(1); 119 | } 120 | 121 | cleanup: 122 | // Clean up 123 | trace_bpf__destroy(skel); 124 | 125 | return err < 0 ? -err : 0; 126 | } 127 | -------------------------------------------------------------------------------- /tools/bpftool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Super-long/ebpf-cache-for-redis/591e91fa9cde394cdc0477a1afafd17cc810fb5f/tools/bpftool -------------------------------------------------------------------------------- /vmlinux/arm64/vmlinux.h: -------------------------------------------------------------------------------- 1 | vmlinux_516.h -------------------------------------------------------------------------------- /vmlinux/vmlinux.h: -------------------------------------------------------------------------------- 1 | x86/vmlinux_508.h -------------------------------------------------------------------------------- /vmlinux/x86/vmlinux.h: -------------------------------------------------------------------------------- 1 | vmlinux_508.h --------------------------------------------------------------------------------