├── .gitignore ├── memcpy_kprobe_user.c ├── memcpy_kprobe_kern.c ├── memcpy_stat_kern.c ├── memcpy_stat_user.c ├── bpf_load.h ├── Makefile ├── perf-sys.h ├── bpf ├── bpf.h └── bpf.c ├── libbpf.h ├── bpf_helpers.h ├── README.md └── bpf_load.c /.gitignore: -------------------------------------------------------------------------------- 1 | tags 2 | *.cscope 3 | *.swp 4 | *.o 5 | *.ll 6 | memcpy_kprobe 7 | memcpy_stat 8 | -------------------------------------------------------------------------------- /memcpy_kprobe_user.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "libbpf.h" 6 | #include "bpf_load.h" 7 | 8 | int main(int argc, char **argv) 9 | { 10 | char filename[256]; 11 | 12 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 13 | 14 | if (load_bpf_file(filename)) { 15 | printf("%s", bpf_log_buf); 16 | return 1; 17 | } 18 | 19 | read_trace_pipe(); 20 | 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /memcpy_kprobe_kern.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bpf_helpers.h" 6 | 7 | SEC("kprobe/memcpy") 8 | 9 | int bpf_prog1(struct pt_regs *ctx) 10 | { 11 | unsigned long long size; 12 | char fmt[] = "memcpy size %d\n"; 13 | 14 | bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM3(ctx)); 15 | 16 | bpf_trace_printk(fmt, sizeof(fmt), size); 17 | 18 | return 0; 19 | } 20 | 21 | char _license[] SEC("license") = "GPL"; 22 | u32 _version SEC("version") = LINUX_VERSION_CODE; 23 | -------------------------------------------------------------------------------- /memcpy_stat_kern.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bpf_helpers.h" 6 | 7 | struct bpf_map_def SEC("maps") my_map = { 8 | .type = BPF_MAP_TYPE_HASH, 9 | .key_size = sizeof(size_t), 10 | .value_size = sizeof(u32), 11 | .max_entries = 17, 12 | }; 13 | 14 | SEC("kprobe/memcpy") 15 | 16 | int bpf_prog1(struct pt_regs *ctx) 17 | { 18 | size_t size; 19 | u32 *val, count_start = 0; 20 | 21 | bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM3(ctx)); 22 | 23 | if (size % 64) 24 | size += (64 - size % 64); 25 | 26 | if (size > 1024) 27 | size = 1024; 28 | 29 | val = bpf_map_lookup_elem(&my_map, &size); 30 | if (val && *val < UINT_MAX) 31 | *val = *val + 1; 32 | else 33 | bpf_map_update_elem(&my_map, &size, &count_start, BPF_NOEXIST); 34 | 35 | return 0; 36 | } 37 | 38 | char _license[] SEC("license") = "GPL"; 39 | u32 _version SEC("version") = LINUX_VERSION_CODE; 40 | -------------------------------------------------------------------------------- /memcpy_stat_user.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "libbpf.h" 7 | #include "bpf_load.h" 8 | 9 | int main(int argc, char **argv) 10 | { 11 | char filename[256]; 12 | size_t size; 13 | unsigned int size_cnt = 0; 14 | 15 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 16 | 17 | if (load_bpf_file(filename)) { 18 | printf("%s", bpf_log_buf); 19 | return 1; 20 | } 21 | 22 | while (1) { 23 | printf("\tSize\t\tCount\n"); 24 | for (size = 0; size <=1024; size = size + 64) { 25 | if(bpf_map_lookup_elem(map_fd[0], &size, &size_cnt)) 26 | size_cnt = 0; 27 | if (size == 1024) 
28 | printf("%4ld - %4ld*\t\t%d\n", size - 63, size, 29 | size_cnt); 30 | else if (size) 31 | printf("%4ld - %4ld\t\t%d\n", size - 63, size, 32 | size_cnt); 33 | else 34 | printf(" 0\t\t\t%d\n", size_cnt); 35 | } 36 | printf ("* Size > 1024 have been counted in this interval\n"); 37 | sleep(2); 38 | } 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /bpf_load.h: -------------------------------------------------------------------------------- 1 | #ifndef __BPF_LOAD_H 2 | #define __BPF_LOAD_H 3 | 4 | #include "libbpf.h" 5 | 6 | #define MAX_MAPS 32 7 | #define MAX_PROGS 32 8 | 9 | struct bpf_map_def { 10 | unsigned int type; 11 | unsigned int key_size; 12 | unsigned int value_size; 13 | unsigned int max_entries; 14 | unsigned int map_flags; 15 | unsigned int inner_map_idx; 16 | }; 17 | 18 | struct bpf_map_data { 19 | int fd; 20 | char *name; 21 | size_t elf_offset; 22 | struct bpf_map_def def; 23 | }; 24 | 25 | typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); 26 | 27 | extern int prog_fd[MAX_PROGS]; 28 | extern int event_fd[MAX_PROGS]; 29 | extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; 30 | extern int prog_cnt; 31 | 32 | /* There is a one-to-one mapping between map_fd[] and map_data[]. 33 | * The map_data[] just contains more rich info on the given map. 34 | */ 35 | extern int map_fd[MAX_MAPS]; 36 | extern struct bpf_map_data map_data[MAX_MAPS]; 37 | extern int map_data_count; 38 | 39 | /* parses elf file compiled by llvm .c->.o 40 | * . parses 'maps' section and creates maps via BPF syscall 41 | * . parses 'license' section and passes it to syscall 42 | * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by 43 | * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD 44 | * . 
loads eBPF programs via BPF syscall 45 | * 46 | * One ELF file can contain multiple BPF programs which will be loaded 47 | * and their FDs stored stored in prog_fd array 48 | * 49 | * returns zero on success 50 | */ 51 | int load_bpf_file(char *path); 52 | int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); 53 | 54 | void read_trace_pipe(void); 55 | struct ksym { 56 | long addr; 57 | char *name; 58 | }; 59 | 60 | int load_kallsyms(void); 61 | struct ksym *ksym_search(long key); 62 | int set_link_xdp_fd(int ifindex, int fd, __u32 flags); 63 | #endif 64 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TARGETS := memcpy_kprobe 2 | TARGETS += memcpy_stat 3 | 4 | # Generate file name-scheme based on TARGETS 5 | KERN_SOURCES = ${TARGETS:=_kern.c} 6 | USER_SOURCES = ${TARGETS:=_user.c} 7 | KERN_OBJECTS = ${KERN_SOURCES:.c=.o} 8 | USER_OBJECTS = ${USER_SOURCES:.c=.o} 9 | 10 | # Notice: the kbuilddir can be redefined on make cmdline 11 | KERNEL ?= /lib/modules/$(shell uname -r)/build/ 12 | 13 | CFLAGS := -O2 -Wall 14 | CFLAGS += -I ./ 15 | 16 | EXTRA_CFLAGS=-Werror 17 | 18 | LDFLAGS= -lelf 19 | 20 | BPFLIB = bpf/bpf.o 21 | BPFLIB += bpf_load.o 22 | 23 | LLC ?= llc 24 | CLANG ?= clang 25 | CC = gcc 26 | 27 | NOSTDINC_FLAGS := -nostdinc -isystem $(shell $(CC) -print-file-name=include) 28 | ARCH=$(shell uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/') 29 | 30 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include/generated/uapi 31 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include/generated 32 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include 33 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include/uapi 34 | LINUXINCLUDE += -I$(KERNEL)/include 35 | LINUXINCLUDE += -I$(KERNEL)/include/uapi 36 | LINUXINCLUDE += -include $(KERNEL)/include/linux/kconfig.h 37 | LINUXINCLUDE += -I$(KERNEL)/include/generated/uapi 38 | 39 | all: $(TARGETS) $(KERN_OBJECTS) 40 | 41 | .PHONY: clean $(CLANG) $(LLC) 42 | 43 | clean: 44 | rm -f *.ll 45 | rm -f $(BPFLIB) 46 | rm -f $(TARGETS) 47 | rm -f $(KERN_OBJECTS) 48 | rm -f $(USER_OBJECTS) 49 | 50 | # clang option -S generated output file with suffix .ll 51 | # which is the non-binary LLVM assembly language format 52 | # (normally LLVM bitcode format .bc is generated) 53 | # 54 | $(KERN_OBJECTS): %.o: %.c bpf_helpers.h 55 | #it will generate .ll file which is actually a LLVM assembly code 56 | $(CLANG) -S $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ 57 | -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ 58 | -Wno-compare-distinct-pointer-types \ 59 | -Wno-gnu-variable-sized-type-not-at-end \ 60 | -Wno-tautological-compare \ 61 | -O2 -emit-llvm -c $< 62 | #now translate LLVM assembly to native assembly 63 | $(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll} 64 | 65 | $(TARGETS): %: %_user.c $(BPFLIB) Makefile 66 | $(CC) $(CFLAGS) $(BPFLIB) $(LDFLAGS) -o $@ $< 67 | -------------------------------------------------------------------------------- /perf-sys.h: -------------------------------------------------------------------------------- 1 | #ifndef _PERF_SYS_H 2 | #define _PERF_SYS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #if defined(__i386__) 11 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); 12 | #define CPUINFO_PROC {"model name"} 13 | #endif 14 | 15 | #if defined(__x86_64__) 16 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); 17 | #define 
CPUINFO_PROC {"model name"} 18 | #endif 19 | 20 | #ifdef __powerpc__ 21 | #define CPUINFO_PROC {"cpu"} 22 | #endif 23 | 24 | #ifdef __s390__ 25 | #define CPUINFO_PROC {"vendor_id"} 26 | #endif 27 | 28 | #ifdef __sh__ 29 | #define CPUINFO_PROC {"cpu type"} 30 | #endif 31 | 32 | #ifdef __hppa__ 33 | #define CPUINFO_PROC {"cpu"} 34 | #endif 35 | 36 | #ifdef __sparc__ 37 | #define CPUINFO_PROC {"cpu"} 38 | #endif 39 | 40 | #ifdef __alpha__ 41 | #define CPUINFO_PROC {"cpu model"} 42 | #endif 43 | 44 | #ifdef __ia64__ 45 | #define cpu_relax() asm volatile ("hint @pause" ::: "memory") 46 | #define CPUINFO_PROC {"model name"} 47 | #endif 48 | 49 | #ifdef __arm__ 50 | #define CPUINFO_PROC {"model name", "Processor"} 51 | #endif 52 | 53 | #ifdef __aarch64__ 54 | #define cpu_relax() asm volatile("yield" ::: "memory") 55 | #endif 56 | 57 | #ifdef __mips__ 58 | #define CPUINFO_PROC {"cpu model"} 59 | #endif 60 | 61 | #ifdef __arc__ 62 | #define CPUINFO_PROC {"Processor"} 63 | #endif 64 | 65 | #ifdef __metag__ 66 | #define CPUINFO_PROC {"CPU"} 67 | #endif 68 | 69 | #ifdef __xtensa__ 70 | #define CPUINFO_PROC {"core ID"} 71 | #endif 72 | 73 | #ifdef __tile__ 74 | #define cpu_relax() asm volatile ("mfspr zero, PASS" ::: "memory") 75 | #define CPUINFO_PROC {"model name"} 76 | #endif 77 | 78 | #ifndef cpu_relax 79 | #define cpu_relax() barrier() 80 | #endif 81 | 82 | static inline int 83 | sys_perf_event_open(struct perf_event_attr *attr, 84 | pid_t pid, int cpu, int group_fd, 85 | unsigned long flags) 86 | { 87 | int fd; 88 | 89 | fd = syscall(__NR_perf_event_open, attr, pid, cpu, 90 | group_fd, flags); 91 | 92 | #ifdef HAVE_ATTR_TEST 93 | if (unlikely(test_attr__enabled)) 94 | test_attr__open(attr, pid, cpu, fd, group_fd, flags); 95 | #endif 96 | return fd; 97 | } 98 | 99 | #endif /* _PERF_SYS_H */ 100 | -------------------------------------------------------------------------------- /bpf/bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * common eBPF ELF operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | #ifndef __BPF_BPF_H 22 | #define __BPF_BPF_H 23 | 24 | #include 25 | #include 26 | 27 | int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, 28 | int max_entries, __u32 map_flags); 29 | int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, 30 | int inner_map_fd, int max_entries, __u32 map_flags); 31 | 32 | /* Recommend log buffer size */ 33 | #define BPF_LOG_BUF_SIZE 65536 34 | int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 35 | size_t insns_cnt, const char *license, 36 | __u32 kern_version, char *log_buf, 37 | size_t log_buf_sz); 38 | int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, 39 | size_t insns_cnt, int strict_alignment, 40 | const char *license, __u32 kern_version, 41 | char *log_buf, size_t log_buf_sz); 42 | 43 | int bpf_map_update_elem(int fd, const void *key, const void *value, 44 | __u64 flags); 45 | 46 | int bpf_map_lookup_elem(int fd, const void *key, void *value); 47 | int bpf_map_delete_elem(int fd, const void *key); 48 | int bpf_map_get_next_key(int fd, const void *key, void *next_key); 49 | int bpf_obj_pin(int fd, const char *pathname); 50 | int bpf_obj_get(const char *pathname); 51 | int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, 52 | unsigned int flags); 53 | int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); 54 | int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, 55 | void *data_out, __u32 *size_out, __u32 *retval, 56 | __u32 *duration); 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /libbpf.h: -------------------------------------------------------------------------------- 1 | /* eBPF mini library */ 2 | #ifndef __LIBBPF_H 3 | #define __LIBBPF_H 4 | 5 | #include 6 | 7 | struct bpf_insn; 8 | 9 | /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ 10 | 11 | #define BPF_ALU64_REG(OP, DST, SRC) \ 12 | ((struct bpf_insn) { \ 13 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ 14 | .dst_reg = DST, \ 15 | .src_reg = SRC, \ 16 | .off = 0, \ 17 | .imm = 0 }) 18 | 19 | #define BPF_ALU32_REG(OP, DST, SRC) \ 20 | ((struct bpf_insn) { \ 21 | .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ 22 | .dst_reg = DST, \ 23 | .src_reg = SRC, \ 24 | .off = 0, \ 25 | .imm = 0 }) 26 | 27 | /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ 28 | 29 | #define BPF_ALU64_IMM(OP, DST, IMM) \ 30 | ((struct bpf_insn) { \ 31 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ 32 | .dst_reg = DST, \ 33 | .src_reg = 0, \ 34 | .off = 0, \ 35 | .imm = IMM }) 36 | 37 | #define BPF_ALU32_IMM(OP, DST, IMM) \ 38 | ((struct bpf_insn) { \ 39 | .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ 40 | .dst_reg = DST, \ 41 | .src_reg = 0, \ 42 | .off = 0, \ 43 | .imm = IMM }) 44 | 45 | /* Short form of mov, dst_reg = src_reg */ 46 | 47 | #define BPF_MOV64_REG(DST, SRC) \ 48 | ((struct bpf_insn) { \ 49 | .code = BPF_ALU64 | BPF_MOV | BPF_X, \ 50 | .dst_reg = DST, \ 51 | .src_reg = SRC, \ 52 | .off = 0, \ 53 | .imm = 0 }) 54 | 55 | #define BPF_MOV32_REG(DST, SRC) \ 56 | ((struct bpf_insn) { \ 57 | .code = BPF_ALU | BPF_MOV | BPF_X, \ 58 | .dst_reg = DST, \ 59 | .src_reg = SRC, \ 60 | .off = 0, \ 61 | .imm = 0 }) 62 | 63 | /* Short form of mov, dst_reg = imm32 */ 64 | 65 | #define BPF_MOV64_IMM(DST, IMM) \ 66 | ((struct bpf_insn) { \ 67 | .code = BPF_ALU64 | BPF_MOV | 
BPF_K, \ 68 | .dst_reg = DST, \ 69 | .src_reg = 0, \ 70 | .off = 0, \ 71 | .imm = IMM }) 72 | 73 | #define BPF_MOV32_IMM(DST, IMM) \ 74 | ((struct bpf_insn) { \ 75 | .code = BPF_ALU | BPF_MOV | BPF_K, \ 76 | .dst_reg = DST, \ 77 | .src_reg = 0, \ 78 | .off = 0, \ 79 | .imm = IMM }) 80 | 81 | /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ 82 | #define BPF_LD_IMM64(DST, IMM) \ 83 | BPF_LD_IMM64_RAW(DST, 0, IMM) 84 | 85 | #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ 86 | ((struct bpf_insn) { \ 87 | .code = BPF_LD | BPF_DW | BPF_IMM, \ 88 | .dst_reg = DST, \ 89 | .src_reg = SRC, \ 90 | .off = 0, \ 91 | .imm = (__u32) (IMM) }), \ 92 | ((struct bpf_insn) { \ 93 | .code = 0, /* zero is reserved opcode */ \ 94 | .dst_reg = 0, \ 95 | .src_reg = 0, \ 96 | .off = 0, \ 97 | .imm = ((__u64) (IMM)) >> 32 }) 98 | 99 | #ifndef BPF_PSEUDO_MAP_FD 100 | # define BPF_PSEUDO_MAP_FD 1 101 | #endif 102 | 103 | /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ 104 | #define BPF_LD_MAP_FD(DST, MAP_FD) \ 105 | BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) 106 | 107 | 108 | /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ 109 | 110 | #define BPF_LD_ABS(SIZE, IMM) \ 111 | ((struct bpf_insn) { \ 112 | .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ 113 | .dst_reg = 0, \ 114 | .src_reg = 0, \ 115 | .off = 0, \ 116 | .imm = IMM }) 117 | 118 | /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ 119 | 120 | #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ 121 | ((struct bpf_insn) { \ 122 | .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ 123 | .dst_reg = DST, \ 124 | .src_reg = SRC, \ 125 | .off = OFF, \ 126 | .imm = 0 }) 127 | 128 | /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ 129 | 130 | #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ 131 | ((struct bpf_insn) { \ 132 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ 133 | .dst_reg = DST, \ 134 | .src_reg = SRC, \ 135 | .off = OFF, \ 136 | .imm = 0 }) 137 | 138 | /* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ 139 | 140 | #define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ 141 | ((struct bpf_insn) { \ 142 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ 143 | .dst_reg = DST, \ 144 | .src_reg = SRC, \ 145 | .off = OFF, \ 146 | .imm = 0 }) 147 | 148 | /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ 149 | 150 | #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ 151 | ((struct bpf_insn) { \ 152 | .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ 153 | .dst_reg = DST, \ 154 | .src_reg = 0, \ 155 | .off = OFF, \ 156 | .imm = IMM }) 157 | 158 | /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ 159 | 160 | #define BPF_JMP_REG(OP, DST, SRC, OFF) \ 161 | ((struct bpf_insn) { \ 162 | .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ 163 | .dst_reg = DST, \ 164 | .src_reg = SRC, \ 165 | .off = OFF, \ 166 | .imm = 0 }) 167 | 168 | /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ 169 | 170 | #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ 171 | ((struct bpf_insn) { \ 172 | .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ 173 | .dst_reg = DST, \ 174 | .src_reg = 0, \ 175 | .off = OFF, \ 176 | .imm = IMM }) 177 | 178 | /* Raw code statement block */ 179 | 180 | #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ 181 | ((struct bpf_insn) { \ 182 | .code = CODE, \ 183 | .dst_reg = DST, \ 184 | .src_reg = SRC, \ 185 | .off = OFF, \ 186 | .imm = IMM }) 187 | 188 | /* Program exit */ 189 | 190 | #define BPF_EXIT_INSN() \ 191 | ((struct bpf_insn) { \ 192 | .code = BPF_JMP | BPF_EXIT, \ 193 | 
.dst_reg = 0, \ 194 | .src_reg = 0, \ 195 | .off = 0, \ 196 | .imm = 0 }) 197 | 198 | #endif 199 | -------------------------------------------------------------------------------- /bpf_helpers.h: -------------------------------------------------------------------------------- 1 | #ifndef __BPF_HELPERS_H 2 | #define __BPF_HELPERS_H 3 | 4 | /* helper macro to place programs, maps, license in 5 | * different sections in elf_bpf file. Section names 6 | * are interpreted by elf_bpf loader 7 | */ 8 | #define SEC(NAME) __attribute__((section(NAME), used)) 9 | 10 | /* helper functions called from eBPF programs written in C */ 11 | static void *(*bpf_map_lookup_elem)(void *map, void *key) = 12 | (void *) BPF_FUNC_map_lookup_elem; 13 | static int (*bpf_map_update_elem)(void *map, void *key, void *value, 14 | unsigned long long flags) = 15 | (void *) BPF_FUNC_map_update_elem; 16 | static int (*bpf_map_delete_elem)(void *map, void *key) = 17 | (void *) BPF_FUNC_map_delete_elem; 18 | static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = 19 | (void *) BPF_FUNC_probe_read; 20 | static unsigned long long (*bpf_ktime_get_ns)(void) = 21 | (void *) BPF_FUNC_ktime_get_ns; 22 | static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = 23 | (void *) BPF_FUNC_trace_printk; 24 | static void (*bpf_tail_call)(void *ctx, void *map, int index) = 25 | (void *) BPF_FUNC_tail_call; 26 | static unsigned long long (*bpf_get_smp_processor_id)(void) = 27 | (void *) BPF_FUNC_get_smp_processor_id; 28 | static unsigned long long (*bpf_get_current_pid_tgid)(void) = 29 | (void *) BPF_FUNC_get_current_pid_tgid; 30 | static unsigned long long (*bpf_get_current_uid_gid)(void) = 31 | (void *) BPF_FUNC_get_current_uid_gid; 32 | static int (*bpf_get_current_comm)(void *buf, int buf_size) = 33 | (void *) BPF_FUNC_get_current_comm; 34 | static int (*bpf_perf_event_read)(void *map, int index) = 35 | (void *) BPF_FUNC_perf_event_read; 36 | static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = 37 | (void *) BPF_FUNC_clone_redirect; 38 | static int (*bpf_redirect)(int ifindex, int flags) = 39 | (void *) BPF_FUNC_redirect; 40 | static int (*bpf_perf_event_output)(void *ctx, void *map, 41 | unsigned long long flags, void *data, 42 | int size) = 43 | (void *) BPF_FUNC_perf_event_output; 44 | static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = 45 | (void *) BPF_FUNC_get_stackid; 46 | static int (*bpf_probe_write_user)(void *dst, void *src, int size) = 47 | (void *) BPF_FUNC_probe_write_user; 48 | static int (*bpf_current_task_under_cgroup)(void *map, int index) = 49 | (void *) BPF_FUNC_current_task_under_cgroup; 50 | static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = 51 | (void *) BPF_FUNC_skb_get_tunnel_key; 52 | static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = 53 | (void *) BPF_FUNC_skb_set_tunnel_key; 54 | static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = 55 | (void *) BPF_FUNC_skb_get_tunnel_opt; 56 | static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = 57 | (void *) BPF_FUNC_skb_set_tunnel_opt; 58 | static unsigned long long (*bpf_get_prandom_u32)(void) = 59 | (void *) BPF_FUNC_get_prandom_u32; 60 | static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = 61 | (void *) BPF_FUNC_xdp_adjust_head; 62 | 63 | /* llvm builtin functions that eBPF C program may use to 64 | * emit BPF_LD_ABS and BPF_LD_IND instructions 65 | */ 66 | struct sk_buff; 67 | unsigned long long load_byte(void *skb, 
68 | unsigned long long off) asm("llvm.bpf.load.byte"); 69 | unsigned long long load_half(void *skb, 70 | unsigned long long off) asm("llvm.bpf.load.half"); 71 | unsigned long long load_word(void *skb, 72 | unsigned long long off) asm("llvm.bpf.load.word"); 73 | 74 | /* a helper structure used by eBPF C program 75 | * to describe map attributes to elf_bpf loader 76 | */ 77 | struct bpf_map_def { 78 | unsigned int type; 79 | unsigned int key_size; 80 | unsigned int value_size; 81 | unsigned int max_entries; 82 | unsigned int map_flags; 83 | unsigned int inner_map_idx; 84 | }; 85 | 86 | static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = 87 | (void *) BPF_FUNC_skb_load_bytes; 88 | static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = 89 | (void *) BPF_FUNC_skb_store_bytes; 90 | static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = 91 | (void *) BPF_FUNC_l3_csum_replace; 92 | static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = 93 | (void *) BPF_FUNC_l4_csum_replace; 94 | static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = 95 | (void *) BPF_FUNC_skb_under_cgroup; 96 | static int (*bpf_skb_change_head)(void *, int len, int flags) = 97 | (void *) BPF_FUNC_skb_change_head; 98 | 99 | #if defined(__x86_64__) 100 | 101 | #define PT_REGS_PARM1(x) ((x)->di) 102 | #define PT_REGS_PARM2(x) ((x)->si) 103 | #define PT_REGS_PARM3(x) ((x)->dx) 104 | #define PT_REGS_PARM4(x) ((x)->cx) 105 | #define PT_REGS_PARM5(x) ((x)->r8) 106 | #define PT_REGS_RET(x) ((x)->sp) 107 | #define PT_REGS_FP(x) ((x)->bp) 108 | #define PT_REGS_RC(x) ((x)->ax) 109 | #define PT_REGS_SP(x) ((x)->sp) 110 | #define PT_REGS_IP(x) ((x)->ip) 111 | 112 | #elif defined(__s390x__) 113 | 114 | #define PT_REGS_PARM1(x) ((x)->gprs[2]) 115 | #define PT_REGS_PARM2(x) ((x)->gprs[3]) 116 | #define PT_REGS_PARM3(x) ((x)->gprs[4]) 117 | #define PT_REGS_PARM4(x) ((x)->gprs[5]) 118 | #define PT_REGS_PARM5(x) ((x)->gprs[6]) 119 | #define PT_REGS_RET(x) ((x)->gprs[14]) 120 | #define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ 121 | #define PT_REGS_RC(x) ((x)->gprs[2]) 122 | #define PT_REGS_SP(x) ((x)->gprs[15]) 123 | #define PT_REGS_IP(x) ((x)->psw.addr) 124 | 125 | #elif defined(__aarch64__) 126 | 127 | #define PT_REGS_PARM1(x) ((x)->regs[0]) 128 | #define PT_REGS_PARM2(x) ((x)->regs[1]) 129 | #define PT_REGS_PARM3(x) ((x)->regs[2]) 130 | #define PT_REGS_PARM4(x) ((x)->regs[3]) 131 | #define PT_REGS_PARM5(x) ((x)->regs[4]) 132 | #define PT_REGS_RET(x) ((x)->regs[30]) 133 | #define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ 134 | #define PT_REGS_RC(x) ((x)->regs[0]) 135 | #define PT_REGS_SP(x) ((x)->sp) 136 | #define PT_REGS_IP(x) ((x)->pc) 137 | 138 | #elif defined(__powerpc__) 139 | 140 | #define PT_REGS_PARM1(x) ((x)->gpr[3]) 141 | #define PT_REGS_PARM2(x) ((x)->gpr[4]) 142 | #define PT_REGS_PARM3(x) ((x)->gpr[5]) 143 | #define PT_REGS_PARM4(x) ((x)->gpr[6]) 144 | #define PT_REGS_PARM5(x) ((x)->gpr[7]) 145 | #define PT_REGS_RC(x) ((x)->gpr[3]) 146 | #define PT_REGS_SP(x) ((x)->sp) 147 | #define PT_REGS_IP(x) ((x)->nip) 148 | 149 | #elif defined(__sparc__) 150 | 151 | #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) 152 | #define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) 153 | #define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) 154 | #define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) 155 | #define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) 156 | #define PT_REGS_RET(x) 
((x)->u_regs[UREG_I7]) 157 | #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) 158 | #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) 159 | #if defined(__arch64__) 160 | #define PT_REGS_IP(x) ((x)->tpc) 161 | #else 162 | #define PT_REGS_IP(x) ((x)->pc) 163 | #endif 164 | 165 | #endif 166 | 167 | #ifdef __powerpc__ 168 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) 169 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 170 | #elif defined(__sparc__) 171 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) 172 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 173 | #else 174 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ 175 | bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) 176 | #define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ 177 | bpf_probe_read(&(ip), sizeof(ip), \ 178 | (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) 179 | #endif 180 | 181 | #endif 182 | -------------------------------------------------------------------------------- /bpf/bpf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * common eBPF ELF operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "bpf.h" 28 | 29 | /* 30 | * When building perf, unistd.h is overridden. __NR_bpf is 31 | * required to be defined explicitly. 32 | */ 33 | #ifndef __NR_bpf 34 | # if defined(__i386__) 35 | # define __NR_bpf 357 36 | # elif defined(__x86_64__) 37 | # define __NR_bpf 321 38 | # elif defined(__aarch64__) 39 | # define __NR_bpf 280 40 | # elif defined(__sparc__) 41 | # define __NR_bpf 349 42 | # else 43 | # error __NR_bpf not defined. libbpf does not support your arch. 
44 | # endif 45 | #endif 46 | 47 | static inline __u64 ptr_to_u64(const void *ptr) 48 | { 49 | return (__u64) (unsigned long) ptr; 50 | } 51 | 52 | static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, 53 | unsigned int size) 54 | { 55 | return syscall(__NR_bpf, cmd, attr, size); 56 | } 57 | 58 | int bpf_create_map(enum bpf_map_type map_type, int key_size, 59 | int value_size, int max_entries, __u32 map_flags) 60 | { 61 | union bpf_attr attr; 62 | 63 | memset(&attr, '\0', sizeof(attr)); 64 | 65 | attr.map_type = map_type; 66 | attr.key_size = key_size; 67 | attr.value_size = value_size; 68 | attr.max_entries = max_entries; 69 | attr.map_flags = map_flags; 70 | 71 | return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 72 | } 73 | 74 | int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, 75 | int inner_map_fd, int max_entries, __u32 map_flags) 76 | { 77 | union bpf_attr attr; 78 | 79 | memset(&attr, '\0', sizeof(attr)); 80 | 81 | attr.map_type = map_type; 82 | attr.key_size = key_size; 83 | attr.value_size = 4; 84 | attr.inner_map_fd = inner_map_fd; 85 | attr.max_entries = max_entries; 86 | attr.map_flags = map_flags; 87 | 88 | return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 89 | } 90 | 91 | int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 92 | size_t insns_cnt, const char *license, 93 | __u32 kern_version, char *log_buf, size_t log_buf_sz) 94 | { 95 | int fd; 96 | union bpf_attr attr; 97 | 98 | bzero(&attr, sizeof(attr)); 99 | attr.prog_type = type; 100 | attr.insn_cnt = (__u32)insns_cnt; 101 | attr.insns = ptr_to_u64(insns); 102 | attr.license = ptr_to_u64(license); 103 | attr.log_buf = ptr_to_u64(NULL); 104 | attr.log_size = 0; 105 | attr.log_level = 0; 106 | attr.kern_version = kern_version; 107 | 108 | fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 109 | if (fd >= 0 || !log_buf || !log_buf_sz) 110 | return fd; 111 | 112 | /* Try again with log */ 113 | attr.log_buf = ptr_to_u64(log_buf); 114 | attr.log_size = log_buf_sz; 115 | attr.log_level = 1; 116 | log_buf[0] = 0; 117 | return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 118 | } 119 | 120 | int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, 121 | size_t insns_cnt, int strict_alignment, 122 | const char *license, __u32 kern_version, 123 | char *log_buf, size_t log_buf_sz) 124 | { 125 | union bpf_attr attr; 126 | 127 | bzero(&attr, sizeof(attr)); 128 | attr.prog_type = type; 129 | attr.insn_cnt = (__u32)insns_cnt; 130 | attr.insns = ptr_to_u64(insns); 131 | attr.license = ptr_to_u64(license); 132 | attr.log_buf = ptr_to_u64(log_buf); 133 | attr.log_size = log_buf_sz; 134 | attr.log_level = 2; 135 | log_buf[0] = 0; 136 | attr.kern_version = kern_version; 137 | attr.prog_flags = strict_alignment ? 
BPF_F_STRICT_ALIGNMENT : 0; 138 | 139 | return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 140 | } 141 | 142 | int bpf_map_update_elem(int fd, const void *key, const void *value, 143 | __u64 flags) 144 | { 145 | union bpf_attr attr; 146 | 147 | bzero(&attr, sizeof(attr)); 148 | attr.map_fd = fd; 149 | attr.key = ptr_to_u64(key); 150 | attr.value = ptr_to_u64(value); 151 | attr.flags = flags; 152 | 153 | return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); 154 | } 155 | 156 | int bpf_map_lookup_elem(int fd, const void *key, void *value) 157 | { 158 | union bpf_attr attr; 159 | 160 | bzero(&attr, sizeof(attr)); 161 | attr.map_fd = fd; 162 | attr.key = ptr_to_u64(key); 163 | attr.value = ptr_to_u64(value); 164 | 165 | return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); 166 | } 167 | 168 | int bpf_map_delete_elem(int fd, const void *key) 169 | { 170 | union bpf_attr attr; 171 | 172 | bzero(&attr, sizeof(attr)); 173 | attr.map_fd = fd; 174 | attr.key = ptr_to_u64(key); 175 | 176 | return sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); 177 | } 178 | 179 | int bpf_map_get_next_key(int fd, const void *key, void *next_key) 180 | { 181 | union bpf_attr attr; 182 | 183 | bzero(&attr, sizeof(attr)); 184 | attr.map_fd = fd; 185 | attr.key = ptr_to_u64(key); 186 | attr.next_key = ptr_to_u64(next_key); 187 | 188 | return sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); 189 | } 190 | 191 | int bpf_obj_pin(int fd, const char *pathname) 192 | { 193 | union bpf_attr attr; 194 | 195 | bzero(&attr, sizeof(attr)); 196 | attr.pathname = ptr_to_u64((void *)pathname); 197 | attr.bpf_fd = fd; 198 | 199 | return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); 200 | } 201 | 202 | int bpf_obj_get(const char *pathname) 203 | { 204 | union bpf_attr attr; 205 | 206 | bzero(&attr, sizeof(attr)); 207 | attr.pathname = ptr_to_u64((void *)pathname); 208 | 209 | return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); 210 | } 211 | 212 | int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, 213 | unsigned int flags) 214 | { 215 | union bpf_attr attr; 216 | 217 | bzero(&attr, sizeof(attr)); 218 | attr.target_fd = target_fd; 219 | attr.attach_bpf_fd = prog_fd; 220 | attr.attach_type = type; 221 | attr.attach_flags = flags; 222 | 223 | return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); 224 | } 225 | 226 | int bpf_prog_detach(int target_fd, enum bpf_attach_type type) 227 | { 228 | union bpf_attr attr; 229 | 230 | bzero(&attr, sizeof(attr)); 231 | attr.target_fd = target_fd; 232 | attr.attach_type = type; 233 | 234 | return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); 235 | } 236 | 237 | int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, 238 | void *data_out, __u32 *size_out, __u32 *retval, 239 | __u32 *duration) 240 | { 241 | union bpf_attr attr; 242 | int ret; 243 | 244 | bzero(&attr, sizeof(attr)); 245 | attr.test.prog_fd = prog_fd; 246 | attr.test.data_in = ptr_to_u64(data); 247 | attr.test.data_out = ptr_to_u64(data_out); 248 | attr.test.data_size_in = size; 249 | attr.test.repeat = repeat; 250 | 251 | ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); 252 | if (size_out) 253 | *size_out = attr.test.data_size_out; 254 | if (retval) 255 | *retval = attr.test.retval; 256 | if (duration) 257 | *duration = attr.test.duration; 258 | return ret; 259 | } 260 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # learn-bpf 2 | 3 | This program has been 
tested with kernel 4.12.5-200.fc25.x86_64. 4 | 5 | memcpy_kprobe inserts a kprobe at the entry of the kernel's memcpy() and prints the BPF 6 | program's messages from the trace buffer. memcpy_stat builds a table, entirely in kernel 7 | space, counting memcpy() calls by copy size. 8 | 9 | # make 10 | # ./memcpy_kprobe 11 | memcpy_kprobe-24908 [005] d... 151374.866218: : memcpy size 2 12 | memcpy_kprobe-24908 [005] d... 151374.866221: : memcpy size 13 13 | memcpy_kprobe-24908 [005] d... 151374.866224: : memcpy size 1 14 | memcpy_kprobe-24908 [005] d... 151374.866226: : memcpy size 2 15 | memcpy_kprobe-24908 [005] d... 151374.866229: : memcpy size 2 16 | memcpy_kprobe-24908 [005] d... 151374.866232: : memcpy size 1 17 | memcpy_kprobe-24908 [005] d... 151374.866234: : memcpy size 1 18 | memcpy_kprobe-24908 [005] d... 151374.866237: : memcpy size 2 19 | 20 | # ./memcpy_stat 21 | Size Count 22 | 0 134 23 | 1 - 64 10444 24 | 65 - 128 23 25 | 129 - 192 16 26 | 193 - 256 240 27 | 257 - 320 0 28 | 321 - 384 0 29 | 385 - 448 7 30 | 449 - 512 7 31 | 513 - 576 0 32 | 577 - 640 0 33 | 641 - 704 0 34 | 705 - 768 0 35 | 769 - 832 0 36 | 833 - 896 0 37 | 897 - 960 0 38 | 961 - 1024* 107 39 | * Size > 1024 have been counted in this interval 40 | 41 | eBPF history and program description 42 | ==================================== 43 | 44 | Writing a simple eBPF application for Kernel Tracing 45 | ===================================================== 46 | 47 | 48 | 49 | eBPF: an introduction 50 | ====================== 51 | 52 | * BPF machine 53 | 54 | In 1992, Steven McCanne and Van Jacobson from Lawrence Berkeley 55 | Laboratory [1] proposed a solution for BSD Unix systems to minimise 56 | unwanted copying of network packets to user space by implementing an in-kernel 57 | packet filter. This filter is known as the Berkeley Packet Filter (BPF). It was 58 | later introduced in Linux kernel version 2.1.75 in 1997. 59 | 60 | The goal was to drop unwanted packets as early as 61 | possible, so the filtering mechanism was moved from user-space 62 | utilities like tcpdump into an in-kernel virtual machine. A set of 63 | assembly-like instructions describing which packets to keep is sent from 64 | user space to the kernel (on Linux, classically via setsockopt() on a socket). The kernel statically analyzes 65 | the programs before loading them and makes sure that they cannot hang or 66 | harm a running system. 67 | 68 | The BPF machine abstraction consists of [1] an accumulator, an index 69 | register (x), a scratch memory store, and an implicit program counter. 70 | It has a small set of arithmetic, logical, and jump instructions. The 71 | accumulator is used for arithmetic operations, while the index register 72 | provides offsets into the packet or into the scratch memory areas. Let's look at 73 | an example of a small BPF program written in BPF bytecode: 74 | 75 | ldh [12] 76 | jeq #ETHERTYPE_IP, l1, l2 77 | l1: ret #TRUE 78 | l2: ret #0 79 | 80 | The ldh instruction loads a half-word (16-bit) value into the accumulator from offset 81 | 12 of the Ethernet frame, which is the EtherType field. If the frame does not carry an IP 82 | packet, 0 is returned and the packet is rejected. 83 | 84 | * BPF JIT compiler 85 | 86 | A just-in-time (JIT) compiler was introduced into the kernel [2] in 2011 to 87 | speed up BPF bytecode execution. This compiler translates BPF bytecode into 88 | the host system's native machine code. Such compilers exist for x86_64, SPARC, 89 | PowerPC, ARM, ARM64, MIPS and s390 and can be enabled through 90 | CONFIG_BPF_JIT.
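
To make the classic BPF model concrete, here is a minimal sketch (not part of this repository; the helper name attach_ip_only_filter is made up) that expresses the bytecode filter above as a Linux socket filter and attaches it with setsockopt(). ETH_P_IP (0x0800) plays the role of ETHERTYPE_IP:

    #include <sys/socket.h>
    #include <linux/filter.h>     /* struct sock_filter, struct sock_fprog, BPF_* macros */
    #include <linux/if_ether.h>   /* ETH_P_IP */

    /* Hypothetical helper: attach an "IP packets only" filter to a raw socket,
     * e.g. one created with socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)).
     */
    static int attach_ip_only_filter(int sock)
    {
            struct sock_filter code[] = {
                    /* ldh [12]: load the EtherType half-word */
                    BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),
                    /* jeq #ETH_P_IP: fall through if IP, else skip to the drop */
                    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
                    BPF_STMT(BPF_RET | BPF_K, 0xffff),  /* accept (up to 0xffff bytes) */
                    BPF_STMT(BPF_RET | BPF_K, 0),       /* reject */
            };
            struct sock_fprog prog = {
                    .len = sizeof(code) / sizeof(code[0]),
                    .filter = code,
            };

            return setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
    }

The kernel verifies this filter when it is attached and, with CONFIG_BPF_JIT enabled, JIT-compiles it before it ever sees a packet.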
91 | 92 | * eBPF machine 93 | 94 | Extended BPF (eBPF) is an enhancement of BPF (which is now called cBPF, 95 | i.e. classic BPF) with more resources, such as ten registers and 1-8 byte 96 | load/store instructions [3]. While cBPF only had forward jumps, 97 | eBPF has both backward and forward jumps, so loops are possible. 98 | Of course, the kernel makes sure that such loops still terminate. eBPF also 99 | includes a global data store called maps, whose state 100 | persists between events, so eBPF can also be used to aggregate 101 | statistics of events. Further, an eBPF program can be written as C-like 102 | functions, which can be compiled with the LLVM/clang compiler. eBPF has been 103 | designed to be JIT-compiled with a one-to-one instruction mapping, so very optimized code 104 | can be generated that runs as fast as natively compiled code. 105 | 106 | eBPF and tracing review 107 | ========================== 108 | 109 | * Upstream kernel development 110 | 111 | Traditional built-in tracers in Linux work in a post-processing manner: 112 | they dump fixed event details, and user-space tools like 113 | perf or trace-cmd then post-process them to extract the required information, e.g. `perf 114 | stat`. eBPF, however, can prepare the information in kernel 115 | context and transfer only what is needed to user space. So far, eBPF 116 | filtering support for kprobes, tracepoints and perf_events has been 117 | implemented in the upstream kernel, on the x86-64, 118 | aarch64, s390x, powerpc64 and sparc64 architectures. 119 | 120 | One can look into the following Linux kernel files to get an insight into it: 121 | 122 | - kernel/bpf/ 123 | - kernel/trace/bpf_trace.c 124 | - kernel/events/core.c 125 | 126 | * User space development 127 | 128 | User-space tools have been developed both inside and outside the kernel 129 | tree. The following files/directories in the 130 | upstream kernel are good starting points for eBPF usage: 131 | 132 | - tools/lib/bpf 133 | - tools/perf/util/bpf-loader.c 134 | - samples/bpf/ 135 | 136 | bcc (https://github.com/iovisor/bcc.git) is an out-of-tree tool 137 | that ships very efficient kernel tracing programs for specific uses (like 138 | `funccount`, which counts functions matching a pattern). 139 | 140 | Perf also has a BPF interface that can be used to load an eBPF object 141 | into the kernel. 142 | 143 | eBPF tracing: User space to kernel space flow 144 | =============================================== 145 | 146 | Let's first look at a couple of entities used to interact with the eBPF infrastructure in the kernel: 147 | 148 | * BPF system call 149 | 150 | User space interacts with eBPF through the bpf() system call, whose prototype is int bpf(int 151 | cmd, union bpf_attr *attr, unsigned int size); 152 | 153 | See `man bpf` for details about the different possible arguments. 154 | Here is a summary of those arguments. 155 | 156 | *cmd* can be any of the defined enum bpf_cmd values; it tells the kernel 157 | mainly how to manage maps (their creation, updating, 158 | deleting or finding an element within them, etc.) and how to load, attach or detach 159 | a program. 160 | 161 | *attr* is a union bpf_attr filled in by user space with the fields used 162 | by the respective command. 163 | 164 | *size* is the size of attr. 165 | 166 | * BPF Maps: 167 | 168 | eBPF tracing calculates the stats in the kernel itself, so we need 169 | some memory/data structure within the kernel to hold such stats.
Maps 170 | are a generic data structure for storing different types of data as 171 | key-value pairs. They allow data to be shared between eBPF kernel 172 | programs, and also between the kernel and user-space applications. 173 | 174 | A few important attributes of a map: 175 | 176 | - Type (map_type) 177 | - maximum number of elements (max_entries) 178 | - key size in bytes (key_size) 179 | - value size in bytes (value_size) 180 | 181 | A map can be of different types, like hash, array, program array, etc., and we 182 | need to choose the appropriate type for our needs. For example, if the key is a 183 | string or does not come from a contiguous integer series, a hash map gives faster 184 | look-ups; if the key is like an index, an array map provides the 185 | fastest look-up method. 186 | 187 | A key cannot be bigger than key_size and a stored value cannot be bigger 188 | than value_size. max_entries is the maximum number of key-value pairs that 189 | can be stored within the map. 190 | 191 | * Some important commands: 192 | 193 | - BPF_PROG_LOAD: the important attributes for this command are: 194 | 195 | prog_type : program types useful for tracing include 196 | 197 | BPF_PROG_TYPE_KPROBE, 198 | 199 | BPF_PROG_TYPE_TRACEPOINT, 200 | 201 | BPF_PROG_TYPE_PERF_EVENT 202 | 203 | insns: a pointer to an array of "struct bpf_insn" holding the BPF instructions to 204 | be executed by the in-kernel BPF VM. 205 | 206 | insn_cnt: the total number of instructions present at insns. 207 | 208 | license: a string, which must be GPL-compatible in order to call helper functions 209 | marked gpl_only. 210 | 211 | kern_version: the kernel version; for kprobe programs it must match the running kernel's LINUX_VERSION_CODE. 212 | 213 | - BPF_MAP_CREATE: accepts the attributes discussed in the BPF Maps section, 214 | creates a new map and returns a new file descriptor that refers to the 215 | map. The returned map_fd can then be used to look up or update map elements with 216 | commands like BPF_MAP_LOOKUP_ELEM, BPF_MAP_UPDATE_ELEM, BPF_MAP_DELETE_ELEM 217 | or BPF_MAP_GET_NEXT_KEY. These map manipulation commands take an 218 | attribute containing map_fd, key and value. 219 | 220 | Now let's look at some code that shows how this works. See the example 221 | code here: https://github.com/pratyushanand/learn-bpf 222 | 223 | The above code is a standalone eBPF demo that does not need any other 224 | eBPF library code. It has a small library to load the different sections of the BPF 225 | kernel code (bpf_load.c) and some wrapper functions on top of the bpf() 226 | system call (bpf.c) to manipulate maps and load kernel BPF code. When we 227 | compile this code we get two executables, memcpy_kprobe and memcpy_stat. 228 | Let's first see what the memcpy_kprobe* files do. 229 | 230 | For each application we have one *_kern file and one *_user file. The 231 | *_kern file has a function "int bpf_prog1(struct pt_regs *ctx)". This 232 | function is executed in the kernel, so it can access kernel variables and 233 | functions. memcpy_kprobe_kern.c has three section mappings, for the program, 234 | license and version respectively. Data from these sections become part of the 235 | attributes of the system call bpf(BPF_PROG_LOAD, ...), and the kernel then executes the 236 | loaded BPF instructions according to the prog_type attribute. So, the BPF code in 237 | memcpy_kprobe_kern.c is executed whenever a kprobe instrumented at the 238 | entry of the kernel's memcpy() is hit. When this BPF code runs, it 239 | reads the 3rd argument of memcpy(), i.e. the copy size, and then prints one 240 | "memcpy size" line to the trace buffer, as sketched below.
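
The kernel-side flow just described, condensed from memcpy_kprobe_kern.c shown earlier in this repository (the full file also carries the license and version sections; the usual kernel headers and this repo's bpf_helpers.h are assumed to be included):

    SEC("kprobe/memcpy")
    int bpf_prog1(struct pt_regs *ctx)
    {
            unsigned long long size;
            char fmt[] = "memcpy size %d\n";

            /* PT_REGS_PARM3() is the 3rd memcpy() argument: the number of bytes copied */
            bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM3(ctx));

            /* one line per hit, readable from /sys/kernel/debug/tracing/trace_pipe */
            bpf_trace_printk(fmt, sizeof(fmt), size);

            return 0;
    }
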
memcpy_kprobe_user.c loads 241 | kernel program and keeps on reading trace buffer to show what kernel ebpf 242 | program is writing into it. 243 | 244 | We have another demo memcpy_stat which prepares stats of memcpy() copy size 245 | in kernel itself. memcpy_stat_kern.c has one more section as maps. 246 | bpf_prog1() reads memcpy() sizes and updates map table. Corresponding user 247 | space program memcpy_stat_user.c reads map table at every 2 second and 248 | prints stats on console. 249 | 250 | Above two simple example can help one to understand, how a user can write 251 | kernel ebpf code for kernel tracing and statistics preparation. 252 | 253 | [1] http://www.tcpdump.org/papers/bpf-usenix93.pdf 254 | [2] https://lwn.net/Articles/437884/ 255 | [3] https://www.kernel.org/doc/Documentation/networking/filter.txt 256 | [4] http://events.linuxfoundation.org/sites/events/files/slides/Performance%20Monitoring%20and%20Analysis%20Using%20perf%20and%20BPF_1.pdf 257 | 258 | 259 | -------------------------------------------------------------------------------- /bpf_load.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "libbpf.h" 27 | #include "bpf_load.h" 28 | #include "perf-sys.h" 29 | 30 | #define DEBUGFS "/sys/kernel/debug/tracing/" 31 | 32 | static char license[128]; 33 | static int kern_version; 34 | static bool processed_sec[128]; 35 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; 36 | int map_fd[MAX_MAPS]; 37 | int prog_fd[MAX_PROGS]; 38 | int event_fd[MAX_PROGS]; 39 | int prog_cnt; 40 | int prog_array_fd = -1; 41 | 42 | struct bpf_map_data map_data[MAX_MAPS]; 43 | int map_data_count = 0; 44 | 45 | static int populate_prog_array(const char *event, int prog_fd) 46 | { 47 | int ind = atoi(event), err; 48 | 49 | err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); 50 | if (err < 0) { 51 | printf("failed to store prog_fd in prog_array\n"); 52 | return -1; 53 | } 54 | return 0; 55 | } 56 | 57 | static int load_and_attach(const char *event, struct bpf_insn *prog, int size) 58 | { 59 | bool is_socket = strncmp(event, "socket", 6) == 0; 60 | bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; 61 | bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; 62 | bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; 63 | bool is_xdp = strncmp(event, "xdp", 3) == 0; 64 | bool is_perf_event = strncmp(event, "perf_event", 10) == 0; 65 | bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; 66 | bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; 67 | size_t insns_cnt = size / sizeof(struct bpf_insn); 68 | enum bpf_prog_type prog_type; 69 | char buf[256]; 70 | int fd, efd, err, id; 71 | struct perf_event_attr attr = {}; 72 | 73 | attr.type = PERF_TYPE_TRACEPOINT; 74 | attr.sample_type = PERF_SAMPLE_RAW; 75 | attr.sample_period = 1; 76 | attr.wakeup_events = 1; 77 | 78 | if (is_socket) { 79 | prog_type = BPF_PROG_TYPE_SOCKET_FILTER; 80 | } else if (is_kprobe || is_kretprobe) { 81 | prog_type = BPF_PROG_TYPE_KPROBE; 82 | } else if (is_tracepoint) { 83 | prog_type = BPF_PROG_TYPE_TRACEPOINT; 84 | } else if (is_xdp) { 85 | prog_type = BPF_PROG_TYPE_XDP; 86 | } else if (is_perf_event) { 87 | prog_type 
= BPF_PROG_TYPE_PERF_EVENT; 88 | } else if (is_cgroup_skb) { 89 | prog_type = BPF_PROG_TYPE_CGROUP_SKB; 90 | } else if (is_cgroup_sk) { 91 | prog_type = BPF_PROG_TYPE_CGROUP_SOCK; 92 | } else { 93 | printf("Unknown event '%s'\n", event); 94 | return -1; 95 | } 96 | 97 | fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, 98 | bpf_log_buf, BPF_LOG_BUF_SIZE); 99 | if (fd < 0) { 100 | printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); 101 | return -1; 102 | } 103 | 104 | prog_fd[prog_cnt++] = fd; 105 | 106 | if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) 107 | return 0; 108 | 109 | if (is_socket) { 110 | event += 6; 111 | if (*event != '/') 112 | return 0; 113 | event++; 114 | if (!isdigit(*event)) { 115 | printf("invalid prog number\n"); 116 | return -1; 117 | } 118 | return populate_prog_array(event, fd); 119 | } 120 | 121 | if (is_kprobe || is_kretprobe) { 122 | if (is_kprobe) 123 | event += 7; 124 | else 125 | event += 10; 126 | 127 | if (*event == 0) { 128 | printf("event name cannot be empty\n"); 129 | return -1; 130 | } 131 | 132 | if (isdigit(*event)) 133 | return populate_prog_array(event, fd); 134 | 135 | snprintf(buf, sizeof(buf), 136 | "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", 137 | is_kprobe ? 'p' : 'r', event, event); 138 | err = system(buf); 139 | if (err < 0) { 140 | printf("failed to create kprobe '%s' error '%s'\n", 141 | event, strerror(errno)); 142 | return -1; 143 | } 144 | 145 | strcpy(buf, DEBUGFS); 146 | strcat(buf, "events/kprobes/"); 147 | strcat(buf, event); 148 | strcat(buf, "/id"); 149 | } else if (is_tracepoint) { 150 | event += 11; 151 | 152 | if (*event == 0) { 153 | printf("event name cannot be empty\n"); 154 | return -1; 155 | } 156 | strcpy(buf, DEBUGFS); 157 | strcat(buf, "events/"); 158 | strcat(buf, event); 159 | strcat(buf, "/id"); 160 | } 161 | 162 | efd = open(buf, O_RDONLY, 0); 163 | if (efd < 0) { 164 | printf("failed to open event %s\n", event); 165 | return -1; 166 | } 167 | 168 | err = read(efd, buf, sizeof(buf)); 169 | if (err < 0 || err >= sizeof(buf)) { 170 | printf("read from '%s' failed '%s'\n", event, strerror(errno)); 171 | return -1; 172 | } 173 | 174 | close(efd); 175 | 176 | buf[err] = 0; 177 | id = atoi(buf); 178 | attr.config = id; 179 | 180 | efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); 181 | if (efd < 0) { 182 | printf("event %d fd %d err %s\n", id, efd, strerror(errno)); 183 | return -1; 184 | } 185 | event_fd[prog_cnt - 1] = efd; 186 | ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); 187 | ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); 188 | 189 | return 0; 190 | } 191 | 192 | static int load_maps(struct bpf_map_data *maps, int nr_maps, 193 | fixup_map_cb fixup_map) 194 | { 195 | int i; 196 | 197 | for (i = 0; i < nr_maps; i++) { 198 | if (fixup_map) { 199 | fixup_map(&maps[i], i); 200 | /* Allow userspace to assign map FD prior to creation */ 201 | if (maps[i].fd != -1) { 202 | map_fd[i] = maps[i].fd; 203 | continue; 204 | } 205 | } 206 | 207 | if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 208 | maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { 209 | int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; 210 | 211 | map_fd[i] = bpf_create_map_in_map(maps[i].def.type, 212 | maps[i].def.key_size, 213 | inner_map_fd, 214 | maps[i].def.max_entries, 215 | maps[i].def.map_flags); 216 | } else { 217 | map_fd[i] = bpf_create_map(maps[i].def.type, 218 | maps[i].def.key_size, 219 | maps[i].def.value_size, 220 | maps[i].def.max_entries, 221 | 
maps[i].def.map_flags); 222 | } 223 | if (map_fd[i] < 0) { 224 | printf("failed to create a map: %d %s\n", 225 | errno, strerror(errno)); 226 | return 1; 227 | } 228 | maps[i].fd = map_fd[i]; 229 | 230 | if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) 231 | prog_array_fd = map_fd[i]; 232 | } 233 | return 0; 234 | } 235 | 236 | static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, 237 | GElf_Shdr *shdr, Elf_Data **data) 238 | { 239 | Elf_Scn *scn; 240 | 241 | scn = elf_getscn(elf, i); 242 | if (!scn) 243 | return 1; 244 | 245 | if (gelf_getshdr(scn, shdr) != shdr) 246 | return 2; 247 | 248 | *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); 249 | if (!*shname || !shdr->sh_size) 250 | return 3; 251 | 252 | *data = elf_getdata(scn, 0); 253 | if (!*data || elf_getdata(scn, *data) != NULL) 254 | return 4; 255 | 256 | return 0; 257 | } 258 | 259 | static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, 260 | GElf_Shdr *shdr, struct bpf_insn *insn, 261 | struct bpf_map_data *maps, int nr_maps) 262 | { 263 | int i, nrels; 264 | 265 | nrels = shdr->sh_size / shdr->sh_entsize; 266 | 267 | for (i = 0; i < nrels; i++) { 268 | GElf_Sym sym; 269 | GElf_Rel rel; 270 | unsigned int insn_idx; 271 | bool match = false; 272 | int j, map_idx; 273 | 274 | gelf_getrel(data, i, &rel); 275 | 276 | insn_idx = rel.r_offset / sizeof(struct bpf_insn); 277 | 278 | gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); 279 | 280 | if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 281 | printf("invalid relo for insn[%d].code 0x%x\n", 282 | insn_idx, insn[insn_idx].code); 283 | return 1; 284 | } 285 | insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 286 | 287 | /* Match FD relocation against recorded map_data[] offset */ 288 | for (map_idx = 0; map_idx < nr_maps; map_idx++) { 289 | if (maps[map_idx].elf_offset == sym.st_value) { 290 | match = true; 291 | break; 292 | } 293 | } 294 | if (match) { 295 | insn[insn_idx].imm = maps[map_idx].fd; 296 | } else { 297 | printf("invalid relo for insn[%d] no map_data match\n", 298 | insn_idx); 299 | return 1; 300 | } 301 | } 302 | 303 | return 0; 304 | } 305 | 306 | static int cmp_symbols(const void *l, const void *r) 307 | { 308 | const GElf_Sym *lsym = (const GElf_Sym *)l; 309 | const GElf_Sym *rsym = (const GElf_Sym *)r; 310 | 311 | if (lsym->st_value < rsym->st_value) 312 | return -1; 313 | else if (lsym->st_value > rsym->st_value) 314 | return 1; 315 | else 316 | return 0; 317 | } 318 | 319 | static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, 320 | Elf *elf, Elf_Data *symbols, int strtabidx) 321 | { 322 | int map_sz_elf, map_sz_copy; 323 | bool validate_zero = false; 324 | Elf_Data *data_maps; 325 | int i, nr_maps; 326 | GElf_Sym *sym; 327 | Elf_Scn *scn; 328 | int copy_sz; 329 | 330 | if (maps_shndx < 0) 331 | return -EINVAL; 332 | if (!symbols) 333 | return -EINVAL; 334 | 335 | /* Get data for maps section via elf index */ 336 | scn = elf_getscn(elf, maps_shndx); 337 | if (scn) 338 | data_maps = elf_getdata(scn, NULL); 339 | if (!scn || !data_maps) { 340 | printf("Failed to get Elf_Data from maps section %d\n", 341 | maps_shndx); 342 | return -EINVAL; 343 | } 344 | 345 | /* For each map get corrosponding symbol table entry */ 346 | sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); 347 | for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { 348 | assert(nr_maps < MAX_MAPS+1); 349 | if (!gelf_getsym(symbols, i, &sym[nr_maps])) 350 | continue; 351 | if (sym[nr_maps].st_shndx != maps_shndx) 352 | continue; 
353 | /* Only increment iif maps section */ 354 | nr_maps++; 355 | } 356 | 357 | /* Align to map_fd[] order, via sort on offset in sym.st_value */ 358 | qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); 359 | 360 | /* Keeping compatible with ELF maps section changes 361 | * ------------------------------------------------ 362 | * The program size of struct bpf_map_def is known by loader 363 | * code, but struct stored in ELF file can be different. 364 | * 365 | * Unfortunately sym[i].st_size is zero. To calculate the 366 | * struct size stored in the ELF file, assume all struct have 367 | * the same size, and simply divide with number of map 368 | * symbols. 369 | */ 370 | map_sz_elf = data_maps->d_size / nr_maps; 371 | map_sz_copy = sizeof(struct bpf_map_def); 372 | if (map_sz_elf < map_sz_copy) { 373 | /* 374 | * Backward compat, loading older ELF file with 375 | * smaller struct, keeping remaining bytes zero. 376 | */ 377 | map_sz_copy = map_sz_elf; 378 | } else if (map_sz_elf > map_sz_copy) { 379 | /* 380 | * Forward compat, loading newer ELF file with larger 381 | * struct with unknown features. Assume zero means 382 | * feature not used. Thus, validate rest of struct 383 | * data is zero. 384 | */ 385 | validate_zero = true; 386 | } 387 | 388 | /* Memcpy relevant part of ELF maps data to loader maps */ 389 | for (i = 0; i < nr_maps; i++) { 390 | unsigned char *addr, *end; 391 | struct bpf_map_def *def; 392 | const char *map_name; 393 | size_t offset; 394 | 395 | map_name = elf_strptr(elf, strtabidx, sym[i].st_name); 396 | maps[i].name = strdup(map_name); 397 | if (!maps[i].name) { 398 | printf("strdup(%s): %s(%d)\n", map_name, 399 | strerror(errno), errno); 400 | free(sym); 401 | return -errno; 402 | } 403 | 404 | /* Symbol value is offset into ELF maps section data area */ 405 | offset = sym[i].st_value; 406 | def = (struct bpf_map_def *)(data_maps->d_buf + offset); 407 | maps[i].elf_offset = offset; 408 | memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); 409 | memcpy(&maps[i].def, def, map_sz_copy); 410 | 411 | /* Verify no newer features were requested */ 412 | if (validate_zero) { 413 | addr = (unsigned char*) def + map_sz_copy; 414 | end = (unsigned char*) def + map_sz_elf; 415 | for (; addr < end; addr++) { 416 | if (*addr != 0) { 417 | free(sym); 418 | return -EFBIG; 419 | } 420 | } 421 | } 422 | } 423 | 424 | free(sym); 425 | return nr_maps; 426 | } 427 | 428 | static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) 429 | { 430 | int fd, i, ret, maps_shndx = -1, strtabidx = -1; 431 | Elf *elf; 432 | GElf_Ehdr ehdr; 433 | GElf_Shdr shdr, shdr_prog; 434 | Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; 435 | char *shname, *shname_prog; 436 | int nr_maps = 0; 437 | 438 | /* reset global variables */ 439 | kern_version = 0; 440 | memset(license, 0, sizeof(license)); 441 | memset(processed_sec, 0, sizeof(processed_sec)); 442 | 443 | if (elf_version(EV_CURRENT) == EV_NONE) 444 | return 1; 445 | 446 | fd = open(path, O_RDONLY, 0); 447 | if (fd < 0) 448 | return 1; 449 | 450 | elf = elf_begin(fd, ELF_C_READ, NULL); 451 | 452 | if (!elf) 453 | return 1; 454 | 455 | if (gelf_getehdr(elf, &ehdr) != &ehdr) 456 | return 1; 457 | 458 | /* clear all kprobes */ 459 | i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); 460 | 461 | /* scan over all elf sections to get license and map info */ 462 | for (i = 1; i < ehdr.e_shnum; i++) { 463 | 464 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 465 | continue; 466 | 467 | if (0) /* helpful for 
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
	int fd, i, ret, maps_shndx = -1, strtabidx = -1;
	Elf *elf;
	GElf_Ehdr ehdr;
	GElf_Shdr shdr, shdr_prog;
	Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
	char *shname, *shname_prog;
	int nr_maps = 0;

	/* reset global variables */
	kern_version = 0;
	memset(license, 0, sizeof(license));
	memset(processed_sec, 0, sizeof(processed_sec));

	if (elf_version(EV_CURRENT) == EV_NONE)
		return 1;

	fd = open(path, O_RDONLY, 0);
	if (fd < 0)
		return 1;

	elf = elf_begin(fd, ELF_C_READ, NULL);
	if (!elf)
		return 1;

	if (gelf_getehdr(elf, &ehdr) != &ehdr)
		return 1;

	/* clear all kprobes */
	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");

	/* scan over all elf sections to get license and map info */
	for (i = 1; i < ehdr.e_shnum; i++) {

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (0) /* helpful for llvm debugging */
			printf("section %d:%s data %p size %zd link %d flags %d\n",
			       i, shname, data->d_buf, data->d_size,
			       shdr.sh_link, (int) shdr.sh_flags);

		if (strcmp(shname, "license") == 0) {
			processed_sec[i] = true;
			memcpy(license, data->d_buf, data->d_size);
		} else if (strcmp(shname, "version") == 0) {
			processed_sec[i] = true;
			if (data->d_size != sizeof(int)) {
				printf("invalid size of version section %zd\n",
				       data->d_size);
				return 1;
			}
			memcpy(&kern_version, data->d_buf, sizeof(int));
		} else if (strcmp(shname, "maps") == 0) {
			int j;

			maps_shndx = i;
			data_maps = data;
			for (j = 0; j < MAX_MAPS; j++)
				map_data[j].fd = -1;
		} else if (shdr.sh_type == SHT_SYMTAB) {
			strtabidx = shdr.sh_link;
			symbols = data;
		}
	}

	ret = 1;

	if (!symbols) {
		printf("missing SHT_SYMTAB section\n");
		goto done;
	}

	if (data_maps) {
		nr_maps = load_elf_maps_section(map_data, maps_shndx,
						elf, symbols, strtabidx);
		if (nr_maps < 0) {
			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
			       nr_maps, strerror(-nr_maps));
			ret = 1;
			goto done;
		}
		if (load_maps(map_data, nr_maps, fixup_map))
			goto done;
		map_data_count = nr_maps;

		processed_sec[maps_shndx] = true;
	}

	/* load programs that need map fixup (relocations) */
	for (i = 1; i < ehdr.e_shnum; i++) {
		if (processed_sec[i])
			continue;

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (shdr.sh_type == SHT_REL) {
			struct bpf_insn *insns;

			if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
				    &shdr_prog, &data_prog))
				continue;

			if (shdr_prog.sh_type != SHT_PROGBITS ||
			    !(shdr_prog.sh_flags & SHF_EXECINSTR))
				continue;

			insns = (struct bpf_insn *) data_prog->d_buf;

			processed_sec[shdr.sh_info] = true;
			processed_sec[i] = true;

			if (parse_relo_and_apply(data, symbols, &shdr, insns,
						 map_data, nr_maps))
				continue;

			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
			    memcmp(shname_prog, "xdp", 3) == 0 ||
			    memcmp(shname_prog, "perf_event", 10) == 0 ||
			    memcmp(shname_prog, "socket", 6) == 0 ||
			    memcmp(shname_prog, "cgroup/", 7) == 0)
				load_and_attach(shname_prog, insns,
						data_prog->d_size);
		}
	}

	/* load programs that don't use maps */
	for (i = 1; i < ehdr.e_shnum; i++) {

		if (processed_sec[i])
			continue;

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (memcmp(shname, "kprobe/", 7) == 0 ||
		    memcmp(shname, "kretprobe/", 10) == 0 ||
		    memcmp(shname, "tracepoint/", 11) == 0 ||
		    memcmp(shname, "xdp", 3) == 0 ||
		    memcmp(shname, "perf_event", 10) == 0 ||
		    memcmp(shname, "socket", 6) == 0 ||
		    memcmp(shname, "cgroup/", 7) == 0)
			load_and_attach(shname, data->d_buf, data->d_size);
	}

	ret = 0;
done:
	close(fd);
	return ret;
}

int load_bpf_file(char *path)
{
	return do_load_bpf_file(path, NULL);
}

int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
{
	return do_load_bpf_file(path, fixup_map);
}

void read_trace_pipe(void)
{
	int trace_fd;

	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
	if (trace_fd < 0)
		return;

	while (1) {
		static char buf[4096];
		ssize_t sz;

		/* leave room for the terminating NUL byte */
		sz = read(trace_fd, buf, sizeof(buf) - 1);
		if (sz > 0) {
			buf[sz] = 0;
			puts(buf);
		}
	}
}

#define MAX_SYMS 300000
static struct ksym syms[MAX_SYMS];
static int sym_cnt;

static int ksym_cmp(const void *p1, const void *p2)
{
	/* compare as longs; returning the raw difference could overflow int */
	long a = ((struct ksym *)p1)->addr;
	long b = ((struct ksym *)p2)->addr;

	return a < b ? -1 : a > b ? 1 : 0;
}

int load_kallsyms(void)
{
	FILE *f = fopen("/proc/kallsyms", "r");
	char func[256], buf[256];
	char symbol;
	void *addr;
	int i = 0;

	if (!f)
		return -ENOENT;

	while (!feof(f)) {
		if (!fgets(buf, sizeof(buf), f))
			break;
		if (sscanf(buf, "%p %c %255s", &addr, &symbol, func) != 3)
			break;
		if (!addr)
			continue;
		if (i >= MAX_SYMS) {
			fclose(f);
			return -EFBIG;
		}
		syms[i].addr = (long) addr;
		syms[i].name = strdup(func);
		i++;
	}
	fclose(f);
	sym_cnt = i;
	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
	return 0;
}

struct ksym *ksym_search(long key)
{
	int start = 0, end = sym_cnt;
	long result;

	while (start < end) {
		size_t mid = start + (end - start) / 2;

		result = key - syms[mid].addr;
		if (result < 0)
			end = mid;
		else if (result > 0)
			start = mid + 1;
		else
			return &syms[mid];
	}

	if (start >= 1 && syms[start - 1].addr < key &&
	    key < syms[start].addr)
		/* valid ksym */
		return &syms[start - 1];

	/* out of range. return _stext */
	return &syms[0];
}

int set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
	struct sockaddr_nl sa;
	int sock, seq = 0, len, ret = -1;
	char buf[4096];
	struct nlattr *nla, *nla_xdp;
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifinfo;
		char attrbuf[64];
	} req;
	struct nlmsghdr *nh;
	struct nlmsgerr *err;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock < 0) {
		printf("open netlink socket: %s\n", strerror(errno));
		return -1;
	}

	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		printf("bind to netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_type = RTM_SETLINK;
	req.nh.nlmsg_pid = 0;
	req.nh.nlmsg_seq = ++seq;
	req.ifinfo.ifi_family = AF_UNSPEC;
	req.ifinfo.ifi_index = ifindex;

	/* start nested attribute for XDP */
	nla = (struct nlattr *)(((char *)&req)
				+ NLMSG_ALIGN(req.nh.nlmsg_len));
	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
	nla->nla_len = NLA_HDRLEN;

	/* add XDP fd */
	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
	nla->nla_len += nla_xdp->nla_len;

	/* if user passed in any flags, add those too */
	if (flags) {
		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
		nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
		nla->nla_len += nla_xdp->nla_len;
	}

	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
		printf("send to netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	len = recv(sock, buf, sizeof(buf), 0);
	if (len < 0) {
		printf("recv from netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
	     nh = NLMSG_NEXT(nh, len)) {
		if (nh->nlmsg_pid != getpid()) {
			printf("Wrong pid %d, expected %d\n",
			       nh->nlmsg_pid, getpid());
			goto cleanup;
		}
		if (nh->nlmsg_seq != seq) {
			printf("Wrong seq %d, expected %d\n",
			       nh->nlmsg_seq, seq);
			goto cleanup;
		}
		switch (nh->nlmsg_type) {
		case NLMSG_ERROR:
			err = (struct nlmsgerr *)NLMSG_DATA(nh);
			if (!err->error)
				continue;
			printf("nlmsg error %s\n", strerror(-err->error));
			goto cleanup;
		case NLMSG_DONE:
			break;
		}
	}

	ret = 0;

cleanup:
	close(sock);
	return ret;
}
--------------------------------------------------------------------------------
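
For reference, a minimal sketch of how set_link_xdp_fd() is typically combined with load_bpf_file() in a small XDP loader. The object name xdp_prog_kern.o, the interface argument, and the prog_fd[] array (assumed to be exported by bpf_load.h, as in the kernel samples this loader is derived from) are assumptions of this sketch, not files or programs shipped in this repository:

/* xdp_attach_sketch.c - hypothetical example, not built by this Makefile */
#include <stdio.h>
#include <net/if.h>
#include "libbpf.h"
#include "bpf_load.h"

int main(int argc, char **argv)
{
	int ifindex;

	if (argc != 2) {
		printf("usage: %s <ifname>\n", argv[0]);
		return 1;
	}

	ifindex = if_nametoindex(argv[1]);
	if (!ifindex) {
		printf("unknown interface %s\n", argv[1]);
		return 1;
	}

	/* load every SEC("...") program and map found in the object */
	if (load_bpf_file("xdp_prog_kern.o")) {
		printf("%s", bpf_log_buf);
		return 1;
	}

	/* attach the first loaded program to the interface;
	 * prog_fd[0] is assumed to be provided by bpf_load.h */
	if (set_link_xdp_fd(ifindex, prog_fd[0], 0) < 0) {
		printf("attaching XDP program failed\n");
		return 1;
	}

	getchar();	/* keep the program attached until a key is pressed */

	/* passing fd == -1 removes the XDP program again */
	set_link_xdp_fd(ifindex, -1, 0);
	return 0;
}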