├── .gitignore ├── memcpy_kprobe_user.c ├── memcpy_kprobe_kern.c ├── memcpy_stat_kern.c ├── memcpy_stat_user.c ├── bpf_load.h ├── Makefile ├── perf-sys.h ├── bpf ├── bpf.h └── bpf.c ├── libbpf.h ├── bpf_helpers.h ├── README.md └── bpf_load.c /.gitignore: -------------------------------------------------------------------------------- 1 | tags 2 | *.cscope 3 | *.swp 4 | *.o 5 | *.ll 6 | memcpy_kprobe 7 | memcpy_stat 8 | -------------------------------------------------------------------------------- /memcpy_kprobe_user.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "libbpf.h" 6 | #include "bpf_load.h" 7 | 8 | int main(int argc, char **argv) 9 | { 10 | char filename[256]; 11 | 12 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 13 | 14 | if (load_bpf_file(filename)) { 15 | printf("%s", bpf_log_buf); 16 | return 1; 17 | } 18 | 19 | read_trace_pipe(); 20 | 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /memcpy_kprobe_kern.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bpf_helpers.h" 6 | 7 | SEC("kprobe/memcpy") 8 | 9 | int bpf_prog1(struct pt_regs *ctx) 10 | { 11 | unsigned long long size; 12 | char fmt[] = "memcpy size %d\n"; 13 | 14 | bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM3(ctx)); 15 | 16 | bpf_trace_printk(fmt, sizeof(fmt), size); 17 | 18 | return 0; 19 | } 20 | 21 | char _license[] SEC("license") = "GPL"; 22 | u32 _version SEC("version") = LINUX_VERSION_CODE; 23 | -------------------------------------------------------------------------------- /memcpy_stat_kern.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "bpf_helpers.h" 6 | 7 | struct bpf_map_def SEC("maps") my_map = { 8 | .type = BPF_MAP_TYPE_HASH, 9 | .key_size = sizeof(size_t), 10 | .value_size = sizeof(u32), 11 | .max_entries = 17, 12 | }; 13 | 14 | SEC("kprobe/memcpy") 15 | 16 | int bpf_prog1(struct pt_regs *ctx) 17 | { 18 | size_t size; 19 | u32 *val, count_start = 0; 20 | 21 | bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM3(ctx)); 22 | 23 | if (size % 64) 24 | size += (64 - size % 64); 25 | 26 | if (size > 1024) 27 | size = 1024; 28 | 29 | val = bpf_map_lookup_elem(&my_map, &size); 30 | if (val && *val < UINT_MAX) 31 | *val = *val + 1; 32 | else 33 | bpf_map_update_elem(&my_map, &size, &count_start, BPF_NOEXIST); 34 | 35 | return 0; 36 | } 37 | 38 | char _license[] SEC("license") = "GPL"; 39 | u32 _version SEC("version") = LINUX_VERSION_CODE; 40 | -------------------------------------------------------------------------------- /memcpy_stat_user.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "libbpf.h" 7 | #include "bpf_load.h" 8 | 9 | int main(int argc, char **argv) 10 | { 11 | char filename[256]; 12 | size_t size; 13 | unsigned int size_cnt = 0; 14 | 15 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 16 | 17 | if (load_bpf_file(filename)) { 18 | printf("%s", bpf_log_buf); 19 | return 1; 20 | } 21 | 22 | while (1) { 23 | printf("\tSize\t\tCount\n"); 24 | for (size = 0; size <=1024; size = size + 64) { 25 | if(bpf_map_lookup_elem(map_fd[0], &size, &size_cnt)) 26 | size_cnt = 0; 27 | if (size == 1024) 
28 | printf("%4ld - %4ld*\t\t%d\n", size - 63, size, 29 | size_cnt); 30 | else if (size) 31 | printf("%4ld - %4ld\t\t%d\n", size - 63, size, 32 | size_cnt); 33 | else 34 | printf(" 0\t\t\t%d\n", size_cnt); 35 | } 36 | printf ("* Size > 1024 have been counted in this interval\n"); 37 | sleep(2); 38 | } 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /bpf_load.h: -------------------------------------------------------------------------------- 1 | #ifndef __BPF_LOAD_H 2 | #define __BPF_LOAD_H 3 | 4 | #include "libbpf.h" 5 | 6 | #define MAX_MAPS 32 7 | #define MAX_PROGS 32 8 | 9 | struct bpf_map_def { 10 | unsigned int type; 11 | unsigned int key_size; 12 | unsigned int value_size; 13 | unsigned int max_entries; 14 | unsigned int map_flags; 15 | unsigned int inner_map_idx; 16 | }; 17 | 18 | struct bpf_map_data { 19 | int fd; 20 | char *name; 21 | size_t elf_offset; 22 | struct bpf_map_def def; 23 | }; 24 | 25 | typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); 26 | 27 | extern int prog_fd[MAX_PROGS]; 28 | extern int event_fd[MAX_PROGS]; 29 | extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; 30 | extern int prog_cnt; 31 | 32 | /* There is a one-to-one mapping between map_fd[] and map_data[]. 33 | * The map_data[] just contains more rich info on the given map. 34 | */ 35 | extern int map_fd[MAX_MAPS]; 36 | extern struct bpf_map_data map_data[MAX_MAPS]; 37 | extern int map_data_count; 38 | 39 | /* parses elf file compiled by llvm .c->.o 40 | * . parses 'maps' section and creates maps via BPF syscall 41 | * . parses 'license' section and passes it to syscall 42 | * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by 43 | * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD 44 | * . 
loads eBPF programs via BPF syscall 45 | * 46 | * One ELF file can contain multiple BPF programs which will be loaded 47 | * and their FDs stored stored in prog_fd array 48 | * 49 | * returns zero on success 50 | */ 51 | int load_bpf_file(char *path); 52 | int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); 53 | 54 | void read_trace_pipe(void); 55 | struct ksym { 56 | long addr; 57 | char *name; 58 | }; 59 | 60 | int load_kallsyms(void); 61 | struct ksym *ksym_search(long key); 62 | int set_link_xdp_fd(int ifindex, int fd, __u32 flags); 63 | #endif 64 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TARGETS := memcpy_kprobe 2 | TARGETS += memcpy_stat 3 | 4 | # Generate file name-scheme based on TARGETS 5 | KERN_SOURCES = ${TARGETS:=_kern.c} 6 | USER_SOURCES = ${TARGETS:=_user.c} 7 | KERN_OBJECTS = ${KERN_SOURCES:.c=.o} 8 | USER_OBJECTS = ${USER_SOURCES:.c=.o} 9 | 10 | # Notice: the kbuilddir can be redefined on make cmdline 11 | KERNEL ?= /lib/modules/$(shell uname -r)/build/ 12 | 13 | CFLAGS := -O2 -Wall 14 | CFLAGS += -I ./ 15 | 16 | EXTRA_CFLAGS=-Werror 17 | 18 | LDFLAGS= -lelf 19 | 20 | BPFLIB = bpf/bpf.o 21 | BPFLIB += bpf_load.o 22 | 23 | LLC ?= llc 24 | CLANG ?= clang 25 | CC = gcc 26 | 27 | NOSTDINC_FLAGS := -nostdinc -isystem $(shell $(CC) -print-file-name=include) 28 | ARCH=$(shell uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/') 29 | 30 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include/generated/uapi 31 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include/generated 32 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include 33 | LINUXINCLUDE += -I$(KERNEL)/arch/$(ARCH)/include/uapi 34 | LINUXINCLUDE += -I$(KERNEL)/include 35 | LINUXINCLUDE += -I$(KERNEL)/include/uapi 36 | LINUXINCLUDE += -include $(KERNEL)/include/linux/kconfig.h 37 | LINUXINCLUDE += -I$(KERNEL)/include/generated/uapi 38 | 39 | all: $(TARGETS) $(KERN_OBJECTS) 40 | 41 | .PHONY: clean $(CLANG) $(LLC) 42 | 43 | clean: 44 | rm -f *.ll 45 | rm -f $(BPFLIB) 46 | rm -f $(TARGETS) 47 | rm -f $(KERN_OBJECTS) 48 | rm -f $(USER_OBJECTS) 49 | 50 | # clang option -S generated output file with suffix .ll 51 | # which is the non-binary LLVM assembly language format 52 | # (normally LLVM bitcode format .bc is generated) 53 | # 54 | $(KERN_OBJECTS): %.o: %.c bpf_helpers.h 55 | #it will generate .ll file which is actually a LLVM assembly code 56 | $(CLANG) -S $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ 57 | -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ 58 | -Wno-compare-distinct-pointer-types \ 59 | -Wno-gnu-variable-sized-type-not-at-end \ 60 | -Wno-tautological-compare \ 61 | -O2 -emit-llvm -c $< 62 | #now translate LLVM assembly to native assembly 63 | $(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll} 64 | 65 | $(TARGETS): %: %_user.c $(BPFLIB) Makefile 66 | $(CC) $(CFLAGS) $(BPFLIB) $(LDFLAGS) -o $@ $< 67 | -------------------------------------------------------------------------------- /perf-sys.h: -------------------------------------------------------------------------------- 1 | #ifndef _PERF_SYS_H 2 | #define _PERF_SYS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #if defined(__i386__) 11 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); 12 | #define CPUINFO_PROC {"model name"} 13 | #endif 14 | 15 | #if defined(__x86_64__) 16 | #define cpu_relax() asm volatile("rep; nop" ::: "memory"); 17 | #define 
CPUINFO_PROC {"model name"} 18 | #endif 19 | 20 | #ifdef __powerpc__ 21 | #define CPUINFO_PROC {"cpu"} 22 | #endif 23 | 24 | #ifdef __s390__ 25 | #define CPUINFO_PROC {"vendor_id"} 26 | #endif 27 | 28 | #ifdef __sh__ 29 | #define CPUINFO_PROC {"cpu type"} 30 | #endif 31 | 32 | #ifdef __hppa__ 33 | #define CPUINFO_PROC {"cpu"} 34 | #endif 35 | 36 | #ifdef __sparc__ 37 | #define CPUINFO_PROC {"cpu"} 38 | #endif 39 | 40 | #ifdef __alpha__ 41 | #define CPUINFO_PROC {"cpu model"} 42 | #endif 43 | 44 | #ifdef __ia64__ 45 | #define cpu_relax() asm volatile ("hint @pause" ::: "memory") 46 | #define CPUINFO_PROC {"model name"} 47 | #endif 48 | 49 | #ifdef __arm__ 50 | #define CPUINFO_PROC {"model name", "Processor"} 51 | #endif 52 | 53 | #ifdef __aarch64__ 54 | #define cpu_relax() asm volatile("yield" ::: "memory") 55 | #endif 56 | 57 | #ifdef __mips__ 58 | #define CPUINFO_PROC {"cpu model"} 59 | #endif 60 | 61 | #ifdef __arc__ 62 | #define CPUINFO_PROC {"Processor"} 63 | #endif 64 | 65 | #ifdef __metag__ 66 | #define CPUINFO_PROC {"CPU"} 67 | #endif 68 | 69 | #ifdef __xtensa__ 70 | #define CPUINFO_PROC {"core ID"} 71 | #endif 72 | 73 | #ifdef __tile__ 74 | #define cpu_relax() asm volatile ("mfspr zero, PASS" ::: "memory") 75 | #define CPUINFO_PROC {"model name"} 76 | #endif 77 | 78 | #ifndef cpu_relax 79 | #define cpu_relax() barrier() 80 | #endif 81 | 82 | static inline int 83 | sys_perf_event_open(struct perf_event_attr *attr, 84 | pid_t pid, int cpu, int group_fd, 85 | unsigned long flags) 86 | { 87 | int fd; 88 | 89 | fd = syscall(__NR_perf_event_open, attr, pid, cpu, 90 | group_fd, flags); 91 | 92 | #ifdef HAVE_ATTR_TEST 93 | if (unlikely(test_attr__enabled)) 94 | test_attr__open(attr, pid, cpu, fd, group_fd, flags); 95 | #endif 96 | return fd; 97 | } 98 | 99 | #endif /* _PERF_SYS_H */ 100 | -------------------------------------------------------------------------------- /bpf/bpf.h: -------------------------------------------------------------------------------- 1 | /* 2 | * common eBPF ELF operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | #ifndef __BPF_BPF_H 22 | #define __BPF_BPF_H 23 | 24 | #include 25 | #include 26 | 27 | int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, 28 | int max_entries, __u32 map_flags); 29 | int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, 30 | int inner_map_fd, int max_entries, __u32 map_flags); 31 | 32 | /* Recommend log buffer size */ 33 | #define BPF_LOG_BUF_SIZE 65536 34 | int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 35 | size_t insns_cnt, const char *license, 36 | __u32 kern_version, char *log_buf, 37 | size_t log_buf_sz); 38 | int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, 39 | size_t insns_cnt, int strict_alignment, 40 | const char *license, __u32 kern_version, 41 | char *log_buf, size_t log_buf_sz); 42 | 43 | int bpf_map_update_elem(int fd, const void *key, const void *value, 44 | __u64 flags); 45 | 46 | int bpf_map_lookup_elem(int fd, const void *key, void *value); 47 | int bpf_map_delete_elem(int fd, const void *key); 48 | int bpf_map_get_next_key(int fd, const void *key, void *next_key); 49 | int bpf_obj_pin(int fd, const char *pathname); 50 | int bpf_obj_get(const char *pathname); 51 | int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, 52 | unsigned int flags); 53 | int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); 54 | int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, 55 | void *data_out, __u32 *size_out, __u32 *retval, 56 | __u32 *duration); 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /libbpf.h: -------------------------------------------------------------------------------- 1 | /* eBPF mini library */ 2 | #ifndef __LIBBPF_H 3 | #define __LIBBPF_H 4 | 5 | #include 6 | 7 | struct bpf_insn; 8 | 9 | /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ 10 | 11 | #define BPF_ALU64_REG(OP, DST, SRC) \ 12 | ((struct bpf_insn) { \ 13 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ 14 | .dst_reg = DST, \ 15 | .src_reg = SRC, \ 16 | .off = 0, \ 17 | .imm = 0 }) 18 | 19 | #define BPF_ALU32_REG(OP, DST, SRC) \ 20 | ((struct bpf_insn) { \ 21 | .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ 22 | .dst_reg = DST, \ 23 | .src_reg = SRC, \ 24 | .off = 0, \ 25 | .imm = 0 }) 26 | 27 | /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ 28 | 29 | #define BPF_ALU64_IMM(OP, DST, IMM) \ 30 | ((struct bpf_insn) { \ 31 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ 32 | .dst_reg = DST, \ 33 | .src_reg = 0, \ 34 | .off = 0, \ 35 | .imm = IMM }) 36 | 37 | #define BPF_ALU32_IMM(OP, DST, IMM) \ 38 | ((struct bpf_insn) { \ 39 | .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ 40 | .dst_reg = DST, \ 41 | .src_reg = 0, \ 42 | .off = 0, \ 43 | .imm = IMM }) 44 | 45 | /* Short form of mov, dst_reg = src_reg */ 46 | 47 | #define BPF_MOV64_REG(DST, SRC) \ 48 | ((struct bpf_insn) { \ 49 | .code = BPF_ALU64 | BPF_MOV | BPF_X, \ 50 | .dst_reg = DST, \ 51 | .src_reg = SRC, \ 52 | .off = 0, \ 53 | .imm = 0 }) 54 | 55 | #define BPF_MOV32_REG(DST, SRC) \ 56 | ((struct bpf_insn) { \ 57 | .code = BPF_ALU | BPF_MOV | BPF_X, \ 58 | .dst_reg = DST, \ 59 | .src_reg = SRC, \ 60 | .off = 0, \ 61 | .imm = 0 }) 62 | 63 | /* Short form of mov, dst_reg = imm32 */ 64 | 65 | #define BPF_MOV64_IMM(DST, IMM) \ 66 | ((struct bpf_insn) { \ 67 | .code = BPF_ALU64 | BPF_MOV | 
BPF_K, \ 68 | .dst_reg = DST, \ 69 | .src_reg = 0, \ 70 | .off = 0, \ 71 | .imm = IMM }) 72 | 73 | #define BPF_MOV32_IMM(DST, IMM) \ 74 | ((struct bpf_insn) { \ 75 | .code = BPF_ALU | BPF_MOV | BPF_K, \ 76 | .dst_reg = DST, \ 77 | .src_reg = 0, \ 78 | .off = 0, \ 79 | .imm = IMM }) 80 | 81 | /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ 82 | #define BPF_LD_IMM64(DST, IMM) \ 83 | BPF_LD_IMM64_RAW(DST, 0, IMM) 84 | 85 | #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ 86 | ((struct bpf_insn) { \ 87 | .code = BPF_LD | BPF_DW | BPF_IMM, \ 88 | .dst_reg = DST, \ 89 | .src_reg = SRC, \ 90 | .off = 0, \ 91 | .imm = (__u32) (IMM) }), \ 92 | ((struct bpf_insn) { \ 93 | .code = 0, /* zero is reserved opcode */ \ 94 | .dst_reg = 0, \ 95 | .src_reg = 0, \ 96 | .off = 0, \ 97 | .imm = ((__u64) (IMM)) >> 32 }) 98 | 99 | #ifndef BPF_PSEUDO_MAP_FD 100 | # define BPF_PSEUDO_MAP_FD 1 101 | #endif 102 | 103 | /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ 104 | #define BPF_LD_MAP_FD(DST, MAP_FD) \ 105 | BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) 106 | 107 | 108 | /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ 109 | 110 | #define BPF_LD_ABS(SIZE, IMM) \ 111 | ((struct bpf_insn) { \ 112 | .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ 113 | .dst_reg = 0, \ 114 | .src_reg = 0, \ 115 | .off = 0, \ 116 | .imm = IMM }) 117 | 118 | /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ 119 | 120 | #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ 121 | ((struct bpf_insn) { \ 122 | .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ 123 | .dst_reg = DST, \ 124 | .src_reg = SRC, \ 125 | .off = OFF, \ 126 | .imm = 0 }) 127 | 128 | /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ 129 | 130 | #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ 131 | ((struct bpf_insn) { \ 132 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ 133 | .dst_reg = DST, \ 134 | .src_reg = SRC, \ 135 | .off = OFF, \ 136 | .imm = 0 }) 137 | 138 | /* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ 139 | 140 | #define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ 141 | ((struct bpf_insn) { \ 142 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ 143 | .dst_reg = DST, \ 144 | .src_reg = SRC, \ 145 | .off = OFF, \ 146 | .imm = 0 }) 147 | 148 | /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ 149 | 150 | #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ 151 | ((struct bpf_insn) { \ 152 | .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ 153 | .dst_reg = DST, \ 154 | .src_reg = 0, \ 155 | .off = OFF, \ 156 | .imm = IMM }) 157 | 158 | /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ 159 | 160 | #define BPF_JMP_REG(OP, DST, SRC, OFF) \ 161 | ((struct bpf_insn) { \ 162 | .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ 163 | .dst_reg = DST, \ 164 | .src_reg = SRC, \ 165 | .off = OFF, \ 166 | .imm = 0 }) 167 | 168 | /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ 169 | 170 | #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ 171 | ((struct bpf_insn) { \ 172 | .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ 173 | .dst_reg = DST, \ 174 | .src_reg = 0, \ 175 | .off = OFF, \ 176 | .imm = IMM }) 177 | 178 | /* Raw code statement block */ 179 | 180 | #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ 181 | ((struct bpf_insn) { \ 182 | .code = CODE, \ 183 | .dst_reg = DST, \ 184 | .src_reg = SRC, \ 185 | .off = OFF, \ 186 | .imm = IMM }) 187 | 188 | /* Program exit */ 189 | 190 | #define BPF_EXIT_INSN() \ 191 | ((struct bpf_insn) { \ 192 | .code = BPF_JMP | BPF_EXIT, \ 193 | 
.dst_reg = 0, \ 194 | .src_reg = 0, \ 195 | .off = 0, \ 196 | .imm = 0 }) 197 | 198 | #endif 199 | -------------------------------------------------------------------------------- /bpf_helpers.h: -------------------------------------------------------------------------------- 1 | #ifndef __BPF_HELPERS_H 2 | #define __BPF_HELPERS_H 3 | 4 | /* helper macro to place programs, maps, license in 5 | * different sections in elf_bpf file. Section names 6 | * are interpreted by elf_bpf loader 7 | */ 8 | #define SEC(NAME) __attribute__((section(NAME), used)) 9 | 10 | /* helper functions called from eBPF programs written in C */ 11 | static void *(*bpf_map_lookup_elem)(void *map, void *key) = 12 | (void *) BPF_FUNC_map_lookup_elem; 13 | static int (*bpf_map_update_elem)(void *map, void *key, void *value, 14 | unsigned long long flags) = 15 | (void *) BPF_FUNC_map_update_elem; 16 | static int (*bpf_map_delete_elem)(void *map, void *key) = 17 | (void *) BPF_FUNC_map_delete_elem; 18 | static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = 19 | (void *) BPF_FUNC_probe_read; 20 | static unsigned long long (*bpf_ktime_get_ns)(void) = 21 | (void *) BPF_FUNC_ktime_get_ns; 22 | static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = 23 | (void *) BPF_FUNC_trace_printk; 24 | static void (*bpf_tail_call)(void *ctx, void *map, int index) = 25 | (void *) BPF_FUNC_tail_call; 26 | static unsigned long long (*bpf_get_smp_processor_id)(void) = 27 | (void *) BPF_FUNC_get_smp_processor_id; 28 | static unsigned long long (*bpf_get_current_pid_tgid)(void) = 29 | (void *) BPF_FUNC_get_current_pid_tgid; 30 | static unsigned long long (*bpf_get_current_uid_gid)(void) = 31 | (void *) BPF_FUNC_get_current_uid_gid; 32 | static int (*bpf_get_current_comm)(void *buf, int buf_size) = 33 | (void *) BPF_FUNC_get_current_comm; 34 | static int (*bpf_perf_event_read)(void *map, int index) = 35 | (void *) BPF_FUNC_perf_event_read; 36 | static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = 37 | (void *) BPF_FUNC_clone_redirect; 38 | static int (*bpf_redirect)(int ifindex, int flags) = 39 | (void *) BPF_FUNC_redirect; 40 | static int (*bpf_perf_event_output)(void *ctx, void *map, 41 | unsigned long long flags, void *data, 42 | int size) = 43 | (void *) BPF_FUNC_perf_event_output; 44 | static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = 45 | (void *) BPF_FUNC_get_stackid; 46 | static int (*bpf_probe_write_user)(void *dst, void *src, int size) = 47 | (void *) BPF_FUNC_probe_write_user; 48 | static int (*bpf_current_task_under_cgroup)(void *map, int index) = 49 | (void *) BPF_FUNC_current_task_under_cgroup; 50 | static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = 51 | (void *) BPF_FUNC_skb_get_tunnel_key; 52 | static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = 53 | (void *) BPF_FUNC_skb_set_tunnel_key; 54 | static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = 55 | (void *) BPF_FUNC_skb_get_tunnel_opt; 56 | static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = 57 | (void *) BPF_FUNC_skb_set_tunnel_opt; 58 | static unsigned long long (*bpf_get_prandom_u32)(void) = 59 | (void *) BPF_FUNC_get_prandom_u32; 60 | static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = 61 | (void *) BPF_FUNC_xdp_adjust_head; 62 | 63 | /* llvm builtin functions that eBPF C program may use to 64 | * emit BPF_LD_ABS and BPF_LD_IND instructions 65 | */ 66 | struct sk_buff; 67 | unsigned long long load_byte(void *skb, 
68 | unsigned long long off) asm("llvm.bpf.load.byte"); 69 | unsigned long long load_half(void *skb, 70 | unsigned long long off) asm("llvm.bpf.load.half"); 71 | unsigned long long load_word(void *skb, 72 | unsigned long long off) asm("llvm.bpf.load.word"); 73 | 74 | /* a helper structure used by eBPF C program 75 | * to describe map attributes to elf_bpf loader 76 | */ 77 | struct bpf_map_def { 78 | unsigned int type; 79 | unsigned int key_size; 80 | unsigned int value_size; 81 | unsigned int max_entries; 82 | unsigned int map_flags; 83 | unsigned int inner_map_idx; 84 | }; 85 | 86 | static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = 87 | (void *) BPF_FUNC_skb_load_bytes; 88 | static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = 89 | (void *) BPF_FUNC_skb_store_bytes; 90 | static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = 91 | (void *) BPF_FUNC_l3_csum_replace; 92 | static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = 93 | (void *) BPF_FUNC_l4_csum_replace; 94 | static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = 95 | (void *) BPF_FUNC_skb_under_cgroup; 96 | static int (*bpf_skb_change_head)(void *, int len, int flags) = 97 | (void *) BPF_FUNC_skb_change_head; 98 | 99 | #if defined(__x86_64__) 100 | 101 | #define PT_REGS_PARM1(x) ((x)->di) 102 | #define PT_REGS_PARM2(x) ((x)->si) 103 | #define PT_REGS_PARM3(x) ((x)->dx) 104 | #define PT_REGS_PARM4(x) ((x)->cx) 105 | #define PT_REGS_PARM5(x) ((x)->r8) 106 | #define PT_REGS_RET(x) ((x)->sp) 107 | #define PT_REGS_FP(x) ((x)->bp) 108 | #define PT_REGS_RC(x) ((x)->ax) 109 | #define PT_REGS_SP(x) ((x)->sp) 110 | #define PT_REGS_IP(x) ((x)->ip) 111 | 112 | #elif defined(__s390x__) 113 | 114 | #define PT_REGS_PARM1(x) ((x)->gprs[2]) 115 | #define PT_REGS_PARM2(x) ((x)->gprs[3]) 116 | #define PT_REGS_PARM3(x) ((x)->gprs[4]) 117 | #define PT_REGS_PARM4(x) ((x)->gprs[5]) 118 | #define PT_REGS_PARM5(x) ((x)->gprs[6]) 119 | #define PT_REGS_RET(x) ((x)->gprs[14]) 120 | #define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ 121 | #define PT_REGS_RC(x) ((x)->gprs[2]) 122 | #define PT_REGS_SP(x) ((x)->gprs[15]) 123 | #define PT_REGS_IP(x) ((x)->psw.addr) 124 | 125 | #elif defined(__aarch64__) 126 | 127 | #define PT_REGS_PARM1(x) ((x)->regs[0]) 128 | #define PT_REGS_PARM2(x) ((x)->regs[1]) 129 | #define PT_REGS_PARM3(x) ((x)->regs[2]) 130 | #define PT_REGS_PARM4(x) ((x)->regs[3]) 131 | #define PT_REGS_PARM5(x) ((x)->regs[4]) 132 | #define PT_REGS_RET(x) ((x)->regs[30]) 133 | #define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ 134 | #define PT_REGS_RC(x) ((x)->regs[0]) 135 | #define PT_REGS_SP(x) ((x)->sp) 136 | #define PT_REGS_IP(x) ((x)->pc) 137 | 138 | #elif defined(__powerpc__) 139 | 140 | #define PT_REGS_PARM1(x) ((x)->gpr[3]) 141 | #define PT_REGS_PARM2(x) ((x)->gpr[4]) 142 | #define PT_REGS_PARM3(x) ((x)->gpr[5]) 143 | #define PT_REGS_PARM4(x) ((x)->gpr[6]) 144 | #define PT_REGS_PARM5(x) ((x)->gpr[7]) 145 | #define PT_REGS_RC(x) ((x)->gpr[3]) 146 | #define PT_REGS_SP(x) ((x)->sp) 147 | #define PT_REGS_IP(x) ((x)->nip) 148 | 149 | #elif defined(__sparc__) 150 | 151 | #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) 152 | #define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) 153 | #define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) 154 | #define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) 155 | #define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) 156 | #define PT_REGS_RET(x) 
((x)->u_regs[UREG_I7]) 157 | #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) 158 | #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) 159 | #if defined(__arch64__) 160 | #define PT_REGS_IP(x) ((x)->tpc) 161 | #else 162 | #define PT_REGS_IP(x) ((x)->pc) 163 | #endif 164 | 165 | #endif 166 | 167 | #ifdef __powerpc__ 168 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) 169 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 170 | #elif defined(__sparc__) 171 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) 172 | #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP 173 | #else 174 | #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ 175 | bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) 176 | #define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ 177 | bpf_probe_read(&(ip), sizeof(ip), \ 178 | (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) 179 | #endif 180 | 181 | #endif 182 | -------------------------------------------------------------------------------- /bpf/bpf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * common eBPF ELF operations. 3 | * 4 | * Copyright (C) 2013-2015 Alexei Starovoitov 5 | * Copyright (C) 2015 Wang Nan 6 | * Copyright (C) 2015 Huawei Inc. 7 | * 8 | * This program is free software; you can redistribute it and/or 9 | * modify it under the terms of the GNU Lesser General Public 10 | * License as published by the Free Software Foundation; 11 | * version 2.1 of the License (not later!) 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU Lesser General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU Lesser General Public 19 | * License along with this program; if not, see 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "bpf.h" 28 | 29 | /* 30 | * When building perf, unistd.h is overridden. __NR_bpf is 31 | * required to be defined explicitly. 32 | */ 33 | #ifndef __NR_bpf 34 | # if defined(__i386__) 35 | # define __NR_bpf 357 36 | # elif defined(__x86_64__) 37 | # define __NR_bpf 321 38 | # elif defined(__aarch64__) 39 | # define __NR_bpf 280 40 | # elif defined(__sparc__) 41 | # define __NR_bpf 349 42 | # else 43 | # error __NR_bpf not defined. libbpf does not support your arch. 
44 | # endif 45 | #endif 46 | 47 | static inline __u64 ptr_to_u64(const void *ptr) 48 | { 49 | return (__u64) (unsigned long) ptr; 50 | } 51 | 52 | static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, 53 | unsigned int size) 54 | { 55 | return syscall(__NR_bpf, cmd, attr, size); 56 | } 57 | 58 | int bpf_create_map(enum bpf_map_type map_type, int key_size, 59 | int value_size, int max_entries, __u32 map_flags) 60 | { 61 | union bpf_attr attr; 62 | 63 | memset(&attr, '\0', sizeof(attr)); 64 | 65 | attr.map_type = map_type; 66 | attr.key_size = key_size; 67 | attr.value_size = value_size; 68 | attr.max_entries = max_entries; 69 | attr.map_flags = map_flags; 70 | 71 | return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 72 | } 73 | 74 | int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, 75 | int inner_map_fd, int max_entries, __u32 map_flags) 76 | { 77 | union bpf_attr attr; 78 | 79 | memset(&attr, '\0', sizeof(attr)); 80 | 81 | attr.map_type = map_type; 82 | attr.key_size = key_size; 83 | attr.value_size = 4; 84 | attr.inner_map_fd = inner_map_fd; 85 | attr.max_entries = max_entries; 86 | attr.map_flags = map_flags; 87 | 88 | return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 89 | } 90 | 91 | int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, 92 | size_t insns_cnt, const char *license, 93 | __u32 kern_version, char *log_buf, size_t log_buf_sz) 94 | { 95 | int fd; 96 | union bpf_attr attr; 97 | 98 | bzero(&attr, sizeof(attr)); 99 | attr.prog_type = type; 100 | attr.insn_cnt = (__u32)insns_cnt; 101 | attr.insns = ptr_to_u64(insns); 102 | attr.license = ptr_to_u64(license); 103 | attr.log_buf = ptr_to_u64(NULL); 104 | attr.log_size = 0; 105 | attr.log_level = 0; 106 | attr.kern_version = kern_version; 107 | 108 | fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 109 | if (fd >= 0 || !log_buf || !log_buf_sz) 110 | return fd; 111 | 112 | /* Try again with log */ 113 | attr.log_buf = ptr_to_u64(log_buf); 114 | attr.log_size = log_buf_sz; 115 | attr.log_level = 1; 116 | log_buf[0] = 0; 117 | return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 118 | } 119 | 120 | int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, 121 | size_t insns_cnt, int strict_alignment, 122 | const char *license, __u32 kern_version, 123 | char *log_buf, size_t log_buf_sz) 124 | { 125 | union bpf_attr attr; 126 | 127 | bzero(&attr, sizeof(attr)); 128 | attr.prog_type = type; 129 | attr.insn_cnt = (__u32)insns_cnt; 130 | attr.insns = ptr_to_u64(insns); 131 | attr.license = ptr_to_u64(license); 132 | attr.log_buf = ptr_to_u64(log_buf); 133 | attr.log_size = log_buf_sz; 134 | attr.log_level = 2; 135 | log_buf[0] = 0; 136 | attr.kern_version = kern_version; 137 | attr.prog_flags = strict_alignment ? 
BPF_F_STRICT_ALIGNMENT : 0; 138 | 139 | return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); 140 | } 141 | 142 | int bpf_map_update_elem(int fd, const void *key, const void *value, 143 | __u64 flags) 144 | { 145 | union bpf_attr attr; 146 | 147 | bzero(&attr, sizeof(attr)); 148 | attr.map_fd = fd; 149 | attr.key = ptr_to_u64(key); 150 | attr.value = ptr_to_u64(value); 151 | attr.flags = flags; 152 | 153 | return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); 154 | } 155 | 156 | int bpf_map_lookup_elem(int fd, const void *key, void *value) 157 | { 158 | union bpf_attr attr; 159 | 160 | bzero(&attr, sizeof(attr)); 161 | attr.map_fd = fd; 162 | attr.key = ptr_to_u64(key); 163 | attr.value = ptr_to_u64(value); 164 | 165 | return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); 166 | } 167 | 168 | int bpf_map_delete_elem(int fd, const void *key) 169 | { 170 | union bpf_attr attr; 171 | 172 | bzero(&attr, sizeof(attr)); 173 | attr.map_fd = fd; 174 | attr.key = ptr_to_u64(key); 175 | 176 | return sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); 177 | } 178 | 179 | int bpf_map_get_next_key(int fd, const void *key, void *next_key) 180 | { 181 | union bpf_attr attr; 182 | 183 | bzero(&attr, sizeof(attr)); 184 | attr.map_fd = fd; 185 | attr.key = ptr_to_u64(key); 186 | attr.next_key = ptr_to_u64(next_key); 187 | 188 | return sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); 189 | } 190 | 191 | int bpf_obj_pin(int fd, const char *pathname) 192 | { 193 | union bpf_attr attr; 194 | 195 | bzero(&attr, sizeof(attr)); 196 | attr.pathname = ptr_to_u64((void *)pathname); 197 | attr.bpf_fd = fd; 198 | 199 | return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); 200 | } 201 | 202 | int bpf_obj_get(const char *pathname) 203 | { 204 | union bpf_attr attr; 205 | 206 | bzero(&attr, sizeof(attr)); 207 | attr.pathname = ptr_to_u64((void *)pathname); 208 | 209 | return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); 210 | } 211 | 212 | int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, 213 | unsigned int flags) 214 | { 215 | union bpf_attr attr; 216 | 217 | bzero(&attr, sizeof(attr)); 218 | attr.target_fd = target_fd; 219 | attr.attach_bpf_fd = prog_fd; 220 | attr.attach_type = type; 221 | attr.attach_flags = flags; 222 | 223 | return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); 224 | } 225 | 226 | int bpf_prog_detach(int target_fd, enum bpf_attach_type type) 227 | { 228 | union bpf_attr attr; 229 | 230 | bzero(&attr, sizeof(attr)); 231 | attr.target_fd = target_fd; 232 | attr.attach_type = type; 233 | 234 | return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); 235 | } 236 | 237 | int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, 238 | void *data_out, __u32 *size_out, __u32 *retval, 239 | __u32 *duration) 240 | { 241 | union bpf_attr attr; 242 | int ret; 243 | 244 | bzero(&attr, sizeof(attr)); 245 | attr.test.prog_fd = prog_fd; 246 | attr.test.data_in = ptr_to_u64(data); 247 | attr.test.data_out = ptr_to_u64(data_out); 248 | attr.test.data_size_in = size; 249 | attr.test.repeat = repeat; 250 | 251 | ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); 252 | if (size_out) 253 | *size_out = attr.test.data_size_out; 254 | if (retval) 255 | *retval = attr.test.retval; 256 | if (duration) 257 | *duration = attr.test.duration; 258 | return ret; 259 | } 260 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # learn-bpf 2 | 3 | This program has been 
tested with kernel 4.12.5-200.fc25.x86_64. 4 | 5 | memcpy_kprobe inserts a kprobe at the entry of the kernel's memcpy() and prints the BPF 6 | program's messages from the trace buffer. memcpy_stat builds a table, entirely in kernel 7 | space, counting memcpy() calls by copy size. 8 | 9 | # make 10 | # ./memcpy_kprobe 11 | memcpy_kprobe-24908 [005] d... 151374.866218: : memcpy size 2 12 | memcpy_kprobe-24908 [005] d... 151374.866221: : memcpy size 13 13 | memcpy_kprobe-24908 [005] d... 151374.866224: : memcpy size 1 14 | memcpy_kprobe-24908 [005] d... 151374.866226: : memcpy size 2 15 | memcpy_kprobe-24908 [005] d... 151374.866229: : memcpy size 2 16 | memcpy_kprobe-24908 [005] d... 151374.866232: : memcpy size 1 17 | memcpy_kprobe-24908 [005] d... 151374.866234: : memcpy size 1 18 | memcpy_kprobe-24908 [005] d... 151374.866237: : memcpy size 2 19 | 20 | # ./memcpy_stat 21 | Size Count 22 | 0 134 23 | 1 - 64 10444 24 | 65 - 128 23 25 | 129 - 192 16 26 | 193 - 256 240 27 | 257 - 320 0 28 | 321 - 384 0 29 | 385 - 448 7 30 | 449 - 512 7 31 | 513 - 576 0 32 | 577 - 640 0 33 | 641 - 704 0 34 | 705 - 768 0 35 | 769 - 832 0 36 | 833 - 896 0 37 | 897 - 960 0 38 | 961 - 1024* 107 39 | * Size > 1024 have been counted in this interval 40 | 41 | eBPF history and program description 42 | ==================================== 43 | 44 | Writing a simple eBPF application for Kernel Tracing 45 | ===================================================== 46 | 47 | 48 | 49 | eBPF: an introduction 50 | ====================== 51 | 52 | * BPF machine 53 | 54 | In 1992, Steven McCanne and Van Jacobson from Lawrence Berkeley 55 | Laboratory [1] proposed a solution for BSD Unix systems to minimise 56 | unwanted copying of network packets to user space by implementing an in-kernel 57 | packet filter. This filter is known as the Berkeley Packet Filter (BPF). It was 58 | later introduced in Linux kernel version 2.1.75 in 1997. 59 | 60 | The goal was to drop unwanted packets as early as 61 | possible, so the filtering mechanism was moved from user-space 62 | utilities like tcpdump into an in-kernel virtual machine. A set of 63 | assembly-like instructions describing which packets to keep is sent from 64 | user space to the kernel (on Linux, classically via setsockopt() on a socket). The kernel statically analyzes 65 | the programs before loading them and makes sure that they cannot hang or 66 | harm a running system. 67 | 68 | The BPF machine abstraction consists of [1] an accumulator, an index 69 | register (x), a scratch memory store, and an implicit program counter. 70 | It has a small set of arithmetic, logical, and jump instructions. The 71 | accumulator is used for arithmetic operations, while the index register 72 | provides offsets into the packet or into the scratch memory areas. Let's look at 73 | an example of a small BPF program written in BPF bytecode: 74 | 75 | ldh [12] 76 | jeq #ETHERTYPE_IP, l1, l2 77 | l1: ret #TRUE 78 | l2: ret #0 79 | 80 | The ldh instruction loads a half-word (16-bit) value into the accumulator from offset 81 | 12 of the Ethernet frame, which is the EtherType field. If the frame does not carry an IP 82 | packet, 0 is returned and the packet is rejected. 83 | 84 | * BPF JIT compiler 85 | 86 | A just-in-time (JIT) compiler was introduced into the kernel [2] in 2011 to 87 | speed up BPF bytecode execution. This compiler translates BPF bytecode into 88 | the host system's native machine code. Such compilers exist for x86_64, SPARC, 89 | PowerPC, ARM, ARM64, MIPS and s390 and can be enabled through 90 | CONFIG_BPF_JIT.
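
To make the classic BPF model concrete, here is a minimal sketch (not part of this repository; the helper name attach_ip_only_filter is made up) that expresses the bytecode filter above as a Linux socket filter and attaches it with setsockopt(). ETH_P_IP (0x0800) plays the role of ETHERTYPE_IP:

    #include <sys/socket.h>
    #include <linux/filter.h>     /* struct sock_filter, struct sock_fprog, BPF_* macros */
    #include <linux/if_ether.h>   /* ETH_P_IP */

    /* Hypothetical helper: attach an "IP packets only" filter to a raw socket,
     * e.g. one created with socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)).
     */
    static int attach_ip_only_filter(int sock)
    {
            struct sock_filter code[] = {
                    /* ldh [12]: load the EtherType half-word */
                    BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),
                    /* jeq #ETH_P_IP: fall through if IP, else skip to the drop */
                    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
                    BPF_STMT(BPF_RET | BPF_K, 0xffff),  /* accept (up to 0xffff bytes) */
                    BPF_STMT(BPF_RET | BPF_K, 0),       /* reject */
            };
            struct sock_fprog prog = {
                    .len = sizeof(code) / sizeof(code[0]),
                    .filter = code,
            };

            return setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
    }

The kernel verifies this filter when it is attached and, with CONFIG_BPF_JIT enabled, JIT-compiles it before it ever sees a packet.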
91 | 92 | * eBPF machine 93 | 94 | Extended BPF (eBPF) is an enhancement of BPF (which is now called cBPF, 95 | i.e. classic BPF) with more resources, such as ten registers and 1-8 byte 96 | load/store instructions [3]. While cBPF only had forward jumps, 97 | eBPF has both backward and forward jumps, so loops are possible. 98 | Of course, the kernel makes sure that such loops still terminate. eBPF also 99 | includes a global data store called maps, whose state 100 | persists between events, so eBPF can also be used to aggregate 101 | statistics of events. Further, an eBPF program can be written as C-like 102 | functions, which can be compiled with the LLVM/clang compiler. eBPF has been 103 | designed to be JIT-compiled with a one-to-one instruction mapping, so very optimized code 104 | can be generated that runs as fast as natively compiled code. 105 | 106 | eBPF and tracing review 107 | ========================== 108 | 109 | * Upstream kernel development 110 | 111 | Traditional built-in tracers in Linux work in a post-processing manner: 112 | they dump fixed event details, and user-space tools like 113 | perf or trace-cmd then post-process them to extract the required information, e.g. `perf 114 | stat`. eBPF, however, can prepare the information in kernel 115 | context and transfer only what is needed to user space. So far, eBPF 116 | filtering support for kprobes, tracepoints and perf_events has been 117 | implemented in the upstream kernel, on the x86-64, 118 | aarch64, s390x, powerpc64 and sparc64 architectures. 119 | 120 | One can look into the following Linux kernel files to get an insight into it: 121 | 122 | - kernel/bpf/ 123 | - kernel/trace/bpf_trace.c 124 | - kernel/events/core.c 125 | 126 | * User space development 127 | 128 | User-space tools have been developed both inside and outside the kernel 129 | tree. The following files/directories in the 130 | upstream kernel are good starting points for eBPF usage: 131 | 132 | - tools/lib/bpf 133 | - tools/perf/util/bpf-loader.c 134 | - samples/bpf/ 135 | 136 | bcc (https://github.com/iovisor/bcc.git) is an out-of-tree tool 137 | that ships very efficient kernel tracing programs for specific uses (like 138 | `funccount`, which counts functions matching a pattern). 139 | 140 | Perf also has a BPF interface that can be used to load an eBPF object 141 | into the kernel. 142 | 143 | eBPF tracing: User space to kernel space flow 144 | =============================================== 145 | 146 | Let's first look at a couple of entities used to interact with the eBPF infrastructure in the kernel: 147 | 148 | * BPF system call 149 | 150 | User space interacts with eBPF through the bpf() system call, whose prototype is int bpf(int 151 | cmd, union bpf_attr *attr, unsigned int size); 152 | 153 | See `man bpf` for details about the different possible arguments. 154 | Here is a summary of those arguments. 155 | 156 | *cmd* can be any of the defined enum bpf_cmd values; it tells the kernel 157 | mainly how to manage maps (their creation, updating, 158 | deleting or finding an element within them, etc.) and how to load, attach or detach 159 | a program. 160 | 161 | *attr* is a union bpf_attr filled in by user space with the fields used 162 | by the respective command. 163 | 164 | *size* is the size of attr. 165 | 166 | * BPF Maps: 167 | 168 | eBPF tracing calculates the stats in the kernel itself, so we need 169 | some memory/data structure within the kernel to hold such stats.
Maps 170 | are a generic data structure for storing different types of data as 171 | key-value pairs. They allow data to be shared between eBPF kernel 172 | programs, and also between the kernel and user-space applications. 173 | 174 | A few important attributes of a map: 175 | 176 | - Type (map_type) 177 | - maximum number of elements (max_entries) 178 | - key size in bytes (key_size) 179 | - value size in bytes (value_size) 180 | 181 | A map can be of different types, like hash, array, program array, etc., and we 182 | need to choose the appropriate type for our needs. For example, if the key is a 183 | string or does not come from a contiguous integer series, a hash map gives faster 184 | look-ups; if the key is like an index, an array map provides the 185 | fastest look-up method. 186 | 187 | A key cannot be bigger than key_size and a stored value cannot be bigger 188 | than value_size. max_entries is the maximum number of key-value pairs that 189 | can be stored within the map. 190 | 191 | * Some important commands: 192 | 193 | - BPF_PROG_LOAD: the important attributes for this command are: 194 | 195 | prog_type : program types useful for tracing include 196 | 197 | BPF_PROG_TYPE_KPROBE, 198 | 199 | BPF_PROG_TYPE_TRACEPOINT, 200 | 201 | BPF_PROG_TYPE_PERF_EVENT 202 | 203 | insns: a pointer to an array of "struct bpf_insn" holding the BPF instructions to 204 | be executed by the in-kernel BPF VM. 205 | 206 | insn_cnt: the total number of instructions present at insns. 207 | 208 | license: a string, which must be GPL-compatible in order to call helper functions 209 | marked gpl_only. 210 | 211 | kern_version: the kernel version; for kprobe programs it must match the running kernel's LINUX_VERSION_CODE. 212 | 213 | - BPF_MAP_CREATE: accepts the attributes discussed in the BPF Maps section, 214 | creates a new map and returns a new file descriptor that refers to the 215 | map. The returned map_fd can then be used to look up or update map elements with 216 | commands like BPF_MAP_LOOKUP_ELEM, BPF_MAP_UPDATE_ELEM, BPF_MAP_DELETE_ELEM 217 | or BPF_MAP_GET_NEXT_KEY. These map manipulation commands take an 218 | attribute containing map_fd, key and value. 219 | 220 | Now let's look at some code that shows how this works. See the example 221 | code here: https://github.com/pratyushanand/learn-bpf 222 | 223 | The above code is a standalone eBPF demo that does not need any other 224 | eBPF library code. It has a small library to load the different sections of the BPF 225 | kernel code (bpf_load.c) and some wrapper functions on top of the bpf() 226 | system call (bpf.c) to manipulate maps and load kernel BPF code. When we 227 | compile this code we get two executables, memcpy_kprobe and memcpy_stat. 228 | Let's first see what the memcpy_kprobe* files do. 229 | 230 | For each application we have one *_kern file and one *_user file. The 231 | *_kern file has a function "int bpf_prog1(struct pt_regs *ctx)". This 232 | function is executed in the kernel, so it can access kernel variables and 233 | functions. memcpy_kprobe_kern.c has three section mappings, for the program, 234 | license and version respectively. Data from these sections become part of the 235 | attributes of the system call bpf(BPF_PROG_LOAD, ...), and the kernel then executes the 236 | loaded BPF instructions according to the prog_type attribute. So, the BPF code in 237 | memcpy_kprobe_kern.c is executed whenever a kprobe instrumented at the 238 | entry of the kernel's memcpy() is hit. When this BPF code runs, it 239 | reads the 3rd argument of memcpy(), i.e. the copy size, and then prints one 240 | "memcpy size" line to the trace buffer, as sketched below.
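
The kernel-side flow just described, condensed from memcpy_kprobe_kern.c shown earlier in this repository (the full file also carries the license and version sections; the usual kernel headers and this repo's bpf_helpers.h are assumed to be included):

    SEC("kprobe/memcpy")
    int bpf_prog1(struct pt_regs *ctx)
    {
            unsigned long long size;
            char fmt[] = "memcpy size %d\n";

            /* PT_REGS_PARM3() is the 3rd memcpy() argument: the number of bytes copied */
            bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM3(ctx));

            /* one line per hit, readable from /sys/kernel/debug/tracing/trace_pipe */
            bpf_trace_printk(fmt, sizeof(fmt), size);

            return 0;
    }
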
memcpy_kprobe_user.c loads 241 | kernel program and keeps on reading trace buffer to show what kernel ebpf 242 | program is writing into it. 243 | 244 | We have another demo memcpy_stat which prepares stats of memcpy() copy size 245 | in kernel itself. memcpy_stat_kern.c has one more section as maps. 246 | bpf_prog1() reads memcpy() sizes and updates map table. Corresponding user 247 | space program memcpy_stat_user.c reads map table at every 2 second and 248 | prints stats on console. 249 | 250 | Above two simple example can help one to understand, how a user can write 251 | kernel ebpf code for kernel tracing and statistics preparation. 252 | 253 | [1] http://www.tcpdump.org/papers/bpf-usenix93.pdf 254 | [2] https://lwn.net/Articles/437884/ 255 | [3] https://www.kernel.org/doc/Documentation/networking/filter.txt 256 | [4] http://events.linuxfoundation.org/sites/events/files/slides/Performance%20Monitoring%20and%20Analysis%20Using%20perf%20and%20BPF_1.pdf 257 | 258 | 259 | -------------------------------------------------------------------------------- /bpf_load.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "libbpf.h" 27 | #include "bpf_load.h" 28 | #include "perf-sys.h" 29 | 30 | #define DEBUGFS "/sys/kernel/debug/tracing/" 31 | 32 | static char license[128]; 33 | static int kern_version; 34 | static bool processed_sec[128]; 35 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; 36 | int map_fd[MAX_MAPS]; 37 | int prog_fd[MAX_PROGS]; 38 | int event_fd[MAX_PROGS]; 39 | int prog_cnt; 40 | int prog_array_fd = -1; 41 | 42 | struct bpf_map_data map_data[MAX_MAPS]; 43 | int map_data_count = 0; 44 | 45 | static int populate_prog_array(const char *event, int prog_fd) 46 | { 47 | int ind = atoi(event), err; 48 | 49 | err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); 50 | if (err < 0) { 51 | printf("failed to store prog_fd in prog_array\n"); 52 | return -1; 53 | } 54 | return 0; 55 | } 56 | 57 | static int load_and_attach(const char *event, struct bpf_insn *prog, int size) 58 | { 59 | bool is_socket = strncmp(event, "socket", 6) == 0; 60 | bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; 61 | bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; 62 | bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; 63 | bool is_xdp = strncmp(event, "xdp", 3) == 0; 64 | bool is_perf_event = strncmp(event, "perf_event", 10) == 0; 65 | bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; 66 | bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; 67 | size_t insns_cnt = size / sizeof(struct bpf_insn); 68 | enum bpf_prog_type prog_type; 69 | char buf[256]; 70 | int fd, efd, err, id; 71 | struct perf_event_attr attr = {}; 72 | 73 | attr.type = PERF_TYPE_TRACEPOINT; 74 | attr.sample_type = PERF_SAMPLE_RAW; 75 | attr.sample_period = 1; 76 | attr.wakeup_events = 1; 77 | 78 | if (is_socket) { 79 | prog_type = BPF_PROG_TYPE_SOCKET_FILTER; 80 | } else if (is_kprobe || is_kretprobe) { 81 | prog_type = BPF_PROG_TYPE_KPROBE; 82 | } else if (is_tracepoint) { 83 | prog_type = BPF_PROG_TYPE_TRACEPOINT; 84 | } else if (is_xdp) { 85 | prog_type = BPF_PROG_TYPE_XDP; 86 | } else if (is_perf_event) { 87 | prog_type 
= BPF_PROG_TYPE_PERF_EVENT; 88 | } else if (is_cgroup_skb) { 89 | prog_type = BPF_PROG_TYPE_CGROUP_SKB; 90 | } else if (is_cgroup_sk) { 91 | prog_type = BPF_PROG_TYPE_CGROUP_SOCK; 92 | } else { 93 | printf("Unknown event '%s'\n", event); 94 | return -1; 95 | } 96 | 97 | fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, 98 | bpf_log_buf, BPF_LOG_BUF_SIZE); 99 | if (fd < 0) { 100 | printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); 101 | return -1; 102 | } 103 | 104 | prog_fd[prog_cnt++] = fd; 105 | 106 | if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) 107 | return 0; 108 | 109 | if (is_socket) { 110 | event += 6; 111 | if (*event != '/') 112 | return 0; 113 | event++; 114 | if (!isdigit(*event)) { 115 | printf("invalid prog number\n"); 116 | return -1; 117 | } 118 | return populate_prog_array(event, fd); 119 | } 120 | 121 | if (is_kprobe || is_kretprobe) { 122 | if (is_kprobe) 123 | event += 7; 124 | else 125 | event += 10; 126 | 127 | if (*event == 0) { 128 | printf("event name cannot be empty\n"); 129 | return -1; 130 | } 131 | 132 | if (isdigit(*event)) 133 | return populate_prog_array(event, fd); 134 | 135 | snprintf(buf, sizeof(buf), 136 | "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", 137 | is_kprobe ? 'p' : 'r', event, event); 138 | err = system(buf); 139 | if (err < 0) { 140 | printf("failed to create kprobe '%s' error '%s'\n", 141 | event, strerror(errno)); 142 | return -1; 143 | } 144 | 145 | strcpy(buf, DEBUGFS); 146 | strcat(buf, "events/kprobes/"); 147 | strcat(buf, event); 148 | strcat(buf, "/id"); 149 | } else if (is_tracepoint) { 150 | event += 11; 151 | 152 | if (*event == 0) { 153 | printf("event name cannot be empty\n"); 154 | return -1; 155 | } 156 | strcpy(buf, DEBUGFS); 157 | strcat(buf, "events/"); 158 | strcat(buf, event); 159 | strcat(buf, "/id"); 160 | } 161 | 162 | efd = open(buf, O_RDONLY, 0); 163 | if (efd < 0) { 164 | printf("failed to open event %s\n", event); 165 | return -1; 166 | } 167 | 168 | err = read(efd, buf, sizeof(buf)); 169 | if (err < 0 || err >= sizeof(buf)) { 170 | printf("read from '%s' failed '%s'\n", event, strerror(errno)); 171 | return -1; 172 | } 173 | 174 | close(efd); 175 | 176 | buf[err] = 0; 177 | id = atoi(buf); 178 | attr.config = id; 179 | 180 | efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); 181 | if (efd < 0) { 182 | printf("event %d fd %d err %s\n", id, efd, strerror(errno)); 183 | return -1; 184 | } 185 | event_fd[prog_cnt - 1] = efd; 186 | ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); 187 | ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); 188 | 189 | return 0; 190 | } 191 | 192 | static int load_maps(struct bpf_map_data *maps, int nr_maps, 193 | fixup_map_cb fixup_map) 194 | { 195 | int i; 196 | 197 | for (i = 0; i < nr_maps; i++) { 198 | if (fixup_map) { 199 | fixup_map(&maps[i], i); 200 | /* Allow userspace to assign map FD prior to creation */ 201 | if (maps[i].fd != -1) { 202 | map_fd[i] = maps[i].fd; 203 | continue; 204 | } 205 | } 206 | 207 | if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 208 | maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { 209 | int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; 210 | 211 | map_fd[i] = bpf_create_map_in_map(maps[i].def.type, 212 | maps[i].def.key_size, 213 | inner_map_fd, 214 | maps[i].def.max_entries, 215 | maps[i].def.map_flags); 216 | } else { 217 | map_fd[i] = bpf_create_map(maps[i].def.type, 218 | maps[i].def.key_size, 219 | maps[i].def.value_size, 220 | maps[i].def.max_entries, 221 | 
maps[i].def.map_flags); 222 | } 223 | if (map_fd[i] < 0) { 224 | printf("failed to create a map: %d %s\n", 225 | errno, strerror(errno)); 226 | return 1; 227 | } 228 | maps[i].fd = map_fd[i]; 229 | 230 | if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) 231 | prog_array_fd = map_fd[i]; 232 | } 233 | return 0; 234 | } 235 | 236 | static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, 237 | GElf_Shdr *shdr, Elf_Data **data) 238 | { 239 | Elf_Scn *scn; 240 | 241 | scn = elf_getscn(elf, i); 242 | if (!scn) 243 | return 1; 244 | 245 | if (gelf_getshdr(scn, shdr) != shdr) 246 | return 2; 247 | 248 | *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); 249 | if (!*shname || !shdr->sh_size) 250 | return 3; 251 | 252 | *data = elf_getdata(scn, 0); 253 | if (!*data || elf_getdata(scn, *data) != NULL) 254 | return 4; 255 | 256 | return 0; 257 | } 258 | 259 | static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, 260 | GElf_Shdr *shdr, struct bpf_insn *insn, 261 | struct bpf_map_data *maps, int nr_maps) 262 | { 263 | int i, nrels; 264 | 265 | nrels = shdr->sh_size / shdr->sh_entsize; 266 | 267 | for (i = 0; i < nrels; i++) { 268 | GElf_Sym sym; 269 | GElf_Rel rel; 270 | unsigned int insn_idx; 271 | bool match = false; 272 | int j, map_idx; 273 | 274 | gelf_getrel(data, i, &rel); 275 | 276 | insn_idx = rel.r_offset / sizeof(struct bpf_insn); 277 | 278 | gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); 279 | 280 | if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 281 | printf("invalid relo for insn[%d].code 0x%x\n", 282 | insn_idx, insn[insn_idx].code); 283 | return 1; 284 | } 285 | insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 286 | 287 | /* Match FD relocation against recorded map_data[] offset */ 288 | for (map_idx = 0; map_idx < nr_maps; map_idx++) { 289 | if (maps[map_idx].elf_offset == sym.st_value) { 290 | match = true; 291 | break; 292 | } 293 | } 294 | if (match) { 295 | insn[insn_idx].imm = maps[map_idx].fd; 296 | } else { 297 | printf("invalid relo for insn[%d] no map_data match\n", 298 | insn_idx); 299 | return 1; 300 | } 301 | } 302 | 303 | return 0; 304 | } 305 | 306 | static int cmp_symbols(const void *l, const void *r) 307 | { 308 | const GElf_Sym *lsym = (const GElf_Sym *)l; 309 | const GElf_Sym *rsym = (const GElf_Sym *)r; 310 | 311 | if (lsym->st_value < rsym->st_value) 312 | return -1; 313 | else if (lsym->st_value > rsym->st_value) 314 | return 1; 315 | else 316 | return 0; 317 | } 318 | 319 | static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, 320 | Elf *elf, Elf_Data *symbols, int strtabidx) 321 | { 322 | int map_sz_elf, map_sz_copy; 323 | bool validate_zero = false; 324 | Elf_Data *data_maps; 325 | int i, nr_maps; 326 | GElf_Sym *sym; 327 | Elf_Scn *scn; 328 | int copy_sz; 329 | 330 | if (maps_shndx < 0) 331 | return -EINVAL; 332 | if (!symbols) 333 | return -EINVAL; 334 | 335 | /* Get data for maps section via elf index */ 336 | scn = elf_getscn(elf, maps_shndx); 337 | if (scn) 338 | data_maps = elf_getdata(scn, NULL); 339 | if (!scn || !data_maps) { 340 | printf("Failed to get Elf_Data from maps section %d\n", 341 | maps_shndx); 342 | return -EINVAL; 343 | } 344 | 345 | /* For each map get corrosponding symbol table entry */ 346 | sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); 347 | for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { 348 | assert(nr_maps < MAX_MAPS+1); 349 | if (!gelf_getsym(symbols, i, &sym[nr_maps])) 350 | continue; 351 | if (sym[nr_maps].st_shndx != maps_shndx) 352 | continue; 
353 | /* Only increment iif maps section */ 354 | nr_maps++; 355 | } 356 | 357 | /* Align to map_fd[] order, via sort on offset in sym.st_value */ 358 | qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); 359 | 360 | /* Keeping compatible with ELF maps section changes 361 | * ------------------------------------------------ 362 | * The program size of struct bpf_map_def is known by loader 363 | * code, but struct stored in ELF file can be different. 364 | * 365 | * Unfortunately sym[i].st_size is zero. To calculate the 366 | * struct size stored in the ELF file, assume all struct have 367 | * the same size, and simply divide with number of map 368 | * symbols. 369 | */ 370 | map_sz_elf = data_maps->d_size / nr_maps; 371 | map_sz_copy = sizeof(struct bpf_map_def); 372 | if (map_sz_elf < map_sz_copy) { 373 | /* 374 | * Backward compat, loading older ELF file with 375 | * smaller struct, keeping remaining bytes zero. 376 | */ 377 | map_sz_copy = map_sz_elf; 378 | } else if (map_sz_elf > map_sz_copy) { 379 | /* 380 | * Forward compat, loading newer ELF file with larger 381 | * struct with unknown features. Assume zero means 382 | * feature not used. Thus, validate rest of struct 383 | * data is zero. 384 | */ 385 | validate_zero = true; 386 | } 387 | 388 | /* Memcpy relevant part of ELF maps data to loader maps */ 389 | for (i = 0; i < nr_maps; i++) { 390 | unsigned char *addr, *end; 391 | struct bpf_map_def *def; 392 | const char *map_name; 393 | size_t offset; 394 | 395 | map_name = elf_strptr(elf, strtabidx, sym[i].st_name); 396 | maps[i].name = strdup(map_name); 397 | if (!maps[i].name) { 398 | printf("strdup(%s): %s(%d)\n", map_name, 399 | strerror(errno), errno); 400 | free(sym); 401 | return -errno; 402 | } 403 | 404 | /* Symbol value is offset into ELF maps section data area */ 405 | offset = sym[i].st_value; 406 | def = (struct bpf_map_def *)(data_maps->d_buf + offset); 407 | maps[i].elf_offset = offset; 408 | memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); 409 | memcpy(&maps[i].def, def, map_sz_copy); 410 | 411 | /* Verify no newer features were requested */ 412 | if (validate_zero) { 413 | addr = (unsigned char*) def + map_sz_copy; 414 | end = (unsigned char*) def + map_sz_elf; 415 | for (; addr < end; addr++) { 416 | if (*addr != 0) { 417 | free(sym); 418 | return -EFBIG; 419 | } 420 | } 421 | } 422 | } 423 | 424 | free(sym); 425 | return nr_maps; 426 | } 427 | 428 | static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) 429 | { 430 | int fd, i, ret, maps_shndx = -1, strtabidx = -1; 431 | Elf *elf; 432 | GElf_Ehdr ehdr; 433 | GElf_Shdr shdr, shdr_prog; 434 | Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; 435 | char *shname, *shname_prog; 436 | int nr_maps = 0; 437 | 438 | /* reset global variables */ 439 | kern_version = 0; 440 | memset(license, 0, sizeof(license)); 441 | memset(processed_sec, 0, sizeof(processed_sec)); 442 | 443 | if (elf_version(EV_CURRENT) == EV_NONE) 444 | return 1; 445 | 446 | fd = open(path, O_RDONLY, 0); 447 | if (fd < 0) 448 | return 1; 449 | 450 | elf = elf_begin(fd, ELF_C_READ, NULL); 451 | 452 | if (!elf) 453 | return 1; 454 | 455 | if (gelf_getehdr(elf, &ehdr) != &ehdr) 456 | return 1; 457 | 458 | /* clear all kprobes */ 459 | i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); 460 | 461 | /* scan over all elf sections to get license and map info */ 462 | for (i = 1; i < ehdr.e_shnum; i++) { 463 | 464 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 465 | continue; 466 | 467 | if (0) /* helpful for 
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
	int fd, i, ret, maps_shndx = -1, strtabidx = -1;
	Elf *elf;
	GElf_Ehdr ehdr;
	GElf_Shdr shdr, shdr_prog;
	Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
	char *shname, *shname_prog;
	int nr_maps = 0;

	/* reset global variables */
	kern_version = 0;
	memset(license, 0, sizeof(license));
	memset(processed_sec, 0, sizeof(processed_sec));

	if (elf_version(EV_CURRENT) == EV_NONE)
		return 1;

	fd = open(path, O_RDONLY, 0);
	if (fd < 0)
		return 1;

	elf = elf_begin(fd, ELF_C_READ, NULL);
	if (!elf)
		return 1;

	if (gelf_getehdr(elf, &ehdr) != &ehdr)
		return 1;

	/* clear all kprobes */
	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");

	/* scan over all elf sections to get license and map info */
	for (i = 1; i < ehdr.e_shnum; i++) {

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (0) /* helpful for llvm debugging */
			printf("section %d:%s data %p size %zd link %d flags %d\n",
			       i, shname, data->d_buf, data->d_size,
			       shdr.sh_link, (int) shdr.sh_flags);

		if (strcmp(shname, "license") == 0) {
			processed_sec[i] = true;
			memcpy(license, data->d_buf, data->d_size);
		} else if (strcmp(shname, "version") == 0) {
			processed_sec[i] = true;
			if (data->d_size != sizeof(int)) {
				printf("invalid size of version section %zd\n",
				       data->d_size);
				return 1;
			}
			memcpy(&kern_version, data->d_buf, sizeof(int));
		} else if (strcmp(shname, "maps") == 0) {
			int j;

			maps_shndx = i;
			data_maps = data;
			for (j = 0; j < MAX_MAPS; j++)
				map_data[j].fd = -1;
		} else if (shdr.sh_type == SHT_SYMTAB) {
			strtabidx = shdr.sh_link;
			symbols = data;
		}
	}

	ret = 1;

	if (!symbols) {
		printf("missing SHT_SYMTAB section\n");
		goto done;
	}

	if (data_maps) {
		nr_maps = load_elf_maps_section(map_data, maps_shndx,
						elf, symbols, strtabidx);
		if (nr_maps < 0) {
			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
			       nr_maps, strerror(-nr_maps));
			ret = 1;
			goto done;
		}
		if (load_maps(map_data, nr_maps, fixup_map))
			goto done;
		map_data_count = nr_maps;

		processed_sec[maps_shndx] = true;
	}

	/* load programs that need map fixup (relocations) */
	for (i = 1; i < ehdr.e_shnum; i++) {
		if (processed_sec[i])
			continue;

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (shdr.sh_type == SHT_REL) {
			struct bpf_insn *insns;

			if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
				    &shdr_prog, &data_prog))
				continue;

			if (shdr_prog.sh_type != SHT_PROGBITS ||
			    !(shdr_prog.sh_flags & SHF_EXECINSTR))
				continue;

			insns = (struct bpf_insn *) data_prog->d_buf;

			processed_sec[shdr.sh_info] = true;
			processed_sec[i] = true;

			if (parse_relo_and_apply(data, symbols, &shdr, insns,
						 map_data, nr_maps))
				continue;

			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
			    memcmp(shname_prog, "xdp", 3) == 0 ||
			    memcmp(shname_prog, "perf_event", 10) == 0 ||
			    memcmp(shname_prog, "socket", 6) == 0 ||
			    memcmp(shname_prog, "cgroup/", 7) == 0)
				load_and_attach(shname_prog, insns,
						data_prog->d_size);
		}
	}

	/* load programs that don't use maps */
	for (i = 1; i < ehdr.e_shnum; i++) {

		if (processed_sec[i])
			continue;

		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
			continue;

		if (memcmp(shname, "kprobe/", 7) == 0 ||
		    memcmp(shname, "kretprobe/", 10) == 0 ||
		    memcmp(shname, "tracepoint/", 11) == 0 ||
		    memcmp(shname, "xdp", 3) == 0 ||
		    memcmp(shname, "perf_event", 10) == 0 ||
		    memcmp(shname, "socket", 6) == 0 ||
		    memcmp(shname, "cgroup/", 7) == 0)
			load_and_attach(shname, data->d_buf, data->d_size);
	}

	ret = 0;
done:
	close(fd);
	return ret;
}

int load_bpf_file(char *path)
{
	return do_load_bpf_file(path, NULL);
}

int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
{
	return do_load_bpf_file(path, fixup_map);
}

void read_trace_pipe(void)
{
	int trace_fd;

	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
	if (trace_fd < 0)
		return;

	while (1) {
		static char buf[4096];
		ssize_t sz;

		/* leave room for the terminating NUL byte */
		sz = read(trace_fd, buf, sizeof(buf) - 1);
		if (sz > 0) {
			buf[sz] = 0;
			puts(buf);
		}
	}
}

#define MAX_SYMS 300000
static struct ksym syms[MAX_SYMS];
static int sym_cnt;

static int ksym_cmp(const void *p1, const void *p2)
{
	/* compare as longs; returning the raw difference could overflow int */
	long a = ((struct ksym *)p1)->addr;
	long b = ((struct ksym *)p2)->addr;

	return a < b ? -1 : a > b ? 1 : 0;
}

int load_kallsyms(void)
{
	FILE *f = fopen("/proc/kallsyms", "r");
	char func[256], buf[256];
	char symbol;
	void *addr;
	int i = 0;

	if (!f)
		return -ENOENT;

	while (!feof(f)) {
		if (!fgets(buf, sizeof(buf), f))
			break;
		if (sscanf(buf, "%p %c %255s", &addr, &symbol, func) != 3)
			break;
		if (!addr)
			continue;
		if (i >= MAX_SYMS) {
			fclose(f);
			return -EFBIG;
		}
		syms[i].addr = (long) addr;
		syms[i].name = strdup(func);
		i++;
	}
	fclose(f);
	sym_cnt = i;
	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
	return 0;
}

struct ksym *ksym_search(long key)
{
	int start = 0, end = sym_cnt;
	long result;

	while (start < end) {
		size_t mid = start + (end - start) / 2;

		result = key - syms[mid].addr;
		if (result < 0)
			end = mid;
		else if (result > 0)
			start = mid + 1;
		else
			return &syms[mid];
	}

	if (start >= 1 && syms[start - 1].addr < key &&
	    key < syms[start].addr)
		/* valid ksym */
		return &syms[start - 1];

	/* out of range. return _stext */
	return &syms[0];
}

int set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
	struct sockaddr_nl sa;
	int sock, seq = 0, len, ret = -1;
	char buf[4096];
	struct nlattr *nla, *nla_xdp;
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifinfo;
		char attrbuf[64];
	} req;
	struct nlmsghdr *nh;
	struct nlmsgerr *err;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock < 0) {
		printf("open netlink socket: %s\n", strerror(errno));
		return -1;
	}

	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		printf("bind to netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_type = RTM_SETLINK;
	req.nh.nlmsg_pid = 0;
	req.nh.nlmsg_seq = ++seq;
	req.ifinfo.ifi_family = AF_UNSPEC;
	req.ifinfo.ifi_index = ifindex;

	/* start nested attribute for XDP */
	nla = (struct nlattr *)(((char *)&req)
				+ NLMSG_ALIGN(req.nh.nlmsg_len));
	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
	nla->nla_len = NLA_HDRLEN;

	/* add XDP fd */
	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
	nla->nla_len += nla_xdp->nla_len;

	/* if user passed in any flags, add those too */
	if (flags) {
		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
		nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
		nla->nla_len += nla_xdp->nla_len;
	}

	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
		printf("send to netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	len = recv(sock, buf, sizeof(buf), 0);
	if (len < 0) {
		printf("recv from netlink: %s\n", strerror(errno));
		goto cleanup;
	}

	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
	     nh = NLMSG_NEXT(nh, len)) {
		if (nh->nlmsg_pid != getpid()) {
			printf("Wrong pid %d, expected %d\n",
			       nh->nlmsg_pid, getpid());
			goto cleanup;
		}
		if (nh->nlmsg_seq != seq) {
			printf("Wrong seq %d, expected %d\n",
			       nh->nlmsg_seq, seq);
			goto cleanup;
		}
		switch (nh->nlmsg_type) {
		case NLMSG_ERROR:
			err = (struct nlmsgerr *)NLMSG_DATA(nh);
			if (!err->error)
				continue;
			printf("nlmsg error %s\n", strerror(-err->error));
			goto cleanup;
		case NLMSG_DONE:
			break;
		}
	}

	ret = 0;

cleanup:
	close(sock);
	return ret;
}
--------------------------------------------------------------------------------
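
For reference, a minimal sketch of how set_link_xdp_fd() is typically combined with load_bpf_file() in a small XDP loader. The object name xdp_prog_kern.o, the interface argument, and the prog_fd[] array (assumed to be exported by bpf_load.h, as in the kernel samples this loader is derived from) are assumptions of this sketch, not files or programs shipped in this repository:

/* xdp_attach_sketch.c - hypothetical example, not built by this Makefile */
#include <stdio.h>
#include <net/if.h>
#include "libbpf.h"
#include "bpf_load.h"

int main(int argc, char **argv)
{
	int ifindex;

	if (argc != 2) {
		printf("usage: %s <ifname>\n", argv[0]);
		return 1;
	}

	ifindex = if_nametoindex(argv[1]);
	if (!ifindex) {
		printf("unknown interface %s\n", argv[1]);
		return 1;
	}

	/* load every SEC("...") program and map found in the object */
	if (load_bpf_file("xdp_prog_kern.o")) {
		printf("%s", bpf_log_buf);
		return 1;
	}

	/* attach the first loaded program to the interface;
	 * prog_fd[0] is assumed to be provided by bpf_load.h */
	if (set_link_xdp_fd(ifindex, prog_fd[0], 0) < 0) {
		printf("attaching XDP program failed\n");
		return 1;
	}

	getchar();	/* keep the program attached until a key is pressed */

	/* passing fd == -1 removes the XDP program again */
	set_link_xdp_fd(ifindex, -1, 0);
	return 0;
}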