├── .gitignore
├── Doc
    ├── Notes
    │   ├── AF_XDP
    │   │   ├── AF_XDP初步探索.assets
    │   │   │   ├── image-20211108150438795.png
    │   │   │   ├── image-20211108153129856.png
    │   │   │   ├── image-20211108154528997.png
    │   │   │   ├── image-20211108154911059.png
    │   │   │   ├── image-20211108223028565.png
    │   │   │   ├── image-20211108223725043.png
    │   │   │   ├── image-20211108223952958.png
    │   │   │   ├── image-20211108224515886.png
    │   │   │   ├── image-20211109120556512.png
    │   │   │   ├── image-20211109120656105.png
    │   │   │   ├── image-20211109215310651.png
    │   │   │   └── image-20211109215330513.png
    │   │   ├── AF_XDP初步探索.md
    │   │   ├── 包复制技术.assets
    │   │   │   └── image-20220723171851322.png
    │   │   └── 包复制技术.md
    │   ├── MPTCPV1_scheduler
    │   │   ├── 编译内核.assets
    │   │   │   ├── image-20221029172426258.png
    │   │   │   ├── image-20221029172606063.png
    │   │   │   ├── image-20221030121511464.png
    │   │   │   ├── image-20221030134001716.png
    │   │   │   ├── image-20221030151521474.png
    │   │   │   ├── image-20221030151556274-16671141804591.png
    │   │   │   ├── image-20221030151556274.png
    │   │   │   ├── image-20221030155827800.png
    │   │   │   ├── image-20221030161928637.png
    │   │   │   ├── image-20221031141043437.png
    │   │   │   ├── image-20221031144952647.png
    │   │   │   ├── image-20221031145125353.png
    │   │   │   ├── image-20221031163152128.png
    │   │   │   ├── image-20221031163214989.png
    │   │   │   ├── image-20221031173807981.png
    │   │   │   ├── image-20221101104127087.png
    │   │   │   ├── image-20221101114221785.png
    │   │   │   ├── image-20221101155538090.png
    │   │   │   ├── image-20221102103323083.png
    │   │   │   ├── image-20221102104209026.png
    │   │   │   ├── image-20221102105116812.png
    │   │   │   └── image-20221102105149134.png
    │   │   └── 编译内核.md
    │   ├── eBPF_HW_offload
    │   │   └── eBPF_HW_offload.md
    │   ├── eBPF_MAP_user
    │   │   ├── eBPF_MAP_note.assets
    │   │   │   └── image-20220520183538699.png
    │   │   └── eBPF_MAP_note.md
    │   ├── eBPF_TCP_ca
    │   │   └── eBPF_TCP_ca.md
    │   ├── eBPF_bpflink_kernel
    │   │   └── eBPF_bpflink_kernel.md
    │   ├── eBPF_helperfunc_kernel
    │   │   └── eBPF_helperfunc_kernel.md
    │   ├── eBPF_introduction_slides
    │   │   ├── README.md
    │   │   └── slides.md
    │   ├── eBPF_kernel_document_note
    │   │   ├── eBPF_kernel_document_note.assets
    │   │   │   ├── image-20220901150942553.png
    │   │   │   ├── image-20220901152829447.png
    │   │   │   ├── image-20220901155226898.png
    │   │   │   ├── image-20220901160259370.png
    │   │   │   ├── image-20220901161031445.png
    │   │   │   ├── image-20220901163611445.png
    │   │   │   ├── image-20220901164811553.png
    │   │   │   ├── image-20220901165036148.png
    │   │   │   ├── image-20220901165204233.png
    │   │   │   ├── image-20220901165233885.png
    │   │   │   ├── image-20220901165341202.png
    │   │   │   ├── image-20220901165558510.png
    │   │   │   ├── image-20220901165809164.png
    │   │   │   ├── image-20220901170115110.png
    │   │   │   ├── image-20220901170432600.png
    │   │   │   └── image-20220901171034117.png
    │   │   └── eBPF_kernel_document_note.md
    │   ├── eBPF_map_kernel
    │   │   ├── Map_Ops_BPF_MAP_TYPE_ARRAY.assets
    │   │   │   └── image-20221226102621676.png
    │   │   ├── Map_Ops_BPF_MAP_TYPE_ARRAY.md
    │   │   ├── Map_Ops_BPF_MAP_TYPE_HASH .assets
    │   │   │   ├── image-20221109162954511.png
    │   │   │   ├── image-20221130164200740.png
    │   │   │   ├── image-20221130165305704.png
    │   │   │   ├── image-20221201161747521.png
    │   │   │   └── image-20221201161850374.png
    │   │   ├── Map_Ops_BPF_MAP_TYPE_HASH .md
    │   │   ├── Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets
    │   │   │   ├── image-20221109163807298.png
    │   │   │   ├── image-20221110172442249.png
    │   │   │   ├── image-20221110172608309.png
    │   │   │   ├── image-20221110192516048.png
    │   │   │   ├── image-20221115212332238.png
    │   │   │   ├── image-20221115212335463.png
    │   │   │   └── image-20221115213420120.png
    │   │   ├── Map_Ops_BPF_MAP_TYPE_STRUCT_OP.md
    │   │   ├── eBPF_map_kernel.assets
    │   │   │   ├── image-20221108110550651.png
    │   │   │   ├── image-20221108112647370.png
    │   │   │   ├── image-20221108114101804.png
    │   │   │   ├── image-20221108165434824.png
    │   │   │   ├── image-20221108170547309.png
    │   │   │   ├── image-20221109153041179.png
    │   │   │   ├── image-20221109154550592.png
    │   │   │   ├── image-20221109160126858.png
    │   │   │   └── image-20221109160314579.png
    │   │   └── eBPF_map_kernel.md
    │   ├── eBPF_prog_kernel
    │   │   ├── eBPF_prog_kernel.assets
    │   │   │   ├── image-20221102213705213.png
    │   │   │   ├── image-20221103161824697.png
    │   │   │   └── image-20221103163709300.png
    │   │   ├── eBPF_prog_kernel.md
    │   │   └── eBPF_prog_kernel.pptx
    │   ├── eBPF_verifier_note
    │   │   ├── eBPF_varifier_note.assets
    │   │   │   ├── image-20220627143836218.png
    │   │   │   ├── image-20220627144244437.png
    │   │   │   └── image-20220627144933988.png
    │   │   └── eBPF_varifier_note.md
    │   ├── 使用Ftrace修改函数参数.md
    │   ├── 利用bpf修改函数返回值的探索
    │   │   ├── 利用bpf修改函数返回值的探索.assets
    │   │   │   ├── image-20210922182718960.png
    │   │   │   ├── image-20210922183431404.png
    │   │   │   ├── image-20210922184227041.png
    │   │   │   └── image-20210922184419413.png
    │   │   └── 利用bpf修改函数返回值的探索.md
    │   ├── 利用bpf修改用户空间函数参数
    │   │   ├── 利用bpf修改用户空间函数参数.assets
    │   │   │   ├── image-20211013230335647.png
    │   │   │   ├── image-20211013230702277.png
    │   │   │   ├── image-20211013230710713.png
    │   │   │   ├── image-20211013231022799.png
    │   │   │   ├── image-20211013231143806.png
    │   │   │   ├── image-20211013232705596.png
    │   │   │   └── image-20211013233018492.png
    │   │   └── 利用bpf修改用户空间函数参数.md
    │   └── 利用xdp修改packet的实践
    │   │   ├── 利用xdp修改packet的实践.assets
    │   │       ├── image-20220115144744384.png
    │   │       ├── image-20220116211129370.png
    │   │       ├── image-20220117144718144-5218977.png
    │   │       ├── image-20220117160645160.png
    │   │       ├── image-20220117161105612.png
    │   │       ├── image-20220117161317391.png
    │   │       ├── image-20220117165340867.png
    │   │       └── image-20220117170708671.png
    │   │   └── 利用xdp修改packet的实践.md
    ├── eBPF_Resources.assets
    │   └── image-20220901115500782.png
    └── eBPF_Resources.md
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | 


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108150438795.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108150438795.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108153129856.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108153129856.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108154528997.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108154528997.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108154911059.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108154911059.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108223028565.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108223028565.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108223725043.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108223725043.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108223952958.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108223952958.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108224515886.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211108224515886.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109120556512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109120556512.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109120656105.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109120656105.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109215310651.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109215310651.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109215330513.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/AF_XDP初步探索.assets/image-20211109215330513.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/AF_XDP初步探索.md:
--------------------------------------------------------------------------------
  1 | # AF_XDP初步探索
  2 | 
  3 | ## AF_XDP原理
  4 | 
  5 | **概述** 
  6 | 
  7 | AF_XDP是一类特殊的socket也被称为XSK, 是为了高性能包处理，提出的一项技术，AF_XDP配合XDP使用。
  8 | 
  9 | XSK 建立之后需要绑定在**特定网卡的特定队列**上， 同时在该设备上加载 xdp 程序。当网络包到达的时候，xdp程序可以根据网络包的内容，选择将packet直接重定向到XSK中，绕过内核网络栈，实现在用户态进行包处理。和DPDK不同的是，如果网络设备的驱动支持，AF_XDP技术可以通过映射用户态和内核态的内存，实现无需将packet拷贝到用户态(ZERO_COPY模式)，进一步提升处理效率。
 10 | 
 11 | 除了在接收路径上对packet进行重定向之外，也可以通过 XSK 绕过内核网络栈直接将packet发送出去。
 12 | 
 13 | 因此结合我们项目的需求，我认为利用AF_XDP实现双网络栈从而对 mptcp进行优化是可行的。但是这种优化有一个问题，必须借助XSK, 因此感觉**无法对用户层完全透明** 。
 14 | 
 15 | **具体原理** 
 16 | 
 17 | AF_XDP 的核心是下面这几个概念： 
 18 | 
 19 | ### UMEM 
 20 | 
 21 | UMEM是一块虚拟的连续的内存区域，UMEM具有以下几个特点： 
 22 | 
 23 | 1. UMEM 由一个个内存块(chunks)组成, 一个内存块称为一个frame。 一个frame的大小通常是4kB。
 24 | 2. 每一个XSK都需要attach到一块UMEM上，而一块UMEM可以attach多个XSK, 也就是同一 UMEM可以由多个XSK共享。
 25 | 3. XSK接收到的packet（通过XDP程序重定向）和要发送的packet，其内容都保存在UMEM中。
 26 | 
 27 | ### RINGS
 28 | 
 29 | UMEM一共有 4 种 ring。ring是一块环形的缓冲区，是一个单生产者单消费者的环形队列。(这意味着如果使用多线程，需要application自己实现同步操作)
 30 | 
 31 | 整体上来说有两种ring,  生产者ring 和消费者ring。这种ring的类型是根据用户态程序扮演的角色来划分的
 32 | 
 33 | * 生产者ring, 用户态程序是生产者往 ring写数据, 内核是消费者.（tx ring, fill ring) 
 34 | * 消费者ring, 用户态程序是消费者消费ring的数据, 内核是生产者
 35 | 
 36 | 源码中的定义如下： 
 37 | 
 38 | ```c
 39 | /* Do not access these members directly. Use the functions below. */
 40 | #define DEFINE_XSK_RING(name) \
 41 | struct name { \
 42 | 	__u32 cached_prod; \
 43 | 	__u32 cached_cons; \
 44 | 	__u32 mask; \
 45 | 	__u32 size; \
 46 | 	__u32 *producer; \
 47 | 	__u32 *consumer; \
 48 | 	void *ring; \
 49 | 	__u32 *flags; \
 50 | }
 51 | 
 52 | DEFINE_XSK_RING(xsk_ring_prod);
 53 | DEFINE_XSK_RING(xsk_ring_cons);
 54 | ```
 55 | 
 56 | 对于生产者ring 和 消费者 ring 除了结构体名称不同之外，其余都相同。
 57 | 
 58 | 数据结构含义如下： 
 59 | 
 60 | * cached_prod:  缓存的生产者下标 
 61 | * producer:  当前的生产者的位置
 62 | * cached_cons: 缓存的消费者下标 
 63 | * consumer: 当前的消费者位置
 64 | * ring :  环形队列数据区， 环形队列本质上是一个数组，数组长度是用户设定的环形队列的长度，数组的元素是指向 UMEM 的地址（这里的地址指的是相对于UMEM起始地址的偏移, 所以ring保存的是一个偏移量） 
 65 | * size : 环形队列的长度, 所有 ring 的大小都必须是 2 的次方
 66 | * mask : 对 idx (后文解释)取掩码计算数组的下标， mask = size - 1。
 67 | * flags : 一些设置的flag 
 68 | 
 69 | #### 生产者ring 
 70 | 
 71 | 用户态程序扮演生产者的角色，为了完成一次 produce, 用户态程序需要： 
 72 | 
 73 | 1. reserve(提前预留要生产的数目)
 74 | 2. 往ring里写数据（UMEM的偏移量）
 75 | 3. submit， 完成一次生产
 76 | 
 77 | 这三个步骤，操作 cached_prod, cached_cons, producer 指针。并且有相应的帮助函数： 
 78 | 
 79 | ```c
 80 | static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
 81 | {
 82 | 	__u32 free_entries = r->cached_cons - r->cached_prod;
 83 | 
 84 | 	if (free_entries >= nb)
 85 | 		return free_entries;
 86 | 
 87 | 	/* Refresh the local tail pointer.
 88 | 	 * cached_cons is r->size bigger than the real consumer pointer so
 89 | 	 * that this addition can be avoided in the more frequently
 90 | 	 * executed code that computs free_entries in the beginning of
 91 | 	 * this function. Without this optimization it whould have been
 92 | 	 * free_entries = r->cached_prod - r->cached_cons + r->size.
 93 | 	 */
 94 | 	r->cached_cons = *r->consumer + r->size;
 95 | 
 96 | 	return r->cached_cons - r->cached_prod;
 97 | }
 98 | 
 99 | static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod,
100 | 					    size_t nb, __u32 *idx)
101 | {
102 | 	if (xsk_prod_nb_free(prod, nb) < nb)
103 | 		return 0;
104 | 
105 | 	*idx = prod->cached_prod;
106 | 	prod->cached_prod += nb;
107 | 
108 | 	return nb;
109 | }
110 | 
111 | static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb)
112 | {
113 | 	/* Make sure everything has been written to the ring before indicating
114 | 	 * this to the kernel by writing the producer pointer.
115 | 	 */
116 | 	libbpf_smp_wmb();
117 | 
118 | 	*prod->producer += nb;
119 | }
120 | ```
121 | 
122 | 
123 | 
124 | **xsk_prod_nb_free** 
125 | 
126 | * param: 
127 | 
128 |   1. xsk_ring_prod *r : ring 指针
129 | 
130 |   2. __u32 nb： number of blocks (用户给定的 frame数目)
131 | 
132 | * return: 
133 | 	实际可供生产者使用的帧数量(也可以理解成队列的元素个数)
134 | 
135 | 该函数给定 nb， 并返回指定队列的实际可用的帧数目。因为可用的帧数目其实是由返回值决定的，nb的主要作用是刷新 cached_cons （之后解释） 
136 | 
137 | **xsk_ring_prod__reserve**
138 | 
139 | * param: 
140 | 
141 |   1. xsk_ring_prod *prod : ring指针
142 | 
143 |   2.  size_t nb: 用户希望预留的 frame数目
144 | 
145 |   3.   __u32 *idx: 值返回参数，如果reserve成功，idx被设置为第一个可用的地址（地址idx到数组下标还需要经过mask转化）
146 | 
147 | * return:
148 | 	0 或 nb, 0 ： reserve失败， nb reserve成功
149 | 
150 | 该函数首先调用 xsk_prod_nb_free 获取当前可用的 frames 数目，如果 frames < nb ，返回 0，预分配失败。否则将idx值设置为 prod->cached_prod, 并且将 cached_prod的值往前移nb（因为只是预分配，所以操作cached_prod)
151 | 
152 | **xsk_ring_prod__submit** 
153 | 
154 | * param: 
155 |   1. xsk_ring_prod *prod: ring的指针
156 | 
157 |   2. size_t nb: 实际生产的数量
158 | 
159 | 该函数在预分配之后，用户态程序往ring中生产了 nb 个frame后，调用该函数提交生产的数目，该函数操作 producer指针前移nb, 真正完成一次生产。 这里的 `libbpf_smp_wmb()` 指的是内存栅栏，在源码中的定义为:`asm volatile("" : : : "memory")` 。(以下为个人理解）这是编译器级别的内存栅栏，这句话有两个作用，1. 确保 `libbpf_smp_wmb`之后的代码不会因为编译器优化放到`libbpf_smp_wmb`之前执行（禁止编译器进行乱序优化） 2. 确保读取的变量读取的是内存的最新的值，而不是寄存器或者是cache中的值。 在这里 `xsk_ring_prod__submit`是 static inline类型的函数，当内联展开的时候 `libbpf_smp_wmb`确保指令不会被乱序执行。同时producer指向的地址同时被用户态程序和内核态程序使用（将内核态地址使用mmap 映射到用户态），内存栅栏也确保submit的时候能读到最新的值。
160 | 
161 | **生产者ring的工作过程** 
162 | 
163 | 现在阐述 producer ring 具体是如何工作。
164 | 
165 | 假设 `ring->size = 8`  因此数组的下标从 0 到 7 
166 | 
167 | 那么 `ring->mask = 0x7 (ring->size-1) `
168 | 
169 | 为了方便表述符号规定如下: 
170 | 
171 | * `consumer` = `*ring->consumer`
172 | * `producer`=`*ring->producer`
173 | * `cached_prod `= `ring->cached_prod`
174 | * `cached_cons` = `ring->cached_cons` 
175 | 
176 | **初始化** 
177 | 
178 | ![image-20211108150438795](AF_XDP初步探索.assets/image-20211108150438795.png)
179 | 
180 | 对于生产者ring来说，稍微难理解的是 `cached_cons`的作用，我个人理解 `cached_cons`的作用起到了标识ring尾部的效果，这样就可以利用 `cached_cons 和 cached_prod` 来计算当前可供生产者使用的 frames。
181 | 
182 | 初始化的时候 cached_cons 被设置为： `consumer + ring->size = 0 + 8 = 8` ， `xsk_prod_nb_free` 的第 15 行保证  `cached_cons =  consumer + size` 。当用户申请的 nb 数目较大，而根据 cached_cons 和 cached_prod 计算得到的 frames < nb 时（这可能是因为 cached_cons 没有及时更新导致的），第 15 行的代码会刷新 `cached_cons`将其重新设置到尾部，并再次进行计算。
183 | 
184 | 现在调用一次： `xsk_ring_prod__reserve(ring, 2, &idx)` ，用户态程序想要生产两个frame,  首先进行预分配(reserve)
185 | 
186 | **after reserve 2 frames**
187 | 
188 |  ![image-20211108153129856](AF_XDP初步探索.assets/image-20211108153129856.png)
189 | 
190 | 可以看到， `cached_prod`往前走了2步，这意味着如果此时调用 `xsk_prod_nb_free` 返回 6 （8-2），因为前两个frame虽然还没有被填入数据，但是已经被预分配出去了，所以剩下的可供生产的 frames 为 6。
191 | 
192 | 值返回参数 idx， 值被设置为 0 ， 也就是一开始 cached_prod 指向的位置。因此只需要一个for循环，递增idx即可生产数据了。
193 | 
194 | 有一个问题，idx的类型是 u32, 因此当 idx > 7时，显然是不能把 idx 直接当作数据下标来使用。 因此实际的下标为 ： `idx & mask` 
195 | 
196 | 就本例来说， 假如 idx = 9 , `9 & 0x7 = 1 ` 因此 idx = 1 和 idx = 9 指向数组的同一个元素。这里相比于使用取模法，选择了 mask 按位与的方式来实现环形队列，按位与的效率比取模操作效率要高，但前提是数组的大小是 2 的次方。因此，AF_XDP要求 ring 的 size 必须是 2 的次方。 
197 | 
198 | 现在生产了两个数据，调用 `xsk_ring_prod__submit(ring, 2)`
199 | 
200 | **after submit** 
201 | 
202 | ![image-20211108154528997](AF_XDP初步探索.assets/image-20211108154528997.png)
203 | 
204 | 这步操作比较简单，将 producer 前移 nb(2)
205 | 
206 | 对于生产者ring, 内核是消费者（内核消费的代码我没有看）， 内核消费ring, 并更新 consumer 和 cached_cons。 
207 | 
208 | 现在假设内核消费了一个数据： 
209 | 
210 | **after kernel consume 1 frame** 
211 | 
212 | ![image-20211108154911059](AF_XDP初步探索.assets/image-20211108154911059.png)
213 | 
214 | 可以看到，kernel 将 consumer 和 cached_cons 都往前移一位。 此时如果调用 `xsk_prod_nb_free` 将返回 7 （因为内核只消费了一个数据) 。这就是生产者ring 的一次完整的生产过程
215 | 
216 | #### 消费者ring 
217 | 
218 | 和生产者ring相反，用户态程序充当消费者，为了完成一次consume, 需要： 
219 | 
220 | 1. peek, 预消费, 获取当前能够消费的frames的数目
221 | 2. 消费数据
222 | 3. release, 释放已经消费过的frames(生产者能够重新生产)
223 | 
224 | 类似的，相关的帮助函数如下： 
225 | 
226 | ```c
227 | static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb)
228 | {
229 | 	__u32 entries = r->cached_prod - r->cached_cons;
230 | 
231 | 	if (entries == 0) {
232 | 		r->cached_prod = *r->producer;
233 | 		entries = r->cached_prod - r->cached_cons;
234 | 	}
235 | 
236 | 	return (entries > nb) ? nb : entries;
237 | }
238 | 
239 | static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons,
240 | 					 size_t nb, __u32 *idx)
241 | {
242 | 	size_t entries = xsk_cons_nb_avail(cons, nb);
243 | 
244 | 	if (entries > 0) {
245 | 		/* Make sure we do not speculatively read the data before
246 | 		 * we have received the packet buffers from the ring.
247 | 		 */
248 | 		libbpf_smp_rmb();
249 | 
250 | 		*idx = cons->cached_cons;
251 | 		cons->cached_cons += entries;
252 | 	}
253 | 
254 | 	return entries;
255 | }
256 | 
257 | static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb)
258 | {
259 | 	/* Make sure data has been read before indicating we are done
260 | 	 * with the entries by updating the consumer pointer.
261 | 	 */
262 | 	libbpf_smp_rwmb();
263 | 
264 | 	*cons->consumer += nb;
265 | }
266 | ```
267 | 
268 | **xsk_cons_nb_avail**
269 | 
270 | * param: 
271 | 
272 |   1. xsk_ring_cons *r :  ring的指针
273 |   2. nb : 用户需要消费的frames的数目
274 | 
275 | * return : 
276 | 
277 |   返回当前可供消费的frames数目， 如果实际可消费frames数目 > nb 返回 nb, 否则返回实际的数目
278 | 
279 | 该函数给定nb, 返回实际可以被消费的frame数目。 
280 | 
281 | **xsk_ring_cons__peek** 
282 | 
283 | * param :
284 | 
285 |   1. struct xsk_ring_cons *cons  ：ring指针
286 |   2. nb : 用户希望 peek 的 frames的数目
287 |   3. idx : 值返回参数，函数返回后，idx的值是第一个可以消费frame的地址（用法和 producer ring 的 idx 类似） 
288 | 
289 | * return : 
290 | 
291 |   含义与**xsk_cons_nb_avail** 返回值相同
292 | 
293 | 用户调用该函数，预消费nb个frame的数据，该函数修改 cached_cons ,  将可以消费的第一个地址的值赋给idx(之后可以用一个for循环递增idx，然后消费数据)
294 | 
295 | **xsk_ring_cons__release** 
296 | 
297 | * param :
298 |   1. cons : ring的指针
299 |   2. nb :  用户实际已经消费完成的，可以释放的frame的数目
300 | 
301 | 当用户态程序，预消费(xsk_ring_cons__peek)之后，实际消费之后，调用该函数完成消费。
302 | 
303 | **消费者ring工作过程** 
304 | 
305 | 使用的定义和生产者ring类似。
306 | 
307 | **初始化** 
308 | 
309 | ![image-20211108223028565](AF_XDP初步探索.assets/image-20211108223028565.png)
310 | 
311 | 和生产者ring有所不同，用户态程序充当的消费者角色， cached_cons在这里没有标识尾部的作用。因此初始化 consumer, producer, cached_prod, cached_cons 均为 0 
312 | 
313 | **内核生产了2个frame之后** 
314 | 
315 | ![image-20211108223725043](AF_XDP初步探索.assets/image-20211108223725043.png)
316 | 
317 | **after peek** 
318 | 
319 | 用户态程序peek了一个frame 
320 | 
321 | ![image-20211108223952958](AF_XDP初步探索.assets/image-20211108223952958.png)
322 | 
323 | **after release** 
324 | 
325 | 消费过程， idx 的含义和生产者ring类似。
326 | 
327 | ![image-20211108224515886](AF_XDP初步探索.assets/image-20211108224515886.png)
328 | 
329 | ### UMEM 4种特殊的 RING
330 | 
331 | #### Fill Ring
332 | 
333 | fill ring 是生产者ring,  用户态程序往fill ring中写入可用的 addr (UMEM的偏移，后文的地址如无特殊说明指的都是偏移)， 内核消费 fill ring， 将fill ring提供的地址用于接收路径。因此，fill ring的作用是将地址的所有权从用户态转移到内核态。 一个 UMEM**有且只有一个**fill ring。 如果用户态程序不往 fill ring 生产数据，那么 XSK将接收不到任何数据。
334 | 
335 | FILL RING是生产者ring ,生产的步骤 （reserve, produce, submit) 在前文生产者ring详细提及。在具体生产的时候，我们获取idx, 并调用函数 ： 
336 | 
337 | ```c
338 | static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill,
339 | 					      __u32 idx)
340 | {
341 | 	__u64 *addrs = (__u64 *)fill->ring;
342 | 
343 | 	return &addrs[idx & fill->mask];
344 | }
345 | ```
346 | 
347 | 该函数输入 idx , 并返回数组对应的元素的指针，用户态程序写入可用的地址（偏移量）即可。
348 | 
349 | #### Complete Ring
350 | 
351 | Complete ring 是消费者ring , 和 fill ring 相反。内核是生产者，往该 ring 写入使用完成了可以供用户态再次使用的地址，用户态程序消费该ring, 并回收地址。complete ring 将发送路径（tx ring) 使用完成的地址的归属权从内核态转移到用户态。一个UMEM**有且只有一个complete ring**。
352 | 
353 | Complete ring 和 fill ring 彼此互补，在实际使用，往往需要用户态程序实现地址管理和分配，在用户态程序初始阶段将空闲的地址写入fill ring。随着程序的运行，用户态程序不断消费complete ring, 回收地址，并在合适的时机将回收的地址再次写入fill ring ， 实现一个循环。
354 | 
355 | Complete ring 是消费者ring, 实际使用需要遵循消费的步骤（peek , 消费， release)。 在具体消费的时候，我们获取idx，并调用函数： 
356 | 
357 | ```c
358 | static inline const __u64 *
359 | xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx)
360 | {
361 | 	const __u64 *addrs = (const __u64 *)comp->ring;
362 | 
363 | 	return &addrs[idx & comp->mask];
364 | }
365 | ```
366 | 
367 | 该函数返回 ring 数组对应元素的指针，值得注意的是该指针被const 修饰，意味着我们只能获取地址，不能修改内容。
368 | 
369 | #### Rx Ring 
370 | 
371 | Rx ring 是消费者ring, 如果xdp程序将packet重定向到该XSK, 那么内核往该Ring写入数据。用户态程序消费该ring，获取packet的内容并处理packet。这意味着，对于XSK，接收packet不是通过recv实现的，而是通过消费rx ring 实现的。XSK可以配合 poll select epoll 使用，当poll返回的时候意味着有数据到达，此时消费rx ring 。每一个xdp socket (XSK) 都必须要有一个 rx ring 或者 tx ring 。
372 | 
373 | rx ring 的元素定义如下： 
374 | 
375 | ```c
376 | struct xdp_desc {
377 | 	__u64 addr;    // UMEM 偏移
378 | 	__u32 len;     // packet 长度，因为packet不一定占满整个 frame 
379 | 	__u32 options;
380 | };
381 | ```
382 | 
383 | rx ring 消费需要遵循 peek, consume , release
384 | 
385 | 在具体消费的时候，调用该函数： 
386 | 
387 | ```c
388 | static inline const struct xdp_desc *
389 | xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
390 | {
391 | 	const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring;
392 | 
393 | 	return &descs[idx & rx->mask];
394 | }
395 | ```
396 | 
397 | 返回 xdp_desc的指针。
398 | 
399 | 可以使用下面这个函数获取 packet的指针： 
400 | 
401 | ```c
402 | static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
403 | {
404 | 	return &((char *)umem_area)[addr];
405 | }
406 | ```
407 | 
408 | 配合 len 即可访问 packet。 
409 | 
410 | #### tx ring 
411 | 
412 | Tx ring是生产者ring， Tx ring 作用于发送路径，用户态向tx ring写入**xdp_desc** 结构体(包含地址addr 和 长度len), 内核消费tx ring。有别于rx ring, 生产完成之后，用户态程序还需要主动调用 `sendto` 才能触发内核发送packet。eg : 
413 | 
414 | `sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);` 
415 | 
416 | XSK 发送数据和普通的 socket 不同，sendto 只是一个触发动作，真正要发送的内容由用户态向tx ring 写入的 xdp_desc结构体指定，该结构体的addr指向 UMEM的一块内存区域，也就是packet的地址，len指定了要发送的packet的长度。
417 | 
418 | 还有一个问题需要解决：XSK是如何对packet进行处理的？ 在 rx ring中我们提到，消费rx ring可以获取 packet的地址。因此在接收到packet之后，我们可以直接在原有的packet上进行修改（原地修改），然后再将该地址和新的packet长度重新写回  tx ring。这样可以减少一次packet拷贝，进一步提升效率。每一个xdp socket (XSK) 都必须要有一个 rx ring 或者 tx ring 。
419 | 
420 | tx ring 的元素定义如下：(和  rx ring 是一样的) 
421 | 
422 | ```c
423 | struct xdp_desc {
424 | 	__u64 addr;
425 | 	__u32 len;
426 | 	__u32 options;
427 | };
428 | ```
429 | 
430 | tx ring 的生产同样也需要 reserve, produce , submit 
431 | 
432 | 在具体生产的时候，调用该函数： 
433 | 
434 | ```c
435 | static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx,
436 | 						      __u32 idx)
437 | {
438 | 	struct xdp_desc *descs = (struct xdp_desc *)tx->ring;
439 | 
440 | 	return &descs[idx & tx->mask];
441 | }
442 | 
443 | ```
444 | 
445 | 输入idx, 返回 tx ring 元素的指针。
446 | 
447 | ### AF_XDP原理图
448 | 
449 | 下图阐述了，ring 元素和 UMEM的关系，以及四种ring的数据流动
450 | 
451 | ![image-20211109120556512](AF_XDP初步探索.assets/image-20211109120556512.png)
452 | 
453 | 
454 | 
455 | 下图阐述了， ring 和 XSK以及 UMEM的关系
456 | 
457 | ![image-20211109120656105](AF_XDP初步探索.assets/image-20211109120656105.png)
458 | 
459 | * 一个XSK 必须有一个 tx ring 或者 rx ring 
460 | * 多个XSK 可以共享同一个UMEM
461 | * 同一个UMEM只能有一个 fill ring 和 completion ring 
462 | * ring 的元素是指向 UMEM 的地址（偏移） 
463 | 
464 | ## 如何使用AF_XDP
465 | 
466 | 为了使用 AF_XDP, 我们需要： 
467 | 
468 | 1. 内核态的 xdp程序，也就是attach到特定设备的， 类型为 xdp的 BPF(ebpf)程序。 xdp程序负责根据packet的内容，将packet重定向到xsk 
469 | 2. 用户态的 xsk 程序，创建xsk, 创建umem,  和 ring，并将其绑定，然后消费ring , 完成对packet的接收和处理
470 | 
471 | ### 内核态xdp程序
472 | 
473 | 这不是本文叙述的重点，因此只强调一下要点。
474 | 
475 | 重点在于 xdp 程序的返回码是重定向类型的返回码
476 | 
477 | eg : 
478 | 
479 | ```c 
480 | /* A set entry here means that the correspnding queue_id
481 |      * has an active AF_XDP socket bound to it. */
482 |     if (bpf_map_lookup_elem(&xsks_map, &index))
483 |         return bpf_redirect_map(&xsks_map, index, 0);
484 | ```
485 | 
486 | ```c
487 | /*
488 |  * bpf_redirect_map
489 |  *
490 |  * 	Redirect the packet to the endpoint referenced by *map* at
491 |  * 	index *key*. Depending on its type, this *map* can contain
492 |  * 	references to net devices (for forwarding packets through other
493 |  * 	ports), or to CPUs (for redirecting XDP frames to another CPU;
494 |  * 	but this is only implemented for native XDP (with driver
495 |  * 	support) as of this writing).
496 |  *
497 |  * 	The lower two bits of *flags* are used as the return code if
498 |  * 	the map lookup fails. This is so that the return value can be
499 |  * 	one of the XDP program return codes up to XDP_TX, as chosen by
500 |  * 	the caller. Any higher bits in the *flags* argument must be
501 |  * 	unset.
502 |  *
503 |  * 	When used to redirect packets to net devices, this helper
504 |  * 	provides a high performance increase over **bpf_redirect**\ ().
505 |  * 	This is due to various implementation details of the underlying
506 |  * 	mechanisms, one of which is the fact that **bpf_redirect_map**\
507 |  * 	() tries to send packet as a "bulk" to the device.
508 |  *
509 |  * Returns
510 |  * 	**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
511 |  */
512 | static int (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51;
513 | ```
514 | 
515 | 我们提到， XSK是需要绑定在特定设备的特定队列上的，为了实现重定向， XDP程序需要
516 | 
517 | 1. 查找 xsks_map, 查找的key 是 index 即该packet对应的queue 
518 | 2. 调用 bpf_redirect_map 函数进行重定向，如果找不到这个queue, 我们可以指定行为（drop 或者是 pass 交给内核网络栈处理）
519 | 
520 | ### 用户态的XSK程序
521 | 
522 | 我们可以将用户态的 XSK 程序划分为 ： 
523 | 
524 | 1. XSK 初始化（ 包括 UMEM初始化，设置 fq, cq，XSK初始化，设置 rx ring、tx ring, 并将其绑定...) 
525 | 2. 向 fill ring  生产数据
526 | 3. 消费 rx ring 的数据
527 | 4. 向 tx ring 生产数据
528 | 5. 消费complete ring 的数据
529 | 
530 | #### XSK初始化
531 | 
532 | XSK初始化，如果没有xsk提供的帮助函数(./tools/lib/bpf/xsk.h)，是非常繁琐且复杂的。如果详细了解可以阅读，(./tools/lib/bpf/xsk.h, ./tools/lib/bpf/xsk.c ) (./samples/bpf/xdpsock_user.c)  ps: linux 源代码下。这里篇幅所限，不做详细解释，下文主要是基于 xsk.h 的帮助函数，解释函数参数的含义，和使用方法。
533 | 
534 | 1. **创建并设置UMEM**
535 | 
536 | ```c
537 | struct xsk_umem_config {
538 | 	__u32 fill_size;
539 | 	__u32 comp_size;
540 | 	__u32 frame_size;
541 | 	__u32 frame_headroom;
542 | 	__u32 flags;
543 | };
544 | 
545 | //从源码来看，config默认配置如下： 
546 | //	  cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;    2048
547 | //    cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;    2048
548 | //    cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;       4096 4kB(1<<12)
549 | //    cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; 0
550 | //    cfg->flags = XSK_UMEM__DEFAULT_FLAGS; 0
551 | //    flags的取值我还没有找到文档和用法
552 | 
553 | struct xsk_umem {
554 | 	struct xsk_ring_prod *fill;
555 | 	struct xsk_ring_cons *comp;
556 | 	char *umem_area;
557 | 	struct xsk_umem_config config;
558 | 	int fd;
559 | 	int refcount;
560 | };
561 | 
562 | #define DEFINE_XSK_RING(name) \
563 | struct name { \
564 | 	__u32 cached_prod; \
565 | 	__u32 cached_cons; \
566 | 	__u32 mask; \
567 | 	__u32 size; \
568 | 	__u32 *producer; \
569 | 	__u32 *consumer; \
570 | 	void *ring; \
571 | 	__u32 *flags; \
572 | }
573 | 
574 | DEFINE_XSK_RING(xsk_ring_prod);
575 | DEFINE_XSK_RING(xsk_ring_cons);
576 | 
577 | /* Set config to NULL to get the default configuration. */
578 | LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
579 | 				void *umem_area, __u64 size,
580 | 				struct xsk_ring_prod *fill,
581 | 				struct xsk_ring_cons *comp,
582 | 				const struct xsk_umem_config *config);
583 | ```
584 | 
585 | **xsk_umem_config** 
586 | 
587 | 1. fill_size: fill ring 的长度
588 | 2. comp_size: complete ring 的长度
589 | 3. frame_size : frame的大小，单位为 byte
590 | 4. frame_headroom:   packet 实际存放的起始地址是  addr(frame) + frame_headroom,  如果我们需要对packet进一步包装（外面再套一层）那么就可以设置该参数
591 | 5. Flags: 选项，默认是 0 ， 我目前还没找到相关的文档
592 | 
593 | **xsk_umem__create**
594 | 
595 | 参数： 
596 | 
597 | **struct xsk_umem \*\*umem** :  值返回参数， 如果成功了 umem 被设置为 xsk_umem__create 函数创建的 xsk_umem 指针
598 | 
599 | 说明一下 xsk_umem结构体
600 | 
601 | 1. fill : 描述fill ring 的指针，如果成功 fill 被设置为参数传入的 fill(直接把地址填进去)
602 | 2. comp: 描述complete ring 的指针， 其余同fill 
603 | 3. umem_area :  UMEM的指针，也就是UMEM的起始地址
604 | 4.  struct xsk_umem_config config:  创建UMEM使用的配置
605 | 5. fd：XSK的描述符。  调用该函数的时候，该函数会同时创建类型AF_XDP的socket(因为一个umem至少要绑定一个socket)
606 | 6. refcount:  引用计数，用来标识该umem被引用次数，调用该函数，该值被设为1 。 引用计数主要是为了用来判断当前和该UMEM绑定的socket数目。（这个主要在多个socket 绑定同一个 umem 的时候使用） 
607 | 
608 | 
609 | 
610 | 
611 | 
612 | ## 实验结果
613 | 
614 | 程序运行效果
615 | 
616 | ![image-20211109215310651](AF_XDP初步探索.assets/image-20211109215310651.png)
617 | 
618 | ![image-20211109215330513](AF_XDP初步探索.assets/image-20211109215330513.png)
619 | 


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/包复制技术.assets/image-20220723171851322.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/AF_XDP/包复制技术.assets/image-20220723171851322.png


--------------------------------------------------------------------------------
/Doc/Notes/AF_XDP/包复制技术.md:
--------------------------------------------------------------------------------
 1 | # 包复制技术
 2 | 
 3 | ## 为什么需要包复制技术(挑战)
 4 | 
 5 | * MPTCP通过引入新的TCP选项将TCP拓展为MPTCP
 6 | 
 7 | * TCP选项字段容量有限，最多只有40个字节，同一个包在部分情况下无法同时放置多个MPTCP选项字段（例如 携带数据的DSS字段(20字节) 加上 timestamp选项（10字节），MPTCP V1 ADD_ADDR选项（16字节））。因此在MPTCP的设计下，一些控制选项例如ADD_ADDR往往会放在冗余的ack中。为了避免这些冗余ack影响正常传输的拥塞控制窗口，MPTCP协议规定MPTCP实现不能将带有MPTCP选项的重复ack视为拥塞控制信号，与此同时MPTCP也不应该连续发送两个以上的重复ack作为MPTCP控制信号。
 8 | 
 9 | * eMPTCP基于包修改来控制MPTCP的行为，因此如果eMPTCP配置的流量没有数据包的时候，eMPTCP就失效了，例如，使用eMPTCP将一条子流设置为backup子流，这条子流上不会有数据发送，基于包修改不能将备用的子流恢复为数据流，因此需要包复制技术，为已经停止数据传输的子流构造控制报文。
10 | 
11 | * XDP/TC的触发方式是包驱动，每次收到/发出一个包，XDP/TC程序根据包的内容，对包进行修改然后返回动作值，决定丢包，将包交给协议栈等。但是XDP/TC无法将收到的一个packet分割为多个packet，也无法构造packet，XDP/TC不能直接用来实现包复制。
12 | 
13 | ## 包复制技术核心
14 | 
15 | 为了解决上述挑战，我们设计了基于AF_XDP 和XDP/TC的包复制机制。如下图所示： 首先，我们需要构造出能够被对方内核协议栈接收的数据包，构造数据包最大的挑战在于，TCP头部的seq和ack, 以及MPTCP选项的DSS字段，需要是合法的。在MPTCP场景下, 为了构造出数据包我们首先需要选择一条子流，获取当前时刻最新的TCP seq和ack，除此之外还要获取最新的MPTCP DSS字段的值。然后使用获取的值构造出数据包，并在这个数据包中添加 ADD_ADDR或者MP_PRIO选项实现控制MPTCP行为的目的。为此我们为eMPTCP增加了 pkt_copy actor来捕获这些值，进行数据包的复制。由于我们使用了数据流中最新的 seq、ack、DSS来构造数据包，构造出的数据包会成为冗余的ACK，但是根据MPTCP协议的规定，冗余的ack并不会被视为拥塞信号，因此该机制不会影响拥塞控制窗口。在构造数据包之后，我们使用AF_XDP作为数据包的发送路径。AF_XDP允许我们绕过内核网络栈，将数据包直接通过网卡发送出去，这样就避免了网络协议栈的处理开销。
16 | 
17 | ![image-20220723171851322](包复制技术.assets/image-20220723171851322.png)
18 | 
19 | ### 包复制技术的代价
20 | 
21 | 包复制技术的代价，包括利用XDP/TC捕获最新的seq、ack、dss并将其传输到用户态，在用户态构造数据包，利用AF_XDP将数据包直接从网卡发送出去。捕获关键数据，利用eBPF高效的包处理，eBPF基于perf_event实现的用户态内核态高效通信，代价很低。在发送路径上，AF_XDP直接从网卡发送数据，减少了网络栈的处理开销。对于控制报文而言，因为报文较小（通常在80个字节以内），构造数据包的开销也很低。因此我们提出的包复制技术能够以极低的代价，构造控制报文，控制网络协议的行为。
22 | 


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221029172426258.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221029172426258.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221029172606063.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221029172606063.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030121511464.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030121511464.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030134001716.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030134001716.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030151521474.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030151521474.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030151556274-16671141804591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030151556274-16671141804591.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030151556274.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030151556274.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030155827800.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030155827800.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030161928637.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221030161928637.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031141043437.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031141043437.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031144952647.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031144952647.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031145125353.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031145125353.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031163152128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031163152128.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031163214989.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031163214989.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031173807981.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221031173807981.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221101104127087.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221101104127087.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221101114221785.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221101114221785.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221101155538090.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221101155538090.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102103323083.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102103323083.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102104209026.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102104209026.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102105116812.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102105116812.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102105149134.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/MPTCPV1_scheduler/编译内核.assets/image-20221102105149134.png


--------------------------------------------------------------------------------
/Doc/Notes/MPTCPV1_scheduler/编译内核.md:
--------------------------------------------------------------------------------
  1 | # 编译内核
  2 | 
  3 | ## 传统编译(X86)
  4 | 
  5 | 在虚拟机上利用Makefile直接编译安装内核
  6 | 
  7 | 安装一些必要的依赖
  8 | 
  9 | ```bash
 10 | sudo apt-get install build-essential libncurses-dev bison flex libssl-dev libelf-dev
 11 | ```
 12 | 
 13 | 对照内核文档检查[Minimal Requirements](https://www.kernel.org/doc/html/latest/process/changes.html#changes)
 14 | 
 15 | 检查过程中补充了以下内容
 16 | 
 17 | ``` bash
 18 | sudo apt-get install pahole
 19 | sudo apt-get install jfsutils
 20 | sudo apt-get install reiserfsprogs
 21 | sudo apt-get install xfsprogs
 22 | sudo apt-get install btrfs-progs
 23 | sudo apt-get install quota
 24 | sudo apt-get install nfs-common
 25 | sudo apt-get install udev
 26 | ```
 27 | 
 28 | (mcelog未安装)
 29 | 
 30 | ```bash
 31 | make O=/home/forward/CODING/Source/build/kernel menuconfig
 32 | make O=/home/forward/CODING/Source/build/kernel -j 4
 33 | ```
 34 | 
 35 | 遇到问题：
 36 | 
 37 | ![image-20221030151556274](编译内核.assets/image-20221030151556274-16671141804591.png)
 38 | 
 39 | 修改.config文件，`CONFIG_SYSTEM_TRUSTED_KEYS`、`CONFIG_SYSTEM_REVOCATION_KEYS`置为空
 40 | 
 41 | ![image-20221030151521474](编译内核.assets/image-20221030151521474.png)
 42 | 
 43 | 12：15开始编译，13:40结束。编译完成后：
 44 | 
 45 | ![image-20221030134001716](编译内核.assets/image-20221030134001716.png)
 46 | 
 47 | 之后，安装内核模块：
 48 | 
 49 | ```bash
 50 | sudo make O=/home/forward/CODING/Source/build/kernel modules_install
 51 | ```
 52 | 
 53 | ![image-20221030155827800](编译内核.assets/image-20221030155827800.png)
 54 | 
 55 | 最后，安装内核本身：
 56 | 
 57 | ```bash
 58 | sudo make O=/home/forward/CODING/Source/build/kernel install
 59 | ```
 60 | 
 61 | 更改grub，重启内核后，选择对应内核版本：
 62 | 
 63 | ![image-20221030161928637](编译内核.assets/image-20221030161928637.png)
 64 | 
 65 | ### 参考
 66 | 
 67 | https://www.kernel.org/doc/html/latest/admin-guide/README.html?highlight=documentation+process+changes+rst
 68 | 
 69 | https://www.kernel.org/doc/html/latest/process/changes.html#changes
 70 | 
 71 | https://zhuanlan.zhihu.com/p/453345990
 72 | 
 73 | https://cloud.tencent.com/developer/article/1114403
 74 | 
 75 | ## make deb-pkg
 76 | 
 77 | 打包生成deb：
 78 | 
 79 | 在make时候加上deb-pkg即可在上一级目录生成对应`.deb`文件，此处为服务器上打包成功截图：
 80 | 
 81 | ![image-20221031173807981](编译内核.assets/image-20221031173807981.png)
 82 | 
 83 | ### 参考
 84 | 
 85 | https://www.listera.top/deepinbian-yi-xin-nei-he/
 86 | 
 87 | https://unix.stackexchange.com/questions/238469/difference-between-make-kpkg-and-make-deb-pkg
 88 | 
 89 | https://www.cnblogs.com/wwang/archive/2011/01/07/1929486.html
 90 | 
 91 | 
 92 | 
 93 | ## 交叉编译(X86->arm64)
 94 | 
 95 | ### Ubuntu18.04
 96 | 
 97 | 下载安装工具链
 98 | 
 99 | http://releases.linaro.org/components/toolchain/binaries/latest-7/aarch64-linux-gnu/
100 | 
101 | ![image-20221031141043437](编译内核.assets/image-20221031141043437.png)
102 | 
103 | 对应服务器Ubuntu18.04 gcc7.5的版本选用图中工具链
104 | 
105 | 拷贝默认配置：
106 | 
107 | ```bash
108 | cp ./arch/arm64/configs/defconfig .config
109 | ```
110 | 
111 | 配置选项：
112 | 
113 | ```bash
114 | make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- menuconfig
115 | ```
116 | 
117 | ![image-20221031144952647](编译内核.assets/image-20221031144952647.png)
118 | 
119 | ![image-20221102103323083](编译内核.assets/image-20221102103323083.png)
120 | 
121 | 编译打包deb：
122 | 
123 | ```bash
124 | make -j8 deb-pkg ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu-
125 | ```
126 | 
127 | ![image-20221102105116812](编译内核.assets/image-20221102105116812.png)
128 | 
129 | ![image-20221102105149134](编译内核.assets/image-20221102105149134.png)
130 | 
131 | 
132 | 
133 | ### Ubuntu 22.04.1
134 | 
135 | 由于一开始在服务器Ubuntu18.04上交叉编译未能成功，退而使用虚拟机Ubuntu22.04.1进行编译
136 | 
137 | 安装最新的交叉编译工具链
138 | 
139 | https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads
140 | 
141 | ![image-20221102104209026](编译内核.assets/image-20221102104209026.png)
142 | 
143 | 之后按照同样的步骤进行编译：
144 | 
145 | ```bash
146 | make ARCH=arm64 CROSS_COMPILE=aarch64-none-linux-gnu- menuconfig
147 | ```
148 | 
149 | ```bash
150 | make -j4 deb-pkg ARCH=arm64 CROSS_COMPILE=aarch64-none-linux-gnu-
151 | ```
152 | 
153 | 成功编译：
154 | 
155 | ![image-20221101114221785](编译内核.assets/image-20221101114221785.png)
156 | 
157 | 并在树莓派上安装了相应的deb文件：
158 | 
159 | ![image-20221101155538090](编译内核.assets/image-20221101155538090.png)
160 | 
161 | [修改grub](https://blog.csdn.net/bby1987/article/details/104264285)后重启，**未能使用**已安装的内核版本
162 | 
163 | 目前估计是树莓派固件不支持 导致无法升级
164 | 
165 | 也有可能是 直接编译Linux内核源码无法适用于树莓派
166 | 
167 | 
168 | 
169 | ### 参考
170 | 
171 | https://zhuanlan.zhihu.com/p/115173146
172 | 
173 | https://blog.csdn.net/Luckiers/article/details/124531266
174 | 
175 | https://segmentfault.com/a/1190000020955640
176 | 
177 | https://www.cnblogs.com/cqwo/p/15420530.html
178 | 
179 | https://xuchengpeng.com/cross-compile-arm64-kernel.html
180 | 
181 | 
182 | 
183 | ## 问题记录
184 | 
185 | 常见问题：
186 | 
187 | ![image-20221029172426258](编译内核.assets/image-20221029172426258.png)
188 | 
189 | https://blog.csdn.net/qq_36393978/article/details/118157426
190 | 
191 | 
192 | 
193 | 交叉编译问题：
194 | 
195 | ![image-20221031163152128](编译内核.assets/image-20221031163152128.png) 
196 | 
197 | ![image-20221031163214989](编译内核.assets/image-20221031163214989.png)
198 | 
199 | 其中，缺少auto.conf问题，在Ubuntu22.04上未出现，在Ubuntu18.04上也不影响后续编译
200 | 
201 | 而`Detected assembler with broken ...`问题，由于**命令**问题，未能使用正确的交叉编译工具链，应为`CROSS_COMPILE`！！！
202 | 
203 | 这也是之前在服务器上未能成功编译的根本原因


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_HW_offload/eBPF_HW_offload.md:
--------------------------------------------------------------------------------
 1 | # 现有eBPF硬件卸载方案初步调研
 2 | 
 3 | ## Netronome系列智能网卡
 4 | 
 5 | [Netronome](https://www.netronome.com/) 是目前最主流的智能网卡提供商之一。目前其公司的产品 Smart Agilio CX 型号的智能网卡，**支持将eBPF XDP程序offload到硬件上执行**。
 6 | 
 7 | ## 硬件卸载内核源码分析
 8 | 
 9 | ### hook overview 
10 | 
11 | ### prog offloading hook
12 | 
13 | #### MAP offloading hook
14 | 
15 | ### verifier offloading hook 
16 | 
17 | ### Netronome driver 
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_MAP_user/eBPF_MAP_note.assets/image-20220520183538699.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_MAP_user/eBPF_MAP_note.assets/image-20220520183538699.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_MAP_user/eBPF_MAP_note.md:
--------------------------------------------------------------------------------
 1 | ![image-20220520183538699](./eBPF_MAP_note.assets/image-20220520183538699.png)
 2 | 
 3 | 
 4 | 
 5 | ## BPF MAP 的机制
 6 | 
 7 | 正常来说，在一个源文件中的多个BPF程序可以共享同一个BPF MAP，例如对于以下的代码（在一个源文件中）
 8 | 
 9 | ```c
10 | struct {
11 |     __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
12 |     __type(key, int);
13 |     __type(value, int);
14 |     __uint(max_entries, MAX_XDP_ACTION_NUM);
15 | } xdp_actions SEC(".maps");
16 | //xdp_actions是一个BPF MAP 
17 | 
18 | SEC("xdp")
19 | int xdp_prog1(void *ctx) {
20 |     bpf_map_lookup_elem(&xdp_actions ,.....)；  //使用 bpf map 
21 | }
22 | 
23 | SEC("xdp")
24 | int xdp_prog2(void *ctx) {
25 |     bpf_map_lookup_elem(&xdp_actions ,.....)   //使用 bpf map 
26 | }
27 | ```
28 | 
29 | 有一个BPF MAP， 和两个 xdp 程序，这两个程序无需做额外的操作就可以共享这个BPF MAP 
30 | 
31 | 原理：
32 | 
33 | 1. 在使用 llvm 编译该源文件的时候，在使用到bpf map 的地方（通过指针 &xdp_actions)  并不是填入一个地址，而是填一个占位符。
34 | 2. 当加载 编译后的 .o文件时， 加载器会扫描源文件中的 BPF MAP ，并且在用户态创建该map, 并获得文件描述符 fd（例如这里的 xdp_actions就会有一个文件描述符)
35 | 3. 加载器扫描 .o文件，对于所有使用到这个 Bpf map 的地方，用2 获得的fd 替换 1中的占位符。
36 | 
37 | 但是，如果BPF程序分散在多个源文件中，eg 
38 | 
39 | src 1: 
40 | 
41 | ```c
42 | struct {
43 |     __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
44 |     __type(key, int);
45 |     __type(value, int);
46 |     __uint(max_entries, MAX_XDP_ACTION_NUM);
47 | } xdp_actions SEC(".maps");
48 | //xdp_actions是一个BPF MAP 
49 | 
50 | SEC("xdp")
51 | int xdp_prog1(void *ctx) {
52 |     bpf_map_lookup_elem(&xdp_actions ,.....)   //使用 bpf map 
53 | }
54 | ```
55 | 
56 | src 2: 
57 | 
58 | ```	c
59 | struct {
60 |     __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
61 |     __type(key, int);
62 |     __type(value, int);
63 |     __uint(max_entries, MAX_XDP_ACTION_NUM);
64 | } xdp_actions SEC(".maps");
65 | //xdp_actions是一个BPF MAP 
66 | 
67 | SEC("xdp")
68 | int xdp_prog2(void *ctx) {
69 |     bpf_map_lookup_elem(&xdp_actions ,.....)   //使用 bpf map 
70 | }
71 | ```
72 | 
73 | 在正常的编译，加载流程下（1中的流程），这两个 prog不会共享同一个 map, 会使用不同的map。
74 | 
75 | 也就是说会有两个 xdp_actions
76 | 
77 | 因此为了实现多个源文件共享同一个 bpf map。我们需要在
78 | 
79 | `当加载 编译后的 .o文件时， 加载器会扫描源文件中的 BPF MAP ，并且在用户态创建该map, 并获得文件描述符 fd（例如这里的 xdp_actions就会有一个文件描述符)`
80 | 
81 | 做这一步的时候，让多个 .o文件的 xdp_actions使用同一个 fd。
82 | 
83 | 我们的做法是（以上面的代码为例）
84 | 
85 | 1. 加载 src1.o时，创建 xdp_actions, 获取 fd,  并将其 pin 到 bpf 虚拟文件系统(pin 并不是必须的，pin的目的是为了，在加载例程结束之后，其它的用户态进程能够通过 BPF VFS(虚拟文件系统)获取该BPF MAP的文件描述符)
86 | 2. 在加载 src2.o之前， 使用 libbpf 的 bpf_map__reuse_fd API,  使用 第一步中获取的 fd 来作为 src2.o 中 xdp_actions的fd，而不是创建它
87 | 3. 这样两个 程序可以共享同一个bpf map 
88 | 
89 | 
90 | 
91 | 用一句话简单概括：分散在不同文件里的 BPF 程序不能直接共享同一个 BPF MAP, 需要通过 
92 | 
93 | bpf_map__reuse_fd 复用不同文件中的 BPF MAP FD，从而实现共享。
94 | 
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_TCP_ca/eBPF_TCP_ca.md:
--------------------------------------------------------------------------------
  1 | # eBPF_TCP_CA
  2 | 
  3 | ## 数据结构
  4 | 
  5 | ### 全局变量 unsupported_ops
  6 | 
  7 | 不支持使用eBPF实现的函数。(get_info)
  8 | 
  9 | ```c
 10 | static u32 unsupported_ops[] = {
 11 | 	offsetof(struct tcp_congestion_ops, get_info),
 12 | };
 13 | ```
 14 | 
 15 | **全局变量 optional_ops**
 16 | 
 17 | 可选项，这些函数可以不需要用eBPF函数实现(并不一定要实现)
 18 | 
 19 | ```c
 20 | static u32 optional_ops[] = {
 21 | 	offsetof(struct tcp_congestion_ops, init),
 22 | 	offsetof(struct tcp_congestion_ops, release),
 23 | 	offsetof(struct tcp_congestion_ops, set_state),
 24 | 	offsetof(struct tcp_congestion_ops, cwnd_event),
 25 | 	offsetof(struct tcp_congestion_ops, in_ack_event),
 26 | 	offsetof(struct tcp_congestion_ops, pkts_acked),
 27 | 	offsetof(struct tcp_congestion_ops, min_tso_segs),
 28 | 	offsetof(struct tcp_congestion_ops, sndbuf_expand),
 29 | 	offsetof(struct tcp_congestion_ops, cong_control),
 30 | };
 31 | ```
 32 | 
 33 | 
 34 | 
 35 | ## 代码逻辑
 36 | 
 37 | ### bpf_tcp_ca_verifier_ops
 38 | 
 39 | #### bpf_tcp_ca_get_func_proto
 40 | 
 41 | 
 42 | 
 43 | ### bpf_tcp_ca_init
 44 | 
 45 | `static int bpf_tcp_ca_init(struct btf *btf)` 
 46 | 
 47 | ```c
 48 | static int bpf_tcp_ca_init(struct btf *btf)
 49 | {
 50 | 	s32 type_id;
 51 | 
 52 | 	type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
 53 | 	if (type_id < 0)
 54 | 		return -EINVAL;
 55 | 	sock_id = type_id;
 56 | 
 57 | 	type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
 58 | 	if (type_id < 0)
 59 | 		return -EINVAL;
 60 | 	tcp_sock_id = type_id;     //设置全局static变量 tcp_sock_id 
 61 | 	tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); //设置全局static变量 tcp_sock_type
 62 | 
 63 | 	return 0;
 64 | }
 65 | 
 66 | ```
 67 | 
 68 | 主要是为了获取并初始化 tcp_sock_id 和 tcp_sock_type 这两个全局静态变量。这两个变量应该是供verifier使用, 以及定义 bpf_func_proto使用。
 69 | 
 70 | ### bpf_tcp_ca_init_member
 71 | 
 72 | `static int bpf_tcp_ca_init_member(const struct btf_type *t, const struct btf_member *member,  void *kdata, const void *udata)`
 73 | 
 74 | bpf_struct_ops 的 init_member 钩子的实现。被调用的时候，负责： 
 75 | 
 76 | 1. 处理非函数指针成员，flags 和 name 
 77 | 2. 对于函数指针成员检查是否合法
 78 | 
 79 | -> `const struct tcp_congestion_ops *utcp_ca;`
 80 | 
 81 | -> `struct tcp_congestion_ops *tcp_ca;`
 82 | 
 83 | -> `int prog_fd; u32 moff;`
 84 | 
 85 | -> `utcp_ca = (const struct tcp_congestion_ops *)udata; tcp_ca = (struct tcp_congestion_ops *)kdata;`
 86 | 
 87 | -> `moff = __btf_member_bit_offset(t, member) / 8;`  获取该成员在tcp_congestion_ops结构体的字节偏移量
 88 | 
 89 | -> `switch (moff)` 
 90 | 
 91 | ​	--> `case offsetof(struct tcp_congestion_ops, flags):` 处理 `tcp_congestion_ops.flags` 
 92 | 
 93 | ​		---> `tcp_ca->flags = utcp_ca->flags; return 1` 
 94 | 
 95 | ​	--> `case offsetof(struct tcp_congestion_ops, name):`  处理 `tcp_congestion_ops.name` 
 96 | 
 97 | ​		---> `bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,  sizeof(tcp_ca->name))` 
 98 | 
 99 | ​		---> `if (tcp_ca_find(utcp_ca->name)) return -EEXIST;` 
100 | 
101 | -> `prog_fd = (int)(*(unsigned long *)(udata + moff));` 
102 | 
103 | -> `if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) return -EINVAL;` **prog_fd=0 意味着BPF程序并没有提供该成员函数的实现**
104 | 
105 | ​	--> `is_optional(moff)`   判断该函数是否是可选的函数（没有要求一定要实现） 
106 | 
107 | ​	--> `is_unsupported(moff)` 判断是否支持用eBPF程序实现该函数。
108 | 
109 | -> `return 0;`
110 | 
111 | ### bpf_tcp_ca_reg
112 | 
113 | 将使用bpf实现的拥塞控制算法注册到拥塞控制算法链表中。
114 | 
115 | ```c 
116 | static int bpf_tcp_ca_reg(void *kdata)
117 | {
118 | 	return tcp_register_congestion_control(kdata);
119 | }
120 | ```
121 | 
122 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_bpflink_kernel/eBPF_bpflink_kernel.md:
--------------------------------------------------------------------------------
 1 | # BPF Link Kernel 
 2 | 
 3 | ## 数据结构
 4 | 
 5 | ### struct bpf_link 
 6 | 
 7 | ```mermaid
 8 | classDiagram 
 9 | 	class bpf_link {
10 | 		atomic64_t refcnt
11 | 		u32 id
12 | 		const struct bpf_link_ops *ops
13 | 		struct bpf_prog *prog
14 | 		struct work_struct work
15 | 	}
16 | 	
17 |     bpf_link --> bpf_link_ops 
18 |     bpf_link --> bpf_prog
19 |     bpf_link *-- work_struct 
20 | ```
21 | 
22 | 
23 | 
24 | ```c
25 | struct bpf_link {
26 | 	atomic64_t refcnt;
27 | 	u32 id;
28 | 	enum bpf_link_type type;
29 | 	const struct bpf_link_ops *ops;
30 | 	struct bpf_prog *prog;
31 | 	struct work_struct work;
32 | };
33 | 
34 | ```
35 | 
36 | 
37 | 
38 | ## 函数逻辑
39 | 
40 | ### bpf_link_init 
41 | 
42 | `void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,  const struct bpf_link_ops *ops, struct bpf_prog *prog) `
43 | 
44 | ```c
45 | void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
46 | 		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
47 | {
48 | 	atomic64_set(&link->refcnt, 1);   //设置引用计数
49 | 	link->type = type;
50 | 	link->id = 0;
51 | 	link->ops = ops;
52 | 	link->prog = prog;   //和prog建立关系
53 | }
54 | ```
55 | 
56 | **ops** : 不同类型的prog有不同类型的 bpf_link_ops, 例如 对于 struct_op 来说 为： `bpf_struct_ops_link_lops` 
57 | 
58 | ## 编程技巧
59 | 
60 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_helperfunc_kernel/eBPF_helperfunc_kernel.md:
--------------------------------------------------------------------------------
1 | # eBPF_helperfunc_kernel 
2 | 
3 | ## 数据结构
4 | 
5 | ## 函数逻辑
6 | 
7 | ## 编程技巧


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_introduction_slides/README.md:
--------------------------------------------------------------------------------
1 | 使用 Slidev 来编译这个ppt 
2 | 
3 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_introduction_slides/slides.md:
--------------------------------------------------------------------------------
   1 | ---
   2 | # try also 'default' to start simple
   3 | theme: seriph
   4 | # random image from a curated Unsplash collection by Anthony
   5 | # like them? see https://unsplash.com/collections/94734566/slidev
   6 | background: ./arnold-francisca-FBNxmwEVpAc-unsplash.jpg
   7 | # apply any windi css classes to the current slide
   8 | class: 'text-center'
   9 | # https://sli.dev/custom/highlighters.html
  10 | highlighter: shiki
  11 | lineNumbers: true
  12 | # some information about the slides, markdown enabled
  13 | info: |
  14 |   ## Slidev Starter Template
  15 |   Presentation slides for developers.
  16 | 
  17 |   Learn more at [Sli.dev](https://sli.dev)
  18 | ---
  19 | 
  20 | # eBPF编程分享
  21 | 
  22 | 6.16 杨彬
  23 | 
  24 | <div class="pt-12">
  25 |   <span @click="$slidev.nav.next" class="px-2 p-1 rounded cursor-pointer" hover="bg-white bg-opacity-10">
  26 |     Press Space for next page <carbon:arrow-right class="inline"/>
  27 |   </span>
  28 | </div>
  29 | 
  30 | <a href="https://github.com/slidevjs/slidev" target="_blank" alt="GitHub"
  31 |   class="abs-br m-6 text-xl icon-btn opacity-50 !border-none !hover:text-white">
  32 |   <carbon-logo-github />
  33 | </a>
  34 | 
  35 | <!--
  36 | The last comment block of each slide will be treated as slide notes. It will be visible and editable in Presenter Mode along with the slide. [Read more in the docs](https://sli.dev/guide/syntax.html#notes)
  37 | -->
  38 | 
  39 | ---
  40 | layout: two-cols
  41 | ---
  42 | 
  43 | # eBPF Overview 
  44 | 
  45 | ## BPF Program 
  46 | * 使用伪C作为开发语言
  47 | * 使用llvm将C代码编译为BPF字节码
  48 | * KPROBE, XDP, SCHED_ACT, SCHED_CLS...
  49 | 
  50 | ## BPF MAP 
  51 | * eBPF程序之间的数据共享
  52 | * eBPF程序和用户态程序之间的数据共享
  53 | * HASH, ARRAY, PROG_ARRAY, LRU_HASH, PERCPU_HASH, PERCPU_ARRAY...
  54 | 
  55 | ## BPF Link 
  56 | * 描述eBPF程序attach到hook的关系
  57 | 
  58 | ::right::
  59 | 
  60 | <style>
  61 | .div {
  62 |   position:absolute;
  63 |   top:20%;
  64 | }
  65 | </style>
  66 | 
  67 | <div class="div">
  68 | <img src="/eBPF_overview.png" class="h-80"/>
  69 | <center>Fig. eBPF overview</center>
  70 | </div>
  71 | 
  72 | ---
  73 | layout: two-cols
  74 | ---
  75 | 
  76 | # eBPF Overview: eBPF Program
  77 | 
  78 | <style>
  79 | h2 {
  80 |   font-size:20px;
  81 | }
  82 | 
  83 | ul {
  84 |   font-size:10px;
  85 | }
  86 | </style>
  87 | 
  88 | ## eBPF程序类型
  89 | * 定义在 <kbd>./include/uapi/linux/bpf.h bpf_prog_type</kbd>
  90 | * 在实际编写代码时，不同的程序类型使用不同的函数签名
  91 | 
  92 | ## 可使用函数调用
  93 | 
  94 | * eBPF帮助函数<kbd>./src/bpf_helper_defs.h</kbd>
  95 | * 自己定义的 <kbd>static __always_inline</kbd> 函数
  96 | * 宏函数
  97 | 
  98 | ## 编程限制
  99 | * 不允许使用 unbounded loop，对于低版本的eBPF不允许使用循环
 100 | * 不允许使用全局变量
 101 | * 栈区限制 MAX_BPF_STACK(512bytes)<kbd>./include/linux/filter.h</kbd>
 102 | * 字节码长度被限制为 4096条指令 <kbd>./include/uapi/linux/bpf_common.h BPF_MAXINSNS</kbd>
 103 | 
 104 | ::right::
 105 | 
 106 | ```c{all|14-26|14,15|18|21|23}
 107 | struct {
 108 |     __uint(type, BPF_MAP_TYPE_ARRAY);
 109 |     __type(key, int);
 110 |     __type(value, __u64);
 111 |     __uint(max_entries, 1000);
 112 | } count SEC(".maps");
 113 | 
 114 | static __always_inline __u64 cal_packet_len(void *b, void *e) {
 115 |     return e - b; 
 116 | }
 117 | 
 118 | #define lock_xadd(ptr, val)   __sync_fetch_and_add(ptr, val)
 119 | 
 120 | SEC("xdp")
 121 | int test_xdp(struct xdp_md *ctx) {
 122 |     void *data = (void *)(__u64)(ctx->data);
 123 |     void *data_end = (void *)(__u64)(ctx->data_end);
 124 |     __u64 pkt_len = cal_packet_len(data, data_end);
 125 |     __u64 key = 0;
 126 |     __u64 *val_p;
 127 |     val_p = bpf_map_lookup_elem(&count, &key);
 128 |     if (val_p != NULL) {
 129 |         lock_xadd(val_p, pkt_len);
 130 |     }
 131 |     return XDP_PASS;
 132 | }
 133 | ```
 134 | 
 135 | ---
 136 | layout: two-cols
 137 | ---
 138 | 
 139 | # eBPF Overview : BPF MAP
 140 | 
 141 | <style>
 142 | h2 {
 143 |   font-size:20px;
 144 | }
 145 | 
 146 | ul {
 147 |   font-size:10px;
 148 | }
 149 | </style>
 150 | 
 151 | ## BPF MAP类型
 152 | * 定义在 <kbd>./include/uapi/linux/bpf.h bpf_map_type</kbd>
 153 | * 常用map类型： 
 154 |   * hash: BPF_MAP_TYPE_HASH(LRU_HASH, PERCPU_HASH)
 155 |   * 数组: BPF_MAP_TYPE_ARRAY(PERCPU_ARRAY)
 156 | 
 157 | ## BPF MAP定义
 158 | * 定义section
 159 | * MAP类型
 160 | * key type
 161 | * value type
 162 | * max_entries
 163 | 
 164 | ## BPF MAP操作
 165 | * 增/改, <kbd>long bpf_map_update_elem(void *map,const void *key,const void *value,__u64 flags)</kbd>
 166 | * 删, <kbd>long bpf_map_delete_elem(void *map,const void *key)</kbd>
 167 | * 查, <kbd>void *bpf_map_lookup_elem(void *map,const void *key)</kbd>
 168 |   
 169 | ::right::
 170 | 
 171 | ```c{all|1-6|6|2,3,4,5|21}
 172 | struct {
 173 |     __uint(type, BPF_MAP_TYPE_ARRAY);
 174 |     __type(key, int);
 175 |     __type(value, __u64);
 176 |     __uint(max_entries, 1000);
 177 | } count SEC(".maps");
 178 | 
 179 | static __always_inline __u64 cal_packet_len(void *b, void *e) {
 180 |     return e - b; 
 181 | }
 182 | 
 183 | #define lock_xadd(ptr, val)   __sync_fetch_and_add(ptr, val)
 184 | 
 185 | SEC("xdp")
 186 | int test_xdp(struct xdp_md *ctx) {
 187 |     void *data = (void *)(__u64)(ctx->data);
 188 |     void *data_end = (void *)(__u64)(ctx->data_end);
 189 |     __u64 pkt_len = cal_packet_len(data, data_end);
 190 |     __u64 key = 0;
 191 |     __u64 *val_p;
 192 |     val_p = bpf_map_lookup_elem(&count, &key);
 193 |     if (val_p != NULL) {
 194 |         lock_xadd(val_p, pkt_len);
 195 |     }
 196 |     return XDP_PASS;
 197 | }
 198 | ```
 199 | 
 200 | ---
 201 | layout: two-cols
 202 | ---
 203 | 
 204 | # eBPF Overview : BPF Link
 205 | ## BPF Link?
 206 | * 表示attach关系
 207 | * libbpf attach系列API会返回 <kbd>bpf_link struct</kbd>
 208 | * BPF link 也拥有fd
 209 | 
 210 | ## BPF Link的作用
 211 | * 方便对attach关系的管理
 212 | * BPF Link会增加BPF程序的引用计数
 213 | * BPF Link可以被pin到VFS中
 214 | 
 215 | ::right::
 216 | 
 217 | ```c{all|5}
 218 | struct bpf_link {
 219 | 	int (*detach)(struct bpf_link *link);
 220 | 	void (*dealloc)(struct bpf_link *link);
 221 | 	char *pin_path;		/* NULL, if not pinned */
 222 | 	int fd;			/* hook FD, -1 if not applicable */
 223 | 	bool disconnected;
 224 | };
 225 | ```
 226 | <center>Code.bpf_link define</center>
 227 | 
 228 | ```c
 229 | struct bpf_link *
 230 | bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex);
 231 | 
 232 | struct bpf_link *
 233 | bpf_program__attach_kprobe(const struct bpf_program *prog, bool retprobe,
 234 | 			   const char *func_name);
 235 | ```
 236 | <center>Code. libbpf attach sample APIs</center>
 237 | 
 238 | ---
 239 | layout: two-cols
 240 | ---
 241 | 
 242 | # eBPF Overview: Libbpf
 243 | 
 244 | <style>
 245 | h2 {
 246 |   font-size:30px;
 247 | }
 248 | 
 249 | ul {
 250 |   font-size:15px;
 251 | }
 252 | </style>
 253 | 
 254 | ## Libbpf API特点
 255 | * API不直接对fd进行操作
 256 | * API对struct进行操作，这些struct对应着前面所说的概念，例如bpf_program
 257 | * BPF大部分系统调用针对fd进行操作，libbpf提供了相应的函数从对象获取fd
 258 | 
 259 | ## 通过Libbpf加载eBPF对象的步骤(之一)
 260 | 1. Open eBPF object using <kbd>bpf_object__open</kbd>
 261 | 2. Load eBPF object into kernel using <kbd>bpf_object__load</kbd>
 262 | 3. Get BPF program object and fd 
 263 | 4. Get BPF MAP object and fd
 264 | 5. Attach to specific hook 
 265 | 
 266 | ::right::
 267 | 
 268 | ```c{all|4|5|11|12-15|16|19|21}
 269 | int load_bpf_object(path) {
 270 |     int res; 
 271 |     struct bpf_object* obj;
 272 |     obj = bpf_object__open(path); //bpf_object__open_file 
 273 |     res = bpf_object__load(obj);
 274 |     if (res < 0) {
 275 |         return -1;
 276 |     }
 277 |     //get prog
 278 |     struct bpf_program *prog;
 279 |     prog = bpf_object__find_program_by_name(obj, NAME);
 280 |     res = libbpf_get_error(prog);
 281 |     if (res <0 ) {
 282 |         return -1;
 283 |     }
 284 |     int prog_fd = bpf_program__fd(prog);
 285 |     //get map 
 286 |     struct bpf_map *map;
 287 |     map = bpf_object__find_map_by_name(obj, NAME);
 288 |     res = libbpf_get_error(map);
 289 |     int map_fd = bpf_map__fd(map);
 290 |     
 291 |     //attaches here
 292 | }
 293 | ```
 294 | ---
 295 | 
 296 | # eBPF Overview: Dev Big Picture
 297 | 
 298 | 
 299 | ## 如何判断能否利用eBPF实现某些功能？ 
 300 | 1. 根据需求查看eBPF程序类型，查找相关资料确定其功能
 301 | 2. 查看该eBPF程序类型的hook, 如何加载？ 如何被调用？ 
 302 | 3. 查找该程序类型的demo 
 303 | 4. 根据程序复杂度以及功能，查找需要使用的 BPF_MAP
 304 | 5. 具体的开发和调试
 305 | 
 306 | ---
 307 | layout: two-cols
 308 | ---
 309 | 
 310 | # Lifetime of eBPF Object 
 311 | 
 312 | <style>
 313 | h2 {
 314 |   font-size:10px;
 315 | }
 316 | 
 317 | ul {
 318 |   font-size:8px;
 319 | }
 320 | </style>
 321 | 
 322 | ### BPF 对象
 323 | * BPF MAP, BPF PROG, BPF_LINK
 324 | * 每一个对象有一个引用计数 ref
 325 | * 只有当ref为0时，对象才会被内核销毁
 326 | 
 327 | ### BPF MAP REF
 328 | * 创建MAP, ref = 1
 329 | * load prog, ref+=1 (每次使用到该map的程序被加载)
 330 | * prog被销毁, ref-=1
 331 | 
 332 | ### BPF PROG REF
 333 | * 加载prog, ref = 1
 334 | * attach(create a link), ref += 1
 335 | * detach(delete a link), ref -= 1
 336 | 
 337 | ### BPF Link REF
 338 | * attach, ref = 1 (global e.g. xdp,tc 应用程序退出link不会被销毁)
 339 | * detach, ref -= 1 (to be tested, 同时pin和detach?)
 340 | 
 341 | ### PIN
 342 | * fd关闭，all object ref-=1
 343 | * pin , all object ref+=1
 344 | * unpin, all object ref-=1
 345 | 
 346 | ### Other(to be tested)
 347 | * prog array
 348 | * map of map
 349 | * bpftool
 350 | 
 351 | ::right::
 352 | 
 353 | <center>
 354 | <img src="/eBPF_MAP.png" class="h-120"/>
 355 | </center>
 356 | 
 357 | <center>Fig.BPF MAP创建流程</center>
 358 | 
 359 | ---
 360 | layout: two-cols
 361 | ---
 362 | 
 363 | # eBPF VFS
 364 | 
 365 | <style>
 366 | ul {
 367 |   font-size: 8px;
 368 | }
 369 | </style>
 370 | 
 371 | #### eBPF VFS本质
 372 | * eBPF VFS本质是一个虚拟的文件系统
 373 | * 对于pin到VFS的object, VFS持有该object的一个引用计数，使得object的引用计数不会变为0，被销毁
 374 | * 并不是将内存的内容保存到文件系统上
 375 | * 可以通过VFS获取object fd, 实现用户态和内核态，eBPF程序之间的map sharing
 376 | 
 377 | #### PIN using fd
 378 | * 通过 <kbd>int bpf_obj_pin(int fd, const char *pathname)</kbd> 将object pin到 VFS
 379 | * 通过 <kbd>int bpf_obj_get(const char *pathname)</kbd> 获取已经pin到VFS的object的fd 
 380 | 
 381 | #### PIN using higher-level APIs
 382 | * pin map <kbd>int bpf_map__pin(struct bpf_map *map, const char *path)</kbd>
 383 | * pin prog <kbd>int bpf_program__pin(struct bpf_program *prog, const char *path)</kbd>
 384 | * pin object <kbd>int bpf_object__pin(struct bpf_object *object, const char *path)</kbd>
 385 | 
 386 | #### PIN using bpftool
 387 | * 使用bpftool可以查看已经加载到内存中的object(map, program, link)
 388 | * 可以将指定的 object pin 到指定的路径
 389 | 
 390 | 
 391 | #### UNPIN
 392 | * 直接使用 rm（简单粗暴） 
 393 | * 使用 libbpf 提供的 unpin API
 394 | 
 395 | ::right:: 
 396 | 
 397 | ```c{all|3,4|6,7|9|all}
 398 | void pin_object(struct bpf_map *map, struct bpf_program *prog) {
 399 |     //pin use fd 
 400 |     bpf_obj_pin(bpf_map__fd(map), PATH);
 401 |     bpf_obj_pin(bpf_program__fd(prog), PATH);
 402 |     //pin use high level api 
 403 |     bpf_map__pin(map, PATH);
 404 |     bpf_program__pin(prog, PATH);
 405 |     //get pin fd 
 406 |     int fd = bpf_obj_get(PATH);
 407 | }
 408 | ```
 409 | <div style="margin-top:20px">
 410 | <center>
 411 | <img src="/bpftool_pin.png"/>
 412 | </center>
 413 | </div>
 414 | 
 415 | <div style="margin-top:15px">
 416 | <center>
 417 | <img src="/pin_sample.png"/>
 418 | </center>
 419 | </div>
 420 | 
 421 | ---
 422 | layout: two-cols
 423 | ---
 424 | 
 425 | # eBPF MAP Sharing 
 426 | 
 427 | ## 如果eBPF程序在同一.o文件内
 428 | * 根据MAP创建过程，同一.o文件内的prog使用的相同的MAP指针会被fd替换
 429 | * 同一.o文件内的prog直接共享MAP
 430 | 
 431 | ## 如果eBPF程序在不同.o文件内
 432 | * MAP会被创建多次，因此指针被替换为不同map的fd
 433 | * 无法直接共享MAP,虽然使用的MAP名字相同但是是不同的MAP
 434 | 
 435 | 
 436 | ::right::
 437 | 
 438 | <style>
 439 |   span {
 440 |     font-size: 6px;
 441 |   }
 442 | </style>
 443 | 
 444 | ```c
 445 | struct {
 446 |     __uint(type, BPF_MAP_TYPE_ARRAY);
 447 |     __type(key, int);
 448 |     __type(value, __u64);
 449 |     __uint(max_entries, 1000);
 450 | } count SEC(".maps");
 451 | 
 452 | SEC("xdp")
 453 | int test_xdp(struct xdp_md *ctx) {
 454 |     __u64 key = 0;
 455 |     __u64 *val_p;
 456 |     val_p = bpf_map_lookup_elem(&count, &key);
 457 |     if (val_p != NULL) lock_xadd(val_p, 1);
 458 |     return XDP_PASS;
 459 | }
 460 | 
 461 | SEC("xdp")
 462 | int test_xdp2(struct xdp_md *ctx) {
 463 |     __u64 key = 0;
 464 |     __u64 *val_p;
 465 |     val_p = bpf_map_lookup_elem(&count, &key);
 466 |     if (val_p != NULL) bpf_trace_printk("count %lu", *val_p);
 467 |     return XDP_PASS;
 468 | }
 469 | ```
 470 | 
 471 | 
 472 | ---
 473 | layout: two-cols
 474 | ---
 475 | 
 476 | # eBPF MAP Sharing
 477 | 
 478 | <style>
 479 | li {
 480 |   font-size:12px;
 481 | }
 482 | 
 483 | ul {
 484 |   font-size:10px;
 485 | }
 486 | 
 487 | </style>
 488 | ### 不在同一个.o文件的progs如何共享MAP?
 489 | 
 490 | 1. 获取已创建MAP的fd
 491 |    * 调用libbpf API从object对象中获取fd(创建MAP和使用MAP是同一个进程)
 492 |    * 通过<kbd>bpf_obj_get</kbd>获取已经PIN到VFS的MAP fd
 493 | 
 494 | 2. 使用已经创建的MAP的fd
 495 |    * 在调用<kbd>bpf_object__open</kbd>之后、<kbd>bpf_object__load</kbd>之前
 496 |    * 调用<kbd>int bpf_map__reuse_fd(struct bpf_map *map, int fd)</kbd>
 497 | 3. 加载BPF object
 498 | 
 499 | ```c{all|4,5}
 500 | void reuse_map(int reuse_fd) {
 501 |     struct bpf_object* obj;
 502 |     obj = bpf_object__open(path); 
 503 |     map = bpf_object__find_map_by_name(obj, NAME);
 504 |     bpf_map__reuse_fd(map, reuse_fd);
 505 |     bpf_object__load(obj);
 506 | }
 507 | ```
 508 | 
 509 | ::right::
 510 | 
 511 | <center>
 512 | <img src="/share_map.png" class="h-110"/>
 513 | </center>
 514 | 
 515 | <center>Fig. BPF MAP reuse fd</center>
 516 | 
 517 | ---
 518 | layout: two-cols
 519 | ---
 520 | 
 521 | # eBPF MAP Sharing
 522 | 
 523 | ### 如何在用户态和内核态共享MAP?
 524 | 1. 关键点:获取MAP fd(用户态和内核态使用同一个MAP fd)
 525 |    * 调用libbpf API从object对象中获取fd(创建MAP和使用MAP是同一个进程)
 526 |    * 通过<kbd>bpf_obj_get</kbd>获取已经PIN到VFS的MAP fd 
 527 | 
 528 | 2. 用户态MAP操作
 529 |   * 增/改, <kbd>int bpf_map_update_elem(int fd,const void *key,const void *value,__u64 flags)</kbd>
 530 |   * 删, <kbd>int bpf_map_delete_elem(int fd,const void *key)</kbd>
 531 |   * 查,<kbd>int bpf_map_lookup_elem(int fd,const void *key, void *value)</kbd>
 532 |   
 533 | ::right::
 534 | 
 535 | ```c{all|3|8-13}
 536 | int lookup_flow(const char* map_path, u64 *data, u32 src, u32 dst, u16 source, u16 port) {
 537 |     int res;
 538 |     int fd = bpf_obj_get(map_path);
 539 |     if (fd <= 0) {
 540 |         return -1;
 541 |     }
 542 |     struct tcp4flow flow_key; 
 543 |     flow_key.src = src;
 544 |     flow_key.dst = dst;
 545 |     flow_key.srouce = source;
 546 |     flow_key.port = port;
 547 |     u64 bytes;
 548 |     res = bpf_map_lookup_elem(fd, &flow_key, &bytes);
 549 |     *data = bytes;
 550 |     return 0;
 551 | }
 552 | ```
 553 | 
 554 | <center>Code. user-space bpf map sample</center>
 555 | 
 556 | ---
 557 | layout: two-cols
 558 | ---
 559 | 
 560 | # eBPF for Packet Processing : Overview
 561 | 
 562 | ### Hook for eBPF Packet Processing
 563 | * ingress packets : XDP, TC
 564 | * egress packets : TC 
 565 | 
 566 | ### ability of eBPF Packet Processing
 567 | * 统计/监控：流量监控，包监控，统计网络信息....
 568 | * 修改packet内容
 569 | * 过滤 (防火墙 iptables)
 570 | * 重定向 (负载均衡器)
 571 | * 其它...（prestack 缓存，BMC)
 572 | 
 573 | ### 和kernel-bypass方案的比较(XDP)
 574 | * 可以通过重定向/AF_XDP的方式完全绕过内核
 575 | * 也可以对包进行一定的修改之后复用内核网络栈
 576 | 
 577 | ::right:: 
 578 | 
 579 | <img src="/eBPF_packet_process.png"/>
 580 | <center>Fig. eBPF packet processing</center>
 581 | 
 582 | ---
 583 | layout: two-cols
 584 | ---
 585 | 
 586 | # eBPF for Packet Processing : XDP
 587 | 
 588 | <style>
 589 | ul {
 590 |   font-size: 10px;
 591 | }
 592 | </style>
 593 | 
 594 | ### XDP程序参数
 595 | 
 596 | * <kbd>./include/uapi/linux/bpf.h struct xdp_md</kbd>
 597 | * 尚未分配skb的原始数据
 598 | * 数据包区域： $[data, data\_end)$
 599 | * 拥有最多32bytes的meta : $[data\_meta, data)$
 600 | 
 601 | ### XDP的返回值
 602 | * 定义在 <kbd>./include/uapi/linux/bpf.h enum xdp_action</kbd>
 603 | * XDP_ABORTED = 0, BPF异常
 604 | * XDP_DROP, 将包丢弃
 605 | * XDP_PASS, 重新交给内核处理
 606 | * XDP_TX, 反射，将包从原有的nic重新发送回去
 607 | * XDP_REDIRECT,  重定向，定向到不同的cpu, XSK(AF_XDP), Nic(egress)
 608 | 
 609 | ### XDP的功能
 610 | * direct packet access(读写数据包)
 611 | * grow/shrink packet room (bpf_xdp_adjust_head)
 612 | * redirect : 先调用bpf_redirect，再返回XDP_REDIRECT, 如果是 XDP_TX直接返回即可
 613 | 
 614 | ::right::
 615 | 
 616 | ```c{all|2,3|3,4|all}
 617 | struct xdp_md {
 618 | 	__u32 data;
 619 | 	__u32 data_end;
 620 | 	__u32 data_meta;
 621 | 	/* Below access go through struct xdp_rxq_info */
 622 | 	__u32 ingress_ifindex; /* rxq->dev->ifindex */
 623 | 	__u32 rx_queue_index;  /* rxq->queue_index  */
 624 | 
 625 | 	__u32 egress_ifindex;  /* txq->dev->ifindex */
 626 | };
 627 | ```
 628 | 
 629 | ```c{all|1,2|3,4,6|7-10|12,14}
 630 | SEC("xdp")
 631 | int test_xdp(struct xdp_md *ctx) {
 632 |     void *data = (void *)(__u64)(ctx->data);
 633 |     void *data_end = (void *)(__u64)(ctx->data_end);
 634 |     void *pos = data;
 635 |     //eth
 636 |     struct ethhdr *eth = pos;
 637 |     if ((void*)(eth + 1) > data_end) {
 638 |         goto fail;
 639 |     }
 640 |     pos += sizeof(struct ethhdr);
 641 |     return XDP_PASS;
 642 | fail:
 643 |     return XDP_DROP;
 644 | }
 645 | ```
 646 | 
 647 | ---
 648 | layout: two-cols
 649 | ---
 650 | 
 651 | # eBPF for Packet Processing : TC
 652 | 
 653 | <style>
 654 | ul {
 655 |   font-size: 10px;
 656 | }
 657 | </style>
 658 | 
 659 | ### TC程序参数
 660 | 
 661 | * <kbd>./include/uapi/linux/bpf.h struct __sk_buff</kbd>
 662 | * 已经分配了sk_buff, 有丰富的信息(图中只展示一部分)
 663 | * 有20bytes的可用cb
 664 | * 通过data_meta和XDP程序共享数据
 665 | * 使用data, data_end访问线性区
 666 | 
 667 | ### TC返回值(TC_ACT 控制码)
 668 | * 定义在 <kbd>./include/uapi/linux/pkt_cls.h</kbd>
 669 | * TC_ACT_OK, TC数据包处理流程允许继续处理数据包
 670 | * TC_ACT_SHOT, 终止数据包处理，丢弃数据包
 671 | * TC_ACT_UNSPEC(-1), 使用TC的默认操作，类似分类器返回 -1 
 672 | * TC_ACT_REDIRECT, 重定向(egress or ingress Nic)
 673 | * ...
 674 | 
 675 | ### TC的功能
 676 | * direct packet access(读写数据包线性区域，通常是packet header)
 677 | * 使用 <kbd>bpf_skb_load_bytes bpf_skb_pull_data</kbd>来读写非线性区域(通常是应用层数据)
 678 | * grow/shrink packet room (bpf_skb_adjust_room)
 679 | * redirect : 重定向(egress or ingress Nic)(to be tested)
 680 | * 和其它的TC模块进行交互(TC_CLS可以直接返回class id, 修改skb的classid?)
 681 | 
 682 | ::right:: 
 683 | 
 684 | ```c
 685 | struct __sk_buff {
 686 | 	__u32 len;
 687 | 	__u32 cb[5];
 688 | 	__u32 data;
 689 | 	__u32 data_end;
 690 | 	__u32 data_meta;
 691 |   ...
 692 | };
 693 | 
 694 | ```
 695 | 
 696 | ```c{all|1,2|3,4,6|7-10|12,14}
 697 | SEC("tc")
 698 | int test_tc(struct __sk_buff *ctx) {
 699 |     void *data = (void *)(__u64)(ctx->data);
 700 |     void *data_end = (void *)(__u64)(ctx->data_end);
 701 |     void *pos = data;
 702 |     //eth
 703 |     struct ethhdr *eth = pos;
 704 |     if ((void*)(eth + 1) > data_end) {
 705 |         goto fail;
 706 |     }
 707 |     pos += sizeof(struct ethhdr);
 708 |     return TC_ACT_SHOT;
 709 | fail:
 710 |     return TC_ACT_UNSPEC;
 711 | }
 712 | ```
 713 | 
 714 | ---
 715 | layout: two-cols
 716 | ---
 717 | 
 718 | 
 719 | <style>
 720 | ul {
 721 |   font-size: 15px;
 722 | }
 723 | </style>
 724 | 
 725 | # eBPF for Packet Processing : Direct Packet Access
 726 | 
 727 | ### Direct Packet Access(DPA) ? 
 728 | * DPA的含义是**直接**通过指针来访问packet
 729 | * DPA可以用来访问packet(XDP),packet线性区(TC),meta
 730 | 
 731 | ### How to use DPA?
 732 | * 要点在于，在对指针解引用之前必须验证指针的有效性
 733 | * 有效性指的是: $ptr \in [data, data\_end)$
 734 | * 验证伴随着编程的全程，只要我们使用了一个新的指针访问数据包，就必须验证这个指针。
 735 | 
 736 | ### TIPs for DPA 
 737 | * 切记使用指针之前进行验证
 738 | * 采用扫描的方式
 739 |   1. 从packet开始到结束，扫描packet， pos 就是扫描线，每次扫描都进行验证
 740 |   2. 用新的变量记录下已经验证过的pos (记录重要的节点)
 741 | 
 742 | ::right::
 743 | 
 744 | ```c
 745 | struct xdp_md {
 746 | 	__u32 data;
 747 | 	__u32 data_end;
 748 | 	__u32 data_meta;
 749 | 	/* Below access go through struct xdp_rxq_info */
 750 | 	__u32 ingress_ifindex; /* rxq->dev->ifindex */
 751 | 	__u32 rx_queue_index;  /* rxq->queue_index  */
 752 | };
 753 | ```
 754 | 
 755 | ```c
 756 | if ((void*)ptr + len > data_end) {
 757 |     //如果ptr不是有效的指针
 758 |     return;
 759 | }
 760 | // access ptr here
 761 | ```
 762 | 
 763 | <center>
 764 | <img src="/direct_packet_access.png"/>
 765 | </center>
 766 | 
 767 | <center>
 768 | <img src="/DPA_SCAN.png"/>
 769 | </center>
 770 | ---
 771 | layout: two-cols
 772 | ---
 773 | 
 774 | # eBPF for Packet Processing : Adjust Room
 775 | 
 776 | ### How to adjust room 
 777 | * 使用adjust room相关的eBPF帮助函数(bpf_xdp_adjust_head, bpf_xdp_adjust_tail)
 778 | * 可以从头和尾两个方向进行adjust 
 779 | 
 780 | ### Adjust room技术要点
 781 | * adjust的本质是增加packet内存，并修改指针, adjust tail 修改 data_end, adjust head 修改data 
 782 | * 通过delta参数来控制修改空间的大小，指针+=delta
 783 | * 因此对于adjust head来说，如果delta为负代表增加空间，为正减少空间，adjust tail刚好相反
 784 | * adjust空间之后，所有的指针必须重新验证。
 785 | 
 786 | ::right::
 787 | 
 788 | ```c{all|3-10|6|11|13-16}
 789 | SEC("xdp")
 790 | int test_xdp(struct xdp_md *ctx) {
 791 |     void *data = (void *)(__u64)(ctx->data);
 792 |     void *data_end = (void *)(__u64)(ctx->data_end);
 793 |     int res; 
 794 |     struct hdr_cursor nh = {.pos = data};   //scan
 795 |     struct ethhdr *eth;
 796 |     struct iphdr *iph;
 797 |     struct tcphdr *tcph;
 798 |     res = is_tcp_packet(&nh, data_end, &eth, &iph, &tcph);
 799 |     bpf_xdp_adjust_head(ctx, -BYTES); // grow, positive for shrink 
 800 |     //recheck
 801 |     data = (void *)(__u64)(ctx->data);
 802 |     data_end = (void *)(__u64)(ctx->data_end);
 803 |     nh.pos = data;     //scan
 804 |     res = is_tcp_packet(&nh, data_end, &eth, &iph, &tcph);
 805 | }
 806 | ```
 807 | 
 808 | <center>
 809 | <img src="/adjust_room.png"/>
 810 | </center>
 811 | 
 812 | ---
 813 | layout: two-cols
 814 | ---  
 815 | 
 816 | # eBPF Tail Call 
 817 | 
 818 | ### What is eBPF Tail Call 
 819 | * 解决ebpf单个程序最大长度限制的问题。ebpf最多支持32次尾调用
 820 | * 划分程序结构，便于通过验证器的验证，开发和调试
 821 | * 通过尾调用动态修改eBPF程序行为（policy chain)
 822 | 
 823 | ### eBPF Tail Call 特性
 824 | * 如果一个函数执行了尾调用，那么被调用函数和调用函数的 **bpf程序类型相同**
 825 | * 一个函数执行尾调用，跳转到另一个bpf程序之后，函数**不会返回调用函数的执行流**
 826 | 
 827 | ### 使用eBPF Tail Call 
 828 | * 声明类型为 BPF_MAP_TYPE_PROG_ARRAY的映射(key 和 value类型均为int)
 829 | * 在用户态在prog_array对应的index，写入被调用程序的fd
 830 | * bpf程序中，在适当的时候执行该bpf_tail_call方法
 831 | 
 832 | ::right:: 
 833 | 
 834 | ```c{all|1-6|3-4|10|17|all}
 835 | struct {
 836 |     __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
 837 |     __type(key, int);
 838 |     __type(value, int);
 839 |     __uint(max_entries, MAX_XDP_ACTION_NUM);
 840 | } xdp_actions SEC(".maps");
 841 | SEC("xdp")
 842 | int entry(struct xdp_md *ctx)
 843 | {
 844 |     bpf_tail_call(ctx, &xdp_actions, 0);
 845 |     //will not go here if tailcall success
 846 |     return XDP_DROP;
 847 | }
 848 | SEC("xdp")
 849 | int prog1(struct xdp_md *ctx)
 850 | {
 851 |     bpf_trace_printk("this is prog1");
 852 |     return XDP_PASS;
 853 | }
 854 | ```
 855 | 
 856 | ```c{all|4-5}
 857 | void set_tail_call() {
 858 |     //get prog_array_fd of xdp_actions
 859 |     //get fd of prog1 
 860 |     int index = 0;
 861 |     bpf_map_update_elem(prog_array_fd, &index, &prog1_fd, 0);
 862 | }
 863 | ```
 864 | 
 865 | ---
 866 | layout: two-cols
 867 | ---  
 868 | 
 869 | # Techniques in eMPTCP : Policy Chain
 870 | 
 871 | ## What is Policy Chain 
 872 | * 通过eBPF尾调用实现，利用了eBPF尾调用动态修改eBPF程序行为的特性
 873 | * eBPF尾调用的增强，**动态决定一条eBPF程序尾调用链**
 874 | 
 875 | ## Challenge
 876 | * eBPF程序不支持全局变量
 877 | * 如何知道下一个要调用的程序?(index)
 878 | * 如何处理并发问题? (例如仅仅通过普通的BPF MAP来记录下调用信息)
 879 | 
 880 | ::right::
 881 | 
 882 | <img src="/policy_chain.png"/>
 883 | <center>Fig. eMPTCP design overview</center>
 884 | 
 885 | ---
 886 | layout: two-cols
 887 | ---  
 888 | 
 889 | # Techniques in eMPTCP : Policy Chain
 890 | 
 891 | ### 利用Meta data来保存TailCall所需的状态信息(context)
 892 | * XDP data_meta(32bytes)
 893 | * TC cb array(20bytes) (tc没有找到操作 data_meta的接口)
 894 | * meta是每一个包都有的，解决了并发问题。
 895 | 
 896 | ### Context
 897 | * 每当尾调用发生时, 当前的eBPF程序从meta data处获取context信息
 898 | * 从context信息获取下一个要调用的程序
 899 | * 从context信息获取参数
 900 | * 参数可以是立即数(直接保存在context中)，也可以从BPF_MAP中再读一次
 901 | 
 902 | ::right::
 903 | 
 904 | ```c
 905 | union chain_t{
 906 |     __u8 idx;
 907 |     __u8 next_idx;
 908 | };
 909 | struct action_t {
 910 |     union chain_t chain;
 911 |     __u8       param_type:2,
 912 |                rsv:6;
 913 |     union {
 914 |         __u16 imme;
 915 |         struct {
 916 |             __u8 offset;
 917 |             __u8 len;
 918 |         } mem;
 919 |     } param;
 920 | };
 921 | struct action_chain_t {
 922 |     struct action_t actions[ACTION_CHAIN_MAX_LEN];
 923 | };
 924 | ```
 925 | 
 926 | <center><img src="/context.png" class="h-30"/></center>
 927 | <center>Fig. policy chain context</center>
 928 | 
 929 | ---
 930 | layout: two-cols
 931 | ---  
 932 | 
 933 | # Techniques in eMPTCP : Policy Chain
 934 | 
 935 | ```c
 936 | #define XDP_POLICY_PRE_SEC \
 937 |     xdp_policy_t POLICY; \
 938 |     res = xdp_get_and_pop_policy(ctx, &POLICY);\
 939 |     CHECK_RES(res);\
 940 |     __u8 NEXT_IDX = POLICY.chain.next_idx;
 941 | 
 942 | #define XDP_ACTION_POST_SEC \
 943 | next:                                   \
 944 |     if (NEXT_IDX == DEFAULT_POLICY) {\
 945 |         goto exit;                   \
 946 |     }                                \
 947 |     goto next_action;
 948 | ```
 949 | 
 950 | <center><img src="/context.png" class="h-30"/></center>
 951 | <center>Fig. policy chain context</center>
 952 | 
 953 | ::right::
 954 | 
 955 | ```c{all|4,5|10-12|13}
 956 | SEC("xdp")
 957 | int entry(struct xdp_md *ctx) {
 958 |     //read chain context policies from bpf_map(dynamically)
 959 |     int first_policy;
 960 |     res = xdp_set_policy_chain(ctx, policies, &first_policy);
 961 |     if (res < 0) {
 962 |         goto fail;
 963 |     }
 964 | 
 965 |     if (first_policy == DEFAULT_POLICY) {
 966 |         return XDP_PASS;
 967 |     }
 968 |     bpf_tail_call(ctx, &xdp_actions, first_policy);
 969 | }
 970 | ```
 971 | 
 972 | ```c{all|3|4|5|6-7}
 973 | SEC("xdp")
 974 | int subpolicy(struct xdp_md *ctx) {
 975 |     XDP_POLICY_PRE_SEC
 976 |     __u16 param = POLICY.param.imme;
 977 |     XDP_ACTION_POST_SEC
 978 | next_action:
 979 |     bpf_tail_call(ctx, &xdp_actions, NEXT_IDX);
 980 | }
 981 | ```
 982 | ---
 983 | layout: two-cols
 984 | ---  
 985 | 
 986 | # Techniques in eMPTCP : Bound-Loop
 987 | 
 988 | ### 思路
 989 | * 设置循环边界（40字节) 
 990 | * 逐字节扫描（外层for)
 991 | * 逐选项检查(SCAN_MPTCP_OPT_SUB)
 992 | * 通过指针(start/pos)记录当前扫描的位置
 993 | ```c{all|2|3|4-6|7-9|10-12|all}
 994 | #define SCAN_MPTCP_OPT_SUB(pos, de, sub){\
 995 |     struct mptcp_option *opt = (pos);\
 996 |     CHECK_BOUND(opt, (de));\
 997 |     if (opt->kind == MPTCP_KIND && opt->sub == (sub)){\
 998 |         goto found;\
 999 |     }\
1000 |     if (opt->kind == 0 || opt->kind == 1) {\
1001 |         pos += 1;\
1002 |     }\
1003 |     else {\
1004 |         pos += opt->len;\
1005 |     }\
1006 | }\
1007 | ```
1008 | 
1009 | ::right::
1010 | 
1011 | ```c{all|6|7|8|10-13|16-18|all}
1012 | static __always_inline int check_mptcp_opt(struct hdr_cursor *nh, void *data_end, int tcp_opt_len, int sub) {
1013 |     void *start = nh->pos;
1014 |     void *pos = start;
1015 |     #pragma unroll 40
1016 |     for (int index = 0; index < 40; index++) {
1017 |         int curr_idx = pos - start;
1018 |         if (curr_idx >= tcp_opt_len) goto not_exists;
1019 |         if (curr_idx == index) SCAN_MPTCP_OPT_SUB(pos, data_end, sub);
1020 |     }
1021 | found:
1022 |     //found mptcp option
1023 |     nh->pos = pos;
1024 |     return 0;
1025 | out_of_bound:
1026 |     return -1;
1027 | not_exists:
1028 |     return -2; 
1029 | }
1030 | ```
1031 | 
1032 | ### ps: 
1033 | * <kbd> CHECK_BOUND(opt, (de)); </kbd>不可省略
1034 | * 在编程时随时要验证指针有效性
1035 | * 封装API
1036 | 
1037 | ---
1038 | layout: two-cols
1039 | ---  
1040 | 
1041 | 
1042 | # Techniques in eMPTCP : Bound-Loop
1043 | 
1044 | ```c{all|2|3|4|5-6|all}
1045 | #define COPY_TCP_OPT_TO_P(index, tcp_opt_len, pkt_dst, src, de){                \
1046 |     if ((index) >= (tcp_opt_len)) goto out;                                     \
1047 |     CHECK_BOUND_BY_SIZE(pkt_dst, de, 4);                                        \
1048 |     __builtin_memcpy((void*)(pkt_dst),(void*)(src),4);                          \
1049 |     (src) = (void*)(src) + 4;                                                   \
1050 |     (pkt_dst) = (void*)(pkt_dst) + 4;                                                   \
1051 | }\
1052 | 
1053 | ```
1054 | 
1055 | **关键** 
1056 | 1. <kbd> CHECK_BOUND_BY_SIZE(pkt_dst, de, 4);</kbd>不能省略
1057 | 2. 随时检查指针有效性
1058 | 3. SCAN的方法(pkt_dst)
1059 | 4. 固定次数的循环，4字节一次，循环10次
1060 | 
1061 | ::right::
1062 | 
1063 | ```c{all|8-9|10|14|19-20|all}
1064 | static __always_inline int add_tcp_opts(struct hdr_cursor *nh, void *data_end, const void *opts, __u16 size) {
1065 |     if (opts == NULL) goto fail;
1066 |     if ((size & 0x3) != 0) {
1067 |         //size % 4 != 0
1068 |         goto fail;
1069 |     }
1070 | 
1071 |     void *pkt_dst = nh->pos;
1072 |     const void *src = opts;
1073 |     __u16 s4 = size >> 2;  
1074 | 
1075 | #pragma unroll 10
1076 |     for (int i = 0; i < 10; i++) {
1077 |         COPY_TCP_OPT_TO_P(i, s4, pkt_dst, src, data_end);
1078 |     out:
1079 |         break;
1080 |     }
1081 | 
1082 |     nh->pos = pkt_dst;
1083 |     return 0;
1084 | 
1085 | out_of_bound:
1086 | fail:
1087 |     return -1;
1088 | }
1089 | ```
1090 | 
1091 | 
1092 | ---
1093 | layout: two-cols
1094 | ---  
1095 | # Techniques in eMPTCP : Packet Manipulation
1096 | 
1097 | <center><img src="/packet_modify.png" class="h-30"/></center>
1098 | <center>Fig. packet modify</center>
1099 | <center><img src="/packet_rmopt.png" class="h-30"/></center>
1100 | <center>Fig. packet remove opt</center>
1101 | 
1102 | **方法：**
1103 | * 利用 DPA 获取选项指针
1104 | * 修改指针指向的内容
1105 | * 重新计算校验和
1106 | ::right::
1107 | 
1108 | ```c
1109 | static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend)
1110 | {
1111 | 	__u16 res = (__u16)csum;
1112 | 
1113 | 	res += (__u16)addend;
1114 | 	return (__sum16)(res + (res < (__u16)addend));
1115 | }
1116 | static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
1117 | {
1118 | 	return csum16_add(csum, ~addend);
1119 | }
1120 | static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new)
1121 | {
1122 | 	*sum = ~csum16_add(csum16_sub(~(*sum), old), new);
1123 | }
1124 | ```
1125 | 
1126 | ```c{all|3|4|5|6,7|8|all}
1127 | SEC("xdp")
1128 | int modify_recv_win(struct xdp_md *ctx) {
1129 |     XDP_POLICY_PRE_SEC
1130 |     is_tcp_packet(&nh, data_end, &eth, &iph, &tcph);
1131 |     u16 window = POLICY.param.imme;
1132 |     csum_replace2(&tcph->check, tcph->window, window); 
1133 |     tcph->window = window;
1134 |     XDP_POLICY_POST_SEC
1135 | }
1136 | ```
1137 | 
1138 | ---
1139 | layout: two-cols
1140 | ---  
1141 | # Techniques in eMPTCP : Packet Manipulation
1142 | 
1143 | ```c{all|6,7,8|9,10|12-15|16|all}
1144 | static __always_inline int xdp_grow_tcp_header(struct xdp_md *ctx, struct hdr_cursor *nh,  __u16 tcp_opt_len, int bytes, int *modified) {
1145 |     void * data = (void *)(long)ctx->data;
1146 |     void * data_end =  (void *)(long)ctx->data_end; 
1147 |     nh->pos = data;
1148 |     int res;
1149 |     struct pkt_header_buf_t buf;
1150 |     //1. store header to buf
1151 |     restore_header(&buf, nh, data_end, tcp_opt_len);
1152 |     //2. grow header
1153 |     bpf_xdp_adjust_head(ctx, -bytes);
1154 |     //3 reset data and data_end
1155 |     data =  (void *)(long)ctx->data; 
1156 |     data_end =  (void *)(long)ctx->data_end; 
1157 |     //4. recover header 
1158 |     nh->pos = data;
1159 |     recover_header(&buf, nh, data_end, tcp_opt_len);
1160 |     return 0;
1161 | }
1162 | ```
1163 | 
1164 | ::right::
1165 | 
1166 | ```c{all|2|3|4|5|6,7|all}
1167 | // add MP_PRIO 4bytes
1168 | xdp_grow_tcp_header(ctx, &nh, tcp_opt_len, sizeof(struct mp_prio), &modified);  //1 
1169 | is_tcp_packet(&new_nh, data_end, &eth, &iph, &tcph);  //2
1170 | add_tcp_opts(&nh, data_end, &prio_opt, sizeof(struct mp_prio));  //3
1171 | update_tcphlen_csum(iph, tcph, sizeof(struct mp_prio));  //4
1172 | //recompute checksum , mp_prio 4 bytes
1173 | add_tcpopt_csum(&tcph->check, &prio_opt, sizeof(struct mp_prio)); //5
1174 | ```
1175 | <center><img src="/packet_insertopt.png" class="h-30"/></center>
1176 | <center>Fig. insert tcp opt</center>
1177 | 
1178 | <style>
1179 |   li {
1180 |     font-size:12px;
1181 |   }
1182 | </style>
1183 | 
1184 | **要点**： 
1185 | 1. BPF没有提供直接扩展tcp头部的帮助函数，因此最关键的是实现增长TCP头部空间
1186 | 2. move forward 也可以采用逐字节移动的方式(to be tested)
1187 | 3. 如果修改了包长度，一定要记得更新ip头部和TCP伪头部校验和。
1188 | 
1189 | ---
1190 | layout: two-cols
1191 | ---  
1192 | 
1193 | # My eBPF Lib 
1194 | ## Features
1195 | * 利用python 和 C 混编封装了 libbpf API
1196 | * 使用异常的风格来进行错误处理
1197 | * 封装了加载eBPF object的类
1198 | * 封装了perf_output 
1199 | * 其它方便使用eBPF的API
1200 | 
1201 | ```python
1202 | #python func wrapper for easy usage 
1203 | lib = ct.CDLL(CONFIG.libbpf_path, use_errno = True)
1204 | lib.bpf_obj_pin.restype = ct.c_int
1205 | lib.bpf_obj_pin.argtypes = [ct.c_int, ct.c_char_p]
1206 | def bpf_obj_pin(fd, pathname):
1207 |     '''
1208 |     @param:
1209 |         fd: bpf object fd 
1210 |         pathname : path in bpf virtual file system(str)
1211 |     '''
1212 |     res = lib.bpf_obj_pin(ct.c_int(fd), pathname.encode(encoding = "utf-8"))
1213 |     check_res("bpf_obj_pin", res)
1214 | ```
1215 | 
1216 | ::right::
1217 | **使用json描述bpf object对象**
1218 | ```json
1219 | XDP_ACTION_ENTRY = {
1220 |     "obj_path" : "path of xdp_action.o",
1221 |     "progs" : {
1222 |         "action_entry" : {
1223 |             "prog_type" : BPF_PROG_TYPE_XDP
1224 |         }
1225 |     },
1226 |     "pin_maps" : {
1227 |         "xdp_actions" : {
1228 |             "pin_path" : "path of xdp_actions",
1229 |             "flag" : BPFLoaderBase.PIN_MAP_FLAG.PIN_IF_NOT_EXIST
1230 |     }
1231 | }
1232 | ```
1233 | 
1234 | 
1235 | **方便的加载API**
1236 | ```python
1237 | #loader BPFObjectLoader / BPFBCCLoader
1238 | with load(bpf_obj, loader, unpin_only_fail = True) as entry:
1239 |    bpf_xdp_attach(if_nametoindex(interface), \
1240 |       entry.get_prog_fd("action_entry"), XDP_FLAGS.XDP_FLAGS_UPDATE_IF_NOEXIST, ct.c_void_p(None))  
1241 | ```
1242 | 
1243 | ---
1244 | layout: two-cols
1245 | --- 
1246 | 
1247 | ```python
1248 | #create selector chain
1249 | sc = SelectorChain()
1250 | sc.add("ip_pair", SELECTOR_AND).\
1251 |    add("service", SELECTOR_AND)
1252 | #create action chain
1253 | ac = ActionChain()
1254 | ac.add("add_subflow").add("redirect")
1255 | #create policy chain
1256 | pc = PolicyChain(sc, ac)
1257 | #apply policy chain
1258 | pc.select(0,local_addr = "10.200.0.2",\
1259 | remote_addr = "10.200.1.2").set(1, "spark")
1260 | ```
1261 | 
1262 | ::right::
1263 | 
1264 | ```c
1265 | #include "emptcp_utils.h"
1266 | #include "emptcp_common.h"
1267 | SEC("xdp")
1268 | int your_own_policy(struct xdp_md *ctx) 
1269 | {
1270 |     SELECTOR_PRE_SEC 
1271 | 
1272 | /*your own codes*/ 
1273 | 
1274 |     SELECTOR_POST_SEC
1275 | }
1276 | ```
1277 | ---
1278 | 
1279 | # Future Work
1280 | 
1281 | 1. 首要工作是完善目前的eMPTCP
1282 |    * 完善代码
1283 |    * innetwork-computing
1284 |    * packet 伪装
1285 |    * 填坑
1286 | 
1287 | 2. AF_XDP
1288 |    * AF_XDP作为一种 kernel-bypass 的手段
1289 |    * libxdp 
1290 | 
1291 | 3. eBPF
1292 |    * 其它类型的eBPF程序(比如SOCK_OPS, STRUCT_OPS)，寻找新的use case
1293 |    * 看一下eBPF源代码，看看能不能在MAP存储等方面进行优化(JIT以上，系统调用以下的优化空间)
1294 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901150942553.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901150942553.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901152829447.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901152829447.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901155226898.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901155226898.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901160259370.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901160259370.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901161031445.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901161031445.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901163611445.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901163611445.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901164811553.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901164811553.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165036148.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165036148.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165204233.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165204233.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165233885.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165233885.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165341202.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165341202.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165558510.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165558510.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165809164.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901165809164.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901170115110.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901170115110.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901170432600.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901170432600.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901171034117.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.assets/image-20220901171034117.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_kernel_document_note/eBPF_kernel_document_note.md:
--------------------------------------------------------------------------------
  1 | ## eBPF内核文档阅读笔记(based on linux 5.19)
  2 | 
  3 | ### 验证器
  4 | 
  5 | **不能load未赋值的寄存器** 
  6 | 
  7 | ![image-20220901165036148](./eBPF_kernel_document_note.assets/image-20220901165036148.png)
  8 | 
  9 | **load/store指令只能用于特定的寄存器类型** 
 10 | 
 11 | ![image-20220901150942553](./eBPF_kernel_document_note.assets/image-20220901150942553.png)
 12 | 
 13 | **不同的eBPF程序支持不同的函数集**
 14 | 
 15 | ![image-20220901152829447](./eBPF_kernel_document_note.assets/image-20220901152829447.png)
 16 | 
 17 | **eBPF程序会追踪每一个寄存器的取值范围，包括标量的取值和指针的Offset** 
 18 | 
 19 | ![image-20220901155226898](./eBPF_kernel_document_note.assets/image-20220901155226898.png)
 20 | 
 21 | **eBPF程序对于指针寄存器的variable offset 保存id, 共享检查状态** (暂时这么理解)
 22 | 
 23 | 暂时理解为，如果一个拥有 variable offset 的指针寄存器 check了，它的copy由于拥有相同的Id,也被check了
 24 | 
 25 | 
 26 | 
 27 | ![image-20220901160259370](./eBPF_kernel_document_note.assets/image-20220901160259370.png)
 28 | 
 29 | **eBPF验证器使用Directly Packet Access验证的例子** 
 30 | 
 31 | ![image-20220901161031445](./eBPF_kernel_document_note.assets/image-20220901161031445.png)
 32 | 
 33 | 这里也体现了，验证了R5之后 R3也一起被验证了
 34 | 
 35 | **Directly Memory Access的验证范围问题(很重要感觉经常犯错出现）**
 36 | 
 37 | ![image-20220901163611445](./eBPF_kernel_document_note.assets/image-20220901163611445.png)
 38 | 
 39 | * 意味着 Directly Memory Access 在验证的时候是可以使用变量的，也就是上面提到的 variable offset 
 40 | * 但是 rx不能大于 16 字节，否则会导致溢出，r3 < data ，导致无法验证范围，这也是 **invalid access to packet** 的报错原因之一。
 41 | 
 42 | **eBPF验证器的剪枝逻辑**
 43 | 
 44 | 如果之前验证过了没问题，那么在同等条件或者更严格条件下进行验证也没有问题
 45 | 
 46 | `regsafe().  states_equal().`
 47 | 
 48 | ### eBPF验证器常见错误
 49 | 
 50 | **不可达的指令** 
 51 | 
 52 | (如果是用 pseudo-C 编写，然后用 LLVM 编译器编译，一般不会出现这样的事情)
 53 | 
 54 | ![image-20220901165204233](./eBPF_kernel_document_note.assets/image-20220901165204233.png)
 55 | 
 56 | **read没有初始化的寄存器** 
 57 | 
 58 | ![image-20220901165233885](./eBPF_kernel_document_note.assets/image-20220901165233885.png)
 59 | 
 60 | **没有返回值，没有设置R0** 
 61 | 
 62 | ![image-20220901165341202](./eBPF_kernel_document_note.assets/image-20220901165341202.png)
 63 | 
 64 | **Stack指针越界** 
 65 | 
 66 | R10(帧指针)指向栈帧的最高地址，栈向低地址增长，所以相对 R10 的 offset 范围是 [-MAX_BPF_STACK, 0)
 67 | 
 68 | **堆栈没有赋值就传递地址** 
 69 | 
 70 | (常见，struct 没有用 __builtin_memset初始化)
 71 | 
 72 | ![image-20220901165558510](./eBPF_kernel_document_note.assets/image-20220901165558510.png)
 73 | 
 74 | **BPF map fd为0**
 75 | 
 76 | ![image-20220901165809164](./eBPF_kernel_document_note.assets/image-20220901165809164.png)
 77 | 
 78 | **没有检查bpf_map_lookup 的返回值** 
 79 | 
 80 | 原因是执行lookup 之后会把寄存器的指针类型设置为 **PTR_TO_MAP_VALUE_OR_NULL**，在判空之前不允许对该指针执行算术操作或通过它访问内存，即 5里对 r0 + 0 的访问
 81 | 
 82 | ![image-20220901170115110](./eBPF_kernel_document_note.assets/image-20220901170115110.png)
 83 | 
 84 | **访问没有对齐** 
 85 | 
 86 | 这里访问 8 个字节，但是地址却是 r0 + 4，没有按 8 字节对齐（正确的对齐例如 r0 + 8）
 87 | 
 88 | 所以说 eBPF也会检查访问的对齐
 89 | 
 90 | ![image-20220901170432600](./eBPF_kernel_document_note.assets/image-20220901170432600.png)
 91 | 
 92 | **Socket Ptr相关** 
 93 | 
 94 | ![image-20220901171034117](./eBPF_kernel_document_note.assets/image-20220901171034117.png)
 95 | 
 96 | socket Ptr check之后 reference 会加一。这里没有check直接设置为NULL 可能导致 引用泄漏(个人理解)
 97 | 
 98 | 同样的没有check也会报错。
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_ARRAY.assets/image-20221226102621676.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_ARRAY.assets/image-20221226102621676.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_ARRAY.md:
--------------------------------------------------------------------------------
  1 | # Map Ops of BPF_MAP_TYPE_ARRAY
  2 | 
  3 | ## 数据结构
  4 | 
  5 | ### array_map 实现的ops
  6 | 
  7 | ![image-20221226102621676](Map_Ops_BPF_MAP_TYPE_ARRAY.assets/image-20221226102621676.png)
  8 | 
  9 | ### bpf_array
 10 | 
 11 | ```c
 12 | 
 13 | struct bpf_array {
 14 | 	struct bpf_map map;
 15 | 	u32 elem_size;
 16 | 	u32 index_mask;
 17 | 	struct bpf_array_aux *aux;
 18 | 	union {
 19 | 		char value[0] __aligned(8);
 20 | 		void *ptrs[0] __aligned(8);
 21 | 		void __percpu *pptrs[0] __aligned(8);
 22 | 	};
 23 | };
 24 | ```
 25 | 
 26 | ### bpf_array_aux
 27 | 
 28 | ```c
 29 | struct bpf_array_aux {
 30 | 	/* Programs with direct jumps into programs part of this array. */
 31 | 	struct list_head poke_progs;
 32 | 	struct bpf_map *map;
 33 | 	struct mutex poke_mutex;
 34 | 	struct work_struct work;
 35 | };
 36 | ```
 37 | 
 38 | ## 代码逻辑
 39 | 
 40 | ### **array_map_alloc_check**
 41 | 
 42 | 该函数比较简单，负责对MAP创建的flags进行检查。部分类型的FLAG只有部分类型的MAP才能使用
 43 | 
 44 | ```c
 45 | int array_map_alloc_check(union bpf_attr *attr)
 46 | {
 47 | 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 48 | 	int numa_node = bpf_map_attr_numa_node(attr);
 49 | 
 50 | 	/* check sanity of attributes */
 51 | 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 52 | 	    attr->value_size == 0 ||
 53 | 	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 54 | 	    !bpf_map_flags_access_ok(attr->map_flags) ||
 55 | 	    (percpu && numa_node != NUMA_NO_NODE))
 56 | 		return -EINVAL;
 57 | 
 58 | 	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
 59 | 	    attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
 60 | 		return -EINVAL;
 61 | 
 62 | 	if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
 63 | 	    attr->map_flags & BPF_F_PRESERVE_ELEMS)
 64 | 		return -EINVAL;
 65 | 
 66 | 	if (attr->value_size > KMALLOC_MAX_SIZE)
 67 | 		/* if value_size is bigger, the user space won't be able to
 68 | 		 * access the elements.
 69 | 		 */
 70 | 		return -E2BIG;
 71 | 
 72 | 	return 0;
 73 | }
 74 | ```
 75 | 
 76 | ### array_map_alloc
 77 | 
 78 | ->  `bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;`
 79 | 
 80 | -> `int numa_node = bpf_map_attr_numa_node(attr);`
 81 | 
 82 | -> `u32 elem_size, index_mask, max_entries; u64 array_size, mask64; struct bpf_array *array; `   声明临时变量
 83 | 
 84 | -> `bool bypass_spec_v1 = bpf_bypass_spec_v1();` **??这个bypass_spec_v1有什么用**
 85 | 
 86 | ​	--> `return perfmon_capable();`
 87 | 
 88 | -> `elem_size = round_up(attr->value_size, 8);`  和BPF_HASH类似，向上取整到8的倍数
 89 | 
 90 | -> `max_entries = attr->max_entries;` 
 91 | 
 92 | -> manually `roundup_pow_of_two()` 
 93 | 
 94 | ​	--> `mask64 = fls_long(max_entries - 1);`  
 95 | 
 96 | ​	--> `mask64 = 1ULL << mask64;` 
 97 | 
 98 | ​	-->`mask64 -= 1;`
 99 | 
100 | -> `index_mask = mask64` 
101 | 
102 | -> `if  (!bypass_spec_v1)  ` **??why**
103 | 
104 | ​	--> `max_entries = index_mask + 1;`
105 | 
106 | ​	--> `check for overflow` 
107 | 
108 | -> `array_size = sizeof(*array);` 
109 | 
110 | -> `if (percpu) `  计算percpu array 的 array size 
111 | 
112 | ​	--> `array_size += (u64) max_entries * sizeof(void *);`    和 BPF_HASH类似，如果类型是percpu的话，那么value size是一个指针
113 | 
114 | -> `else`  非percpu array 的array size
115 | 
116 | ​	--> `if (attr->map_flags & BPF_F_MMAPABLE)` **对于mmapable的需要地址根据page size对齐**
117 | 
118 | ​		---> `array_size = PAGE_ALIGN(array_size);`
119 | 
120 | ​		---> `array_size += PAGE_ALIGN((u64) max_entries * elem_size);` 
121 | 
122 | ​	--> `else` 
123 | 
124 | ​		---> `array_size += (u64) max_entries * elem_size;` 
125 | 
126 | -> `if (attr->map_flags & BPF_F_MMAPABLE) `  分配 mmapable的memory 
127 | 
128 | ​	--> `void *data = bpf_map_area_mmapable_alloc(array_size, numa_node);`  (size 可以直接使用)
129 | 
130 | ​	--> `array = data + PAGE_ALIGN(sizeof(struct bpf_array)) -  offsetof(struct bpf_array, value);` 因为在前面计算大小的时候，对sizeof(array) 按照 page_size进行对齐，所以这里选择从第二个page作为array value的开始，从而计算出array的实际地址(浪费了一部分内存, 相对应的在free的时候需要根据 array的地址 计算出 data的地址)
131 | 
132 | -> `else` 
133 | 
134 | ​	--> `array = bpf_map_area_alloc(array_size, numa_node);` 
135 | 
136 | -> `array->index_mask = index_mask;` 
137 | 
138 | -> `array->map.bypass_spec_v1 = bypass_spec_v1;` 
139 | 
140 | -> `bpf_map_init_from_attr(&array->map, attr);` 设置通用的 map属性，**自己写map的时候 最后也记得调用一下这个帮助函数**
141 | 
142 | -> `array->elem_size = elem_size;` 
143 | 
144 | -> `if (percpu && bpf_array_alloc_percpu(array))` 
145 | 
146 | ​	--> `bpf_array_alloc_percpu(array)`
147 | 
148 | ​		---> `for (i = 0; i < array->map.max_entries; i++) ` 
149 | 
150 | ​			----> `ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, GFP_USER | __GFP_NOWARN);` 分配 percpu内存
151 | 
152 | ​			----> `	array->pptrs[i] = ptr;`  记录 offset  
153 | 
154 | -> `return &array->map` 
155 | 
156 | ### array_map_free
157 | 
158 | -> `struct bpf_array *array = container_of(map, struct bpf_array, map);` 
159 | 
160 | -> free kptrs in maps  **释放map里保存的 kptr**  **??kptr机制**
161 | 
162 | -> `if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)` 
163 | 
164 | ​	--> `bpf_array_free_percpu(array);` 
165 | 
166 | ​		---> `for (i = 0; i < array->map.max_entries; i++) ` 
167 | 
168 | ​			----> `free_percpu(array->pptrs[i]);` **调用 free_percpu 来释放 alloc的percpu变量**
169 | 
170 | ​			----> `cond_resched()` 
171 | 
172 | -> `if (array->map.map_flags & BPF_F_MMAPABLE)` 
173 | 
174 | ​	--> `bpf_map_area_free(array_map_vmalloc_addr(array));` 
175 | 
176 | ​		---> `(void *)round_down((unsigned long)array, PAGE_SIZE);`  根据array的地址按照page_size向下取整计算一开始分配的data 
177 | 
178 | -> `else` 
179 | 
180 | ​	--> `bpf_map_area_free(array);` **可以调用该函数来释放分配的内存**
181 | 
182 | ### array_map_lookup_elem
183 | 
184 | -> `struct bpf_array *array = container_of(map, struct bpf_array, map);` 
185 | 
186 | -> `u32 index = *(u32 *)key;` 获取array的index 
187 | 
188 | -> ` return array->value + array->elem_size * (index & array->index_mask);`  返回array的地址 可以看到是没有做什么并发保护的，并且也没有使用RCU机制，**??因此推测还是必须要使用 bpf_spin_lock才能保证同步**
189 | 
190 | ### percpu_array_map_lookup_elem
191 | 
192 | ### percpu_array_map_lookup_percpu_elem
193 | 
194 | ### array_map_update_elem
195 | 
196 | -> `struct bpf_array *array = container_of(map, struct bpf_array, map);`
197 | 
198 | -> `u32 index = *(u32 *)key; char *val;`
199 | 
200 | -> `if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)` 
201 | 
202 | ​	--> `memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size);` `this_cpu_ptr`将offset转化为 percpu section里的指针
203 | 
204 | -> `else` 
205 | 
206 | ​	--> `val = array->value + array->elem_size * (index & array->index_mask);` 计算指针的位置
207 | 
208 | ​	-->`if (map_flags & BPF_F_LOCK)`
209 | 
210 | ​		---> `copy_map_value_locked(map, val, value, false);`  
211 | 
212 | ​			----> `lock = dst + map->spin_lock_off;`  使用保存在map中的 spinlock
213 | 
214 | ​			----> `preempt_disable();` 关闭抢占
215 | 
216 | ​			----> `__bpf_spin_lock_irqsave(lock);` 
217 | 
218 | ​			----> `copy_map_value(map, dst, src);`  **之后可以调用该函数来复制**
219 | 
220 | ​			----> `__bpf_spin_unlock_irqrestore(lock);` 
221 | 
222 | ​			----> `preempt_enable();` 
223 | 
224 | ​	--> `else` 
225 | 
226 | ​		---> `copy_map_value(map, val, value);` 
227 | 
228 | ​	--> `	check_and_free_fields(array, val);`  **释放 timer 和 kptr**
229 | 
230 | -> `return 0` 
231 | 
232 | ### array_map_delete_elem 
233 | 
234 | ```c
235 | static int array_map_delete_elem(struct bpf_map *map, void *key)
236 | {
237 | 	return -EINVAL;
238 | }
239 | ```
240 | 
241 | ### array_map_check_btf
242 | 
243 | ### array_map_seq_show_elem 
244 | 
245 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221109162954511.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221109162954511.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221130164200740.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221130164200740.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221130165305704.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221130165305704.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221201161747521.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221201161747521.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221201161850374.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221201161850374.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_HASH .md:
--------------------------------------------------------------------------------
  1 | # Map Ops of BPF_MAP_TYPE_HASH 
  2 | 
  3 | ## 数据结构
  4 | 
  5 | bpf_map_ops 定义在 `./kernel/bpf/hashtab.c` 中
  6 | 
  7 | ![image-20221109162954511](Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221109162954511.png)
  8 | 
  9 | 从 BTF_ID_LIST_SINGLE 来看， BPF_MAP_TYPE_HASH 对应一种 BTF id (BTF 文档里的最后一部分，有时间再check一下), 同一类的内核数据结构的BTF可以使用链表或者 set 组织起来。
 10 | 
 11 | ### struct bucket
 12 | 
 13 | ```c 
 14 | struct bucket {
 15 | 	struct hlist_nulls_head head;
 16 | 	union {
 17 | 		raw_spinlock_t raw_lock;
 18 | 		spinlock_t     lock;
 19 | 	};
 20 | };
 21 | ```
 22 | 
 23 | 对hash进行同步操作的时候，对bucket上锁？
 24 | 
 25 | ### struct bpf_htab 
 26 | 
 27 | ```c
 28 | struct bpf_htab {
 29 | 	struct bpf_map map;
 30 | 	struct bucket *buckets;
 31 | 	void *elems;
 32 | 	union {
 33 | 		struct pcpu_freelist freelist;
 34 | 		struct bpf_lru lru;
 35 | 	};
 36 | 	struct htab_elem *__percpu *extra_elems;
 37 | 	atomic_t count;	/* number of elements in this hashtable */
 38 | 	u32 n_buckets;	/* number of hash buckets */
 39 | 	u32 elem_size;	/* size of each element in bytes */
 40 | 	u32 hashrnd;
 41 | 	struct lock_class_key lockdep_key;
 42 | 	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
 43 | };
 44 | 
 45 | ```
 46 | 
 47 | **struct lock_class_key lockdep_key;** 
 48 | 
 49 | 动态锁，在使用之前需要先调用`lockdep_register_key(&htab->lockdep_key);`   注册。在free之前需要调用 `lockdep_unregister_key(&htab->lockdep_key);` 
 50 | 
 51 | **u32 hashrnd** 
 52 | 
 53 | 哈希表的随机种子。如果设置了`BPF_F_ZERO_SEED`那么 hashrnd 为 0 
 54 | 
 55 | **struct htab_elem *__percpu *extra_elems;**
 56 | 
 57 | 对于BPF_MAP类型是 `BPF_MAP_TYPE_PERCPU_HASH` `BPF_MAP_TYPE_LRU_PERCPU_HASH` `BPF_MAP_TYPE_LRU_HASH` **不使用该字段**
 58 | 
 59 | 只给普通的HASH使用.
 60 | 
 61 | 在 bpf_map_update的时候使用
 62 | 
 63 | extra_elems是一个指向 struct htab_elem* 的指针（指向指针的指针）。其指向的对象 struct htab_elem* 是一个percpu 对象
 64 | 
 65 | **struct pcpu_freelist freelist;** 
 66 | 
 67 | 由非LRU类型的HASH使用
 68 | 
 69 | ### struct pcpu_freelist
 70 | 
 71 | ```c
 72 | struct pcpu_freelist {
 73 | 	struct pcpu_freelist_head __percpu *freelist;
 74 | 	struct pcpu_freelist_head extralist;
 75 | };
 76 | ```
 77 | 
 78 | ### struct pcpu_freelist_head 
 79 | 
 80 | ```c
 81 | struct pcpu_freelist_head {
 82 | 	struct pcpu_freelist_node *first;
 83 | 	raw_spinlock_t lock;
 84 | };
 85 | ```
 86 | 
 87 | **lock** 以list为粒度的锁
 88 | 
 89 | ### struct pcpu_freelist_node
 90 | 
 91 | ```c
 92 | struct pcpu_freelist_node {
 93 | 	struct pcpu_freelist_node *next;
 94 | };
 95 | ```
 96 | 
 97 | ### struct htab_elem
 98 | 
 99 | ```c
100 | /* each htab element is struct htab_elem + key + value */
101 | struct htab_elem {
102 | 	union {
103 | 		struct hlist_nulls_node hash_node;
104 | 		struct {
105 | 			void *padding;
106 | 			union {
107 | 				struct bpf_htab *htab;
108 | 				struct pcpu_freelist_node fnode;
109 | 				struct htab_elem *batch_flink;
110 | 			};
111 | 		};
112 | 	};
113 | 	union {
114 | 		struct rcu_head rcu;
115 | 		struct bpf_lru_node lru_node;
116 | 	};
117 | 	u32 hash;
118 | 	char key[] __aligned(8);
119 | };
120 | ```
121 | 
122 | union 排布： 
123 | 
124 | ![image-20221130165305704](Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221130165305704.png)
125 | 
126 | **char key[] __aligned(8);** 
127 | 
128 | 保存着实际的Key和value
129 | 
130 | 对于PERCPU类型的map来说，value是一个指向percpu指针的指针 
131 | 
132 | ### struct bpf_lru 
133 | 
134 | ```c
135 | struct bpf_lru {
136 | 	union {
137 | 		struct bpf_common_lru common_lru;
138 | 		struct bpf_lru_list __percpu *percpu_lru;
139 | 	};
140 | 	del_from_htab_func del_from_htab;
141 | 	void *del_arg;
142 | 	unsigned int hash_offset;
143 | 	unsigned int nr_scans;
144 | 	bool percpu;
145 | };
146 | ```
147 | 
148 | ### struct bpf_common_lru 
149 | 
150 | ```c
151 | struct bpf_common_lru {
152 | 	struct bpf_lru_list lru_list;
153 | 	struct bpf_lru_locallist __percpu *local_list;
154 | };
155 | ```
156 | 
157 | ### struct bpf_lru_list 
158 | 
159 | ```c
160 | struct bpf_lru_list {
161 | 	struct list_head lists[NR_BPF_LRU_LIST_T];  //3
162 | 	unsigned int counts[NR_BPF_LRU_LIST_COUNT];  //2
163 | 	/* The next inactive list rotation starts from here */
164 | 	struct list_head *next_inactive_rotation;
165 | 
166 | 	raw_spinlock_t lock ____cacheline_aligned_in_smp;
167 | };
168 | ```
169 | 
170 | ### struct bpf_lru_locallist 
171 | 
172 | ```c
173 | struct bpf_lru_locallist {
174 | 	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
175 | 	u16 next_steal;
176 | 	raw_spinlock_t lock;
177 | };
178 | ```
179 | 
180 | ## 函数逻辑
181 | 
182 | ### htab_map_alloc
183 | 
184 | `static struct bpf_map *htab_map_alloc(union bpf_attr *attr)`
185 | 
186 | -> `bool percpu = map_type == BPF_MAP_TYPE_PERCPU_HASH || map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH` 
187 | 
188 | -> `bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);`
189 | 
190 | -> `bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);`  ps : percpu_lru指的是每一个CPU都有单独的LRU_LIST
191 | 
192 | ->`struct bpf_htab *htab; htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT)`  分配结构体内存
193 | 
194 | ->`lockdep_register_key(&htab->lockdep_key);`   注册锁，这个地方涉及到同步和死锁检测暂时先放着。
195 | 
196 | -> `bpf_map_init_from_attr(&htab->map, attr);`  初始化, key_size、value_size等简单的属性，**在自己写map的时候可以调用该函数**
197 | 
198 | ->`if (percpu_lru)` 
199 | 
200 | ​	--> `htab->map.max_entries = roundup(attr->max_entries, num_possible_cpus());` 将max_entries 向上取整为 num_possible_cpus的整数倍。目的是确保每一个 cpu上的 lru list的元素个数相同，并且至少有一个元素 (ensure each CPU's lru list has >=1 elements). 
201 | 
202 | ​	--> `if (htab->map.max_entries < attr->max_entries)` 应该是处理可能的溢出问题
203 | 
204 | ​		---> `htab->map.max_entries = rounddown(attr->max_entries, num_possible_cpus());`
205 | 
206 | -> `	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);`  /* hash table size must be power of 2 */. 所以桶的个数 >= max_entries 
207 | 
208 | -> `htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8);`  **key按照8个字节对齐**
209 | 
210 | -> `if (percpu) ? htab->elem_size += sizeof(void *); : htab->elem_size += round_up(htab->map.value_size, 8);   ` 如果是percpu类型的变量，其value是一个指向percpu指针的指针。(见prealloc_init说明)
211 | 
212 | -> `if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) err`**之后要根据bucket的总空间分配大小，这里检查总大小是否越界。也就是说HASH_MAP总个数存在着一个理论上限** 
213 | 
214 | -> `htab->buckets = bpf_map_area_alloc(htab->n_buckets *
215 |  sizeof(struct bucket), htab->map.numa_node);` 为hash表的桶分配空间
216 | 
217 | -> `for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) `  初始化 `bpf_htab->map_locked` 每一个CPU一把锁，**??该Lock的作用**
218 | 
219 | ​	--> `htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, sizeof(int), sizeof(int), GFP_USER)`  **之后给BPF_MAP分配per_cpu内存可以调用这个方法**
220 | 
221 | -> `htab->hashrnd = (htab->map.map_flags & BPF_F_ZERO_SEED) ? 0 : get_random_int();`   **内核里使用随机数，可以使用get_random_int**
222 | 
223 | -> `	htab_init_buckets(htab);` 初始化 buckets
224 | 
225 | -> `if (!(attr->map_flags & BPF_F_NO_PREALLOC))` `if (prealloc)` 正常我们用的时候不会设置 `BPF_F_NO_PREALLOC` 所以这段逻辑正常会被调用,  这里的 prealloc指的应该是是否要预先把BPF_MAP所需要的内存都分配好。
226 | 
227 | ​	--> ` err = prealloc_init(htab);` 见`prealloc_init` 函数逻辑。分配hash其余空间
228 | 
229 | ​	--> `if (!percpu && !lru) err = alloc_extra_elems(htab);  `
230 | 
231 | ​		---> `struct htab_elem *__percpu *pptr`
232 | 
233 | ​		---> `struct htab_elem *l_new` 
234 | 
235 | ​		---> `struct pcpu_freelist_node *l;` 
236 | 
237 | ​		---> `pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, GFP_USER | __GFP_NOWARN);`
238 | 
239 | ​		---> `for_each_possible_cpu(cpu)`
240 | 
241 | ​			----> `l = pcpu_freelist_pop(&htab->freelist);`尝试从每一个CPU的freelist 以及全部CPU共享的extra   list 中 pop**一个** elem(fnode)。这里的pop一定会成功，因为在此之前，prealloc_init 额外分配了CPU数目的 elem ,pop will succeed, since prealloc_init() preallocated extra num_possible_cpus elements
242 | 
243 | ​			----> `*per_cpu_ptr(pptr, cpu) = l_new;` 保存该CPU的extra_elem 
244 | 
245 | ​		---> `htab->extra_elems = pptr;`
246 | 
247 | ### prealloc_init 
248 | 
249 | `static int prealloc_init(struct bpf_htab *htab)` 
250 | 
251 | -> `u32 num_entries = htab->map.max_entries;` 
252 | 
253 | -> `if (htab_has_extra_elems(htab))`
254 | 
255 | ​	--> `htab_has_extra_elems(htab)` 
256 | 
257 | ​		---> `return !htab_is_percpu(htab) && !htab_is_lru(htab);`  对于BPF_MAP类型是 `BPF_MAP_TYPE_PERCPU_HASH` `BPF_MAP_TYPE_LRU_PERCPU_HASH` `BPF_MAP_TYPE_LRU_HASH` **不需要用到** extra_elems 这些数据, 也就是只有最普通的HASH需要用到extra elems 
258 | 
259 | ​	--> `num_entries += num_possible_cpus();`   **在alloc_extra_elems中使用，由正常的HASH使用，用来作为填充各个CPU的freelist的“终结值"**
260 | 
261 | -> `htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, htab->map.numa_node);` 分配hash实际elem所占用的空间, **从这里也可得知,elem和buckets的个数基本呈1:1的关系（不完全相同）**
262 | 
263 | -> `if (htab_is_percpu)`  为 percpu类型的hash tab分配percpu空间
264 | 
265 | ​	--> `for (i = 0; i < num_entries; i++) `
266 | 
267 | ​		---> `u32 size = round_up(htab->map.value_size, 8);`  value size 和 key size都需要8字节对齐
268 | 
269 | ​		---> `void __percpu *pptr;` 指针是percpu的，也就是说每一个CPU都保存着一份 pptr
270 | 
271 | ​		---> `pptr = bpf_map_alloc_percpu(&htab->map, size, 8, GFP_USER | __GFP_NOWARN);`
272 | 
273 | ​		--->`htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr);` 
274 | 
275 | ​			---->`*(void __percpu **)(l->key + key_size) = pptr;`  1.对于PERCPU类型的map来说，value是一个指向percpu指针的指针(其指向的对象是一个percpu指针) 2. 这里的强转告诉编译器要从 percpu section 里获取地址。之所以要使用双重指针，是因为 bpf_map_alloc分配空间的时候，并不是分配percpu的空间。因此该指针是一个普通的指针。该指针指向一个percpu指针。通过这个percpu指针就能拿到分配给每一个CPU的elem了。
276 | 
277 | ​		---> `cond_resched();` **??主动让出CPU，防止占用太长时间的CPU**
278 | 
279 | -> `if (htab_is_lru(htab))`
280 | 
281 | ​	--> `err = bpf_lru_init(&htab->lru,  htab->map.map_flags & BPF_F_NO_COMMON_LRU, offsetof(struct htab_elem, hash) - offsetof(struct htab_elem, lru_node), htab_lru_map_delete_node, htab);`  ??lru部分先跳过
282 | 
283 | -> `else`
284 | 
285 | ​	--> `err = pcpu_freelist_init(&htab->freelist);` 
286 | 
287 | ​		---> `int cpu;`
288 | 
289 | ​		---> `s->freelist = alloc_percpu(struct pcpu_freelist_head);`   freelist 是一个 percpu类型的变量，表示每一个CPU都有一条list 
290 | 
291 | ​		---> `for_each_possible_cpu(cpu) ` 
292 | 
293 | ​			----> `struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);`  **可以考虑调用 per_cpu_ptr来获取percpu指针**
294 | 
295 | ​			----> `raw_spin_lock_init(&head->lock); head->first = NULL;` 初始化锁和list 
296 | 
297 | ​		---> `raw_spin_lock_init(&s->extralist.lock); s->extralist.first = NULL` 初始化 htab->freelist.extralist 
298 | 
299 | -> `if (htab_is_lru(htab))`
300 | 
301 | ​	--> `bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node),htab->elem_size, num_entries);` **??lru部分先跳过**
302 | 
303 | -> `else` 
304 | 
305 | ​	--> `pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode),  htab->elem_size, num_entries);`  对正常类型的HASH填充 freelist 
306 | 
307 | ​		---> `void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,u32 nr_elems)` `buf`的地址是htab_elem的fnode
308 | 
309 | ​		---> `struct pcpu_freelist_head *head;` 
310 | 
311 | ​		---> `int i, cpu, pcpu_entries;` 
312 | 
313 | ​		---> `pcpu_entries = nr_elems / num_possible_cpus() + 1;`  每一个CPU上的 s->freelist（每一个CPU一条list）都通过 fnode保存着一部分（pcpu_entries)的elems 
314 | 
315 | ​		---> `for_each_possible_cpu(cpu) `
316 | 
317 | ​			----> `again` 
318 | 
319 | ​			----> `head = per_cpu_ptr(s->freelist, cpu);`  获取链表头，从头插入
320 | 
321 | ​			---->  `pcpu_freelist_push_node(head, buf);`  buf指向要插入结点的 fnode的地址
322 | 
323 | ​			----> `i++; buf += elem_size;` 下一个 elem 
324 | 
325 | ​			----> `if (i == nr_elems) break;` 遍历完成所有的elem
326 | 
327 | ​			----> `if (i % pcpu_entries) goto again` 还未遍历完该CPU的elems，每一个CPU负责一部分的elems 
328 | 
329 | ### htab_map_update_elem 
330 | 
331 | `static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)` 
332 | 
333 | 
334 | 
335 | ## 编程技巧
336 | 
337 | ### 使用双重指针实现percpu变量
338 | 
339 | ![image-20221201161850374](Map_Ops_BPF_MAP_TYPE_HASH .assets/image-20221201161850374.png)
340 | 
341 | ​		
342 | 
343 | 
344 | 
345 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221109163807298.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221109163807298.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110172442249.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110172442249.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110172608309.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110172608309.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110192516048.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110192516048.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115212332238.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115212332238.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115212335463.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115212335463.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115213420120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115213420120.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/Map_Ops_BPF_MAP_TYPE_STRUCT_OP.md:
--------------------------------------------------------------------------------
  1 | # Map Ops of BPF_MAP_TYPE_STRUCT_OP 
  2 | 
  3 | ## 数据结构
  4 | 
  5 | ### 全局数据 bpf_struct_ops 
  6 | 
  7 | `./kernel/bpf/bpf_struct_ops.c` 
  8 | 
  9 | 保存所有的 bpf_struct_op， 按照BTF id 进行索引
 10 | 
 11 | ![image-20221110172442249](Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110172442249.png)
 12 | 
 13 | ![image-20221110172608309](Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110172608309.png)
 14 | 
 15 | 目前只实现了 congestion_ops
 16 | 
 17 | map_ops实现在 `./kernel/bpf/bpf_struct_ops.c`
 18 | 
 19 | ![image-20221109163807298](Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221109163807298.png)
 20 | 
 21 | ### 全局变量 bpf_struct_ops_link_lops
 22 | 
 23 | ```c
 24 | const struct bpf_link_ops bpf_struct_ops_link_lops = {
 25 | 	.release = bpf_struct_ops_link_release,
 26 | 	.dealloc = bpf_struct_ops_link_dealloc,
 27 | };
 28 | ```
 29 | 
 30 | ```c
 31 | static void bpf_struct_ops_link_release(struct bpf_link *link)
 32 | {
 33 | }
 34 | 
 35 | //根据link free bpf_trampoline_link
 36 | static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
 37 | {
 38 | 	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
 39 | 
 40 | 	kfree(tlink);
 41 | }
 42 | ```
 43 | 
 44 | 
 45 | 
 46 | ### bpf_struct_ops 
 47 | 
 48 | BPF使用该数据结构来描述 struct_ops的内核挂载点，一系列的函数指针, 和数据结构
 49 | 
 50 | ```mermaid
 51 | classDiagram 
 52 | 	class bpf_struct_ops {
 53 | 		const struct bpf_verifier_ops *verifier_ops
 54 | 		init(struct btf *btf) int 
 55 | 		check_member(const struct btf_type *t, const struct btf_member *member) int 
 56 | 		init_member(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata) int 
 57 | 		reg(void *kdata) int 
 58 | 		unreg(void *kdata) void 
 59 | 		const struct btf_type *type 
 60 | 		const struct btf_type *value_type 
 61 | 		struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
 62 | 	}
 63 | 	
 64 | 	class btf_type {
 65 | 	
 66 | 	}
 67 | 	
 68 | 	class btf_member {
 69 | 	
 70 | 	}
 71 | 	class bpf_verifier_ops {
 72 | 	
 73 | 	}
 74 | 	class btf_func_model {
 75 | 	
 76 | 	}
 77 | 	bpf_struct_ops-->btf_type 
 78 | 	bpf_struct_ops..>btf_member 
 79 | 	bpf_struct_ops-->bpf_verifier_ops 
 80 | 	bpf_struct_ops-->btf_func_model 
 81 | ```
 82 | 
 83 | **struct btf_type type** 
 84 | 
 85 | 对应着内核结构体，例如 tcp_congestion_ops。 在 `bpf_struct_ops_init`中被设置 ： `type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT);` （以拥塞控制为例，st_ops->name的值为 : `tcp_congestion_ops`）
 86 | 
 87 | **struct btf_type value_type** 
 88 | 
 89 | 对应着 `bpf_struct_ops_##name`结构体，例如`struct bpf_struct_ops_tcp_congestion_ops` 。在 `bpf_struct_ops_init`中被设置 ： `value_id = btf_find_by_name_kind(btf, value_name, BTF_KIND_STRUCT);`
 90 | 
 91 | **init**
 92 | 
 93 | 负责初始化该struct_op的BTF信息，在`bpf_struct_ops_init`中被调用,调用路径
 94 | 
 95 | -> `bpf_struct_ops_init` 
 96 | 
 97 | ​	--> `st_ops->init(btf)`
 98 | 
 99 | **check_member** 
100 | 
101 | 在verifier中被调用。对于用eBPF实现的struct_op检查其成员是否合法
102 | 
103 | ->`check_struct_ops_btf_id`
104 | 
105 | ​	--> `st_ops->check_member(t, member);` 
106 | 
107 | **init_member**
108 | 
109 | 在`bpf_struct_ops_map_update_elem` 被调用, 主要负责： 1. 处理非函数指针字段(例如flags) 2. 对函数指针的字段进行验证。（例如验证是否都实现了必要的hook函数等）。
110 | 
111 | -> `bpf_struct_ops_map_update_elem`
112 | 
113 | ​	--> `err = st_ops->init_member(t, member, kdata, udata);` 
114 | 
115 | **reg**
116 | 
117 | 在`bpf_struct_ops_map_update_elem`被调用，负责将 eBPF实现的结构体 `kvalue.kdata`注册到对应的内核路径中。（以tcp_congestion_ops为例，调用 `tcp_register_congestion_control(kdata);`方法）
118 | 
119 | -> `bpf_struct_ops_map_update_elem`
120 | 
121 | ​	--> `err = st_ops->reg(kdata);`
122 | 
123 | ### bpf_verifier_ops
124 | 
125 | ```c
126 | struct bpf_verifier_ops {
127 | 	/* return eBPF function prototype for verification */
128 | 	const struct bpf_func_proto *
129 | 	(*get_func_proto)(enum bpf_func_id func_id,
130 | 			  const struct bpf_prog *prog);
131 | 
132 | 	/* return true if 'size' wide access at offset 'off' within bpf_context
133 | 	 * with 'type' (read or write) is allowed
134 | 	 */
135 | 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
136 | 				const struct bpf_prog *prog,
137 | 				struct bpf_insn_access_aux *info);
138 | 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
139 | 			    const struct bpf_prog *prog);
140 | 	int (*gen_ld_abs)(const struct bpf_insn *orig,
141 | 			  struct bpf_insn *insn_buf);
142 | 	u32 (*convert_ctx_access)(enum bpf_access_type type,
143 | 				  const struct bpf_insn *src,
144 | 				  struct bpf_insn *dst,
145 | 				  struct bpf_prog *prog, u32 *target_size);
146 | 	int (*btf_struct_access)(struct bpf_verifier_log *log,
147 | 				 const struct btf *btf,
148 | 				 const struct btf_type *t, int off, int size,
149 | 				 enum bpf_access_type atype,
150 | 				 u32 *next_btf_id, enum bpf_type_flag *flag);
151 | };
152 | ```
153 | 
154 | **get_func_proto**
155 | 
156 | 当eBPF程序调用eBPF帮助函数时，调用该函数返回帮助函数的函数原型（bpf_func_proto) 也判断特定的程序类型是否支持使用的eBPF帮助函数
157 | 
158 | -> `do_check`
159 | 
160 | ​	--> `check_helper_call` 
161 | 
162 | ​		---> `if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id, env->prog);`
163 | 
164 | **is_valid_access**
165 | 
166 | 当访问 `btf_ctx`变量的时候，执行的check (验证器验证阶段)
167 | 
168 | -> `do_check`
169 | 
170 | ​	--> `check_ctx_access`
171 | 
172 | ​		---> ` env->ops->is_valid_access(off, size, t, env->prog, &info)`
173 | 
174 | **btf_struct_access**
175 | 
176 | 在BPF验证器的验证阶段，当需要access内核数据结构时候调用。确保访问内核数据结构的安全性。(从代码上来看该函数有默认实现)
177 | 
178 | 调用链: 
179 | 
180 | -> `check_mem_access`
181 | 	--> ` if (base_type(reg->type) == PTR_TO_BTF_ID &&!type_may_be_null(reg->type)) `
182 | 
183 | ​		---> `check_ptr_to_btf_access`
184 | 
185 | ​			----> `if (env->ops->btf_struct_access) ret = env->ops->btf_struct_access;`   
186 | 
187 | ### bpf_struct_ops_map 
188 | 
189 | 该数据结构表示一个 STRUCT_OP map。它和 bpf_map的关系就类似，mptcp_sock 和 tcp_sock的关系。（可以简单理解为面向对象的继承）
190 | 
191 | ```mermaid
192 | classDiagram
193 | 	class bpf_struct_ops_map {
194 | 		struct bpf_map map
195 | 		struct rcu_head rcu
196 | 		const struct bpf_struct_ops *st_ops
197 | 		struct mutex lock
198 | 		struct bpf_link **links
199 | 		void *image 
200 | 		struct bpf_struct_ops_value *uvalue
201 | 		struct bpf_struct_ops_value kvalue
202 | 	}
203 | 	
204 | 	class bpf_struct_ops_value {
205 | 		refcount_t refcnt 
206 | 		bpf_struct_ops_state state 
207 | 		data[]
208 | 	}
209 | 	
210 | 	class bpf_link {
211 | 		atomic64_t refcnt
212 | 		u32 id
213 | 		const struct bpf_link_ops *ops
214 | 		struct bpf_prog *prog
215 | 		struct work_struct work
216 | 	}
217 | 	
218 | 	bpf_map <|-- bpf_struct_ops_map 
219 | 	bpf_struct_ops_map-->bpf_struct_ops
220 | 	bpf_struct_ops_map --> bpf_link
221 |     bpf_struct_ops_map --> bpf_struct_ops_value
222 |     bpf_struct_ops_map *-- bpf_struct_ops_value 
223 |     
224 |     bpf_link --> bpf_link_ops 
225 |     bpf_link --> bpf_prog
226 |     bpf_link *-- work_struct 
227 | 	
228 | ```
229 | 
230 | ```c 
231 | struct bpf_struct_ops_map {
232 | 	struct bpf_map map;
233 | 	struct rcu_head rcu;
234 | 	const struct bpf_struct_ops *st_ops;
235 | 	/* protect map_update */
236 | 	struct mutex lock;
237 | 	/* link has all the bpf_links that is populated
238 | 	 * to the func ptr of the kernel's struct
239 | 	 * (in kvalue.data).
240 | 	 */
241 | 	struct bpf_link **links;
242 | 	/* image is a page that has all the trampolines
243 | 	 * that stores the func args before calling the bpf_prog.
244 | 	 * A PAGE_SIZE "image" is enough to store all trampoline for
245 | 	 * "links[]".
246 | 	 */
247 | 	void *image;
248 | 	/* uvalue->data stores the kernel struct
249 | 	 * (e.g. tcp_congestion_ops) that is more useful
250 | 	 * to userspace than the kvalue.  For example,
251 | 	 * the bpf_prog's id is stored instead of the kernel
252 | 	 * address of a func ptr.
253 | 	 */
254 | 	struct bpf_struct_ops_value *uvalue;
255 | 	/* kvalue.data stores the actual kernel's struct
256 | 	 * (e.g. tcp_congestion_ops) that will be
257 | 	 * registered to the kernel subsystem.
258 | 	 */
259 | 	struct bpf_struct_ops_value kvalue;
260 | };
261 | 
262 | ```
263 | 
264 | **image** 
265 | 
266 | 所谓蹦床本质上是一段汇编代码，负责完成 context切换（save 参数）和调用loaded bpf_prog。struct_op 将 links里的所有link(对应于每一个需要挂载 eBPF 程序的函数指针成员)的蹦床都保存在 image 中。
267 | 
268 | 蹦床例子： 
269 | 
270 | ![image-20221115213420120](Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115213420120.png)
271 | 
272 | image保存了该struct_op所有link的蹦床。蹦床的地址，按照moff(成员的字节偏移量)保存在 kvalue.kdata里。
273 | 
274 | **struct bpf_struct_ops_value *uvalue;**
275 | 
276 | uvalue 保存`bpf_map_update`传入的value参数的内存内容,
277 | 
278 | uvalue.data 这一块内存，本质上保存的是一个结构体(和内核结构体对应)： 1. 对于结构体的非函数指针成员，例如flags，或者name等，按照BTF编码后对应内核数据结构的成员。 **2. 对于函数指针，保存的是BPF_PROG的fd**
279 | 
280 | **struct bpf_struct_ops_value kvalue;** 
281 | 
282 | kvalue.kdata 按照moff(成员的字节偏移量)保存了所有成员的link的蹦床地址。 
283 | 
284 | `*(void **)(kdata + moff) = image;`
285 | 
286 | kvalue.refcnt 是引用计数，被 `bpf_try_module_get` 使用，用来标识有多少内核代码使用了该模块(struct)
287 | 
288 | kvalue.data 这一块内存，本质上保存的是一个内核结构体(和内核结构体对应)： 1. 对于结构体的非函数指针成员，同一般的内核结构体。2. 对于函数指针，保存的是对应函数的 image地址(image入口)
289 | 
290 | ### struct bpf_struct_ops_value 
291 | 
292 | ```c
293 | struct bpf_struct_ops_value {
294 | 	BPF_STRUCT_OPS_COMMON_VALUE;
295 | 	char data[] ____cacheline_aligned_in_smp;
296 | };
297 | ```
298 | 
299 |  data保存的就是实际的内核数据结构 (比如tcp_congestion_ops),  以tcp_congestion_ops 为例，其对应数据结构为 : (由宏自动生成结构体定义，见编程技巧)
300 | 
301 | ```c
302 | struct bpf_struct_ops_tcp_congestion_ops {
303 |     BPF_STRUCT_OPS_COMMON_VALUE;
304 |     struct tcp_congestion_ops data ____cacheline_aligned_in_smp;
305 | };
306 | ```
307 | 
308 | ### struct bpf_tramp_link 
309 | 
310 | ```c
311 | struct bpf_tramp_link {
312 | 	struct bpf_link link;
313 | 	struct hlist_node tramp_hlist;
314 | 	u64 cookie;
315 | };
316 | ```
317 | 
318 | 额外增加了 cookie字段，以及hlist。估计用Hash表的方式将bpf_link组织起来
319 | 
320 | ```c
321 | struct bpf_tramp_links {
322 | 	struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
323 | 	int nr_links;
324 | };
325 | ```
326 | 
327 |  **cookie** 会被保存到 `bpf_tramp_run_ctx` 的bpf_cookie字段中
328 | 
329 | ```c
330 | struct bpf_tramp_run_ctx {
331 | 	struct bpf_run_ctx run_ctx;
332 | 	u64 bpf_cookie;
333 | 	struct bpf_run_ctx *saved_run_ctx;
334 | };
335 | ```
336 | 
337 | ## 函数逻辑
338 | 
339 | ### bpf_struct_ops_init
340 | 
341 | **(推测)将所有struct_op对应的结构体(bpf_struct_ops)（例如 tcp_congestion_ops,注册到内核BTF中）**
342 | 
343 | `void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) `
344 | 
345 | 该函数负责初始化内部的STRUCT_OP相关的BTF信息
346 | 
347 | 由函数`btf_parse_vmlinux`->`bpf_struct_ops_init` 调用
348 | 
349 | ### bpf_struct_ops_map_alloc_check
350 | 
351 | 实现很简单
352 | 
353 | ```c 
354 | static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
355 | {
356 | 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
357 | 	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
358 | 		return -EINVAL;
359 | 	return 0;
360 | }
361 | ```
362 | 
363 | 1. key必须是4 个字节 (int)
364 | 2. max_entries 必须为1 （保存要加载的 bpf_struct 的 fd , 例如 bpf_tcp_congestion_ops) 
365 | 3. 不能携带任何 map_flags 
366 | 4. 必须携带 btf_vmlinux_value_type_id （对应的内核数据结构) 
367 | 
368 | ps : 这里我感觉部分的check，和map_create里的check重复了
369 | 
370 | ### bpf_struct_ops_map_alloc
371 | 
372 | `static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)`
373 | 
374 | 给struct_ops类型的map分配内存并初始化 
375 | 
376 | **调用逻辑**
377 | 
378 | -> `const struct bpf_struct_ops *st_ops;` 
379 | 
380 | -> `struct bpf_struct_ops_map *st_map;` 
381 | 
382 | -> `	const struct btf_type *t, *vt;`
383 | 
384 | -> `struct bpf_map *map;` 
385 | 
386 | -> `st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);`
387 | 
388 | ​	--> `for each item in bpf_struct_ops`  STRUCT_OP相关的内核数据结构 **保存在全局数据 bpf_struct_ops中重要的hook点** 
389 | 
390 | ​		---> `if (bpf_struct_ops[i]->value_id == value_id)  return bpf_struct_ops[i];`  `btf_vmlinux_value_type_id` 对应着特定的 `BPF_STRUCT_OPS_TYPE_name` 枚举值，对应着特定的 `struct bpf_name` ，例如name 为 tcp_congestion_ops， 那么枚举变量为`BPF_STRUCT_OPS_TYPE_tcp_congestion_ops`  结构体为 `struct bpf_tcp_congestion_ops` 
391 | 
392 | -> `vt = st_ops->value_type;`   对应着 `bpf_struct_ops_##name`结构体，例如`struct bpf_struct_ops_tcp_congestion_ops` 。在 `bpf_struct_ops_init`中被设置 ： `value_id = btf_find_by_name_kind(btf, value_name, BTF_KIND_STRUCT);`
393 | 
394 | ->`t = st_ops->type`  对应着内核结构体，例如 tcp_congestion_ops。 在 `bpf_struct_ops_init`中被设置 ： `type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT);` （以拥塞控制为例，st_ops->name的值为 : `tcp_congestion_ops`
395 | 
396 | -> `st_map_size = sizeof(*st_map) + (vt->size - sizeof(struct bpf_struct_ops_value)); `   `bpf_struct_ops_value` 有一些公用的数据结构，比如 refcnt, 这里的 - 操作就是为了把公共部分去掉。实际上是计算 `st_map->kvalue->data`的空间 （保存对应的内核数据结构，例如 tcp_congestion_ops)
397 | 
398 | -> `st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);`   **以后如果自己增加新的MAP类型，可以直接调用这个函数来分配内存**
399 | 
400 | ​	--> `__bpf_map_area_alloc(size, numa_node, false);`    ps : false 指的是 mmapable = false 
401 | 
402 | ​		---> `kmalloc_node or __vmalloc_node_range`  如果所需要的内存空间太大的话，会使用 __vmalloc_node_range
403 | 
404 | -> `st_map->st_ops = st_ops; map = &st_map->map;`
405 | 
406 | -> `st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);`  为 uvalue分配一块内存 **??这块内存的作用**
407 | 
408 | ->`st_map->links = bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE);`  这个地方分配的大小验证了我之前对type含义的推测。type->vlen(type结构体的member数量) 代表着暴露接口的数量， 每一个接口用一个 `bpf_link`表示（即将一段eBPF PROG 挂载到该暴露的接口下，因为bpf_link就是一种对挂载的抽象) ，因此这里要分配的空间大小为 `btf_type_vlen(t) * sizeof(struct bpf_links *)` 
409 | 
410 | ->`st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);`  **根据注释，image保存bpf_link的trampolines**
411 | 
412 | ​	--> `module_alloc(size);` (体系结构相关) 经过查阅资料，module_alloc在module_init中调用。为内核模块分配内存，这也说明了，struct_ops实际上就可以看成一个内核模块。
413 | 
414 | ​		---> `__vmalloc_node_range`  with vm_flags = `VM_FLUSH_RESET_PERMS` , prot = `PAGE_KERNEL_EXEC` (底层硬件相关) 
415 | 
416 | -> `mutex_init(&st_map->lock);`  protect map_update 
417 | 
418 | -> `set_vm_flush_reset_perms(st_map->image);` 设置 image对应 vm_struct `VM_FLUSH_RESET_PERMS` 
419 | 
420 | -> `bpf_map_init_from_attr(map, attr);`  根据attr 初始化其它的通用 map属性
421 | 
422 | ​	--> `map->map_type; map->key_size; map->value_size; map->max_entries; map->map_flags; map->numa_node; map->map_extra;   `
423 | 
424 | -> `return map `
425 | 
426 | ### bpf_struct_ops_map_update_elem
427 | 
428 | `static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags);`
429 | 
430 | **函数逻辑** 
431 | 
432 | -> 初始化变量
433 | 
434 | ​	--> `struct bpf_struct_ops_value *uvalue, *kvalue;`
435 | 
436 | ​	--> `const struct btf_member *member;`
437 | 
438 | ​	--> `const struct btf_type *t  = st_ops->type;`
439 | 
440 | ​	--> `struct bpf_tramp_links *tlinks = NULL;` 用于 `bpf_struct_ops_prepare_trampoline` 函数的临时变量
441 | 
442 | ​	-->`void *image, *image_end`
443 | 
444 | -> `if (flags) return -EINVAL`
445 | 
446 | -> `err = check_zero_holes(st_ops->value_type, value); `  该函数简单来说检查 value 内存区域有没有未初始化bit。**根据`value_type`**里的`btf_member`定义(offset 和 size)。用户传进来的value对应着，`bpf_struct_ops_tcp_congestion_ops` (和value_type相对应)
447 | 
448 | -> `uvalue = value; err = check_zero_holes(t, uvalue->data);` 该函数简单来说检查 value 内存区域有没有未初始化bit。**根据`st_ops->type`**里的`btf_member`定义(offset 和 size)。
449 | 
450 | -> `if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL;`  被update的区域需要是未初始化区域。
451 | 
452 | -> `tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); ` 分配内存
453 | 
454 | -> `uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;` 
455 | 
456 | -> `kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;`  kvalue直接保存在 st_map中。
457 | 
458 | -> `	mutex_lock(&st_map->lock);` lock **位于st_map里的锁**（不是放在bpf_map里的锁) 
459 | 
460 | -> `memcpy(uvalue, value, map->value_size);`  uvalue 保存`bpf_map_update`传入的value值
461 | 
462 | -> `udata = &uvalue->data; kdata = &kvalue->data;`
463 | 
464 | -> `for_each_member(i, st_ops->type, member)`
465 | 
466 | ​	--> `const struct btf_type *mtype, *ptype;`
467 | 
468 | ​	--> `struct bpf_prog *prog; struct bpf_tramp_link *link;`
469 | 
470 | ​	--> `moff = __btf_member_bit_offset(t, member) / 8;`  获取 struct member 距离struct 的 字节偏移量
471 | 
472 | ​	--> `ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);`   判断用户态传入的member是指针类型。根据`struct_op`的文档，用户态需要把BPF_PROG的prog_fd写入 map的value中。**这里的ptype就对应着pfd**。 实际上，struct_ops是一系列函数指针的集合，函数指针的btf类型也应该是 ptr。
473 | 
474 | ​	--> `if (ptype == module_type) ` ...**.?? module_type暂时放一下**，没有看到具体的用法
475 | 
476 | ​	--> `err = st_ops->init_member(t, member, kdata, udata);`  **重要hook点,调用具体struct_ops的init_member方法**。主要负责： 1. 处理非函数指针字段(例如flags) 2. 对函数指针的字段进行验证。（例如验证是否都实现了必要的hook函数等）。
477 | 
478 | ​	--> `if (err > 0) continue;` The ->init_member() has handled this member. 例如handle name, 或者是handle flags（以tcp拥塞控制为例) 。**eBPF的struct_ops是允许暴露部分成员变量的**，例如flags, name等。对于这种类型的 member应该有具体的struct op的 init_member来处理。
479 | 
480 | ​	--> `if (!ptype || !btf_type_is_func_proto(ptype)) `  /* All non func ptr member must be 0 */
481 | 
482 | ​		---> `if (memchr_inv(udata + moff, 0, msize)) `  err = -EINVAL;
483 | 
484 | ​	--> `prog_fd = (int)(*(unsigned long *)(udata + moff));` 根据`struct_op`的文档，用户态需要把BPF_PROG的prog_fd写入 map的value中。
485 | 
486 | ​	--> `if (!prog_fd) continue;`  对于不设置的函数 prog_fd设置为0即可。
487 | 
488 | ​	--> `prog = bpf_prog_get(prog_fd);` 根据prog fd获取`bpf_prog`
489 | 
490 | ​	--> `if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops->type_id || prog->expected_attach_type != i) err = -EINVAL;` 说明了STRUCT_OP类型的prog, `prog->aux->attach_btf_id`和` prog->expected_attach_type`的含义。
491 | 
492 | ​	--> `link = kzalloc(sizeof(*link), GFP_USER);` 给link分配空间 `struct bpf_tramp_link *link`
493 | 
494 | ​	--> `bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog);`  初始化类型为 `BPF_LINK_TYPE_STRUCT_OPS`的bpf_link
495 | 
496 | ​	--> `st_map->links[i] = &link->link;`  st_map中保存的是`bpf_link`而非`bpf_tramp_link` 但其实也能够根据link地址拿到`bpf_tramp_link`的地址
497 | 
498 | ​	--> `err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], image, image_end);` 
499 | 
500 | ​		---> `tlinks[BPF_TRAMP_FENTRY].links[0] = link;`
501 | 
502 | ​		---> `tlinks[BPF_TRAMP_FENTRY].nr_links = 1;`
503 | 
504 | ​		---> `flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;`
505 | 
506 | ​		---> `return arch_prepare_bpf_trampoline(NULL, image, image_end, model, flags, tlinks, NULL);`  体系结构相关，**生成该bpf_link的蹦床(本质上是一段汇编代码，实现context save和调用JIT的eBPF程序)**, 生成的蹦床最终保存在 `st_map->image`中。（详情见 bpf_link中该函数的笔记）
507 | 
508 | ​	--> `*(void **)(kdata + moff) = image;`   **kdata保存着每一个prog(struct_op对应的函数的bpf_prog)的蹦床在`st_map->image`中的地址。** 
509 | 
510 | ​	--> `image += err;`  `bpf_struct_ops_prepare_trampoline`返回的该bpf_link的蹦床的大小（汇编代码的字节数) 
511 | 
512 | -> `refcount_set(&kvalue->refcnt, 1);`  设置 `st_map.kvalue`的引用计数
513 | 
514 | -> `bpf_map_inc(map);` **增加bpf_map的引用计数** `atomic64_inc(&map->refcnt);`
515 | 
516 | -> `set_memory_ro((long)st_map->image, 1); set_memory_x((long)st_map->image, 1);` 将 image 所在的页设置为只读且可执行
517 | 
518 | -> `err = st_ops->reg(kdata);` **重要Hook点，调用特定struct_op的reg方法**
519 | 
520 | -> `	smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);`
521 | 
522 | -> `kfree(tlinks);`
523 | 
524 | -> `mutex_unlock(&st_map->lock);`
525 | 
526 | ### (推测)注册 struct_op中可以使用的内核函数
527 | 
528 | 调用函数 ` register_btf_kfunc_id_set` 
529 | 
530 | ```c 
531 | BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
532 | BTF_ID(func, tcp_reno_ssthresh)
533 | BTF_ID(func, tcp_reno_cong_avoid)
534 | BTF_ID(func, tcp_reno_undo_cwnd)
535 | BTF_ID(func, tcp_slow_start)
536 | BTF_ID(func, tcp_cong_avoid_ai)
537 | BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
538 | static int __init bpf_tcp_ca_kfunc_init(void)
539 | {
540 | 	return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
541 | }
542 | ```
543 | 
544 | ## How to add a new struct_op hook ? 
545 | 
546 | 1. 设计 自己需要暴露的 模块接口 `struct my_module`
547 | 
548 | 2. 实现验证器操作 `struct verifier_ops`
549 | 
550 |    * `get_func_proto` 可以使用的帮助函数
551 |    * `is_valid_access` 访问 ctx 的验证操作
552 |    * `btf_struct_access` 访问具体的 指针的验证操作 
553 | 
554 | 3. 实现 `struct bpf_struct_ops`
555 | 
556 |    * 实现 `init`方法
557 |    * 实现 `reg`方法
558 |    * 实现`unreg`方法
559 |    * 实现`check_member`方法
560 |    * 实现`init_member`方法
561 |    * **设置name的值为 "$my_module"** (填上my_module的名字)
562 |    * 实现 verifier_ops 
563 | 
564 |    
565 | 
566 | 4. 设置允许调用的kern function 
567 | 
568 |    eg : `register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);` 
569 | 
570 |    ```c
571 |    BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
572 |    BTF_ID(func, tcp_reno_ssthresh)
573 |    BTF_ID(func, tcp_reno_cong_avoid)
574 |    BTF_ID(func, tcp_reno_undo_cwnd)
575 |    BTF_ID(func, tcp_slow_start)
576 |    BTF_ID(func, tcp_cong_avoid_ai)
577 |    BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
578 |    
579 |    static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
580 |    	.owner     = THIS_MODULE,
581 |    	.check_set = &bpf_tcp_ca_check_kfunc_ids,
582 |    };
583 |    ```
584 | 
585 |    
586 | 
587 | ## 编程技巧
588 | 
589 | ### 利用宏自动生成 struct 定义
590 | 
591 | ![image-20221110192516048](Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221110192516048.png)
592 | 
593 | ### 使用空结构体标记地址,配合 container_of使用
594 | 
595 | ![image-20221115212335463](Map_Ops_BPF_MAP_TYPE_STRUCT_OP.assets/image-20221115212335463.png)
596 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108110550651.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108110550651.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108112647370.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108112647370.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108114101804.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108114101804.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108165434824.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108165434824.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108170547309.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221108170547309.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109153041179.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109153041179.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109154550592.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109154550592.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109160126858.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109160126858.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109160314579.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.assets/image-20221109160314579.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_map_kernel/eBPF_map_kernel.md:
--------------------------------------------------------------------------------
  1 | # eBPF MAP Kernel 
  2 | 
  3 | ## 数据结构
  4 | 
  5 | ### bpfptr_t 
  6 | 
  7 | ```c
  8 | typedef sockptr_t bpfptr_t;
  9 | ```
 10 | 
 11 | ![image-20221109154550592](eBPF_map_kernel.assets/image-20221109154550592.png)
 12 | 
 13 | 用一个结构体来统一表示内核态指针和用户态指针。因为这些指针经常涉及内核态和用户态的数据交换。
 14 | 
 15 | ### bpf_map 
 16 | 
 17 | `struct bpf_map` 是BPF MAP的核心数据结构，定义在 ./include/linux/bpf.h 中
 18 | 
 19 | ```mermaid
 20 | classDiagram 
 21 | 	class bpf_map {
 22 | 		bpf_map_ops *ops
 23 | 		bpf_map *inner_map_meta
 24 | 		bpf_map_value_off *kptr_off_tab
 25 | 		btf *btf
 26 | 		mem_cgroup *memcg 
 27 | 		bpf_map_off_arr *off_arr
 28 | 		work_struct work 
 29 | 		owner 
 30 | 	}
 31 | 	class bpf_map_ops {
 32 | 	}
 33 | 	
 34 | 	class owner {
 35 | 		spinlock_t lock 
 36 | 		bpf_prog_type type 
 37 | 		bool jited 
 38 | 		bool xdp_has_frags
 39 | 	}
 40 | 	
 41 | 	bpf_map-->bpf_map_ops
 42 | 	bpf_map--bpf_map
 43 | 	bpf_map-->bpf_map_value_off 
 44 | 	bpf_map-->btf
 45 | 	bpf_map-->bpf_map_off_arr
 46 | 	bpf_map-->work_struct 
 47 | 	bpf_map-->owner 
 48 | ```
 49 | 
 50 | **u32 spin_lock_off** 
 51 | 
 52 | `bpf_spin_lock`在value结构体中的偏移量(以字节为单位)， **从这个字段也可以看出，value里只能有一个 bpf_spin_lock**
 53 | 
 54 | **struct bpf_map_off_arr *off_arr;** 
 55 | 
 56 | ![image-20221108165434824](eBPF_map_kernel.assets/image-20221108165434824.png)
 57 | 
 58 | 这个结构体里保存着 value 中特殊成员 (目前有，`bpf_spin_lock` , `bpf_timer` ,`bpf_kptr`（可以有多个） )的 offset(相对于 value struct的字节偏移量) 和 size(大小) 信息，并按照 offset 进行排序（field_off 数组和 field_sz 数组）。特殊成员由枚举类型标识 : 
 59 | 
 60 | **struct bpf_map_value_off *kptr_off_tab;** 
 61 | 
 62 | ![image-20221108170547309](eBPF_map_kernel.assets/image-20221108170547309.png)
 63 | 
 64 | 目前map的value好像允许存放 kernel function（module or ..) 的指针了？ 可以存放多个，也就是说很多 kernel模块可以通过 eBPF暴露出去。（很奇怪的一点是，struct_op 要求不存在 value_id, 那这个特性的作用是什么？） 这个成员变量保存着 bpf_kptr 的描述信息。 
 65 | 
 66 | **atomic64_t writecnt;** 
 67 | 
 68 | 写计数器
 69 | 
 70 | 在 `bpf_update_map` 中 `err = bpf_map_new_fd(map, f_flags);`
 71 | 
 72 | **u32 btf_vmlinux_value_type_id;**
 73 | 
 74 | 通过btf_id找到对应的 内核数据结构`struct bpf_struct_ops`(STRUCT_OP)。保存在 btf.ids section 里
 75 | 
 76 | **struct owner** 
 77 | 
 78 | 这个数据结构应该是用来表示使用了这个map的prog的。(猜测)
 79 | 
 80 | ### bpf_map_ops 
 81 | 
 82 | `struct bpf_map_ops` 是一组抽象的接口，抽象了不同MAP所需要的共有的方法。对于BPF_MAP来说这是很重要的hook点。`bpf_map_ops`里的方法包括，`bpf_map_update, bpf_map_delete` 等。
 83 | 
 84 | **map_meta_equal** 
 85 | 
 86 | 用于map_of_map类型的inner map(hash of map ,array of array) 必须实现的方法， 比较插入的map类型是否匹配，调用路径
 87 | 
 88 | -> `bpf_map_update_elem`
 89 | 
 90 | --> `map_fd_get_ptr(bpf_map_fd_get_ptr)`
 91 | 
 92 | ---> `map->ops->map_meta_equal`
 93 | 
 94 | **map_alloc_check**
 95 | 
 96 | 分配内存前检查map参数
 97 | 
 98 | ->`map_create`
 99 | 
100 | -->`find_and_alloc_map`
101 | 
102 | --->`err = map->ops->map_alloc_check` 
103 | 
104 | **map_alloc**
105 | 
106 | 分配map内存
107 | 
108 | ->`map_create`
109 | 
110 | -->`find_and_alloc_map`
111 | 
112 | --->`map = ops->map_alloc`
113 | 
114 | **map_free** 
115 | 
116 | 释放map的内存
117 | 
118 | ->`bpf_map_put`
119 | 
120 | -->`bpf_map_free_deferred` (workqueue)
121 | 
122 | --->`map->ops->map_free` 
123 | 
124 | **map_get_next_key**
125 | 
126 | 用于遍历map 
127 | 
128 | ->`map_get_next_key`
129 | 
130 | -->`map->ops->map_get_next_key`
131 | 
132 | **map_release_uref** 
133 | 
134 | 一般在该函数里处理还未触发的定时器 (release user reference)
135 | 
136 | ->`bpf_map_put_with_uref` 
137 | 
138 | ​	-->`bpf_map_put_uref`
139 | 
140 | ​		--> `map->ops->map_release_uref(map);`
141 | 
142 | **map_lookup_elem** 
143 | 
144 | 具体map lookup的实现
145 | 
146 | **map_update_elem** 
147 | 
148 | 具体map的update方法的实现
149 | 
150 | **map_delete_elem** 
151 | 
152 | 具体map的delete方法的实现
153 | 
154 | **map_gen_lookup** 
155 | 
156 | **用一系列eBPF指令代替对 lookup helper 函数的调用(例如array可以简单获取地址), 在一定程度上应该会更高效**
157 | 
158 | -> `bpf_check` 
159 | 
160 | --> `do_misc_fixups`
161 | 
162 | ---> `map->ops->map_gen_lookup`
163 | 
164 | **map_direct_value_addr** 
165 | 
166 | 对于只读的map(如何赋初值) 例如只有一个元素且只读的ARRAY(当成全局变量使用), 通过该函数优化直接返回地址，在验证阶段替换指令
167 | 
168 | -> `bpf_check` 
169 | 
170 | ​	--> `check_mem_access` 
171 | 
172 | ​		---> `bpf_map_direct_read` 
173 | 
174 | ​			---->`map->ops->map_direct_value_addr` 
175 | 
176 | **map_direct_value_meta** 
177 | 
178 | **??从调用路径上来看，该函数是当 prog dump到用户态时调用获取 map fd, 以array为例将地址转化为 off**
179 | 
180 | ->` bpf_obj_get_info_by_fd`
181 | 
182 | ​	--> `bpf_prog_get_info_by_fd` 
183 | 
184 | ​		--->`bpf_insn_prepare_dump` 
185 | 
186 | ​			----> `map = bpf_map_from_imm(prog, imm, &off, &type);` 
187 | 
188 | ​				----->`map->ops->map_direct_value_meta(map, addr, off)` 
189 | 
190 | **map_mmap** 
191 | 
192 | **对BPF map使用mmap需要调用的函数 ??需要熟悉mmap的原理和用法(需要配置 BPF_F_MMAPABLE)**
193 | 
194 | -> `bpf_map_mmap` bpf_map_fops
195 | 
196 | --> `map->ops->map_mmap` 
197 | 
198 | **map_seq_show_elem** 
199 | 
200 | 猜测和打印相关(seq_file, proc)
201 | 
202 | -> `map_seq_show` 
203 | 
204 | ​	--> `map->ops->map_seq_show_elem` 
205 | 
206 | **map_check_btf** 
207 | 
208 | 创建map之前检查map key和value的btf
209 | 
210 | ->`map_create`
211 | 
212 | --> `map_check_btf`
213 | 
214 | **map_lookup_batch** 
215 | 
216 | map批量查找的具体实现
217 | 
218 | **map_update_batch** 
219 | 
220 | map批量更新的具体实现	
221 | 
222 | **map_set_for_each_callback_args** 
223 | 
224 | 对于 `bpf_map_for_each_elem` 帮助函数 设置回调函数的参数
225 | 
226 | -> `check_helper_call` 
227 | 
228 | ​	--> `set_map_elem_callback_state` 
229 | 
230 | ​		---> `__check_func_call(.., map_set_for_each_callback_args) ` 
231 | 
232 | **bpf_for_each_array_elem** 
233 | 
234 | map遍历的具体实现
235 | 
236 | **map_btf_id** 
237 | 
238 | 该map的btf_id
239 | 
240 | .map_btf_id = &array_map_btf_ids[0],
241 | 
242 | **iter_seq_info** 
243 | 
244 | **??bpf迭代器相关，之后再看，需要阅读kernel seq的文档**
245 | 
246 | ## 代码逻辑
247 | 
248 | ### map_create 
249 | 
250 | 用户态通过eBPF系统调用 创建eBPF map 
251 | 
252 | **函数参数** 
253 | 
254 | ![image-20221108114101804](eBPF_map_kernel.assets/image-20221108114101804.png)
255 | 
256 | **调用逻辑** (一般情况下只看成功的部分，错误处理暂时跳过)
257 | 
258 | -> `int numa_node = bpf_map_attr_numa_node(attr);`   判断是否需要把map创建在特定的numa_node上，默认是 NUMA_NO_NODE
259 | 
260 | -> `struct bpf_map *map` 
261 | 
262 | -> `attr`参数检查。检查attr的属性和BPF_MAP_TYPE是否匹配，检查flag是否合法，numa_node是否合法。例如 如果提供了`attr->btf_vmlinux_value_type_id` ，那么type必须是 `BPF_MAP_TYPE_STRUCT_OPS` 
263 | 
264 | ->`map = find_and_alloc_map(attr)`  **根据 BPF_MAP_TYPE 查找并初始化MAP** 
265 | 
266 | ​	--> `const struct bpf_map_ops *ops`  不同类型的map，ops的函数接口一样，但是实现不一样，**是很重要的hook点**
267 | 
268 | ​	--> `type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));` 
269 | 
270 | ​	--> `ops = bpf_map_types[type];`  **根据type查找对应map的ops, 也就是hook op的地方。**bpf_map_types是一个全局数组定义在 <linux/bpf_types.h> 中，如果要增加新的map或者修改已有的map,需要修改该头文件。
271 | 
272 | ​	--> `if (ops->map_alloc_check) err = ops->map_alloc_check(attr)`  在alloc前进行检查
273 | 
274 | ​	--> `if (attr->map_ifindex) ops = &bpf_map_offload_ops`  **判断是否需要将map卸载到硬件上**
275 | 
276 | ​	--> `map = ops->map_alloc(attr)` 分配并初始化map。具体的初始化方法和具体的map相关 **重要hook点**
277 | 
278 | ​	-->`map->ops = ops; map->map_type = type`  **挂载ops**
279 | 
280 | -> `bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name));` 
281 | 
282 | -> **初始化同步信息** (几个同步字段的作用需要进一步研究)
283 | 
284 | ​	--> `atomic64_set(&map->refcnt, 1);` 猜测和map的生命周期有关
285 | ​	--> `atomic64_set(&map->usercnt, 1);`
286 | ​	--> `mutex_init(&map->freeze_mutex);`
287 | ​	--> `spin_lock_init(&map->owner.lock);`
288 | ​	--> `map->spin_lock_off = -EINVAL;` **`bpf_spin_lock`在value结构体中的偏移量(以字节为单位)**。 在下面的 `map_check_btf`中进行设置, **这个字段和 bpf_spin_lock 相关(eBPF并发控制)**
289 | ​	--> `map->timer_off = -EINVAL;`
290 | 
291 | -> 设置`bpf_map`中的BTF信息  `if (attr->btf_key_type_id || attr->btf_value_type_id || attr->btf_vmlinux_value_type_id)`
292 | 
293 | ​	--> `struct btf *btf` 
294 | 
295 | ​	--> `btf = btf_get_by_fd(attr->btf_fd); map->btf = btf` 获取加载的 btf信息。（在加载 map之前要先加载整个.o文件的BTF） 
296 | 
297 | ​	--> `attr->btf_value_type_id ? `  `err = map_check_btf(map, btf, attr->btf_key_type_id,  attr->btf_value_type_id);` 检查bpf map key 和 value的btf信息。 **只有携带value信息的map才需要进行 map_check_btf**
298 | 
299 | ​		---> `const struct btf_type *key_type, *value_type;`     
300 | 
301 | ​		---> `u32 key_size, value_size;` 
302 | 
303 | ​		---> `btf_key_id ? key_type = btf_type_id_size(btf, &btf_key_id, &key_size);   `  在btf相关的代码中， `size_type`的含义是使用size属性的 kind, 例如 INT, STRUCT等。`btf_type_id_size`这个函数的含义指的是，解析携带size属性的btf_type（显然key,value 都需要有size)。**同时该函数也会去掉const等修饰符，直接获取数据类型表征的BTF**
304 | 
305 | ​		---> `!btf_key_id?` 部分maps要求key不指定btf_id, 在这种情况下，要求 `map->ops->map_check_btf != NULL`  (由map自己再做一次类型检查？)
306 | 
307 | ​		---> `value_type = btf_type_id_size(btf, &btf_value_id, &value_size);` 解析value 的 size 类型的 `btf_type `
308 | 
309 | ​		---> `map->spin_lock_off = btf_find_spin_lock(btf, value_type)`  查找spin_lock在value结构体中的字节偏移量, 
310 | 
311 | ​		---> `if (map_value_has_spin_lock(map)) `  如果 value中含有 `bpf_spin_lock` 。检查该spin_lock是否是有效的，只有特定的MAP类型(HASH, ARRAY, CGROUP_STORAGE 等)才允许设置 spin_lock。 检查spin_lock是否在范围内
312 | 
313 | ​		---> `map->timer_off = btf_find_timer(btf, value_type);`  查找timer 在value结构体中的字节偏移量 **暂时跳过，后续需要再找一下timer的用例**
314 | 
315 | ​		---> `map->kptr_off_tab = btf_parse_kptrs(btf, value_type);`  类似 timer 和 spin_lock。**（现在可以在eBPF MAP的value里放kptr ? )**
316 | 
317 | ​		---> `map->ops->map_check_btf ? ret = map->ops->map_check_btf(map, btf, key_type, value_type);`  通用校验结束，执行特殊校验(和map类型有关，挂载在ops类型中)
318 | 
319 | ​	--> `map->btf_key_type_id = attr->btf_key_type_id;` 
320 | 
321 | ​	--> `map->btf_value_type_id = attr->btf_value_type_id;` 
322 | 
323 | ​	--> `map->btf_vmlinux_value_type_id = attr->btf_vmlinux_value_type_id;`
324 | 
325 | -> `err = bpf_map_alloc_off_arr(map);` `bpf_spin_lock; bpf_timer; bpf_kptr` 的信息都保存在 off_arr 成员变量中。调用该方法保存 offset 和 size信息，并按照offset进行排序。
326 | 
327 | -> `err = security_bpf_map_alloc(map);`  create map 里的 LSM钩子
328 | 
329 | -> `err = bpf_map_alloc_id(map);` 为 bpf_map分配id 
330 | 
331 | -> `	bpf_map_save_memcg(map);`  cgroup相关，暂时跳过，**再看一下cgroup相关的文档**。猜测是记录cgroup分配的内存信息。
332 | 
333 | -> `err = bpf_map_new_fd(map, f_flags);`  分配 fd 
334 | 
335 | ### map_update_elem 
336 | 
337 | 用户态通过 update 系统调用 更新 bpf map 
338 | 
339 | 系统调用参数
340 | 
341 | ![image-20221109153041179](eBPF_map_kernel.assets/image-20221109153041179.png)
342 | 
343 | **调用逻辑** 
344 | 
345 | -> `bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);`  内核态指针和用户态指针不同。用 `struct bpfptr_t` 来统一表示。
346 | 
347 | -> `bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);`
348 | 
349 | -> `struct bpf_map *map; void *key, *value; struct fd f`
350 | 
351 | -> `f = fdget(ufd)` 通过 fd 获取 struct fd. 
352 | 
353 | -> `map = __bpf_map_get(f);`  获取 ufd 对应的 bpf_map; 实际上是 
354 | 
355 | ​	--> `f.file->private_data;` 
356 | 
357 | -> `bpf_map_write_active_inc(map);`   `map->writecnt` 原子增 1 
358 | 
359 | -> `!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) ? err` 检查写权限
360 | 
361 | -> `(attr->flags & BPF_F_LOCK) && !map_value_has_spin_lock(map) ? err`  如果 update 的 flag 里有 `BPF_F_LOCK` 那么value里必须包含自旋锁，通过在 map_create中解析得到的offset进行判断
362 | 
363 | -> `	key = ___bpf_copy_key(ukey, map->key_size);`  分配内核空间， 将key从用户态拷贝到内核态
364 | 
365 | ​	--> `kvmemdup_bpfptr(ukey, key_size);` 
366 | 
367 | ​		---> `void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);` 
368 | 
369 | ​		---> `copy_from_bpfptr(p, src, len)` 
370 | 
371 | -> `value_size = bpf_map_value_size(map);`  计算map的value_size。**特别注意的是，对于per cpu类型其size为 map->value_size * cpu_num** 这也验证了，用户态对 percpu的map类型调用 lookup ，返回的是所有 CPU的value组合而成的数组（调用 update 时同样需要传入这样的数组）。**需要在用户态做进一步的聚合**
372 | 
373 | ​	--> for per cpu `return round_up(map->value_size, 8) * num_possible_cpus();` 
374 | 
375 | ​	--> for fd array (array of map,  prog array, hash of maps) ` return sizeof(u32);`
376 | 
377 | ​	--> 其它 `map->value_size;`
378 | 
379 | -> `value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);`  分配内存
380 | 
381 | -> `copy_from_bpfptr(value, uvalue, value_size) != 0` 将需要 update 的 value 从用户态拷贝到内核态
382 | 
383 | ->`rcu_read_lock();` 
384 | 
385 | -> `err = bpf_map_update_value(map, f, key, value, attr->flags);` 调用特定map的update函数。 （目前到这里都没有涉及并发控制) ,  writecnt 这个原子变量的用途也暂且不明。
386 | 
387 | -> `rcu_read_unlock();` 
388 | 
389 | ## 如何实现一个新的BPF_MAP ? 
390 | 
391 | 1. 实现 bpf_map_ops
392 |    * map_alloc_check
393 |    * map_alloc
394 |    * map_free
395 |    * map_update
396 |    * map_lookup
397 |    * (map_delete)
398 |    * map_check_btf
399 |    * map_btf_id 
400 |    * map_seq_show_elem? 
401 | 2. 修改bpf工具，例如libbpf和bpftool
402 | 
403 | ## 编程技巧
404 | 
405 | ### 缓存友好代码
406 | 
407 | ![image-20221108110550651](eBPF_map_kernel.assets/image-20221108110550651.png)
408 | 
409 | 1. 把访问频率高的，只读数据尽量放在一起（放在一个cache line中） 提高访问效率
410 | 
411 | ![image-20221108112647370](eBPF_map_kernel.assets/image-20221108112647370.png)
412 | 
413 | 2. 一些写频率不一致的变量，拉开存储间隔，避免cache false sharing。(关于false sharing的文章见 doc里 linux cache)
414 | 
415 | ### kernel 内判断指针地址是否合法
416 | 
417 | ![image-20221109160126858](eBPF_map_kernel.assets/image-20221109160126858.png)
418 | 
419 | 这里主要使用 IS_ERR 宏。 该宏会判断地址是否超出特定的地址边界 
420 | 
421 | ![image-20221109160314579](eBPF_map_kernel.assets/image-20221109160314579.png)


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.assets/image-20221102213705213.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.assets/image-20221102213705213.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.assets/image-20221103161824697.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.assets/image-20221103161824697.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.assets/image-20221103163709300.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.assets/image-20221103163709300.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.md:
--------------------------------------------------------------------------------
  1 | # eBPF_prog_kernel 
  2 | 
  3 | ## 数据结构
  4 | 
  5 | ### **bpf_prog**
  6 | 
  7 | 参数解释， attach_btf, dst_prog 应该表示被attach的内核函数/模块的btf_id, 或者被attach的eBPF程序。因为从` bpf_prog_load_check_attach`函数来看，只有以下几类函数可以设置这两个值
  8 | 
  9 | ![image-20221102213705213](eBPF_prog_kernel.assets/image-20221102213705213.png)
 10 | 
 11 | **expected_attach_type** 
 12 | 
 13 | 不同的prog类型含义不同，比如对于socket_op来说，attach_type代表不同的 socket事件
 14 | 
 15 | 对于struct_op类型来说，attach_type代表被attach的内核模块(struct)下对应的函数(btf_member)的id。
 16 | 
 17 | ### bpf_prog_aux 
 18 | 
 19 | **jited_linfo 和 linfo** 
 20 | 
 21 | jited_linfo从注释来看， JIT可以理解成把 BPF指令逐一映射为机器指令
 22 | 
 23 | ![image-20221103163709300](eBPF_prog_kernel.assets/image-20221103163709300.png)
 24 | 
 25 | **attach_btf_id** 
 26 | 
 27 | prog attach点代表的btf。
 28 | 
 29 | 以struct_op为例，这里的btf点对应内核的数据结构 `prog->aux->attach_btf_id != st_ops->type_id`
 30 | 
 31 | 
 32 | 
 33 | ## 主要函数
 34 | 
 35 | ### BPF_PROG_LOAD
 36 | 
 37 | **调用逻辑**
 38 | 
 39 | `bpf_prog_load`  系统调用，加载eBPF程序
 40 | 
 41 | -> `struct bpf_prog *prog, *dst_prog = NULL;`
 42 | 
 43 | -> 参数，权限，license检查
 44 | 
 45 | ->   ` ? dst_prog = bpf_prog_get(attach_prog_fd)`  or ` ? attach_btf = bpf_get_btf_vmlinux()`
 46 | 
 47 | ->`bpf_prog_load_fixup_attach_type(attr)` 为了做兼容，兼容一些旧版本的eBPF程序不需要指定 expected_attach_type
 48 | 
 49 | -> `bpf_prog_load_check_attach(type, expected_attach_type, attach_btf, attach_btf_id, dst_prog) `  , 检查attach的类型和prog_type 是否匹配，prog_type和 attach_btf, dst_prog是否匹配, 参数合法性检查
 50 | 
 51 | -> `prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);`   为prog分配内存。 gfp_flags : GFP_KERNEL_ACCOUNT |  GFP_USER (代表为用户态分配的内存但是内核可以直接访问)
 52 | 
 53 | ​	--> `prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags)`   为 `bpf_prog`结构体除了 bpf_prog_stats 之外的成员分配内存
 54 | 
 55 | ​		--->  `struct bpf_prog_aux *aux; struct bpf_prog *fp;`
 56 | 
 57 | ​		---> `fp->aux->prog = fp;`  bpf_prog_aux 辅助数据结构的prog成员变量同时指向 bpf_prog
 58 | 
 59 | ​		---> `prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);`    每一个CPU都有一份 bpf_prog_stats （记录一些bpf程序的状态信息）
 60 | 
 61 | ​        ---> `for_each_possible_cpu` 初始化 ` prog->stats`
 62 | 
 63 | -> 根据系统调用传入的参数`attr`初始化 `prog`和`prog->aux`的字段，例如`prog->expected_attach_type, prog->aux->attach_btf = attach_btf`
 64 | 
 65 | -> `prog->aux->user = get_current_user();`  初始化加载该Prog的用户信息
 66 | 
 67 | -> `copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel),  bpf_prog_insn_size(prog))`  从用户态把eBPF字节码拷贝到内核态内存中
 68 | 
 69 | -> `(bpf_prog_is_dev_bound(prog->aux))? bpf_prog_offload_init(prog, attr);`  把bpf程序卸载到硬件例如智能网卡上执行
 70 | 
 71 | -> `find_prog_type(type, prog)` **查找load的eBPF程序类型内核是否支持，并设置内核预定义的设置**。设置prog->type
 72 | 
 73 | ​	--> `type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); `  全局变量 `bpf_prog_types` 保存着内核支持的所有BPF程序类型及其对应的信息(例如 op )
 74 | 
 75 | ​	--> `ops = bpf_prog_types[type];`  
 76 | 
 77 | ​    --> `bpf_prog_is_dev_bound(prog->aux) ? prog->aux->ops = &bpf_offload_prog_ops; : prog->aux->ops = ops` **ops是bpf_prog相关的重要hook点，在这里设置通过查找array来设置对应的Hook** 
 78 | 
 79 | ​	--> `prog->type = type`
 80 | 
 81 | -> `err = bpf_check(&prog, attr, uattr);` **运行eBPF验证器** 做一些验证同时还要做一些 relocate的操作，以后再看
 82 | 
 83 | -> `prog = bpf_prog_select_runtime(prog, &err);` 选择运行环境 JIT 或者是解释器（优先使用JIT)
 84 | 
 85 | ​	--> `bpf_prog_select_func(fp)` 
 86 | 
 87 | ​		---> `fp->bpf_func = __bpf_prog_ret0_warn;`  如果需要JIT，则先使用默认的占位函数，该函数直接返回0什么都不干, 并打印警告表示JIT没有生效 。因为如果JIT成功， bpf_func 会被 bpf_int_jit_compile 函数替换
 88 | 
 89 | ​	--> `for !bpf_prog_is_dev_bound(fp->aux)`
 90 | 
 91 | ​	    --->  `*err = bpf_prog_alloc_jited_linfo(fp);` 分配空间
 92 | 
 93 | ​		---> `fp = bpf_int_jit_compile(fp);`  体系结构相关, 做一一映射,  设置 `fp->bpf_func` 。 如果JIT失败该函数返回没有被JIT的program, 使得能够回退回解释器。
 94 | 
 95 | ​		---> `bpf_prog_jit_attempt_done(fp);` 检查JIT是否成功
 96 | 
 97 | ​    -->` bpf_check_tail_call(fp)` 检查尾调用的程序类型是否匹配
 98 | 
 99 | ​		--->`bpf_prog_map_compatible(map, fp)` 
100 | 
101 | -> `err = bpf_prog_alloc_id(prog);` 分配全局唯一的 prog_id 
102 | 
103 | -> `bpf_prog_kallsyms_add(prog);`  暴露该 bpf_prog的地址 (暴露给 kallsyms) 在 bpf_prog_new_fd之前执行否则会出问题(有相关的patch讨论这个问题)。 因此可以用 eBPF去 trace BPF程序
104 | 
105 | -> `perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);` 猜测：添加bpf相关的perf event
106 | 
107 | -> `bpf_audit_prog(prog, BPF_AUDIT_LOAD);` 记录相关的审计信息
108 | 
109 | -> `err = bpf_prog_new_fd(prog);` 生成 fd 
110 | 
111 | ### BPF_PROG_ATTACH
112 | 
113 | 
114 | 
115 | ## 编程技巧
116 | 
117 | ### 通过宏文件来自动化生成结构体
118 | 
119 | 通过宏来控制包含头文件里不同的内容，从而根据头文件里的宏定义自动化地生成全局变量
120 | 
121 | ![image-20221103161824697](eBPF_prog_kernel.assets/image-20221103161824697.png)
122 | 
123 | 


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_prog_kernel/eBPF_prog_kernel.pptx


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.assets/image-20220627143836218.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.assets/image-20220627143836218.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.assets/image-20220627144244437.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.assets/image-20220627144244437.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.assets/image-20220627144933988.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.assets/image-20220627144933988.png


--------------------------------------------------------------------------------
/Doc/Notes/eBPF_verifier_note/eBPF_varifier_note.md:
--------------------------------------------------------------------------------
  1 | # eBPF 无法通过验证例子
  2 | 
  3 | ## example1 由于C代码和实际编译的字节码不一致
  4 | 
  5 | **场景** 
  6 | 
  7 | 1. TC 有一个cb数组  ctx->cb[5] 
  8 | 2. cb[0] 存放一个 index 属于 [1,5)
  9 | 3. 根据cb[0] 的index 拿到cb数组的某一个值
 10 | 
 11 | **C代码** 
 12 | 
 13 | ```c 
 14 | #include "common.h"
 15 | 
 16 | SEC("tc")
 17 | int scancb(struct __sk_buff *ctx) {
 18 |     //scan cb
 19 |     __u32 curr_id = ctx->cb[0];
 20 |     __u32 target;
 21 | #pragma unroll
 22 |     for (int i = 1; i < 5; i++) {
 23 |         if (curr_id == i) {
 24 |             target = ctx->cb[i];
 25 |             break;
 26 |         }
 27 |     }
 28 |     //use target
 29 |     bpfprintk("target %d", target);
 30 |     return 0;
 31 | }
 32 | ```
 33 | 
 34 | **编译后的字节码** 
 35 | 
 36 | ```assembly
 37 | test.c.o:	file format ELF64-BPF
 38 | 
 39 | 
 40 | Disassembly of section tc:
 41 | 
 42 | 0000000000000000 scancb:
 43 | ;     __u32 curr_id = ctx->cb[0];
 44 |        0:	r4 = *(u32 *)(r1 + 48)
 45 | ;         if (curr_id == i) {
 46 |        1:	if r4 s> 2 goto +6 <LBB0_3>
 47 |        2:	r2 = 1
 48 |        3:	if r4 == 1 goto +10 <LBB0_8>
 49 |        4:	if r4 == 2 goto +1 <LBB0_5>
 50 |        5:	goto +11 <LBB0_9>
 51 | 
 52 | 0000000000000030 LBB0_5:
 53 |        6:	r2 = 2
 54 |        7:	goto +6 <LBB0_8>
 55 | 
 56 | 0000000000000040 LBB0_3:
 57 |        8:	if r4 == 3 goto +4 <LBB0_6>
 58 |        9:	if r4 == 4 goto +1 <LBB0_7>
 59 |       10:	goto +6 <LBB0_9>
 60 | 
 61 | 0000000000000058 LBB0_7:
 62 |       11:	r2 = 4
 63 |       12:	goto +1 <LBB0_8>
 64 | 
 65 | 0000000000000068 LBB0_6:
 66 |       13:	r2 = 3
 67 | 
 68 | 0000000000000070 LBB0_8:
 69 | ;             target = ctx->cb[i];
 70 |       14:	r2 <<= 2
 71 |       15:	r1 += r2
 72 |       16:	r3 = *(u32 *)(r1 + 48)
 73 | 
 74 | 0000000000000088 LBB0_9:
 75 |       17:	r1 = 100
 76 | ;     bpfprintk("target %d", target);
 77 |       18:	*(u16 *)(r10 - 8) = r1
 78 |       19:	r1 = 2675266157534142836 ll
 79 |       21:	*(u64 *)(r10 - 16) = r1
 80 |       22:	r1 = r10
 81 |       23:	r1 += -16
 82 |       24:	r2 = 10
 83 |       25:	call 6
 84 | ;     return 0;
 85 |       26:	r0 = 0
 86 |       27:	exit
 87 | ```
 88 | 
 89 | **原因分析** 
 90 | 
 91 | 看一下汇编后的代码， r1寄存器存放的是 ctx指针。
 92 | 
 93 | 1. C代码 `__u32 curr_id = ctx->cb[0]` 是能通过验证的，对应于汇编语句 `r4 = *(u32 *)(r1 + 48)` 可以看到汇编语句只是从r1存放的起始地址计算cb数组的地址，并且做一个解引用的操作。curr_id存放在r4寄存器中
 94 | 
 95 | 2. 但是C代码循环体（伪循环，实际是循环展开）里的操作是不能通过验证的。以 curr_id == 1为例，跳转到LBB0_8。对应汇编代码 14-16行
 96 | 
 97 |    ![image-20220627143836218](./eBPF_varifier_note.assets/image-20220627143836218.png)
 98 | 
 99 | ​	此时r2值为1， r2 <<= 2 是因为cb 是一个 u32的数组（左移两位，相当于乘以4，即每个元素占4字节）。关键在于第 15句 r1 += r2 修改了 r1的指针，导致了验证不通过。理想情况下我们希望汇编这么写 `r3 = *(u32 *)(r1 + 52)` 而不是对r1进行修改。
100 | 
101 | 报错如下： 
102 | 
103 | ![image-20220627144244437](./eBPF_varifier_note.assets/image-20220627144244437.png)
104 | 
105 | 可以看到报错提示就是因为修改了 r1 寄存器（存放ctx的指针）。但是这个报错针对的是编译后的字节码，而非原本的C代码。
106 | 
107 | ## example2 
108 | 
109 | 如果上一个例子不使用 for 循环，将C代码更改为： 
110 | 
111 | ```c 
112 | #include "common.h"
113 | 
114 | SEC("tc")
115 | int scancb(struct __sk_buff *ctx) {
116 |     //scan cb
117 |     __u32 curr_id = ctx->cb[0];
118 |     __u32 target = 0;
119 |     if (curr_id == 1) {
120 |         target = ctx->cb[1];
121 |     }
122 | 
123 |     if (curr_id == 2) {
124 |         target = ctx->cb[2];
125 |     }
126 |     //use target
127 |     bpfprintk("target %d", target);
128 |     return 0;
129 | }
130 | char _license[] SEC("license") = "GPL";
131 | ```
132 | 
133 | 使用常数引用 cb 数组
134 | 
135 | ```assembly
136 | test.c.o:	file format ELF64-BPF
137 | 
138 | 
139 | Disassembly of section tc:
140 | 
141 | 0000000000000000 scancb:
142 | ;     __u32 curr_id = ctx->cb[0];
143 |        0:	r2 = *(u32 *)(r1 + 48)
144 | ;     if (curr_id == 1) {
145 |        1:	if r2 == 2 goto +4 <LBB0_3>
146 |        2:	r3 = 0
147 |        3:	if r2 != 1 goto +4 <LBB0_5>
148 | ;         target = ctx->cb[1];
149 |        4:	r1 += 52
150 |        5:	goto +1 <LBB0_4>
151 | 
152 | 0000000000000030 LBB0_3:
153 | ;         target = ctx->cb[2];
154 |        6:	r1 += 56
155 | 
156 | 0000000000000038 LBB0_4:
157 |        7:	r3 = *(u32 *)(r1 + 0)
158 | 
159 | 0000000000000040 LBB0_5:
160 |        8:	r1 = 100
161 | ;     bpfprintk("target %d", target);
162 |        9:	*(u16 *)(r10 - 8) = r1
163 |       10:	r1 = 2675266157534142836 ll
164 |       12:	*(u64 *)(r10 - 16) = r1
165 |       13:	r1 = r10
166 |       14:	r1 += -16
167 |       15:	r2 = 10
168 |       16:	call 6
169 | ;     return 0;
170 |       17:	r0 = 0
171 |       18:	exit
172 | ```
173 | 
174 | 可以看到汇编后， 第4、6行仍然修改了r1寄存器
175 | 
176 | 所以依然报错: 
177 | 
178 | ![image-20220627144933988](eBPF_varifier_note.assets/image-20220627144933988.png)


--------------------------------------------------------------------------------
/Doc/Notes/使用Ftrace修改函数参数.md:
--------------------------------------------------------------------------------
  1 | # 使用Ftrace修改函数参数
  2 | 
  3 | ## 引入
  4 | 
  5 | ebpf程序对于pt_regs变量只能读取无法修改，导致其功能被局限在tracing上，而无法对现有的内核函数进行参数的修改。Ftrace为内核模块提供了api，来对内核符号表(kallsyms)中注册的函数进行添加回调函数的操作。在内核代码([热补丁的实现原理](https://richardweiyang-2.gitbook.io/kernel-exploring/00-index-3/05-kernel_live_patch))中看到了通过使用ftrace修改regs->ip来进行跳板的跳转，于是猜想ftrace可以在回调函数中进行修改寄存器的操作从而修改被hook函数的参数。
  6 | 
  7 | ## 实验流程
  8 | 
  9 | 首先需要一个被hook的函数，这里需要自己写一个能与用户态交互的内核模块，借鉴了[这篇](https://zhuanlan.zhihu.com/p/420194002)文章来实现。具体功能是在用户态使用`cat /dev/lkm_example` 时会触发模块中的`device_read`函数，来向用户态打印hello world。这里选择hook `device_read`来进行实验。
 10 | 
 11 | `device_read`函数代码如下：
 12 | 
 13 | ```c
 14 | #define DEVICE_NAME "lkm_example"
 15 | #define EXAMPLE_MSG "Hello, World!\n"
 16 | #define MSG_BUFFER_LEN 15
 17 | ......
 18 |     
 19 | static char msg_buffer[MSG_BUFFER_LEN];
 20 | static char *msg_ptr;
 21 | ......
 22 |     
 23 | /* When a process reads from our device, this gets called. */
 24 | static ssize_t device_read(struct file *flip, char *buffer, size_t len, loff_t *offset) {
 25 |  int bytes_read = 0;
 26 |   /* If we’re at the end, loop back to the beginning */
 27 |   if (*msg_ptr == 0) {
 28 |    msg_ptr = msg_buffer;
 29 |   }
 30 |   /* Put data in the buffer */
 31 |   while (len && *msg_ptr) {
 32 |     /* Buffer is in user data, not kernel, so you can’t just reference
 33 |      * with a pointer. The function put_user handles this for us */
 34 |     printk("lkm:flip: %lx, buffer: %lx, len: %lx, offset: %lx",flip,buffer,len,offset);//测试函数的参数是否被修改,
 35 |     put_user(*(msg_ptr++), buffer++);
 36 |     len--;
 37 |     bytes_read++;
 38 |  }
 39 |   return bytes_read;
 40 | }
 41 | ```
 42 | 
 43 | hook点准备好了，接下来需要编写ftrace回调函数的模块了，首先需要了解几个api，时间允许可以细读[这篇](https://docs.kernel.org/trace/ftrace-uses.html)文章，省流版如下：
 44 | 
 45 | `struct ftrace_ops`：用来存储ftrace回调函数等信息，其中的成员`ftrace_func_t func`用来保存回调函数的指针
 46 | 
 47 | `ftrace_set_filter`：用来将准备好的`ftrace_ops`限制在指定的hook函数上，如果不设置，则会hook在所有函数上(比如schedule())，很危险- -
 48 | 
 49 | `register_ftrace_function`：用来将准备好的ftrace_ops注册到内核中并启用该hook点，需要在设置hook点之后调用
 50 | 
 51 | ```c
 52 | /**
 53 |  * register_ftrace_function - register a function for profiling
 54 |  * @ops:	ops structure that holds the function for profiling.
 55 |  *
 56 |  * Register a function to be called by all functions in the
 57 |  * kernel.
 58 |  *
 59 |  * Note: @ops->func and all the functions it calls must be labeled
 60 |  *       with "notrace", otherwise it will go into a
 61 |  *       recursive loop.
 62 |  * notrace宏位于<linux/ftrace.h>，用于防止回调函数也被hook而导致无限循环，但好像不用加也可以，并且ftrace提供了其他的机制来防止这一现象
 63 |  */
 64 | int register_ftrace_function(struct ftrace_ops *ops)
 65 | ```
 66 | 
 67 | 
 68 | 
 69 | ```c
 70 | /**
 71 |  * ftrace_set_filter - set a function to filter on in ftrace
 72 |  * @ops - the ops to set the filter with
 73 |  * @buf - the string that holds the function filter text.
 74 |  * @len - the length of the string.
 75 |  * @reset - non zero to reset all filters before applying this filter.
 76 |  *
 77 |  * Filters denote which functions should be enabled when tracing is enabled.
 78 |  * If @buf is NULL and reset is set, all functions will be enabled for tracing.
 79 |  * 这里第二个参数是被hook函数在内核符号表中的名字，第三个参数是名字字符串的长度，第四个参数代表是追加模式还是覆盖模式
 80 |  */
 81 | int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
 82 | 		       int len, int reset)
 83 | ```
 84 | 
 85 | 此外，ftrace规定了回调函数的类型声明
 86 | 
 87 | ```c
 88 | void callback_func(unsigned long ip, unsigned long parent_ip,
 89 |                    struct ftrace_ops *op, struct pt_regs *regs);
 90 | //ip是instruction pointer，指示fentry的指令位置
 91 | //parent_ip指示被hook函数的位置
 92 | ```
 93 | 
 94 | 接下来使用api来编写ftrace模块
 95 | 
 96 | ```c
 97 | // SPDX-License-Identifier: GPL-2.0-only
 98 | #include <linux/module.h>
 99 | 
100 | #include <linux/sched.h> /* for wake_up_process() */
101 | #include <linux/ftrace.h>
102 | 
103 | //自定义的回调函数
104 | static void callback_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *regs){
105 |     //代码开始和结束均是防止循环调用而加的测试代码
106 |     int bit;
107 |     bit = ftrace_test_recursion_trylock(ip, parent_ip);
108 |     if (bit < 0)
109 |         return;
110 | 	//trace逻辑为如下三行，将所有的信息均打印出来
111 |     trace_printk("callback_func! ,ip:%lx, pip:%lx, *op:%lx, *regs:%lx, dx:%lx, ax:%lx, bx:%lx, cx:%lx, si:%lx, di:%lx, r8:%lx,r9:%lx,r12:%lx,r13:%lx,r14:%lx\n",ip,parent_ip,op,regs,regs->regs.dx,regs->regs.ax,regs->regs.bx,regs->regs.cx,regs->regs.si,regs->regs.di,regs->regs.r8,regs->regs.r9,regs->regs.r12,regs->regs.r13,regs->regs.r14);
112 |     regs->regs.dx = 0x0000000000000001ull;//修改dx，相当于修改被hook函数的第三个参数
113 |     trace_printk("regs changed! ,ip:%lx, pip:%lx, *op:%lx, *regs:%lx, dx:%lx, ax:%lx, bx:%lx, cx:%lx, si:%lx, di:%lx, r8:%lx,r9:%lx,r12:%lx,r13:%lx,r14:%lx\n",ip,parent_ip,op,regs,regs->regs.dx,regs->regs.ax,regs->regs.bx,regs->regs.cx,regs->regs.si,regs->regs.di,regs->regs.r8,regs->regs.r9,regs->regs.r12,regs->regs.r13,regs->regs.r14);
114 | 
115 |     ftrace_test_recursion_unlock(bit);
116 | }
117 | 
118 | //配置ftrace_ops
119 | static struct ftrace_ops ops = {
120 |       .func                    = callback_func, //这里设置回调函数
121 |       .flags                   = FTRACE_OPS_FL_SAVE_REGS //具体flag的定义详见上面那篇文章，如果需要读取修改寄存器需要添加该flag
122 |     //   .private                 = any_private_data_structure,
123 | };
124 | 
125 | //在模块初始化中设置hook点并启用
126 | static int __init ftrace_direct_init(void)
127 | {
128 |     ftrace_set_filter(&ops, "device_read", strlen("device_read"), 0);//如果想hook其他函数，修改名字就好
129 |     return register_ftrace_function(&ops);
130 | 
131 | }
132 | //模块卸载时取消注册
133 | static void __exit ftrace_direct_exit(void)
134 | {
135 |     unregister_ftrace_function(&ops);
136 | }
137 | 
138 | module_init(ftrace_direct_init);
139 | module_exit(ftrace_direct_exit);
140 | 
141 | MODULE_AUTHOR("Steven Rostedt");
142 | MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()");
143 | MODULE_LICENSE("GPL");
144 | ```
145 | 
146 | ## 实验结果
147 | 
148 | 首先将两个模块编译并加载到内核后。使用`cat /proc/kallsyms | grep device_read`查询内核符号表，可以看到被hook函数出现在表中(lkm即为被hook的函数所在模块)。
149 | 
150 | ![image-20221025113448249](https://lunqituchuang.oss-cn-hangzhou.aliyuncs.com/image-20221025113448249.png)
151 | 
152 | 使用cat之后开启一个终端使用`sudo cat /sys/kernel/tracing/trace_pipe`来读取回调函数打印的结果。
153 | 
154 | 开启另一个终端执行`cat /dev/lkm_example` 来触发被hook的函数。
155 | 
156 | 查看trace_pipe中的结果如下：
157 | 
158 | ![修改regs结果](https://lunqituchuang.oss-cn-hangzhou.aliyuncs.com/修改regs结果.png)
159 | 
160 | 在终端中使用`sudo dmesg`查看`device_read`中`printk`打印的结果，来验证被hook函数中的参数是否真正被修改了
161 | 
162 | ![image-20221025114118583](https://lunqituchuang.oss-cn-hangzhou.aliyuncs.com/image-20221025114118583.png)
163 | 
164 | 可以看到dx中保存的第三个参数`len`确实被修改了，证明了实验猜想正确
165 | 
166 | ## 附录
167 | 
168 | 代码仓库地址:[https://github.com/balisong77/ftrace_demo](https://github.com/balisong77/ftrace_demo)
169 | 
170 | 参考教程和ftrace编程样例：
171 | 
172 | [https://nixhacker.com/hooking-syscalls-in-linux-using-ftrace/](https://nixhacker.com/hooking-syscalls-in-linux-using-ftrace/)
173 | 
174 | [使用ftrace修改ip出现的问题](https://stackoverflow.com/questions/42966520/restoring-task-pt-regs-when-returning-to-original-function-from-ftrace-handler)修改ip需要`FTRACE_OPS_FL_IPMODIFY`flag被设置
175 | 
176 | [ftrace源码原理小探](https://richardweiyang-2.gitbook.io/kernel-exploring/00-index-3/04-ftrace_internal)
177 | 
178 | 内核模块Makefile模板
179 | 
180 | ```makefile
181 | obj-m = hook_by_name.o
182 | KERNEL_VER = $(shell uname -r)
183 | all:
184 | 	make -C /lib/modules/$(KERNEL_VER)/build M=$(PWD) modules
185 | clean:
186 | 	make -C /lib/modules/$(KERNEL_VER)/build M=$(PWD) clean
187 | ```
188 | 
189 | lkm模块完整代码
190 | 
191 | ```c
192 | #include <linux/init.h>
193 | #include <linux/module.h>
194 | #include <linux/kernel.h>
195 | #include <linux/fs.h>
196 | #include <linux/uaccess.h>
197 | 
198 | MODULE_LICENSE("GPL");
199 | MODULE_AUTHOR("Robert W. Oliver II");
200 | MODULE_DESCRIPTION("A simple example Linux module.");
201 | MODULE_VERSION("0.01");
202 | 
203 | #define DEVICE_NAME "lkm_example"
204 | #define EXAMPLE_MSG "Hello, World!\n"
205 | #define MSG_BUFFER_LEN 15
206 | 
207 | /* Prototypes for device functions */
208 | static int device_open(struct inode *, struct file *);
209 | static int device_release(struct inode *, struct file *);
210 | static ssize_t device_read(struct file *, char *, size_t, loff_t *);
211 | static ssize_t device_write(struct file *, const char *, size_t, loff_t *);
212 |                
213 | static int major_num;
214 | static int device_open_count = 0;
215 | static char msg_buffer[MSG_BUFFER_LEN];
216 | static char *msg_ptr;
217 |                
218 | /* This structure points to all of the device functions */
219 | static struct file_operations file_ops = {
220 |  .read = device_read,
221 |  .write = device_write,
222 |  .open = device_open,
223 |  .release = device_release
224 | };
225 |                
226 | /* Called on read(): copies the message to user space one byte at a time; returns the bytes copied, or -EFAULT if the very first copy faults. */
227 | static ssize_t device_read(struct file *flip, char *buffer, size_t len, loff_t *offset) {
228 |  int bytes_read = 0;
229 |   /* If we’re at the end, loop back to the beginning */
230 |   if (*msg_ptr == 0) {
231 |    msg_ptr = msg_buffer;
232 |   }
233 |   /* Put data in the buffer */
234 |   while (len && *msg_ptr) {
235 |     /* buffer points into user memory, so it must be written with put_user(), which returns non-zero on a fault */
236 |     printk("lkm:flip: %lx, buffer: %lx, len: %zx, offset: %lx\n", (unsigned long)flip, (unsigned long)buffer, len, (unsigned long)offset);
237 |     if (put_user(*msg_ptr, buffer))
238 |       return bytes_read ? bytes_read : -EFAULT; /* report partial progress before reporting the error */
239 |     msg_ptr++; buffer++; len--;
240 |     bytes_read++;
241 |  }
242 |   return bytes_read;
243 | }
244 | 
245 | /* Called when a process tries to write to our device */
246 | static ssize_t device_write(struct file *flip, const char *buffer, size_t len, loff_t *offset) {
247 |  /* This is a read-only device */
248 |   printk(KERN_ALERT "This operation is not supported.\n");
249 |   return -EINVAL;
250 | }
251 |          
252 | /* Called when a process opens our device */
253 | static int device_open(struct inode *inode, struct file *file) {
254 |   /* If device is open, return busy */
255 |   if (device_open_count) {
256 |    return -EBUSY;
257 |   }
258 |   device_open_count++;
259 |   try_module_get(THIS_MODULE);
260 |   return 0;
261 | }
262 |          
263 | /* Called when a process closes our device */
264 | static int device_release(struct inode *inode, struct file *file) {
265 |   /* Decrement the open counter and usage count. Without this, the module would not unload. */
266 |   device_open_count--;
267 |   module_put(THIS_MODULE);
268 |   return 0;
269 | }
270 |          
271 | static int __init lkm_example_init(void) {
272 |   /* Fill buffer with our message */
273 |   strncpy(msg_buffer, EXAMPLE_MSG, MSG_BUFFER_LEN);
274 |   /* Set the msg_ptr to the buffer */
275 |   msg_ptr = msg_buffer;
276 |   /* Try to register character device */
277 |   major_num = register_chrdev(0, "lkm_example", &file_ops);
278 |   if (major_num < 0) {
279 |    printk(KERN_ALERT "Could not register device: %d\n", major_num);
280 |    return major_num;
281 |   } else {
282 |    printk(KERN_INFO "lkm_example module loaded with device major number %d\n", major_num);
283 |    return 0;
284 |   }
285 | }
286 | 
287 | static void __exit lkm_example_exit(void) {
288 |   /* Remember — we have to clean up after ourselves. Unregister the character device. */
289 |   unregister_chrdev(major_num, DEVICE_NAME);
290 |   printk(KERN_INFO "Goodbye, World!\n");
291 | }
292 | 
293 | /* Register module functions */
294 | module_init(lkm_example_init);
295 | module_exit(lkm_example_exit);
296 | ```
297 | 
298 | 


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922182718960.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922182718960.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922183431404.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922183431404.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922184227041.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922184227041.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922184419413.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.assets/image-20210922184419413.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改函数返回值的探索/利用bpf修改函数返回值的探索.md:
--------------------------------------------------------------------------------
  1 | # 利用BPF技术以及kprobe修改函数返回值
  2 | 
  3 | ## bpf_override_return
  4 | 
  5 | 利用bpf技术修改函数返回值，必须使用bpf库提供的函数bpf_override_return。
  6 | 
  7 | 其定义和说明如下： 
  8 | 
  9 | ```c
 10 | /*
 11 |  * bpf_override_return
 12 |  *
 13 |  * 	Used for error injection, this helper uses kprobes to override
 14 |  * 	the return value of the probed function, and to set it to *rc*.
 15 |  * 	The first argument is the context *regs* on which the kprobe
 16 |  * 	works.
 17 |  *
 18 |  * 	This helper works by setting the PC (program counter)
 19 |  * 	to an override function which is run in place of the original
 20 |  * 	probed function. This means the probed function is not run at
 21 |  * 	all. The replacement function just returns with the required
 22 |  * 	value.
 23 |  *
 24 |  * 	This helper has security implications, and thus is subject to
 25 |  * 	restrictions. It is only available if the kernel was compiled
 26 |  * 	with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
 27 |  * 	option, and in this case it only works on functions tagged with
 28 |  * 	**ALLOW_ERROR_INJECTION** in the kernel code.
 29 |  *
 30 |  * 	Also, the helper is only available for the architectures having
 31 |  * 	the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
 32 |  * 	x86 architecture is the only one to support this feature.
 33 |  *
 34 |  * Returns
 35 |  * 	0
 36 |  */
 37 | static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58;
 38 | ```
 39 | 
 40 | 其中 regs 是 bpf kprobe 的环境参数，rc 是修改的返回值。
 41 | 
 42 | 从说明文档我们可以知道 bpf_override_return 的原理： 
 43 | 
 44 | 1. 通过修改 PC(程序计数器)，**使得被 probe 的函数不会执行**（个人理解有点类似通过修改指令的方式拦截被 probe 的函数），并且用 **rc 的值来替代原函数的返回值**，因此这一技术经常用来做 error injection（后文会提到） 
 45 | 2. 要使用该函数，编译内核的时候必须设置CONFIG_BPF_KPROBE_OVERRIDE 为 yes ，（按照说明目前只有 x86架构的机器支持这一特性）
 46 | 3. 只能对标记了 **ALLOW_ERROR_INJECTION** 的函数使用 bpf_override_return 
 47 | 
 48 | 这个函数的原理不难理解，现在简单讨论一下**ALLOW_ERROR_INJECTION** 
 49 | 
 50 | ### **ALLOW_ERROR_INJECTION** 
 51 | 
 52 | 首先在 mptcp-0.95的内核中，被打上该标记的函数并不多： 
 53 | 
 54 | 以下是对 **ALLOW_ERROR_INJECTION**  标记的检索结果
 55 | 
 56 | ![image-20210922182718960](利用bpf修改函数返回值的探索.assets/image-20210922182718960.png)
 57 | 
 58 | 而 ALLOW_ERROR_INJECTION 根据源码其实是一段宏定义
 59 | 
 60 | ```c
 61 | #ifdef CONFIG_FUNCTION_ERROR_INJECTION
 62 | /*
 63 |  * Whitelist ganerating macro. Specify functions which can be
 64 |  * error-injectable using this macro.
 65 |  */
 66 | #define ALLOW_ERROR_INJECTION(fname, _etype)				\
 67 | static struct error_injection_entry __used				\
 68 | 	__attribute__((__section__("_error_injection_whitelist")))	\
 69 | 	_eil_addr_##fname = {						\
 70 | 		.addr = (unsigned long)fname,				\
 71 | 		.etype = EI_ETYPE_##_etype,				\
 72 | 	};
 73 | #else
 74 | #define ALLOW_ERROR_INJECTION(fname, _etype)
 75 | #endif
 76 | #endif
 77 | 
 78 | ```
 79 | 
 80 | ### bpf_override_return的设计初衷
 81 | 
 82 | bpf 为了保证安全性，该函数的使用受到了较为严格的限制。该函数的文档中也说明，它的初衷是 error injection：方便注入 error，使得程序在需要的时候停止，其本质是为了方便测试和调试
 83 | 
 84 | ## bcc inject 
 85 | 
 86 | bcc 的 inject工具比较好的应用了 bpf_override_return 
 87 | 
 88 | 这个工具对底层的三个函数 进行 kprobe,  并修改其返回值，使得返回指定的 errno,达到注入错误，并使得整条调用链失败。
 89 | 
 90 | 三个函数分别对应注入的三种模式 (参数 mode )
 91 | 
 92 | ![image-20210922183431404](利用bpf修改函数返回值的探索.assets/image-20210922183431404.png)
 93 | 
 94 | 因为这三个函数属于底层的函数，大部分的函数都会调用到。
 95 | 
 96 | ### inject 的主要功能
 97 | 
 98 | inject 的功能可以描述为： 
 99 | 
100 | 用户可以指定 一条函数调用链以及调用链上每一个函数的参数条件，当条件满足的时候，根据用户选择的模式（底层的注入函数，三个模式选一个），让底层的注入函数返回特定的 errno, 达到让整条调用链失败的效果
101 | 
102 | ## inject的测试和 bpf_override_return的测试
103 | 
104 | 下面我基于bcc inject 工具做了一个简单的实验，验证 bpf_override_return的实际作用
105 | 
106 | 实验步骤： 
107 | 
108 | 1. 我使用 kmalloc 模式，并且设置，当 tcp_v4_connect 函数被调用的时候，底层的 `should_failslab` 会返回指定的 error，使得 tcp_v4_connect 调用失败。命令为： `sudo inject kmalloc -v 'tcp_v4_connect()'`
109 | 2. 我打开浏览器，作为触发实验的对象
110 | 3. 我使用 bcc trace 工具，追踪 tcp_v4_connect()函数的返回值，以及 should_failslab 的返回值，命令为： `sudo trace -L 3849 'r::should_failslab "ret: %d",retval' 'r::tcp_v4_connect() "ret: %d", retval' ` (-L 指定浏览器对应的线程)
111 | 
112 | 实验结果： 
113 | 
114 | 1. 无法通过浏览器访问网页 （因为 tcp_v4_connect()调用失败了） 
115 | 2. 追踪结果的分析： 
116 | 
117 | ![image-20210922184227041](利用bpf修改函数返回值的探索.assets/image-20210922184227041.png)
118 | 
119 | 从追踪结果我们可以看到，tcp_v4_connect 函数和部分 should_failslab 函数的返回值为 -12，查询可知该返回值对应的 errno 是：
120 | 
121 | ![image-20210922184419413](利用bpf修改函数返回值的探索.assets/image-20210922184419413.png)
122 | 
123 | 即 ENOMEM（-12，内存不足），和 kmalloc 分配失败的含义相同。
124 | 
125 | 这个简单的小实验证明了，bpf_override_return是有效的
126 | 
127 | ## bpf_override_return的应用
128 | 
129 | 我们明确，该函数的主要作用是错误注入。
130 | 
131 | 并且该函数的使用条件还是很严苛的。目前**我们无法直接使用** 
132 | 
133 | 但是这并不意味着该函数没用。我目前对该函数应用设想主要在未来修改内核阶段： 
134 | 
135 | 1. **我们可以在指定的地方，设置相应的钩子函数，并给钩子函数打上相应的ALLOW_ERROR_INJECTION标签** 
136 | 2. **钩子函数的返回值对应着下一段程序的分支** 
137 | 3. **我们通过kprobe 以及 bpf_override_return 来控制 钩子函数的返回值，以此来达到根据参数值跳过某一函数的执行，或者决定函数分支的效果** 
138 | 
139 | 举个例子： 
140 | 
141 | 假设我们要修改mptcp_established_option函数
142 | 
143 | ```c
144 | void mptcp_established_option(args...) {
145 |    if (my_hook(args...)) {   /* non-zero (set via bpf_override_return) => skip; default 0 => run normally */
146 |      return;
147 |    }
148 | }
149 | int  my_hook(args...) {
150 |   return 0;
151 | }
152 | ```
153 | 
154 | my_hook 是一个钩子函数，我们把 mptcp_established_option 的参数传递进去，但是什么也不做，默认返回 0（不跳过 mptcp_established_option 的执行）
155 | 
156 | 我们可以利用 bpf kprobe , 根据mptcp_established_option的参数，决定是否跳过 mptcp_established_option的执行。我们可以利用 bpf_override_return 修改 my_hook 的返回值，来达到这一效果。
157 | 
158 | 
159 | 
160 | 以上只是一个例子，也只是我的设想，但是这种做法对性能的损耗并没有实际验证过。
161 | 
162 | 
163 | 
164 | ## 给 tcp_v4_connect注入错误的源码
165 | 
166 | 最后附上 inject 给tcp_v4_connect 注入错误的 c 源码： 
167 | 
168 | 这段源码的重点在于 should_failslab_entry 函数
169 | 
170 | 其中后缀带 entry 是 kprobe 
171 | 
172 | 后缀是 exit 是 kretprobe
173 | 
174 | ```c
175 | #include <linux/mm.h>
176 | 
177 | struct pid_struct {
178 |     u64 curr_call; /* book keeping to handle recursion */
179 |     u64 conds_met; /* stack pointer */
180 |     u64 stack[2];
181 | };
182 | BPF_HASH(m, u32, struct pid_struct);
183 | BPF_ARRAY(count, u32, 1);
184 | int tcp_v4_connect_entry(struct pt_regs *ctx)
185 | {
186 |         u32 pid = bpf_get_current_pid_tgid();
187 |         
188 |         /*
189 |          * Early exit for probability case
190 |          */
191 |         if (false)
192 |                return 0;
193 |         /*
194 |          * Top level function init map
195 |          */
196 |         struct pid_struct p_struct = {0, 0};
197 |         m.insert(&pid, &p_struct);
198 |         
199 | 
200 |         struct pid_struct *p = m.lookup(&pid);
201 | 
202 |         if (!p)
203 |                 return 0;
204 | 
205 |         /*
206 |          * preparation for predicate, if necessary
207 |          */
208 |          
209 |         /*
210 |          * Generate entry logic
211 |          */
212 |         
213 | 
214 |         if (p->conds_met >= 2)
215 |                 return 0;
216 |         if (p->conds_met == 0 && (true)) {
217 |                 p->stack[0] = p->curr_call;
218 |                 p->conds_met++;
219 |         }
220 | 
221 |         p->curr_call++;
222 | 
223 |         return 0;
224 | }
225 | int tcp_v4_connect_exit(struct pt_regs *ctx)
226 | {
227 |         u32 pid = bpf_get_current_pid_tgid();
228 | 
229 |         struct pid_struct *p = m.lookup(&pid);
230 | 
231 |         if (!p)
232 |                 return 0;
233 | 
234 |         p->curr_call--;
235 | 
236 |         /*
237 |          * Generate exit logic
238 |          */
239 |         
240 |         if (p->conds_met < 1 || p->conds_met >= 3)
241 |                 return 0;
242 | 
243 |         if (p->stack[p->conds_met - 1] == p->curr_call)
244 |                 p->conds_met--;
245 |         
246 |         
247 |         /*
248 |          * Top level function clean up map
249 |          */
250 |         m.delete(&pid);
251 |         
252 |         return 0;
253 | }
254 | int should_failslab_entry(struct pt_regs *ctx, struct kmem_cache *s, gfp_t gfpflags)
255 | {
256 |         u32 overridden = 0;
257 |         int zero = 0;
258 |         u32* val;
259 | 
260 |         val = count.lookup(&zero);
261 |         if (val)
262 |             overridden = *val;
263 | 
264 |         /*
265 |          * preparation for predicate, if necessary
266 |          */
267 |          
268 |         /*
269 |          * If this is the only call in the chain and predicate passes
270 |          */
271 |         if (2 == 1 && (true) && overridden < -1) {
272 |                 count.atomic_increment(zero);
273 |                 bpf_override_return(ctx, -ENOMEM);
274 |                 return 0;
275 |         }
276 |         u32 pid = bpf_get_current_pid_tgid();
277 | 
278 |         struct pid_struct *p = m.lookup(&pid);
279 | 
280 |         if (!p)
281 |                 return 0;
282 | 
283 |         /*
284 |          * If all conds have been met and predicate passes
285 |          */
286 |         if (p->conds_met == 1 && (true) && overridden < -1) {
287 |                 count.atomic_increment(zero);
288 |                 bpf_override_return(ctx, -ENOMEM);
289 |         }
290 |         return 0;
291 | }
292 | 
293 | ```
294 | 
295 | 


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013230335647.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013230335647.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013230702277.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013230702277.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013230710713.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013230710713.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013231022799.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013231022799.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013231143806.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013231143806.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013232705596.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013232705596.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013233018492.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.assets/image-20211013233018492.png


--------------------------------------------------------------------------------
/Doc/Notes/利用bpf修改用户空间函数参数/利用bpf修改用户空间函数参数.md:
--------------------------------------------------------------------------------
  1 | # 利用bpf修改用户空间函数参数
  2 | 
  3 | ## 前言 
  4 | 
  5 | 最近成功利用bpf，结合 uprobe 修改用户态系统调用的函数参数。结论可以概括为： 
  6 | 
  7 | 1. 可以利用bpf提供的 bpf_probe_write_user 来实现这一目标，可以修改用户态的库函数，自己写的用户态程序的函数等(不局限于C语言)
  8 | 2. bpf目前仅提倡该函数用在测试和debug中，使用 bpf_probe_write_user 函数，会在日志中打印警告信息
  9 | 
 10 | ## 修改函数参数
 11 | 
 12 | linux 大量使用了 C 语言，因此本次针对C 语言的函数修改。我们知道 C 语言中，函数参数有两种类型： 
 13 | 
 14 | 1. 按值传递的普通参数 （int) 
 15 | 2. 指针 （void*)
 16 | 
 17 | 本次我实验成功的是第二种，对于第一种，我有两种思路，但是没有成功，我后面会稍微讨论一下。
 18 | 
 19 | ### 修改指针类型的参数
 20 | 
 21 | 我们知道，指针是变量的地址，如果参数是指针的话，我们的思路很简单： 
 22 | 
 23 | **获取变量的地址，并利用 bpf_probe_write_user 修改内存的内容** 
 24 | 
 25 | 本次实验，修改 `int setsockopt(int sockfd, int level, int optname, const void *optval, socklen_t optlen)` 参数optval的值
 26 | 
 27 | 核心代码如下： 
 28 | 
 29 | ```c
 30 | #include <linux/ptrace.h>
 31 | BPF_PERF_OUTPUT(events);
 32 | 
 33 | struct Event {
 34 |   int opt;
 35 |   int val_after;
 36 |   int val_before;
 37 | };
 38 | 
 39 | int override_setsockopt(struct pt_regs *ctx, int sockfd, int level, int optname, const void *optval) {
 40 |     if (optname ==  42) {
 41 |         int before = *(int*)optval;
 42 |         int after = 0;
 43 |         void *optv = (void*)optval; // remove const
 44 |         if (bpf_probe_write_user(optv, &after, sizeof(int))) {
 45 |             return -1;
 46 |         }
 47 |         struct Event e = {optname, *(int*)optval, before};
 48 |         events.perf_submit(ctx, &e, sizeof(e));
 49 |     }
 50 |     return 0;
 51 | }
 52 | ```
 53 | 
 54 | 其中： 
 55 | 
 56 | `void *optv = (void*)optval;` 这一行去除 const 限制
 57 | 
 58 | 这段代码的含义就是，每当 setsockopt 被调用的时候，会被 uprobe 拦截，判断 optname 是不是 42(MPTCP_ENABLED)，如果是，就强行将该选项的值设置为 0（不启用 mptcp_enabled）。为了方便观测，我将 optname、optval 的原始值以及修改后指针指向内存的值返回到用户态。
 59 | 
 60 | 值得一提的是： 
 61 | 
 62 | `bpf_probe_write_user(optv, &after, sizeof(int))` 
 63 | 
 64 | 写内存我们需要知道写多少长度，在 C 语言中, void* 指针（单纯表示内存地址）作为参数往往需要配合地址长度，从这个角度来说修改void*类型的指针是比较容易的。
 65 | 
 66 | #### **实验** 
 67 | 
 68 | 我使用师兄写的 mptcp_pingpong 来进行实验，并使用 wireshark 抓 pingpong 程序通信的 packet，判断抓到的packet协议类型是不是mptcp
 69 | 
 70 | mptcp_pingpong 默认使用了 mptcp 协议，我利用 bpf 程序强制将 mptcp_enable 选项的值设置为 0 
 71 | 
 72 | 因此预期的实验效果是： **使用 bpf程序之前，能够抓到 pingpong 的 mptcp 协议包， 而启动 bpf 程序后抓不到mptcp协议包** 
 73 | 
 74 | #### **实验结果** 
 75 | 
 76 | **使用bpf程序前** 
 77 | 
 78 | ![image-20211013230335647](利用bpf修改用户空间函数参数.assets/image-20211013230335647.png)
 79 | 
 80 | 
 81 | 
 82 | 从下方的wireshark 可以看到抓到了mptcp协议包 （目的ip 是 192.168.124.22) 
 83 | 
 84 | **使用 bpf程序之后**
 85 | 
 86 | ![image-20211013230702277](利用bpf修改用户空间函数参数.assets/image-20211013230702277.png)
 87 | 
 88 | ![image-20211013230710713](利用bpf修改用户空间函数参数.assets/image-20211013230710713.png)
 89 | 
 90 | 可以看到只抓到了 tcp的包，而没有 mptcp的包
 91 | 
 92 | 说明 setsockopt 被 bpf 程序拦截并禁用了 mptcp 
 93 | 
 94 | ### 非指针类型的参数
 95 | 
 96 | 对于非指针类型的参数，因为他们是 copy by value 我们修改了 uprobe 函数的参数值，也无法修改原始函数的参数值。
 97 | 
 98 | 对于 x86架构的机器，有两个思路
 99 | 
100 | 首先我们看一下 pt_regs 在 x86架构下的定义： 
101 | 
102 | ![image-20211013231022799](利用bpf修改用户空间函数参数.assets/image-20211013231022799.png)
103 | 
104 | 可以看到 pt_regs 保存着uprobe之前函数调用的寄存器信息。
105 | 
106 | 查询x86架构寄存器的使用方案，可以得到： 
107 | 
108 | ![image-20211013231143806](利用bpf修改用户空间函数参数.assets/image-20211013231143806.png)
109 | 
110 | 有两个关键信息： 
111 | 
112 | 1. 对于前6个参数，参数值保存在寄存器中。（因此也可以通过寄存器来获取参数，这也是 bcc 的 trace 利用 arg 占位符来获取参数值的原理，可以考虑之后加到 my_trace 中） 
113 | 2. sp寄存器是堆栈指针寄存器保存栈顶的地址，gcc编译器默认将参数从右到左压栈
114 | 
115 | 因此我们有两个思路：
116 | 
117 | 1. 修改对应寄存器的值
118 | 2. 利用 sp 指针，计算相应参数在堆栈中的地址，获取地址并修改堆栈内容，来达到修改参数值的效果，这也是文章（ https://cloud.tencent.com/developer/article/1790913 ）采用的思路 
119 | 
120 | 然而我目前： 
121 | 
122 | 1. 方案1 还没测试过，但我觉得大概率是不能成功的
123 | 2. 方案2 我试过了，但是并不能读出正确的内存的值
124 | 
125 | 还是以 setsockopt 为例 ，我首先希望通过 sp 寄存器来获取 optname 这个参数的值(第三个参数）： 
126 | 
127 | 根据https://zhuanlan.zhihu.com/p/27339191 文章的叙述，x86架构，堆栈顶应该是 父函数的返回地址，往下依次是从左到右的各个参数： 
128 | 
129 | 代码如下：
130 | 
131 | ```c 
132 | if (optname ==  42) {
133 | 	u64 sp = ctx->sp;
134 | 	u64 addr = sp + sizeof(void*) + sizeof(int) + sizeof(int) + sizeof(int);
135 | 	int opt = 0;
136 | 	bpf_probe_read_user(&opt, sizeof(int), (void*)addr);
137 | }
138 | ```
139 | 
140 | 然而我并不能读到正确的 optname ， 我利用文章 https://cloud.tencent.com/developer/article/1790913， 将最后的写操作改为读操作也并不能得到正确的结果。
141 | 
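142 | 我对汇编还不是特别熟悉，知道了 sp 寄存器的值，我还不能正确读到堆栈的地址，如果能做到这一步，应该是可以实现修改这种类型的参数值的。（sp 保存段内地址偏移，而 ss 保存堆栈的段地址）另外一个可能的原因（待验证）：按照 x86-64 System V 调用约定，前 6 个整型/指针参数通过寄存器 rdi、rsi、rdx、rcx、r8、r9 传递，函数入口处 optname 位于 rdx 而不在栈上，因此按 sp 偏移读不到它。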
143 | 
144 | 然而即使上述都成功了，我觉得这种方式也不太好，因为根据底层的寄存器来修改参数值，受到体系结构的限制比较大，不同的体系结构的寄存器使用方式都不同。
145 | 
146 | ### bpf不具备拦截的功能
147 | 
148 | 根据内核的文档： https://www.kernel.org/doc/html/latest/bpf/bpf_design_QA.html#q-can-bpf-overwrite-arbitrary-user-memory
149 | 
150 | ![image-20211013232705596](利用bpf修改用户空间函数参数.assets/image-20211013232705596.png)
151 | 
152 | 内核并不支持利用 bpf 修改内存， 也不提供这个功能。因此利用bpf修改用户态内存这种方式有待商榷，还需要考虑到效率问题，和一些其他的安全性问题（比如多线程，竞争冒险）
153 | 
154 | 使用 bpf_probe_write_user, 在日志中会出现： 
155 | 
156 | ![image-20211013233018492](利用bpf修改用户空间函数参数.assets/image-20211013233018492.png)
157 | 
158 | 
159 | 
160 | ## 代码
161 | 
162 | 最后附上，修改指针类型参数的代码： 
163 | 
164 | ```python 
165 | from bcc import BPF
166 | import sys
167 | import argparse
168 | 
169 | def gen_tgid_filter(tgid):
170 |     tgid_str = ""
171 |     if tgid != None :
172 |         tgid_str = """
173 | if (__tgid != %d) { return 0; }
174 |         """%tgid
175 |     return tgid_str
176 | 
177 | def gen_pid_filter(pid):
178 |     pid_str = ""
179 |     if pid != None :
180 |         pid_str = """
181 | if (__pid != %d) { return 0; }
182 |         """%pid
183 |     return pid_str
184 | 
185 | program = """
186 | #include <linux/ptrace.h>
187 | BPF_PERF_OUTPUT(events);
188 | 
189 | struct Event {
190 |   int opt;
191 |   int val_after;
192 |   int val_before;
193 | };
194 | 
195 | int override_setsockopt(struct pt_regs *ctx, int sockfd, int level, int optname, const void *optval) {
196 |     u64 __pid_tgid = bpf_get_current_pid_tgid();
197 |     u32 __tgid = __pid_tgid >> 32;
198 |     u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half
199 |     //tgid
200 |     %s
201 |     //pid
202 |     %s
203 |     if (optname ==  42) {
204 |         int before = *(int*)optval;
205 |         int after = 0;
206 |         void *optv = (void*)optval; // remove const
207 |         if (bpf_probe_write_user(optv, &after, sizeof(int))) {
208 |             return -1;
209 |         }
210 |         struct Event e = {optname, *(int*)optval, before};
211 |         events.perf_submit(ctx, &e, sizeof(e));
212 |     }
213 |     return 0;
214 | }
215 | """
216 | 
217 | def print_event(cpu, data, size):
218 |   event = bpf["events"].event(data)
219 |   print("override, opt: %d, val_before: %d, val_after: %d"%(event.opt, event.val_before, event.val_after))
220 | 
221 | if __name__ == '__main__' :
222 |     parser = argparse.ArgumentParser(description = "Attach to functions and print trace messages.")
223 |     parser.add_argument("-p", "--pid", type = int, metavar = "PID",
224 |                   dest = "tgid", help = "id of the process to trace (optional)")
225 |     parser.add_argument("-L", "--tid", type = int, metavar = "TID",
226 |                   dest = "pid", help = "id of the thread to trace (optional)")
227 |     args = parser.parse_args()
228 |     program = program%(gen_tgid_filter(args.tgid), gen_pid_filter(args.pid))
229 | 
230 |     bpf = BPF(text = program)
231 |     bpf.attach_uprobe(name = "c" , sym = "setsockopt", fn_name = "override_setsockopt")
232 |     bpf["events"].open_perf_buffer(print_event)
233 | 
234 |     while True:
235 |         try:
236 |             bpf.perf_buffer_poll()
237 |         except KeyboardInterrupt:
238 |             exit()
239 | ```
240 | 
241 | 


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220115144744384.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220115144744384.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220116211129370.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220116211129370.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117144718144-5218977.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117144718144-5218977.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117160645160.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117160645160.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117161105612.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117161105612.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117161317391.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117161317391.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117165340867.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117165340867.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117170708671.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.assets/image-20220117170708671.png


--------------------------------------------------------------------------------
/Doc/Notes/利用xdp修改packet的实践/利用xdp修改packet的实践.md:
--------------------------------------------------------------------------------
  1 | # 利用XDP修改packet的实践
  2 | 
  3 | [toc]
  4 | 
  5 | ## EBPF和XDP基础
  6 | 
  7 | XDP全称是eXpress Data Path(高速数据路径)，位于网卡接收数据包之后，内核网络栈之前，常被用做高效的包处理、包转发、负载均衡等。本文主要是xdp在包处理方面的实践。
  8 | 
  9 | xdp可以**访问并修改**整个packet的内容，xdp处理完packet之后，返回action码，执行不同的操作，action码如下： 
 10 | 
 11 | ```c
 12 | /* User return codes for XDP prog type.
 13 |  * A valid XDP program must return one of these defined values. All other
 14 |  * return codes are reserved for future use. Unknown return codes will
 15 |  * result in packet drops and a warning via bpf_warn_invalid_xdp_action().
 16 |  */
 17 | enum xdp_action {
 18 | 	XDP_ABORTED = 0,  //bpf异常
 19 | 	XDP_DROP,     //将包丢弃
 20 | 	XDP_PASS,     //重新交给内核处理
 21 | 	XDP_TX,       //反射，将包从原有的nic重新发送回去
 22 | 	XDP_REDIRECT, //重定向，定向到不同的cpu, XSK等..
 23 | };
 24 | 
 25 | ```
 26 | 
 27 | 本文主要是对数据包进行修改，不涉及包的转发，因此返回码只使用了 XDP_PASS（将 packet 交给内核处理）和 XDP_DROP（将包丢弃） 
 28 | 
 29 | ### direct packet access 
 30 | 
 31 | XDP目前使用了 direct packet access 技术，可以直接通过指针访问并修改packet的内容。使用这项技术的要点在于，**使用指针之前必须对指针进行验证** 
 32 | 
 33 | 首先，xdp操作对象是： 
 34 | 
 35 | ```c
 36 | /* user accessible metadata for XDP packet hook
 37 |  * new fields must be added to the end of this structure
 38 |  */
 39 | struct xdp_md {
 40 | 	__u32 data;
 41 | 	__u32 data_end;
 42 | 	__u32 data_meta;
 43 | 	/* Below access go through struct xdp_rxq_info */
 44 | 	__u32 ingress_ifindex; /* rxq->dev->ifindex */
 45 | 	__u32 rx_queue_index;  /* rxq->queue_index  */
 46 | };
 47 | ```
 48 | 
 49 | 在本文中主要使用 data 和 data_end ，这是两个指针，分别指向 packet 的头部和尾部，如下图所示： 
 50 | 
 51 | ![image-20220115144744384](利用xdp修改packet的实践.assets/image-20220115144744384.png)
 52 | 
 53 | 验证指的是： 
 54 | 
 55 | 假设 ： 
 56 | 
 57 | 1. 我们要访问的packet区域（上图红色的区域）的开始指针为 ptr
 58 | 2. 区域长度(bytes)为len
 59 | 
 60 | **我们在使用ptr之前必须**： 
 61 | 
 62 | 判断区域是否超过 data_end，即
 63 | 
 64 | ```c
 65 | if ((void*)ptr + len > data_end) {
 66 |     //如果ptr不是有效的指针
 67 |     return;
 68 | }
 69 | // access ptr here
 70 | ```
 71 | 
 72 | **特别值得注意的是，这种验证伴随着编程的全程，只要我们使用了一个新的指针，就必须验证这个指针，否则无法通过验证器的验证** 并且经常会不知道错在哪里，针对这点有一个比较好的编程方案，后文会提及。
 73 | 
 74 | #### example 
 75 | 
 76 | 举一个更具体的例子，假设有一个包最外层是 以太网头， 我们要访问以太网头，那么我们的代码如下： 
 77 | 
 78 | ```c
 79 | void *data_end = (void *)(long)ctx->data_end;
 80 | void *data = (void *)(long)ctx->data;
 81 | 
 82 | struct ethhdr *eth = data;
 83 | 
 84 | /* Byte-count bounds check; check if current pointer + size of header
 85 |     * is after data_end.
 86 | */
 87 | if ((void*)(eth + 1) > data_end) {
 88 |      return -1;
 89 | }
 90 | //access eth here
 91 | ```
 92 | 
 93 | ### 修改packet的大小
 94 | 
 95 | xdp支持修改packet的大小，但是只能在packet的开头，或者包的结尾，
 96 | 
 97 | 增加或者减少packet的大小，并且一旦修改大小之后，之前验证过的所有指针都失效了，必须要重新验证。
 98 | 
 99 | ebpf提供了帮助函数来完成包大小的修改： 
100 | 
101 | ```c
102 | /*
103 |  * bpf_xdp_adjust_head
104 |  *
105 |  * 	Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
106 |  * 	it is possible to use a negative value for *delta*. This helper
107 |  * 	can be used to prepare the packet for pushing or popping
108 |  * 	headers.
109 |  *
110 |  * 	A call to this helper is susceptible to change the underlying
111 |  * 	packet buffer. Therefore, at load time, all checks on pointers
112 |  * 	previously done by the verifier are invalidated and must be
113 |  * 	performed again, if the helper is used in combination with
114 |  * 	direct packet access.
115 |  *
116 |  * Returns
117 |  * 	0 on success, or a negative error in case of failure.
118 |  */
119 | static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44;
120 | 
121 | /*
122 |  * bpf_xdp_adjust_tail
123 |  *
124 |  * 	Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
125 |  * 	possible to both shrink and grow the packet tail.
126 |  * 	Shrink done via *delta* being a negative integer.
127 |  *
128 |  * 	A call to this helper is susceptible to change the underlying
129 |  * 	packet buffer. Therefore, at load time, all checks on pointers
130 |  * 	previously done by the verifier are invalidated and must be
131 |  * 	performed again, if the helper is used in combination with
132 |  * 	direct packet access.
133 |  *
134 |  * Returns
135 |  * 	0 on success, or a negative error in case of failure.
136 |  */
137 | static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65;
138 | ```
139 | 
140 | 这两个分别从packet 头部 和 尾部对packet的大小进行修改。如下图所示:
141 | 
142 | ![image-20220117144718144](利用xdp修改packet的实践.assets/image-20220117144718144-5218977.png)
143 | 
144 | ### ebpf里的函数调用
145 | 
146 | ebpf对函数调用做出了严格的限制，这主要出于安全考虑，基本上常用的库函数都无法使用。
147 | 
148 | bpf支持的函数调用类型如下： 
149 | 
150 | 1. bpf帮助函数
151 | 2. 由`static __always_inline `修饰的自己定义的内联函数
152 | 3. 宏函数，包括自己定义的宏函数，或者是内核头文件定义的宏函数（比如计算校验和的宏函数，container_of) 
153 | 
154 | 简单理解我们能在ebpf中调用的函数，要么是bpf定义好的帮助函数，要么就是内联的函数。因此在项目中定义的函数，都有 `static __always_inline`修饰。
155 | 
156 | ### ebpf尾调用
157 | 
158 | 由于ebpf单个程序的最大长度为 4k(4096)条指令。使用尾调用可以： 
159 | 
160 | 1. 解决ebpf单个程序最大长度限制的问题。ebpf最多支持32次尾调用。
161 | 2. 更好地划分程序的结构，让代码更加易于开发和维护
162 | 
163 | 尾调用和常规的函数调用不同： 
164 | 
165 | 1. 如果一个函数执行了尾调用，那么被调用函数和调用函数的 **bpf程序类型相同** 
166 | 2. 一个函数执行尾调用，跳转到另一个bpf程序之后，函数**不会返回调用函数的执行流**
167 | 
168 | example: 
169 | 
170 | 现在有两个bpf程序，类型都是BPF_XDP， test1 和 test2, test1 尾调用 test2， 如下： 
171 | 
172 | ```c
173 | int test1(struct xdp_md *ctx)
174 | {
175 |     xdp_actions.call(ctx, 0);  //尾调用test2
176 |     
177 |     //之后的代码不会被执行
178 | }
179 | 
180 | int test2(struct xdp_md *ctx)
181 | {
182 |     
183 | }
184 | ```
185 | 
186 | #### 使用尾调用
187 | 
188 | （bcc) 
189 | 
190 | 首先有一种bpf映射，专门为尾调用设计： 
191 | 
192 | `BPF_PROG_ARRAY`
193 | 
194 | 它的key是整数，value是被尾调用的bpf程序的文件描述符。
195 | 
196 | 具体的流程： 
197 | 
198 | 1. 要声明 BPF_PROG_ARRAY 类型的 bpf 映射
199 | 2. 在用户态**给对应的index**写入bpf程序的文件描述符
200 | 3. bpf程序中，在适当的时候执行该映射的 call 方法，进行尾调用
201 | 
202 | 其中 1 和 3 都比较好理解。主要是2: 
203 | 
204 | 我们写完一段bpf程序之后，首先要将其编译成 ELF 文件，然后将其加载到 bpf 虚拟机中。加载成功之后，我们能够获得被加载程序的文件描述符，fd。 2 中做的就是在用户态，将相应的文件描述符写入BPF_PROG_ARRAY 对应的索引中。
205 | 
206 | 值得注意的是（个人理解），尾调用相当于执行一段新的bpf程序，因此**调用前的临时变量无法访问**，合理划分程序结构，在适当的地方进行尾调用对于提升计算效率来说十分重要。
207 | 
208 | ### BPF虚拟文件系统
209 | 
210 | 当使用多个bpf程序时，bpf程序之间需要进行数据通信。数据通信可以通过 BPF 虚拟文件系统来实现。在bcc中为 BPF_TABLE_PINNED。
211 | 
212 | 其定义如下：
213 | 
214 | ```c
215 | BPF_TABLE_PINNED(_table_type, _key_type, _leaf_type, _name, _max_entries, "/sys/fs/bpf/xyz")
216 | ```
217 | 
218 | 作用是： 
219 | 
220 | 如果bpf虚拟文件系统下该table存在（由路径指定），那么使用已经存在的table, 否则创建一个新的table并且挂载到文件系统中。
221 | 
222 | ## 主要设计
223 | 
224 | ### 整体设计
225 | 
226 | 整体的思路可以概括为： 对于每一条mptcp主流，记录并不断更新主流大小，拦截所有的 mptcp add address option并保存，当主流大小达到指定的阈值时，将之前保存的 option 重新放入ack包中。
227 | 
228 | 通过拦截并恢复 add address option，让client在一段时间内无法感知server额外的地址，从而实现对于特定大小的流阻止子流建立的效果。
229 | 
230 | 如下图所示： 
231 | 
232 | ![image-20220116211129370](利用xdp修改packet的实践.assets/image-20220116211129370.png)
233 | 
234 | 横坐标代表流的大小，有两个关键的决策变量： 
235 | 
236 | 1. remove_param 
237 | 2. recover_param 
238 | 
239 | 对于一条特定的流，流由tcp 4元组 (源地址，源端口号，目的地址，目的端口号) 唯一标识，假设当前流大小为x , 由上图所示： 
240 | 
241 | 1. 当 x < remove_param 时，移除收到的packet中的 mptcp add address option ，并保存到 bpf hash 中
242 | 2. 当 x >= recover_param 时，**逐一**将之前保存的 add address option 重新放置回收到包的 tcp options 中，让client能重新感知server新的地址
243 | 
244 | ps: 可以同时维护 $10^7$条主流
245 | 
246 | ### XDP程序
247 | 
248 | 整个XDP程序，由 4 个 xdp 子程序组成，分别是：
249 | 
250 | 1. xdp_main , 主程序
251 | 2. record_flow, 将mptcp主流记录到bpf hash的子程序
252 | 3. store_and_rm_add_addr，将add address 选项从packet中移除，并保存到bpf hash的子程序
253 | 4. recover_add_addr, 将保存的 add address 选项，重新放置回接收到的packet的子程序
254 | 
255 | (这些bpf程序在同名的 .c 文件中)
256 | 
257 | #### xdp_main
258 | 
259 | 在主程序 xdp_main中，符合特定条件时通过 bpf 尾调用，调用 record_flow, store_and_rm_add_addr,recover_add_addr 这三个子程序。主流程如下图所示： 
260 | 
261 | ```mermaid
262 | flowchart TD
263 |     pkt(pkt)-->enable{contains tcp header && enable == 1}
264 |     enable-->|no|XDP_PASS(XDP_PASS)
265 |     enable-->|yes|is_SA{tcp.flags.syn == 1 && tcp.flags.ack == 1}
266 |     is_SA-->|yes|record_flow(record_flow)
267 |     is_SA-->|no|is_fin{tcp.flags.fin == 1}
268 |     is_fin-->|yes|remove_flow(remove_flow from bpf hash)
269 |     is_fin-->|no|flow_size[update flow size]
270 |     flow_size-->is_remove{flow_size < rm_flow_size}
271 |     is_remove-->|yes|rm(store_and_rm_add_addr)
272 |     is_remove-->|no|is_recover{flow_size >= recover_flow_size}
273 |     is_recover-->|yes|recover(recover_add_addr)
274 |     is_recover-->|no|XDP_PASS2(XDP_PASS)
275 | ```
276 | 
277 | 整个流程图还是比较简单的，解释一下流程图里的几个重要的判断节点： 
278 | 
279 | 1. `contains tcp header && enable == 1`  我们只处理tcp包，enable是一个全局的开关，如果 enable == 0 . 那么后面的机制就失效了
280 | 2. `tcp.flags.syn == 1 && tcp.flags.ack == 1`  XDP程序是运行在client上的，即主流的建立是由 client 发送 syn包并携带 mp_capable选项，xdp作用在接收路径上，所以如果收到了服务端的 syn + ack 包，那么这个packet就**有可能是**（具体是不是还要看是否携带mp_capable，由子程序判断）主流上的第二个包，因此尾调用 record_flow, 尝试记录主流
281 | 3. `tcp.flags.fin == 1` 我们使用bpf hash来跟踪当前系统中的所有的mptcp主流，但是hash的容量是有限的，因此一旦收到fin包我们就要主动删除记录的流，释放资源
282 | 4. `flow_size < rm_flow_size` , rm_flow_size 即决策变量 remove_param，符合条件时，尾调用 store_and_rm_add_addr
283 | 5. `flow_size >= recover_flow_size` , recover_flow_size 即决策变量 recover_param，符合条件时，尾调用 recover_add_addr 
284 | 
285 | 主程序和其它被尾调用的bpf子程序通过bpf hash来进行数据的交换： 记录流，记录add address选项，恢复流...， 这个bpf hash是核心的数据结构。
286 | 
287 | #### 核心数据结构
288 | 
289 | 这个bpf_hash 定义如下
290 | 
291 | ```c
292 | BPF_TABLE_PINNED("hash", struct mptcp_connect, struct ring_buffer, mptcp_connects, MAX_CONNECT_SIZE, "/sys/fs/bpf/mptcp_connects");
293 | 
294 | struct mptcp_connect {
295 |     __be32      saddr;
296 |     __be32	daddr;
297 |     __be16	source;
298 |     __be16	dest;
299 | } __attribute__((__packed__));
300 | 
301 | struct xdp_mp_add_addr {
302 | 	__u8	kind;
303 | 	__u8	len;
304 | #if defined(__LITTLE_ENDIAN_BITFIELD)
305 | 	__u8	ipver:4,
306 | 		sub:4;
307 | #elif defined(__BIG_ENDIAN_BITFIELD)
308 | 	__u8	sub:4,
309 | 		ipver:4;
310 | #else
311 | #error	"Adjust your <asm/byteorder.h> defines"
312 | #endif
313 | 	__u8	addr_id;
314 | 	struct in_addr	addr;
315 | 	//__be16		port;
316 | } __attribute__((__packed__));
317 | 
318 | // 永远保持最新的 16 个， 类似滑动窗口
319 | // 因为一张网卡有一个xdp程序，所以不需要并发控制
320 | struct ring_buffer {
321 |     __u64 last_ack;
322 |     __u64 recvd;
323 |     __u32 consumer;   
324 |     __u32 producer;   //next to be produce 
325 |     struct xdp_mp_add_addr buff[RING_MAX_SIZE];
326 | };
327 | ```
328 | 
329 | 1. 这个bpf hash 是一个 pinned table , 挂载在bpf 虚拟文件系统中，使得多个bpf程序可以通过该bpf hash 进行数据通信
330 | 
331 | 2. hash key 为 `mptcp_connect` 其实就是 tcp 四元组
332 | 3. hash value, 是一个称为 ring buffer 的数据结构： 
333 |    * last_ack : 上一次接收到的主流的 mptcp dss 选项 ack 的值，使用该值来计算并更新流的大小
334 |    * recvd: 目前估算的接收到的数据包的总大小（bytes) 即流的大小
335 |    * buff : 这是一个环形队列，队列元素是 mptcp add address 选项，队列大小为 16 
336 |    * consumer: 环形队列消费者指针
337 |    * producer: 环形队列生产者指针
338 | 
339 | 该数据结构的重点就是这个环形队列，该队列的作用是： 
340 | 
341 | 最多保存该mptcp主连接**最近的接收到的RING_MAX_SIZE**个mptcp add address 选项。最近的意思是，假设目前队列已满，但是还是收到了新的 add address, 那么会将最老的 add address 选项删除，写入新的 add address 选项。
342 | 
343 | ring buffer 的完整定义如下： 
344 | 
345 | ```c
346 | #define RING_MAX_SHIFT 4
347 | #define RING_MAX_SIZE (1 << RING_MAX_SHIFT)
348 | #define RING_MASK ((RING_MAX_SIZE) - 1)
349 | 
350 | //0 : we can consume, means producer > consumer  
351 | static __always_inline int check_cons(struct ring_buffer *b) {
352 |     return !(b->consumer < b->producer);   //if consumer < producer return 0   
353 | }
354 | 
355 | 
356 | //once call this func and buff is full, the earliest item must be droped even if confirm isnot be called 
357 | static __always_inline struct xdp_mp_add_addr* ring_buff_prod (struct ring_buffer *b) {
358 |     if ((b->producer - b->consumer) == RING_MAX_SIZE) b->consumer++;
359 |     //produce
360 |     return &(b->buff[b->producer & RING_MASK]);
361 | }
362 | 
363 | static __always_inline void ring_buff_confirm_prod(struct ring_buffer *b) {
364 |     b->producer++;
365 | }
366 | 
367 | //should call check_cons first 
368 | static __always_inline const struct xdp_mp_add_addr* ring_buff_cons(struct ring_buffer *b) {
369 |     return &(b->buff[b->consumer & RING_MASK]);
370 | }
371 | 
372 | static __always_inline void ring_buff_confirm_cons(struct ring_buffer *b) {
373 |     b->consumer++;
374 | }
375 | ```
376 | 
377 | 通过帮助函数： check_cons， ring_buff_prod， ring_buff_confirm_prod ， ring_buff_cons， ring_buff_confirm_cons 来操作这个环形队列。
378 | 
379 | 为了保证队列的一致性，我将生产和消费的过程都拆分成两个步骤： 
380 | 
381 | 1. 尝试 生产/消费
382 | 2. 确认 生产/消费
383 | 
384 | #### record flow 
385 | 
386 | (./kern_programs/record_flow.c)
387 | 
388 | record_flow bpf程序的作用是**记录新的mptcp主流** , 这个bpf程序比较简单，主要流程如下图所示:
389 | 
390 | ```mermaid
391 | flowchart TD
392 | 	pkt(pkt)-->get_tcp_header-->is_mptcp{contains mptcp capable option}
393 | 	is_mptcp-->|no|XDP_PASS(XDP_PASS)
394 | 	is_mptcp-->|yes|record[update mptcp_connect]-->XDP_PASS2(XDP_PASS)
395 | ```
396 | 
397 | 这里的 update mptcp_connect 指的是更新上文提到的核心数据结构 bpf hash（名称为 mptcp_connects，key 的类型为 mptcp_connect）, 可以理解为插入一个新的key。
398 | 
399 | #### store_and_rm_add_addr
400 | 
401 | (./kern_programs/store_and_rm_add_addr.c)
402 | 
403 | 该bpf子程序的作用是： 
404 | 
405 | 1. 判断是否携带 add address options
406 | 2. 如果携带，将 add address options 保存到对应的 ring buffer中，并将其全部更改为 nop(即删除该选项)
407 | 
408 | 流程图如下： 
409 | 
410 | ```mermaid
411 | flowchart TD
412 | 	pkt(pkt)-->get_tcp_header-->add_addr{contains mptcp add addr option}
413 | 	add_addr-->|no|XDP_PASS(XDP_PASS)
414 | 	add_addr-->|yes|record_addaddr-->remove_addaddr-->XDP_PASS2(XDP_PASS)
415 | 	
416 | ```
417 | 
418 | 这个bpf程序也比较简单，比较关键的地方在于，remove add addr 选项并不是通过减少 packet空间，将add addr 选项的字节删除，而是通过用 nop(tcp options中用来表示空选项的选项，常用来作为填充和对齐)覆盖 add addr 的字节来实现的。这样做开销是最小的，在第一部分我们提到，xdp 只能从packet头或者尾扩展空间。
419 | 
420 | #### recover_add_addr 
421 | 
422 | (./kern_programs/recover_add_addr.c)
423 | 
424 | 这个bpf子程序功能描述起来很简单：将之前保存的 add addr 选项重新放置到收到的packet的 tcp options中。虽然功能简单，但是由于bpf的种种限制，具体编码实现的时候有点复杂。
425 | 
426 | 核心问题是如何恢复，这里涉及到以下难点： 
427 | 
428 | 1. 收到的ack包是很有可能携带数据的
429 | 2. tcp选项的长度是不固定的
430 | 3. 前文所述，想要扩展空间只能从头部或者尾部扩展
431 | 
432 | 由于packet携带数据，因此我们只能使用 `bpf_xdp_adjust_head`从头部开始调整空间，然后将原有的包往前“平移“到新的空间，多余的空间插入 add address option。如下图所示： 
433 | 
434 | ![image-20220117160645160](利用xdp修改packet的实践.assets/image-20220117160645160.png)
435 | 
436 | （具体的请参考代码） 
437 | 
438 | ### 工程结构
439 | 
440 | 随着项目增大，一个合适的工程结构，对于之后的开发，新模块的加入，代码的阅读来说都是十分重要的。这个项目是基于 bcc 开发的，这里经过我的探索和实践，摸索出了一套合适（还有待改进）的工程结构，这里对此进行阐述。
441 | 
442 | 目录结构如下： 
443 | 
444 | ![image-20220117161105612](利用xdp修改packet的实践.assets/image-20220117161105612.png)
445 | 
446 | 和 xdp 有关的文件是： 
447 | 
448 | 1. kern_programs ： 存放着所有的内核态bpf程序
449 | 2. xdp_funcs_loader.py : 全自动化的 bpf 程序编译脚本
450 | 3. mptcp_xdp_loader.py : 调用xdp_funcs_loader.py， 用户最终使用的加载 xdp程序的脚本
451 | 
452 | 其中 kern_programs 文件夹内容如下： 
453 | 
454 | ![image-20220117161317391](利用xdp修改packet的实践.assets/image-20220117161317391.png)
455 | 
456 | #### kern_programs
457 | 
458 | 首先在前文，我提到bpf支持的函数调用类型很有限，因此在本项目中我采用： 
459 | 
460 | 1. 共有的内联函数（static __always_inline) 都抽象到公共的头文件中，common.h(一些通用的内联函数和宏函数)， ring_buffer.h(环形队列相关的数据结构定义和方法)
461 | 2. bpf子程序（主程序和被尾调用的程序），一个子程序使用一个.c文件，并且子程序的bpf函数名和文件名相同（这个可以带来很大的好处）
462 | 3. xdp bpf 主程序统一命名为 xdp_main 
463 | 
464 | 以 record_flow.c 为例子： 
465 | 
466 | 其bpf函数定义如下： 
467 | 
468 | ```c
469 | int record_flow(struct xdp_md *ctx) {
470 |     void *data_end = (void *)(long)ctx->data_end;
471 |     void *data = (void *)(long)ctx->data;
472 |     
473 |     __u32 default_action = XDP_PASS; /* Default action */
474 | 
475 |     struct hdr_cursor nh = {.pos=data};
476 |     int res, tcphl;
477 |     struct ethhdr *eth; 
478 |     struct iphdr *iph;
479 |     struct tcphdr *tcph;
480 |     
481 |     tcphl = res = is_tcp_packet(&nh, data_end, &eth, &iph, &tcph);
482 |     .....
483 | }
484 | ```
485 | 
486 | #### xdp_funcs_loader
487 | 
488 | bcc 提供最大的便利，就是让我们能够方便地编译bpf程序，并将其加载到bpf虚拟机中。上文提到，所有的bpf子程序的名称和文件名称是相同的。因此我们扫描 kern_programs 文件夹下的所有的 .c 文件，我们就能够知道所有的 bpf 程序，并获取它们的文件描述符, 这样我们能实现不需要修改编译脚本，自动化地编译所有的bpf子程序，为之后添加新的bpf子程序提供了便利。
489 | 
490 | ```python
491 | PYTHON_SRC_DIR = os.path.abspath(os.path.dirname(__file__))
492 | PROJECT_DIR = os.path.abspath(os.path.dirname(PYTHON_SRC_DIR))
493 | 
494 | KERN_PROGRAM_DIR = os.path.join(PYTHON_SRC_DIR, 'kern_programs')
495 | XDP_MAIN_FUNC = "xdp_main"
496 | 
497 | FILE_BLACK_LIST = ["xdp_sub_program_kern.c", "xdp_add_addr_kern.c"]
498 | #FILE_WHITE_LIST = ["recover_add_addr.c"]
499 | FILE_WHITE_LIST = None
500 | 
501 | XDP_PROG_ARRAY = "xdp_actions"
502 | TAIL_CALL_DICT = {
503 |     "record_flow" : 0,
504 |     "store_and_rm_add_addr" : 1,
505 |     "recover_add_addr" : 2
506 | }
507 | ```
508 | 
509 | 上面是这个脚本的变量定义的部分： 
510 | 
511 | 1. FILE_BLACK_LIST 和 FILE_WHITE_LIST 指定我们跳过或者是只编译某些.c文件 ，这为调试带来了极大的便利
512 | 2. TAIL_CALL_DICT 这是定义了尾调用中索引和文件描述符的关系。
513 | 
514 | 总结一下，通过一些trick 我们就能实现以最小的修改代价实现自动化编译并加载bpf程序，充分发挥bcc的优势，同时也让项目工程结构更加清晰。
515 | 
516 | ### 决策
517 | 
518 | 前文我们提到了有三个决策变量： 
519 | 
520 | 1. enable ：全局开关
521 | 2. remove_param
522 | 3. recover_param
523 | 
524 | 这三个决策变量保存在一个 pinned table 中： 
525 | 
526 | ```c
527 | BPF_TABLE_PINNED("array", u32, int, mptcp_policy_context, 3, "/sys/fs/bpf/mptcp_policy_context")
528 | ```
529 | 
530 | 因此决策做的事就是，在用户态通过某些计算，将这三个变量写入 mptcp_policy_context中，而内核态的xdp程序会从 mptcp_policy_context 读取决策变量，这样就实现动态更改内核态xdp程序行为的目的。
531 | 
532 | 对此我也进行了一些封装： 
533 | 
534 | 其中： 
535 | 
536 | 1. policy_maker.py 封装了各种决策器, 目前有 constant 决策器(写入固定的参数)， 和 cpu决策器（根据cpu利用率，动态调整recover_param） 
537 | 2. mptcp_policy_maker.py ， 调用 policy_maker.py 中的方法，封装了将决策器计算的值写入 mptcp_policy_context中
538 | 
539 | 换言之，如果要新增新的决策器，我们修改policy_maker.py即可，同时修改mptcp_policy_maker.py中对应的配置： 
540 | 
541 | ```python 
542 | MODE_DICT = {
543 |     "constant" : ConstantProxyMaker,
544 |     "cpu-usage" : CPUUsageProxyMaker
545 | }
546 | 
547 | #key : 决策器名， value: 类名
548 | ```
549 | 
550 | #### 开发新的policy maker
551 | 
552 | 新的policy maker类应该具备以下方法和属性
553 | 
554 | ```python 
555 | class NewPolicyMaker
556 |    self.permanent #决策器是否长期存在, true ：长期存在， false: 仅进行一次决策
557 |    self.interval. #如果 self.permanent = true, 该值表示决策的频率
558 | 
559 |    #决策函数，return (remove_param, recover_param)
560 |    def make_decision(self):
561 |         pass
562 | ```
563 | 
564 | 具体的请参照 policy_maker.py中的例子
565 | 
566 | ### 其它编程技巧
567 | 
568 | #### 解决循环问题
569 | 
570 | 我们目前使用的bpf是无法使用循环的，有一个比较方便的替代方案： 
571 | 
572 | ```c
573 | #pragma unroll 40
574 | for (int i = 0; i < 40; i++) {
575 |     ....
576 | }
577 | ```
578 | 
579 | 这里的 40 是循环次数，这个循环次数是一个可以在编译期决定的值，不能是一个变量。#pragma unroll 实际上只是在编译期由编译器将循环展开，这样在bpf验证器看来是没有循环的。
580 | 
581 | 在本项目中比较典型的就是，扫描tcp options。 我们知道 tcp选项长度是不固定的，每个选项的长度也是不固定的，然而bpf又不允许使用循环。因此我只能采用逐字节扫描的方式，配合 goto 和 #pragma unroll 实现。
582 | 
583 | 比如判断 tcp options 中是否携带 mptcp dss 选项的代码如下： 
584 | 
585 | ```c
586 | #define CHECK_MP_DSS(next, curr, opt_l, pos, de){\
587 |     if (curr >= opt_l) {\
588 |         goto out_of_bound;\
589 |     }\
590 |     if (next == curr) {\
591 |         struct mp_dss *tmp = pos;\
592 |         if ((void*)(tmp + 1) > de) {\
593 |             goto out_of_bound;\
594 |         }\
595 |         if (tmp->kind == 30 && tmp->sub == 2){\
596 |             goto out;\
597 |         }\
598 |         if (tmp->kind == 0 || tmp->kind == 1) {\
599 |             pos += 1;\
600 |             next += 1;\
601 |         }\
602 |         else {\
603 |             pos += tmp->len;\
604 |             next+= tmp->len;\
605 |         }\
606 |     }\
607 | }\
608 | 
609 | static __always_inline int carry_mp_dss(struct hdr_cursor *nh,
610 |         void *data_end,
611 |         int opt_length,
612 |         struct mp_dss **mp_dss) {
613 |     //if contains  mptcp option return 0
614 |     void *start = nh->pos;
615 |     int next = 0;
616 |     #pragma unroll 40
617 |     for (int i = 0; i < 40; i++) {
618 |         CHECK_MP_DSS(next, i, opt_length, start, data_end);
619 |     }
620 | 
621 | out_of_bound: 
622 |     return -2;
623 | out: 
624 |     nh->pos = start;
625 |     *mp_dss = (struct mp_dss*)start;
626 |     return 0;
627 | }
628 | ```
629 | 
630 | #### memcpy
631 | 
632 | 在recover_add_addr子程序中，我们需要**"平移header"**, 这是通过将原有的header保存到临时空间中，然后再复制回去实现的。此时memcpy能起比较重要的作用。
633 | 
634 | 但是bpf是不能直接调用memcpy的，我们只能调用 ` __builtin_memcpy(dst,src,4)`
635 | 
636 | 这是一个编译器内建函数（builtin），特别值得注意的是，最后一个参数必须是编译期定值，不能是一个变量，否则是无法通过bpf验证器验证的
637 | 
638 | #### 计算校验和
639 | 
640 | 计算校验和可以直接使用 linux kernel 提供的内联函数。定义在： 
641 | 
642 | `#include <net/checksum.h>`
643 | 
644 | 头文件中。
645 | 
646 | ```c
647 | static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new)
648 | {
649 | 	*sum = ~csum16_add(csum16_sub(~(*sum), old), new);
650 | }
651 | 
652 | 
653 | static inline __sum16 csum16_add(__sum16 csum, __be16 addend)
654 | {
655 | 	u16 res = (__force u16)csum;
656 | 
657 | 	res += (__force u16)addend;
658 | 	return (__force __sum16)(res + (res < (__force u16)addend));
659 | }
660 | ```
661 | 
662 | csum_replace2 比较直观。特别要提醒的是 csum16_add 计算得到的值还需要再按位取反才是tcp header字段中的 csum 
663 | 
664 | ## 遇到的问题和解决策略
665 | 
666 | ### directly packet access 及其验证
667 | 
668 | 这个问题是开发xdp程序中最容易犯的错误，归根到底我们在写代码的过程中一定要时刻牢记： **使用指针之前必须对指针进行验证**。
669 | 
670 | 举个例子： 
671 | 
672 | 还是以扫描 tcp options 中是否携带 mptcp dss 选项为例: 
673 | 
674 | ```c
675 | #define CHECK_MP_DSS(next, curr, opt_l, pos, de){\
676 |     if (curr >= opt_l) {\
677 |         goto out_of_bound;\
678 |     }\
679 |     if (next == curr) {\
680 |         struct mp_dss *tmp = pos;\
681 |         if ((void*)(tmp + 1) > de) {\
682 |             goto out_of_bound;\
683 |         }\
684 |         if (tmp->kind == 30 && tmp->sub == 2){\
685 |             goto out;\
686 |         }\
687 |         if (tmp->kind == 0 || tmp->kind == 1) {\
688 |             pos += 1;\
689 |             next += 1;\
690 |         }\
691 |         else {\
692 |             pos += tmp->len;\
693 |             next+= tmp->len;\
694 |         }\
695 |     }\
696 | }\
697 | 
698 | static __always_inline int carry_mp_dss(struct hdr_cursor *nh,
699 |         void *data_end,
700 |         int opt_length,
701 |         struct mp_dss **mp_dss) {
702 |     //if contains  mptcp option return 0
703 |     void *start = nh->pos;
704 |     int next = 0;
705 |     #pragma unroll 40
706 |     for (int i = 0; i < 40; i++) {
707 |         CHECK_MP_DSS(next, i, opt_length, start, data_end);
708 |     }
709 | 
710 | out_of_bound: 
711 |     return -2;
712 | out: 
713 |     nh->pos = start;
714 |     *mp_dss = (struct mp_dss*)start;
715 |     return 0;
716 | }
717 | ```
718 | 
719 | 上面是正确的代码，但是如果我们修改为： 
720 | 
721 | ```c
722 | #define CHECK_MP_DSS(next, curr, opt_l, pos, de){\
723 |     if (curr >= opt_l) {\
724 |         goto out_of_bound;\
725 |     }\
726 |     if (next == curr) {\
727 |         struct mp_dss *tmp = pos;\
728 |         /*
729 |         //将这部分删除
730 |         if ((void*)(tmp + 1) > de) {\
731 |             goto out_of_bound;\
732 |         }\
733 |         */
734 |         if (tmp->kind == 30 && tmp->sub == 2){\
735 |             goto out;\
736 |         }\
737 |         if (tmp->kind == 0 || tmp->kind == 1) {\
738 |             pos += 1;\
739 |             next += 1;\
740 |         }\
741 |         else {\
742 |             pos += tmp->len;\
743 |             next+= tmp->len;\
744 |         }\
745 |     }\
746 | }\
747 |     
748 |     static __always_inline int carry_mp_dss(struct hdr_cursor *nh,
749 |         void *data_end,
750 |         int opt_length,
751 |         struct mp_dss **mp_dss) {
752 |     //if contains  mptcp option return 0
753 |     void *start = nh->pos;
754 |     int next = 0;
755 |     /*
756 |     	增加一个总的判断
757 |     */
758 |     if (start + opt_length > data_end) {
759 |     	return;
760 |     }
761 |     #pragma unroll 40
762 |     for (int i = 0; i < 40; i++) {
763 |         CHECK_MP_DSS(next, i, opt_length, start, data_end);
764 |     }
765 | 
766 | out_of_bound: 
767 |     return -2;
768 | out: 
769 |     nh->pos = start;
770 |     *mp_dss = (struct mp_dss*)start;
771 |     return 0;
772 | }
773 | ```
774 | 
775 | 验证器就会报错。而且很难找到错误
776 | 
777 | 原因在于，我们自认为在最外层做一次总的判断就够了（判断start），但是bpf验证器并没有那么聪明，它关注的是我们使用的指针 tmp, 因此即使我们最外层做了验证，不在每次使用tmp的时候都进行验证，bpf验证器是不会高兴的。
778 | 
779 | 因此一种比较好的实践是： 
780 | 
781 | 采用扫描的方法： 
782 | 
783 | 定义
784 | 
785 | ```c
786 | struct hdr_cursor {
787 |     void *pos;
788 | };
789 | ```
790 | 
791 | 1. 从packet开始到结束，扫描packet， pos 就是扫描线，每次扫描都进行验证
792 | 2. 用新的变量记录下已经验证过的pos (记录重要的节点)
793 | 
794 | 如下图所示： 
795 | 
796 | ![image-20220117170708671](利用xdp修改packet的实践.assets/image-20220117170708671.png)
797 | 
798 | ethhdr, iphdr, tcphdr 分别是 以太网，ip, tcp 的头部指针，都是 pos 曾经扫描过的地方。
799 | 
800 | ### bpf代码复杂度问题
801 | 
802 | ![image-20220117165340867](利用xdp修改packet的实践.assets/image-20220117165340867.png)
803 | 
804 | 上面这个是bpf验证器的报错，很有误导性。
805 | 
806 | 报错的真实原因我认为是： 
807 | 
808 | bpf程序的圈复杂度太高了，可以简单理解为，分支太多，嵌套的条件判断太多。因为我暂时还没有阅读bpf验证器的源码，比较好的解决方法只有，尽量减少条件判断和嵌套的条件判断，如果一段bpf程序过于复杂那么就拆分成多个 bpf程序，然后使用尾调用。
809 | 
810 | ## 总结
811 | 
812 | 这个项目思路简单，难度在于真正的代码开发和调试，所谓的写代码5min ，调试2h。为了提升bpf程序的开发效率，除了积累总结大量开发经验之外，阅读bpf验证器的源码，弄清楚bpf验证器的实现是很关键的。因为很多时候编译很容易能通过，但是验证不容易通过，报错信息也十分模棱两可。


--------------------------------------------------------------------------------
/Doc/eBPF_Resources.assets/image-20220901115500782.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chonepieceyb/eBPF-documentation/6a4e8d286d7f5d440345d98fa0f7eeb8fed9cd5d/Doc/eBPF_Resources.assets/image-20220901115500782.png


--------------------------------------------------------------------------------
/Doc/eBPF_Resources.md:
--------------------------------------------------------------------------------
  1 | # BPF Documents
  2 | 
  3 | ## 内核邮件讨论
  4 | [eBPF程序执行过程关闭抢占的假设在高版本内核是错误的，一个per-cpu MAP尽量只被一个eBPF程序使用](https://lore.kernel.org/bpf/CAMy7=ZWPc279vnKK6L1fssp5h7cb6cqS9_EuMNbfVBg_ixmTrQ@mail.gmail.com/T/)
  5 | 
  6 | ## 内核社区文档
  7 | 
  8 | [国内的Linux相关内容很不错的网站](http://www.wowotech.net/sort/irq_subsystem)
  9 | 
 10 | ## 权威文档或开发记录
 11 | 
 12 | [kernel-tree](https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next)
 13 | 
 14 | [xdp meta 内核开发提交记录](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de8f3a83b0a0fddb2cf56e7a718127e9619ea3da)
 15 | 
 16 | [libbpf tc attach API](https://patchwork.kernel.org/project/netdevbpf/patch/20210512103451.989420-3-memxor@gmail.com/)
 17 | 
 18 | [eBPF指令集](https://docs.kernel.org/bpf/instruction-set.html)
 19 | 
 20 | [bpf: Introduce bpf sk local storage(每一个socket单独开一块存储给BPF程序,不用再使用HASH MAP)](https://lore.kernel.org/bpf/20190426233939.1330422-1-kafai@fb.com/)
 21 | 
 22 | eBPF启动原子操作 
 23 | 
 24 | ![image-20220901115500782](./eBPF_Resources.assets/image-20220901115500782.png)
 25 | 
 26 | [Program Types and ELF Sections](https://docs.kernel.org/bpf/libbpf/program_types.html#program-types-and-elf)
 27 | 
 28 | [kernel5.6eBPF支持struct_ops和拥塞控制](https://lwn.net/Articles/811631/)
 29 | 
 30 | [kernel5.13eBPF支持调用内核函数](https://lwn.net/Articles/856005/)
 31 | 
 32 | [引入struct_ops的patch](https://lore.kernel.org/bpf/20191214004737.1652076-1-kafai@fb.com/)
 33 | 
 34 | [BTF kernel 文档](https://docs.kernel.org/bpf/btf.html)
 35 | 
 36 | [BPF Iterator 文档](https://docs.kernel.org/bpf/bpf_iterators.html)
 37 | 
 38 | [BPF kptr支持](https://lwn.net/Articles/900749/)
 39 | 
 40 | [SECure COMPuting with filters](https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt)
 41 | 
 42 | [Introduce the BPF dispatcher](https://lore.kernel.org/bpf/20191211123017.13212-1-bjorn.topel@gmail.com/)
 43 | 
 44 | [BPF skeleton介绍](https://manpages.ubuntu.com/manpages/focal/man8/bpftool-gen.8.html) 
 45 | 
 46 | [Long-lived kernel pointers in BPF](https://lwn.net/Articles/900749/)
 47 | 
 48 | ## 教程
 49 | 
 50 | [eBPF程序类型](https://arthurchiao.art/blog/bpf-advanced-notes-1-zh/)
 51 | 
 52 | [BTF简介与入门实践](https://www.containiq.com/post/btf-bpf-type-format)
 53 | 
 54 | [eBPF BTF kernel 文档的中文翻译](https://www.ebpf.top/post/kernel_btf/)
 55 | 
 56 | [BTF vmlinux功能概述](https://www.ebpf.top/post/intro_vmlinux_h/)
 57 | 
 58 | [eBPF ring buffer用法](https://www.ebpf.top/post/bpf_ring_buffer/)
 59 | 
 60 | [eBPF TCP相关程序类型教程sock_op, struct_op](https://arthurchiao.art/blog/bpf-advanced-notes-5-zh/)
 61 | 
 62 | ## 问题定位&解决(stack overflow)
 63 | 
 64 | [tc 无法正确 redirect](https://stackoverflow.com/questions/63352978/why-dont-bpf-redirect-work-correctlyxdp)
 65 | 
 66 | [af_xdp](https://stackoverflow.com/questions/72329171/bpf-map-type-xskmap-element-not-set-even-though-xsk-socket-create-succeeds)
 67 | 
 68 | [af_xdp need wakeup](https://stackoverflow.com/questions/60828204/af-xdp-how-does-xdp-use-need-wakeup-work-e-g-how-to-reduce-ksoftirqd-load)
 69 | 
 70 | [Userspace程序的bpf_ktime_get_ns](https://stackoverflow.com/a/60976989)
 71 | 
 72 | 
 73 | ## Linux源码相关
 74 | 
 75 | [Linux 内核API(看/写代码的时候，跨函数查阅)](https://www.kernel.org/doc/html/v5.19/core-api/index.html)
 76 | 
 77 | [Linux 内核源码在线阅读网站(支持宏，函数，变量的搜索)](https://elixir.bootlin.com/linux/latest/source)
 78 | 
 79 | [Linux 内核揭秘(Linux insides汉化版)](https://github.com/MintCN/linux-insides-zh)
 80 | 
 81 | [Linux 性能调优和tracing工具使用思维导图(密码:linux)](https://www.processon.com/view/link/62ef5b4e0791292e9d378261#map)
 82 | 
 83 | [Linux Tracing systems & how they fit together](https://jvns.ca/blog/2017/07/05/linux-tracing-systems/)
 84 | 
 85 | [Linux 中断机制学习资料(softirq,tasklet)](https://www.cnblogs.com/arnoldlu/p/8659972.html)
 86 | 
 87 | [Linux workqueue CMWQ 介绍](https://www.jianshu.com/p/4f25d39fdff4)
 88 | 
 89 | [Linux 内核代码中的双向链表list_head介绍](https://www.linuxidc.com/Linux/2011-10/44627.htm)
 90 | 
 91 | [Linux 内核等待队列 Wait queue介绍](https://www.cnblogs.com/gctech/p/6872301.html)
 92 | 
 93 | [Linux rcu机制介绍(链表操作常见)](https://zhuanlan.zhihu.com/p/113999842)
 94 | 
 95 | [Linux kernel链表，字符串常见API](https://www.cnblogs.com/lifexy/p/10175143.html)
 96 | 
 97 | [Linux per cpu变量的原理及其实现(内含重新编译某个.o文件的方法)](https://zhuanlan.zhihu.com/p/340985476)
 98 | 
 99 | [Linux 内核内存回收机制](https://zhuanlan.zhihu.com/p/72998605)
100 | 
101 | [Linux cache机制(文章比较老但是看完能有一个大致的理解)](https://developer.aliyun.com/article/338201)
102 | 
103 | [Linux 内核之旅-内核网络相关博客](http://kerneltravel.net/categories/%E5%86%85%E6%A0%B8%E7%BD%91%E7%BB%9C/)
104 | 
105 | [Linux zero-copy network](https://lwn.net/Articles/726917/)
106 | 
107 | [Linux spin_lock和raw_spin_lock](https://blog.csdn.net/DroidPhone/article/details/7395983)
108 | 
109 | [Linux kernel RCU document](https://docs.kernel.org/RCU/whatisRCU.html#rcu-overview)
110 | 
111 | [Linux 内核变更记录总结](https://kernelnewbies.org/LinuxVersions)
112 | 
113 | ## Linux kernel documentation 
114 | 
115 | [Linux 内存屏障](https://www.kernel.org/doc/Documentation/memory-barriers.txt)
116 | 
117 | [Linux atomic_t](https://www.kernel.org/doc/Documentation/atomic_t.txt) 
118 | 
119 | ## eBPF use case 
120 | [使用EBPF来进行TCP拥塞控制](https://www.ebpf.top/post/ebpf_struct_ops/)
121 | 
122 | [MPTCP: Extending kernel functionality with eBPF and Netlink(使用eBPF的struct op来扩展MPTCP scheduler)](https://lpc.events/event/16/contributions/1354/)
123 | 
124 | ## eBPF视频
125 | [LPC2022 eBPF & Networking](https://www.youtube.com/watch?v=andvNRUAAs0&t=5237s)
126 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # eBPF-documentation
 2 | eBPF documentation 
 3 | 
 4 | ## 文档规则
 5 | 
 6 | Doc/eBPF_Resources.md 
 7 | 
 8 | 这个文档主要以链接搜集为主,  对于一些链接资料，直接把简介和链接贴上去即可。为了减少记录的时间成本，直接在github上修改文件即可。尽量不要插入图片
 9 | 
10 | Doc/Notes 存放每个人各自的笔记，带有 .assets 后缀的文件夹用来存放图片资源, 原则上不要修改其它人的笔记，这样可以减少冲突，.assets 文件夹和.md文件放在同一文件夹下
11 | 
12 | 推荐使用 typora来写笔记
13 | 


--------------------------------------------------------------------------------