├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── azure-pipelines.yaml ├── headers ├── bpf_endian.h ├── bpf_legacy.h ├── bpf_util.h ├── jhash.h ├── linux │ ├── bpf.h │ ├── err.h │ ├── if_link.h │ └── if_xdp.h └── perf-sys.h ├── scripts └── ci_test.sh └── src ├── common.h ├── keepalive_gre.c └── keepalive_gre6.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.ll 2 | *.pcap 3 | build/ 4 | 5 | # Created by https://www.gitignore.io/api/c 6 | # Edit at https://www.gitignore.io/?templates=c 7 | 8 | ### C ### 9 | # Prerequisites 10 | *.d 11 | 12 | # Object files 13 | *.o 14 | *.ko 15 | *.obj 16 | *.elf 17 | 18 | # Linker output 19 | *.ilk 20 | *.map 21 | *.exp 22 | 23 | # Precompiled Headers 24 | *.gch 25 | *.pch 26 | 27 | # Libraries 28 | *.lib 29 | *.a 30 | *.la 31 | *.lo 32 | 33 | # Shared objects (inc. Windows DLLs) 34 | *.dll 35 | *.so 36 | *.so.* 37 | *.dylib 38 | 39 | # Executables 40 | *.exe 41 | *.out 42 | *.app 43 | *.i*86 44 | *.x86_64 45 | *.hex 46 | 47 | # Debug files 48 | *.dSYM/ 49 | *.su 50 | *.idb 51 | *.pdb 52 | 53 | # Kernel Module Compile Results 54 | *.mod* 55 | *.cmd 56 | .tmp_versions/ 57 | modules.order 58 | Module.symvers 59 | Mkfile.old 60 | dkms.conf 61 | 62 | # End of https://www.gitignore.io/api/c -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libbpf"] 2 | path = libbpf 3 | url = https://github.com/libbpf/libbpf.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim 
copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 
42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. 
The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) 2 | 3 | SRC_DIR = src 4 | BUILD_DIR = build 5 | 6 | XDP_C = $(wildcard $(SRC_DIR)/*.c) 7 | XDP_OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o, $(XDP_C)) 8 | 9 | USER_LIBS := 10 | EXTRA_DEPS := 11 | 12 | LLC ?= llc 13 | CLANG ?= clang 14 | CC ?= gcc 15 | 16 | LIBBPF_DIR = libbpf/src/ 17 | OBJECT_LIBBPF = $(LIBBPF_DIR)/libbpf.a 18 | 19 | CFLAGS ?= -I$(LIBBPF_DIR)/build/usr/include/ -g 20 | CFLAGS += -I../headers/ 21 | LDFLAGS ?= -L$(LIBBPF_DIR) 22 | 23 | LIBS = -l:libbpf.a -lelf $(USER_LIBS) 24 | 25 | BPF_CFLAGS ?= -I$(LIBBPF_DIR)/build/usr/include/ -I../headers/ 26 | BPF_CFLAGS += -Wall -Wno-unused-value -Wno-pointer-sign -Wno-compare-distinct-pointer-types 27 | BPF_CFLAGS_EXTRA ?= -Werror -Wno-visibility 28 | BPF_CFLAGS_USER ?= 29 | 30 | ifeq ($(DEBUG), 1) 31 | BPF_CFLAGS_USER += -DDEBUG 32 | endif 33 | 34 | all: llvm-check $(XDP_OBJ) 35 | 36 | .PHONY: clean $(CLANG) $(LLC) 37 | 38 | clean: 39 | rm -rf $(LIBBPF_DIR)/build 40 | $(MAKE) -C $(LIBBPF_DIR) clean 41 | rm -rf $(BUILD_DIR) 42 | rm -f *~ 43 | 44 | llvm-check: $(CLANG) $(LLC) 45 | @for TOOL in $^ ; do \ 46 | if [ ! $$(command -v $${TOOL} 2>/dev/null) ]; then \ 47 | echo "*** ERROR: Cannot find tool $${TOOL}" ;\ 48 | exit 1; \ 49 | else true; fi; \ 50 | done 51 | 52 | $(BUILD_DIR): 53 | mkdir -p $(BUILD_DIR) 54 | 55 | $(OBJECT_LIBBPF): 56 | @if [ ! 
-d $(LIBBPF_DIR) ]; then \ 57 | echo "Error: Need libbpf submodule"; \ 58 | echo "May need to run git submodule update --init"; \ 59 | exit 1; \ 60 | else \ 61 | cd $(LIBBPF_DIR) && $(MAKE) all; \ 62 | mkdir -p build; DESTDIR=build $(MAKE) install_headers; \ 63 | fi 64 | 65 | $(XDP_OBJ): $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c $(BUILD_DIR) $(OBJECT_LIBBPF) Makefile $(EXTRA_DEPS) 66 | $(CLANG) -S \ 67 | -target bpf \ 68 | -D __BPF_TRACING__ \ 69 | $(BPF_CFLAGS) $(BPF_CFLAGS_EXTRA) $(BPF_CFLAGS_USER) \ 70 | -O2 -emit-llvm -c -g -o ${@:.o=.ll} $< 71 | $(LLC) -march=bpf -filetype=obj -o $@ ${@:.o=.ll} 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # linux-gre-keepalive 2 | 3 | This eBPF program adds high-performance reply-only GRE keepalive support for Linux kernel. 4 | 5 | [![Build Status](https://dev.azure.com/nekomimiswitch/General/_apis/build/status/linux-gre-keepalive?branchName=master)](https://dev.azure.com/nekomimiswitch/General/_build/latest?definitionId=78&branchName=master) 6 | 7 | Note: If you don't want to install anything and don't care about some potential security problems, just enable the following 2 options to get native GRE keepalive support on Linux: 8 | ``` 9 | sysctl net.ipv4.conf.default.accept_local=1 10 | sysctl net.ipv4.conf.all.accept_local=1 11 | ``` 12 | 13 | ## Compatiblity 14 | 15 | | Protocol | Linux name | XDP Executable | Tested Vendors | Comments | 16 | |---------- |------------ |------------------ |----------------- |------------- | 17 | | GRE | gre | keepalive_gre.o | Cisco, MikroTik | | 18 | | GRE6 | ip6gre | keepalive_gre6.o | MikroTik | | 19 | 20 | ## Usage 21 | 22 | Simply load the correct XDP executable on the tunnel interface you just created. 
For example, assume you have set up the GRE tunnel as `gre0`, to enable GRE keepalive: 23 | 24 | ```shell 25 | ip link set dev gre0 xdp object build/keepalive_gre.o 26 | ``` 27 | 28 | To disable it without removing the tunnel interface: 29 | 30 | ```shell 31 | ip link set dev gre0 xdp off 32 | ``` 33 | 34 | Loading an executable on other types of interfaces is considered an undefined behavior. 35 | 36 | ## Caveats 37 | 38 | ### GRE on Cisco IOS XE 39 | 40 | On Cisco IOS XE, you must explicitly configure an ip address or an ipv6 address to make the GRE tunnel actually send something. If you don't configure IP addresses, `debug tunnel keepalive` will still show keepalive packets being sent, but the other end won't receive anything. A valid configuration example: 41 | 42 | ``` 43 | interface Tunnel10 44 | ip address 10.0.0.1 255.255.255.0 45 | keepalive 1 2 46 | tunnel source GigabitEthernet1 47 | tunnel destination your.other.end.ip.address 48 | tunnel mode gre ip 49 | ``` 50 | 51 | ### GRE6 (ip6gre) keepalive support 52 | 53 | GRE6 keepalive is not supported by: 54 | 55 | * [Cisco IOS XE](https://www.cisco.com/c/en/us/td/docs/ios-xml/ios/interface/configuration/xe-16-6/ir-xe-16-6-book/ir-gre-ipv6-tunls-xe.html#GUID-B8369497-671A-4B51-A749-A81971011A29) 56 | * [Juniper Junos OS](https://www.juniper.net/documentation/en_US/junos/topics/concept/gre-keepalive-time-overview.html) 57 | 58 | MikroTik RouterOS implements their own GRE IPv6 keepalive with inner GRE header's proto field set to `0x86dd`. This have been implemented by us. 59 | 60 | ## Building 61 | 62 | Assume we are on a Debian 10. 
63 | 64 | ```shell 65 | sudo apt install build-essential clang llvm libelf-dev gcc-multilib linux-headers-$(dpkg --print-architecture) 66 | make all 67 | ``` 68 | 69 | ### Debugging 70 | 71 | View compiled bytecode: 72 | 73 | ```shell 74 | llvm-objdump -S build/keepalive_gre.o 75 | ``` 76 | 77 | Enabling debugging output: 78 | 79 | ```c 80 | #define DEBUG 81 | #define DEBUG_PRINT_HEADER_SIZE 32 82 | ``` 83 | 84 | Then view debug output after enabling it by: 85 | 86 | ```shell 87 | cat /sys/kernel/debug/tracing/trace_pipe 88 | ``` 89 | 90 | ## References 91 | 92 | Here's a list of awesome articles and projects I found useful: 93 | 94 | * [BPF and XDP Reference Guide](https://docs.cilium.io/en/latest/bpf/) 95 | * [xdp-project/xdp-tutorial](https://github.com/xdp-project/xdp-tutorial) 96 | * [dpino/xdp_ipv6_filter](https://github.com/dpino/xdp_ipv6_filter) 97 | * [How GRE Keepalives Work](https://www.cisco.com/c/en/us/support/docs/ip/generic-routing-encapsulation-gre/63760-gre-keepalives-63760.html) 98 | * [OISF/suricata](https://github.com/OISF/suricata) 99 | * [iovisor/bpf-docs](https://github.com/iovisor/bpf-docs) 100 | * [PaulTimmins/linux-gre-keepalive](https://github.com/PaulTimmins/linux-gre-keepalive) 101 | * [An introduction to Linux virtual interfaces: Tunnels](https://developers.redhat.com/blog/2019/05/17/an-introduction-to-linux-virtual-interfaces-tunnels/) 102 | -------------------------------------------------------------------------------- /azure-pipelines.yaml: -------------------------------------------------------------------------------- 1 | name: $(Date:yyyyMMdd).$(Rev:r) 2 | 3 | trigger: 4 | batch: true 5 | branches: 6 | include: [ "*" ] 7 | paths: 8 | exclude: [ "README.md" ] 9 | 10 | jobs: 11 | - job: build 12 | displayName: "Build" 13 | pool: 14 | vmImage: "ubuntu-latest" 15 | workspace: 16 | clean: all 17 | timeoutInMinutes: 10 18 | 19 | steps: 20 | - checkout: 'self' 21 | clean: true 22 | submodules: 'recursive' 23 | 24 | - bash: | 25 | sudo 
apt update 26 | sudo apt install build-essential clang llvm libelf-dev gcc-multilib linux-headers-$(uname -r) 27 | displayName: 'Install dependencies' 28 | 29 | - bash: | 30 | make DEBUG=1 all 31 | displayName: 'Build (debug)' 32 | 33 | - bash: | 34 | rm -r ${BUILD_ARTIFACTSTAGINGDIRECTORY}/* 35 | cp build/* ${BUILD_ARTIFACTSTAGINGDIRECTORY} 36 | displayName: 'Copy artifacts (debug)' 37 | 38 | - task: PublishBuildArtifacts@1 39 | displayName: 'Publish Artifacts (debug)' 40 | inputs: 41 | artifactName: 'debug' 42 | 43 | - bash: | 44 | sudo -E scripts/ci_test.sh 45 | displayName: 'Test (debug)' 46 | 47 | - bash: | 48 | rm -r build/* 49 | make all 50 | displayName: 'Build (production)' 51 | 52 | - bash: | 53 | rm -r ${BUILD_ARTIFACTSTAGINGDIRECTORY}/* 54 | cp build/* ${BUILD_ARTIFACTSTAGINGDIRECTORY} 55 | displayName: 'Copy artifacts (production)' 56 | 57 | - task: PublishBuildArtifacts@1 58 | displayName: 'Publish Artifacts (production)' 59 | inputs: 60 | artifactName: 'production' 61 | 62 | - bash: | 63 | sudo -E scripts/ci_test.sh 64 | displayName: 'Test (production)' -------------------------------------------------------------------------------- /headers/bpf_endian.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 */ 2 | /* Copied from $(LINUX)/tools/testing/selftests/bpf/bpf_endian.h */ 3 | #ifndef __BPF_ENDIAN__ 4 | #define __BPF_ENDIAN__ 5 | 6 | #include 7 | 8 | /* LLVM's BPF target selects the endianness of the CPU 9 | * it compiles on, or the user specifies (bpfel/bpfeb), 10 | * respectively. The used __BYTE_ORDER__ is defined by 11 | * the compiler, we cannot rely on __BYTE_ORDER from 12 | * libc headers, since it doesn't reflect the actual 13 | * requested byte order. 14 | * 15 | * Note, LLVM's BPF target has different __builtin_bswapX() 16 | * semantics. 
It does map to BPF_ALU | BPF_END | BPF_TO_BE 17 | * in bpfel and bpfeb case, which means below, that we map 18 | * to cpu_to_be16(). We could use it unconditionally in BPF 19 | * case, but better not rely on it, so that this header here 20 | * can be used from application and BPF program side, which 21 | * use different targets. 22 | */ 23 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 24 | # define __bpf_ntohs(x)__builtin_bswap16(x) 25 | # define __bpf_htons(x)__builtin_bswap16(x) 26 | # define __bpf_constant_ntohs(x)___constant_swab16(x) 27 | # define __bpf_constant_htons(x)___constant_swab16(x) 28 | # define __bpf_ntohl(x)__builtin_bswap32(x) 29 | # define __bpf_htonl(x)__builtin_bswap32(x) 30 | # define __bpf_constant_ntohl(x)___constant_swab32(x) 31 | # define __bpf_constant_htonl(x)___constant_swab32(x) 32 | #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 33 | # define __bpf_ntohs(x)(x) 34 | # define __bpf_htons(x)(x) 35 | # define __bpf_constant_ntohs(x)(x) 36 | # define __bpf_constant_htons(x)(x) 37 | # define __bpf_ntohl(x)(x) 38 | # define __bpf_htonl(x)(x) 39 | # define __bpf_constant_ntohl(x)(x) 40 | # define __bpf_constant_htonl(x)(x) 41 | #else 42 | # error "Fix your compiler's __BYTE_ORDER__?!" 
43 | #endif 44 | 45 | #define bpf_htons(x)\ 46 | (__builtin_constant_p(x) ?\ 47 | __bpf_constant_htons(x) : __bpf_htons(x)) 48 | #define bpf_ntohs(x)\ 49 | (__builtin_constant_p(x) ?\ 50 | __bpf_constant_ntohs(x) : __bpf_ntohs(x)) 51 | #define bpf_htonl(x)\ 52 | (__builtin_constant_p(x) ?\ 53 | __bpf_constant_htonl(x) : __bpf_htonl(x)) 54 | #define bpf_ntohl(x)\ 55 | (__builtin_constant_p(x) ?\ 56 | __bpf_constant_ntohl(x) : __bpf_ntohl(x)) 57 | 58 | #endif /* __BPF_ENDIAN__ */ 59 | -------------------------------------------------------------------------------- /headers/bpf_legacy.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BPF_LEGACY__ 3 | #define __BPF_LEGACY__ 4 | 5 | /* 6 | * legacy bpf_map_def with extra fields supported only by bpf_load(), do not 7 | * use outside of samples/bpf 8 | */ 9 | struct bpf_map_def_legacy { 10 | unsigned int type; 11 | unsigned int key_size; 12 | unsigned int value_size; 13 | unsigned int max_entries; 14 | unsigned int map_flags; 15 | unsigned int inner_map_idx; 16 | unsigned int numa_node; 17 | }; 18 | 19 | #define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ 20 | struct ____btf_map_##name { \ 21 | type_key key; \ 22 | type_val value; \ 23 | }; \ 24 | struct ____btf_map_##name \ 25 | __attribute__ ((section(".maps." 
/* llvm builtin functions that eBPF C program may use to
 * emit BPF_LD_ABS and BPF_LD_IND instructions.  The "llvm.bpf.load.*" asm
 * names are resolved by LLVM itself; these can never be linked natively.
 * (__asm__ spelling so the declarations also parse under strict -std=c11.)
 */
unsigned long long load_byte(void *skb,
			     unsigned long long off) __asm__("llvm.bpf.load.byte");
unsigned long long load_half(void *skb,
			     unsigned long long off) __asm__("llvm.bpf.load.half");
unsigned long long load_word(void *skb,
			     unsigned long long off) __asm__("llvm.bpf.load.word");

/* --- headers/bpf_util.h --- */
/* SPDX-License-Identifier: GPL-2.0 */
/* Copied from $(LINUX)/tools/testing/selftests/bpf/bpf_util.h */
#ifndef __BPF_UTIL__
#define __BPF_UTIL__

/* The dump had stripped the four include targets; restored per upstream. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Parse the number of possible CPUs from /sys/devices/system/cpu/possible.
 * Only the first line is examined and only a leading "0-N" (or bare "0")
 * range is accepted; anything else yields 0.  Exits the process on I/O or
 * parse failure (test-utility semantics, kept from the selftests original).
 */
static inline unsigned int bpf_num_possible_cpus(void)
{
	static const char *fcpu = "/sys/devices/system/cpu/possible";
	unsigned int start, end, possible_cpus = 0;
	char buff[128];
	FILE *fp;
	int n;

	fp = fopen(fcpu, "r");
	if (!fp) {
		printf("Failed to open %s: '%s'!\n", fcpu, strerror(errno));
		exit(1);
	}

	while (fgets(buff, sizeof(buff), fp)) {
		n = sscanf(buff, "%u-%u", &start, &end);
		if (n == 0) {
			printf("Failed to retrieve # possible CPUs!\n");
			exit(1);
		} else if (n == 1) {
			/* single CPU listed, e.g. "0" */
			end = start;
		}
		/* ranges not starting at CPU 0 are not supported */
		possible_cpus = start == 0 ? end + 1 : 0;
		break;
	}
	fclose(fp);

	return possible_cpus;
}

#define __bpf_percpu_val_align	__attribute__((__aligned__(8)))

/* Declare a per-CPU value array, one 8-byte-aligned slot per possible CPU. */
#define BPF_DECLARE_PERCPU(type, name)				\
	struct { type v; /* padding */ } __bpf_percpu_val_align	\
	name[bpf_num_possible_cpus()]
#define bpf_percpu(name, cpu) name[(cpu)].v

#ifndef ARRAY_SIZE
# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif

#ifndef sizeof_field
#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
#endif

#ifndef offsetofend
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
#endif

#endif /* __BPF_UTIL__ */

/* --- headers/jhash.h (Jenkins hash, kernel 4.18 copy) begins next --- */
/* --- headers/jhash.h: Jenkins hash support ---
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 * http://burtleburtle.net/bob/hash/
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.  Functions for
 * producing 32-bit hashes for hash table lookup; free for any purpose,
 * no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
 */

/* Local 32-bit type, declared up front so rol32() can use it: the original
 * copy used __u32 here although nothing in this header declares it. */
typedef unsigned int u32;

/* Rotate @word left by @shift bits; (-shift) & 31 also handles shift == 0
 * without invoking a 32-bit shift (which would be undefined behavior). */
static inline u32 rol32(u32 word, unsigned int shift)
{
	return (word << shift) | (word >> ((-shift) & 31));
}

/* copy paste of jhash from kernel sources (include/linux/jhash.h) to make
 * sure LLVM can compile it into valid sequence of BPF instructions */
#define __jhash_mix(a, b, c)			\
{						\
	a -= c;  a ^= rol32(c, 4);  c += b;	\
	b -= a;  b ^= rol32(a, 6);  a += c;	\
	c -= b;  c ^= rol32(b, 8);  b += a;	\
	a -= c;  a ^= rol32(c, 16); c += b;	\
	b -= a;  b ^= rol32(a, 19); a += c;	\
	c -= b;  c ^= rol32(b, 4);  b += a;	\
}

#define __jhash_final(a, b, c)			\
{						\
	c ^= b; c -= rol32(b, 14);		\
	a ^= c; a -= rol32(c, 11);		\
	b ^= a; b -= rol32(a, 25);		\
	c ^= b; c -= rol32(b, 16);		\
	a ^= c; a -= rol32(c, 4);		\
	b ^= a; b -= rol32(a, 14);		\
	c ^= b; c -= rol32(b, 24);		\
}

/* An arbitrary initial parameter */
#define JHASH_INITVAL		0xdeadbeef

/* jhash - hash an arbitrary key
 * @key: sequence of bytes as key
 * @length: the length of the key
 * @initval: the previous hash, or an arbitrary value
 *
 * The generic version, hashes an arbitrary sequence of bytes.
 * No alignment or length assumptions are made about the input key.
 *
 * Returns the hash value of the key. The result depends on endianness
 * (the bulk loop below does raw unaligned u32 loads, as in the kernel copy).
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
	u32 a, b, c;
	const unsigned char *k = key;

	/* Set up the internal state */
	a = b = c = JHASH_INITVAL + length + initval;

	/* All but the last block: affect some 32 bits of (a,b,c) */
	while (length > 12) {
		a += *(u32 *)(k);
		b += *(u32 *)(k + 4);
		c += *(u32 *)(k + 8);
		__jhash_mix(a, b, c);
		length -= 12;
		k += 12;
	}
	/* Last block: affect all 32 bits of (c) */
	switch (length) {
	case 12: c += (u32)k[11]<<24;	/* fall through */
	case 11: c += (u32)k[10]<<16;	/* fall through */
	case 10: c += (u32)k[9]<<8;	/* fall through */
	case 9:  c += k[8];		/* fall through */
	case 8:  b += (u32)k[7]<<24;	/* fall through */
	case 7:  b += (u32)k[6]<<16;	/* fall through */
	case 6:  b += (u32)k[5]<<8;	/* fall through */
	case 5:  b += k[4];		/* fall through */
	case 4:  a += (u32)k[3]<<24;	/* fall through */
	case 3:  a += (u32)k[2]<<16;	/* fall through */
	case 2:  a += (u32)k[1]<<8;	/* fall through */
	case 1:  a += k[0];
		 __jhash_final(a, b, c);
	case 0: /* Nothing left to add */
		break;
	}

	return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * Returns the hash value of the key.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
	u32 a, b, c;

	/* Set up the internal state */
	a = b = c = JHASH_INITVAL + (length<<2) + initval;

	/* Handle most of the key */
	while (length > 3) {
		a += k[0];
		b += k[1];
		c += k[2];
		__jhash_mix(a, b, c);
		length -= 3;
		k += 3;
	}

	/* Handle the last 3 u32's */
	switch (length) {
	case 3: c += k[2];	/* fall through */
	case 2: b += k[1];	/* fall through */
	case 1: a += k[0];
		__jhash_final(a, b, c);
	case 0:	/* Nothing left to add */
		break;
	}

	return c;
}

/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
	a += initval;
	b += initval;
	c += initval;

	__jhash_final(a, b, c);

	return c;
}

static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
	return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2));
}

static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
	return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
}

static inline u32 jhash_1word(u32 a, u32 initval)
{
	return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2));
}

/* --- end of headers/jhash.h; headers/linux/bpf.h (kernel UAPI) follows --- */
Software Foundation. 7 | */ 8 | #ifndef _UAPI__LINUX_BPF_H__ 9 | #define _UAPI__LINUX_BPF_H__ 10 | 11 | #include 12 | #include 13 | 14 | /* Extended instruction set based on top of classic BPF */ 15 | 16 | /* instruction classes */ 17 | #define BPF_JMP32 0x06 /* jmp mode in word width */ 18 | #define BPF_ALU64 0x07 /* alu mode in double word width */ 19 | 20 | /* ld/ldx fields */ 21 | #define BPF_DW 0x18 /* double word (64-bit) */ 22 | #define BPF_XADD 0xc0 /* exclusive add */ 23 | 24 | /* alu/jmp fields */ 25 | #define BPF_MOV 0xb0 /* mov reg to reg */ 26 | #define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ 27 | 28 | /* change endianness of a register */ 29 | #define BPF_END 0xd0 /* flags for endianness conversion: */ 30 | #define BPF_TO_LE 0x00 /* convert to little-endian */ 31 | #define BPF_TO_BE 0x08 /* convert to big-endian */ 32 | #define BPF_FROM_LE BPF_TO_LE 33 | #define BPF_FROM_BE BPF_TO_BE 34 | 35 | /* jmp encodings */ 36 | #define BPF_JNE 0x50 /* jump != */ 37 | #define BPF_JLT 0xa0 /* LT is unsigned, '<' */ 38 | #define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ 39 | #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ 40 | #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ 41 | #define BPF_JSLT 0xc0 /* SLT is signed, '<' */ 42 | #define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ 43 | #define BPF_CALL 0x80 /* function call */ 44 | #define BPF_EXIT 0x90 /* function return */ 45 | 46 | /* Register numbers */ 47 | enum { 48 | BPF_REG_0 = 0, 49 | BPF_REG_1, 50 | BPF_REG_2, 51 | BPF_REG_3, 52 | BPF_REG_4, 53 | BPF_REG_5, 54 | BPF_REG_6, 55 | BPF_REG_7, 56 | BPF_REG_8, 57 | BPF_REG_9, 58 | BPF_REG_10, 59 | __MAX_BPF_REG, 60 | }; 61 | 62 | /* BPF has 10 general purpose 64-bit registers and stack frame. 
*/ 63 | #define MAX_BPF_REG __MAX_BPF_REG 64 | 65 | struct bpf_insn { 66 | __u8 code; /* opcode */ 67 | __u8 dst_reg:4; /* dest register */ 68 | __u8 src_reg:4; /* source register */ 69 | __s16 off; /* signed offset */ 70 | __s32 imm; /* signed immediate constant */ 71 | }; 72 | 73 | /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ 74 | struct bpf_lpm_trie_key { 75 | __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ 76 | __u8 data[0]; /* Arbitrary size */ 77 | }; 78 | 79 | struct bpf_cgroup_storage_key { 80 | __u64 cgroup_inode_id; /* cgroup inode id */ 81 | __u32 attach_type; /* program attach type */ 82 | }; 83 | 84 | /* BPF syscall commands, see bpf(2) man-page for details. */ 85 | enum bpf_cmd { 86 | BPF_MAP_CREATE, 87 | BPF_MAP_LOOKUP_ELEM, 88 | BPF_MAP_UPDATE_ELEM, 89 | BPF_MAP_DELETE_ELEM, 90 | BPF_MAP_GET_NEXT_KEY, 91 | BPF_PROG_LOAD, 92 | BPF_OBJ_PIN, 93 | BPF_OBJ_GET, 94 | BPF_PROG_ATTACH, 95 | BPF_PROG_DETACH, 96 | BPF_PROG_TEST_RUN, 97 | BPF_PROG_GET_NEXT_ID, 98 | BPF_MAP_GET_NEXT_ID, 99 | BPF_PROG_GET_FD_BY_ID, 100 | BPF_MAP_GET_FD_BY_ID, 101 | BPF_OBJ_GET_INFO_BY_FD, 102 | BPF_PROG_QUERY, 103 | BPF_RAW_TRACEPOINT_OPEN, 104 | BPF_BTF_LOAD, 105 | BPF_BTF_GET_FD_BY_ID, 106 | BPF_TASK_FD_QUERY, 107 | BPF_MAP_LOOKUP_AND_DELETE_ELEM, 108 | }; 109 | 110 | enum bpf_map_type { 111 | BPF_MAP_TYPE_UNSPEC, 112 | BPF_MAP_TYPE_HASH, 113 | BPF_MAP_TYPE_ARRAY, 114 | BPF_MAP_TYPE_PROG_ARRAY, 115 | BPF_MAP_TYPE_PERF_EVENT_ARRAY, 116 | BPF_MAP_TYPE_PERCPU_HASH, 117 | BPF_MAP_TYPE_PERCPU_ARRAY, 118 | BPF_MAP_TYPE_STACK_TRACE, 119 | BPF_MAP_TYPE_CGROUP_ARRAY, 120 | BPF_MAP_TYPE_LRU_HASH, 121 | BPF_MAP_TYPE_LRU_PERCPU_HASH, 122 | BPF_MAP_TYPE_LPM_TRIE, 123 | BPF_MAP_TYPE_ARRAY_OF_MAPS, 124 | BPF_MAP_TYPE_HASH_OF_MAPS, 125 | BPF_MAP_TYPE_DEVMAP, 126 | BPF_MAP_TYPE_SOCKMAP, 127 | BPF_MAP_TYPE_CPUMAP, 128 | BPF_MAP_TYPE_XSKMAP, 129 | BPF_MAP_TYPE_SOCKHASH, 130 | BPF_MAP_TYPE_CGROUP_STORAGE, 131 | BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 132 | 
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, 133 | BPF_MAP_TYPE_QUEUE, 134 | BPF_MAP_TYPE_STACK, 135 | }; 136 | 137 | /* Note that tracing related programs such as 138 | * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} 139 | * are not subject to a stable API since kernel internal data 140 | * structures can change from release to release and may 141 | * therefore break existing tracing BPF programs. Tracing BPF 142 | * programs correspond to /a/ specific kernel which is to be 143 | * analyzed, and not /a/ specific kernel /and/ all future ones. 144 | */ 145 | enum bpf_prog_type { 146 | BPF_PROG_TYPE_UNSPEC, 147 | BPF_PROG_TYPE_SOCKET_FILTER, 148 | BPF_PROG_TYPE_KPROBE, 149 | BPF_PROG_TYPE_SCHED_CLS, 150 | BPF_PROG_TYPE_SCHED_ACT, 151 | BPF_PROG_TYPE_TRACEPOINT, 152 | BPF_PROG_TYPE_XDP, 153 | BPF_PROG_TYPE_PERF_EVENT, 154 | BPF_PROG_TYPE_CGROUP_SKB, 155 | BPF_PROG_TYPE_CGROUP_SOCK, 156 | BPF_PROG_TYPE_LWT_IN, 157 | BPF_PROG_TYPE_LWT_OUT, 158 | BPF_PROG_TYPE_LWT_XMIT, 159 | BPF_PROG_TYPE_SOCK_OPS, 160 | BPF_PROG_TYPE_SK_SKB, 161 | BPF_PROG_TYPE_CGROUP_DEVICE, 162 | BPF_PROG_TYPE_SK_MSG, 163 | BPF_PROG_TYPE_RAW_TRACEPOINT, 164 | BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 165 | BPF_PROG_TYPE_LWT_SEG6LOCAL, 166 | BPF_PROG_TYPE_LIRC_MODE2, 167 | BPF_PROG_TYPE_SK_REUSEPORT, 168 | BPF_PROG_TYPE_FLOW_DISSECTOR, 169 | }; 170 | 171 | enum bpf_attach_type { 172 | BPF_CGROUP_INET_INGRESS, 173 | BPF_CGROUP_INET_EGRESS, 174 | BPF_CGROUP_INET_SOCK_CREATE, 175 | BPF_CGROUP_SOCK_OPS, 176 | BPF_SK_SKB_STREAM_PARSER, 177 | BPF_SK_SKB_STREAM_VERDICT, 178 | BPF_CGROUP_DEVICE, 179 | BPF_SK_MSG_VERDICT, 180 | BPF_CGROUP_INET4_BIND, 181 | BPF_CGROUP_INET6_BIND, 182 | BPF_CGROUP_INET4_CONNECT, 183 | BPF_CGROUP_INET6_CONNECT, 184 | BPF_CGROUP_INET4_POST_BIND, 185 | BPF_CGROUP_INET6_POST_BIND, 186 | BPF_CGROUP_UDP4_SENDMSG, 187 | BPF_CGROUP_UDP6_SENDMSG, 188 | BPF_LIRC_MODE2, 189 | BPF_FLOW_DISSECTOR, 190 | __MAX_BPF_ATTACH_TYPE 191 | }; 192 | 193 | #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE 
194 | 195 | /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command 196 | * 197 | * NONE(default): No further bpf programs allowed in the subtree. 198 | * 199 | * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, 200 | * the program in this cgroup yields to sub-cgroup program. 201 | * 202 | * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, 203 | * that cgroup program gets run in addition to the program in this cgroup. 204 | * 205 | * Only one program is allowed to be attached to a cgroup with 206 | * NONE or BPF_F_ALLOW_OVERRIDE flag. 207 | * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will 208 | * release old program and attach the new one. Attach flags has to match. 209 | * 210 | * Multiple programs are allowed to be attached to a cgroup with 211 | * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order 212 | * (those that were attached first, run first) 213 | * The programs of sub-cgroup are executed first, then programs of 214 | * this cgroup and then programs of parent cgroup. 215 | * When children program makes decision (like picking TCP CA or sock bind) 216 | * parent program has a chance to override it. 217 | * 218 | * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. 219 | * A cgroup with NONE doesn't allow any programs in sub-cgroups. 220 | * Ex1: 221 | * cgrp1 (MULTI progs A, B) -> 222 | * cgrp2 (OVERRIDE prog C) -> 223 | * cgrp3 (MULTI prog D) -> 224 | * cgrp4 (OVERRIDE prog E) -> 225 | * cgrp5 (NONE prog F) 226 | * the event in cgrp5 triggers execution of F,D,A,B in that order. 227 | * if prog F is detached, the execution is E,D,A,B 228 | * if prog F and D are detached, the execution is E,A,B 229 | * if prog F, E and D are detached, the execution is C,A,B 230 | * 231 | * All eligible programs are executed regardless of return code from 232 | * earlier programs. 
233 | */ 234 | #define BPF_F_ALLOW_OVERRIDE (1U << 0) 235 | #define BPF_F_ALLOW_MULTI (1U << 1) 236 | 237 | /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the 238 | * verifier will perform strict alignment checking as if the kernel 239 | * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, 240 | * and NET_IP_ALIGN defined to 2. 241 | */ 242 | #define BPF_F_STRICT_ALIGNMENT (1U << 0) 243 | 244 | /* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the 245 | * verifier will allow any alignment whatsoever. On platforms 246 | * with strict alignment requirements for loads ands stores (such 247 | * as sparc and mips) the verifier validates that all loads and 248 | * stores provably follow this requirement. This flag turns that 249 | * checking and enforcement off. 250 | * 251 | * It is mostly used for testing when we want to validate the 252 | * context and memory access aspects of the verifier, but because 253 | * of an unaligned access the alignment check would trigger before 254 | * the one we are interested in. 255 | */ 256 | #define BPF_F_ANY_ALIGNMENT (1U << 1) 257 | 258 | /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ 259 | #define BPF_PSEUDO_MAP_FD 1 260 | 261 | /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative 262 | * offset to another bpf function 263 | */ 264 | #define BPF_PSEUDO_CALL 1 265 | 266 | /* flags for BPF_MAP_UPDATE_ELEM command */ 267 | #define BPF_ANY 0 /* create new element or update existing */ 268 | #define BPF_NOEXIST 1 /* create new element if it didn't exist */ 269 | #define BPF_EXIST 2 /* update existing element */ 270 | #define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ 271 | 272 | /* flags for BPF_MAP_CREATE command */ 273 | #define BPF_F_NO_PREALLOC (1U << 0) 274 | /* Instead of having one common LRU list in the 275 | * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list 276 | * which can scale and perform better. 
277 | * Note, the LRU nodes (including free nodes) cannot be moved 278 | * across different LRU lists. 279 | */ 280 | #define BPF_F_NO_COMMON_LRU (1U << 1) 281 | /* Specify numa node during map creation */ 282 | #define BPF_F_NUMA_NODE (1U << 2) 283 | 284 | #define BPF_OBJ_NAME_LEN 16U 285 | 286 | /* Flags for accessing BPF object */ 287 | #define BPF_F_RDONLY (1U << 3) 288 | #define BPF_F_WRONLY (1U << 4) 289 | 290 | /* Flag for stack_map, store build_id+offset instead of pointer */ 291 | #define BPF_F_STACK_BUILD_ID (1U << 5) 292 | 293 | /* Zero-initialize hash function seed. This should only be used for testing. */ 294 | #define BPF_F_ZERO_SEED (1U << 6) 295 | 296 | /* flags for BPF_PROG_QUERY */ 297 | #define BPF_F_QUERY_EFFECTIVE (1U << 0) 298 | 299 | enum bpf_stack_build_id_status { 300 | /* user space need an empty entry to identify end of a trace */ 301 | BPF_STACK_BUILD_ID_EMPTY = 0, 302 | /* with valid build_id and offset */ 303 | BPF_STACK_BUILD_ID_VALID = 1, 304 | /* couldn't get build_id, fallback to ip */ 305 | BPF_STACK_BUILD_ID_IP = 2, 306 | }; 307 | 308 | #define BPF_BUILD_ID_SIZE 20 309 | struct bpf_stack_build_id { 310 | __s32 status; 311 | unsigned char build_id[BPF_BUILD_ID_SIZE]; 312 | union { 313 | __u64 offset; 314 | __u64 ip; 315 | }; 316 | }; 317 | 318 | union bpf_attr { 319 | struct { /* anonymous struct used by BPF_MAP_CREATE command */ 320 | __u32 map_type; /* one of enum bpf_map_type */ 321 | __u32 key_size; /* size of key in bytes */ 322 | __u32 value_size; /* size of value in bytes */ 323 | __u32 max_entries; /* max number of entries in a map */ 324 | __u32 map_flags; /* BPF_MAP_CREATE related 325 | * flags defined above. 326 | */ 327 | __u32 inner_map_fd; /* fd pointing to the inner map */ 328 | __u32 numa_node; /* numa node (effective only if 329 | * BPF_F_NUMA_NODE is set). 
330 | */ 331 | char map_name[BPF_OBJ_NAME_LEN]; 332 | __u32 map_ifindex; /* ifindex of netdev to create on */ 333 | __u32 btf_fd; /* fd pointing to a BTF type data */ 334 | __u32 btf_key_type_id; /* BTF type_id of the key */ 335 | __u32 btf_value_type_id; /* BTF type_id of the value */ 336 | }; 337 | 338 | struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ 339 | __u32 map_fd; 340 | __aligned_u64 key; 341 | union { 342 | __aligned_u64 value; 343 | __aligned_u64 next_key; 344 | }; 345 | __u64 flags; 346 | }; 347 | 348 | struct { /* anonymous struct used by BPF_PROG_LOAD command */ 349 | __u32 prog_type; /* one of enum bpf_prog_type */ 350 | __u32 insn_cnt; 351 | __aligned_u64 insns; 352 | __aligned_u64 license; 353 | __u32 log_level; /* verbosity level of verifier */ 354 | __u32 log_size; /* size of user buffer */ 355 | __aligned_u64 log_buf; /* user supplied buffer */ 356 | __u32 kern_version; /* not used */ 357 | __u32 prog_flags; 358 | char prog_name[BPF_OBJ_NAME_LEN]; 359 | __u32 prog_ifindex; /* ifindex of netdev to prep for */ 360 | /* For some prog types expected attach type must be known at 361 | * load time to verify attach type specific parts of prog 362 | * (context accesses, allowed helpers, etc). 
363 | */ 364 | __u32 expected_attach_type; 365 | __u32 prog_btf_fd; /* fd pointing to BTF type data */ 366 | __u32 func_info_rec_size; /* userspace bpf_func_info size */ 367 | __aligned_u64 func_info; /* func info */ 368 | __u32 func_info_cnt; /* number of bpf_func_info records */ 369 | __u32 line_info_rec_size; /* userspace bpf_line_info size */ 370 | __aligned_u64 line_info; /* line info */ 371 | __u32 line_info_cnt; /* number of bpf_line_info records */ 372 | }; 373 | 374 | struct { /* anonymous struct used by BPF_OBJ_* commands */ 375 | __aligned_u64 pathname; 376 | __u32 bpf_fd; 377 | __u32 file_flags; 378 | }; 379 | 380 | struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ 381 | __u32 target_fd; /* container object to attach to */ 382 | __u32 attach_bpf_fd; /* eBPF program to attach */ 383 | __u32 attach_type; 384 | __u32 attach_flags; 385 | }; 386 | 387 | struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ 388 | __u32 prog_fd; 389 | __u32 retval; 390 | __u32 data_size_in; /* input: len of data_in */ 391 | __u32 data_size_out; /* input/output: len of data_out 392 | * returns ENOSPC if data_out 393 | * is too small. 
394 | */ 395 | __aligned_u64 data_in; 396 | __aligned_u64 data_out; 397 | __u32 repeat; 398 | __u32 duration; 399 | } test; 400 | 401 | struct { /* anonymous struct used by BPF_*_GET_*_ID */ 402 | union { 403 | __u32 start_id; 404 | __u32 prog_id; 405 | __u32 map_id; 406 | __u32 btf_id; 407 | }; 408 | __u32 next_id; 409 | __u32 open_flags; 410 | }; 411 | 412 | struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ 413 | __u32 bpf_fd; 414 | __u32 info_len; 415 | __aligned_u64 info; 416 | } info; 417 | 418 | struct { /* anonymous struct used by BPF_PROG_QUERY command */ 419 | __u32 target_fd; /* container object to query */ 420 | __u32 attach_type; 421 | __u32 query_flags; 422 | __u32 attach_flags; 423 | __aligned_u64 prog_ids; 424 | __u32 prog_cnt; 425 | } query; 426 | 427 | struct { 428 | __u64 name; 429 | __u32 prog_fd; 430 | } raw_tracepoint; 431 | 432 | struct { /* anonymous struct for BPF_BTF_LOAD */ 433 | __aligned_u64 btf; 434 | __aligned_u64 btf_log_buf; 435 | __u32 btf_size; 436 | __u32 btf_log_size; 437 | __u32 btf_log_level; 438 | }; 439 | 440 | struct { 441 | __u32 pid; /* input: pid */ 442 | __u32 fd; /* input: fd */ 443 | __u32 flags; /* input: flags */ 444 | __u32 buf_len; /* input/output: buf len */ 445 | __aligned_u64 buf; /* input/output: 446 | * tp_name for tracepoint 447 | * symbol for kprobe 448 | * filename for uprobe 449 | */ 450 | __u32 prog_id; /* output: prod_id */ 451 | __u32 fd_type; /* output: BPF_FD_TYPE_* */ 452 | __u64 probe_offset; /* output: probe_offset */ 453 | __u64 probe_addr; /* output: probe_addr */ 454 | } task_fd_query; 455 | } __attribute__((aligned(8))); 456 | 457 | /* The description below is an attempt at providing documentation to eBPF 458 | * developers about the multiple available eBPF helper functions. It can be 459 | * parsed and used to produce a manual page. 
The workflow is the following, 460 | * and requires the rst2man utility: 461 | * 462 | * $ ./scripts/bpf_helpers_doc.py \ 463 | * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst 464 | * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 465 | * $ man /tmp/bpf-helpers.7 466 | * 467 | * Note that in order to produce this external documentation, some RST 468 | * formatting is used in the descriptions to get "bold" and "italics" in 469 | * manual pages. Also note that the few trailing white spaces are 470 | * intentional, removing them would break paragraphs for rst2man. 471 | * 472 | * Start of BPF helper function descriptions: 473 | * 474 | * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) 475 | * Description 476 | * Perform a lookup in *map* for an entry associated to *key*. 477 | * Return 478 | * Map value associated to *key*, or **NULL** if no entry was 479 | * found. 480 | * 481 | * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) 482 | * Description 483 | * Add or update the value of the entry associated to *key* in 484 | * *map* with *value*. *flags* is one of: 485 | * 486 | * **BPF_NOEXIST** 487 | * The entry for *key* must not exist in the map. 488 | * **BPF_EXIST** 489 | * The entry for *key* must already exist in the map. 490 | * **BPF_ANY** 491 | * No condition on the existence of the entry for *key*. 492 | * 493 | * Flag value **BPF_NOEXIST** cannot be used for maps of types 494 | * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all 495 | * elements always exist), the helper would return an error. 496 | * Return 497 | * 0 on success, or a negative error in case of failure. 498 | * 499 | * int bpf_map_delete_elem(struct bpf_map *map, const void *key) 500 | * Description 501 | * Delete entry with *key* from *map*. 502 | * Return 503 | * 0 on success, or a negative error in case of failure. 
504 | * 505 | * int bpf_probe_read(void *dst, u32 size, const void *src) 506 | * Description 507 | * For tracing programs, safely attempt to read *size* bytes from 508 | * address *src* and store the data in *dst*. 509 | * Return 510 | * 0 on success, or a negative error in case of failure. 511 | * 512 | * u64 bpf_ktime_get_ns(void) 513 | * Description 514 | * Return the time elapsed since system boot, in nanoseconds. 515 | * Return 516 | * Current *ktime*. 517 | * 518 | * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) 519 | * Description 520 | * This helper is a "printk()-like" facility for debugging. It 521 | * prints a message defined by format *fmt* (of size *fmt_size*) 522 | * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if 523 | * available. It can take up to three additional **u64** 524 | * arguments (as an eBPF helpers, the total number of arguments is 525 | * limited to five). 526 | * 527 | * Each time the helper is called, it appends a line to the trace. 528 | * The format of the trace is customizable, and the exact output 529 | * one will get depends on the options set in 530 | * *\/sys/kernel/debug/tracing/trace_options* (see also the 531 | * *README* file under the same directory). However, it usually 532 | * defaults to something like: 533 | * 534 | * :: 535 | * 536 | * telnet-470 [001] .N.. 419421.045894: 0x00000001: 537 | * 538 | * In the above: 539 | * 540 | * * ``telnet`` is the name of the current task. 541 | * * ``470`` is the PID of the current task. 542 | * * ``001`` is the CPU number on which the task is 543 | * running. 544 | * * In ``.N..``, each character refers to a set of 545 | * options (whether irqs are enabled, scheduling 546 | * options, whether hard/softirqs are running, level of 547 | * preempt_disabled respectively). **N** means that 548 | * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** 549 | * are set. 550 | * * ``419421.045894`` is a timestamp. 
551 | * * ``0x00000001`` is a fake value used by BPF for the 552 | * instruction pointer register. 553 | * * ```` is the message formatted with 554 | * *fmt*. 555 | * 556 | * The conversion specifiers supported by *fmt* are similar, but 557 | * more limited than for printk(). They are **%d**, **%i**, 558 | * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, 559 | * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size 560 | * of field, padding with zeroes, etc.) is available, and the 561 | * helper will return **-EINVAL** (but print nothing) if it 562 | * encounters an unknown specifier. 563 | * 564 | * Also, note that **bpf_trace_printk**\ () is slow, and should 565 | * only be used for debugging purposes. For this reason, a notice 566 | * bloc (spanning several lines) is printed to kernel logs and 567 | * states that the helper should not be used "for production use" 568 | * the first time this helper is used (or more precisely, when 569 | * **trace_printk**\ () buffers are allocated). For passing values 570 | * to user space, perf events should be preferred. 571 | * Return 572 | * The number of bytes written to the buffer, or a negative error 573 | * in case of failure. 574 | * 575 | * u32 bpf_get_prandom_u32(void) 576 | * Description 577 | * Get a pseudo-random number. 578 | * 579 | * From a security point of view, this helper uses its own 580 | * pseudo-random internal state, and cannot be used to infer the 581 | * seed of other random functions in the kernel. However, it is 582 | * essential to note that the generator used by the helper is not 583 | * cryptographically secure. 584 | * Return 585 | * A random 32-bit unsigned value. 586 | * 587 | * u32 bpf_get_smp_processor_id(void) 588 | * Description 589 | * Get the SMP (symmetric multiprocessing) processor id. Note that 590 | * all programs run with preemption disabled, which means that the 591 | * SMP processor id is stable during all the execution of the 592 | * program. 
593 | * Return 594 | * The SMP id of the processor running the program. 595 | * 596 | * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) 597 | * Description 598 | * Store *len* bytes from address *from* into the packet 599 | * associated to *skb*, at *offset*. *flags* are a combination of 600 | * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the 601 | * checksum for the packet after storing the bytes) and 602 | * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ 603 | * **->swhash** and *skb*\ **->l4hash** to 0). 604 | * 605 | * A call to this helper is susceptible to change the underlaying 606 | * packet buffer. Therefore, at load time, all checks on pointers 607 | * previously done by the verifier are invalidated and must be 608 | * performed again, if the helper is used in combination with 609 | * direct packet access. 610 | * Return 611 | * 0 on success, or a negative error in case of failure. 612 | * 613 | * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) 614 | * Description 615 | * Recompute the layer 3 (e.g. IP) checksum for the packet 616 | * associated to *skb*. Computation is incremental, so the helper 617 | * must know the former value of the header field that was 618 | * modified (*from*), the new value of this field (*to*), and the 619 | * number of bytes (2 or 4) for this field, stored in *size*. 620 | * Alternatively, it is possible to store the difference between 621 | * the previous and the new values of the header field in *to*, by 622 | * setting *from* and *size* to 0. For both methods, *offset* 623 | * indicates the location of the IP checksum within the packet. 624 | * 625 | * This helper works in combination with **bpf_csum_diff**\ (), 626 | * which does not update the checksum in-place, but offers more 627 | * flexibility and can handle sizes larger than 2 or 4 for the 628 | * checksum to update. 
629 | * 630 | * A call to this helper is susceptible to change the underlaying 631 | * packet buffer. Therefore, at load time, all checks on pointers 632 | * previously done by the verifier are invalidated and must be 633 | * performed again, if the helper is used in combination with 634 | * direct packet access. 635 | * Return 636 | * 0 on success, or a negative error in case of failure. 637 | * 638 | * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) 639 | * Description 640 | * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the 641 | * packet associated to *skb*. Computation is incremental, so the 642 | * helper must know the former value of the header field that was 643 | * modified (*from*), the new value of this field (*to*), and the 644 | * number of bytes (2 or 4) for this field, stored on the lowest 645 | * four bits of *flags*. Alternatively, it is possible to store 646 | * the difference between the previous and the new values of the 647 | * header field in *to*, by setting *from* and the four lowest 648 | * bits of *flags* to 0. For both methods, *offset* indicates the 649 | * location of the IP checksum within the packet. In addition to 650 | * the size of the field, *flags* can be added (bitwise OR) actual 651 | * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left 652 | * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and 653 | * for updates resulting in a null checksum the value is set to 654 | * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates 655 | * the checksum is to be computed against a pseudo-header. 656 | * 657 | * This helper works in combination with **bpf_csum_diff**\ (), 658 | * which does not update the checksum in-place, but offers more 659 | * flexibility and can handle sizes larger than 2 or 4 for the 660 | * checksum to update. 661 | * 662 | * A call to this helper is susceptible to change the underlaying 663 | * packet buffer. 
Therefore, at load time, all checks on pointers 664 | * previously done by the verifier are invalidated and must be 665 | * performed again, if the helper is used in combination with 666 | * direct packet access. 667 | * Return 668 | * 0 on success, or a negative error in case of failure. 669 | * 670 | * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) 671 | * Description 672 | * This special helper is used to trigger a "tail call", or in 673 | * other words, to jump into another eBPF program. The same stack 674 | * frame is used (but values on stack and in registers for the 675 | * caller are not accessible to the callee). This mechanism allows 676 | * for program chaining, either for raising the maximum number of 677 | * available eBPF instructions, or to execute given programs in 678 | * conditional blocks. For security reasons, there is an upper 679 | * limit to the number of successive tail calls that can be 680 | * performed. 681 | * 682 | * Upon call of this helper, the program attempts to jump into a 683 | * program referenced at index *index* in *prog_array_map*, a 684 | * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes 685 | * *ctx*, a pointer to the context. 686 | * 687 | * If the call succeeds, the kernel immediately runs the first 688 | * instruction of the new program. This is not a function call, 689 | * and it never returns to the previous program. If the call 690 | * fails, then the helper has no effect, and the caller continues 691 | * to run its subsequent instructions. A call can fail if the 692 | * destination program for the jump does not exist (i.e. *index* 693 | * is superior to the number of entries in *prog_array_map*), or 694 | * if the maximum number of tail calls has been reached for this 695 | * chain of programs. This limit is defined in the kernel by the 696 | * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), 697 | * which is currently set to 32. 
698 | * Return 699 | * 0 on success, or a negative error in case of failure. 700 | * 701 | * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) 702 | * Description 703 | * Clone and redirect the packet associated to *skb* to another 704 | * net device of index *ifindex*. Both ingress and egress 705 | * interfaces can be used for redirection. The **BPF_F_INGRESS** 706 | * value in *flags* is used to make the distinction (ingress path 707 | * is selected if the flag is present, egress path otherwise). 708 | * This is the only flag supported for now. 709 | * 710 | * In comparison with **bpf_redirect**\ () helper, 711 | * **bpf_clone_redirect**\ () has the associated cost of 712 | * duplicating the packet buffer, but this can be executed out of 713 | * the eBPF program. Conversely, **bpf_redirect**\ () is more 714 | * efficient, but it is handled through an action code where the 715 | * redirection happens only after the eBPF program has returned. 716 | * 717 | * A call to this helper is susceptible to change the underlaying 718 | * packet buffer. Therefore, at load time, all checks on pointers 719 | * previously done by the verifier are invalidated and must be 720 | * performed again, if the helper is used in combination with 721 | * direct packet access. 722 | * Return 723 | * 0 on success, or a negative error in case of failure. 724 | * 725 | * u64 bpf_get_current_pid_tgid(void) 726 | * Return 727 | * A 64-bit integer containing the current tgid and pid, and 728 | * created as such: 729 | * *current_task*\ **->tgid << 32 \|** 730 | * *current_task*\ **->pid**. 731 | * 732 | * u64 bpf_get_current_uid_gid(void) 733 | * Return 734 | * A 64-bit integer containing the current GID and UID, and 735 | * created as such: *current_gid* **<< 32 \|** *current_uid*. 736 | * 737 | * int bpf_get_current_comm(char *buf, u32 size_of_buf) 738 | * Description 739 | * Copy the **comm** attribute of the current task into *buf* of 740 | * *size_of_buf*. 
The **comm** attribute contains the name of 741 | * the executable (excluding the path) for the current task. The 742 | * *size_of_buf* must be strictly positive. On success, the 743 | * helper makes sure that the *buf* is NUL-terminated. On failure, 744 | * it is filled with zeroes. 745 | * Return 746 | * 0 on success, or a negative error in case of failure. 747 | * 748 | * u32 bpf_get_cgroup_classid(struct sk_buff *skb) 749 | * Description 750 | * Retrieve the classid for the current task, i.e. for the net_cls 751 | * cgroup to which *skb* belongs. 752 | * 753 | * This helper can be used on TC egress path, but not on ingress. 754 | * 755 | * The net_cls cgroup provides an interface to tag network packets 756 | * based on a user-provided identifier for all traffic coming from 757 | * the tasks belonging to the related cgroup. See also the related 758 | * kernel documentation, available from the Linux sources in file 759 | * *Documentation/cgroup-v1/net_cls.txt*. 760 | * 761 | * The Linux kernel has two versions for cgroups: there are 762 | * cgroups v1 and cgroups v2. Both are available to users, who can 763 | * use a mixture of them, but note that the net_cls cgroup is for 764 | * cgroup v1 only. This makes it incompatible with BPF programs 765 | * run on cgroups, which is a cgroup-v2-only feature (a socket can 766 | * only hold data for one version of cgroups at a time). 767 | * 768 | * This helper is only available if the kernel was compiled with 769 | * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to 770 | * "**y**" or to "**m**". 771 | * Return 772 | * The classid, or 0 for the default unconfigured classid. 773 | * 774 | * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 775 | * Description 776 | * Push a *vlan_tci* (VLAN tag control information) of protocol 777 | * *vlan_proto* to the packet associated to *skb*, then update 778 | * the checksum. 
Note that if *vlan_proto* is different from 779 | * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to 780 | * be **ETH_P_8021Q**. 781 | * 782 | * A call to this helper is susceptible to change the underlaying 783 | * packet buffer. Therefore, at load time, all checks on pointers 784 | * previously done by the verifier are invalidated and must be 785 | * performed again, if the helper is used in combination with 786 | * direct packet access. 787 | * Return 788 | * 0 on success, or a negative error in case of failure. 789 | * 790 | * int bpf_skb_vlan_pop(struct sk_buff *skb) 791 | * Description 792 | * Pop a VLAN header from the packet associated to *skb*. 793 | * 794 | * A call to this helper is susceptible to change the underlaying 795 | * packet buffer. Therefore, at load time, all checks on pointers 796 | * previously done by the verifier are invalidated and must be 797 | * performed again, if the helper is used in combination with 798 | * direct packet access. 799 | * Return 800 | * 0 on success, or a negative error in case of failure. 801 | * 802 | * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) 803 | * Description 804 | * Get tunnel metadata. This helper takes a pointer *key* to an 805 | * empty **struct bpf_tunnel_key** of **size**, that will be 806 | * filled with tunnel metadata for the packet associated to *skb*. 807 | * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which 808 | * indicates that the tunnel is based on IPv6 protocol instead of 809 | * IPv4. 810 | * 811 | * The **struct bpf_tunnel_key** is an object that generalizes the 812 | * principal parameters used by various tunneling protocols into a 813 | * single struct. This way, it can be used to easily make a 814 | * decision based on the contents of the encapsulation header, 815 | * "summarized" in this struct. 
In particular, it holds the IP 816 | * address of the remote end (IPv4 or IPv6, depending on the case) 817 | * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, 818 | * this struct exposes the *key*\ **->tunnel_id**, which is 819 | * generally mapped to a VNI (Virtual Network Identifier), making 820 | * it programmable together with the **bpf_skb_set_tunnel_key**\ 821 | * () helper. 822 | * 823 | * Let's imagine that the following code is part of a program 824 | * attached to the TC ingress interface, on one end of a GRE 825 | * tunnel, and is supposed to filter out all messages coming from 826 | * remote ends with IPv4 address other than 10.0.0.1: 827 | * 828 | * :: 829 | * 830 | * int ret; 831 | * struct bpf_tunnel_key key = {}; 832 | * 833 | * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); 834 | * if (ret < 0) 835 | * return TC_ACT_SHOT; // drop packet 836 | * 837 | * if (key.remote_ipv4 != 0x0a000001) 838 | * return TC_ACT_SHOT; // drop packet 839 | * 840 | * return TC_ACT_OK; // accept packet 841 | * 842 | * This interface can also be used with all encapsulation devices 843 | * that can operate in "collect metadata" mode: instead of having 844 | * one network device per specific configuration, the "collect 845 | * metadata" mode only requires a single device where the 846 | * configuration can be extracted from this helper. 847 | * 848 | * This can be used together with various tunnels such as VXLan, 849 | * Geneve, GRE or IP in IP (IPIP). 850 | * Return 851 | * 0 on success, or a negative error in case of failure. 852 | * 853 | * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) 854 | * Description 855 | * Populate tunnel metadata for packet associated to *skb.* The 856 | * tunnel metadata is set to the contents of *key*, of *size*. 
The 857 | * *flags* can be set to a combination of the following values: 858 | * 859 | * **BPF_F_TUNINFO_IPV6** 860 | * Indicate that the tunnel is based on IPv6 protocol 861 | * instead of IPv4. 862 | * **BPF_F_ZERO_CSUM_TX** 863 | * For IPv4 packets, add a flag to tunnel metadata 864 | * indicating that checksum computation should be skipped 865 | * and checksum set to zeroes. 866 | * **BPF_F_DONT_FRAGMENT** 867 | * Add a flag to tunnel metadata indicating that the 868 | * packet should not be fragmented. 869 | * **BPF_F_SEQ_NUMBER** 870 | * Add a flag to tunnel metadata indicating that a 871 | * sequence number should be added to tunnel header before 872 | * sending the packet. This flag was added for GRE 873 | * encapsulation, but might be used with other protocols 874 | * as well in the future. 875 | * 876 | * Here is a typical usage on the transmit path: 877 | * 878 | * :: 879 | * 880 | * struct bpf_tunnel_key key; 881 | * populate key ... 882 | * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); 883 | * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); 884 | * 885 | * See also the description of the **bpf_skb_get_tunnel_key**\ () 886 | * helper for additional information. 887 | * Return 888 | * 0 on success, or a negative error in case of failure. 889 | * 890 | * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) 891 | * Description 892 | * Read the value of a perf event counter. This helper relies on a 893 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of 894 | * the perf event counter is selected when *map* is updated with 895 | * perf event file descriptors. The *map* is an array whose size 896 | * is the number of available CPUs, and each cell contains a value 897 | * relative to one CPU. The value to retrieve is indicated by 898 | * *flags*, that contains the index of the CPU to look up, masked 899 | * with **BPF_F_INDEX_MASK**. 
Alternatively, *flags* can be set to 900 | * **BPF_F_CURRENT_CPU** to indicate that the value for the 901 | * current CPU should be retrieved. 902 | * 903 | * Note that before Linux 4.13, only hardware perf event can be 904 | * retrieved. 905 | * 906 | * Also, be aware that the newer helper 907 | * **bpf_perf_event_read_value**\ () is recommended over 908 | * **bpf_perf_event_read**\ () in general. The latter has some ABI 909 | * quirks where error and counter value are used as a return code 910 | * (which is wrong to do since ranges may overlap). This issue is 911 | * fixed with **bpf_perf_event_read_value**\ (), which at the same 912 | * time provides more features over the **bpf_perf_event_read**\ 913 | * () interface. Please refer to the description of 914 | * **bpf_perf_event_read_value**\ () for details. 915 | * Return 916 | * The value of the perf event counter read from the map, or a 917 | * negative error code in case of failure. 918 | * 919 | * int bpf_redirect(u32 ifindex, u64 flags) 920 | * Description 921 | * Redirect the packet to another net device of index *ifindex*. 922 | * This helper is somewhat similar to **bpf_clone_redirect**\ 923 | * (), except that the packet is not cloned, which provides 924 | * increased performance. 925 | * 926 | * Except for XDP, both ingress and egress interfaces can be used 927 | * for redirection. The **BPF_F_INGRESS** value in *flags* is used 928 | * to make the distinction (ingress path is selected if the flag 929 | * is present, egress path otherwise). Currently, XDP only 930 | * supports redirection to the egress interface, and accepts no 931 | * flag at all. 932 | * 933 | * The same effect can be attained with the more generic 934 | * **bpf_redirect_map**\ (), which requires specific maps to be 935 | * used but offers better performance. 936 | * Return 937 | * For XDP, the helper returns **XDP_REDIRECT** on success or 938 | * **XDP_ABORTED** on error. 
For other program types, the values 939 | * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on 940 | * error. 941 | * 942 | * u32 bpf_get_route_realm(struct sk_buff *skb) 943 | * Description 944 | * Retrieve the realm of the route, that is to say the 945 | * **tclassid** field of the destination for the *skb*. The 946 | * identifier retrieved is a user-provided tag, similar to the 947 | * one used with the net_cls cgroup (see description for 948 | * **bpf_get_cgroup_classid**\ () helper), but here this tag is 949 | * held by a route (a destination entry), not by a task. 950 | * 951 | * Retrieving this identifier works with the clsact TC egress hook 952 | * (see also **tc-bpf(8)**), or alternatively on conventional 953 | * classful egress qdiscs, but not on TC ingress path. In case of 954 | * clsact TC egress hook, this has the advantage that, internally, 955 | * the destination entry has not been dropped yet in the transmit 956 | * path. Therefore, the destination entry does not need to be 957 | * artificially held via **netif_keep_dst**\ () for a classful 958 | * qdisc until the *skb* is freed. 959 | * 960 | * This helper is available only if the kernel was compiled with 961 | * **CONFIG_IP_ROUTE_CLASSID** configuration option. 962 | * Return 963 | * The realm of the route for the packet associated to *skb*, or 0 964 | * if none was found. 965 | * 966 | * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) 967 | * Description 968 | * Write raw *data* blob into a special BPF perf event held by 969 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 970 | * event must have the following attributes: **PERF_SAMPLE_RAW** 971 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 972 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 973 | * 974 | * The *flags* are used to indicate the index in *map* for which 975 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 
976 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 977 | * to indicate that the index of the current CPU core should be 978 | * used. 979 | * 980 | * The value to write, of *size*, is passed through eBPF stack and 981 | * pointed by *data*. 982 | * 983 | * The context of the program *ctx* needs also be passed to the 984 | * helper. 985 | * 986 | * On user space, a program willing to read the values needs to 987 | * call **perf_event_open**\ () on the perf event (either for 988 | * one or for all CPUs) and to store the file descriptor into the 989 | * *map*. This must be done before the eBPF program can send data 990 | * into it. An example is available in file 991 | * *samples/bpf/trace_output_user.c* in the Linux kernel source 992 | * tree (the eBPF program counterpart is in 993 | * *samples/bpf/trace_output_kern.c*). 994 | * 995 | * **bpf_perf_event_output**\ () achieves better performance 996 | * than **bpf_trace_printk**\ () for sharing data with user 997 | * space, and is much better suitable for streaming data from eBPF 998 | * programs. 999 | * 1000 | * Note that this helper is not restricted to tracing use cases 1001 | * and can be used with programs attached to TC or XDP as well, 1002 | * where it allows for passing data to user space listeners. Data 1003 | * can be: 1004 | * 1005 | * * Only custom structs, 1006 | * * Only the packet payload, or 1007 | * * A combination of both. 1008 | * Return 1009 | * 0 on success, or a negative error in case of failure. 1010 | * 1011 | * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) 1012 | * Description 1013 | * This helper was provided as an easy way to load data from a 1014 | * packet. It can be used to load *len* bytes from *offset* from 1015 | * the packet associated to *skb*, into the buffer pointed by 1016 | * *to*. 
1017 | * 1018 | * Since Linux 4.7, usage of this helper has mostly been replaced 1019 | * by "direct packet access", enabling packet data to be 1020 | * manipulated with *skb*\ **->data** and *skb*\ **->data_end** 1021 | * pointing respectively to the first byte of packet data and to 1022 | * the byte after the last byte of packet data. However, it 1023 | * remains useful if one wishes to read large quantities of data 1024 | * at once from a packet into the eBPF stack. 1025 | * Return 1026 | * 0 on success, or a negative error in case of failure. 1027 | * 1028 | * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags) 1029 | * Description 1030 | * Walk a user or a kernel stack and return its id. To achieve 1031 | * this, the helper needs *ctx*, which is a pointer to the context 1032 | * on which the tracing program is executed, and a pointer to a 1033 | * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. 1034 | * 1035 | * The last argument, *flags*, holds the number of stack frames to 1036 | * skip (from 0 to 255), masked with 1037 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 1038 | * a combination of the following flags: 1039 | * 1040 | * **BPF_F_USER_STACK** 1041 | * Collect a user space stack instead of a kernel stack. 1042 | * **BPF_F_FAST_STACK_CMP** 1043 | * Compare stacks by hash only. 1044 | * **BPF_F_REUSE_STACKID** 1045 | * If two different stacks hash into the same *stackid*, 1046 | * discard the old one. 1047 | * 1048 | * The stack id retrieved is a 32 bit long integer handle which 1049 | * can be further combined with other data (including other stack 1050 | * ids) and used as a key into maps. This can be useful for 1051 | * generating a variety of graphs (such as flame graphs or off-cpu 1052 | * graphs). 1053 | * 1054 | * For walking a stack, this helper is an improvement over 1055 | * **bpf_probe_read**\ (), which can be used with unrolled loops 1056 | * but is not efficient and consumes a lot of eBPF instructions. 
1057 | * Instead, **bpf_get_stackid**\ () can collect up to 1058 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that 1059 | * this limit can be controlled with the **sysctl** program, and 1060 | * that it should be manually increased in order to profile long 1061 | * user stacks (such as stacks for Java programs). To do so, use: 1062 | * 1063 | * :: 1064 | * 1065 | * # sysctl kernel.perf_event_max_stack= 1066 | * Return 1067 | * The positive or null stack id on success, or a negative error 1068 | * in case of failure. 1069 | * 1070 | * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) 1071 | * Description 1072 | * Compute a checksum difference, from the raw buffer pointed by 1073 | * *from*, of length *from_size* (that must be a multiple of 4), 1074 | * towards the raw buffer pointed by *to*, of size *to_size* 1075 | * (same remark). An optional *seed* can be added to the value 1076 | * (this can be cascaded, the seed may come from a previous call 1077 | * to the helper). 1078 | * 1079 | * This is flexible enough to be used in several ways: 1080 | * 1081 | * * With *from_size* == 0, *to_size* > 0 and *seed* set to 1082 | * checksum, it can be used when pushing new data. 1083 | * * With *from_size* > 0, *to_size* == 0 and *seed* set to 1084 | * checksum, it can be used when removing data from a packet. 1085 | * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it 1086 | * can be used to compute a diff. Note that *from_size* and 1087 | * *to_size* do not need to be equal. 1088 | * 1089 | * This helper can be used in combination with 1090 | * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to 1091 | * which one can feed in the difference computed with 1092 | * **bpf_csum_diff**\ (). 1093 | * Return 1094 | * The checksum result, or a negative error code in case of 1095 | * failure. 
1096 | * 1097 | * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) 1098 | * Description 1099 | * Retrieve tunnel options metadata for the packet associated to 1100 | * *skb*, and store the raw tunnel option data to the buffer *opt* 1101 | * of *size*. 1102 | * 1103 | * This helper can be used with encapsulation devices that can 1104 | * operate in "collect metadata" mode (please refer to the related 1105 | * note in the description of **bpf_skb_get_tunnel_key**\ () for 1106 | * more details). A particular example where this can be used is 1107 | * in combination with the Geneve encapsulation protocol, where it 1108 | * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) 1109 | * and retrieving arbitrary TLVs (Type-Length-Value headers) from 1110 | * the eBPF program. This allows for full customization of these 1111 | * headers. 1112 | * Return 1113 | * The size of the option data retrieved. 1114 | * 1115 | * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) 1116 | * Description 1117 | * Set tunnel options metadata for the packet associated to *skb* 1118 | * to the option data contained in the raw buffer *opt* of *size*. 1119 | * 1120 | * See also the description of the **bpf_skb_get_tunnel_opt**\ () 1121 | * helper for additional information. 1122 | * Return 1123 | * 0 on success, or a negative error in case of failure. 1124 | * 1125 | * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) 1126 | * Description 1127 | * Change the protocol of the *skb* to *proto*. Currently 1128 | * supported are transition from IPv4 to IPv6, and from IPv6 to 1129 | * IPv4. The helper takes care of the groundwork for the 1130 | * transition, including resizing the socket buffer. The eBPF 1131 | * program is expected to fill the new headers, if any, via 1132 | * **skb_store_bytes**\ () and to recompute the checksums with 1133 | * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ 1134 | * (). 
The main case for this helper is to perform NAT64 1135 | * operations out of an eBPF program. 1136 | * 1137 | * Internally, the GSO type is marked as dodgy so that headers are 1138 | * checked and segments are recalculated by the GSO/GRO engine. 1139 | * The size for GSO target is adapted as well. 1140 | * 1141 | * All values for *flags* are reserved for future usage, and must 1142 | * be left at zero. 1143 | * 1144 | * A call to this helper is susceptible to change the underlaying 1145 | * packet buffer. Therefore, at load time, all checks on pointers 1146 | * previously done by the verifier are invalidated and must be 1147 | * performed again, if the helper is used in combination with 1148 | * direct packet access. 1149 | * Return 1150 | * 0 on success, or a negative error in case of failure. 1151 | * 1152 | * int bpf_skb_change_type(struct sk_buff *skb, u32 type) 1153 | * Description 1154 | * Change the packet type for the packet associated to *skb*. This 1155 | * comes down to setting *skb*\ **->pkt_type** to *type*, except 1156 | * the eBPF program does not have a write access to *skb*\ 1157 | * **->pkt_type** beside this helper. Using a helper here allows 1158 | * for graceful handling of errors. 1159 | * 1160 | * The major use case is to change incoming *skb*s to 1161 | * **PACKET_HOST** in a programmatic way instead of having to 1162 | * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for 1163 | * example. 1164 | * 1165 | * Note that *type* only allows certain values. At this time, they 1166 | * are: 1167 | * 1168 | * **PACKET_HOST** 1169 | * Packet is for us. 1170 | * **PACKET_BROADCAST** 1171 | * Send packet to all. 1172 | * **PACKET_MULTICAST** 1173 | * Send packet to group. 1174 | * **PACKET_OTHERHOST** 1175 | * Send packet to someone else. 1176 | * Return 1177 | * 0 on success, or a negative error in case of failure. 
1178 | * 1179 | * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) 1180 | * Description 1181 | * Check whether *skb* is a descendant of the cgroup2 held by 1182 | * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. 1183 | * Return 1184 | * The return value depends on the result of the test, and can be: 1185 | * 1186 | * * 0, if the *skb* failed the cgroup2 descendant test. 1187 | * * 1, if the *skb* succeeded the cgroup2 descendant test. 1188 | * * A negative error code, if an error occurred. 1189 | * 1190 | * u32 bpf_get_hash_recalc(struct sk_buff *skb) 1191 | * Description 1192 | * Retrieve the hash of the packet, *skb*\ **->hash**. If it is 1193 | * not set, in particular if the hash was cleared due to mangling, 1194 | * recompute this hash. Later accesses to the hash can be done 1195 | * directly with *skb*\ **->hash**. 1196 | * 1197 | * Calling **bpf_set_hash_invalid**\ (), changing a packet 1198 | * prototype with **bpf_skb_change_proto**\ (), or calling 1199 | * **bpf_skb_store_bytes**\ () with the 1200 | * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear 1201 | * the hash and to trigger a new computation for the next call to 1202 | * **bpf_get_hash_recalc**\ (). 1203 | * Return 1204 | * The 32-bit hash. 1205 | * 1206 | * u64 bpf_get_current_task(void) 1207 | * Return 1208 | * A pointer to the current task struct. 1209 | * 1210 | * int bpf_probe_write_user(void *dst, const void *src, u32 len) 1211 | * Description 1212 | * Attempt in a safe way to write *len* bytes from the buffer 1213 | * *src* to *dst* in memory. It only works for threads that are in 1214 | * user context, and *dst* must be a valid user space address. 1215 | * 1216 | * This helper should not be used to implement any kind of 1217 | * security mechanism because of TOC-TOU attacks, but rather to 1218 | * debug, divert, and manipulate execution of semi-cooperative 1219 | * processes. 
1220 | * 1221 | * Keep in mind that this feature is meant for experiments, and it 1222 | * has a risk of crashing the system and running programs. 1223 | * Therefore, when an eBPF program using this helper is attached, 1224 | * a warning including PID and process name is printed to kernel 1225 | * logs. 1226 | * Return 1227 | * 0 on success, or a negative error in case of failure. 1228 | * 1229 | * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) 1230 | * Description 1231 | * Check whether the probe is being run in the context of a given 1232 | * subset of the cgroup2 hierarchy. The cgroup2 to test is held by 1233 | * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. 1234 | * Return 1235 | * The return value depends on the result of the test, and can be: 1236 | * 1237 | * * 1, if current task belongs to the cgroup2. 1238 | * * 0, if current task does not belong to the cgroup2. 1239 | * * A negative error code, if an error occurred. 1240 | * 1241 | * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) 1242 | * Description 1243 | * Resize (trim or grow) the packet associated to *skb* to the 1244 | * new *len*. The *flags* are reserved for future usage, and must 1245 | * be left at zero. 1246 | * 1247 | * The basic idea is that the helper performs the needed work to 1248 | * change the size of the packet, then the eBPF program rewrites 1249 | * the rest via helpers like **bpf_skb_store_bytes**\ (), 1250 | * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ () 1251 | * and others. This helper is a slow path utility intended for 1252 | * replies with control messages. And because it is targeted for 1253 | * slow path, the helper itself can afford to be slow: it 1254 | * implicitly linearizes, unclones and drops offloads from the 1255 | * *skb*. 1256 | * 1257 | * A call to this helper is susceptible to change the underlying 1258 | * packet buffer. 
Therefore, at load time, all checks on pointers 1259 | * previously done by the verifier are invalidated and must be 1260 | * performed again, if the helper is used in combination with 1261 | * direct packet access. 1262 | * Return 1263 | * 0 on success, or a negative error in case of failure. 1264 | * 1265 | * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) 1266 | * Description 1267 | * Pull in non-linear data in case the *skb* is non-linear and not 1268 | * all of *len* are part of the linear section. Make *len* bytes 1269 | * from *skb* readable and writable. If a zero value is passed for 1270 | * *len*, then the whole length of the *skb* is pulled. 1271 | * 1272 | * This helper is only needed for reading and writing with direct 1273 | * packet access. 1274 | * 1275 | * For direct packet access, testing that offsets to access 1276 | * are within packet boundaries (test on *skb*\ **->data_end**) is 1277 | * susceptible to fail if offsets are invalid, or if the requested 1278 | * data is in non-linear parts of the *skb*. On failure the 1279 | * program can just bail out, or in the case of a non-linear 1280 | * buffer, use a helper to make the data available. The 1281 | * **bpf_skb_load_bytes**\ () helper is a first solution to access 1282 | * the data. Another one consists in using **bpf_skb_pull_data** 1283 | * to pull in once the non-linear parts, then retesting and 1284 | * eventually access the data. 1285 | * 1286 | * At the same time, this also makes sure the *skb* is uncloned, 1287 | * which is a necessary condition for direct write. As this needs 1288 | * to be an invariant for the write part only, the verifier 1289 | * detects writes and adds a prologue that is calling 1290 | * **bpf_skb_pull_data()** to effectively unclone the *skb* from 1291 | * the very beginning in case it is indeed cloned. 1292 | * 1293 | * A call to this helper is susceptible to change the underlaying 1294 | * packet buffer. 
Therefore, at load time, all checks on pointers 1295 | * previously done by the verifier are invalidated and must be 1296 | * performed again, if the helper is used in combination with 1297 | * direct packet access. 1298 | * Return 1299 | * 0 on success, or a negative error in case of failure. 1300 | * 1301 | * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) 1302 | * Description 1303 | * Add the checksum *csum* into *skb*\ **->csum** in case the 1304 | * driver has supplied a checksum for the entire packet into that 1305 | * field. Return an error otherwise. This helper is intended to be 1306 | * used in combination with **bpf_csum_diff**\ (), in particular 1307 | * when the checksum needs to be updated after data has been 1308 | * written into the packet through direct packet access. 1309 | * Return 1310 | * The checksum on success, or a negative error code in case of 1311 | * failure. 1312 | * 1313 | * void bpf_set_hash_invalid(struct sk_buff *skb) 1314 | * Description 1315 | * Invalidate the current *skb*\ **->hash**. It can be used after 1316 | * mangling on headers through direct packet access, in order to 1317 | * indicate that the hash is outdated and to trigger a 1318 | * recalculation the next time the kernel tries to access this 1319 | * hash or when the **bpf_get_hash_recalc**\ () helper is called. 1320 | * 1321 | * int bpf_get_numa_node_id(void) 1322 | * Description 1323 | * Return the id of the current NUMA node. The primary use case 1324 | * for this helper is the selection of sockets for the local NUMA 1325 | * node, when the program is attached to sockets using the 1326 | * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), 1327 | * but the helper is also available to other eBPF program types, 1328 | * similarly to **bpf_get_smp_processor_id**\ (). 1329 | * Return 1330 | * The id of current NUMA node. 
1331 | * 1332 | * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) 1333 | * Description 1334 | * Grows headroom of packet associated to *skb* and adjusts the 1335 | * offset of the MAC header accordingly, adding *len* bytes of 1336 | * space. It automatically extends and reallocates memory as 1337 | * required. 1338 | * 1339 | * This helper can be used on a layer 3 *skb* to push a MAC header 1340 | * for redirection into a layer 2 device. 1341 | * 1342 | * All values for *flags* are reserved for future usage, and must 1343 | * be left at zero. 1344 | * 1345 | * A call to this helper is susceptible to change the underlaying 1346 | * packet buffer. Therefore, at load time, all checks on pointers 1347 | * previously done by the verifier are invalidated and must be 1348 | * performed again, if the helper is used in combination with 1349 | * direct packet access. 1350 | * Return 1351 | * 0 on success, or a negative error in case of failure. 1352 | * 1353 | * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) 1354 | * Description 1355 | * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that 1356 | * it is possible to use a negative value for *delta*. This helper 1357 | * can be used to prepare the packet for pushing or popping 1358 | * headers. 1359 | * 1360 | * A call to this helper is susceptible to change the underlaying 1361 | * packet buffer. Therefore, at load time, all checks on pointers 1362 | * previously done by the verifier are invalidated and must be 1363 | * performed again, if the helper is used in combination with 1364 | * direct packet access. 1365 | * Return 1366 | * 0 on success, or a negative error in case of failure. 1367 | * 1368 | * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) 1369 | * Description 1370 | * Copy a NUL terminated string from an unsafe address 1371 | * *unsafe_ptr* to *dst*. The *size* should include the 1372 | * terminating NUL byte. 
In case the string length is smaller than 1373 | * *size*, the target is not padded with further NUL bytes. If the 1374 | * string length is larger than *size*, just *size*-1 bytes are 1375 | * copied and the last byte is set to NUL. 1376 | * 1377 | * On success, the length of the copied string is returned. This 1378 | * makes this helper useful in tracing programs for reading 1379 | * strings, and more importantly to get its length at runtime. See 1380 | * the following snippet: 1381 | * 1382 | * :: 1383 | * 1384 | * SEC("kprobe/sys_open") 1385 | * void bpf_sys_open(struct pt_regs *ctx) 1386 | * { 1387 | * char buf[PATHLEN]; // PATHLEN is defined to 256 1388 | * int res = bpf_probe_read_str(buf, sizeof(buf), 1389 | * ctx->di); 1390 | * 1391 | * // Consume buf, for example push it to 1392 | * // userspace via bpf_perf_event_output(); we 1393 | * // can use res (the string length) as event 1394 | * // size, after checking its boundaries. 1395 | * } 1396 | * 1397 | * In comparison, using **bpf_probe_read()** helper here instead 1398 | * to read the string would require to estimate the length at 1399 | * compile time, and would often result in copying more memory 1400 | * than necessary. 1401 | * 1402 | * Another useful use case is when parsing individual process 1403 | * arguments or individual environment variables navigating 1404 | * *current*\ **->mm->arg_start** and *current*\ 1405 | * **->mm->env_start**: using this helper and the return value, 1406 | * one can quickly iterate at the right offset of the memory area. 1407 | * Return 1408 | * On success, the strictly positive length of the string, 1409 | * including the trailing NUL character. On error, a negative 1410 | * value. 1411 | * 1412 | * u64 bpf_get_socket_cookie(struct sk_buff *skb) 1413 | * Description 1414 | * If the **struct sk_buff** pointed by *skb* has a known socket, 1415 | * retrieve the cookie (generated by the kernel) of this socket. 
1416 | * If no cookie has been set yet, generate a new cookie. Once 1417 | * generated, the socket cookie remains stable for the life of the 1418 | * socket. This helper can be useful for monitoring per socket 1419 | * networking traffic statistics as it provides a unique socket 1420 | * identifier per namespace. 1421 | * Return 1422 | * An 8-byte long non-decreasing number on success, or 0 if the 1423 | * socket field is missing inside *skb*. 1424 | * 1425 | * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) 1426 | * Description 1427 | * Equivalent to bpf_get_socket_cookie() helper that accepts 1428 | * *skb*, but gets socket from **struct bpf_sock_addr** context. 1429 | * Return 1430 | * An 8-byte long non-decreasing number. 1431 | * 1432 | * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) 1433 | * Description 1434 | * Equivalent to bpf_get_socket_cookie() helper that accepts 1435 | * *skb*, but gets socket from **struct bpf_sock_ops** context. 1436 | * Return 1437 | * An 8-byte long non-decreasing number. 1438 | * 1439 | * u32 bpf_get_socket_uid(struct sk_buff *skb) 1440 | * Return 1441 | * The owner UID of the socket associated to *skb*. If the socket 1442 | * is **NULL**, or if it is not a full socket (i.e. if it is a 1443 | * time-wait or a request socket instead), **overflowuid** value 1444 | * is returned (note that **overflowuid** might also be the actual 1445 | * UID value for the socket). 1446 | * 1447 | * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) 1448 | * Description 1449 | * Set the full hash for *skb* (set the field *skb*\ **->hash**) 1450 | * to value *hash*. 1451 | * Return 1452 | * 0 1453 | * 1454 | * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) 1455 | * Description 1456 | * Emulate a call to **setsockopt()** on the socket associated to 1457 | * *bpf_socket*, which must be a full socket. 
The *level* at 1458 | * which the option resides and the name *optname* of the option 1459 | * must be specified, see **setsockopt(2)** for more information. 1460 | * The option value of length *optlen* is pointed by *optval*. 1461 | * 1462 | * This helper actually implements a subset of **setsockopt()**. 1463 | * It supports the following *level*\ s: 1464 | * 1465 | * * **SOL_SOCKET**, which supports the following *optname*\ s: 1466 | * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, 1467 | * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. 1468 | * * **IPPROTO_TCP**, which supports the following *optname*\ s: 1469 | * **TCP_CONGESTION**, **TCP_BPF_IW**, 1470 | * **TCP_BPF_SNDCWND_CLAMP**. 1471 | * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1472 | * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1473 | * Return 1474 | * 0 on success, or a negative error in case of failure. 1475 | * 1476 | * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) 1477 | * Description 1478 | * Grow or shrink the room for data in the packet associated to 1479 | * *skb* by *len_diff*, and according to the selected *mode*. 1480 | * 1481 | * There are two supported modes at this time: 1482 | * 1483 | * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer 1484 | * (room space is added or removed below the layer 2 header). 1485 | * 1486 | * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer 1487 | * (room space is added or removed below the layer 3 header). 1488 | * 1489 | * The following flags are supported at this time: 1490 | * 1491 | * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. 1492 | * Adjusting mss in this way is not allowed for datagrams. 1493 | * 1494 | * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: 1495 | * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: 1496 | * Any new space is reserved to hold a tunnel header. 1497 | * Configure skb offsets and other fields accordingly. 
1498 | * 1499 | * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: 1500 | * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: 1501 | * Use with ENCAP_L3 flags to further specify the tunnel type. 1502 | * 1503 | * A call to this helper is susceptible to change the underlaying 1504 | * packet buffer. Therefore, at load time, all checks on pointers 1505 | * previously done by the verifier are invalidated and must be 1506 | * performed again, if the helper is used in combination with 1507 | * direct packet access. 1508 | * Return 1509 | * 0 on success, or a negative error in case of failure. 1510 | * 1511 | * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) 1512 | * Description 1513 | * Redirect the packet to the endpoint referenced by *map* at 1514 | * index *key*. Depending on its type, this *map* can contain 1515 | * references to net devices (for forwarding packets through other 1516 | * ports), or to CPUs (for redirecting XDP frames to another CPU; 1517 | * but this is only implemented for native XDP (with driver 1518 | * support) as of this writing). 1519 | * 1520 | * All values for *flags* are reserved for future usage, and must 1521 | * be left at zero. 1522 | * 1523 | * When used to redirect packets to net devices, this helper 1524 | * provides a high performance increase over **bpf_redirect**\ (). 1525 | * This is due to various implementation details of the underlying 1526 | * mechanisms, one of which is the fact that **bpf_redirect_map**\ 1527 | * () tries to send packet as a "bulk" to the device. 1528 | * Return 1529 | * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. 1530 | * 1531 | * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) 1532 | * Description 1533 | * Redirect the packet to the socket referenced by *map* (of type 1534 | * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and 1535 | * egress interfaces can be used for redirection. 
The 1536 | * **BPF_F_INGRESS** value in *flags* is used to make the 1537 | * distinction (ingress path is selected if the flag is present, 1538 | * egress path otherwise). This is the only flag supported for now. 1539 | * Return 1540 | * **SK_PASS** on success, or **SK_DROP** on error. 1541 | * 1542 | * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) 1543 | * Description 1544 | * Add an entry to, or update a *map* referencing sockets. The 1545 | * *skops* is used as a new value for the entry associated to 1546 | * *key*. *flags* is one of: 1547 | * 1548 | * **BPF_NOEXIST** 1549 | * The entry for *key* must not exist in the map. 1550 | * **BPF_EXIST** 1551 | * The entry for *key* must already exist in the map. 1552 | * **BPF_ANY** 1553 | * No condition on the existence of the entry for *key*. 1554 | * 1555 | * If the *map* has eBPF programs (parser and verdict), those will 1556 | * be inherited by the socket being added. If the socket is 1557 | * already attached to eBPF programs, this results in an error. 1558 | * Return 1559 | * 0 on success, or a negative error in case of failure. 1560 | * 1561 | * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) 1562 | * Description 1563 | * Adjust the address pointed by *xdp_md*\ **->data_meta** by 1564 | * *delta* (which can be positive or negative). Note that this 1565 | * operation modifies the address stored in *xdp_md*\ **->data**, 1566 | * so the latter must be loaded only after the helper has been 1567 | * called. 1568 | * 1569 | * The use of *xdp_md*\ **->data_meta** is optional and programs 1570 | * are not required to use it. The rationale is that when the 1571 | * packet is processed with XDP (e.g. 
as DoS filter), it is 1572 | * possible to push further meta data along with it before passing 1573 | * to the stack, and to give the guarantee that an ingress eBPF 1574 | * program attached as a TC classifier on the same device can pick 1575 | * this up for further post-processing. Since TC works with socket 1576 | * buffers, it remains possible to set from XDP the **mark** or 1577 | * **priority** pointers, or other pointers for the socket buffer. 1578 | * Having this scratch space generic and programmable allows for 1579 | * more flexibility as the user is free to store whatever meta 1580 | * data they need. 1581 | * 1582 | * A call to this helper is susceptible to change the underlaying 1583 | * packet buffer. Therefore, at load time, all checks on pointers 1584 | * previously done by the verifier are invalidated and must be 1585 | * performed again, if the helper is used in combination with 1586 | * direct packet access. 1587 | * Return 1588 | * 0 on success, or a negative error in case of failure. 1589 | * 1590 | * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) 1591 | * Description 1592 | * Read the value of a perf event counter, and store it into *buf* 1593 | * of size *buf_size*. This helper relies on a *map* of type 1594 | * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event 1595 | * counter is selected when *map* is updated with perf event file 1596 | * descriptors. The *map* is an array whose size is the number of 1597 | * available CPUs, and each cell contains a value relative to one 1598 | * CPU. The value to retrieve is indicated by *flags*, that 1599 | * contains the index of the CPU to look up, masked with 1600 | * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to 1601 | * **BPF_F_CURRENT_CPU** to indicate that the value for the 1602 | * current CPU should be retrieved. 
1603 | * 1604 | * This helper behaves in a way close to 1605 | * **bpf_perf_event_read**\ () helper, save that instead of 1606 | * just returning the value observed, it fills the *buf* 1607 | * structure. This allows for additional data to be retrieved: in 1608 | * particular, the enabled and running times (in *buf*\ 1609 | * **->enabled** and *buf*\ **->running**, respectively) are 1610 | * copied. In general, **bpf_perf_event_read_value**\ () is 1611 | * recommended over **bpf_perf_event_read**\ (), which has some 1612 | * ABI issues and provides fewer functionalities. 1613 | * 1614 | * These values are interesting, because hardware PMU (Performance 1615 | * Monitoring Unit) counters are limited resources. When there are 1616 | * more PMU based perf events opened than available counters, 1617 | * kernel will multiplex these events so each event gets certain 1618 | * percentage (but not all) of the PMU time. In case that 1619 | * multiplexing happens, the number of samples or counter value 1620 | * will not reflect the case compared to when no multiplexing 1621 | * occurs. This makes comparison between different runs difficult. 1622 | * Typically, the counter value should be normalized before 1623 | * comparing to other experiments. The usual normalization is done 1624 | * as follows. 1625 | * 1626 | * :: 1627 | * 1628 | * normalized_counter = counter * t_enabled / t_running 1629 | * 1630 | * Where t_enabled is the time enabled for event and t_running is 1631 | * the time running for event since last normalization. The 1632 | * enabled and running times are accumulated since the perf event 1633 | * open. To achieve scaling factor between two invocations of an 1634 | * eBPF program, users can use CPU id as the key (which is 1635 | * typical for perf array usage model) to remember the previous 1636 | * value and do the calculation inside the eBPF program. 1637 | * Return 1638 | * 0 on success, or a negative error in case of failure. 
1639 | * 1640 | * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) 1641 | * Description 1642 | * For an eBPF program attached to a perf event, retrieve the 1643 | * value of the event counter associated to *ctx* and store it in 1644 | * the structure pointed by *buf* and of size *buf_size*. Enabled 1645 | * and running times are also stored in the structure (see 1646 | * description of helper **bpf_perf_event_read_value**\ () for 1647 | * more details). 1648 | * Return 1649 | * 0 on success, or a negative error in case of failure. 1650 | * 1651 | * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) 1652 | * Description 1653 | * Emulate a call to **getsockopt()** on the socket associated to 1654 | * *bpf_socket*, which must be a full socket. The *level* at 1655 | * which the option resides and the name *optname* of the option 1656 | * must be specified, see **getsockopt(2)** for more information. 1657 | * The retrieved value is stored in the structure pointed by 1658 | * *optval* and of length *optlen*. 1659 | * 1660 | * This helper actually implements a subset of **getsockopt()**. 1661 | * It supports the following *level*\ s: 1662 | * 1663 | * * **IPPROTO_TCP**, which supports *optname* 1664 | * **TCP_CONGESTION**. 1665 | * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1666 | * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1667 | * Return 1668 | * 0 on success, or a negative error in case of failure. 1669 | * 1670 | * int bpf_override_return(struct pt_reg *regs, u64 rc) 1671 | * Description 1672 | * Used for error injection, this helper uses kprobes to override 1673 | * the return value of the probed function, and to set it to *rc*. 1674 | * The first argument is the context *regs* on which the kprobe 1675 | * works. 
1676 | * 1677 | * This helper works by setting the PC (program counter) 1678 | * to an override function which is run in place of the original 1679 | * probed function. This means the probed function is not run at 1680 | * all. The replacement function just returns with the required 1681 | * value. 1682 | * 1683 | * This helper has security implications, and thus is subject to 1684 | * restrictions. It is only available if the kernel was compiled 1685 | * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration 1686 | * option, and in this case it only works on functions tagged with 1687 | * **ALLOW_ERROR_INJECTION** in the kernel code. 1688 | * 1689 | * Also, the helper is only available for the architectures having 1690 | * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, 1691 | * x86 architecture is the only one to support this feature. 1692 | * Return 1693 | * 0 1694 | * 1695 | * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) 1696 | * Description 1697 | * Attempt to set the value of the **bpf_sock_ops_cb_flags** field 1698 | * for the full TCP socket associated to *bpf_sock_ops* to 1699 | * *argval*. 1700 | * 1701 | * The primary use of this field is to determine if there should 1702 | * be calls to eBPF programs of type 1703 | * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP 1704 | * code. A program of the same type can change its value, per 1705 | * connection and as necessary, when the connection is 1706 | * established. This field is directly accessible for reading, but 1707 | * this helper must be used for updates in order to return an 1708 | * error if an eBPF program tries to set a callback that is not 1709 | * supported in the current kernel. 
1710 | * 1711 | * The supported callback values that *argval* can combine are: 1712 | * 1713 | * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) 1714 | * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) 1715 | * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) 1716 | * 1717 | * Here are some examples of where one could call such eBPF 1718 | * program: 1719 | * 1720 | * * When RTO fires. 1721 | * * When a packet is retransmitted. 1722 | * * When the connection terminates. 1723 | * * When a packet is sent. 1724 | * * When a packet is received. 1725 | * Return 1726 | * Code **-EINVAL** if the socket is not a full TCP socket; 1727 | * otherwise, a positive number containing the bits that could not 1728 | * be set is returned (which comes down to 0 if all bits were set 1729 | * as required). 1730 | * 1731 | * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) 1732 | * Description 1733 | * This helper is used in programs implementing policies at the 1734 | * socket level. If the message *msg* is allowed to pass (i.e. if 1735 | * the verdict eBPF program returns **SK_PASS**), redirect it to 1736 | * the socket referenced by *map* (of type 1737 | * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and 1738 | * egress interfaces can be used for redirection. The 1739 | * **BPF_F_INGRESS** value in *flags* is used to make the 1740 | * distinction (ingress path is selected if the flag is present, 1741 | * egress path otherwise). This is the only flag supported for now. 1742 | * Return 1743 | * **SK_PASS** on success, or **SK_DROP** on error. 1744 | * 1745 | * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) 1746 | * Description 1747 | * For socket policies, apply the verdict of the eBPF program to 1748 | * the next *bytes* (number of bytes) of message *msg*. 
1749 | * 1750 | * For example, this helper can be used in the following cases: 1751 | * 1752 | * * A single **sendmsg**\ () or **sendfile**\ () system call 1753 | * contains multiple logical messages that the eBPF program is 1754 | * supposed to read and for which it should apply a verdict. 1755 | * * An eBPF program only cares to read the first *bytes* of a 1756 | * *msg*. If the message has a large payload, then setting up 1757 | * and calling the eBPF program repeatedly for all bytes, even 1758 | * though the verdict is already known, would create unnecessary 1759 | * overhead. 1760 | * 1761 | * When called from within an eBPF program, the helper sets a 1762 | * counter internal to the BPF infrastructure, that is used to 1763 | * apply the last verdict to the next *bytes*. If *bytes* is 1764 | * smaller than the current data being processed from a 1765 | * **sendmsg**\ () or **sendfile**\ () system call, the first 1766 | * *bytes* will be sent and the eBPF program will be re-run with 1767 | * the pointer for start of data pointing to byte number *bytes* 1768 | * **+ 1**. If *bytes* is larger than the current data being 1769 | * processed, then the eBPF verdict will be applied to multiple 1770 | * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are 1771 | * consumed. 1772 | * 1773 | * Note that if a socket closes with the internal counter holding 1774 | * a non-zero value, this is not a problem because data is not 1775 | * being buffered for *bytes* and is sent as it is received. 1776 | * Return 1777 | * 0 1778 | * 1779 | * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) 1780 | * Description 1781 | * For socket policies, prevent the execution of the verdict eBPF 1782 | * program for message *msg* until *bytes* (byte number) have been 1783 | * accumulated. 
1784 | * 1785 | * This can be used when one needs a specific number of bytes 1786 | * before a verdict can be assigned, even if the data spans 1787 | * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme 1788 | * case would be a user calling **sendmsg**\ () repeatedly with 1789 | * 1-byte long message segments. Obviously, this is bad for 1790 | * performance, but it is still valid. If the eBPF program needs 1791 | * *bytes* bytes to validate a header, this helper can be used to 1792 | * prevent the eBPF program to be called again until *bytes* have 1793 | * been accumulated. 1794 | * Return 1795 | * 0 1796 | * 1797 | * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) 1798 | * Description 1799 | * For socket policies, pull in non-linear data from user space 1800 | * for *msg* and set pointers *msg*\ **->data** and *msg*\ 1801 | * **->data_end** to *start* and *end* bytes offsets into *msg*, 1802 | * respectively. 1803 | * 1804 | * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 1805 | * *msg* it can only parse data that the (**data**, **data_end**) 1806 | * pointers have already consumed. For **sendmsg**\ () hooks this 1807 | * is likely the first scatterlist element. But for calls relying 1808 | * on the **sendpage** handler (e.g. **sendfile**\ ()) this will 1809 | * be the range (**0**, **0**) because the data is shared with 1810 | * user space and by default the objective is to avoid allowing 1811 | * user space to modify data while (or after) eBPF verdict is 1812 | * being decided. This helper can be used to pull in data and to 1813 | * set the start and end pointer to given values. Data will be 1814 | * copied if necessary (i.e. if data was not linear and if start 1815 | * and end pointers do not point to the same chunk). 1816 | * 1817 | * A call to this helper is susceptible to change the underlaying 1818 | * packet buffer. 
Therefore, at load time, all checks on pointers 1819 | * previously done by the verifier are invalidated and must be 1820 | * performed again, if the helper is used in combination with 1821 | * direct packet access. 1822 | * 1823 | * All values for *flags* are reserved for future usage, and must 1824 | * be left at zero. 1825 | * Return 1826 | * 0 on success, or a negative error in case of failure. 1827 | * 1828 | * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) 1829 | * Description 1830 | * Bind the socket associated to *ctx* to the address pointed by 1831 | * *addr*, of length *addr_len*. This allows for making outgoing 1832 | * connection from the desired IP address, which can be useful for 1833 | * example when all processes inside a cgroup should use one 1834 | * single IP address on a host that has multiple IP configured. 1835 | * 1836 | * This helper works for IPv4 and IPv6, TCP and UDP sockets. The 1837 | * domain (*addr*\ **->sa_family**) must be **AF_INET** (or 1838 | * **AF_INET6**). Looking for a free port to bind to can be 1839 | * expensive, therefore binding to port is not permitted by the 1840 | * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) 1841 | * must be set to zero. 1842 | * Return 1843 | * 0 on success, or a negative error in case of failure. 1844 | * 1845 | * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) 1846 | * Description 1847 | * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is 1848 | * only possible to shrink the packet as of this writing, 1849 | * therefore *delta* must be a negative integer. 1850 | * 1851 | * A call to this helper is susceptible to change the underlaying 1852 | * packet buffer. Therefore, at load time, all checks on pointers 1853 | * previously done by the verifier are invalidated and must be 1854 | * performed again, if the helper is used in combination with 1855 | * direct packet access. 
1856 | * Return 1857 | * 0 on success, or a negative error in case of failure. 1858 | * 1859 | * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) 1860 | * Description 1861 | * Retrieve the XFRM state (IP transform framework, see also 1862 | * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. 1863 | * 1864 | * The retrieved value is stored in the **struct bpf_xfrm_state** 1865 | * pointed by *xfrm_state* and of length *size*. 1866 | * 1867 | * All values for *flags* are reserved for future usage, and must 1868 | * be left at zero. 1869 | * 1870 | * This helper is available only if the kernel was compiled with 1871 | * **CONFIG_XFRM** configuration option. 1872 | * Return 1873 | * 0 on success, or a negative error in case of failure. 1874 | * 1875 | * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) 1876 | * Description 1877 | * Return a user or a kernel stack in bpf program provided buffer. 1878 | * To achieve this, the helper needs *ctx*, which is a pointer 1879 | * to the context on which the tracing program is executed. 1880 | * To store the stacktrace, the bpf program provides *buf* with 1881 | * a nonnegative *size*. 1882 | * 1883 | * The last argument, *flags*, holds the number of stack frames to 1884 | * skip (from 0 to 255), masked with 1885 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 1886 | * the following flags: 1887 | * 1888 | * **BPF_F_USER_STACK** 1889 | * Collect a user space stack instead of a kernel stack. 1890 | * **BPF_F_USER_BUILD_ID** 1891 | * Collect buildid+offset instead of ips for user stack, 1892 | * only valid if **BPF_F_USER_STACK** is also specified. 1893 | * 1894 | * **bpf_get_stack**\ () can collect up to 1895 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject 1896 | * to sufficient large buffer size. 
Note that 1897 | * this limit can be controlled with the **sysctl** program, and 1898 | * that it should be manually increased in order to profile long 1899 | * user stacks (such as stacks for Java programs). To do so, use: 1900 | * 1901 | * :: 1902 | * 1903 | * # sysctl kernel.perf_event_max_stack= 1904 | * Return 1905 | * A non-negative value equal to or less than *size* on success, 1906 | * or a negative error in case of failure. 1907 | * 1908 | * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) 1909 | * Description 1910 | * This helper is similar to **bpf_skb_load_bytes**\ () in that 1911 | * it provides an easy way to load *len* bytes from *offset* 1912 | * from the packet associated to *skb*, into the buffer pointed 1913 | * by *to*. The difference to **bpf_skb_load_bytes**\ () is that 1914 | * a fifth argument *start_header* exists in order to select a 1915 | * base offset to start from. *start_header* can be one of: 1916 | * 1917 | * **BPF_HDR_START_MAC** 1918 | * Base offset to load data from is *skb*'s mac header. 1919 | * **BPF_HDR_START_NET** 1920 | * Base offset to load data from is *skb*'s network header. 1921 | * 1922 | * In general, "direct packet access" is the preferred method to 1923 | * access packet data, however, this helper is in particular useful 1924 | * in socket filters where *skb*\ **->data** does not always point 1925 | * to the start of the mac header and where "direct packet access" 1926 | * is not available. 1927 | * Return 1928 | * 0 on success, or a negative error in case of failure. 1929 | * 1930 | * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) 1931 | * Description 1932 | * Do FIB lookup in kernel tables using parameters in *params*. 1933 | * If lookup is successful and result shows packet is to be 1934 | * forwarded, the neighbor tables are searched for the nexthop. 
1935 | * If successful (ie., FIB lookup shows forwarding and nexthop 1936 | * is resolved), the nexthop address is returned in ipv4_dst 1937 | * or ipv6_dst based on family, smac is set to mac address of 1938 | * egress device, dmac is set to nexthop mac address, rt_metric 1939 | * is set to metric from route (IPv4/IPv6 only), and ifindex 1940 | * is set to the device index of the nexthop from the FIB lookup. 1941 | * 1942 | * *plen* argument is the size of the passed in struct. 1943 | * *flags* argument can be a combination of one or more of the 1944 | * following values: 1945 | * 1946 | * **BPF_FIB_LOOKUP_DIRECT** 1947 | * Do a direct table lookup vs full lookup using FIB 1948 | * rules. 1949 | * **BPF_FIB_LOOKUP_OUTPUT** 1950 | * Perform lookup from an egress perspective (default is 1951 | * ingress). 1952 | * 1953 | * *ctx* is either **struct xdp_md** for XDP programs or 1954 | * **struct sk_buff** tc cls_act programs. 1955 | * Return 1956 | * * < 0 if any input argument is invalid 1957 | * * 0 on success (packet is forwarded, nexthop neighbor exists) 1958 | * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the 1959 | * packet is not forwarded or needs assist from full stack 1960 | * 1961 | * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) 1962 | * Description 1963 | * Add an entry to, or update a sockhash *map* referencing sockets. 1964 | * The *skops* is used as a new value for the entry associated to 1965 | * *key*. *flags* is one of: 1966 | * 1967 | * **BPF_NOEXIST** 1968 | * The entry for *key* must not exist in the map. 1969 | * **BPF_EXIST** 1970 | * The entry for *key* must already exist in the map. 1971 | * **BPF_ANY** 1972 | * No condition on the existence of the entry for *key*. 1973 | * 1974 | * If the *map* has eBPF programs (parser and verdict), those will 1975 | * be inherited by the socket being added. 
If the socket is
 *		already attached to eBPF programs, this results in an error.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
 * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
 * 	Description
 * 		This helper is used in programs implementing policies at the
 * 		socket level. If the message *msg* is allowed to pass (i.e. if
 * 		the verdict eBPF program returns **SK_PASS**), redirect it to
 * 		the socket referenced by *map* (of type
 * 		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
 * 		egress interfaces can be used for redirection. The
 * 		**BPF_F_INGRESS** value in *flags* is used to make the
 * 		distinction (ingress path is selected if the flag is present,
 * 		egress path otherwise). This is the only flag supported for now.
 * 	Return
 * 		**SK_PASS** on success, or **SK_DROP** on error.
 *
 * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
 * 	Description
 * 		This helper is used in programs implementing policies at the
 * 		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
 * 		if the verdict eBPF program returns **SK_PASS**), redirect it
 * 		to the socket referenced by *map* (of type
 * 		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
 * 		egress interfaces can be used for redirection. The
 * 		**BPF_F_INGRESS** value in *flags* is used to make the
 * 		distinction (ingress path is selected if the flag is present,
 * 		egress otherwise). This is the only flag supported for now.
 * 	Return
 * 		**SK_PASS** on success, or **SK_DROP** on error.
 *
 * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
 * 	Description
 * 		Encapsulate the packet associated to *skb* within a Layer 3
 * 		protocol header.
This header is provided in the buffer at 2012 | * address *hdr*, with *len* its size in bytes. *type* indicates 2013 | * the protocol of the header and can be one of: 2014 | * 2015 | * **BPF_LWT_ENCAP_SEG6** 2016 | * IPv6 encapsulation with Segment Routing Header 2017 | * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, 2018 | * the IPv6 header is computed by the kernel. 2019 | * **BPF_LWT_ENCAP_SEG6_INLINE** 2020 | * Only works if *skb* contains an IPv6 packet. Insert a 2021 | * Segment Routing Header (**struct ipv6_sr_hdr**) inside 2022 | * the IPv6 header. 2023 | * **BPF_LWT_ENCAP_IP** 2024 | * IP encapsulation (GRE/GUE/IPIP/etc). The outer header 2025 | * must be IPv4 or IPv6, followed by zero or more 2026 | * additional headers, up to LWT_BPF_MAX_HEADROOM total 2027 | * bytes in all prepended headers. Please note that 2028 | * if skb_is_gso(skb) is true, no more than two headers 2029 | * can be prepended, and the inner header, if present, 2030 | * should be either GRE or UDP/GUE. 2031 | * 2032 | * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of 2033 | * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called 2034 | * by bpf programs of types BPF_PROG_TYPE_LWT_IN and 2035 | * BPF_PROG_TYPE_LWT_XMIT. 2036 | * 2037 | * A call to this helper is susceptible to change the underlaying 2038 | * packet buffer. Therefore, at load time, all checks on pointers 2039 | * previously done by the verifier are invalidated and must be 2040 | * performed again, if the helper is used in combination with 2041 | * direct packet access. 2042 | * Return 2043 | * 0 on success, or a negative error in case of failure. 2044 | * 2045 | * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) 2046 | * Description 2047 | * Store *len* bytes from address *from* into the packet 2048 | * associated to *skb*, at *offset*. 
Only the flags, tag and TLVs 2049 | * inside the outermost IPv6 Segment Routing Header can be 2050 | * modified through this helper. 2051 | * 2052 | * A call to this helper is susceptible to change the underlaying 2053 | * packet buffer. Therefore, at load time, all checks on pointers 2054 | * previously done by the verifier are invalidated and must be 2055 | * performed again, if the helper is used in combination with 2056 | * direct packet access. 2057 | * Return 2058 | * 0 on success, or a negative error in case of failure. 2059 | * 2060 | * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) 2061 | * Description 2062 | * Adjust the size allocated to TLVs in the outermost IPv6 2063 | * Segment Routing Header contained in the packet associated to 2064 | * *skb*, at position *offset* by *delta* bytes. Only offsets 2065 | * after the segments are accepted. *delta* can be as well 2066 | * positive (growing) as negative (shrinking). 2067 | * 2068 | * A call to this helper is susceptible to change the underlaying 2069 | * packet buffer. Therefore, at load time, all checks on pointers 2070 | * previously done by the verifier are invalidated and must be 2071 | * performed again, if the helper is used in combination with 2072 | * direct packet access. 2073 | * Return 2074 | * 0 on success, or a negative error in case of failure. 2075 | * 2076 | * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) 2077 | * Description 2078 | * Apply an IPv6 Segment Routing action of type *action* to the 2079 | * packet associated to *skb*. Each action takes a parameter 2080 | * contained at address *param*, and of length *param_len* bytes. 2081 | * *action* can be one of: 2082 | * 2083 | * **SEG6_LOCAL_ACTION_END_X** 2084 | * End.X action: Endpoint with Layer-3 cross-connect. 2085 | * Type of *param*: **struct in6_addr**. 2086 | * **SEG6_LOCAL_ACTION_END_T** 2087 | * End.T action: Endpoint with specific IPv6 table lookup. 
2088 | * Type of *param*: **int**. 2089 | * **SEG6_LOCAL_ACTION_END_B6** 2090 | * End.B6 action: Endpoint bound to an SRv6 policy. 2091 | * Type of param: **struct ipv6_sr_hdr**. 2092 | * **SEG6_LOCAL_ACTION_END_B6_ENCAP** 2093 | * End.B6.Encap action: Endpoint bound to an SRv6 2094 | * encapsulation policy. 2095 | * Type of param: **struct ipv6_sr_hdr**. 2096 | * 2097 | * A call to this helper is susceptible to change the underlaying 2098 | * packet buffer. Therefore, at load time, all checks on pointers 2099 | * previously done by the verifier are invalidated and must be 2100 | * performed again, if the helper is used in combination with 2101 | * direct packet access. 2102 | * Return 2103 | * 0 on success, or a negative error in case of failure. 2104 | * 2105 | * int bpf_rc_repeat(void *ctx) 2106 | * Description 2107 | * This helper is used in programs implementing IR decoding, to 2108 | * report a successfully decoded repeat key message. This delays 2109 | * the generation of a key up event for previously generated 2110 | * key down event. 2111 | * 2112 | * Some IR protocols like NEC have a special IR message for 2113 | * repeating last button, for when a button is held down. 2114 | * 2115 | * The *ctx* should point to the lirc sample as passed into 2116 | * the program. 2117 | * 2118 | * This helper is only available is the kernel was compiled with 2119 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 2120 | * "**y**". 2121 | * Return 2122 | * 0 2123 | * 2124 | * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) 2125 | * Description 2126 | * This helper is used in programs implementing IR decoding, to 2127 | * report a successfully decoded key press with *scancode*, 2128 | * *toggle* value in the given *protocol*. The scancode will be 2129 | * translated to a keycode using the rc keymap, and reported as 2130 | * an input key down event. After a period a key up event is 2131 | * generated. 
This period can be extended by calling either 2132 | * **bpf_rc_keydown**\ () again with the same values, or calling 2133 | * **bpf_rc_repeat**\ (). 2134 | * 2135 | * Some protocols include a toggle bit, in case the button was 2136 | * released and pressed again between consecutive scancodes. 2137 | * 2138 | * The *ctx* should point to the lirc sample as passed into 2139 | * the program. 2140 | * 2141 | * The *protocol* is the decoded protocol number (see 2142 | * **enum rc_proto** for some predefined values). 2143 | * 2144 | * This helper is only available is the kernel was compiled with 2145 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 2146 | * "**y**". 2147 | * Return 2148 | * 0 2149 | * 2150 | * u64 bpf_skb_cgroup_id(struct sk_buff *skb) 2151 | * Description 2152 | * Return the cgroup v2 id of the socket associated with the *skb*. 2153 | * This is roughly similar to the **bpf_get_cgroup_classid**\ () 2154 | * helper for cgroup v1 by providing a tag resp. identifier that 2155 | * can be matched on or used for map lookups e.g. to implement 2156 | * policy. The cgroup v2 id of a given path in the hierarchy is 2157 | * exposed in user space through the f_handle API in order to get 2158 | * to the same 64-bit id. 2159 | * 2160 | * This helper can be used on TC egress path, but not on ingress, 2161 | * and is available only if the kernel was compiled with the 2162 | * **CONFIG_SOCK_CGROUP_DATA** configuration option. 2163 | * Return 2164 | * The id is returned or 0 in case the id could not be retrieved. 2165 | * 2166 | * u64 bpf_get_current_cgroup_id(void) 2167 | * Return 2168 | * A 64-bit integer containing the current cgroup id based 2169 | * on the cgroup within which the current task is running. 2170 | * 2171 | * void *bpf_get_local_storage(void *map, u64 flags) 2172 | * Description 2173 | * Get the pointer to the local storage area. 2174 | * The type and the size of the local storage is defined 2175 | * by the *map* argument. 
2176 | * The *flags* meaning is specific for each map type, 2177 | * and has to be 0 for cgroup local storage. 2178 | * 2179 | * Depending on the BPF program type, a local storage area 2180 | * can be shared between multiple instances of the BPF program, 2181 | * running simultaneously. 2182 | * 2183 | * A user should care about the synchronization by himself. 2184 | * For example, by using the **BPF_STX_XADD** instruction to alter 2185 | * the shared data. 2186 | * Return 2187 | * A pointer to the local storage area. 2188 | * 2189 | * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) 2190 | * Description 2191 | * Select a **SO_REUSEPORT** socket from a 2192 | * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. 2193 | * It checks the selected socket is matching the incoming 2194 | * request in the socket buffer. 2195 | * Return 2196 | * 0 on success, or a negative error in case of failure. 2197 | * 2198 | * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) 2199 | * Description 2200 | * Return id of cgroup v2 that is ancestor of cgroup associated 2201 | * with the *skb* at the *ancestor_level*. The root cgroup is at 2202 | * *ancestor_level* zero and each step down the hierarchy 2203 | * increments the level. If *ancestor_level* == level of cgroup 2204 | * associated with *skb*, then return value will be same as that 2205 | * of **bpf_skb_cgroup_id**\ (). 2206 | * 2207 | * The helper is useful to implement policies based on cgroups 2208 | * that are upper in hierarchy than immediate cgroup associated 2209 | * with *skb*. 2210 | * 2211 | * The format of returned id and helper limitations are same as in 2212 | * **bpf_skb_cgroup_id**\ (). 2213 | * Return 2214 | * The id is returned or 0 in case the id could not be retrieved. 
2215 | * 2216 | * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) 2217 | * Description 2218 | * Look for TCP socket matching *tuple*, optionally in a child 2219 | * network namespace *netns*. The return value must be checked, 2220 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2221 | * 2222 | * The *ctx* should point to the context of the program, such as 2223 | * the skb or socket (depending on the hook in use). This is used 2224 | * to determine the base network namespace for the lookup. 2225 | * 2226 | * *tuple_size* must be one of: 2227 | * 2228 | * **sizeof**\ (*tuple*\ **->ipv4**) 2229 | * Look for an IPv4 socket. 2230 | * **sizeof**\ (*tuple*\ **->ipv6**) 2231 | * Look for an IPv6 socket. 2232 | * 2233 | * If the *netns* is a negative signed 32-bit integer, then the 2234 | * socket lookup table in the netns associated with the *ctx* will 2235 | * will be used. For the TC hooks, this is the netns of the device 2236 | * in the skb. For socket hooks, this is the netns of the socket. 2237 | * If *netns* is any other signed 32-bit value greater than or 2238 | * equal to zero then it specifies the ID of the netns relative to 2239 | * the netns associated with the *ctx*. *netns* values beyond the 2240 | * range of 32-bit integers are reserved for future use. 2241 | * 2242 | * All values for *flags* are reserved for future usage, and must 2243 | * be left at zero. 2244 | * 2245 | * This helper is available only if the kernel was compiled with 2246 | * **CONFIG_NET** configuration option. 2247 | * Return 2248 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2249 | * For sockets with reuseport option, the **struct bpf_sock** 2250 | * result is from **reuse->socks**\ [] using the hash of the tuple. 
2251 | * 2252 | * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) 2253 | * Description 2254 | * Look for UDP socket matching *tuple*, optionally in a child 2255 | * network namespace *netns*. The return value must be checked, 2256 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2257 | * 2258 | * The *ctx* should point to the context of the program, such as 2259 | * the skb or socket (depending on the hook in use). This is used 2260 | * to determine the base network namespace for the lookup. 2261 | * 2262 | * *tuple_size* must be one of: 2263 | * 2264 | * **sizeof**\ (*tuple*\ **->ipv4**) 2265 | * Look for an IPv4 socket. 2266 | * **sizeof**\ (*tuple*\ **->ipv6**) 2267 | * Look for an IPv6 socket. 2268 | * 2269 | * If the *netns* is a negative signed 32-bit integer, then the 2270 | * socket lookup table in the netns associated with the *ctx* will 2271 | * will be used. For the TC hooks, this is the netns of the device 2272 | * in the skb. For socket hooks, this is the netns of the socket. 2273 | * If *netns* is any other signed 32-bit value greater than or 2274 | * equal to zero then it specifies the ID of the netns relative to 2275 | * the netns associated with the *ctx*. *netns* values beyond the 2276 | * range of 32-bit integers are reserved for future use. 2277 | * 2278 | * All values for *flags* are reserved for future usage, and must 2279 | * be left at zero. 2280 | * 2281 | * This helper is available only if the kernel was compiled with 2282 | * **CONFIG_NET** configuration option. 2283 | * Return 2284 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2285 | * For sockets with reuseport option, the **struct bpf_sock** 2286 | * result is from **reuse->socks**\ [] using the hash of the tuple. 2287 | * 2288 | * int bpf_sk_release(struct bpf_sock *sock) 2289 | * Description 2290 | * Release the reference held by *sock*. 
*sock* must be a 2291 | * non-**NULL** pointer that was returned from 2292 | * **bpf_sk_lookup_xxx**\ (). 2293 | * Return 2294 | * 0 on success, or a negative error in case of failure. 2295 | * 2296 | * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) 2297 | * Description 2298 | * Push an element *value* in *map*. *flags* is one of: 2299 | * 2300 | * **BPF_EXIST** 2301 | * If the queue/stack is full, the oldest element is 2302 | * removed to make room for this. 2303 | * Return 2304 | * 0 on success, or a negative error in case of failure. 2305 | * 2306 | * int bpf_map_pop_elem(struct bpf_map *map, void *value) 2307 | * Description 2308 | * Pop an element from *map*. 2309 | * Return 2310 | * 0 on success, or a negative error in case of failure. 2311 | * 2312 | * int bpf_map_peek_elem(struct bpf_map *map, void *value) 2313 | * Description 2314 | * Get an element from *map* without removing it. 2315 | * Return 2316 | * 0 on success, or a negative error in case of failure. 2317 | * 2318 | * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) 2319 | * Description 2320 | * For socket policies, insert *len* bytes into *msg* at offset 2321 | * *start*. 2322 | * 2323 | * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 2324 | * *msg* it may want to insert metadata or options into the *msg*. 2325 | * This can later be read and used by any of the lower layer BPF 2326 | * hooks. 2327 | * 2328 | * This helper may fail if under memory pressure (a malloc 2329 | * fails) in these cases BPF programs will get an appropriate 2330 | * error and BPF programs will need to handle them. 2331 | * Return 2332 | * 0 on success, or a negative error in case of failure. 2333 | * 2334 | * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) 2335 | * Description 2336 | * Will remove *pop* bytes from a *msg* starting at byte *start*. 
 *		This may result in **ENOMEM** errors under certain situations if
 *		an allocation and copy are required due to a full ring buffer.
 *		However, the helper will try to avoid doing the allocation
 *		if possible. Other errors can occur if input parameters are
 *		invalid either due to *start* byte not being valid part of *msg*
 *		payload and/or *pop* value being too large.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
 * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
 *	Description
 *		This helper is used in programs implementing IR decoding, to
 *		report a successfully decoded pointer movement.
 *
 *		The *ctx* should point to the lirc sample as passed into
 *		the program.
 *
 *		This helper is only available if the kernel was compiled with
 *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
 *		"**y**".
 *	Return
 *		0
 *
 * int bpf_spin_lock(struct bpf_spin_lock *lock)
 *	Description
 *		Acquire a spinlock represented by the pointer *lock*, which is
 *		stored as part of a value of a map. Taking the lock allows to
 *		safely update the rest of the fields in that value. The
 *		spinlock can (and must) later be released with a call to
 *		**bpf_spin_unlock**\ (\ *lock*\ ).
 *
 *		Spinlocks in BPF programs come with a number of restrictions
 *		and constraints:
 *
 *		* **bpf_spin_lock** objects are only allowed inside maps of
 *		  types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
 *		  list could be extended in the future).
 *		* BTF description of the map is mandatory.
 *		* The BPF program can take ONE lock at a time, since taking two
 *		  or more could cause dead locks.
 *		* Only one **struct bpf_spin_lock** is allowed per map element.
2378 | * * When the lock is taken, calls (either BPF to BPF or helpers) 2379 | * are not allowed. 2380 | * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not 2381 | * allowed inside a spinlock-ed region. 2382 | * * The BPF program MUST call **bpf_spin_unlock**\ () to release 2383 | * the lock, on all execution paths, before it returns. 2384 | * * The BPF program can access **struct bpf_spin_lock** only via 2385 | * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () 2386 | * helpers. Loading or storing data into the **struct 2387 | * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. 2388 | * * To use the **bpf_spin_lock**\ () helper, the BTF description 2389 | * of the map value must be a struct and have **struct 2390 | * bpf_spin_lock** *anyname*\ **;** field at the top level. 2391 | * Nested lock inside another struct is not allowed. 2392 | * * The **struct bpf_spin_lock** *lock* field in a map value must 2393 | * be aligned on a multiple of 4 bytes in that value. 2394 | * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy 2395 | * the **bpf_spin_lock** field to user space. 2396 | * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from 2397 | * a BPF program, do not update the **bpf_spin_lock** field. 2398 | * * **bpf_spin_lock** cannot be on the stack or inside a 2399 | * networking packet (it can only be inside of a map values). 2400 | * * **bpf_spin_lock** is available to root only. 2401 | * * Tracing programs and socket filter programs cannot use 2402 | * **bpf_spin_lock**\ () due to insufficient preemption checks 2403 | * (but this may change in the future). 2404 | * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. 2405 | * Return 2406 | * 0 2407 | * 2408 | * int bpf_spin_unlock(struct bpf_spin_lock *lock) 2409 | * Description 2410 | * Release the *lock* previously locked by a call to 2411 | * **bpf_spin_lock**\ (\ *lock*\ ). 
2412 | * Return 2413 | * 0 2414 | * 2415 | * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) 2416 | * Description 2417 | * This helper gets a **struct bpf_sock** pointer such 2418 | * that all the fields in this **bpf_sock** can be accessed. 2419 | * Return 2420 | * A **struct bpf_sock** pointer on success, or **NULL** in 2421 | * case of failure. 2422 | * 2423 | * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) 2424 | * Description 2425 | * This helper gets a **struct bpf_tcp_sock** pointer from a 2426 | * **struct bpf_sock** pointer. 2427 | * Return 2428 | * A **struct bpf_tcp_sock** pointer on success, or **NULL** in 2429 | * case of failure. 2430 | * 2431 | * int bpf_skb_ecn_set_ce(struct sk_buf *skb) 2432 | * Description 2433 | * Set ECN (Explicit Congestion Notification) field of IP header 2434 | * to **CE** (Congestion Encountered) if current value is **ECT** 2435 | * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 2436 | * and IPv4. 2437 | * Return 2438 | * 1 if the **CE** flag is set (either by the current helper call 2439 | * or because it was already present), 0 if it is not set. 2440 | * 2441 | * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) 2442 | * Description 2443 | * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. 2444 | * **bpf_sk_release**\ () is unnecessary and not allowed. 2445 | * Return 2446 | * A **struct bpf_sock** pointer on success, or **NULL** in 2447 | * case of failure. 2448 | * 2449 | * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) 2450 | * Description 2451 | * Look for TCP socket matching *tuple*, optionally in a child 2452 | * network namespace *netns*. The return value must be checked, 2453 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2454 | * 2455 | * This function is identical to bpf_sk_lookup_tcp, except that it 2456 | * also returns timewait or request sockets. 
Use bpf_sk_fullsock 2457 | * or bpf_tcp_socket to access the full structure. 2458 | * 2459 | * This helper is available only if the kernel was compiled with 2460 | * **CONFIG_NET** configuration option. 2461 | * Return 2462 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2463 | * For sockets with reuseport option, the **struct bpf_sock** 2464 | * result is from **reuse->socks**\ [] using the hash of the tuple. 2465 | * 2466 | * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) 2467 | * Description 2468 | * Check whether iph and th contain a valid SYN cookie ACK for 2469 | * the listening socket in sk. 2470 | * 2471 | * iph points to the start of the IPv4 or IPv6 header, while 2472 | * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). 2473 | * 2474 | * th points to the start of the TCP header, while th_len contains 2475 | * sizeof(struct tcphdr). 2476 | * 2477 | * Return 2478 | * 0 if iph and th are a valid SYN cookie ACK, or a negative error 2479 | * otherwise. 
 */

/* NOTE(review): this content matches the Linux UAPI <linux/bpf.h> header
 * (vendored copy — see headers/linux/bpf.h). All values and struct layouts
 * below are kernel ABI; keep them byte-compatible with the upstream header.
 */

/* X-macro listing every eBPF helper function; expanded with __BPF_ENUM_FN
 * below to generate enum bpf_func_id.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
	FN(map_lookup_elem),		\
	FN(map_update_elem),		\
	FN(map_delete_elem),		\
	FN(probe_read),			\
	FN(ktime_get_ns),		\
	FN(trace_printk),		\
	FN(get_prandom_u32),		\
	FN(get_smp_processor_id),	\
	FN(skb_store_bytes),		\
	FN(l3_csum_replace),		\
	FN(l4_csum_replace),		\
	FN(tail_call),			\
	FN(clone_redirect),		\
	FN(get_current_pid_tgid),	\
	FN(get_current_uid_gid),	\
	FN(get_current_comm),		\
	FN(get_cgroup_classid),		\
	FN(skb_vlan_push),		\
	FN(skb_vlan_pop),		\
	FN(skb_get_tunnel_key),		\
	FN(skb_set_tunnel_key),		\
	FN(perf_event_read),		\
	FN(redirect),			\
	FN(get_route_realm),		\
	FN(perf_event_output),		\
	FN(skb_load_bytes),		\
	FN(get_stackid),		\
	FN(csum_diff),			\
	FN(skb_get_tunnel_opt),		\
	FN(skb_set_tunnel_opt),		\
	FN(skb_change_proto),		\
	FN(skb_change_type),		\
	FN(skb_under_cgroup),		\
	FN(get_hash_recalc),		\
	FN(get_current_task),		\
	FN(probe_write_user),		\
	FN(current_task_under_cgroup),	\
	FN(skb_change_tail),		\
	FN(skb_pull_data),		\
	FN(csum_update),		\
	FN(set_hash_invalid),		\
	FN(get_numa_node_id),		\
	FN(skb_change_head),		\
	FN(xdp_adjust_head),		\
	FN(probe_read_str),		\
	FN(get_socket_cookie),		\
	FN(get_socket_uid),		\
	FN(set_hash),			\
	FN(setsockopt),			\
	FN(skb_adjust_room),		\
	FN(redirect_map),		\
	FN(sk_redirect_map),		\
	FN(sock_map_update),		\
	FN(xdp_adjust_meta),		\
	FN(perf_event_read_value),	\
	FN(perf_prog_read_value),	\
	FN(getsockopt),			\
	FN(override_return),		\
	FN(sock_ops_cb_flags_set),	\
	FN(msg_redirect_map),		\
	FN(msg_apply_bytes),		\
	FN(msg_cork_bytes),		\
	FN(msg_pull_data),		\
	FN(bind),			\
	FN(xdp_adjust_tail),		\
	FN(skb_get_xfrm_state),		\
	FN(get_stack),			\
	FN(skb_load_bytes_relative),	\
	FN(fib_lookup),			\
	FN(sock_hash_update),		\
	FN(msg_redirect_hash),		\
	FN(sk_redirect_hash),		\
	FN(lwt_push_encap),		\
	FN(lwt_seg6_store_bytes),	\
	FN(lwt_seg6_adjust_srh),	\
	FN(lwt_seg6_action),		\
	FN(rc_repeat),			\
	FN(rc_keydown),			\
	FN(skb_cgroup_id),		\
	FN(get_current_cgroup_id),	\
	FN(get_local_storage),		\
	FN(sk_select_reuseport),	\
	FN(skb_ancestor_cgroup_id),	\
	FN(sk_lookup_tcp),		\
	FN(sk_lookup_udp),		\
	FN(sk_release),			\
	FN(map_push_elem),		\
	FN(map_pop_elem),		\
	FN(map_peek_elem),		\
	FN(msg_push_data),		\
	FN(msg_pop_data),		\
	FN(rc_pointer_rel),		\
	FN(spin_lock),			\
	FN(spin_unlock),		\
	FN(sk_fullsock),		\
	FN(tcp_sock),			\
	FN(skb_ecn_set_ce),		\
	FN(get_listener_sock),		\
	FN(skc_lookup_tcp),		\
	FN(tcp_check_syncookie),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
 */
#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x
enum bpf_func_id {
	__BPF_FUNC_MAPPER(__BPF_ENUM_FN)
	__BPF_FUNC_MAX_ID,
};
#undef __BPF_ENUM_FN

/* All flags used by eBPF helper functions, placed here. */

/* BPF_FUNC_skb_store_bytes flags. */
#define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
#define BPF_F_INVALIDATE_HASH		(1ULL << 1)

/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
 * First 4 bits are for passing the header field size.
 */
#define BPF_F_HDR_FIELD_MASK		0xfULL

/* BPF_FUNC_l4_csum_replace flags. */
#define BPF_F_PSEUDO_HDR		(1ULL << 4)
#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
#define BPF_F_MARK_ENFORCE		(1ULL << 6)

/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
#define BPF_F_INGRESS			(1ULL << 0)

/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
#define BPF_F_TUNINFO_IPV6		(1ULL << 0)

/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
#define BPF_F_SKIP_FIELD_MASK		0xffULL
#define BPF_F_USER_STACK		(1ULL << 8)
/* flags used by BPF_FUNC_get_stackid only. */
#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
#define BPF_F_REUSE_STACKID		(1ULL << 10)
/* flags used by BPF_FUNC_get_stack only. */
#define BPF_F_USER_BUILD_ID		(1ULL << 11)

/* BPF_FUNC_skb_set_tunnel_key flags. */
#define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
#define BPF_F_SEQ_NUMBER		(1ULL << 3)

/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
 * BPF_FUNC_perf_event_read_value flags.
 */
#define BPF_F_INDEX_MASK		0xffffffffULL
#define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
/* BPF_FUNC_perf_event_output for sk_buff input context. */
#define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)

/* Current network namespace */
#define BPF_F_CURRENT_NETNS		(-1L)

/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)

#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)

/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
	BPF_ADJ_ROOM_NET,
	BPF_ADJ_ROOM_MAC,
};

/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
enum bpf_hdr_start_off {
	BPF_HDR_START_MAC,
	BPF_HDR_START_NET,
};

/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
enum bpf_lwt_encap_mode {
	BPF_LWT_ENCAP_SEG6,
	BPF_LWT_ENCAP_SEG6_INLINE,
	BPF_LWT_ENCAP_IP,
};

/* Wraps a context pointer in a union padded to 64 bits so the struct layout
 * is identical on 32- and 64-bit user space.
 */
#define __bpf_md_ptr(type, name)	\
union {					\
	type name;			\
	__u64 :64;			\
} __attribute__((aligned(8)))

/* user accessible mirror of in-kernel sk_buff.
 * new fields can only be added to the end of this structure
 */
struct __sk_buff {
	__u32 len;
	__u32 pkt_type;
	__u32 mark;
	__u32 queue_mapping;
	__u32 protocol;
	__u32 vlan_present;
	__u32 vlan_tci;
	__u32 vlan_proto;
	__u32 priority;
	__u32 ingress_ifindex;
	__u32 ifindex;
	__u32 tc_index;
	__u32 cb[5];
	__u32 hash;
	__u32 tc_classid;
	__u32 data;
	__u32 data_end;
	__u32 napi_id;

	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
	__u32 family;
	__u32 remote_ip4;	/* Stored in network byte order */
	__u32 local_ip4;	/* Stored in network byte order */
	__u32 remote_ip6[4];	/* Stored in network byte order */
	__u32 local_ip6[4];	/* Stored in network byte order */
	__u32 remote_port;	/* Stored in network byte order */
	__u32 local_port;	/* stored in host byte order */
	/* ... here. */

	__u32 data_meta;
	__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
	__u64 tstamp;
	__u32 wire_len;
	__u32 gso_segs;
	__bpf_md_ptr(struct bpf_sock *, sk);
};

struct bpf_tunnel_key {
	__u32 tunnel_id;
	union {
		__u32 remote_ipv4;
		__u32 remote_ipv6[4];
	};
	__u8 tunnel_tos;
	__u8 tunnel_ttl;
	__u16 tunnel_ext;	/* Padding, future use. */
	__u32 tunnel_label;
};

/* user accessible mirror of in-kernel xfrm_state.
 * new fields can only be added to the end of this structure
 */
struct bpf_xfrm_state {
	__u32 reqid;
	__u32 spi;	/* Stored in network byte order */
	__u16 family;
	__u16 ext;	/* Padding, future use. */
	union {
		__u32 remote_ipv4;	/* Stored in network byte order */
		__u32 remote_ipv6[4];	/* Stored in network byte order */
	};
};

/* Generic BPF return codes which all BPF program types may support.
 * The values are binary compatible with their TC_ACT_* counter-part to
 * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
 * programs.
 *
 * XDP is handled separately, see XDP_*.
 */
enum bpf_ret_code {
	BPF_OK = 0,
	/* 1 reserved */
	BPF_DROP = 2,
	/* 3-6 reserved */
	BPF_REDIRECT = 7,
	/* >127 are reserved for prog type specific return codes.
	 *
	 * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
	 *    BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
	 *    changed and should be routed based on its new L3 header.
	 *    (This is an L3 redirect, as opposed to L2 redirect
	 *    represented by BPF_REDIRECT above).
	 */
	BPF_LWT_REROUTE = 128,
};

struct bpf_sock {
	__u32 bound_dev_if;
	__u32 family;
	__u32 type;
	__u32 protocol;
	__u32 mark;
	__u32 priority;
	/* IP address also allows 1 and 2 bytes access */
	__u32 src_ip4;
	__u32 src_ip6[4];
	__u32 src_port;		/* host byte order */
	__u32 dst_port;		/* network byte order */
	__u32 dst_ip4;
	__u32 dst_ip6[4];
	__u32 state;
};

struct bpf_tcp_sock {
	__u32 snd_cwnd;		/* Sending congestion window */
	__u32 srtt_us;		/* smoothed round trip time << 3 in usecs */
	__u32 rtt_min;
	__u32 snd_ssthresh;	/* Slow start size threshold */
	__u32 rcv_nxt;		/* What we want to receive next */
	__u32 snd_nxt;		/* Next sequence we send */
	__u32 snd_una;		/* First byte we want an ack for */
	__u32 mss_cache;	/* Cached effective mss, not including SACKS */
	__u32 ecn_flags;	/* ECN status bits. */
	__u32 rate_delivered;	/* saved rate sample: packets delivered */
	__u32 rate_interval_us;	/* saved rate sample: time elapsed */
	__u32 packets_out;	/* Packets which are "in flight" */
	__u32 retrans_out;	/* Retransmitted packets out */
	__u32 total_retrans;	/* Total retransmits for entire connection */
	__u32 segs_in;		/* RFC4898 tcpEStatsPerfSegsIn
				 * total number of segments in.
				 */
	__u32 data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
				 * total number of data segments in.
				 */
	__u32 segs_out;		/* RFC4898 tcpEStatsPerfSegsOut
				 * The total number of segments sent.
				 */
	__u32 data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
				 * total number of data segments sent.
				 */
	__u32 lost_out;		/* Lost packets */
	__u32 sacked_out;	/* SACK'd packets */
	__u64 bytes_received;	/* RFC4898 tcpEStatsAppHCThruOctetsReceived
				 * sum(delta(rcv_nxt)), or how many bytes
				 * were acked.
				 */
	__u64 bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
				 * sum(delta(snd_una)), or how many bytes
				 * were acked.
				 */
};

/* Socket 4-tuple used by the sk_lookup_* / skc_lookup_* helpers. */
struct bpf_sock_tuple {
	union {
		struct {
			__be32 saddr;
			__be32 daddr;
			__be16 sport;
			__be16 dport;
		} ipv4;
		struct {
			__be32 saddr[4];
			__be32 daddr[4];
			__be16 sport;
			__be16 dport;
		} ipv6;
	};
};

#define XDP_PACKET_HEADROOM 256

/* User return codes for XDP prog type.
 * A valid XDP program must return one of these defined values. All other
 * return codes are reserved for future use. Unknown return codes will
 * result in packet drops and a warning via bpf_warn_invalid_xdp_action().
2844 | */ 2845 | enum xdp_action { 2846 | XDP_ABORTED = 0, 2847 | XDP_DROP, 2848 | XDP_PASS, 2849 | XDP_TX, 2850 | XDP_REDIRECT, 2851 | }; 2852 | 2853 | /* user accessible metadata for XDP packet hook 2854 | * new fields must be added to the end of this structure 2855 | */ 2856 | struct xdp_md { 2857 | __u32 data; 2858 | __u32 data_end; 2859 | __u32 data_meta; 2860 | /* Below access go through struct xdp_rxq_info */ 2861 | __u32 ingress_ifindex; /* rxq->dev->ifindex */ 2862 | __u32 rx_queue_index; /* rxq->queue_index */ 2863 | }; 2864 | 2865 | enum sk_action { 2866 | SK_DROP = 0, 2867 | SK_PASS, 2868 | }; 2869 | 2870 | /* user accessible metadata for SK_MSG packet hook, new fields must 2871 | * be added to the end of this structure 2872 | */ 2873 | struct sk_msg_md { 2874 | __bpf_md_ptr(void *, data); 2875 | __bpf_md_ptr(void *, data_end); 2876 | 2877 | __u32 family; 2878 | __u32 remote_ip4; /* Stored in network byte order */ 2879 | __u32 local_ip4; /* Stored in network byte order */ 2880 | __u32 remote_ip6[4]; /* Stored in network byte order */ 2881 | __u32 local_ip6[4]; /* Stored in network byte order */ 2882 | __u32 remote_port; /* Stored in network byte order */ 2883 | __u32 local_port; /* stored in host byte order */ 2884 | __u32 size; /* Total size of sk_msg */ 2885 | }; 2886 | 2887 | struct sk_reuseport_md { 2888 | /* 2889 | * Start of directly accessible data. It begins from 2890 | * the tcp/udp header. 2891 | */ 2892 | __bpf_md_ptr(void *, data); 2893 | /* End of directly accessible data */ 2894 | __bpf_md_ptr(void *, data_end); 2895 | /* 2896 | * Total length of packet (starting from the tcp/udp header). 2897 | * Note that the directly accessible bytes (data_end - data) 2898 | * could be less than this "len". Those bytes could be 2899 | * indirectly read by a helper "bpf_skb_load_bytes()". 2900 | */ 2901 | __u32 len; 2902 | /* 2903 | * Eth protocol in the mac header (network byte order). e.g. 
2904 | * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) 2905 | */ 2906 | __u32 eth_protocol; 2907 | __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ 2908 | __u32 bind_inany; /* Is sock bound to an INANY address? */ 2909 | __u32 hash; /* A hash of the packet 4 tuples */ 2910 | }; 2911 | 2912 | #define BPF_TAG_SIZE 8 2913 | 2914 | struct bpf_prog_info { 2915 | __u32 type; 2916 | __u32 id; 2917 | __u8 tag[BPF_TAG_SIZE]; 2918 | __u32 jited_prog_len; 2919 | __u32 xlated_prog_len; 2920 | __aligned_u64 jited_prog_insns; 2921 | __aligned_u64 xlated_prog_insns; 2922 | __u64 load_time; /* ns since boottime */ 2923 | __u32 created_by_uid; 2924 | __u32 nr_map_ids; 2925 | __aligned_u64 map_ids; 2926 | char name[BPF_OBJ_NAME_LEN]; 2927 | __u32 ifindex; 2928 | __u32 gpl_compatible:1; 2929 | __u64 netns_dev; 2930 | __u64 netns_ino; 2931 | __u32 nr_jited_ksyms; 2932 | __u32 nr_jited_func_lens; 2933 | __aligned_u64 jited_ksyms; 2934 | __aligned_u64 jited_func_lens; 2935 | __u32 btf_id; 2936 | __u32 func_info_rec_size; 2937 | __aligned_u64 func_info; 2938 | __u32 nr_func_info; 2939 | __u32 nr_line_info; 2940 | __aligned_u64 line_info; 2941 | __aligned_u64 jited_line_info; 2942 | __u32 nr_jited_line_info; 2943 | __u32 line_info_rec_size; 2944 | __u32 jited_line_info_rec_size; 2945 | __u32 nr_prog_tags; 2946 | __aligned_u64 prog_tags; 2947 | __u64 run_time_ns; 2948 | __u64 run_cnt; 2949 | } __attribute__((aligned(8))); 2950 | 2951 | struct bpf_map_info { 2952 | __u32 type; 2953 | __u32 id; 2954 | __u32 key_size; 2955 | __u32 value_size; 2956 | __u32 max_entries; 2957 | __u32 map_flags; 2958 | char name[BPF_OBJ_NAME_LEN]; 2959 | __u32 ifindex; 2960 | __u32 :32; 2961 | __u64 netns_dev; 2962 | __u64 netns_ino; 2963 | __u32 btf_id; 2964 | __u32 btf_key_type_id; 2965 | __u32 btf_value_type_id; 2966 | } __attribute__((aligned(8))); 2967 | 2968 | struct bpf_btf_info { 2969 | __aligned_u64 btf; 2970 | __u32 btf_size; 2971 | __u32 id; 2972 | } __attribute__((aligned(8))); 2973 | 
2974 | /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed 2975 | * by user and intended to be used by socket (e.g. to bind to, depends on 2976 | * attach attach type). 2977 | */ 2978 | struct bpf_sock_addr { 2979 | __u32 user_family; /* Allows 4-byte read, but no write. */ 2980 | __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. 2981 | * Stored in network byte order. 2982 | */ 2983 | __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. 2984 | * Stored in network byte order. 2985 | */ 2986 | __u32 user_port; /* Allows 4-byte read and write. 2987 | * Stored in network byte order 2988 | */ 2989 | __u32 family; /* Allows 4-byte read, but no write */ 2990 | __u32 type; /* Allows 4-byte read, but no write */ 2991 | __u32 protocol; /* Allows 4-byte read, but no write */ 2992 | __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. 2993 | * Stored in network byte order. 2994 | */ 2995 | __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. 2996 | * Stored in network byte order. 2997 | */ 2998 | }; 2999 | 3000 | /* User bpf_sock_ops struct to access socket values and specify request ops 3001 | * and their replies. 3002 | * Some of this fields are in network (bigendian) byte order and may need 3003 | * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). 
3004 | * New fields can only be added at the end of this structure 3005 | */ 3006 | struct bpf_sock_ops { 3007 | __u32 op; 3008 | union { 3009 | __u32 args[4]; /* Optionally passed to bpf program */ 3010 | __u32 reply; /* Returned by bpf program */ 3011 | __u32 replylong[4]; /* Optionally returned by bpf prog */ 3012 | }; 3013 | __u32 family; 3014 | __u32 remote_ip4; /* Stored in network byte order */ 3015 | __u32 local_ip4; /* Stored in network byte order */ 3016 | __u32 remote_ip6[4]; /* Stored in network byte order */ 3017 | __u32 local_ip6[4]; /* Stored in network byte order */ 3018 | __u32 remote_port; /* Stored in network byte order */ 3019 | __u32 local_port; /* stored in host byte order */ 3020 | __u32 is_fullsock; /* Some TCP fields are only valid if 3021 | * there is a full socket. If not, the 3022 | * fields read as zero. 3023 | */ 3024 | __u32 snd_cwnd; 3025 | __u32 srtt_us; /* Averaged RTT << 3 in usecs */ 3026 | __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ 3027 | __u32 state; 3028 | __u32 rtt_min; 3029 | __u32 snd_ssthresh; 3030 | __u32 rcv_nxt; 3031 | __u32 snd_nxt; 3032 | __u32 snd_una; 3033 | __u32 mss_cache; 3034 | __u32 ecn_flags; 3035 | __u32 rate_delivered; 3036 | __u32 rate_interval_us; 3037 | __u32 packets_out; 3038 | __u32 retrans_out; 3039 | __u32 total_retrans; 3040 | __u32 segs_in; 3041 | __u32 data_segs_in; 3042 | __u32 segs_out; 3043 | __u32 data_segs_out; 3044 | __u32 lost_out; 3045 | __u32 sacked_out; 3046 | __u32 sk_txhash; 3047 | __u64 bytes_received; 3048 | __u64 bytes_acked; 3049 | }; 3050 | 3051 | /* Definitions for bpf_sock_ops_cb_flags */ 3052 | #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) 3053 | #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) 3054 | #define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) 3055 | #define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently 3056 | * supported cb flags 3057 | */ 3058 | 3059 | /* List of known BPF sock_ops operators. 
3060 | * New entries can only be added at the end 3061 | */ 3062 | enum { 3063 | BPF_SOCK_OPS_VOID, 3064 | BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or 3065 | * -1 if default value should be used 3066 | */ 3067 | BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized 3068 | * window (in packets) or -1 if default 3069 | * value should be used 3070 | */ 3071 | BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an 3072 | * active connection is initialized 3073 | */ 3074 | BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an 3075 | * active connection is 3076 | * established 3077 | */ 3078 | BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a 3079 | * passive connection is 3080 | * established 3081 | */ 3082 | BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control 3083 | * needs ECN 3084 | */ 3085 | BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is 3086 | * based on the path and may be 3087 | * dependent on the congestion control 3088 | * algorithm. In general it indicates 3089 | * a congestion threshold. RTTs above 3090 | * this indicate congestion 3091 | */ 3092 | BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. 3093 | * Arg1: value of icsk_retransmits 3094 | * Arg2: value of icsk_rto 3095 | * Arg3: whether RTO has expired 3096 | */ 3097 | BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. 3098 | * Arg1: sequence number of 1st byte 3099 | * Arg2: # segments 3100 | * Arg3: return value of 3101 | * tcp_transmit_skb (0 => success) 3102 | */ 3103 | BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 3104 | * Arg1: old_state 3105 | * Arg2: new_state 3106 | */ 3107 | BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after 3108 | * socket transition to LISTEN state. 3109 | */ 3110 | }; 3111 | 3112 | /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect 3113 | * changes between the TCP and BPF versions. Ideally this should never happen. 
3114 | * If it does, we need to add code to convert them before calling 3115 | * the BPF sock_ops function. 3116 | */ 3117 | enum { 3118 | BPF_TCP_ESTABLISHED = 1, 3119 | BPF_TCP_SYN_SENT, 3120 | BPF_TCP_SYN_RECV, 3121 | BPF_TCP_FIN_WAIT1, 3122 | BPF_TCP_FIN_WAIT2, 3123 | BPF_TCP_TIME_WAIT, 3124 | BPF_TCP_CLOSE, 3125 | BPF_TCP_CLOSE_WAIT, 3126 | BPF_TCP_LAST_ACK, 3127 | BPF_TCP_LISTEN, 3128 | BPF_TCP_CLOSING, /* Now a valid state */ 3129 | BPF_TCP_NEW_SYN_RECV, 3130 | 3131 | BPF_TCP_MAX_STATES /* Leave at the end! */ 3132 | }; 3133 | 3134 | #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ 3135 | #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ 3136 | 3137 | struct bpf_perf_event_value { 3138 | __u64 counter; 3139 | __u64 enabled; 3140 | __u64 running; 3141 | }; 3142 | 3143 | #define BPF_DEVCG_ACC_MKNOD (1ULL << 0) 3144 | #define BPF_DEVCG_ACC_READ (1ULL << 1) 3145 | #define BPF_DEVCG_ACC_WRITE (1ULL << 2) 3146 | 3147 | #define BPF_DEVCG_DEV_BLOCK (1ULL << 0) 3148 | #define BPF_DEVCG_DEV_CHAR (1ULL << 1) 3149 | 3150 | struct bpf_cgroup_dev_ctx { 3151 | /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ 3152 | __u32 access_type; 3153 | __u32 major; 3154 | __u32 minor; 3155 | }; 3156 | 3157 | struct bpf_raw_tracepoint_args { 3158 | __u64 args[0]; 3159 | }; 3160 | 3161 | /* DIRECT: Skip the FIB rules and go to FIB table associated with device 3162 | * OUTPUT: Do lookup from egress perspective; default is ingress 3163 | */ 3164 | #define BPF_FIB_LOOKUP_DIRECT BIT(0) 3165 | #define BPF_FIB_LOOKUP_OUTPUT BIT(1) 3166 | 3167 | enum { 3168 | BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ 3169 | BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ 3170 | BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ 3171 | BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ 3172 | BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ 3173 | BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is 
not enabled on ingress */ 3174 | BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ 3175 | BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ 3176 | BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ 3177 | }; 3178 | 3179 | struct bpf_fib_lookup { 3180 | /* input: network family for lookup (AF_INET, AF_INET6) 3181 | * output: network family of egress nexthop 3182 | */ 3183 | __u8 family; 3184 | 3185 | /* set if lookup is to consider L4 data - e.g., FIB rules */ 3186 | __u8 l4_protocol; 3187 | __be16 sport; 3188 | __be16 dport; 3189 | 3190 | /* total length of packet from network header - used for MTU check */ 3191 | __u16 tot_len; 3192 | 3193 | /* input: L3 device index for lookup 3194 | * output: device index from FIB lookup 3195 | */ 3196 | __u32 ifindex; 3197 | 3198 | union { 3199 | /* inputs to lookup */ 3200 | __u8 tos; /* AF_INET */ 3201 | __be32 flowinfo; /* AF_INET6, flow_label + priority */ 3202 | 3203 | /* output: metric of fib result (IPv4/IPv6 only) */ 3204 | __u32 rt_metric; 3205 | }; 3206 | 3207 | union { 3208 | __be32 ipv4_src; 3209 | __u32 ipv6_src[4]; /* in6_addr; network order */ 3210 | }; 3211 | 3212 | /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in 3213 | * network header. 
output: bpf_fib_lookup sets to gateway address 3214 | * if FIB lookup returns gateway route 3215 | */ 3216 | union { 3217 | __be32 ipv4_dst; 3218 | __u32 ipv6_dst[4]; /* in6_addr; network order */ 3219 | }; 3220 | 3221 | /* output */ 3222 | __be16 h_vlan_proto; 3223 | __be16 h_vlan_TCI; 3224 | __u8 smac[6]; /* ETH_ALEN */ 3225 | __u8 dmac[6]; /* ETH_ALEN */ 3226 | }; 3227 | 3228 | enum bpf_task_fd_type { 3229 | BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ 3230 | BPF_FD_TYPE_TRACEPOINT, /* tp name */ 3231 | BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ 3232 | BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ 3233 | BPF_FD_TYPE_UPROBE, /* filename + offset */ 3234 | BPF_FD_TYPE_URETPROBE, /* filename + offset */ 3235 | }; 3236 | 3237 | struct bpf_flow_keys { 3238 | __u16 nhoff; 3239 | __u16 thoff; 3240 | __u16 addr_proto; /* ETH_P_* of valid addrs */ 3241 | __u8 is_frag; 3242 | __u8 is_first_frag; 3243 | __u8 is_encap; 3244 | __u8 ip_proto; 3245 | __be16 n_proto; 3246 | __be16 sport; 3247 | __be16 dport; 3248 | union { 3249 | struct { 3250 | __be32 ipv4_src; 3251 | __be32 ipv4_dst; 3252 | }; 3253 | struct { 3254 | __u32 ipv6_src[4]; /* in6_addr; network order */ 3255 | __u32 ipv6_dst[4]; /* in6_addr; network order */ 3256 | }; 3257 | }; 3258 | }; 3259 | 3260 | struct bpf_func_info { 3261 | __u32 insn_off; 3262 | __u32 type_id; 3263 | }; 3264 | 3265 | #define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) 3266 | #define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) 3267 | 3268 | struct bpf_line_info { 3269 | __u32 insn_off; 3270 | __u32 file_name_off; 3271 | __u32 line_off; 3272 | __u32 line_col; 3273 | }; 3274 | 3275 | struct bpf_spin_lock { 3276 | __u32 val; 3277 | }; 3278 | #endif /* _UAPI__LINUX_BPF_H__ */ 3279 | -------------------------------------------------------------------------------- /headers/linux/err.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR 
BSD-2-Clause) */ 2 | 3 | #ifndef __LINUX_ERR_H 4 | #define __LINUX_ERR_H 5 | 6 | #include 7 | #include 8 | 9 | #define MAX_ERRNO 4095 10 | 11 | #define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) 12 | 13 | static inline void * ERR_PTR(long error_) 14 | { 15 | return (void *) error_; 16 | } 17 | 18 | static inline long PTR_ERR(const void *ptr) 19 | { 20 | return (long) ptr; 21 | } 22 | 23 | static inline bool IS_ERR(const void *ptr) 24 | { 25 | return IS_ERR_VALUE((unsigned long)ptr); 26 | } 27 | 28 | static inline bool IS_ERR_OR_NULL(const void *ptr) 29 | { 30 | return (!ptr) || IS_ERR_VALUE((unsigned long)ptr); 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /headers/linux/if_link.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 | #ifndef _UAPI_LINUX_IF_LINK_H 3 | #define _UAPI_LINUX_IF_LINK_H 4 | 5 | #include 6 | #include 7 | 8 | /* This struct should be in sync with struct rtnl_link_stats64 */ 9 | struct rtnl_link_stats { 10 | __u32 rx_packets; /* total packets received */ 11 | __u32 tx_packets; /* total packets transmitted */ 12 | __u32 rx_bytes; /* total bytes received */ 13 | __u32 tx_bytes; /* total bytes transmitted */ 14 | __u32 rx_errors; /* bad packets received */ 15 | __u32 tx_errors; /* packet transmit problems */ 16 | __u32 rx_dropped; /* no space in linux buffers */ 17 | __u32 tx_dropped; /* no space available in linux */ 18 | __u32 multicast; /* multicast packets received */ 19 | __u32 collisions; 20 | 21 | /* detailed rx_errors: */ 22 | __u32 rx_length_errors; 23 | __u32 rx_over_errors; /* receiver ring buff overflow */ 24 | __u32 rx_crc_errors; /* recved pkt with crc error */ 25 | __u32 rx_frame_errors; /* recv'd frame alignment error */ 26 | __u32 rx_fifo_errors; /* recv'r fifo overrun */ 27 | __u32 rx_missed_errors; /* receiver missed packet */ 28 | 29 | /* detailed tx_errors 
*/ 30 | __u32 tx_aborted_errors; 31 | __u32 tx_carrier_errors; 32 | __u32 tx_fifo_errors; 33 | __u32 tx_heartbeat_errors; 34 | __u32 tx_window_errors; 35 | 36 | /* for cslip etc */ 37 | __u32 rx_compressed; 38 | __u32 tx_compressed; 39 | 40 | __u32 rx_nohandler; /* dropped, no handler found */ 41 | }; 42 | 43 | /* The main device statistics structure */ 44 | struct rtnl_link_stats64 { 45 | __u64 rx_packets; /* total packets received */ 46 | __u64 tx_packets; /* total packets transmitted */ 47 | __u64 rx_bytes; /* total bytes received */ 48 | __u64 tx_bytes; /* total bytes transmitted */ 49 | __u64 rx_errors; /* bad packets received */ 50 | __u64 tx_errors; /* packet transmit problems */ 51 | __u64 rx_dropped; /* no space in linux buffers */ 52 | __u64 tx_dropped; /* no space available in linux */ 53 | __u64 multicast; /* multicast packets received */ 54 | __u64 collisions; 55 | 56 | /* detailed rx_errors: */ 57 | __u64 rx_length_errors; 58 | __u64 rx_over_errors; /* receiver ring buff overflow */ 59 | __u64 rx_crc_errors; /* recved pkt with crc error */ 60 | __u64 rx_frame_errors; /* recv'd frame alignment error */ 61 | __u64 rx_fifo_errors; /* recv'r fifo overrun */ 62 | __u64 rx_missed_errors; /* receiver missed packet */ 63 | 64 | /* detailed tx_errors */ 65 | __u64 tx_aborted_errors; 66 | __u64 tx_carrier_errors; 67 | __u64 tx_fifo_errors; 68 | __u64 tx_heartbeat_errors; 69 | __u64 tx_window_errors; 70 | 71 | /* for cslip etc */ 72 | __u64 rx_compressed; 73 | __u64 tx_compressed; 74 | 75 | __u64 rx_nohandler; /* dropped, no handler found */ 76 | }; 77 | 78 | /* The struct should be in sync with struct ifmap */ 79 | struct rtnl_link_ifmap { 80 | __u64 mem_start; 81 | __u64 mem_end; 82 | __u64 base_addr; 83 | __u16 irq; 84 | __u8 dma; 85 | __u8 port; 86 | }; 87 | 88 | /* 89 | * IFLA_AF_SPEC 90 | * Contains nested attributes for address family specific attributes. 
91 | * Each address family may create a attribute with the address family 92 | * number as type and create its own attribute structure in it. 93 | * 94 | * Example: 95 | * [IFLA_AF_SPEC] = { 96 | * [AF_INET] = { 97 | * [IFLA_INET_CONF] = ..., 98 | * }, 99 | * [AF_INET6] = { 100 | * [IFLA_INET6_FLAGS] = ..., 101 | * [IFLA_INET6_CONF] = ..., 102 | * } 103 | * } 104 | */ 105 | 106 | enum { 107 | IFLA_UNSPEC, 108 | IFLA_ADDRESS, 109 | IFLA_BROADCAST, 110 | IFLA_IFNAME, 111 | IFLA_MTU, 112 | IFLA_LINK, 113 | IFLA_QDISC, 114 | IFLA_STATS, 115 | IFLA_COST, 116 | #define IFLA_COST IFLA_COST 117 | IFLA_PRIORITY, 118 | #define IFLA_PRIORITY IFLA_PRIORITY 119 | IFLA_MASTER, 120 | #define IFLA_MASTER IFLA_MASTER 121 | IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ 122 | #define IFLA_WIRELESS IFLA_WIRELESS 123 | IFLA_PROTINFO, /* Protocol specific information for a link */ 124 | #define IFLA_PROTINFO IFLA_PROTINFO 125 | IFLA_TXQLEN, 126 | #define IFLA_TXQLEN IFLA_TXQLEN 127 | IFLA_MAP, 128 | #define IFLA_MAP IFLA_MAP 129 | IFLA_WEIGHT, 130 | #define IFLA_WEIGHT IFLA_WEIGHT 131 | IFLA_OPERSTATE, 132 | IFLA_LINKMODE, 133 | IFLA_LINKINFO, 134 | #define IFLA_LINKINFO IFLA_LINKINFO 135 | IFLA_NET_NS_PID, 136 | IFLA_IFALIAS, 137 | IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ 138 | IFLA_VFINFO_LIST, 139 | IFLA_STATS64, 140 | IFLA_VF_PORTS, 141 | IFLA_PORT_SELF, 142 | IFLA_AF_SPEC, 143 | IFLA_GROUP, /* Group the device belongs to */ 144 | IFLA_NET_NS_FD, 145 | IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ 146 | IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ 147 | #define IFLA_PROMISCUITY IFLA_PROMISCUITY 148 | IFLA_NUM_TX_QUEUES, 149 | IFLA_NUM_RX_QUEUES, 150 | IFLA_CARRIER, 151 | IFLA_PHYS_PORT_ID, 152 | IFLA_CARRIER_CHANGES, 153 | IFLA_PHYS_SWITCH_ID, 154 | IFLA_LINK_NETNSID, 155 | IFLA_PHYS_PORT_NAME, 156 | IFLA_PROTO_DOWN, 157 | IFLA_GSO_MAX_SEGS, 158 | IFLA_GSO_MAX_SIZE, 159 | IFLA_PAD, 160 | IFLA_XDP, 161 | IFLA_EVENT, 162 | 
IFLA_NEW_NETNSID, 163 | IFLA_IF_NETNSID, 164 | IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ 165 | IFLA_CARRIER_UP_COUNT, 166 | IFLA_CARRIER_DOWN_COUNT, 167 | IFLA_NEW_IFINDEX, 168 | IFLA_MIN_MTU, 169 | IFLA_MAX_MTU, 170 | __IFLA_MAX 171 | }; 172 | 173 | 174 | #define IFLA_MAX (__IFLA_MAX - 1) 175 | 176 | /* backwards compatibility for userspace */ 177 | #ifndef __KERNEL__ 178 | #define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) 179 | #define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) 180 | #endif 181 | 182 | enum { 183 | IFLA_INET_UNSPEC, 184 | IFLA_INET_CONF, 185 | __IFLA_INET_MAX, 186 | }; 187 | 188 | #define IFLA_INET_MAX (__IFLA_INET_MAX - 1) 189 | 190 | /* ifi_flags. 191 | 192 | IFF_* flags. 193 | 194 | The only change is: 195 | IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are 196 | more not changeable by user. They describe link media 197 | characteristics and set by device driver. 198 | 199 | Comments: 200 | - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid 201 | - If neither of these three flags are set; 202 | the interface is NBMA. 203 | 204 | - IFF_MULTICAST does not mean anything special: 205 | multicasts can be used on all not-NBMA links. 206 | IFF_MULTICAST means that this media uses special encapsulation 207 | for multicast frames. Apparently, all IFF_POINTOPOINT and 208 | IFF_BROADCAST devices are able to use multicasts too. 209 | */ 210 | 211 | /* IFLA_LINK. 212 | For usual devices it is equal ifi_index. 213 | If it is a "virtual interface" (f.e. tunnel), ifi_link 214 | can point to real physical interface (f.e. 
for bandwidth calculations), 215 | or maybe 0, what means, that real media is unknown (usual 216 | for IPIP tunnels, when route to endpoint is allowed to change) 217 | */ 218 | 219 | /* Subtype attributes for IFLA_PROTINFO */ 220 | enum { 221 | IFLA_INET6_UNSPEC, 222 | IFLA_INET6_FLAGS, /* link flags */ 223 | IFLA_INET6_CONF, /* sysctl parameters */ 224 | IFLA_INET6_STATS, /* statistics */ 225 | IFLA_INET6_MCAST, /* MC things. What of them? */ 226 | IFLA_INET6_CACHEINFO, /* time values and max reasm size */ 227 | IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ 228 | IFLA_INET6_TOKEN, /* device token */ 229 | IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ 230 | __IFLA_INET6_MAX 231 | }; 232 | 233 | #define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) 234 | 235 | enum in6_addr_gen_mode { 236 | IN6_ADDR_GEN_MODE_EUI64, 237 | IN6_ADDR_GEN_MODE_NONE, 238 | IN6_ADDR_GEN_MODE_STABLE_PRIVACY, 239 | IN6_ADDR_GEN_MODE_RANDOM, 240 | }; 241 | 242 | /* Bridge section */ 243 | 244 | enum { 245 | IFLA_BR_UNSPEC, 246 | IFLA_BR_FORWARD_DELAY, 247 | IFLA_BR_HELLO_TIME, 248 | IFLA_BR_MAX_AGE, 249 | IFLA_BR_AGEING_TIME, 250 | IFLA_BR_STP_STATE, 251 | IFLA_BR_PRIORITY, 252 | IFLA_BR_VLAN_FILTERING, 253 | IFLA_BR_VLAN_PROTOCOL, 254 | IFLA_BR_GROUP_FWD_MASK, 255 | IFLA_BR_ROOT_ID, 256 | IFLA_BR_BRIDGE_ID, 257 | IFLA_BR_ROOT_PORT, 258 | IFLA_BR_ROOT_PATH_COST, 259 | IFLA_BR_TOPOLOGY_CHANGE, 260 | IFLA_BR_TOPOLOGY_CHANGE_DETECTED, 261 | IFLA_BR_HELLO_TIMER, 262 | IFLA_BR_TCN_TIMER, 263 | IFLA_BR_TOPOLOGY_CHANGE_TIMER, 264 | IFLA_BR_GC_TIMER, 265 | IFLA_BR_GROUP_ADDR, 266 | IFLA_BR_FDB_FLUSH, 267 | IFLA_BR_MCAST_ROUTER, 268 | IFLA_BR_MCAST_SNOOPING, 269 | IFLA_BR_MCAST_QUERY_USE_IFADDR, 270 | IFLA_BR_MCAST_QUERIER, 271 | IFLA_BR_MCAST_HASH_ELASTICITY, 272 | IFLA_BR_MCAST_HASH_MAX, 273 | IFLA_BR_MCAST_LAST_MEMBER_CNT, 274 | IFLA_BR_MCAST_STARTUP_QUERY_CNT, 275 | IFLA_BR_MCAST_LAST_MEMBER_INTVL, 276 | IFLA_BR_MCAST_MEMBERSHIP_INTVL, 277 | IFLA_BR_MCAST_QUERIER_INTVL, 278 | 
IFLA_BR_MCAST_QUERY_INTVL, 279 | IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, 280 | IFLA_BR_MCAST_STARTUP_QUERY_INTVL, 281 | IFLA_BR_NF_CALL_IPTABLES, 282 | IFLA_BR_NF_CALL_IP6TABLES, 283 | IFLA_BR_NF_CALL_ARPTABLES, 284 | IFLA_BR_VLAN_DEFAULT_PVID, 285 | IFLA_BR_PAD, 286 | IFLA_BR_VLAN_STATS_ENABLED, 287 | IFLA_BR_MCAST_STATS_ENABLED, 288 | IFLA_BR_MCAST_IGMP_VERSION, 289 | IFLA_BR_MCAST_MLD_VERSION, 290 | IFLA_BR_VLAN_STATS_PER_PORT, 291 | IFLA_BR_MULTI_BOOLOPT, 292 | __IFLA_BR_MAX, 293 | }; 294 | 295 | #define IFLA_BR_MAX (__IFLA_BR_MAX - 1) 296 | 297 | struct ifla_bridge_id { 298 | __u8 prio[2]; 299 | __u8 addr[6]; /* ETH_ALEN */ 300 | }; 301 | 302 | enum { 303 | BRIDGE_MODE_UNSPEC, 304 | BRIDGE_MODE_HAIRPIN, 305 | }; 306 | 307 | enum { 308 | IFLA_BRPORT_UNSPEC, 309 | IFLA_BRPORT_STATE, /* Spanning tree state */ 310 | IFLA_BRPORT_PRIORITY, /* " priority */ 311 | IFLA_BRPORT_COST, /* " cost */ 312 | IFLA_BRPORT_MODE, /* mode (hairpin) */ 313 | IFLA_BRPORT_GUARD, /* bpdu guard */ 314 | IFLA_BRPORT_PROTECT, /* root port protection */ 315 | IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ 316 | IFLA_BRPORT_LEARNING, /* mac learning */ 317 | IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ 318 | IFLA_BRPORT_PROXYARP, /* proxy ARP */ 319 | IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ 320 | IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ 321 | IFLA_BRPORT_ROOT_ID, /* designated root */ 322 | IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ 323 | IFLA_BRPORT_DESIGNATED_PORT, 324 | IFLA_BRPORT_DESIGNATED_COST, 325 | IFLA_BRPORT_ID, 326 | IFLA_BRPORT_NO, 327 | IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, 328 | IFLA_BRPORT_CONFIG_PENDING, 329 | IFLA_BRPORT_MESSAGE_AGE_TIMER, 330 | IFLA_BRPORT_FORWARD_DELAY_TIMER, 331 | IFLA_BRPORT_HOLD_TIMER, 332 | IFLA_BRPORT_FLUSH, 333 | IFLA_BRPORT_MULTICAST_ROUTER, 334 | IFLA_BRPORT_PAD, 335 | IFLA_BRPORT_MCAST_FLOOD, 336 | IFLA_BRPORT_MCAST_TO_UCAST, 337 | IFLA_BRPORT_VLAN_TUNNEL, 338 | IFLA_BRPORT_BCAST_FLOOD, 339 | 
IFLA_BRPORT_GROUP_FWD_MASK, 340 | IFLA_BRPORT_NEIGH_SUPPRESS, 341 | IFLA_BRPORT_ISOLATED, 342 | IFLA_BRPORT_BACKUP_PORT, 343 | __IFLA_BRPORT_MAX 344 | }; 345 | #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) 346 | 347 | struct ifla_cacheinfo { 348 | __u32 max_reasm_len; 349 | __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ 350 | __u32 reachable_time; 351 | __u32 retrans_time; 352 | }; 353 | 354 | enum { 355 | IFLA_INFO_UNSPEC, 356 | IFLA_INFO_KIND, 357 | IFLA_INFO_DATA, 358 | IFLA_INFO_XSTATS, 359 | IFLA_INFO_SLAVE_KIND, 360 | IFLA_INFO_SLAVE_DATA, 361 | __IFLA_INFO_MAX, 362 | }; 363 | 364 | #define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) 365 | 366 | /* VLAN section */ 367 | 368 | enum { 369 | IFLA_VLAN_UNSPEC, 370 | IFLA_VLAN_ID, 371 | IFLA_VLAN_FLAGS, 372 | IFLA_VLAN_EGRESS_QOS, 373 | IFLA_VLAN_INGRESS_QOS, 374 | IFLA_VLAN_PROTOCOL, 375 | __IFLA_VLAN_MAX, 376 | }; 377 | 378 | #define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) 379 | 380 | struct ifla_vlan_flags { 381 | __u32 flags; 382 | __u32 mask; 383 | }; 384 | 385 | enum { 386 | IFLA_VLAN_QOS_UNSPEC, 387 | IFLA_VLAN_QOS_MAPPING, 388 | __IFLA_VLAN_QOS_MAX 389 | }; 390 | 391 | #define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) 392 | 393 | struct ifla_vlan_qos_mapping { 394 | __u32 from; 395 | __u32 to; 396 | }; 397 | 398 | /* MACVLAN section */ 399 | enum { 400 | IFLA_MACVLAN_UNSPEC, 401 | IFLA_MACVLAN_MODE, 402 | IFLA_MACVLAN_FLAGS, 403 | IFLA_MACVLAN_MACADDR_MODE, 404 | IFLA_MACVLAN_MACADDR, 405 | IFLA_MACVLAN_MACADDR_DATA, 406 | IFLA_MACVLAN_MACADDR_COUNT, 407 | __IFLA_MACVLAN_MAX, 408 | }; 409 | 410 | #define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) 411 | 412 | enum macvlan_mode { 413 | MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ 414 | MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ 415 | MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ 416 | MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ 417 | MACVLAN_MODE_SOURCE = 16,/* use source MAC 
address list to assign */ 418 | }; 419 | 420 | enum macvlan_macaddr_mode { 421 | MACVLAN_MACADDR_ADD, 422 | MACVLAN_MACADDR_DEL, 423 | MACVLAN_MACADDR_FLUSH, 424 | MACVLAN_MACADDR_SET, 425 | }; 426 | 427 | #define MACVLAN_FLAG_NOPROMISC 1 428 | 429 | /* VRF section */ 430 | enum { 431 | IFLA_VRF_UNSPEC, 432 | IFLA_VRF_TABLE, 433 | __IFLA_VRF_MAX 434 | }; 435 | 436 | #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) 437 | 438 | enum { 439 | IFLA_VRF_PORT_UNSPEC, 440 | IFLA_VRF_PORT_TABLE, 441 | __IFLA_VRF_PORT_MAX 442 | }; 443 | 444 | #define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) 445 | 446 | /* MACSEC section */ 447 | enum { 448 | IFLA_MACSEC_UNSPEC, 449 | IFLA_MACSEC_SCI, 450 | IFLA_MACSEC_PORT, 451 | IFLA_MACSEC_ICV_LEN, 452 | IFLA_MACSEC_CIPHER_SUITE, 453 | IFLA_MACSEC_WINDOW, 454 | IFLA_MACSEC_ENCODING_SA, 455 | IFLA_MACSEC_ENCRYPT, 456 | IFLA_MACSEC_PROTECT, 457 | IFLA_MACSEC_INC_SCI, 458 | IFLA_MACSEC_ES, 459 | IFLA_MACSEC_SCB, 460 | IFLA_MACSEC_REPLAY_PROTECT, 461 | IFLA_MACSEC_VALIDATION, 462 | IFLA_MACSEC_PAD, 463 | __IFLA_MACSEC_MAX, 464 | }; 465 | 466 | #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) 467 | 468 | /* XFRM section */ 469 | enum { 470 | IFLA_XFRM_UNSPEC, 471 | IFLA_XFRM_LINK, 472 | IFLA_XFRM_IF_ID, 473 | __IFLA_XFRM_MAX 474 | }; 475 | 476 | #define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) 477 | 478 | enum macsec_validation_type { 479 | MACSEC_VALIDATE_DISABLED = 0, 480 | MACSEC_VALIDATE_CHECK = 1, 481 | MACSEC_VALIDATE_STRICT = 2, 482 | __MACSEC_VALIDATE_END, 483 | MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, 484 | }; 485 | 486 | /* IPVLAN section */ 487 | enum { 488 | IFLA_IPVLAN_UNSPEC, 489 | IFLA_IPVLAN_MODE, 490 | IFLA_IPVLAN_FLAGS, 491 | __IFLA_IPVLAN_MAX 492 | }; 493 | 494 | #define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) 495 | 496 | enum ipvlan_mode { 497 | IPVLAN_MODE_L2 = 0, 498 | IPVLAN_MODE_L3, 499 | IPVLAN_MODE_L3S, 500 | IPVLAN_MODE_MAX 501 | }; 502 | 503 | #define IPVLAN_F_PRIVATE 0x01 504 | #define IPVLAN_F_VEPA 0x02 505 | 506 | 
/* VXLAN section */ 507 | enum { 508 | IFLA_VXLAN_UNSPEC, 509 | IFLA_VXLAN_ID, 510 | IFLA_VXLAN_GROUP, /* group or remote address */ 511 | IFLA_VXLAN_LINK, 512 | IFLA_VXLAN_LOCAL, 513 | IFLA_VXLAN_TTL, 514 | IFLA_VXLAN_TOS, 515 | IFLA_VXLAN_LEARNING, 516 | IFLA_VXLAN_AGEING, 517 | IFLA_VXLAN_LIMIT, 518 | IFLA_VXLAN_PORT_RANGE, /* source port */ 519 | IFLA_VXLAN_PROXY, 520 | IFLA_VXLAN_RSC, 521 | IFLA_VXLAN_L2MISS, 522 | IFLA_VXLAN_L3MISS, 523 | IFLA_VXLAN_PORT, /* destination port */ 524 | IFLA_VXLAN_GROUP6, 525 | IFLA_VXLAN_LOCAL6, 526 | IFLA_VXLAN_UDP_CSUM, 527 | IFLA_VXLAN_UDP_ZERO_CSUM6_TX, 528 | IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 529 | IFLA_VXLAN_REMCSUM_TX, 530 | IFLA_VXLAN_REMCSUM_RX, 531 | IFLA_VXLAN_GBP, 532 | IFLA_VXLAN_REMCSUM_NOPARTIAL, 533 | IFLA_VXLAN_COLLECT_METADATA, 534 | IFLA_VXLAN_LABEL, 535 | IFLA_VXLAN_GPE, 536 | IFLA_VXLAN_TTL_INHERIT, 537 | IFLA_VXLAN_DF, 538 | __IFLA_VXLAN_MAX 539 | }; 540 | #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) 541 | 542 | struct ifla_vxlan_port_range { 543 | __be16 low; 544 | __be16 high; 545 | }; 546 | 547 | enum ifla_vxlan_df { 548 | VXLAN_DF_UNSET = 0, 549 | VXLAN_DF_SET, 550 | VXLAN_DF_INHERIT, 551 | __VXLAN_DF_END, 552 | VXLAN_DF_MAX = __VXLAN_DF_END - 1, 553 | }; 554 | 555 | /* GENEVE section */ 556 | enum { 557 | IFLA_GENEVE_UNSPEC, 558 | IFLA_GENEVE_ID, 559 | IFLA_GENEVE_REMOTE, 560 | IFLA_GENEVE_TTL, 561 | IFLA_GENEVE_TOS, 562 | IFLA_GENEVE_PORT, /* destination port */ 563 | IFLA_GENEVE_COLLECT_METADATA, 564 | IFLA_GENEVE_REMOTE6, 565 | IFLA_GENEVE_UDP_CSUM, 566 | IFLA_GENEVE_UDP_ZERO_CSUM6_TX, 567 | IFLA_GENEVE_UDP_ZERO_CSUM6_RX, 568 | IFLA_GENEVE_LABEL, 569 | IFLA_GENEVE_TTL_INHERIT, 570 | IFLA_GENEVE_DF, 571 | __IFLA_GENEVE_MAX 572 | }; 573 | #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) 574 | 575 | enum ifla_geneve_df { 576 | GENEVE_DF_UNSET = 0, 577 | GENEVE_DF_SET, 578 | GENEVE_DF_INHERIT, 579 | __GENEVE_DF_END, 580 | GENEVE_DF_MAX = __GENEVE_DF_END - 1, 581 | }; 582 | 583 | /* PPP section */ 584 | enum 
{ 585 | IFLA_PPP_UNSPEC, 586 | IFLA_PPP_DEV_FD, 587 | __IFLA_PPP_MAX 588 | }; 589 | #define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) 590 | 591 | /* GTP section */ 592 | 593 | enum ifla_gtp_role { 594 | GTP_ROLE_GGSN = 0, 595 | GTP_ROLE_SGSN, 596 | }; 597 | 598 | enum { 599 | IFLA_GTP_UNSPEC, 600 | IFLA_GTP_FD0, 601 | IFLA_GTP_FD1, 602 | IFLA_GTP_PDP_HASHSIZE, 603 | IFLA_GTP_ROLE, 604 | __IFLA_GTP_MAX, 605 | }; 606 | #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) 607 | 608 | /* Bonding section */ 609 | 610 | enum { 611 | IFLA_BOND_UNSPEC, 612 | IFLA_BOND_MODE, 613 | IFLA_BOND_ACTIVE_SLAVE, 614 | IFLA_BOND_MIIMON, 615 | IFLA_BOND_UPDELAY, 616 | IFLA_BOND_DOWNDELAY, 617 | IFLA_BOND_USE_CARRIER, 618 | IFLA_BOND_ARP_INTERVAL, 619 | IFLA_BOND_ARP_IP_TARGET, 620 | IFLA_BOND_ARP_VALIDATE, 621 | IFLA_BOND_ARP_ALL_TARGETS, 622 | IFLA_BOND_PRIMARY, 623 | IFLA_BOND_PRIMARY_RESELECT, 624 | IFLA_BOND_FAIL_OVER_MAC, 625 | IFLA_BOND_XMIT_HASH_POLICY, 626 | IFLA_BOND_RESEND_IGMP, 627 | IFLA_BOND_NUM_PEER_NOTIF, 628 | IFLA_BOND_ALL_SLAVES_ACTIVE, 629 | IFLA_BOND_MIN_LINKS, 630 | IFLA_BOND_LP_INTERVAL, 631 | IFLA_BOND_PACKETS_PER_SLAVE, 632 | IFLA_BOND_AD_LACP_RATE, 633 | IFLA_BOND_AD_SELECT, 634 | IFLA_BOND_AD_INFO, 635 | IFLA_BOND_AD_ACTOR_SYS_PRIO, 636 | IFLA_BOND_AD_USER_PORT_KEY, 637 | IFLA_BOND_AD_ACTOR_SYSTEM, 638 | IFLA_BOND_TLB_DYNAMIC_LB, 639 | __IFLA_BOND_MAX, 640 | }; 641 | 642 | #define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) 643 | 644 | enum { 645 | IFLA_BOND_AD_INFO_UNSPEC, 646 | IFLA_BOND_AD_INFO_AGGREGATOR, 647 | IFLA_BOND_AD_INFO_NUM_PORTS, 648 | IFLA_BOND_AD_INFO_ACTOR_KEY, 649 | IFLA_BOND_AD_INFO_PARTNER_KEY, 650 | IFLA_BOND_AD_INFO_PARTNER_MAC, 651 | __IFLA_BOND_AD_INFO_MAX, 652 | }; 653 | 654 | #define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) 655 | 656 | enum { 657 | IFLA_BOND_SLAVE_UNSPEC, 658 | IFLA_BOND_SLAVE_STATE, 659 | IFLA_BOND_SLAVE_MII_STATUS, 660 | IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, 661 | IFLA_BOND_SLAVE_PERM_HWADDR, 662 | IFLA_BOND_SLAVE_QUEUE_ID, 663 | 
IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, 664 | IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, 665 | IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, 666 | __IFLA_BOND_SLAVE_MAX, 667 | }; 668 | 669 | #define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) 670 | 671 | /* SR-IOV virtual function management section */ 672 | 673 | enum { 674 | IFLA_VF_INFO_UNSPEC, 675 | IFLA_VF_INFO, 676 | __IFLA_VF_INFO_MAX, 677 | }; 678 | 679 | #define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) 680 | 681 | enum { 682 | IFLA_VF_UNSPEC, 683 | IFLA_VF_MAC, /* Hardware queue specific attributes */ 684 | IFLA_VF_VLAN, /* VLAN ID and QoS */ 685 | IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ 686 | IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ 687 | IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ 688 | IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ 689 | IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query 690 | * on/off switch 691 | */ 692 | IFLA_VF_STATS, /* network device statistics */ 693 | IFLA_VF_TRUST, /* Trust VF */ 694 | IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ 695 | IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ 696 | IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ 697 | __IFLA_VF_MAX, 698 | }; 699 | 700 | #define IFLA_VF_MAX (__IFLA_VF_MAX - 1) 701 | 702 | struct ifla_vf_mac { 703 | __u32 vf; 704 | __u8 mac[32]; /* MAX_ADDR_LEN */ 705 | }; 706 | 707 | struct ifla_vf_vlan { 708 | __u32 vf; 709 | __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ 710 | __u32 qos; 711 | }; 712 | 713 | enum { 714 | IFLA_VF_VLAN_INFO_UNSPEC, 715 | IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ 716 | __IFLA_VF_VLAN_INFO_MAX, 717 | }; 718 | 719 | #define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) 720 | #define MAX_VLAN_LIST_LEN 1 721 | 722 | struct ifla_vf_vlan_info { 723 | __u32 vf; 724 | __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ 725 | __u32 qos; 726 | __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ 
727 | }; 728 | 729 | struct ifla_vf_tx_rate { 730 | __u32 vf; 731 | __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ 732 | }; 733 | 734 | struct ifla_vf_rate { 735 | __u32 vf; 736 | __u32 min_tx_rate; /* Min Bandwidth in Mbps */ 737 | __u32 max_tx_rate; /* Max Bandwidth in Mbps */ 738 | }; 739 | 740 | struct ifla_vf_spoofchk { 741 | __u32 vf; 742 | __u32 setting; 743 | }; 744 | 745 | struct ifla_vf_guid { 746 | __u32 vf; 747 | __u64 guid; 748 | }; 749 | 750 | enum { 751 | IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ 752 | IFLA_VF_LINK_STATE_ENABLE, /* link always up */ 753 | IFLA_VF_LINK_STATE_DISABLE, /* link always down */ 754 | __IFLA_VF_LINK_STATE_MAX, 755 | }; 756 | 757 | struct ifla_vf_link_state { 758 | __u32 vf; 759 | __u32 link_state; 760 | }; 761 | 762 | struct ifla_vf_rss_query_en { 763 | __u32 vf; 764 | __u32 setting; 765 | }; 766 | 767 | enum { 768 | IFLA_VF_STATS_RX_PACKETS, 769 | IFLA_VF_STATS_TX_PACKETS, 770 | IFLA_VF_STATS_RX_BYTES, 771 | IFLA_VF_STATS_TX_BYTES, 772 | IFLA_VF_STATS_BROADCAST, 773 | IFLA_VF_STATS_MULTICAST, 774 | IFLA_VF_STATS_PAD, 775 | IFLA_VF_STATS_RX_DROPPED, 776 | IFLA_VF_STATS_TX_DROPPED, 777 | __IFLA_VF_STATS_MAX, 778 | }; 779 | 780 | #define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) 781 | 782 | struct ifla_vf_trust { 783 | __u32 vf; 784 | __u32 setting; 785 | }; 786 | 787 | /* VF ports management section 788 | * 789 | * Nested layout of set/get msg is: 790 | * 791 | * [IFLA_NUM_VF] 792 | * [IFLA_VF_PORTS] 793 | * [IFLA_VF_PORT] 794 | * [IFLA_PORT_*], ... 795 | * [IFLA_VF_PORT] 796 | * [IFLA_PORT_*], ... 797 | * ... 798 | * [IFLA_PORT_SELF] 799 | * [IFLA_PORT_*], ... 
800 | */ 801 | 802 | enum { 803 | IFLA_VF_PORT_UNSPEC, 804 | IFLA_VF_PORT, /* nest */ 805 | __IFLA_VF_PORT_MAX, 806 | }; 807 | 808 | #define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) 809 | 810 | enum { 811 | IFLA_PORT_UNSPEC, 812 | IFLA_PORT_VF, /* __u32 */ 813 | IFLA_PORT_PROFILE, /* string */ 814 | IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ 815 | IFLA_PORT_INSTANCE_UUID, /* binary UUID */ 816 | IFLA_PORT_HOST_UUID, /* binary UUID */ 817 | IFLA_PORT_REQUEST, /* __u8 */ 818 | IFLA_PORT_RESPONSE, /* __u16, output only */ 819 | __IFLA_PORT_MAX, 820 | }; 821 | 822 | #define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) 823 | 824 | #define PORT_PROFILE_MAX 40 825 | #define PORT_UUID_MAX 16 826 | #define PORT_SELF_VF -1 827 | 828 | enum { 829 | PORT_REQUEST_PREASSOCIATE = 0, 830 | PORT_REQUEST_PREASSOCIATE_RR, 831 | PORT_REQUEST_ASSOCIATE, 832 | PORT_REQUEST_DISASSOCIATE, 833 | }; 834 | 835 | enum { 836 | PORT_VDP_RESPONSE_SUCCESS = 0, 837 | PORT_VDP_RESPONSE_INVALID_FORMAT, 838 | PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, 839 | PORT_VDP_RESPONSE_UNUSED_VTID, 840 | PORT_VDP_RESPONSE_VTID_VIOLATION, 841 | PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, 842 | PORT_VDP_RESPONSE_OUT_OF_SYNC, 843 | /* 0x08-0xFF reserved for future VDP use */ 844 | PORT_PROFILE_RESPONSE_SUCCESS = 0x100, 845 | PORT_PROFILE_RESPONSE_INPROGRESS, 846 | PORT_PROFILE_RESPONSE_INVALID, 847 | PORT_PROFILE_RESPONSE_BADSTATE, 848 | PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, 849 | PORT_PROFILE_RESPONSE_ERROR, 850 | }; 851 | 852 | struct ifla_port_vsi { 853 | __u8 vsi_mgr_id; 854 | __u8 vsi_type_id[3]; 855 | __u8 vsi_type_version; 856 | __u8 pad[3]; 857 | }; 858 | 859 | 860 | /* IPoIB section */ 861 | 862 | enum { 863 | IFLA_IPOIB_UNSPEC, 864 | IFLA_IPOIB_PKEY, 865 | IFLA_IPOIB_MODE, 866 | IFLA_IPOIB_UMCAST, 867 | __IFLA_IPOIB_MAX 868 | }; 869 | 870 | enum { 871 | IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ 872 | IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ 873 | }; 874 | 875 | 
#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) 876 | 877 | 878 | /* HSR section */ 879 | 880 | enum { 881 | IFLA_HSR_UNSPEC, 882 | IFLA_HSR_SLAVE1, 883 | IFLA_HSR_SLAVE2, 884 | IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ 885 | IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ 886 | IFLA_HSR_SEQ_NR, 887 | IFLA_HSR_VERSION, /* HSR version */ 888 | __IFLA_HSR_MAX, 889 | }; 890 | 891 | #define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) 892 | 893 | /* STATS section */ 894 | 895 | struct if_stats_msg { 896 | __u8 family; 897 | __u8 pad1; 898 | __u16 pad2; 899 | __u32 ifindex; 900 | __u32 filter_mask; 901 | }; 902 | 903 | /* A stats attribute can be netdev specific or a global stat. 904 | * For netdev stats, lets use the prefix IFLA_STATS_LINK_* 905 | */ 906 | enum { 907 | IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ 908 | IFLA_STATS_LINK_64, 909 | IFLA_STATS_LINK_XSTATS, 910 | IFLA_STATS_LINK_XSTATS_SLAVE, 911 | IFLA_STATS_LINK_OFFLOAD_XSTATS, 912 | IFLA_STATS_AF_SPEC, 913 | __IFLA_STATS_MAX, 914 | }; 915 | 916 | #define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) 917 | 918 | #define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) 919 | 920 | /* These are embedded into IFLA_STATS_LINK_XSTATS: 921 | * [IFLA_STATS_LINK_XSTATS] 922 | * -> [LINK_XSTATS_TYPE_xxx] 923 | * -> [rtnl link type specific attributes] 924 | */ 925 | enum { 926 | LINK_XSTATS_TYPE_UNSPEC, 927 | LINK_XSTATS_TYPE_BRIDGE, 928 | LINK_XSTATS_TYPE_BOND, 929 | __LINK_XSTATS_TYPE_MAX 930 | }; 931 | #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) 932 | 933 | /* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ 934 | enum { 935 | IFLA_OFFLOAD_XSTATS_UNSPEC, 936 | IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ 937 | __IFLA_OFFLOAD_XSTATS_MAX 938 | }; 939 | #define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) 940 | 941 | /* XDP section */ 942 | 943 | #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) 944 | #define XDP_FLAGS_SKB_MODE (1U << 1) 
945 | #define XDP_FLAGS_DRV_MODE (1U << 2) 946 | #define XDP_FLAGS_HW_MODE (1U << 3) 947 | #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ 948 | XDP_FLAGS_DRV_MODE | \ 949 | XDP_FLAGS_HW_MODE) 950 | #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ 951 | XDP_FLAGS_MODES) 952 | 953 | /* These are stored into IFLA_XDP_ATTACHED on dump. */ 954 | enum { 955 | XDP_ATTACHED_NONE = 0, 956 | XDP_ATTACHED_DRV, 957 | XDP_ATTACHED_SKB, 958 | XDP_ATTACHED_HW, 959 | XDP_ATTACHED_MULTI, 960 | }; 961 | 962 | enum { 963 | IFLA_XDP_UNSPEC, 964 | IFLA_XDP_FD, 965 | IFLA_XDP_ATTACHED, 966 | IFLA_XDP_FLAGS, 967 | IFLA_XDP_PROG_ID, 968 | IFLA_XDP_DRV_PROG_ID, 969 | IFLA_XDP_SKB_PROG_ID, 970 | IFLA_XDP_HW_PROG_ID, 971 | __IFLA_XDP_MAX, 972 | }; 973 | 974 | #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) 975 | 976 | enum { 977 | IFLA_EVENT_NONE, 978 | IFLA_EVENT_REBOOT, /* internal reset / reboot */ 979 | IFLA_EVENT_FEATURES, /* change in offload features */ 980 | IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ 981 | IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. 
arp/ndisc */ 982 | IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ 983 | IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ 984 | }; 985 | 986 | /* tun section */ 987 | 988 | enum { 989 | IFLA_TUN_UNSPEC, 990 | IFLA_TUN_OWNER, 991 | IFLA_TUN_GROUP, 992 | IFLA_TUN_TYPE, 993 | IFLA_TUN_PI, 994 | IFLA_TUN_VNET_HDR, 995 | IFLA_TUN_PERSIST, 996 | IFLA_TUN_MULTI_QUEUE, 997 | IFLA_TUN_NUM_QUEUES, 998 | IFLA_TUN_NUM_DISABLED_QUEUES, 999 | __IFLA_TUN_MAX, 1000 | }; 1001 | 1002 | #define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) 1003 | 1004 | /* rmnet section */ 1005 | 1006 | #define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) 1007 | #define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) 1008 | #define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) 1009 | #define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) 1010 | 1011 | enum { 1012 | IFLA_RMNET_UNSPEC, 1013 | IFLA_RMNET_MUX_ID, 1014 | IFLA_RMNET_FLAGS, 1015 | __IFLA_RMNET_MAX, 1016 | }; 1017 | 1018 | #define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) 1019 | 1020 | struct ifla_rmnet_flags { 1021 | __u32 flags; 1022 | __u32 mask; 1023 | }; 1024 | 1025 | #endif /* _UAPI_LINUX_IF_LINK_H */ 1026 | -------------------------------------------------------------------------------- /headers/linux/if_xdp.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 | /* 3 | * if_xdp: XDP socket user-space interface 4 | * Copyright(c) 2018 Intel Corporation. 5 | * 6 | * Author(s): Björn Töpel 7 | * Magnus Karlsson 8 | */ 9 | 10 | #ifndef _LINUX_IF_XDP_H 11 | #define _LINUX_IF_XDP_H 12 | 13 | #include 14 | 15 | /* Options for the sxdp_flags field */ 16 | #define XDP_SHARED_UMEM (1 << 0) 17 | #define XDP_COPY (1 << 1) /* Force copy-mode */ 18 | #define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ 19 | /* If this option is set, the driver might go sleep and in that case 20 | * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be 21 | * set. 
If it is set, the application need to explicitly wake up the 22 | * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are 23 | * running the driver and the application on the same core, you should 24 | * use this option so that the kernel will yield to the user space 25 | * application. 26 | */ 27 | #define XDP_USE_NEED_WAKEUP (1 << 3) 28 | 29 | /* Flags for xsk_umem_config flags */ 30 | #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) 31 | 32 | struct sockaddr_xdp { 33 | __u16 sxdp_family; 34 | __u16 sxdp_flags; 35 | __u32 sxdp_ifindex; 36 | __u32 sxdp_queue_id; 37 | __u32 sxdp_shared_umem_fd; 38 | }; 39 | 40 | /* XDP_RING flags */ 41 | #define XDP_RING_NEED_WAKEUP (1 << 0) 42 | 43 | struct xdp_ring_offset { 44 | __u64 producer; 45 | __u64 consumer; 46 | __u64 desc; 47 | __u64 flags; 48 | }; 49 | 50 | struct xdp_mmap_offsets { 51 | struct xdp_ring_offset rx; 52 | struct xdp_ring_offset tx; 53 | struct xdp_ring_offset fr; /* Fill */ 54 | struct xdp_ring_offset cr; /* Completion */ 55 | }; 56 | 57 | /* XDP socket options */ 58 | #define XDP_MMAP_OFFSETS 1 59 | #define XDP_RX_RING 2 60 | #define XDP_TX_RING 3 61 | #define XDP_UMEM_REG 4 62 | #define XDP_UMEM_FILL_RING 5 63 | #define XDP_UMEM_COMPLETION_RING 6 64 | #define XDP_STATISTICS 7 65 | #define XDP_OPTIONS 8 66 | 67 | struct xdp_umem_reg { 68 | __u64 addr; /* Start of packet data area */ 69 | __u64 len; /* Length of packet data area */ 70 | __u32 chunk_size; 71 | __u32 headroom; 72 | __u32 flags; 73 | }; 74 | 75 | struct xdp_statistics { 76 | __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ 77 | __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ 78 | __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ 79 | }; 80 | 81 | struct xdp_options { 82 | __u32 flags; 83 | }; 84 | 85 | /* Flags for the flags field of struct xdp_options */ 86 | #define XDP_OPTIONS_ZEROCOPY (1 << 0) 87 | 88 | /* Pgoff for mmaping the rings */ 89 | #define XDP_PGOFF_RX_RING 0 90 | 
#define XDP_PGOFF_TX_RING 0x80000000 91 | #define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL 92 | #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL 93 | 94 | /* Masks for unaligned chunks mode */ 95 | #define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 96 | #define XSK_UNALIGNED_BUF_ADDR_MASK \ 97 | ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) 98 | 99 | /* Rx/Tx descriptor */ 100 | struct xdp_desc { 101 | __u64 addr; 102 | __u32 len; 103 | __u32 options; 104 | }; 105 | 106 | /* UMEM descriptor is __u64 */ 107 | 108 | #endif /* _LINUX_IF_XDP_H */ 109 | -------------------------------------------------------------------------------- /headers/perf-sys.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 */ 2 | /* Copied from $(LINUX)/tools/perf/perf-sys.h (kernel 4.18) */ 3 | #ifndef _PERF_SYS_H 4 | #define _PERF_SYS_H 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | /* 12 | * remove the following headers to allow for userspace program compilation 13 | * #include 14 | * #include 15 | */ 16 | #ifdef __powerpc__ 17 | #define CPUINFO_PROC {"cpu"} 18 | #endif 19 | 20 | #ifdef __s390__ 21 | #define CPUINFO_PROC {"vendor_id"} 22 | #endif 23 | 24 | #ifdef __sh__ 25 | #define CPUINFO_PROC {"cpu type"} 26 | #endif 27 | 28 | #ifdef __hppa__ 29 | #define CPUINFO_PROC {"cpu"} 30 | #endif 31 | 32 | #ifdef __sparc__ 33 | #define CPUINFO_PROC {"cpu"} 34 | #endif 35 | 36 | #ifdef __alpha__ 37 | #define CPUINFO_PROC {"cpu model"} 38 | #endif 39 | 40 | #ifdef __arm__ 41 | #define CPUINFO_PROC {"model name", "Processor"} 42 | #endif 43 | 44 | #ifdef __mips__ 45 | #define CPUINFO_PROC {"cpu model"} 46 | #endif 47 | 48 | #ifdef __arc__ 49 | #define CPUINFO_PROC {"Processor"} 50 | #endif 51 | 52 | #ifdef __xtensa__ 53 | #define CPUINFO_PROC {"core ID"} 54 | #endif 55 | 56 | #ifndef CPUINFO_PROC 57 | #define CPUINFO_PROC { "model name", } 58 | #endif 59 | 60 | static inline int 61 | 
sys_perf_event_open(struct perf_event_attr *attr, 62 | pid_t pid, int cpu, int group_fd, 63 | unsigned long flags) 64 | { 65 | int fd; 66 | 67 | fd = syscall(__NR_perf_event_open, attr, pid, cpu, 68 | group_fd, flags); 69 | 70 | #ifdef HAVE_ATTR_TEST 71 | if (unlikely(test_attr__enabled)) 72 | test_attr__open(attr, pid, cpu, fd, group_fd, flags); 73 | #endif 74 | return fd; 75 | } 76 | 77 | #endif /* _PERF_SYS_H */ 78 | -------------------------------------------------------------------------------- /scripts/ci_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -Eeuo pipefail 3 | 4 | # Usage: 5 | # try_load tunnel_type xdp_executable tunnel_config 6 | try_load() { 7 | TUNNEL_TYPE=$1 8 | XDP_EXECUTABLE=$2 9 | TUNNEL_CONFIG=${@:3} 10 | TUNNEL_INTERFACE_NAME=test1 11 | 12 | echo "Testing ${XDP_EXECUTABLE} on ${TUNNEL_TYPE}..." 13 | 14 | ip link del ${TUNNEL_INTERFACE_NAME} || true 15 | ip link add ${TUNNEL_INTERFACE_NAME} type ${TUNNEL_TYPE} ${TUNNEL_CONFIG} 16 | ip link set ${TUNNEL_INTERFACE_NAME} up 17 | ip link set dev ${TUNNEL_INTERFACE_NAME} xdp object "${XDP_EXECUTABLE}" 18 | ip link del ${TUNNEL_INTERFACE_NAME} 19 | } 20 | 21 | if [ $EUID -ne 0 ]; then 22 | echo "This script must be run as root" 23 | exit 1 24 | fi 25 | 26 | cd "$( dirname "${BASH_SOURCE[0]}" )"/.. 
27 | 
28 | modprobe ip_gre
29 | 
30 | try_load gre build/keepalive_gre.o local 169.254.1.1 remote 169.254.1.2 ttl 255
31 | try_load ip6gre build/keepalive_gre6.o local fd00::1 remote fd00::2 ttl 255
32 | 
--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #ifndef __COMMON_H__
3 | #define __COMMON_H__
4 | 
5 | struct gre_hdr {
6 |     __be16 flags;
7 |     __be16 proto;
8 | };
9 | 
10 | // Helpers must be static and __always_inline, otherwise loading fails with `Error fetching program/map!`
11 | static __always_inline bool compare_ipv6_address(struct in6_addr *a, struct in6_addr *b) {
12 | #pragma unroll
13 |     for (int i = 0; i < 4; ++i) {
14 |         if (a->in6_u.u6_addr32[i] != b->in6_u.u6_addr32[i]) return false;
15 |     }
16 |     return true;
17 | }
18 | 
19 | #endif
--------------------------------------------------------------------------------
/src/keepalive_gre.c:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: GPL-2.0 */
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include "common.h"
17 | 
18 | // enable debug print
19 | // #define DEBUG
20 | // enable packet header dump
21 | // #define DEBUG_PRINT_HEADER_SIZE 32
22 | 
23 | char _license[4] SEC("license") = "GPL";
24 | 
25 | SEC("prog")
26 | int xdp_gre_keepalive_func(struct xdp_md *ctx)
27 | {
28 |     // packet bounds, used for verifier-mandated border checking
29 |     void *data_start = (void *)(long)ctx->data;
30 |     void *data_end = (void *)(long)ctx->data_end;
31 | 
32 |     // result; anything we do not recognize is passed up the stack unchanged
33 |     __u32 action = XDP_PASS;
34 | 
35 |     // current parsed header position pointer
36 |     void *dataptr = data_start;
37 | 
38 | #ifdef DEBUG
39 |     bpf_printk("New packet\n");
40 | #endif
41 | 
42 |     // debug print packet header
43 | #if (defined DEBUG_PRINT_HEADER_SIZE) && (DEBUG_PRINT_HEADER_SIZE > 0)
44 |     // checking for out-of-border access is necessary, kernel will run static analysis on our program
45 |     if ((dataptr + DEBUG_PRINT_HEADER_SIZE) > data_end) {
46 |         bpf_printk("Packet size too small, dump failed\n");
47 |         goto out;
48 |     }
49 |     __u8 *data_raw = (__u8 *)dataptr;
50 |     bpf_printk("Packet header dump:\n");
51 | #pragma unroll
52 |     for (int i = 0; i < DEBUG_PRINT_HEADER_SIZE; ++i) {
53 |         bpf_printk("#%d: %x\n", i, data_raw[i]);
54 |     }
55 | #endif
56 | 
57 |     struct iphdr *outer_iphdr;
58 | 
59 |     // GRE packet directly starts with an IPv4 header (tunnel mode gre, no ethernet framing)
60 |     if ((dataptr + 1) > data_end) goto out;
61 |     if ((((__u8 *)dataptr)[0] & 0xF0) != 0x40) {
62 |         goto out;
63 |     }
64 | 
65 |     if (dataptr + sizeof(struct iphdr) > data_end) goto out; // truncated: not ours, pass it on (returning -1 is not a valid xdp_action)
66 |     outer_iphdr = (struct iphdr *)dataptr;
67 |     dataptr += sizeof(struct iphdr);
68 | 
69 |     // now we are at the outer GRE header
70 |     if (dataptr + sizeof(struct gre_hdr) > data_end) goto out;
71 |     struct gre_hdr *outer_grehdr = (struct gre_hdr *)(dataptr);
72 |     dataptr += sizeof(struct gre_hdr);
73 | #ifdef DEBUG
74 |     bpf_printk("Outer GRE flags=0x%x proto=%x\n", outer_grehdr->flags, outer_grehdr->proto);
75 | #endif
76 | 
77 |     // here is all the headers we need to chop off before sending the packet back
78 |     void *cutoff_pos = dataptr;
79 | 
80 |     // parse inner IP header
81 |     if (outer_grehdr -> proto == bpf_htons(ETH_P_IP)) {
82 |         if (dataptr + 1 > data_end) goto out; // need the first byte to read ihl
83 |         struct iphdr *inner_iphdr = dataptr;
84 |         int ip_header_size = (inner_iphdr -> ihl) * 4;
85 |         if (dataptr + 20 > data_end) goto out; // workaround kernel static check
86 |         if (dataptr + ip_header_size > data_end) goto out;
87 |         dataptr += ip_header_size;
88 |         __u8 inner_ip_proto = inner_iphdr -> protocol;
89 | #ifdef DEBUG
90 |         bpf_printk("IPv4 packet_size=0x%x, proto=0x%x\n", ip_header_size, inner_ip_proto);
91 | #endif
92 | 
93 |         // check if it is a GRE encapsulated in an IPv4 packet
94 |         if (inner_ip_proto != IPPROTO_GRE) goto out;
95 | 
96 |         // get the inner GRE header
97 |         if (dataptr + sizeof(struct gre_hdr) > data_end) goto out;
98 |         struct gre_hdr *inner_grehdr = (struct gre_hdr *)(dataptr);
99 |         dataptr += sizeof(struct gre_hdr);
100 | #ifdef DEBUG
101 |         bpf_printk("Inner is GRE4, proto=%x\n", inner_grehdr -> proto);
102 | #endif
103 | 
104 |         // check if the GRE header is keepalive
105 |         // we need:
106 |         // * proto == 0
107 |         // * ip address match (inner src/dst are the outer dst/src, i.e. the packet bounced back)
108 |         //
109 |         if (
110 |             inner_grehdr -> proto != 0
111 |             || inner_iphdr -> saddr != outer_iphdr -> daddr
112 |             || inner_iphdr -> daddr != outer_iphdr -> saddr
113 |         ) goto out;
114 | #ifdef DEBUG
115 |         bpf_printk("GRE4 keepalive received!\n");
116 | #endif
117 | 
118 |     } else {
119 |         // unknown protocol
120 | #ifdef DEBUG
121 |         bpf_printk("Unknown proto %x inside GRE", outer_grehdr->proto);
122 | #endif
123 |         goto out;
124 |     }
125 | 
126 |     // remove the header and send the packet back
127 |     if (bpf_xdp_adjust_head(ctx, (int)(cutoff_pos - data_start))) goto out; // on failure the packet is untouched, just pass it
128 |     action = XDP_TX;
129 | 
130 | out:
131 |     return action;
132 | }
133 | 
--------------------------------------------------------------------------------
/src/keepalive_gre6.c:
--------------------------------------------------------------------------------
1 | /* SPDX-License-Identifier: GPL-2.0 */
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include "common.h"
17 | 
18 | // enable debug print
19 | // #define DEBUG
20 | // enable packet header dump
21 | // #define DEBUG_PRINT_HEADER_SIZE 32
22 | 
23 | char _license[4] SEC("license") = "GPL";
24 | 
25 | SEC("prog")
26 | int xdp_keepalive_gre6(struct xdp_md *ctx)
27 | {
28 |     // packet bounds, used for verifier-mandated border checking
29 |     void *data_start = (void *)(long)ctx->data;
30 |     void *data_end = (void *)(long)ctx->data_end;
31 | 
32 |     // result; anything we do not recognize is passed up the stack unchanged
33 |     __u32 action = XDP_PASS;
34 | 
35 |     // current parsed header position pointer
36 |     void *dataptr = data_start;
37 | 
38 | #ifdef DEBUG
39 |     bpf_printk("New packet\n");
40 | #endif
41 | 
42 |     // debug print packet header
43 | #if (defined DEBUG_PRINT_HEADER_SIZE) && (DEBUG_PRINT_HEADER_SIZE > 0)
44 |     // checking for out-of-border access is necessary, kernel will run static analysis on our program
45 |     if ((dataptr + DEBUG_PRINT_HEADER_SIZE) > data_end) {
46 |         bpf_printk("Packet size too small, dump failed\n");
47 |         goto out;
48 |     }
49 |     __u8 *data_raw = (__u8 *)dataptr;
50 |     bpf_printk("Packet header dump:\n");
51 | #pragma unroll
52 |     for (int i = 0; i < DEBUG_PRINT_HEADER_SIZE; ++i) {
53 |         bpf_printk("#%d: %x\n", i, data_raw[i]);
54 |     }
55 | #endif
56 | 
57 |     struct ipv6hdr *outer_ipv6hdr;
58 | 
59 |     // if the packet is from GREv6 (tunnel mode ip6gre), then it starts with an ethernet header:
60 |     // * dst MAC address (6 bytes)
61 |     // * src MAC address (6 bytes)
62 |     // * ethernet proto (0x86dd, 2 bytes)
63 |     // Then comes IPv6 header.
64 |     // So we skip the first 12 bytes and verify ethernet proto field and IPv6 header version field
65 |     if ((dataptr + 15) > data_end) goto out;
66 |     if (!(
67 |         ((__u16 *)dataptr)[6] == bpf_htons(ETH_P_IPV6) // was hard-coded 0xdd86, which is only correct on little-endian targets
68 |         && (((__u8 *)dataptr)[14] & 0xF0) == 0x60
69 |     )) {
70 |         // cannot verify packet header
71 |         goto out;
72 |     }
73 | 
74 |     dataptr += 14; // skip to the IPv6 header
75 | 
76 |     if (dataptr + sizeof(struct ipv6hdr) > data_end) goto out; // truncated: not ours, pass it on (returning -1 is not a valid xdp_action)
77 |     outer_ipv6hdr = (struct ipv6hdr *)dataptr;
78 |     dataptr += sizeof(struct ipv6hdr);
79 | 
80 |     // now we are at the outer GRE header
81 |     if (dataptr + sizeof(struct gre_hdr) > data_end) goto out;
82 |     struct gre_hdr *outer_grehdr = (struct gre_hdr *)(dataptr);
83 |     dataptr += sizeof(struct gre_hdr);
84 | #ifdef DEBUG
85 |     bpf_printk("Outer GRE flags=0x%x proto=%x\n", outer_grehdr->flags, outer_grehdr->proto);
86 | #endif
87 | 
88 |     // here is all the headers we need to chop off before sending the packet back
89 |     void *cutoff_pos = dataptr;
90 | 
91 |     // parse inner IP header (must be an IPv6 header too)
92 |     if (outer_grehdr->proto == bpf_htons(ETH_P_IPV6)) {
93 |         if (dataptr + sizeof(struct ipv6hdr) + 1 > data_end) goto out;
94 |         struct ipv6hdr *inner_ipv6hdr = (struct ipv6hdr *)(dataptr);
95 |         dataptr += sizeof(struct ipv6hdr);
96 |         __u8 inner_ip_proto = inner_ipv6hdr -> nexthdr;
97 | #ifdef DEBUG
98 |         bpf_printk("IPv6 proto=0x%x\n", inner_ip_proto);
99 | #endif
100 | 
101 |         // check if it is a GRE encapsulated in an IPv6 packet
102 |         if (inner_ip_proto != IPPROTO_GRE) goto out;
103 | 
104 |         // get the inner GRE header
105 |         if (dataptr + sizeof(struct gre_hdr) > data_end) goto out;
106 |         struct gre_hdr *inner_grehdr = (struct gre_hdr *)(dataptr);
107 |         dataptr += sizeof(struct gre_hdr);
108 | #ifdef DEBUG
109 |         bpf_printk("Inner is GRE6, proto %x\n", inner_grehdr -> proto);
110 | #endif
111 | 
112 |         // check if the GRE packet is a keepalive packet (inner src/dst must mirror outer dst/src)
113 |         if (
114 |             inner_grehdr -> proto != bpf_htons(ETH_P_IPV6) // was hard-coded 0xdd86; seems to be the case for MikroTik RouterOS, TODO: verify compatibility with other vendors
115 |             || !compare_ipv6_address(&(outer_ipv6hdr -> saddr), &(inner_ipv6hdr -> daddr))
116 |             || !compare_ipv6_address(&(outer_ipv6hdr -> daddr), &(inner_ipv6hdr -> saddr))
117 |         ) goto out;
118 | #ifdef DEBUG
119 |         bpf_printk("GRE6 keepalive received!\n");
120 | #endif
121 | 
122 |     } else {
123 |         // unknown protocol
124 | #ifdef DEBUG
125 |         bpf_printk("Unknown proto %x inside GRE", outer_grehdr->proto);
126 | #endif
127 |         goto out;
128 |     }
129 | 
130 |     // remove the header and send the packet back
131 |     if (bpf_xdp_adjust_head(ctx, (int)(cutoff_pos - data_start))) goto out; // on failure the packet is untouched, just pass it
132 |     action = XDP_TX;
133 | 
134 | out:
135 |     return action;
136 | }
137 | 
--------------------------------------------------------------------------------