├── .clang-format ├── .github └── workflows │ ├── semgrep.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── cgo.go ├── cgo_test.go ├── fixed_point.go ├── fixed_point_test.go ├── generalisation_test.go ├── go.mod ├── go.sum ├── include ├── Makefile ├── bpf │ ├── bpf_endian.h │ ├── bpf_helper_defs.h │ └── bpf_helpers.h ├── fasthash.h ├── in.h ├── ip.h ├── linux │ ├── bpf.h │ ├── bpf_common.h │ └── types.h ├── lookup3.h ├── mindef.h └── stdbool.h ├── rake_bpfeb.go ├── rake_bpfel.go ├── rakelimit.go ├── rakelimit_test.go └── src ├── common.h ├── countmin.h ├── ewma.h ├── fixed-point.h └── rakelimit.c /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: LLVM 4 | AlignAfterOpenBracket: DontAlign 5 | AlignConsecutiveAssignments: true 6 | AlignEscapedNewlines: DontAlign 7 | AlwaysBreakBeforeMultilineStrings: true 8 | AlwaysBreakTemplateDeclarations: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortFunctionsOnASingleLine: false 11 | BreakBeforeBraces: Linux 12 | IndentWidth: 4 13 | KeepEmptyLinesAtTheStartOfBlocks: false 14 | TabWidth: 4 15 | UseTab: ForContinuationAndIndentation 16 | ColumnLimit: 1000 17 | ... 18 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | schedule: 10 | - cron: '0 0 * * *' 11 | name: Semgrep config 12 | jobs: 13 | semgrep: 14 | name: semgrep/ci 15 | runs-on: ubuntu-20.04 16 | env: 17 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 18 | SEMGREP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 20 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 21 | container: 22 | image: returntocorp/semgrep 23 | steps: 24 | - uses: actions/checkout@v3 25 | - run: semgrep ci 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # Anytime we push to any branch 2 | on: push 3 | 4 | jobs: 5 | test: 6 | name: Test 7 | runs-on: ubuntu-20.04 8 | 9 | steps: 10 | - name: Set up Go 1.x 11 | uses: actions/setup-go@v2 12 | with: 13 | go-version: ^1.15 14 | 15 | - name: Check out code into the Go module directory 16 | uses: actions/checkout@v2 17 | 18 | - name: Install clang 19 | run: | 20 | wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | sudo apt-key add - 21 | echo 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-12 main' | sudo tee /etc/apt/sources.list.d/clang.list 22 | # Only update the llvm repo, this is a lot faster. 23 | sudo apt-get update -o Dir::Etc::sourcelist="sources.list.d/clang.list" -o Dir::Etc::sourceparts="-" -o APT::Get::List-Cleanup="0" 24 | sudo apt-get install -y --no-install-recommends clang-12 25 | 26 | - name: Check lint 27 | # gofmt doesn't report any changes 28 | run: test -z $(gofmt -l ./ | tee /dev/stderr) 29 | 30 | - name: Run tests 31 | run: | 32 | sudo sysctl -w net.core.optmem_max=22528 33 | go test -tags cgotest -exec sudo ./... 
34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | *.o 8 | bin/ 9 | deb/ 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | *.json 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, Cloudflare. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rakelimit 2 | 3 | A multi-dimensional fair-share rate limiter in BPF, designed for UDP. 4 | The algorithm is based on Hierarchical Heavy Hitters, and ensures that no party can exceed 5 | a certain rate of packets. For more information please take a look at our [blog post](https://blog.cloudflare.com/building-rakelimit/). 6 | 7 | ## Usage 8 | 9 | To activate rakelimit create a new instance and provide a file descriptor and a rate limit that you think the 10 | service in question won't be able to handle anymore: 11 | 12 | ```go 13 | 14 | conn, err := net.ListenPacket("udp4", "127.0.0.1:0") 15 | if err != nil { 16 | tb.Fatal("Can't listen:", err) 17 | } 18 | udpConn := conn.(*net.UDPConn) 19 | 20 | // We don't want to allow anyone to use more than 128 packets per second 21 | ppsPerSecond := 128 22 | rake, err := New(udpConn, ppsPerSecond) 23 | defer rake.Close() 24 | // rate limiter stays active even after closing 25 | ``` 26 | 27 | That's all! The library now enforces rate limits on incoming packets, and it happens within the kernel. 28 | 29 | ## Requirements 30 | 31 | The library should be go-gettable, and has been tested on Linux 5.11. 
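Since the snippet under Usage leaves out imports and error handling, here it is expanded into a self-contained sketch; the read loop, the error handling and the 128 pps figure are illustrative, and the exact signature of `New` should be checked against the package documentation:

```go
package main

import (
	"log"
	"net"

	"github.com/cloudflare/rakelimit"
)

func main() {
	conn, err := net.ListenPacket("udp4", "127.0.0.1:0")
	if err != nil {
		log.Fatal("Can't listen:", err)
	}
	udpConn := conn.(*net.UDPConn)

	// Illustrative limit: no single party should exceed 128 packets per second.
	const packetsPerSecond = 128

	rake, err := rakelimit.New(udpConn, packetsPerSecond)
	if err != nil {
		log.Fatal("Can't create rate limiter:", err)
	}
	// Close releases the Go-side resources; as noted in the Usage snippet, the
	// filter itself stays attached to the socket.
	defer rake.Close()

	// Read as usual: packets above the limit are dropped inside the kernel and
	// never reach ReadFrom.
	buf := make([]byte, 1500)
	for {
		n, addr, err := udpConn.ReadFrom(buf)
		if err != nil {
			log.Fatal(err)
		}
		log.Printf("%d bytes from %s", n, addr)
	}
}
```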
32 | 33 | You may have to increase optmem_max depending on your distribution: 34 | 35 | ``` 36 | sudo sysctl -w net.core.optmem_max=22528 37 | ``` 38 | 39 | You will need a `clang-12` binary if you want to recompile the filter. Simply run `go generate` in the root of the project. 40 | 41 | ## Limitations 42 | - IPv6 doesn't support options 43 | - requires tweaking of optmem 44 | - not tested in production 45 | 46 | ## Testing 47 | 48 | ``` 49 | go test . 50 | ``` 51 | -------------------------------------------------------------------------------- /cgo.go: -------------------------------------------------------------------------------- 1 | // +build cgo,cgotest 2 | 3 | package rakelimit 4 | 5 | // #cgo CFLAGS: -Iinclude 6 | // #include "stdlib.h" 7 | // #include "fasthash.h" 8 | import "C" 9 | 10 | func fasthash64(buf []byte) uint64 { 11 | ptr := C.CBytes(buf) 12 | defer C.free(ptr) 13 | 14 | return uint64(C.fasthash64(ptr, C.__u64(len(buf)), 0)) 15 | } 16 | -------------------------------------------------------------------------------- /cgo_test.go: -------------------------------------------------------------------------------- 1 | // +build cgo,cgotest 2 | 3 | package rakelimit 4 | 5 | import ( 6 | "encoding/hex" 7 | "testing" 8 | ) 9 | 10 | func TestFasthash64(t *testing.T) { 11 | golden := []struct { 12 | input []byte 13 | hash uint64 14 | }{ 15 | {[]byte("asdefg"), 0x07ffd15db88b150b}, 16 | {[]byte("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."), 0xbb1655682c0ac75d}, 17 | } 18 | 19 | for _, gold := range golden { 20 | have := fasthash64(gold.input) 21 | if have != gold.hash { 22 | t.Logf("\n%s", hex.Dump(gold.input)) 23 | t.Errorf("Expected hash %016x, got %016x", gold.hash, have) 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /fixed_point.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | const fractionBits = 32 8 | 9 | func floatToFixed(f float64) uint64 { 10 | ret := uint64(0) 11 | for i := 64 - fractionBits; i >= -fractionBits; i-- { 12 | ret = ret << 1 13 | if f >= math.Pow(2, float64(i)) { 14 | ret |= 1 15 | f -= math.Pow(2, float64(i)) 16 | } 17 | } 18 | return ret 19 | } 20 | 21 | func fixedToFloat(f uint64) float64 { 22 | ret := float64(0) 23 | for i := 64 - fractionBits - 1; i >= -fractionBits; i-- { 24 | if f&(1<<(i+fractionBits)) != 0 { 25 | ret += math.Pow(2, float64(i)) 26 | } 27 | } 28 | return ret 29 | } 30 | -------------------------------------------------------------------------------- /fixed_point_test.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func TestFloatToFixedPoint(t *testing.T) { 9 | x := float64(1.0 / 7.0) 10 | y := fixedToFloat(floatToFixed(x)) 11 | if math.Abs(y-x) > 0.000000001 { 12 | t.Fatal("Difference too large", x, y) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /generalisation_test.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "fmt" 7 | "net" 8 | "os" 9 | "testing" 10 | 11 | "math" 12 | "math/rand" 13 | "sort" 14 | "time" 15 | 16 | "github.com/google/gopacket" 17 | "github.com/google/gopacket/layers" 18 | ) 19 | 20 | var seed int64 21 | 22 
| func TestMain(m *testing.M) { 23 | flag.Int64Var(&seed, "seed", 0, "seed for the random number generator") 24 | flag.Parse() 25 | 26 | if seed == 0 { 27 | seed = time.Now().UnixNano() 28 | } 29 | 30 | fmt.Println("Seed is", seed) 31 | os.Exit(m.Run()) 32 | } 33 | 34 | type element struct { 35 | SourceAddress net.IP 36 | SourcePort int 37 | DestinationAddress net.IP 38 | DestinationPort int 39 | } 40 | 41 | func (el *element) Clone() *element { 42 | newEl := element{ 43 | SourcePort: el.SourcePort, 44 | DestinationPort: el.DestinationPort, 45 | SourceAddress: make([]byte, len(el.SourceAddress)), 46 | DestinationAddress: make([]byte, len(el.DestinationAddress)), 47 | } 48 | 49 | copy(newEl.SourceAddress, el.SourceAddress) 50 | copy(newEl.DestinationAddress, el.DestinationAddress) 51 | 52 | return &newEl 53 | } 54 | 55 | func (el *element) String() string { 56 | return fmt.Sprintf("%s:%d --> %s:%d", el.SourceAddress, el.SourcePort, el.DestinationAddress, el.DestinationPort) 57 | } 58 | 59 | func (el *element) marshal() []byte { 60 | var packet []gopacket.SerializableLayer 61 | if len(el.SourceAddress) == net.IPv4len { 62 | packet = []gopacket.SerializableLayer{ 63 | &layers.Ethernet{ 64 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 65 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 66 | EthernetType: layers.EthernetTypeIPv4, 67 | }, 68 | &layers.IPv4{ 69 | Version: 4, 70 | SrcIP: el.SourceAddress, 71 | DstIP: el.DestinationAddress, 72 | Protocol: layers.IPProtocolUDP, 73 | }, 74 | &layers.UDP{ 75 | SrcPort: layers.UDPPort(el.SourcePort), 76 | DstPort: layers.UDPPort(el.DestinationPort), 77 | }, 78 | gopacket.Payload([]byte{1, 2, 3, 4}), 79 | } 80 | } else { 81 | packet = []gopacket.SerializableLayer{ 82 | &layers.Ethernet{ 83 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 84 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 85 | EthernetType: layers.EthernetTypeIPv6, 86 | }, 87 | &layers.IPv6{ 88 | Version: 6, 89 | SrcIP: el.SourceAddress, 90 | DstIP: el.DestinationAddress, 91 | NextHeader: layers.IPProtocolUDP, 92 | }, 93 | &layers.UDP{ 94 | SrcPort: layers.UDPPort(el.SourcePort), 95 | DstPort: layers.UDPPort(el.DestinationPort), 96 | }, 97 | gopacket.Payload([]byte{1, 2, 3, 4}), 98 | } 99 | } 100 | 101 | buf := gopacket.NewSerializeBuffer() 102 | opts := gopacket.SerializeOptions{ 103 | FixLengths: true, 104 | } 105 | gopacket.SerializeLayers(buf, opts, packet...) 106 | return buf.Bytes() 107 | } 108 | 109 | type packet struct { 110 | received uint64 111 | key string 112 | element 113 | } 114 | 115 | type packetSpec struct { 116 | key string 117 | rate int 118 | element 119 | } 120 | 121 | func generatePackets(duration time.Duration, specs ...packetSpec) []packet { 122 | // specs describe individual streams of packets that "arrive" concurrently. 123 | // We need to emit packets from the specs in the correct order, determined 124 | // by their rate. 
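// For example, a spec with rate 100 over a 2s duration expands into 200 steps spaced
// 10ms apart; the steps of all specs are then merged and sorted by timestamp so the
// emitted packets interleave the way concurrent streams would.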
125 | type step struct { 126 | now uint64 127 | packetSpec 128 | } 129 | 130 | var steps []step 131 | for _, spec := range specs { 132 | interval := time.Second / time.Duration(spec.rate) / time.Nanosecond 133 | for i := 0; i < int(duration/interval); i++ { 134 | steps = append(steps, step{ 135 | uint64(i) * uint64(interval), 136 | spec, 137 | }) 138 | } 139 | 140 | } 141 | 142 | sort.Slice(steps, func(i, j int) bool { 143 | return steps[i].now < steps[j].now 144 | }) 145 | 146 | rng := rand.New(rand.NewSource(seed)) 147 | incompleteIP := func(ip net.IP) bool { 148 | return len(ip) != net.IPv4len && len(ip) != net.IPv6len 149 | } 150 | 151 | var packets []packet 152 | var prev element 153 | for _, step := range steps { 154 | source := step.SourceAddress 155 | if incompleteIP(source) { 156 | source = randomIP(rng, prev.SourceAddress, source) 157 | } 158 | 159 | sourcePort := step.SourcePort 160 | if sourcePort == -1 { 161 | sourcePort = randomPort(rng, prev.SourcePort) 162 | } 163 | 164 | dest := step.DestinationAddress 165 | if incompleteIP(dest) { 166 | dest = randomIP(rng, prev.DestinationAddress, dest) 167 | } 168 | 169 | destPort := step.DestinationPort 170 | if destPort == -1 { 171 | destPort = randomPort(rng, prev.DestinationPort) 172 | } 173 | 174 | next := element{ 175 | source, sourcePort, 176 | dest, destPort, 177 | } 178 | 179 | packets = append(packets, packet{ 180 | received: step.now, 181 | key: step.key, 182 | element: next, 183 | }) 184 | 185 | prev = next 186 | } 187 | 188 | return packets 189 | } 190 | 191 | func randomPort(rng *rand.Rand, prevPort int) int { 192 | port := int(rng.Intn(math.MaxUint16)) 193 | for port == prevPort { 194 | port = int(rng.Intn(math.MaxUint16)) 195 | } 196 | return port 197 | } 198 | 199 | func randomIP(rng *rand.Rand, prevIP net.IP, template net.IP) net.IP { 200 | if len(template) == cap(template) { 201 | panic(fmt.Sprint("invalid template:", template)) 202 | } 203 | 204 | ip := make(net.IP, cap(template)) 205 | copy(ip, template) 206 | 207 | rand.Read(ip[len(template):]) 208 | for bytes.Equal([]byte(prevIP), []byte(ip)) { 209 | rand.Read(ip[len(template):]) 210 | } 211 | 212 | return ip 213 | } 214 | 215 | func ipTemplate(ip net.IP, ipLen int) net.IP { 216 | template := make(net.IP, len(ip), ipLen) 217 | copy(template, ip) 218 | return template 219 | } 220 | 221 | func TestRate(t *testing.T) { 222 | const ( 223 | duration = 10 * time.Second 224 | limit = 100 225 | ) 226 | 227 | rake := mustNew(t, "127.0.0.1:0", limit) 228 | 229 | packets := generatePackets(duration, packetSpec{ 230 | rate: 2 * limit, 231 | element: element{ 232 | SourceAddress: []byte{7, 6, 5, 4}, 233 | DestinationAddress: []byte{1, 2, 3, 4}, 234 | SourcePort: 53, 235 | DestinationPort: 443, 236 | }, 237 | }) 238 | 239 | var accepted int 240 | for i, packet := range packets { 241 | rake.updateTime(t, packet.received) 242 | 243 | verdict, _, err := rake.testProgram.Test(packet.marshal()) 244 | if err != nil { 245 | t.Fatal(err) 246 | } 247 | 248 | if level := rake.rateExceededOnLevel(t); i > 0 && level != 0 { 249 | t.Fatalf("Packet is matched on level %d instead of 0", level) 250 | } 251 | 252 | if verdict > 0 { 253 | accepted++ 254 | } 255 | } 256 | 257 | acceptedRate := float64(accepted) / duration.Seconds() 258 | if acceptedRate < limit*0.95 || acceptedRate > limit*1.05 { 259 | t.Errorf("Didn't match desired rate of %d: %.2f pps accepted", limit, acceptedRate) 260 | } 261 | } 262 | 263 | func TestGeneralisations(t *testing.T) { 264 | const ( 265 | limit = 100 266 | ) 267 | 
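// The cases below each pin the flow at one level of generalisation: level 0 is the
// fully specified 4-tuple, higher levels progressively widen the source network and
// randomise the source or destination port, and level 4 keeps only the destination
// address fixed. A -1 port or a truncated IP tells generatePackets to randomise that
// field for every packet.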
268 | ipv6Src := net.ParseIP("1122:3344:5566:7788::aabb") 269 | ipv6Dst := net.ParseIP("8877:6655:4433:2211::ffee") 270 | srcPort := 53 271 | dstPort := 443 272 | 273 | type testcase struct { 274 | level uint32 275 | listen string 276 | element 277 | } 278 | 279 | var generalisations []testcase 280 | for _, proto := range []struct { 281 | listen string 282 | src, srcNet net.IP 283 | dst net.IP 284 | wildcard net.IP 285 | }{ 286 | { 287 | "127.0.0.1:0", 288 | net.IP{7, 6, 5, 4}, ipTemplate(net.IP{7, 6, 5}, net.IPv4len), 289 | net.IP{1, 2, 3, 4}, 290 | ipTemplate(nil, net.IPv4len), 291 | }, 292 | { 293 | "[::1]:0", 294 | ipv6Src, ipv6Src[: 64/8 : net.IPv6len], 295 | ipv6Dst, 296 | ipTemplate(nil, net.IPv6len), 297 | }, 298 | } { 299 | generalisations = append(generalisations, 300 | // level 0 301 | testcase{0, proto.listen, element{proto.src, srcPort, proto.dst, dstPort}}, 302 | 303 | // level 1 304 | testcase{1, proto.listen, element{proto.srcNet, srcPort, proto.dst, dstPort}}, 305 | testcase{1, proto.listen, element{proto.src, -1, proto.dst, dstPort}}, 306 | testcase{1, proto.listen, element{proto.src, srcPort, proto.dst, -1}}, 307 | 308 | // level 2 309 | testcase{2, proto.listen, element{proto.wildcard, srcPort, proto.dst, dstPort}}, 310 | testcase{2, proto.listen, element{proto.srcNet, -1, proto.dst, dstPort}}, 311 | testcase{2, proto.listen, element{proto.srcNet, srcPort, proto.dst, -1}}, 312 | testcase{2, proto.listen, element{proto.src, -1, proto.dst, -1}}, 313 | 314 | // level 3 315 | testcase{3, proto.listen, element{proto.wildcard, -1, proto.dst, dstPort}}, 316 | testcase{3, proto.listen, element{proto.wildcard, srcPort, proto.dst, -1}}, 317 | testcase{3, proto.listen, element{proto.srcNet, -1, proto.dst, -1}}, 318 | 319 | // level 4 320 | testcase{4, proto.listen, element{proto.wildcard, -1, proto.dst, -1}}, 321 | ) 322 | } 323 | 324 | for _, gen := range generalisations { 325 | t.Run(gen.String(), func(t *testing.T) { 326 | rake := mustNew(t, gen.listen, limit) 327 | 328 | // Drop all packets once rate exceeds limit 329 | rake.updateRand(t, math.MaxUint32) 330 | 331 | packets := generatePackets(time.Second, packetSpec{ 332 | rate: limit + 1, 333 | element: gen.element, 334 | }) 335 | 336 | for i, packet := range packets { 337 | rake.updateTime(t, packet.received) 338 | 339 | t.Logf("%d: %s", i, &packet.element) 340 | verdict, _, err := rake.testProgram.Test(packet.marshal()) 341 | if err != nil { 342 | t.Fatal(err) 343 | } 344 | 345 | if i == 0 { 346 | if verdict == 0 { 347 | t.Fatal("First packet shouldn't be dropped") 348 | } 349 | 350 | continue 351 | } 352 | 353 | if verdict > 0 { 354 | t.Fatalf("Accepted packet #%d", i) 355 | } 356 | 357 | level := rake.rateExceededOnLevel(t) 358 | if level != gen.level { 359 | t.Fatalf("Packet #%d was dropped on level %d instead of %d", i, level, gen.level) 360 | } 361 | } 362 | }) 363 | } 364 | } 365 | 366 | func TestAttackPropagation(t *testing.T) { 367 | const limit = 2645 368 | 369 | packets := generatePackets(10*time.Second, 370 | packetSpec{ 371 | key: "attack", 372 | rate: 3 * limit, 373 | element: element{ 374 | SourceAddress: []byte{7, 6, 5, 4}, 375 | DestinationAddress: []byte{1, 2, 3, 4}, 376 | SourcePort: 53, 377 | DestinationPort: 443, 378 | }, 379 | }, 380 | packetSpec{ 381 | key: "legit", 382 | rate: limit / 2, 383 | element: element{ 384 | SourceAddress: []byte{7, 6, 5, 4}, 385 | DestinationAddress: []byte{1, 2, 3, 4}, 386 | SourcePort: -1, 387 | DestinationPort: 443, 388 | }, 389 | }, 390 | ) 391 | 392 | rake := mustNew(t, 
"127.0.0.1:0", limit) 393 | rake.updateRand(t, math.MaxUint32) 394 | for i, packet := range packets { 395 | rake.updateTime(t, packet.received) 396 | 397 | verdict, _, err := rake.testProgram.Test(packet.marshal()) 398 | if err != nil { 399 | t.Fatal(err) 400 | } 401 | 402 | if packet.key == "legit" && verdict == 0 { 403 | t.Fatalf("Dropped legitimate packet #%d: %v", i, rake.rateExceededOnLevel(t)) 404 | } 405 | } 406 | } 407 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/cloudflare/rakelimit 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/cilium/ebpf v0.5.1-0.20210527163130-29f67e0a7450 7 | github.com/google/gopacket v1.1.18 8 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c 9 | ) 10 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/cilium/ebpf v0.4.0 h1:QlHdikaxALkqWasW8hAC1mfR0jdmvbfaBdBPFmRSglA= 2 | github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 3 | github.com/cilium/ebpf v0.5.0 h1:E1KshmrMEtkMP2UjlWzfmUV1owWY+BnbL5FxxuatnrU= 4 | github.com/cilium/ebpf v0.5.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 5 | github.com/cilium/ebpf v0.5.1-0.20210526091824-eeeaaac5dac5 h1:XEgY0nNTP+a4vm1gYceREVgr4nTyLfBHBEk4x4TiI/Q= 6 | github.com/cilium/ebpf v0.5.1-0.20210526091824-eeeaaac5dac5/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 7 | github.com/cilium/ebpf v0.5.1-0.20210527163130-29f67e0a7450 h1:5xuyArKXqJdmfbPcfheMHyAswscRSBB2uJG5O0aRETA= 8 | github.com/cilium/ebpf v0.5.1-0.20210527163130-29f67e0a7450/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= 9 | github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= 10 | github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= 11 | github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= 12 | github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 13 | github.com/google/gopacket v1.1.18 h1:lum7VRA9kdlvBi7/v2p7/zcbkduHaCH/SVVyurs7OpY= 14 | github.com/google/gopacket v1.1.18/go.mod h1:UdDNZ1OO62aGYVnPhxT1U6aI7ukYtA/kB8vaU0diBUM= 15 | github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= 16 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 17 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 18 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 19 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 20 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 21 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 22 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 23 | golang.org/x/sys v0.0.0-20190405154228-4b34438f7a67/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 24 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk= 25 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 26 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 27 | 
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= 28 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 29 | -------------------------------------------------------------------------------- /include/Makefile: -------------------------------------------------------------------------------- 1 | LINUX_INC ?= /usr/include 2 | LIBBPF_INC ?= /usr/local/include 3 | 4 | .PHONY: update 5 | update: 6 | rsync --existing --exclude "types.h" -av "$(LINUX_INC)/linux/" "$(CURDIR)/linux" 7 | rsync --existing -av "$(LIBBPF_INC)/bpf/" "$(CURDIR)/bpf" 8 | -------------------------------------------------------------------------------- /include/bpf/bpf_endian.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BPF_ENDIAN__ 3 | #define __BPF_ENDIAN__ 4 | 5 | /* 6 | * Isolate byte #n and put it into byte #m, for __u##b type. 7 | * E.g., moving byte #6 (nnnnnnnn) into byte #1 (mmmmmmmm) for __u64: 8 | * 1) xxxxxxxx nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 9 | * 2) nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 00000000 10 | * 3) 00000000 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 11 | * 4) 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 00000000 12 | */ 13 | #define ___bpf_mvb(x, b, n, m) ((__u##b)(x) << (b-(n+1)*8) >> (b-8) << (m*8)) 14 | 15 | #define ___bpf_swab16(x) ((__u16)( \ 16 | ___bpf_mvb(x, 16, 0, 1) | \ 17 | ___bpf_mvb(x, 16, 1, 0))) 18 | 19 | #define ___bpf_swab32(x) ((__u32)( \ 20 | ___bpf_mvb(x, 32, 0, 3) | \ 21 | ___bpf_mvb(x, 32, 1, 2) | \ 22 | ___bpf_mvb(x, 32, 2, 1) | \ 23 | ___bpf_mvb(x, 32, 3, 0))) 24 | 25 | #define ___bpf_swab64(x) ((__u64)( \ 26 | ___bpf_mvb(x, 64, 0, 7) | \ 27 | ___bpf_mvb(x, 64, 1, 6) | \ 28 | ___bpf_mvb(x, 64, 2, 5) | \ 29 | ___bpf_mvb(x, 64, 3, 4) | \ 30 | ___bpf_mvb(x, 64, 4, 3) | \ 31 | ___bpf_mvb(x, 64, 5, 2) | \ 32 | ___bpf_mvb(x, 64, 6, 1) | \ 33 | ___bpf_mvb(x, 64, 7, 0))) 34 | 35 | /* LLVM's BPF target selects the endianness of the CPU 36 | * it compiles on, or the user specifies (bpfel/bpfeb), 37 | * respectively. The used __BYTE_ORDER__ is defined by 38 | * the compiler, we cannot rely on __BYTE_ORDER from 39 | * libc headers, since it doesn't reflect the actual 40 | * requested byte order. 41 | * 42 | * Note, LLVM's BPF target has different __builtin_bswapX() 43 | * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE 44 | * in bpfel and bpfeb case, which means below, that we map 45 | * to cpu_to_be16(). We could use it unconditionally in BPF 46 | * case, but better not rely on it, so that this header here 47 | * can be used from application and BPF program side, which 48 | * use different targets. 
49 | */ 50 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 51 | # define __bpf_ntohs(x) __builtin_bswap16(x) 52 | # define __bpf_htons(x) __builtin_bswap16(x) 53 | # define __bpf_constant_ntohs(x) ___bpf_swab16(x) 54 | # define __bpf_constant_htons(x) ___bpf_swab16(x) 55 | # define __bpf_ntohl(x) __builtin_bswap32(x) 56 | # define __bpf_htonl(x) __builtin_bswap32(x) 57 | # define __bpf_constant_ntohl(x) ___bpf_swab32(x) 58 | # define __bpf_constant_htonl(x) ___bpf_swab32(x) 59 | # define __bpf_be64_to_cpu(x) __builtin_bswap64(x) 60 | # define __bpf_cpu_to_be64(x) __builtin_bswap64(x) 61 | # define __bpf_constant_be64_to_cpu(x) ___bpf_swab64(x) 62 | # define __bpf_constant_cpu_to_be64(x) ___bpf_swab64(x) 63 | #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 64 | # define __bpf_ntohs(x) (x) 65 | # define __bpf_htons(x) (x) 66 | # define __bpf_constant_ntohs(x) (x) 67 | # define __bpf_constant_htons(x) (x) 68 | # define __bpf_ntohl(x) (x) 69 | # define __bpf_htonl(x) (x) 70 | # define __bpf_constant_ntohl(x) (x) 71 | # define __bpf_constant_htonl(x) (x) 72 | # define __bpf_be64_to_cpu(x) (x) 73 | # define __bpf_cpu_to_be64(x) (x) 74 | # define __bpf_constant_be64_to_cpu(x) (x) 75 | # define __bpf_constant_cpu_to_be64(x) (x) 76 | #else 77 | # error "Fix your compiler's __BYTE_ORDER__?!" 78 | #endif 79 | 80 | #define bpf_htons(x) \ 81 | (__builtin_constant_p(x) ? \ 82 | __bpf_constant_htons(x) : __bpf_htons(x)) 83 | #define bpf_ntohs(x) \ 84 | (__builtin_constant_p(x) ? \ 85 | __bpf_constant_ntohs(x) : __bpf_ntohs(x)) 86 | #define bpf_htonl(x) \ 87 | (__builtin_constant_p(x) ? \ 88 | __bpf_constant_htonl(x) : __bpf_htonl(x)) 89 | #define bpf_ntohl(x) \ 90 | (__builtin_constant_p(x) ? \ 91 | __bpf_constant_ntohl(x) : __bpf_ntohl(x)) 92 | #define bpf_cpu_to_be64(x) \ 93 | (__builtin_constant_p(x) ? \ 94 | __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x)) 95 | #define bpf_be64_to_cpu(x) \ 96 | (__builtin_constant_p(x) ? \ 97 | __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x)) 98 | 99 | #endif /* __BPF_ENDIAN__ */ 100 | -------------------------------------------------------------------------------- /include/bpf/bpf_helper_defs.h: -------------------------------------------------------------------------------- 1 | /* This is auto-generated file. See bpf_helpers_doc.py for details. */ 2 | 3 | /* Forward declarations of BPF structs */ 4 | struct bpf_fib_lookup; 5 | struct bpf_sk_lookup; 6 | struct bpf_perf_event_data; 7 | struct bpf_perf_event_value; 8 | struct bpf_pidns_info; 9 | struct bpf_sock; 10 | struct bpf_sock_addr; 11 | struct bpf_sock_ops; 12 | struct bpf_sock_tuple; 13 | struct bpf_spin_lock; 14 | struct bpf_sysctl; 15 | struct bpf_tcp_sock; 16 | struct bpf_tunnel_key; 17 | struct bpf_xfrm_state; 18 | struct pt_regs; 19 | struct sk_reuseport_md; 20 | struct sockaddr; 21 | struct tcphdr; 22 | struct seq_file; 23 | struct tcp6_sock; 24 | struct tcp_sock; 25 | struct tcp_timewait_sock; 26 | struct tcp_request_sock; 27 | struct udp6_sock; 28 | struct task_struct; 29 | struct __sk_buff; 30 | struct sk_msg_md; 31 | struct xdp_md; 32 | 33 | /* 34 | * bpf_map_lookup_elem 35 | * 36 | * Perform a lookup in *map* for an entry associated to *key*. 37 | * 38 | * Returns 39 | * Map value associated to *key*, or **NULL** if no entry was 40 | * found. 41 | */ 42 | static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; 43 | 44 | /* 45 | * bpf_map_update_elem 46 | * 47 | * Add or update the value of the entry associated to *key* in 48 | * *map* with *value*. 
*flags* is one of: 49 | * 50 | * **BPF_NOEXIST** 51 | * The entry for *key* must not exist in the map. 52 | * **BPF_EXIST** 53 | * The entry for *key* must already exist in the map. 54 | * **BPF_ANY** 55 | * No condition on the existence of the entry for *key*. 56 | * 57 | * Flag value **BPF_NOEXIST** cannot be used for maps of types 58 | * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all 59 | * elements always exist), the helper would return an error. 60 | * 61 | * Returns 62 | * 0 on success, or a negative error in case of failure. 63 | */ 64 | static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; 65 | 66 | /* 67 | * bpf_map_delete_elem 68 | * 69 | * Delete entry with *key* from *map*. 70 | * 71 | * Returns 72 | * 0 on success, or a negative error in case of failure. 73 | */ 74 | static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; 75 | 76 | /* 77 | * bpf_probe_read 78 | * 79 | * For tracing programs, safely attempt to read *size* bytes from 80 | * kernel space address *unsafe_ptr* and store the data in *dst*. 81 | * 82 | * Generally, use **bpf_probe_read_user**\ () or 83 | * **bpf_probe_read_kernel**\ () instead. 84 | * 85 | * Returns 86 | * 0 on success, or a negative error in case of failure. 87 | */ 88 | static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; 89 | 90 | /* 91 | * bpf_ktime_get_ns 92 | * 93 | * Return the time elapsed since system boot, in nanoseconds. 94 | * Does not include time the system was suspended. 95 | * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) 96 | * 97 | * Returns 98 | * Current *ktime*. 99 | */ 100 | static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; 101 | 102 | /* 103 | * bpf_trace_printk 104 | * 105 | * This helper is a "printk()-like" facility for debugging. It 106 | * prints a message defined by format *fmt* (of size *fmt_size*) 107 | * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if 108 | * available. It can take up to three additional **u64** 109 | * arguments (as an eBPF helpers, the total number of arguments is 110 | * limited to five). 111 | * 112 | * Each time the helper is called, it appends a line to the trace. 113 | * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is 114 | * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. 115 | * The format of the trace is customizable, and the exact output 116 | * one will get depends on the options set in 117 | * *\/sys/kernel/debug/tracing/trace_options* (see also the 118 | * *README* file under the same directory). However, it usually 119 | * defaults to something like: 120 | * 121 | * :: 122 | * 123 | * telnet-470 [001] .N.. 419421.045894: 0x00000001: 124 | * 125 | * In the above: 126 | * 127 | * * ``telnet`` is the name of the current task. 128 | * * ``470`` is the PID of the current task. 129 | * * ``001`` is the CPU number on which the task is 130 | * running. 131 | * * In ``.N..``, each character refers to a set of 132 | * options (whether irqs are enabled, scheduling 133 | * options, whether hard/softirqs are running, level of 134 | * preempt_disabled respectively). **N** means that 135 | * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** 136 | * are set. 137 | * * ``419421.045894`` is a timestamp. 138 | * * ``0x00000001`` is a fake value used by BPF for the 139 | * instruction pointer register. 140 | * * ```` is the message formatted with 141 | * *fmt*. 
142 | * 143 | * The conversion specifiers supported by *fmt* are similar, but 144 | * more limited than for printk(). They are **%d**, **%i**, 145 | * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, 146 | * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size 147 | * of field, padding with zeroes, etc.) is available, and the 148 | * helper will return **-EINVAL** (but print nothing) if it 149 | * encounters an unknown specifier. 150 | * 151 | * Also, note that **bpf_trace_printk**\ () is slow, and should 152 | * only be used for debugging purposes. For this reason, a notice 153 | * bloc (spanning several lines) is printed to kernel logs and 154 | * states that the helper should not be used "for production use" 155 | * the first time this helper is used (or more precisely, when 156 | * **trace_printk**\ () buffers are allocated). For passing values 157 | * to user space, perf events should be preferred. 158 | * 159 | * Returns 160 | * The number of bytes written to the buffer, or a negative error 161 | * in case of failure. 162 | */ 163 | static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; 164 | 165 | /* 166 | * bpf_get_prandom_u32 167 | * 168 | * Get a pseudo-random number. 169 | * 170 | * From a security point of view, this helper uses its own 171 | * pseudo-random internal state, and cannot be used to infer the 172 | * seed of other random functions in the kernel. However, it is 173 | * essential to note that the generator used by the helper is not 174 | * cryptographically secure. 175 | * 176 | * Returns 177 | * A random 32-bit unsigned value. 178 | */ 179 | static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; 180 | 181 | /* 182 | * bpf_get_smp_processor_id 183 | * 184 | * Get the SMP (symmetric multiprocessing) processor id. Note that 185 | * all programs run with preemption disabled, which means that the 186 | * SMP processor id is stable during all the execution of the 187 | * program. 188 | * 189 | * Returns 190 | * The SMP id of the processor running the program. 191 | */ 192 | static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; 193 | 194 | /* 195 | * bpf_skb_store_bytes 196 | * 197 | * Store *len* bytes from address *from* into the packet 198 | * associated to *skb*, at *offset*. *flags* are a combination of 199 | * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the 200 | * checksum for the packet after storing the bytes) and 201 | * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ 202 | * **->swhash** and *skb*\ **->l4hash** to 0). 203 | * 204 | * A call to this helper is susceptible to change the underlying 205 | * packet buffer. Therefore, at load time, all checks on pointers 206 | * previously done by the verifier are invalidated and must be 207 | * performed again, if the helper is used in combination with 208 | * direct packet access. 209 | * 210 | * Returns 211 | * 0 on success, or a negative error in case of failure. 212 | */ 213 | static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; 214 | 215 | /* 216 | * bpf_l3_csum_replace 217 | * 218 | * Recompute the layer 3 (e.g. IP) checksum for the packet 219 | * associated to *skb*. Computation is incremental, so the helper 220 | * must know the former value of the header field that was 221 | * modified (*from*), the new value of this field (*to*), and the 222 | * number of bytes (2 or 4) for this field, stored in *size*. 
223 | * Alternatively, it is possible to store the difference between 224 | * the previous and the new values of the header field in *to*, by 225 | * setting *from* and *size* to 0. For both methods, *offset* 226 | * indicates the location of the IP checksum within the packet. 227 | * 228 | * This helper works in combination with **bpf_csum_diff**\ (), 229 | * which does not update the checksum in-place, but offers more 230 | * flexibility and can handle sizes larger than 2 or 4 for the 231 | * checksum to update. 232 | * 233 | * A call to this helper is susceptible to change the underlying 234 | * packet buffer. Therefore, at load time, all checks on pointers 235 | * previously done by the verifier are invalidated and must be 236 | * performed again, if the helper is used in combination with 237 | * direct packet access. 238 | * 239 | * Returns 240 | * 0 on success, or a negative error in case of failure. 241 | */ 242 | static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; 243 | 244 | /* 245 | * bpf_l4_csum_replace 246 | * 247 | * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the 248 | * packet associated to *skb*. Computation is incremental, so the 249 | * helper must know the former value of the header field that was 250 | * modified (*from*), the new value of this field (*to*), and the 251 | * number of bytes (2 or 4) for this field, stored on the lowest 252 | * four bits of *flags*. Alternatively, it is possible to store 253 | * the difference between the previous and the new values of the 254 | * header field in *to*, by setting *from* and the four lowest 255 | * bits of *flags* to 0. For both methods, *offset* indicates the 256 | * location of the IP checksum within the packet. In addition to 257 | * the size of the field, *flags* can be added (bitwise OR) actual 258 | * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left 259 | * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and 260 | * for updates resulting in a null checksum the value is set to 261 | * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates 262 | * the checksum is to be computed against a pseudo-header. 263 | * 264 | * This helper works in combination with **bpf_csum_diff**\ (), 265 | * which does not update the checksum in-place, but offers more 266 | * flexibility and can handle sizes larger than 2 or 4 for the 267 | * checksum to update. 268 | * 269 | * A call to this helper is susceptible to change the underlying 270 | * packet buffer. Therefore, at load time, all checks on pointers 271 | * previously done by the verifier are invalidated and must be 272 | * performed again, if the helper is used in combination with 273 | * direct packet access. 274 | * 275 | * Returns 276 | * 0 on success, or a negative error in case of failure. 277 | */ 278 | static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; 279 | 280 | /* 281 | * bpf_tail_call 282 | * 283 | * This special helper is used to trigger a "tail call", or in 284 | * other words, to jump into another eBPF program. The same stack 285 | * frame is used (but values on stack and in registers for the 286 | * caller are not accessible to the callee). This mechanism allows 287 | * for program chaining, either for raising the maximum number of 288 | * available eBPF instructions, or to execute given programs in 289 | * conditional blocks. 
For security reasons, there is an upper 290 | * limit to the number of successive tail calls that can be 291 | * performed. 292 | * 293 | * Upon call of this helper, the program attempts to jump into a 294 | * program referenced at index *index* in *prog_array_map*, a 295 | * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes 296 | * *ctx*, a pointer to the context. 297 | * 298 | * If the call succeeds, the kernel immediately runs the first 299 | * instruction of the new program. This is not a function call, 300 | * and it never returns to the previous program. If the call 301 | * fails, then the helper has no effect, and the caller continues 302 | * to run its subsequent instructions. A call can fail if the 303 | * destination program for the jump does not exist (i.e. *index* 304 | * is superior to the number of entries in *prog_array_map*), or 305 | * if the maximum number of tail calls has been reached for this 306 | * chain of programs. This limit is defined in the kernel by the 307 | * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), 308 | * which is currently set to 32. 309 | * 310 | * Returns 311 | * 0 on success, or a negative error in case of failure. 312 | */ 313 | static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; 314 | 315 | /* 316 | * bpf_clone_redirect 317 | * 318 | * Clone and redirect the packet associated to *skb* to another 319 | * net device of index *ifindex*. Both ingress and egress 320 | * interfaces can be used for redirection. The **BPF_F_INGRESS** 321 | * value in *flags* is used to make the distinction (ingress path 322 | * is selected if the flag is present, egress path otherwise). 323 | * This is the only flag supported for now. 324 | * 325 | * In comparison with **bpf_redirect**\ () helper, 326 | * **bpf_clone_redirect**\ () has the associated cost of 327 | * duplicating the packet buffer, but this can be executed out of 328 | * the eBPF program. Conversely, **bpf_redirect**\ () is more 329 | * efficient, but it is handled through an action code where the 330 | * redirection happens only after the eBPF program has returned. 331 | * 332 | * A call to this helper is susceptible to change the underlying 333 | * packet buffer. Therefore, at load time, all checks on pointers 334 | * previously done by the verifier are invalidated and must be 335 | * performed again, if the helper is used in combination with 336 | * direct packet access. 337 | * 338 | * Returns 339 | * 0 on success, or a negative error in case of failure. 340 | */ 341 | static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; 342 | 343 | /* 344 | * bpf_get_current_pid_tgid 345 | * 346 | * 347 | * Returns 348 | * A 64-bit integer containing the current tgid and pid, and 349 | * created as such: 350 | * *current_task*\ **->tgid << 32 \|** 351 | * *current_task*\ **->pid**. 352 | */ 353 | static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; 354 | 355 | /* 356 | * bpf_get_current_uid_gid 357 | * 358 | * 359 | * Returns 360 | * A 64-bit integer containing the current GID and UID, and 361 | * created as such: *current_gid* **<< 32 \|** *current_uid*. 362 | */ 363 | static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; 364 | 365 | /* 366 | * bpf_get_current_comm 367 | * 368 | * Copy the **comm** attribute of the current task into *buf* of 369 | * *size_of_buf*. The **comm** attribute contains the name of 370 | * the executable (excluding the path) for the current task. 
The 371 | * *size_of_buf* must be strictly positive. On success, the 372 | * helper makes sure that the *buf* is NUL-terminated. On failure, 373 | * it is filled with zeroes. 374 | * 375 | * Returns 376 | * 0 on success, or a negative error in case of failure. 377 | */ 378 | static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; 379 | 380 | /* 381 | * bpf_get_cgroup_classid 382 | * 383 | * Retrieve the classid for the current task, i.e. for the net_cls 384 | * cgroup to which *skb* belongs. 385 | * 386 | * This helper can be used on TC egress path, but not on ingress. 387 | * 388 | * The net_cls cgroup provides an interface to tag network packets 389 | * based on a user-provided identifier for all traffic coming from 390 | * the tasks belonging to the related cgroup. See also the related 391 | * kernel documentation, available from the Linux sources in file 392 | * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. 393 | * 394 | * The Linux kernel has two versions for cgroups: there are 395 | * cgroups v1 and cgroups v2. Both are available to users, who can 396 | * use a mixture of them, but note that the net_cls cgroup is for 397 | * cgroup v1 only. This makes it incompatible with BPF programs 398 | * run on cgroups, which is a cgroup-v2-only feature (a socket can 399 | * only hold data for one version of cgroups at a time). 400 | * 401 | * This helper is only available is the kernel was compiled with 402 | * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to 403 | * "**y**" or to "**m**". 404 | * 405 | * Returns 406 | * The classid, or 0 for the default unconfigured classid. 407 | */ 408 | static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; 409 | 410 | /* 411 | * bpf_skb_vlan_push 412 | * 413 | * Push a *vlan_tci* (VLAN tag control information) of protocol 414 | * *vlan_proto* to the packet associated to *skb*, then update 415 | * the checksum. Note that if *vlan_proto* is different from 416 | * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to 417 | * be **ETH_P_8021Q**. 418 | * 419 | * A call to this helper is susceptible to change the underlying 420 | * packet buffer. Therefore, at load time, all checks on pointers 421 | * previously done by the verifier are invalidated and must be 422 | * performed again, if the helper is used in combination with 423 | * direct packet access. 424 | * 425 | * Returns 426 | * 0 on success, or a negative error in case of failure. 427 | */ 428 | static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; 429 | 430 | /* 431 | * bpf_skb_vlan_pop 432 | * 433 | * Pop a VLAN header from the packet associated to *skb*. 434 | * 435 | * A call to this helper is susceptible to change the underlying 436 | * packet buffer. Therefore, at load time, all checks on pointers 437 | * previously done by the verifier are invalidated and must be 438 | * performed again, if the helper is used in combination with 439 | * direct packet access. 440 | * 441 | * Returns 442 | * 0 on success, or a negative error in case of failure. 443 | */ 444 | static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; 445 | 446 | /* 447 | * bpf_skb_get_tunnel_key 448 | * 449 | * Get tunnel metadata. This helper takes a pointer *key* to an 450 | * empty **struct bpf_tunnel_key** of **size**, that will be 451 | * filled with tunnel metadata for the packet associated to *skb*. 
452 | * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which 453 | * indicates that the tunnel is based on IPv6 protocol instead of 454 | * IPv4. 455 | * 456 | * The **struct bpf_tunnel_key** is an object that generalizes the 457 | * principal parameters used by various tunneling protocols into a 458 | * single struct. This way, it can be used to easily make a 459 | * decision based on the contents of the encapsulation header, 460 | * "summarized" in this struct. In particular, it holds the IP 461 | * address of the remote end (IPv4 or IPv6, depending on the case) 462 | * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, 463 | * this struct exposes the *key*\ **->tunnel_id**, which is 464 | * generally mapped to a VNI (Virtual Network Identifier), making 465 | * it programmable together with the **bpf_skb_set_tunnel_key**\ 466 | * () helper. 467 | * 468 | * Let's imagine that the following code is part of a program 469 | * attached to the TC ingress interface, on one end of a GRE 470 | * tunnel, and is supposed to filter out all messages coming from 471 | * remote ends with IPv4 address other than 10.0.0.1: 472 | * 473 | * :: 474 | * 475 | * int ret; 476 | * struct bpf_tunnel_key key = {}; 477 | * 478 | * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); 479 | * if (ret < 0) 480 | * return TC_ACT_SHOT; // drop packet 481 | * 482 | * if (key.remote_ipv4 != 0x0a000001) 483 | * return TC_ACT_SHOT; // drop packet 484 | * 485 | * return TC_ACT_OK; // accept packet 486 | * 487 | * This interface can also be used with all encapsulation devices 488 | * that can operate in "collect metadata" mode: instead of having 489 | * one network device per specific configuration, the "collect 490 | * metadata" mode only requires a single device where the 491 | * configuration can be extracted from this helper. 492 | * 493 | * This can be used together with various tunnels such as VXLan, 494 | * Geneve, GRE or IP in IP (IPIP). 495 | * 496 | * Returns 497 | * 0 on success, or a negative error in case of failure. 498 | */ 499 | static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; 500 | 501 | /* 502 | * bpf_skb_set_tunnel_key 503 | * 504 | * Populate tunnel metadata for packet associated to *skb.* The 505 | * tunnel metadata is set to the contents of *key*, of *size*. The 506 | * *flags* can be set to a combination of the following values: 507 | * 508 | * **BPF_F_TUNINFO_IPV6** 509 | * Indicate that the tunnel is based on IPv6 protocol 510 | * instead of IPv4. 511 | * **BPF_F_ZERO_CSUM_TX** 512 | * For IPv4 packets, add a flag to tunnel metadata 513 | * indicating that checksum computation should be skipped 514 | * and checksum set to zeroes. 515 | * **BPF_F_DONT_FRAGMENT** 516 | * Add a flag to tunnel metadata indicating that the 517 | * packet should not be fragmented. 518 | * **BPF_F_SEQ_NUMBER** 519 | * Add a flag to tunnel metadata indicating that a 520 | * sequence number should be added to tunnel header before 521 | * sending the packet. This flag was added for GRE 522 | * encapsulation, but might be used with other protocols 523 | * as well in the future. 524 | * 525 | * Here is a typical usage on the transmit path: 526 | * 527 | * :: 528 | * 529 | * struct bpf_tunnel_key key; 530 | * populate key ... 
531 | * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); 532 | * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); 533 | * 534 | * See also the description of the **bpf_skb_get_tunnel_key**\ () 535 | * helper for additional information. 536 | * 537 | * Returns 538 | * 0 on success, or a negative error in case of failure. 539 | */ 540 | static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; 541 | 542 | /* 543 | * bpf_perf_event_read 544 | * 545 | * Read the value of a perf event counter. This helper relies on a 546 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of 547 | * the perf event counter is selected when *map* is updated with 548 | * perf event file descriptors. The *map* is an array whose size 549 | * is the number of available CPUs, and each cell contains a value 550 | * relative to one CPU. The value to retrieve is indicated by 551 | * *flags*, that contains the index of the CPU to look up, masked 552 | * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to 553 | * **BPF_F_CURRENT_CPU** to indicate that the value for the 554 | * current CPU should be retrieved. 555 | * 556 | * Note that before Linux 4.13, only hardware perf event can be 557 | * retrieved. 558 | * 559 | * Also, be aware that the newer helper 560 | * **bpf_perf_event_read_value**\ () is recommended over 561 | * **bpf_perf_event_read**\ () in general. The latter has some ABI 562 | * quirks where error and counter value are used as a return code 563 | * (which is wrong to do since ranges may overlap). This issue is 564 | * fixed with **bpf_perf_event_read_value**\ (), which at the same 565 | * time provides more features over the **bpf_perf_event_read**\ 566 | * () interface. Please refer to the description of 567 | * **bpf_perf_event_read_value**\ () for details. 568 | * 569 | * Returns 570 | * The value of the perf event counter read from the map, or a 571 | * negative error code in case of failure. 572 | */ 573 | static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; 574 | 575 | /* 576 | * bpf_redirect 577 | * 578 | * Redirect the packet to another net device of index *ifindex*. 579 | * This helper is somewhat similar to **bpf_clone_redirect**\ 580 | * (), except that the packet is not cloned, which provides 581 | * increased performance. 582 | * 583 | * Except for XDP, both ingress and egress interfaces can be used 584 | * for redirection. The **BPF_F_INGRESS** value in *flags* is used 585 | * to make the distinction (ingress path is selected if the flag 586 | * is present, egress path otherwise). Currently, XDP only 587 | * supports redirection to the egress interface, and accepts no 588 | * flag at all. 589 | * 590 | * The same effect can also be attained with the more generic 591 | * **bpf_redirect_map**\ (), which uses a BPF map to store the 592 | * redirect target instead of providing it directly to the helper. 593 | * 594 | * Returns 595 | * For XDP, the helper returns **XDP_REDIRECT** on success or 596 | * **XDP_ABORTED** on error. For other program types, the values 597 | * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on 598 | * error. 599 | */ 600 | static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; 601 | 602 | /* 603 | * bpf_get_route_realm 604 | * 605 | * Retrieve the realm or the route, that is to say the 606 | * **tclassid** field of the destination for the *skb*. 
The 607 | * indentifier retrieved is a user-provided tag, similar to the 608 | * one used with the net_cls cgroup (see description for 609 | * **bpf_get_cgroup_classid**\ () helper), but here this tag is 610 | * held by a route (a destination entry), not by a task. 611 | * 612 | * Retrieving this identifier works with the clsact TC egress hook 613 | * (see also **tc-bpf(8)**), or alternatively on conventional 614 | * classful egress qdiscs, but not on TC ingress path. In case of 615 | * clsact TC egress hook, this has the advantage that, internally, 616 | * the destination entry has not been dropped yet in the transmit 617 | * path. Therefore, the destination entry does not need to be 618 | * artificially held via **netif_keep_dst**\ () for a classful 619 | * qdisc until the *skb* is freed. 620 | * 621 | * This helper is available only if the kernel was compiled with 622 | * **CONFIG_IP_ROUTE_CLASSID** configuration option. 623 | * 624 | * Returns 625 | * The realm of the route for the packet associated to *skb*, or 0 626 | * if none was found. 627 | */ 628 | static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; 629 | 630 | /* 631 | * bpf_perf_event_output 632 | * 633 | * Write raw *data* blob into a special BPF perf event held by 634 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 635 | * event must have the following attributes: **PERF_SAMPLE_RAW** 636 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 637 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 638 | * 639 | * The *flags* are used to indicate the index in *map* for which 640 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 641 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 642 | * to indicate that the index of the current CPU core should be 643 | * used. 644 | * 645 | * The value to write, of *size*, is passed through eBPF stack and 646 | * pointed by *data*. 647 | * 648 | * The context of the program *ctx* needs also be passed to the 649 | * helper. 650 | * 651 | * On user space, a program willing to read the values needs to 652 | * call **perf_event_open**\ () on the perf event (either for 653 | * one or for all CPUs) and to store the file descriptor into the 654 | * *map*. This must be done before the eBPF program can send data 655 | * into it. An example is available in file 656 | * *samples/bpf/trace_output_user.c* in the Linux kernel source 657 | * tree (the eBPF program counterpart is in 658 | * *samples/bpf/trace_output_kern.c*). 659 | * 660 | * **bpf_perf_event_output**\ () achieves better performance 661 | * than **bpf_trace_printk**\ () for sharing data with user 662 | * space, and is much better suitable for streaming data from eBPF 663 | * programs. 664 | * 665 | * Note that this helper is not restricted to tracing use cases 666 | * and can be used with programs attached to TC or XDP as well, 667 | * where it allows for passing data to user space listeners. Data 668 | * can be: 669 | * 670 | * * Only custom structs, 671 | * * Only the packet payload, or 672 | * * A combination of both. 673 | * 674 | * Returns 675 | * 0 on success, or a negative error in case of failure. 676 | */ 677 | static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; 678 | 679 | /* 680 | * bpf_skb_load_bytes 681 | * 682 | * This helper was provided as an easy way to load data from a 683 | * packet. 
It can be used to load *len* bytes from *offset* from 684 | * the packet associated to *skb*, into the buffer pointed by 685 | * *to*. 686 | * 687 | * Since Linux 4.7, usage of this helper has mostly been replaced 688 | * by "direct packet access", enabling packet data to be 689 | * manipulated with *skb*\ **->data** and *skb*\ **->data_end** 690 | * pointing respectively to the first byte of packet data and to 691 | * the byte after the last byte of packet data. However, it 692 | * remains useful if one wishes to read large quantities of data 693 | * at once from a packet into the eBPF stack. 694 | * 695 | * Returns 696 | * 0 on success, or a negative error in case of failure. 697 | */ 698 | static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; 699 | 700 | /* 701 | * bpf_get_stackid 702 | * 703 | * Walk a user or a kernel stack and return its id. To achieve 704 | * this, the helper needs *ctx*, which is a pointer to the context 705 | * on which the tracing program is executed, and a pointer to a 706 | * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. 707 | * 708 | * The last argument, *flags*, holds the number of stack frames to 709 | * skip (from 0 to 255), masked with 710 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 711 | * a combination of the following flags: 712 | * 713 | * **BPF_F_USER_STACK** 714 | * Collect a user space stack instead of a kernel stack. 715 | * **BPF_F_FAST_STACK_CMP** 716 | * Compare stacks by hash only. 717 | * **BPF_F_REUSE_STACKID** 718 | * If two different stacks hash into the same *stackid*, 719 | * discard the old one. 720 | * 721 | * The stack id retrieved is a 32 bit long integer handle which 722 | * can be further combined with other data (including other stack 723 | * ids) and used as a key into maps. This can be useful for 724 | * generating a variety of graphs (such as flame graphs or off-cpu 725 | * graphs). 726 | * 727 | * For walking a stack, this helper is an improvement over 728 | * **bpf_probe_read**\ (), which can be used with unrolled loops 729 | * but is not efficient and consumes a lot of eBPF instructions. 730 | * Instead, **bpf_get_stackid**\ () can collect up to 731 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that 732 | * this limit can be controlled with the **sysctl** program, and 733 | * that it should be manually increased in order to profile long 734 | * user stacks (such as stacks for Java programs). To do so, use: 735 | * 736 | * :: 737 | * 738 | * # sysctl kernel.perf_event_max_stack= 739 | * 740 | * Returns 741 | * The positive or null stack id on success, or a negative error 742 | * in case of failure. 743 | */ 744 | static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; 745 | 746 | /* 747 | * bpf_csum_diff 748 | * 749 | * Compute a checksum difference, from the raw buffer pointed by 750 | * *from*, of length *from_size* (that must be a multiple of 4), 751 | * towards the raw buffer pointed by *to*, of size *to_size* 752 | * (same remark). An optional *seed* can be added to the value 753 | * (this can be cascaded, the seed may come from a previous call 754 | * to the helper). 755 | * 756 | * This is flexible enough to be used in several ways: 757 | * 758 | * * With *from_size* == 0, *to_size* > 0 and *seed* set to 759 | * checksum, it can be used when pushing new data. 760 | * * With *from_size* > 0, *to_size* == 0 and *seed* set to 761 | * checksum, it can be used when removing data from a packet. 
762 | * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it 763 | * can be used to compute a diff. Note that *from_size* and 764 | * *to_size* do not need to be equal. 765 | * 766 | * This helper can be used in combination with 767 | * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to 768 | * which one can feed in the difference computed with 769 | * **bpf_csum_diff**\ (). 770 | * 771 | * Returns 772 | * The checksum result, or a negative error code in case of 773 | * failure. 774 | */ 775 | static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; 776 | 777 | /* 778 | * bpf_skb_get_tunnel_opt 779 | * 780 | * Retrieve tunnel options metadata for the packet associated to 781 | * *skb*, and store the raw tunnel option data to the buffer *opt* 782 | * of *size*. 783 | * 784 | * This helper can be used with encapsulation devices that can 785 | * operate in "collect metadata" mode (please refer to the related 786 | * note in the description of **bpf_skb_get_tunnel_key**\ () for 787 | * more details). A particular example where this can be used is 788 | * in combination with the Geneve encapsulation protocol, where it 789 | * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) 790 | * and retrieving arbitrary TLVs (Type-Length-Value headers) from 791 | * the eBPF program. This allows for full customization of these 792 | * headers. 793 | * 794 | * Returns 795 | * The size of the option data retrieved. 796 | */ 797 | static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; 798 | 799 | /* 800 | * bpf_skb_set_tunnel_opt 801 | * 802 | * Set tunnel options metadata for the packet associated to *skb* 803 | * to the option data contained in the raw buffer *opt* of *size*. 804 | * 805 | * See also the description of the **bpf_skb_get_tunnel_opt**\ () 806 | * helper for additional information. 807 | * 808 | * Returns 809 | * 0 on success, or a negative error in case of failure. 810 | */ 811 | static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; 812 | 813 | /* 814 | * bpf_skb_change_proto 815 | * 816 | * Change the protocol of the *skb* to *proto*. Currently 817 | * supported are transition from IPv4 to IPv6, and from IPv6 to 818 | * IPv4. The helper takes care of the groundwork for the 819 | * transition, including resizing the socket buffer. The eBPF 820 | * program is expected to fill the new headers, if any, via 821 | * **skb_store_bytes**\ () and to recompute the checksums with 822 | * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ 823 | * (). The main case for this helper is to perform NAT64 824 | * operations out of an eBPF program. 825 | * 826 | * Internally, the GSO type is marked as dodgy so that headers are 827 | * checked and segments are recalculated by the GSO/GRO engine. 828 | * The size for GSO target is adapted as well. 829 | * 830 | * All values for *flags* are reserved for future usage, and must 831 | * be left at zero. 832 | * 833 | * A call to this helper is susceptible to change the underlying 834 | * packet buffer. Therefore, at load time, all checks on pointers 835 | * previously done by the verifier are invalidated and must be 836 | * performed again, if the helper is used in combination with 837 | * direct packet access. 838 | * 839 | * Returns 840 | * 0 on success, or a negative error in case of failure. 
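The "feed the difference into bpf_l3_csum_replace()/bpf_l4_csum_replace()" pattern described for bpf_csum_diff() above is easy to get wrong, so here is a minimal, hedged sketch rather than anything taken from the kernel documentation. It assumes an skb-based program (e.g. a TC classifier), a plain Ethernet + 20-byte IPv4 + UDP layout, and that BPF_F_PSEUDO_HDR from the vendored linux/bpf.h is in scope; old_ip/new_ip stand for the address before and after a rewrite.

```c
/* Illustrative sketch: fold the checksum delta of an IPv4 address rewrite
 * into the UDP checksum. Offsets are illustrative (Ethernet + 20-byte IPv4,
 * UDP checksum at byte 6 of the UDP header). */
static inline int fix_udp_csum(struct __sk_buff *skb, __be32 old_ip, __be32 new_ip)
{
	const __u32 udp_csum_off = 14 + 20 + 6;
	__s64 diff = bpf_csum_diff(&old_ip, sizeof(old_ip), &new_ip, sizeof(new_ip), 0);

	if (diff < 0)
		return diff;
	/* with from == 0 and no size bits set in flags, 'to' is taken as a
	 * ready-made checksum difference; BPF_F_PSEUDO_HDR marks that the
	 * change also affects the UDP pseudo-header */
	return bpf_l4_csum_replace(skb, udp_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
}
```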
841 | */ 842 | static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; 843 | 844 | /* 845 | * bpf_skb_change_type 846 | * 847 | * Change the packet type for the packet associated to *skb*. This 848 | * comes down to setting *skb*\ **->pkt_type** to *type*, except 849 | * the eBPF program does not have a write access to *skb*\ 850 | * **->pkt_type** beside this helper. Using a helper here allows 851 | * for graceful handling of errors. 852 | * 853 | * The major use case is to change incoming *skb*s to 854 | * **PACKET_HOST** in a programmatic way instead of having to 855 | * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for 856 | * example. 857 | * 858 | * Note that *type* only allows certain values. At this time, they 859 | * are: 860 | * 861 | * **PACKET_HOST** 862 | * Packet is for us. 863 | * **PACKET_BROADCAST** 864 | * Send packet to all. 865 | * **PACKET_MULTICAST** 866 | * Send packet to group. 867 | * **PACKET_OTHERHOST** 868 | * Send packet to someone else. 869 | * 870 | * Returns 871 | * 0 on success, or a negative error in case of failure. 872 | */ 873 | static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; 874 | 875 | /* 876 | * bpf_skb_under_cgroup 877 | * 878 | * Check whether *skb* is a descendant of the cgroup2 held by 879 | * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. 880 | * 881 | * Returns 882 | * The return value depends on the result of the test, and can be: 883 | * 884 | * * 0, if the *skb* failed the cgroup2 descendant test. 885 | * * 1, if the *skb* succeeded the cgroup2 descendant test. 886 | * * A negative error code, if an error occurred. 887 | */ 888 | static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; 889 | 890 | /* 891 | * bpf_get_hash_recalc 892 | * 893 | * Retrieve the hash of the packet, *skb*\ **->hash**. If it is 894 | * not set, in particular if the hash was cleared due to mangling, 895 | * recompute this hash. Later accesses to the hash can be done 896 | * directly with *skb*\ **->hash**. 897 | * 898 | * Calling **bpf_set_hash_invalid**\ (), changing a packet 899 | * prototype with **bpf_skb_change_proto**\ (), or calling 900 | * **bpf_skb_store_bytes**\ () with the 901 | * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear 902 | * the hash and to trigger a new computation for the next call to 903 | * **bpf_get_hash_recalc**\ (). 904 | * 905 | * Returns 906 | * The 32-bit hash. 907 | */ 908 | static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; 909 | 910 | /* 911 | * bpf_get_current_task 912 | * 913 | * 914 | * Returns 915 | * A pointer to the current task struct. 916 | */ 917 | static __u64 (*bpf_get_current_task)(void) = (void *) 35; 918 | 919 | /* 920 | * bpf_probe_write_user 921 | * 922 | * Attempt in a safe way to write *len* bytes from the buffer 923 | * *src* to *dst* in memory. It only works for threads that are in 924 | * user context, and *dst* must be a valid user space address. 925 | * 926 | * This helper should not be used to implement any kind of 927 | * security mechanism because of TOC-TOU attacks, but rather to 928 | * debug, divert, and manipulate execution of semi-cooperative 929 | * processes. 930 | * 931 | * Keep in mind that this feature is meant for experiments, and it 932 | * has a risk of crashing the system and running programs. 
933 | * Therefore, when an eBPF program using this helper is attached, 934 | * a warning including PID and process name is printed to kernel 935 | * logs. 936 | * 937 | * Returns 938 | * 0 on success, or a negative error in case of failure. 939 | */ 940 | static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; 941 | 942 | /* 943 | * bpf_current_task_under_cgroup 944 | * 945 | * Check whether the probe is being run is the context of a given 946 | * subset of the cgroup2 hierarchy. The cgroup2 to test is held by 947 | * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. 948 | * 949 | * Returns 950 | * The return value depends on the result of the test, and can be: 951 | * 952 | * * 0, if the *skb* task belongs to the cgroup2. 953 | * * 1, if the *skb* task does not belong to the cgroup2. 954 | * * A negative error code, if an error occurred. 955 | */ 956 | static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; 957 | 958 | /* 959 | * bpf_skb_change_tail 960 | * 961 | * Resize (trim or grow) the packet associated to *skb* to the 962 | * new *len*. The *flags* are reserved for future usage, and must 963 | * be left at zero. 964 | * 965 | * The basic idea is that the helper performs the needed work to 966 | * change the size of the packet, then the eBPF program rewrites 967 | * the rest via helpers like **bpf_skb_store_bytes**\ (), 968 | * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () 969 | * and others. This helper is a slow path utility intended for 970 | * replies with control messages. And because it is targeted for 971 | * slow path, the helper itself can afford to be slow: it 972 | * implicitly linearizes, unclones and drops offloads from the 973 | * *skb*. 974 | * 975 | * A call to this helper is susceptible to change the underlying 976 | * packet buffer. Therefore, at load time, all checks on pointers 977 | * previously done by the verifier are invalidated and must be 978 | * performed again, if the helper is used in combination with 979 | * direct packet access. 980 | * 981 | * Returns 982 | * 0 on success, or a negative error in case of failure. 983 | */ 984 | static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; 985 | 986 | /* 987 | * bpf_skb_pull_data 988 | * 989 | * Pull in non-linear data in case the *skb* is non-linear and not 990 | * all of *len* are part of the linear section. Make *len* bytes 991 | * from *skb* readable and writable. If a zero value is passed for 992 | * *len*, then the whole length of the *skb* is pulled. 993 | * 994 | * This helper is only needed for reading and writing with direct 995 | * packet access. 996 | * 997 | * For direct packet access, testing that offsets to access 998 | * are within packet boundaries (test on *skb*\ **->data_end**) is 999 | * susceptible to fail if offsets are invalid, or if the requested 1000 | * data is in non-linear parts of the *skb*. On failure the 1001 | * program can just bail out, or in the case of a non-linear 1002 | * buffer, use a helper to make the data available. The 1003 | * **bpf_skb_load_bytes**\ () helper is a first solution to access 1004 | * the data. Another one consists in using **bpf_skb_pull_data** 1005 | * to pull in once the non-linear parts, then retesting and 1006 | * eventually access the data. 1007 | * 1008 | * At the same time, this also makes sure the *skb* is uncloned, 1009 | * which is a necessary condition for direct write. 
As this needs 1010 | * to be an invariant for the write part only, the verifier 1011 | * detects writes and adds a prologue that is calling 1012 | * **bpf_skb_pull_data()** to effectively unclone the *skb* from 1013 | * the very beginning in case it is indeed cloned. 1014 | * 1015 | * A call to this helper is susceptible to change the underlying 1016 | * packet buffer. Therefore, at load time, all checks on pointers 1017 | * previously done by the verifier are invalidated and must be 1018 | * performed again, if the helper is used in combination with 1019 | * direct packet access. 1020 | * 1021 | * Returns 1022 | * 0 on success, or a negative error in case of failure. 1023 | */ 1024 | static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; 1025 | 1026 | /* 1027 | * bpf_csum_update 1028 | * 1029 | * Add the checksum *csum* into *skb*\ **->csum** in case the 1030 | * driver has supplied a checksum for the entire packet into that 1031 | * field. Return an error otherwise. This helper is intended to be 1032 | * used in combination with **bpf_csum_diff**\ (), in particular 1033 | * when the checksum needs to be updated after data has been 1034 | * written into the packet through direct packet access. 1035 | * 1036 | * Returns 1037 | * The checksum on success, or a negative error code in case of 1038 | * failure. 1039 | */ 1040 | static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; 1041 | 1042 | /* 1043 | * bpf_set_hash_invalid 1044 | * 1045 | * Invalidate the current *skb*\ **->hash**. It can be used after 1046 | * mangling on headers through direct packet access, in order to 1047 | * indicate that the hash is outdated and to trigger a 1048 | * recalculation the next time the kernel tries to access this 1049 | * hash or when the **bpf_get_hash_recalc**\ () helper is called. 1050 | * 1051 | */ 1052 | static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; 1053 | 1054 | /* 1055 | * bpf_get_numa_node_id 1056 | * 1057 | * Return the id of the current NUMA node. The primary use case 1058 | * for this helper is the selection of sockets for the local NUMA 1059 | * node, when the program is attached to sockets using the 1060 | * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), 1061 | * but the helper is also available to other eBPF program types, 1062 | * similarly to **bpf_get_smp_processor_id**\ (). 1063 | * 1064 | * Returns 1065 | * The id of current NUMA node. 1066 | */ 1067 | static long (*bpf_get_numa_node_id)(void) = (void *) 42; 1068 | 1069 | /* 1070 | * bpf_skb_change_head 1071 | * 1072 | * Grows headroom of packet associated to *skb* and adjusts the 1073 | * offset of the MAC header accordingly, adding *len* bytes of 1074 | * space. It automatically extends and reallocates memory as 1075 | * required. 1076 | * 1077 | * This helper can be used on a layer 3 *skb* to push a MAC header 1078 | * for redirection into a layer 2 device. 1079 | * 1080 | * All values for *flags* are reserved for future usage, and must 1081 | * be left at zero. 1082 | * 1083 | * A call to this helper is susceptible to change the underlying 1084 | * packet buffer. Therefore, at load time, all checks on pointers 1085 | * previously done by the verifier are invalidated and must be 1086 | * performed again, if the helper is used in combination with 1087 | * direct packet access. 1088 | * 1089 | * Returns 1090 | * 0 on success, or a negative error in case of failure. 
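Because the pull-then-recheck sequence for bpf_skb_pull_data() described above is the step most often missed, a minimal sketch follows; it is an assumption-laden illustration (a program type with direct packet access, such as a TC classifier, and an arbitrary length of 20 bytes), not code from this repository.

```c
/* Illustrative sketch: linearise the first 20 bytes, then re-validate the
 * data/data_end window, since the helper may have changed the underlying
 * packet buffer and all prior pointer checks are invalidated. */
static inline int read_first_byte(struct __sk_buff *skb)
{
	if (bpf_skb_pull_data(skb, 20) < 0)
		return -1;

	void *data     = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	if (data + 20 > data_end)
		return -1;

	return *(__u8 *)data; /* direct packet access is now safe */
}
```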
1091 | */ 1092 | static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; 1093 | 1094 | /* 1095 | * bpf_xdp_adjust_head 1096 | * 1097 | * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that 1098 | * it is possible to use a negative value for *delta*. This helper 1099 | * can be used to prepare the packet for pushing or popping 1100 | * headers. 1101 | * 1102 | * A call to this helper is susceptible to change the underlying 1103 | * packet buffer. Therefore, at load time, all checks on pointers 1104 | * previously done by the verifier are invalidated and must be 1105 | * performed again, if the helper is used in combination with 1106 | * direct packet access. 1107 | * 1108 | * Returns 1109 | * 0 on success, or a negative error in case of failure. 1110 | */ 1111 | static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; 1112 | 1113 | /* 1114 | * bpf_probe_read_str 1115 | * 1116 | * Copy a NUL terminated string from an unsafe kernel address 1117 | * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for 1118 | * more details. 1119 | * 1120 | * Generally, use **bpf_probe_read_user_str**\ () or 1121 | * **bpf_probe_read_kernel_str**\ () instead. 1122 | * 1123 | * Returns 1124 | * On success, the strictly positive length of the string, 1125 | * including the trailing NUL character. On error, a negative 1126 | * value. 1127 | */ 1128 | static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; 1129 | 1130 | /* 1131 | * bpf_get_socket_cookie 1132 | * 1133 | * If the **struct sk_buff** pointed by *skb* has a known socket, 1134 | * retrieve the cookie (generated by the kernel) of this socket. 1135 | * If no cookie has been set yet, generate a new cookie. Once 1136 | * generated, the socket cookie remains stable for the life of the 1137 | * socket. This helper can be useful for monitoring per socket 1138 | * networking traffic statistics as it provides a global socket 1139 | * identifier that can be assumed unique. 1140 | * 1141 | * Returns 1142 | * A 8-byte long non-decreasing number on success, or 0 if the 1143 | * socket field is missing inside *skb*. 1144 | */ 1145 | static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; 1146 | 1147 | /* 1148 | * bpf_get_socket_uid 1149 | * 1150 | * 1151 | * Returns 1152 | * The owner UID of the socket associated to *skb*. If the socket 1153 | * is **NULL**, or if it is not a full socket (i.e. if it is a 1154 | * time-wait or a request socket instead), **overflowuid** value 1155 | * is returned (note that **overflowuid** might also be the actual 1156 | * UID value for the socket). 1157 | */ 1158 | static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; 1159 | 1160 | /* 1161 | * bpf_set_hash 1162 | * 1163 | * Set the full hash for *skb* (set the field *skb*\ **->hash**) 1164 | * to value *hash*. 1165 | * 1166 | * Returns 1167 | * 0 1168 | */ 1169 | static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; 1170 | 1171 | /* 1172 | * bpf_setsockopt 1173 | * 1174 | * Emulate a call to **setsockopt()** on the socket associated to 1175 | * *bpf_socket*, which must be a full socket. The *level* at 1176 | * which the option resides and the name *optname* of the option 1177 | * must be specified, see **setsockopt(2)** for more information. 1178 | * The option value of length *optlen* is pointed by *optval*. 
1179 | * 1180 | * *bpf_socket* should be one of the following: 1181 | * 1182 | * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1183 | * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1184 | * and **BPF_CGROUP_INET6_CONNECT**. 1185 | * 1186 | * This helper actually implements a subset of **setsockopt()**. 1187 | * It supports the following *level*\ s: 1188 | * 1189 | * * **SOL_SOCKET**, which supports the following *optname*\ s: 1190 | * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, 1191 | * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, 1192 | * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. 1193 | * * **IPPROTO_TCP**, which supports the following *optname*\ s: 1194 | * **TCP_CONGESTION**, **TCP_BPF_IW**, 1195 | * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, 1196 | * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, 1197 | * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. 1198 | * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1199 | * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1200 | * 1201 | * Returns 1202 | * 0 on success, or a negative error in case of failure. 1203 | */ 1204 | static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; 1205 | 1206 | /* 1207 | * bpf_skb_adjust_room 1208 | * 1209 | * Grow or shrink the room for data in the packet associated to 1210 | * *skb* by *len_diff*, and according to the selected *mode*. 1211 | * 1212 | * By default, the helper will reset any offloaded checksum 1213 | * indicator of the skb to CHECKSUM_NONE. This can be avoided 1214 | * by the following flag: 1215 | * 1216 | * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded 1217 | * checksum data of the skb to CHECKSUM_NONE. 1218 | * 1219 | * There are two supported modes at this time: 1220 | * 1221 | * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer 1222 | * (room space is added or removed below the layer 2 header). 1223 | * 1224 | * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer 1225 | * (room space is added or removed below the layer 3 header). 1226 | * 1227 | * The following flags are supported at this time: 1228 | * 1229 | * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. 1230 | * Adjusting mss in this way is not allowed for datagrams. 1231 | * 1232 | * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, 1233 | * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: 1234 | * Any new space is reserved to hold a tunnel header. 1235 | * Configure skb offsets and other fields accordingly. 1236 | * 1237 | * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, 1238 | * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: 1239 | * Use with ENCAP_L3 flags to further specify the tunnel type. 1240 | * 1241 | * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): 1242 | * Use with ENCAP_L3/L4 flags to further specify the tunnel 1243 | * type; *len* is the length of the inner MAC header. 1244 | * 1245 | * A call to this helper is susceptible to change the underlying 1246 | * packet buffer. Therefore, at load time, all checks on pointers 1247 | * previously done by the verifier are invalidated and must be 1248 | * performed again, if the helper is used in combination with 1249 | * direct packet access. 1250 | * 1251 | * Returns 1252 | * 0 on success, or a negative error in case of failure. 1253 | */ 1254 | static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; 1255 | 1256 | /* 1257 | * bpf_redirect_map 1258 | * 1259 | * Redirect the packet to the endpoint referenced by *map* at 1260 | * index *key*. 
Depending on its type, this *map* can contain 1261 | * references to net devices (for forwarding packets through other 1262 | * ports), or to CPUs (for redirecting XDP frames to another CPU; 1263 | * but this is only implemented for native XDP (with driver 1264 | * support) as of this writing). 1265 | * 1266 | * The lower two bits of *flags* are used as the return code if 1267 | * the map lookup fails. This is so that the return value can be 1268 | * one of the XDP program return codes up to **XDP_TX**, as chosen 1269 | * by the caller. Any higher bits in the *flags* argument must be 1270 | * unset. 1271 | * 1272 | * See also **bpf_redirect**\ (), which only supports redirecting 1273 | * to an ifindex, but doesn't require a map to do so. 1274 | * 1275 | * Returns 1276 | * **XDP_REDIRECT** on success, or the value of the two lower bits 1277 | * of the *flags* argument on error. 1278 | */ 1279 | static long (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51; 1280 | 1281 | /* 1282 | * bpf_sk_redirect_map 1283 | * 1284 | * Redirect the packet to the socket referenced by *map* (of type 1285 | * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and 1286 | * egress interfaces can be used for redirection. The 1287 | * **BPF_F_INGRESS** value in *flags* is used to make the 1288 | * distinction (ingress path is selected if the flag is present, 1289 | * egress path otherwise). This is the only flag supported for now. 1290 | * 1291 | * Returns 1292 | * **SK_PASS** on success, or **SK_DROP** on error. 1293 | */ 1294 | static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; 1295 | 1296 | /* 1297 | * bpf_sock_map_update 1298 | * 1299 | * Add an entry to, or update a *map* referencing sockets. The 1300 | * *skops* is used as a new value for the entry associated to 1301 | * *key*. *flags* is one of: 1302 | * 1303 | * **BPF_NOEXIST** 1304 | * The entry for *key* must not exist in the map. 1305 | * **BPF_EXIST** 1306 | * The entry for *key* must already exist in the map. 1307 | * **BPF_ANY** 1308 | * No condition on the existence of the entry for *key*. 1309 | * 1310 | * If the *map* has eBPF programs (parser and verdict), those will 1311 | * be inherited by the socket being added. If the socket is 1312 | * already attached to eBPF programs, this results in an error. 1313 | * 1314 | * Returns 1315 | * 0 on success, or a negative error in case of failure. 1316 | */ 1317 | static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; 1318 | 1319 | /* 1320 | * bpf_xdp_adjust_meta 1321 | * 1322 | * Adjust the address pointed by *xdp_md*\ **->data_meta** by 1323 | * *delta* (which can be positive or negative). Note that this 1324 | * operation modifies the address stored in *xdp_md*\ **->data**, 1325 | * so the latter must be loaded only after the helper has been 1326 | * called. 1327 | * 1328 | * The use of *xdp_md*\ **->data_meta** is optional and programs 1329 | * are not required to use it. The rationale is that when the 1330 | * packet is processed with XDP (e.g. as DoS filter), it is 1331 | * possible to push further meta data along with it before passing 1332 | * to the stack, and to give the guarantee that an ingress eBPF 1333 | * program attached as a TC classifier on the same device can pick 1334 | * this up for further post-processing. 
Since TC works with socket 1335 | * buffers, it remains possible to set from XDP the **mark** or 1336 | * **priority** pointers, or other pointers for the socket buffer. 1337 | * Having this scratch space generic and programmable allows for 1338 | * more flexibility as the user is free to store whatever meta 1339 | * data they need. 1340 | * 1341 | * A call to this helper is susceptible to change the underlying 1342 | * packet buffer. Therefore, at load time, all checks on pointers 1343 | * previously done by the verifier are invalidated and must be 1344 | * performed again, if the helper is used in combination with 1345 | * direct packet access. 1346 | * 1347 | * Returns 1348 | * 0 on success, or a negative error in case of failure. 1349 | */ 1350 | static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; 1351 | 1352 | /* 1353 | * bpf_perf_event_read_value 1354 | * 1355 | * Read the value of a perf event counter, and store it into *buf* 1356 | * of size *buf_size*. This helper relies on a *map* of type 1357 | * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event 1358 | * counter is selected when *map* is updated with perf event file 1359 | * descriptors. The *map* is an array whose size is the number of 1360 | * available CPUs, and each cell contains a value relative to one 1361 | * CPU. The value to retrieve is indicated by *flags*, that 1362 | * contains the index of the CPU to look up, masked with 1363 | * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to 1364 | * **BPF_F_CURRENT_CPU** to indicate that the value for the 1365 | * current CPU should be retrieved. 1366 | * 1367 | * This helper behaves in a way close to 1368 | * **bpf_perf_event_read**\ () helper, save that instead of 1369 | * just returning the value observed, it fills the *buf* 1370 | * structure. This allows for additional data to be retrieved: in 1371 | * particular, the enabled and running times (in *buf*\ 1372 | * **->enabled** and *buf*\ **->running**, respectively) are 1373 | * copied. In general, **bpf_perf_event_read_value**\ () is 1374 | * recommended over **bpf_perf_event_read**\ (), which has some 1375 | * ABI issues and provides fewer functionalities. 1376 | * 1377 | * These values are interesting, because hardware PMU (Performance 1378 | * Monitoring Unit) counters are limited resources. When there are 1379 | * more PMU based perf events opened than available counters, 1380 | * kernel will multiplex these events so each event gets certain 1381 | * percentage (but not all) of the PMU time. In case that 1382 | * multiplexing happens, the number of samples or counter value 1383 | * will not reflect the case compared to when no multiplexing 1384 | * occurs. This makes comparison between different runs difficult. 1385 | * Typically, the counter value should be normalized before 1386 | * comparing to other experiments. The usual normalization is done 1387 | * as follows. 1388 | * 1389 | * :: 1390 | * 1391 | * normalized_counter = counter * t_enabled / t_running 1392 | * 1393 | * Where t_enabled is the time enabled for event and t_running is 1394 | * the time running for event since last normalization. The 1395 | * enabled and running times are accumulated since the perf event 1396 | * open. To achieve scaling factor between two invocations of an 1397 | * eBPF program, users can use CPU id as the key (which is 1398 | * typical for perf array usage model) to remember the previous 1399 | * value and do the calculation inside the eBPF program. 
1400 | * 1401 | * Returns 1402 | * 0 on success, or a negative error in case of failure. 1403 | */ 1404 | static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; 1405 | 1406 | /* 1407 | * bpf_perf_prog_read_value 1408 | * 1409 | * For en eBPF program attached to a perf event, retrieve the 1410 | * value of the event counter associated to *ctx* and store it in 1411 | * the structure pointed by *buf* and of size *buf_size*. Enabled 1412 | * and running times are also stored in the structure (see 1413 | * description of helper **bpf_perf_event_read_value**\ () for 1414 | * more details). 1415 | * 1416 | * Returns 1417 | * 0 on success, or a negative error in case of failure. 1418 | */ 1419 | static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; 1420 | 1421 | /* 1422 | * bpf_getsockopt 1423 | * 1424 | * Emulate a call to **getsockopt()** on the socket associated to 1425 | * *bpf_socket*, which must be a full socket. The *level* at 1426 | * which the option resides and the name *optname* of the option 1427 | * must be specified, see **getsockopt(2)** for more information. 1428 | * The retrieved value is stored in the structure pointed by 1429 | * *opval* and of length *optlen*. 1430 | * 1431 | * *bpf_socket* should be one of the following: 1432 | * 1433 | * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 1434 | * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** 1435 | * and **BPF_CGROUP_INET6_CONNECT**. 1436 | * 1437 | * This helper actually implements a subset of **getsockopt()**. 1438 | * It supports the following *level*\ s: 1439 | * 1440 | * * **IPPROTO_TCP**, which supports *optname* 1441 | * **TCP_CONGESTION**. 1442 | * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. 1443 | * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 1444 | * 1445 | * Returns 1446 | * 0 on success, or a negative error in case of failure. 1447 | */ 1448 | static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; 1449 | 1450 | /* 1451 | * bpf_override_return 1452 | * 1453 | * Used for error injection, this helper uses kprobes to override 1454 | * the return value of the probed function, and to set it to *rc*. 1455 | * The first argument is the context *regs* on which the kprobe 1456 | * works. 1457 | * 1458 | * This helper works by setting the PC (program counter) 1459 | * to an override function which is run in place of the original 1460 | * probed function. This means the probed function is not run at 1461 | * all. The replacement function just returns with the required 1462 | * value. 1463 | * 1464 | * This helper has security implications, and thus is subject to 1465 | * restrictions. It is only available if the kernel was compiled 1466 | * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration 1467 | * option, and in this case it only works on functions tagged with 1468 | * **ALLOW_ERROR_INJECTION** in the kernel code. 1469 | * 1470 | * Also, the helper is only available for the architectures having 1471 | * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, 1472 | * x86 architecture is the only one to support this feature. 
1473 | * 1474 | * Returns 1475 | * 0 1476 | */ 1477 | static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; 1478 | 1479 | /* 1480 | * bpf_sock_ops_cb_flags_set 1481 | * 1482 | * Attempt to set the value of the **bpf_sock_ops_cb_flags** field 1483 | * for the full TCP socket associated to *bpf_sock_ops* to 1484 | * *argval*. 1485 | * 1486 | * The primary use of this field is to determine if there should 1487 | * be calls to eBPF programs of type 1488 | * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP 1489 | * code. A program of the same type can change its value, per 1490 | * connection and as necessary, when the connection is 1491 | * established. This field is directly accessible for reading, but 1492 | * this helper must be used for updates in order to return an 1493 | * error if an eBPF program tries to set a callback that is not 1494 | * supported in the current kernel. 1495 | * 1496 | * *argval* is a flag array which can combine these flags: 1497 | * 1498 | * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) 1499 | * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) 1500 | * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) 1501 | * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) 1502 | * 1503 | * Therefore, this function can be used to clear a callback flag by 1504 | * setting the appropriate bit to zero. e.g. to disable the RTO 1505 | * callback: 1506 | * 1507 | * **bpf_sock_ops_cb_flags_set(bpf_sock,** 1508 | * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** 1509 | * 1510 | * Here are some examples of where one could call such eBPF 1511 | * program: 1512 | * 1513 | * * When RTO fires. 1514 | * * When a packet is retransmitted. 1515 | * * When the connection terminates. 1516 | * * When a packet is sent. 1517 | * * When a packet is received. 1518 | * 1519 | * Returns 1520 | * Code **-EINVAL** if the socket is not a full TCP socket; 1521 | * otherwise, a positive number containing the bits that could not 1522 | * be set is returned (which comes down to 0 if all bits were set 1523 | * as required). 1524 | */ 1525 | static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; 1526 | 1527 | /* 1528 | * bpf_msg_redirect_map 1529 | * 1530 | * This helper is used in programs implementing policies at the 1531 | * socket level. If the message *msg* is allowed to pass (i.e. if 1532 | * the verdict eBPF program returns **SK_PASS**), redirect it to 1533 | * the socket referenced by *map* (of type 1534 | * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and 1535 | * egress interfaces can be used for redirection. The 1536 | * **BPF_F_INGRESS** value in *flags* is used to make the 1537 | * distinction (ingress path is selected if the flag is present, 1538 | * egress path otherwise). This is the only flag supported for now. 1539 | * 1540 | * Returns 1541 | * **SK_PASS** on success, or **SK_DROP** on error. 1542 | */ 1543 | static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; 1544 | 1545 | /* 1546 | * bpf_msg_apply_bytes 1547 | * 1548 | * For socket policies, apply the verdict of the eBPF program to 1549 | * the next *bytes* (number of bytes) of message *msg*. 
1550 | * 1551 | * For example, this helper can be used in the following cases: 1552 | * 1553 | * * A single **sendmsg**\ () or **sendfile**\ () system call 1554 | * contains multiple logical messages that the eBPF program is 1555 | * supposed to read and for which it should apply a verdict. 1556 | * * An eBPF program only cares to read the first *bytes* of a 1557 | * *msg*. If the message has a large payload, then setting up 1558 | * and calling the eBPF program repeatedly for all bytes, even 1559 | * though the verdict is already known, would create unnecessary 1560 | * overhead. 1561 | * 1562 | * When called from within an eBPF program, the helper sets a 1563 | * counter internal to the BPF infrastructure, that is used to 1564 | * apply the last verdict to the next *bytes*. If *bytes* is 1565 | * smaller than the current data being processed from a 1566 | * **sendmsg**\ () or **sendfile**\ () system call, the first 1567 | * *bytes* will be sent and the eBPF program will be re-run with 1568 | * the pointer for start of data pointing to byte number *bytes* 1569 | * **+ 1**. If *bytes* is larger than the current data being 1570 | * processed, then the eBPF verdict will be applied to multiple 1571 | * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are 1572 | * consumed. 1573 | * 1574 | * Note that if a socket closes with the internal counter holding 1575 | * a non-zero value, this is not a problem because data is not 1576 | * being buffered for *bytes* and is sent as it is received. 1577 | * 1578 | * Returns 1579 | * 0 1580 | */ 1581 | static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; 1582 | 1583 | /* 1584 | * bpf_msg_cork_bytes 1585 | * 1586 | * For socket policies, prevent the execution of the verdict eBPF 1587 | * program for message *msg* until *bytes* (byte number) have been 1588 | * accumulated. 1589 | * 1590 | * This can be used when one needs a specific number of bytes 1591 | * before a verdict can be assigned, even if the data spans 1592 | * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme 1593 | * case would be a user calling **sendmsg**\ () repeatedly with 1594 | * 1-byte long message segments. Obviously, this is bad for 1595 | * performance, but it is still valid. If the eBPF program needs 1596 | * *bytes* bytes to validate a header, this helper can be used to 1597 | * prevent the eBPF program to be called again until *bytes* have 1598 | * been accumulated. 1599 | * 1600 | * Returns 1601 | * 0 1602 | */ 1603 | static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; 1604 | 1605 | /* 1606 | * bpf_msg_pull_data 1607 | * 1608 | * For socket policies, pull in non-linear data from user space 1609 | * for *msg* and set pointers *msg*\ **->data** and *msg*\ 1610 | * **->data_end** to *start* and *end* bytes offsets into *msg*, 1611 | * respectively. 1612 | * 1613 | * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 1614 | * *msg* it can only parse data that the (**data**, **data_end**) 1615 | * pointers have already consumed. For **sendmsg**\ () hooks this 1616 | * is likely the first scatterlist element. But for calls relying 1617 | * on the **sendpage** handler (e.g. **sendfile**\ ()) this will 1618 | * be the range (**0**, **0**) because the data is shared with 1619 | * user space and by default the objective is to avoid allowing 1620 | * user space to modify data while (or after) eBPF verdict is 1621 | * being decided. 
This helper can be used to pull in data and to 1622 | * set the start and end pointer to given values. Data will be 1623 | * copied if necessary (i.e. if data was not linear and if start 1624 | * and end pointers do not point to the same chunk). 1625 | * 1626 | * A call to this helper is susceptible to change the underlying 1627 | * packet buffer. Therefore, at load time, all checks on pointers 1628 | * previously done by the verifier are invalidated and must be 1629 | * performed again, if the helper is used in combination with 1630 | * direct packet access. 1631 | * 1632 | * All values for *flags* are reserved for future usage, and must 1633 | * be left at zero. 1634 | * 1635 | * Returns 1636 | * 0 on success, or a negative error in case of failure. 1637 | */ 1638 | static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; 1639 | 1640 | /* 1641 | * bpf_bind 1642 | * 1643 | * Bind the socket associated to *ctx* to the address pointed by 1644 | * *addr*, of length *addr_len*. This allows for making outgoing 1645 | * connection from the desired IP address, which can be useful for 1646 | * example when all processes inside a cgroup should use one 1647 | * single IP address on a host that has multiple IP configured. 1648 | * 1649 | * This helper works for IPv4 and IPv6, TCP and UDP sockets. The 1650 | * domain (*addr*\ **->sa_family**) must be **AF_INET** (or 1651 | * **AF_INET6**). It's advised to pass zero port (**sin_port** 1652 | * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like 1653 | * behavior and lets the kernel efficiently pick up an unused 1654 | * port as long as 4-tuple is unique. Passing non-zero port might 1655 | * lead to degraded performance. 1656 | * 1657 | * Returns 1658 | * 0 on success, or a negative error in case of failure. 1659 | */ 1660 | static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; 1661 | 1662 | /* 1663 | * bpf_xdp_adjust_tail 1664 | * 1665 | * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is 1666 | * possible to both shrink and grow the packet tail. 1667 | * Shrink done via *delta* being a negative integer. 1668 | * 1669 | * A call to this helper is susceptible to change the underlying 1670 | * packet buffer. Therefore, at load time, all checks on pointers 1671 | * previously done by the verifier are invalidated and must be 1672 | * performed again, if the helper is used in combination with 1673 | * direct packet access. 1674 | * 1675 | * Returns 1676 | * 0 on success, or a negative error in case of failure. 1677 | */ 1678 | static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; 1679 | 1680 | /* 1681 | * bpf_skb_get_xfrm_state 1682 | * 1683 | * Retrieve the XFRM state (IP transform framework, see also 1684 | * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. 1685 | * 1686 | * The retrieved value is stored in the **struct bpf_xfrm_state** 1687 | * pointed by *xfrm_state* and of length *size*. 1688 | * 1689 | * All values for *flags* are reserved for future usage, and must 1690 | * be left at zero. 1691 | * 1692 | * This helper is available only if the kernel was compiled with 1693 | * **CONFIG_XFRM** configuration option. 1694 | * 1695 | * Returns 1696 | * 0 on success, or a negative error in case of failure. 
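To make the zero-port advice in the bpf_bind() description above concrete, here is a hedged sketch in the style of a BPF_CGROUP_INET4_CONNECT program. The source address is a documentation address, and the sketch assumes struct sockaddr_in and AF_INET from the usual uapi headers plus bpf_htonl() from the vendored bpf_endian.h; it is not part of this repository.

```c
/* Illustrative sketch: pin outgoing IPv4 connections to one source address
 * while leaving sin_port at 0 so the kernel picks a free port, as the
 * documentation above recommends. 192.0.2.1 is from TEST-NET-1. */
static inline int bind_source_addr(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in addr = {};

	addr.sin_family      = AF_INET;
	addr.sin_port        = 0;                     /* kernel chooses the port */
	addr.sin_addr.s_addr = bpf_htonl(0xC0000201); /* 192.0.2.1 */

	return bpf_bind(ctx, (struct sockaddr *)&addr, sizeof(addr));
}
```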
1697 | */ 1698 | static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; 1699 | 1700 | /* 1701 | * bpf_get_stack 1702 | * 1703 | * Return a user or a kernel stack in bpf program provided buffer. 1704 | * To achieve this, the helper needs *ctx*, which is a pointer 1705 | * to the context on which the tracing program is executed. 1706 | * To store the stacktrace, the bpf program provides *buf* with 1707 | * a nonnegative *size*. 1708 | * 1709 | * The last argument, *flags*, holds the number of stack frames to 1710 | * skip (from 0 to 255), masked with 1711 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 1712 | * the following flags: 1713 | * 1714 | * **BPF_F_USER_STACK** 1715 | * Collect a user space stack instead of a kernel stack. 1716 | * **BPF_F_USER_BUILD_ID** 1717 | * Collect buildid+offset instead of ips for user stack, 1718 | * only valid if **BPF_F_USER_STACK** is also specified. 1719 | * 1720 | * **bpf_get_stack**\ () can collect up to 1721 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject 1722 | * to sufficient large buffer size. Note that 1723 | * this limit can be controlled with the **sysctl** program, and 1724 | * that it should be manually increased in order to profile long 1725 | * user stacks (such as stacks for Java programs). To do so, use: 1726 | * 1727 | * :: 1728 | * 1729 | * # sysctl kernel.perf_event_max_stack= 1730 | * 1731 | * Returns 1732 | * A non-negative value equal to or less than *size* on success, 1733 | * or a negative error in case of failure. 1734 | */ 1735 | static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; 1736 | 1737 | /* 1738 | * bpf_skb_load_bytes_relative 1739 | * 1740 | * This helper is similar to **bpf_skb_load_bytes**\ () in that 1741 | * it provides an easy way to load *len* bytes from *offset* 1742 | * from the packet associated to *skb*, into the buffer pointed 1743 | * by *to*. The difference to **bpf_skb_load_bytes**\ () is that 1744 | * a fifth argument *start_header* exists in order to select a 1745 | * base offset to start from. *start_header* can be one of: 1746 | * 1747 | * **BPF_HDR_START_MAC** 1748 | * Base offset to load data from is *skb*'s mac header. 1749 | * **BPF_HDR_START_NET** 1750 | * Base offset to load data from is *skb*'s network header. 1751 | * 1752 | * In general, "direct packet access" is the preferred method to 1753 | * access packet data, however, this helper is in particular useful 1754 | * in socket filters where *skb*\ **->data** does not always point 1755 | * to the start of the mac header and where "direct packet access" 1756 | * is not available. 1757 | * 1758 | * Returns 1759 | * 0 on success, or a negative error in case of failure. 1760 | */ 1761 | static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; 1762 | 1763 | /* 1764 | * bpf_fib_lookup 1765 | * 1766 | * Do FIB lookup in kernel tables using parameters in *params*. 1767 | * If lookup is successful and result shows packet is to be 1768 | * forwarded, the neighbor tables are searched for the nexthop. 
1769 | * If successful (ie., FIB lookup shows forwarding and nexthop 1770 | * is resolved), the nexthop address is returned in ipv4_dst 1771 | * or ipv6_dst based on family, smac is set to mac address of 1772 | * egress device, dmac is set to nexthop mac address, rt_metric 1773 | * is set to metric from route (IPv4/IPv6 only), and ifindex 1774 | * is set to the device index of the nexthop from the FIB lookup. 1775 | * 1776 | * *plen* argument is the size of the passed in struct. 1777 | * *flags* argument can be a combination of one or more of the 1778 | * following values: 1779 | * 1780 | * **BPF_FIB_LOOKUP_DIRECT** 1781 | * Do a direct table lookup vs full lookup using FIB 1782 | * rules. 1783 | * **BPF_FIB_LOOKUP_OUTPUT** 1784 | * Perform lookup from an egress perspective (default is 1785 | * ingress). 1786 | * 1787 | * *ctx* is either **struct xdp_md** for XDP programs or 1788 | * **struct sk_buff** tc cls_act programs. 1789 | * 1790 | * Returns 1791 | * * < 0 if any input argument is invalid 1792 | * * 0 on success (packet is forwarded, nexthop neighbor exists) 1793 | * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the 1794 | * packet is not forwarded or needs assist from full stack 1795 | */ 1796 | static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; 1797 | 1798 | /* 1799 | * bpf_sock_hash_update 1800 | * 1801 | * Add an entry to, or update a sockhash *map* referencing sockets. 1802 | * The *skops* is used as a new value for the entry associated to 1803 | * *key*. *flags* is one of: 1804 | * 1805 | * **BPF_NOEXIST** 1806 | * The entry for *key* must not exist in the map. 1807 | * **BPF_EXIST** 1808 | * The entry for *key* must already exist in the map. 1809 | * **BPF_ANY** 1810 | * No condition on the existence of the entry for *key*. 1811 | * 1812 | * If the *map* has eBPF programs (parser and verdict), those will 1813 | * be inherited by the socket being added. If the socket is 1814 | * already attached to eBPF programs, this results in an error. 1815 | * 1816 | * Returns 1817 | * 0 on success, or a negative error in case of failure. 1818 | */ 1819 | static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; 1820 | 1821 | /* 1822 | * bpf_msg_redirect_hash 1823 | * 1824 | * This helper is used in programs implementing policies at the 1825 | * socket level. If the message *msg* is allowed to pass (i.e. if 1826 | * the verdict eBPF program returns **SK_PASS**), redirect it to 1827 | * the socket referenced by *map* (of type 1828 | * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and 1829 | * egress interfaces can be used for redirection. The 1830 | * **BPF_F_INGRESS** value in *flags* is used to make the 1831 | * distinction (ingress path is selected if the flag is present, 1832 | * egress path otherwise). This is the only flag supported for now. 1833 | * 1834 | * Returns 1835 | * **SK_PASS** on success, or **SK_DROP** on error. 1836 | */ 1837 | static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; 1838 | 1839 | /* 1840 | * bpf_sk_redirect_hash 1841 | * 1842 | * This helper is used in programs implementing policies at the 1843 | * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. 1844 | * if the verdeict eBPF program returns **SK_PASS**), redirect it 1845 | * to the socket referenced by *map* (of type 1846 | * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. 
Both ingress and 1847 | * egress interfaces can be used for redirection. The 1848 | * **BPF_F_INGRESS** value in *flags* is used to make the 1849 | * distinction (ingress path is selected if the flag is present, 1850 | * egress otherwise). This is the only flag supported for now. 1851 | * 1852 | * Returns 1853 | * **SK_PASS** on success, or **SK_DROP** on error. 1854 | */ 1855 | static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; 1856 | 1857 | /* 1858 | * bpf_lwt_push_encap 1859 | * 1860 | * Encapsulate the packet associated to *skb* within a Layer 3 1861 | * protocol header. This header is provided in the buffer at 1862 | * address *hdr*, with *len* its size in bytes. *type* indicates 1863 | * the protocol of the header and can be one of: 1864 | * 1865 | * **BPF_LWT_ENCAP_SEG6** 1866 | * IPv6 encapsulation with Segment Routing Header 1867 | * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, 1868 | * the IPv6 header is computed by the kernel. 1869 | * **BPF_LWT_ENCAP_SEG6_INLINE** 1870 | * Only works if *skb* contains an IPv6 packet. Insert a 1871 | * Segment Routing Header (**struct ipv6_sr_hdr**) inside 1872 | * the IPv6 header. 1873 | * **BPF_LWT_ENCAP_IP** 1874 | * IP encapsulation (GRE/GUE/IPIP/etc). The outer header 1875 | * must be IPv4 or IPv6, followed by zero or more 1876 | * additional headers, up to **LWT_BPF_MAX_HEADROOM** 1877 | * total bytes in all prepended headers. Please note that 1878 | * if **skb_is_gso**\ (*skb*) is true, no more than two 1879 | * headers can be prepended, and the inner header, if 1880 | * present, should be either GRE or UDP/GUE. 1881 | * 1882 | * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs 1883 | * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can 1884 | * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and 1885 | * **BPF_PROG_TYPE_LWT_XMIT**. 1886 | * 1887 | * A call to this helper is susceptible to change the underlying 1888 | * packet buffer. Therefore, at load time, all checks on pointers 1889 | * previously done by the verifier are invalidated and must be 1890 | * performed again, if the helper is used in combination with 1891 | * direct packet access. 1892 | * 1893 | * Returns 1894 | * 0 on success, or a negative error in case of failure. 1895 | */ 1896 | static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; 1897 | 1898 | /* 1899 | * bpf_lwt_seg6_store_bytes 1900 | * 1901 | * Store *len* bytes from address *from* into the packet 1902 | * associated to *skb*, at *offset*. Only the flags, tag and TLVs 1903 | * inside the outermost IPv6 Segment Routing Header can be 1904 | * modified through this helper. 1905 | * 1906 | * A call to this helper is susceptible to change the underlying 1907 | * packet buffer. Therefore, at load time, all checks on pointers 1908 | * previously done by the verifier are invalidated and must be 1909 | * performed again, if the helper is used in combination with 1910 | * direct packet access. 1911 | * 1912 | * Returns 1913 | * 0 on success, or a negative error in case of failure. 1914 | */ 1915 | static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; 1916 | 1917 | /* 1918 | * bpf_lwt_seg6_adjust_srh 1919 | * 1920 | * Adjust the size allocated to TLVs in the outermost IPv6 1921 | * Segment Routing Header contained in the packet associated to 1922 | * *skb*, at position *offset* by *delta* bytes. 
Only offsets 1923 | * after the segments are accepted. *delta* can be as well 1924 | * positive (growing) as negative (shrinking). 1925 | * 1926 | * A call to this helper is susceptible to change the underlying 1927 | * packet buffer. Therefore, at load time, all checks on pointers 1928 | * previously done by the verifier are invalidated and must be 1929 | * performed again, if the helper is used in combination with 1930 | * direct packet access. 1931 | * 1932 | * Returns 1933 | * 0 on success, or a negative error in case of failure. 1934 | */ 1935 | static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; 1936 | 1937 | /* 1938 | * bpf_lwt_seg6_action 1939 | * 1940 | * Apply an IPv6 Segment Routing action of type *action* to the 1941 | * packet associated to *skb*. Each action takes a parameter 1942 | * contained at address *param*, and of length *param_len* bytes. 1943 | * *action* can be one of: 1944 | * 1945 | * **SEG6_LOCAL_ACTION_END_X** 1946 | * End.X action: Endpoint with Layer-3 cross-connect. 1947 | * Type of *param*: **struct in6_addr**. 1948 | * **SEG6_LOCAL_ACTION_END_T** 1949 | * End.T action: Endpoint with specific IPv6 table lookup. 1950 | * Type of *param*: **int**. 1951 | * **SEG6_LOCAL_ACTION_END_B6** 1952 | * End.B6 action: Endpoint bound to an SRv6 policy. 1953 | * Type of *param*: **struct ipv6_sr_hdr**. 1954 | * **SEG6_LOCAL_ACTION_END_B6_ENCAP** 1955 | * End.B6.Encap action: Endpoint bound to an SRv6 1956 | * encapsulation policy. 1957 | * Type of *param*: **struct ipv6_sr_hdr**. 1958 | * 1959 | * A call to this helper is susceptible to change the underlying 1960 | * packet buffer. Therefore, at load time, all checks on pointers 1961 | * previously done by the verifier are invalidated and must be 1962 | * performed again, if the helper is used in combination with 1963 | * direct packet access. 1964 | * 1965 | * Returns 1966 | * 0 on success, or a negative error in case of failure. 1967 | */ 1968 | static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; 1969 | 1970 | /* 1971 | * bpf_rc_repeat 1972 | * 1973 | * This helper is used in programs implementing IR decoding, to 1974 | * report a successfully decoded repeat key message. This delays 1975 | * the generation of a key up event for previously generated 1976 | * key down event. 1977 | * 1978 | * Some IR protocols like NEC have a special IR message for 1979 | * repeating last button, for when a button is held down. 1980 | * 1981 | * The *ctx* should point to the lirc sample as passed into 1982 | * the program. 1983 | * 1984 | * This helper is only available is the kernel was compiled with 1985 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 1986 | * "**y**". 1987 | * 1988 | * Returns 1989 | * 0 1990 | */ 1991 | static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; 1992 | 1993 | /* 1994 | * bpf_rc_keydown 1995 | * 1996 | * This helper is used in programs implementing IR decoding, to 1997 | * report a successfully decoded key press with *scancode*, 1998 | * *toggle* value in the given *protocol*. The scancode will be 1999 | * translated to a keycode using the rc keymap, and reported as 2000 | * an input key down event. After a period a key up event is 2001 | * generated. This period can be extended by calling either 2002 | * **bpf_rc_keydown**\ () again with the same values, or calling 2003 | * **bpf_rc_repeat**\ (). 
2004 | * 2005 | * Some protocols include a toggle bit, in case the button was 2006 | * released and pressed again between consecutive scancodes. 2007 | * 2008 | * The *ctx* should point to the lirc sample as passed into 2009 | * the program. 2010 | * 2011 | * The *protocol* is the decoded protocol number (see 2012 | * **enum rc_proto** for some predefined values). 2013 | * 2014 | * This helper is only available is the kernel was compiled with 2015 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 2016 | * "**y**". 2017 | * 2018 | * Returns 2019 | * 0 2020 | */ 2021 | static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; 2022 | 2023 | /* 2024 | * bpf_skb_cgroup_id 2025 | * 2026 | * Return the cgroup v2 id of the socket associated with the *skb*. 2027 | * This is roughly similar to the **bpf_get_cgroup_classid**\ () 2028 | * helper for cgroup v1 by providing a tag resp. identifier that 2029 | * can be matched on or used for map lookups e.g. to implement 2030 | * policy. The cgroup v2 id of a given path in the hierarchy is 2031 | * exposed in user space through the f_handle API in order to get 2032 | * to the same 64-bit id. 2033 | * 2034 | * This helper can be used on TC egress path, but not on ingress, 2035 | * and is available only if the kernel was compiled with the 2036 | * **CONFIG_SOCK_CGROUP_DATA** configuration option. 2037 | * 2038 | * Returns 2039 | * The id is returned or 0 in case the id could not be retrieved. 2040 | */ 2041 | static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; 2042 | 2043 | /* 2044 | * bpf_get_current_cgroup_id 2045 | * 2046 | * 2047 | * Returns 2048 | * A 64-bit integer containing the current cgroup id based 2049 | * on the cgroup within which the current task is running. 2050 | */ 2051 | static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; 2052 | 2053 | /* 2054 | * bpf_get_local_storage 2055 | * 2056 | * Get the pointer to the local storage area. 2057 | * The type and the size of the local storage is defined 2058 | * by the *map* argument. 2059 | * The *flags* meaning is specific for each map type, 2060 | * and has to be 0 for cgroup local storage. 2061 | * 2062 | * Depending on the BPF program type, a local storage area 2063 | * can be shared between multiple instances of the BPF program, 2064 | * running simultaneously. 2065 | * 2066 | * A user should care about the synchronization by himself. 2067 | * For example, by using the **BPF_STX_XADD** instruction to alter 2068 | * the shared data. 2069 | * 2070 | * Returns 2071 | * A pointer to the local storage area. 2072 | */ 2073 | static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; 2074 | 2075 | /* 2076 | * bpf_sk_select_reuseport 2077 | * 2078 | * Select a **SO_REUSEPORT** socket from a 2079 | * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. 2080 | * It checks the selected socket is matching the incoming 2081 | * request in the socket buffer. 2082 | * 2083 | * Returns 2084 | * 0 on success, or a negative error in case of failure. 2085 | */ 2086 | static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; 2087 | 2088 | /* 2089 | * bpf_skb_ancestor_cgroup_id 2090 | * 2091 | * Return id of cgroup v2 that is ancestor of cgroup associated 2092 | * with the *skb* at the *ancestor_level*. The root cgroup is at 2093 | * *ancestor_level* zero and each step down the hierarchy 2094 | * increments the level. 
If *ancestor_level* == level of cgroup 2095 | * associated with *skb*, then return value will be same as that 2096 | * of **bpf_skb_cgroup_id**\ (). 2097 | * 2098 | * The helper is useful to implement policies based on cgroups 2099 | * that are upper in hierarchy than immediate cgroup associated 2100 | * with *skb*. 2101 | * 2102 | * The format of returned id and helper limitations are same as in 2103 | * **bpf_skb_cgroup_id**\ (). 2104 | * 2105 | * Returns 2106 | * The id is returned or 0 in case the id could not be retrieved. 2107 | */ 2108 | static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; 2109 | 2110 | /* 2111 | * bpf_sk_lookup_tcp 2112 | * 2113 | * Look for TCP socket matching *tuple*, optionally in a child 2114 | * network namespace *netns*. The return value must be checked, 2115 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2116 | * 2117 | * The *ctx* should point to the context of the program, such as 2118 | * the skb or socket (depending on the hook in use). This is used 2119 | * to determine the base network namespace for the lookup. 2120 | * 2121 | * *tuple_size* must be one of: 2122 | * 2123 | * **sizeof**\ (*tuple*\ **->ipv4**) 2124 | * Look for an IPv4 socket. 2125 | * **sizeof**\ (*tuple*\ **->ipv6**) 2126 | * Look for an IPv6 socket. 2127 | * 2128 | * If the *netns* is a negative signed 32-bit integer, then the 2129 | * socket lookup table in the netns associated with the *ctx* 2130 | * will be used. For the TC hooks, this is the netns of the device 2131 | * in the skb. For socket hooks, this is the netns of the socket. 2132 | * If *netns* is any other signed 32-bit value greater than or 2133 | * equal to zero then it specifies the ID of the netns relative to 2134 | * the netns associated with the *ctx*. *netns* values beyond the 2135 | * range of 32-bit integers are reserved for future use. 2136 | * 2137 | * All values for *flags* are reserved for future usage, and must 2138 | * be left at zero. 2139 | * 2140 | * This helper is available only if the kernel was compiled with 2141 | * **CONFIG_NET** configuration option. 2142 | * 2143 | * Returns 2144 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2145 | * For sockets with reuseport option, the **struct bpf_sock** 2146 | * result is from *reuse*\ **->socks**\ [] using the hash of the 2147 | * tuple. 2148 | */ 2149 | static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; 2150 | 2151 | /* 2152 | * bpf_sk_lookup_udp 2153 | * 2154 | * Look for UDP socket matching *tuple*, optionally in a child 2155 | * network namespace *netns*. The return value must be checked, 2156 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2157 | * 2158 | * The *ctx* should point to the context of the program, such as 2159 | * the skb or socket (depending on the hook in use). This is used 2160 | * to determine the base network namespace for the lookup. 2161 | * 2162 | * *tuple_size* must be one of: 2163 | * 2164 | * **sizeof**\ (*tuple*\ **->ipv4**) 2165 | * Look for an IPv4 socket. 2166 | * **sizeof**\ (*tuple*\ **->ipv6**) 2167 | * Look for an IPv6 socket. 2168 | * 2169 | * If the *netns* is a negative signed 32-bit integer, then the 2170 | * socket lookup table in the netns associated with the *ctx* 2171 | * will be used. For the TC hooks, this is the netns of the device 2172 | * in the skb. For socket hooks, this is the netns of the socket. 
2173 | * If *netns* is any other signed 32-bit value greater than or 2174 | * equal to zero then it specifies the ID of the netns relative to 2175 | * the netns associated with the *ctx*. *netns* values beyond the 2176 | * range of 32-bit integers are reserved for future use. 2177 | * 2178 | * All values for *flags* are reserved for future usage, and must 2179 | * be left at zero. 2180 | * 2181 | * This helper is available only if the kernel was compiled with 2182 | * **CONFIG_NET** configuration option. 2183 | * 2184 | * Returns 2185 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2186 | * For sockets with reuseport option, the **struct bpf_sock** 2187 | * result is from *reuse*\ **->socks**\ [] using the hash of the 2188 | * tuple. 2189 | */ 2190 | static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; 2191 | 2192 | /* 2193 | * bpf_sk_release 2194 | * 2195 | * Release the reference held by *sock*. *sock* must be a 2196 | * non-**NULL** pointer that was returned from 2197 | * **bpf_sk_lookup_xxx**\ (). 2198 | * 2199 | * Returns 2200 | * 0 on success, or a negative error in case of failure. 2201 | */ 2202 | static long (*bpf_sk_release)(struct bpf_sock *sock) = (void *) 86; 2203 | 2204 | /* 2205 | * bpf_map_push_elem 2206 | * 2207 | * Push an element *value* in *map*. *flags* is one of: 2208 | * 2209 | * **BPF_EXIST** 2210 | * If the queue/stack is full, the oldest element is 2211 | * removed to make room for this. 2212 | * 2213 | * Returns 2214 | * 0 on success, or a negative error in case of failure. 2215 | */ 2216 | static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; 2217 | 2218 | /* 2219 | * bpf_map_pop_elem 2220 | * 2221 | * Pop an element from *map*. 2222 | * 2223 | * Returns 2224 | * 0 on success, or a negative error in case of failure. 2225 | */ 2226 | static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; 2227 | 2228 | /* 2229 | * bpf_map_peek_elem 2230 | * 2231 | * Get an element from *map* without removing it. 2232 | * 2233 | * Returns 2234 | * 0 on success, or a negative error in case of failure. 2235 | */ 2236 | static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; 2237 | 2238 | /* 2239 | * bpf_msg_push_data 2240 | * 2241 | * For socket policies, insert *len* bytes into *msg* at offset 2242 | * *start*. 2243 | * 2244 | * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a 2245 | * *msg* it may want to insert metadata or options into the *msg*. 2246 | * This can later be read and used by any of the lower layer BPF 2247 | * hooks. 2248 | * 2249 | * This helper may fail if under memory pressure (a malloc 2250 | * fails) in these cases BPF programs will get an appropriate 2251 | * error and BPF programs will need to handle them. 2252 | * 2253 | * Returns 2254 | * 0 on success, or a negative error in case of failure. 2255 | */ 2256 | static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; 2257 | 2258 | /* 2259 | * bpf_msg_pop_data 2260 | * 2261 | * Will remove *len* bytes from a *msg* starting at byte *start*. 2262 | * This may result in **ENOMEM** errors under certain situations if 2263 | * an allocation and copy are required due to a full ring buffer. 2264 | * However, the helper will try to avoid doing the allocation 2265 | * if possible. 
Other errors can occur if input parameters are 2266 | * invalid either due to *start* byte not being valid part of *msg* 2267 | * payload and/or *pop* value being to large. 2268 | * 2269 | * Returns 2270 | * 0 on success, or a negative error in case of failure. 2271 | */ 2272 | static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; 2273 | 2274 | /* 2275 | * bpf_rc_pointer_rel 2276 | * 2277 | * This helper is used in programs implementing IR decoding, to 2278 | * report a successfully decoded pointer movement. 2279 | * 2280 | * The *ctx* should point to the lirc sample as passed into 2281 | * the program. 2282 | * 2283 | * This helper is only available is the kernel was compiled with 2284 | * the **CONFIG_BPF_LIRC_MODE2** configuration option set to 2285 | * "**y**". 2286 | * 2287 | * Returns 2288 | * 0 2289 | */ 2290 | static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; 2291 | 2292 | /* 2293 | * bpf_spin_lock 2294 | * 2295 | * Acquire a spinlock represented by the pointer *lock*, which is 2296 | * stored as part of a value of a map. Taking the lock allows to 2297 | * safely update the rest of the fields in that value. The 2298 | * spinlock can (and must) later be released with a call to 2299 | * **bpf_spin_unlock**\ (\ *lock*\ ). 2300 | * 2301 | * Spinlocks in BPF programs come with a number of restrictions 2302 | * and constraints: 2303 | * 2304 | * * **bpf_spin_lock** objects are only allowed inside maps of 2305 | * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this 2306 | * list could be extended in the future). 2307 | * * BTF description of the map is mandatory. 2308 | * * The BPF program can take ONE lock at a time, since taking two 2309 | * or more could cause dead locks. 2310 | * * Only one **struct bpf_spin_lock** is allowed per map element. 2311 | * * When the lock is taken, calls (either BPF to BPF or helpers) 2312 | * are not allowed. 2313 | * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not 2314 | * allowed inside a spinlock-ed region. 2315 | * * The BPF program MUST call **bpf_spin_unlock**\ () to release 2316 | * the lock, on all execution paths, before it returns. 2317 | * * The BPF program can access **struct bpf_spin_lock** only via 2318 | * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () 2319 | * helpers. Loading or storing data into the **struct 2320 | * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. 2321 | * * To use the **bpf_spin_lock**\ () helper, the BTF description 2322 | * of the map value must be a struct and have **struct 2323 | * bpf_spin_lock** *anyname*\ **;** field at the top level. 2324 | * Nested lock inside another struct is not allowed. 2325 | * * The **struct bpf_spin_lock** *lock* field in a map value must 2326 | * be aligned on a multiple of 4 bytes in that value. 2327 | * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy 2328 | * the **bpf_spin_lock** field to user space. 2329 | * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from 2330 | * a BPF program, do not update the **bpf_spin_lock** field. 2331 | * * **bpf_spin_lock** cannot be on the stack or inside a 2332 | * networking packet (it can only be inside of a map values). 2333 | * * **bpf_spin_lock** is available to root only. 2334 | * * Tracing programs and socket filter programs cannot use 2335 | * **bpf_spin_lock**\ () due to insufficient preemption checks 2336 | * (but this may change in the future). 
2337 | * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. 2338 | * 2339 | * Returns 2340 | * 0 2341 | */ 2342 | static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; 2343 | 2344 | /* 2345 | * bpf_spin_unlock 2346 | * 2347 | * Release the *lock* previously locked by a call to 2348 | * **bpf_spin_lock**\ (\ *lock*\ ). 2349 | * 2350 | * Returns 2351 | * 0 2352 | */ 2353 | static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; 2354 | 2355 | /* 2356 | * bpf_sk_fullsock 2357 | * 2358 | * This helper gets a **struct bpf_sock** pointer such 2359 | * that all the fields in this **bpf_sock** can be accessed. 2360 | * 2361 | * Returns 2362 | * A **struct bpf_sock** pointer on success, or **NULL** in 2363 | * case of failure. 2364 | */ 2365 | static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; 2366 | 2367 | /* 2368 | * bpf_tcp_sock 2369 | * 2370 | * This helper gets a **struct bpf_tcp_sock** pointer from a 2371 | * **struct bpf_sock** pointer. 2372 | * 2373 | * Returns 2374 | * A **struct bpf_tcp_sock** pointer on success, or **NULL** in 2375 | * case of failure. 2376 | */ 2377 | static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; 2378 | 2379 | /* 2380 | * bpf_skb_ecn_set_ce 2381 | * 2382 | * Set ECN (Explicit Congestion Notification) field of IP header 2383 | * to **CE** (Congestion Encountered) if current value is **ECT** 2384 | * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 2385 | * and IPv4. 2386 | * 2387 | * Returns 2388 | * 1 if the **CE** flag is set (either by the current helper call 2389 | * or because it was already present), 0 if it is not set. 2390 | */ 2391 | static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; 2392 | 2393 | /* 2394 | * bpf_get_listener_sock 2395 | * 2396 | * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. 2397 | * **bpf_sk_release**\ () is unnecessary and not allowed. 2398 | * 2399 | * Returns 2400 | * A **struct bpf_sock** pointer on success, or **NULL** in 2401 | * case of failure. 2402 | */ 2403 | static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; 2404 | 2405 | /* 2406 | * bpf_skc_lookup_tcp 2407 | * 2408 | * Look for TCP socket matching *tuple*, optionally in a child 2409 | * network namespace *netns*. The return value must be checked, 2410 | * and if non-**NULL**, released via **bpf_sk_release**\ (). 2411 | * 2412 | * This function is identical to **bpf_sk_lookup_tcp**\ (), except 2413 | * that it also returns timewait or request sockets. Use 2414 | * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the 2415 | * full structure. 2416 | * 2417 | * This helper is available only if the kernel was compiled with 2418 | * **CONFIG_NET** configuration option. 2419 | * 2420 | * Returns 2421 | * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 2422 | * For sockets with reuseport option, the **struct bpf_sock** 2423 | * result is from *reuse*\ **->socks**\ [] using the hash of the 2424 | * tuple. 2425 | */ 2426 | static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; 2427 | 2428 | /* 2429 | * bpf_tcp_check_syncookie 2430 | * 2431 | * Check whether *iph* and *th* contain a valid SYN cookie ACK for 2432 | * the listening socket in *sk*. 
2433 | * 2434 | * *iph* points to the start of the IPv4 or IPv6 header, while 2435 | * *iph_len* contains **sizeof**\ (**struct iphdr**) or 2436 | * **sizeof**\ (**struct ip6hdr**). 2437 | * 2438 | * *th* points to the start of the TCP header, while *th_len* 2439 | * contains **sizeof**\ (**struct tcphdr**). 2440 | * 2441 | * Returns 2442 | * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative 2443 | * error otherwise. 2444 | */ 2445 | static long (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; 2446 | 2447 | /* 2448 | * bpf_sysctl_get_name 2449 | * 2450 | * Get name of sysctl in /proc/sys/ and copy it into provided by 2451 | * program buffer *buf* of size *buf_len*. 2452 | * 2453 | * The buffer is always NUL terminated, unless it's zero-sized. 2454 | * 2455 | * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is 2456 | * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name 2457 | * only (e.g. "tcp_mem"). 2458 | * 2459 | * Returns 2460 | * Number of character copied (not including the trailing NUL). 2461 | * 2462 | * **-E2BIG** if the buffer wasn't big enough (*buf* will contain 2463 | * truncated name in this case). 2464 | */ 2465 | static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; 2466 | 2467 | /* 2468 | * bpf_sysctl_get_current_value 2469 | * 2470 | * Get current value of sysctl as it is presented in /proc/sys 2471 | * (incl. newline, etc), and copy it as a string into provided 2472 | * by program buffer *buf* of size *buf_len*. 2473 | * 2474 | * The whole value is copied, no matter what file position user 2475 | * space issued e.g. sys_read at. 2476 | * 2477 | * The buffer is always NUL terminated, unless it's zero-sized. 2478 | * 2479 | * Returns 2480 | * Number of character copied (not including the trailing NUL). 2481 | * 2482 | * **-E2BIG** if the buffer wasn't big enough (*buf* will contain 2483 | * truncated name in this case). 2484 | * 2485 | * **-EINVAL** if current value was unavailable, e.g. because 2486 | * sysctl is uninitialized and read returns -EIO for it. 2487 | */ 2488 | static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; 2489 | 2490 | /* 2491 | * bpf_sysctl_get_new_value 2492 | * 2493 | * Get new value being written by user space to sysctl (before 2494 | * the actual write happens) and copy it as a string into 2495 | * provided by program buffer *buf* of size *buf_len*. 2496 | * 2497 | * User space may write new value at file position > 0. 2498 | * 2499 | * The buffer is always NUL terminated, unless it's zero-sized. 2500 | * 2501 | * Returns 2502 | * Number of character copied (not including the trailing NUL). 2503 | * 2504 | * **-E2BIG** if the buffer wasn't big enough (*buf* will contain 2505 | * truncated name in this case). 2506 | * 2507 | * **-EINVAL** if sysctl is being read. 2508 | */ 2509 | static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; 2510 | 2511 | /* 2512 | * bpf_sysctl_set_new_value 2513 | * 2514 | * Override new value being written by user space to sysctl with 2515 | * value provided by program in buffer *buf* of size *buf_len*. 2516 | * 2517 | * *buf* should contain a string in same form as provided by user 2518 | * space on sysctl write. 2519 | * 2520 | * User space may write new value at file position > 0. 
To override 2521 | * the whole sysctl value file position should be set to zero. 2522 | * 2523 | * Returns 2524 | * 0 on success. 2525 | * 2526 | * **-E2BIG** if the *buf_len* is too big. 2527 | * 2528 | * **-EINVAL** if sysctl is being read. 2529 | */ 2530 | static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; 2531 | 2532 | /* 2533 | * bpf_strtol 2534 | * 2535 | * Convert the initial part of the string from buffer *buf* of 2536 | * size *buf_len* to a long integer according to the given base 2537 | * and save the result in *res*. 2538 | * 2539 | * The string may begin with an arbitrary amount of white space 2540 | * (as determined by **isspace**\ (3)) followed by a single 2541 | * optional '**-**' sign. 2542 | * 2543 | * Five least significant bits of *flags* encode base, other bits 2544 | * are currently unused. 2545 | * 2546 | * Base must be either 8, 10, 16 or 0 to detect it automatically 2547 | * similar to user space **strtol**\ (3). 2548 | * 2549 | * Returns 2550 | * Number of characters consumed on success. Must be positive but 2551 | * no more than *buf_len*. 2552 | * 2553 | * **-EINVAL** if no valid digits were found or unsupported base 2554 | * was provided. 2555 | * 2556 | * **-ERANGE** if resulting value was out of range. 2557 | */ 2558 | static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; 2559 | 2560 | /* 2561 | * bpf_strtoul 2562 | * 2563 | * Convert the initial part of the string from buffer *buf* of 2564 | * size *buf_len* to an unsigned long integer according to the 2565 | * given base and save the result in *res*. 2566 | * 2567 | * The string may begin with an arbitrary amount of white space 2568 | * (as determined by **isspace**\ (3)). 2569 | * 2570 | * Five least significant bits of *flags* encode base, other bits 2571 | * are currently unused. 2572 | * 2573 | * Base must be either 8, 10, 16 or 0 to detect it automatically 2574 | * similar to user space **strtoul**\ (3). 2575 | * 2576 | * Returns 2577 | * Number of characters consumed on success. Must be positive but 2578 | * no more than *buf_len*. 2579 | * 2580 | * **-EINVAL** if no valid digits were found or unsupported base 2581 | * was provided. 2582 | * 2583 | * **-ERANGE** if resulting value was out of range. 2584 | */ 2585 | static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; 2586 | 2587 | /* 2588 | * bpf_sk_storage_get 2589 | * 2590 | * Get a bpf-local-storage from a *sk*. 2591 | * 2592 | * Logically, it could be thought of getting the value from 2593 | * a *map* with *sk* as the **key**. From this 2594 | * perspective, the usage is not much different from 2595 | * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this 2596 | * helper enforces the key must be a full socket and the map must 2597 | * be a **BPF_MAP_TYPE_SK_STORAGE** also. 2598 | * 2599 | * Underneath, the value is stored locally at *sk* instead of 2600 | * the *map*. The *map* is used as the bpf-local-storage 2601 | * "type". The bpf-local-storage "type" (i.e. the *map*) is 2602 | * searched against all bpf-local-storages residing at *sk*. 2603 | * 2604 | * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be 2605 | * used such that a new bpf-local-storage will be 2606 | * created if one does not exist. *value* can be used 2607 | * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify 2608 | * the initial value of a bpf-local-storage. 
If *value* is 2609 | * **NULL**, the new bpf-local-storage will be zero initialized. 2610 | * 2611 | * Returns 2612 | * A bpf-local-storage pointer is returned on success. 2613 | * 2614 | * **NULL** if not found or there was an error in adding 2615 | * a new bpf-local-storage. 2616 | */ 2617 | static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, void *value, __u64 flags) = (void *) 107; 2618 | 2619 | /* 2620 | * bpf_sk_storage_delete 2621 | * 2622 | * Delete a bpf-local-storage from a *sk*. 2623 | * 2624 | * Returns 2625 | * 0 on success. 2626 | * 2627 | * **-ENOENT** if the bpf-local-storage cannot be found. 2628 | */ 2629 | static long (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = (void *) 108; 2630 | 2631 | /* 2632 | * bpf_send_signal 2633 | * 2634 | * Send signal *sig* to the process of the current task. 2635 | * The signal may be delivered to any of this process's threads. 2636 | * 2637 | * Returns 2638 | * 0 on success or successfully queued. 2639 | * 2640 | * **-EBUSY** if work queue under nmi is full. 2641 | * 2642 | * **-EINVAL** if *sig* is invalid. 2643 | * 2644 | * **-EPERM** if no permission to send the *sig*. 2645 | * 2646 | * **-EAGAIN** if bpf program can try again. 2647 | */ 2648 | static long (*bpf_send_signal)(__u32 sig) = (void *) 109; 2649 | 2650 | /* 2651 | * bpf_tcp_gen_syncookie 2652 | * 2653 | * Try to issue a SYN cookie for the packet with corresponding 2654 | * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. 2655 | * 2656 | * *iph* points to the start of the IPv4 or IPv6 header, while 2657 | * *iph_len* contains **sizeof**\ (**struct iphdr**) or 2658 | * **sizeof**\ (**struct ip6hdr**). 2659 | * 2660 | * *th* points to the start of the TCP header, while *th_len* 2661 | * contains the length of the TCP header. 2662 | * 2663 | * Returns 2664 | * On success, lower 32 bits hold the generated SYN cookie in 2665 | * followed by 16 bits which hold the MSS value for that cookie, 2666 | * and the top 16 bits are unused. 2667 | * 2668 | * On failure, the returned value is one of the following: 2669 | * 2670 | * **-EINVAL** SYN cookie cannot be issued due to error 2671 | * 2672 | * **-ENOENT** SYN cookie should not be issued (no SYN flood) 2673 | * 2674 | * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies 2675 | * 2676 | * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 2677 | */ 2678 | static __s64 (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; 2679 | 2680 | /* 2681 | * bpf_skb_output 2682 | * 2683 | * Write raw *data* blob into a special BPF perf event held by 2684 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 2685 | * event must have the following attributes: **PERF_SAMPLE_RAW** 2686 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 2687 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 2688 | * 2689 | * The *flags* are used to indicate the index in *map* for which 2690 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 2691 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 2692 | * to indicate that the index of the current CPU core should be 2693 | * used. 2694 | * 2695 | * The value to write, of *size*, is passed through eBPF stack and 2696 | * pointed by *data*. 2697 | * 2698 | * *ctx* is a pointer to in-kernel struct sk_buff. 2699 | * 2700 | * This helper is similar to **bpf_perf_event_output**\ () but 2701 | * restricted to raw_tracepoint bpf programs. 
2702 | * 2703 | * Returns 2704 | * 0 on success, or a negative error in case of failure. 2705 | */ 2706 | static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; 2707 | 2708 | /* 2709 | * bpf_probe_read_user 2710 | * 2711 | * Safely attempt to read *size* bytes from user space address 2712 | * *unsafe_ptr* and store the data in *dst*. 2713 | * 2714 | * Returns 2715 | * 0 on success, or a negative error in case of failure. 2716 | */ 2717 | static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; 2718 | 2719 | /* 2720 | * bpf_probe_read_kernel 2721 | * 2722 | * Safely attempt to read *size* bytes from kernel space address 2723 | * *unsafe_ptr* and store the data in *dst*. 2724 | * 2725 | * Returns 2726 | * 0 on success, or a negative error in case of failure. 2727 | */ 2728 | static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; 2729 | 2730 | /* 2731 | * bpf_probe_read_user_str 2732 | * 2733 | * Copy a NUL terminated string from an unsafe user address 2734 | * *unsafe_ptr* to *dst*. The *size* should include the 2735 | * terminating NUL byte. In case the string length is smaller than 2736 | * *size*, the target is not padded with further NUL bytes. If the 2737 | * string length is larger than *size*, just *size*-1 bytes are 2738 | * copied and the last byte is set to NUL. 2739 | * 2740 | * On success, the length of the copied string is returned. This 2741 | * makes this helper useful in tracing programs for reading 2742 | * strings, and more importantly to get its length at runtime. See 2743 | * the following snippet: 2744 | * 2745 | * :: 2746 | * 2747 | * SEC("kprobe/sys_open") 2748 | * void bpf_sys_open(struct pt_regs *ctx) 2749 | * { 2750 | * char buf[PATHLEN]; // PATHLEN is defined to 256 2751 | * int res = bpf_probe_read_user_str(buf, sizeof(buf), 2752 | * ctx->di); 2753 | * 2754 | * // Consume buf, for example push it to 2755 | * // userspace via bpf_perf_event_output(); we 2756 | * // can use res (the string length) as event 2757 | * // size, after checking its boundaries. 2758 | * } 2759 | * 2760 | * In comparison, using **bpf_probe_read_user**\ () helper here 2761 | * instead to read the string would require to estimate the length 2762 | * at compile time, and would often result in copying more memory 2763 | * than necessary. 2764 | * 2765 | * Another useful use case is when parsing individual process 2766 | * arguments or individual environment variables navigating 2767 | * *current*\ **->mm->arg_start** and *current*\ 2768 | * **->mm->env_start**: using this helper and the return value, 2769 | * one can quickly iterate at the right offset of the memory area. 2770 | * 2771 | * Returns 2772 | * On success, the strictly positive length of the string, 2773 | * including the trailing NUL character. On error, a negative 2774 | * value. 2775 | */ 2776 | static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; 2777 | 2778 | /* 2779 | * bpf_probe_read_kernel_str 2780 | * 2781 | * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* 2782 | * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. 2783 | * 2784 | * Returns 2785 | * On success, the strictly positive length of the string, including 2786 | * the trailing NUL character. On error, a negative value. 
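 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. It mirrors the **bpf_probe_read_user_str**\ ()
 * snippet above, but for a kernel address; *name* is assumed to be a
 * valid kernel pointer to a NUL terminated string obtained elsewhere
 * in the program:
 *
 * ::
 *
 *	char buf[64];
 *	long res = bpf_probe_read_kernel_str(buf, sizeof(buf), name);
 *
 *	if (res > 0) {
 *		// buf now holds at most sizeof(buf) - 1 characters
 *		// plus the trailing NUL; res includes that NUL.
 *	}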
2787 | */ 2788 | static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; 2789 | 2790 | /* 2791 | * bpf_tcp_send_ack 2792 | * 2793 | * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. 2794 | * *rcv_nxt* is the ack_seq to be sent out. 2795 | * 2796 | * Returns 2797 | * 0 on success, or a negative error in case of failure. 2798 | */ 2799 | static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; 2800 | 2801 | /* 2802 | * bpf_send_signal_thread 2803 | * 2804 | * Send signal *sig* to the thread corresponding to the current task. 2805 | * 2806 | * Returns 2807 | * 0 on success or successfully queued. 2808 | * 2809 | * **-EBUSY** if work queue under nmi is full. 2810 | * 2811 | * **-EINVAL** if *sig* is invalid. 2812 | * 2813 | * **-EPERM** if no permission to send the *sig*. 2814 | * 2815 | * **-EAGAIN** if bpf program can try again. 2816 | */ 2817 | static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; 2818 | 2819 | /* 2820 | * bpf_jiffies64 2821 | * 2822 | * Obtain the 64bit jiffies 2823 | * 2824 | * Returns 2825 | * The 64 bit jiffies 2826 | */ 2827 | static __u64 (*bpf_jiffies64)(void) = (void *) 118; 2828 | 2829 | /* 2830 | * bpf_read_branch_records 2831 | * 2832 | * For an eBPF program attached to a perf event, retrieve the 2833 | * branch records (**struct perf_branch_entry**) associated to *ctx* 2834 | * and store it in the buffer pointed by *buf* up to size 2835 | * *size* bytes. 2836 | * 2837 | * Returns 2838 | * On success, number of bytes written to *buf*. On error, a 2839 | * negative value. 2840 | * 2841 | * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to 2842 | * instead return the number of bytes required to store all the 2843 | * branch entries. If this flag is set, *buf* may be NULL. 2844 | * 2845 | * **-EINVAL** if arguments invalid or **size** not a multiple 2846 | * of **sizeof**\ (**struct perf_branch_entry**\ ). 2847 | * 2848 | * **-ENOENT** if architecture does not support branch records. 2849 | */ 2850 | static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; 2851 | 2852 | /* 2853 | * bpf_get_ns_current_pid_tgid 2854 | * 2855 | * Returns 0 on success, values for *pid* and *tgid* as seen from the current 2856 | * *namespace* will be returned in *nsdata*. 2857 | * 2858 | * Returns 2859 | * 0 on success, or one of the following in case of failure: 2860 | * 2861 | * **-EINVAL** if dev and inum supplied don't match dev_t and inode number 2862 | * with nsfs of current task, or if dev conversion to dev_t lost high bits. 2863 | * 2864 | * **-ENOENT** if pidns does not exists for the current task. 2865 | */ 2866 | static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; 2867 | 2868 | /* 2869 | * bpf_xdp_output 2870 | * 2871 | * Write raw *data* blob into a special BPF perf event held by 2872 | * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf 2873 | * event must have the following attributes: **PERF_SAMPLE_RAW** 2874 | * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and 2875 | * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. 2876 | * 2877 | * The *flags* are used to indicate the index in *map* for which 2878 | * the value must be put, masked with **BPF_F_INDEX_MASK**. 2879 | * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** 2880 | * to indicate that the index of the current CPU core should be 2881 | * used. 
2882 | * 2883 | * The value to write, of *size*, is passed through eBPF stack and 2884 | * pointed by *data*. 2885 | * 2886 | * *ctx* is a pointer to in-kernel struct xdp_buff. 2887 | * 2888 | * This helper is similar to **bpf_perf_eventoutput**\ () but 2889 | * restricted to raw_tracepoint bpf programs. 2890 | * 2891 | * Returns 2892 | * 0 on success, or a negative error in case of failure. 2893 | */ 2894 | static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; 2895 | 2896 | /* 2897 | * bpf_get_netns_cookie 2898 | * 2899 | * Retrieve the cookie (generated by the kernel) of the network 2900 | * namespace the input *ctx* is associated with. The network 2901 | * namespace cookie remains stable for its lifetime and provides 2902 | * a global identifier that can be assumed unique. If *ctx* is 2903 | * NULL, then the helper returns the cookie for the initial 2904 | * network namespace. The cookie itself is very similar to that 2905 | * of **bpf_get_socket_cookie**\ () helper, but for network 2906 | * namespaces instead of sockets. 2907 | * 2908 | * Returns 2909 | * A 8-byte long opaque number. 2910 | */ 2911 | static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; 2912 | 2913 | /* 2914 | * bpf_get_current_ancestor_cgroup_id 2915 | * 2916 | * Return id of cgroup v2 that is ancestor of the cgroup associated 2917 | * with the current task at the *ancestor_level*. The root cgroup 2918 | * is at *ancestor_level* zero and each step down the hierarchy 2919 | * increments the level. If *ancestor_level* == level of cgroup 2920 | * associated with the current task, then return value will be the 2921 | * same as that of **bpf_get_current_cgroup_id**\ (). 2922 | * 2923 | * The helper is useful to implement policies based on cgroups 2924 | * that are upper in hierarchy than immediate cgroup associated 2925 | * with the current task. 2926 | * 2927 | * The format of returned id and helper limitations are same as in 2928 | * **bpf_get_current_cgroup_id**\ (). 2929 | * 2930 | * Returns 2931 | * The id is returned or 0 in case the id could not be retrieved. 2932 | */ 2933 | static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; 2934 | 2935 | /* 2936 | * bpf_sk_assign 2937 | * 2938 | * Helper is overloaded depending on BPF program type. This 2939 | * description applies to **BPF_PROG_TYPE_SCHED_CLS** and 2940 | * **BPF_PROG_TYPE_SCHED_ACT** programs. 2941 | * 2942 | * Assign the *sk* to the *skb*. When combined with appropriate 2943 | * routing configuration to receive the packet towards the socket, 2944 | * will cause *skb* to be delivered to the specified socket. 2945 | * Subsequent redirection of *skb* via **bpf_redirect**\ (), 2946 | * **bpf_clone_redirect**\ () or other methods outside of BPF may 2947 | * interfere with successful delivery to the socket. 2948 | * 2949 | * This operation is only valid from TC ingress path. 2950 | * 2951 | * The *flags* argument must be zero. 2952 | * 2953 | * Returns 2954 | * 0 on success, or a negative error in case of failure: 2955 | * 2956 | * **-EINVAL** if specified *flags* are not supported. 2957 | * 2958 | * **-ENOENT** if the socket is unavailable for assignment. 2959 | * 2960 | * **-ENETUNREACH** if the socket is unreachable (wrong netns). 2961 | * 2962 | * **-EOPNOTSUPP** if the operation is not supported, for example 2963 | * a call from outside of TC ingress. 2964 | * 2965 | * **-ESOCKTNOSUPPORT** if the socket type is not supported 2966 | * (reuseport). 
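 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. It shows the usual lookup/assign/release
 * pattern on the TC ingress path; *tuple* is assumed to have been
 * filled from the packet headers beforehand:
 *
 * ::
 *
 *	struct bpf_sock *sk;
 *	long err = -1;
 *
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk) {
 *		err = bpf_sk_assign(skb, sk, 0);
 *		bpf_sk_release(sk);
 *	}
 *	// err != 0 means steering failed; drop or pass per policy.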
2967 | */ 2968 | static long (*bpf_sk_assign)(void *ctx, struct bpf_sock *sk, __u64 flags) = (void *) 124; 2969 | 2970 | /* 2971 | * bpf_ktime_get_boot_ns 2972 | * 2973 | * Return the time elapsed since system boot, in nanoseconds. 2974 | * Does include the time the system was suspended. 2975 | * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) 2976 | * 2977 | * Returns 2978 | * Current *ktime*. 2979 | */ 2980 | static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; 2981 | 2982 | /* 2983 | * bpf_seq_printf 2984 | * 2985 | * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print 2986 | * out the format string. 2987 | * The *m* represents the seq_file. The *fmt* and *fmt_size* are for 2988 | * the format string itself. The *data* and *data_len* are format string 2989 | * arguments. The *data* are a **u64** array and corresponding format string 2990 | * values are stored in the array. For strings and pointers where pointees 2991 | * are accessed, only the pointer values are stored in the *data* array. 2992 | * The *data_len* is the size of *data* in bytes. 2993 | * 2994 | * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. 2995 | * Reading kernel memory may fail due to either invalid address or 2996 | * valid address but requiring a major memory fault. If reading kernel memory 2997 | * fails, the string for **%s** will be an empty string, and the ip 2998 | * address for **%p{i,I}{4,6}** will be 0. Not returning error to 2999 | * bpf program is consistent with what **bpf_trace_printk**\ () does for now. 3000 | * 3001 | * Returns 3002 | * 0 on success, or a negative error in case of failure: 3003 | * 3004 | * **-EBUSY** if per-CPU memory copy buffer is busy, can try again 3005 | * by returning 1 from bpf program. 3006 | * 3007 | * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. 3008 | * 3009 | * **-E2BIG** if *fmt* contains too many format specifiers. 3010 | * 3011 | * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 3012 | */ 3013 | static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; 3014 | 3015 | /* 3016 | * bpf_seq_write 3017 | * 3018 | * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. 3019 | * The *m* represents the seq_file. The *data* and *len* represent the 3020 | * data to write in bytes. 3021 | * 3022 | * Returns 3023 | * 0 on success, or a negative error in case of failure: 3024 | * 3025 | * **-EOVERFLOW** if an overflow happened: The same object will be tried again. 3026 | */ 3027 | static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; 3028 | 3029 | /* 3030 | * bpf_sk_cgroup_id 3031 | * 3032 | * Return the cgroup v2 id of the socket *sk*. 3033 | * 3034 | * *sk* must be a non-**NULL** pointer to a full socket, e.g. one 3035 | * returned from **bpf_sk_lookup_xxx**\ (), 3036 | * **bpf_sk_fullsock**\ (), etc. The format of returned id is 3037 | * same as in **bpf_skb_cgroup_id**\ (). 3038 | * 3039 | * This helper is available only if the kernel was compiled with 3040 | * the **CONFIG_SOCK_CGROUP_DATA** configuration option. 3041 | * 3042 | * Returns 3043 | * The id is returned or 0 in case the id could not be retrieved. 
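 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. *target_cgroup_id* is assumed to have been
 * supplied by user space (for example via a map), and *sk* to have
 * come from a prior **bpf_sk_lookup_xxx**\ () or
 * **bpf_sk_fullsock**\ () call:
 *
 * ::
 *
 *	__u64 id = bpf_sk_cgroup_id(sk);
 *
 *	if (id && id == target_cgroup_id) {
 *		// the socket belongs to the cgroup of interest
 *	}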
3044 | */ 3045 | static __u64 (*bpf_sk_cgroup_id)(struct bpf_sock *sk) = (void *) 128; 3046 | 3047 | /* 3048 | * bpf_sk_ancestor_cgroup_id 3049 | * 3050 | * Return id of cgroup v2 that is ancestor of cgroup associated 3051 | * with the *sk* at the *ancestor_level*. The root cgroup is at 3052 | * *ancestor_level* zero and each step down the hierarchy 3053 | * increments the level. If *ancestor_level* == level of cgroup 3054 | * associated with *sk*, then return value will be same as that 3055 | * of **bpf_sk_cgroup_id**\ (). 3056 | * 3057 | * The helper is useful to implement policies based on cgroups 3058 | * that are upper in hierarchy than immediate cgroup associated 3059 | * with *sk*. 3060 | * 3061 | * The format of returned id and helper limitations are same as in 3062 | * **bpf_sk_cgroup_id**\ (). 3063 | * 3064 | * Returns 3065 | * The id is returned or 0 in case the id could not be retrieved. 3066 | */ 3067 | static __u64 (*bpf_sk_ancestor_cgroup_id)(struct bpf_sock *sk, int ancestor_level) = (void *) 129; 3068 | 3069 | /* 3070 | * bpf_ringbuf_output 3071 | * 3072 | * Copy *size* bytes from *data* into a ring buffer *ringbuf*. 3073 | * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification 3074 | * of new data availability is sent. 3075 | * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification 3076 | * of new data availability is sent unconditionally. 3077 | * 3078 | * Returns 3079 | * 0 on success, or a negative error in case of failure. 3080 | */ 3081 | static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; 3082 | 3083 | /* 3084 | * bpf_ringbuf_reserve 3085 | * 3086 | * Reserve *size* bytes of payload in a ring buffer *ringbuf*. 3087 | * 3088 | * Returns 3089 | * Valid pointer with *size* bytes of memory available; NULL, 3090 | * otherwise. 3091 | */ 3092 | static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; 3093 | 3094 | /* 3095 | * bpf_ringbuf_submit 3096 | * 3097 | * Submit reserved ring buffer sample, pointed to by *data*. 3098 | * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification 3099 | * of new data availability is sent. 3100 | * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification 3101 | * of new data availability is sent unconditionally. 3102 | * 3103 | * Returns 3104 | * Nothing. Always succeeds. 3105 | */ 3106 | static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; 3107 | 3108 | /* 3109 | * bpf_ringbuf_discard 3110 | * 3111 | * Discard reserved ring buffer sample, pointed to by *data*. 3112 | * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification 3113 | * of new data availability is sent. 3114 | * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification 3115 | * of new data availability is sent unconditionally. 3116 | * 3117 | * Returns 3118 | * Nothing. Always succeeds. 3119 | */ 3120 | static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; 3121 | 3122 | /* 3123 | * bpf_ringbuf_query 3124 | * 3125 | * Query various characteristics of provided ring buffer. What 3126 | * exactly is queries is determined by *flags*: 3127 | * 3128 | * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. 3129 | * * **BPF_RB_RING_SIZE**: The size of ring buffer. 3130 | * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). 3131 | * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). 
3132 | * 3133 | * Data returned is just a momentary snapshot of actual values 3134 | * and could be inaccurate, so this facility should be used to 3135 | * power heuristics and for reporting, not to make 100% correct 3136 | * calculation. 3137 | * 3138 | * Returns 3139 | * Requested value, or 0, if *flags* are not recognized. 3140 | */ 3141 | static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; 3142 | 3143 | /* 3144 | * bpf_csum_level 3145 | * 3146 | * Change the skbs checksum level by one layer up or down, or 3147 | * reset it entirely to none in order to have the stack perform 3148 | * checksum validation. The level is applicable to the following 3149 | * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of 3150 | * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | 3151 | * through **bpf_skb_adjust_room**\ () helper with passing in 3152 | * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call 3153 | * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since 3154 | * the UDP header is removed. Similarly, an encap of the latter 3155 | * into the former could be accompanied by a helper call to 3156 | * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the 3157 | * skb is still intended to be processed in higher layers of the 3158 | * stack instead of just egressing at tc. 3159 | * 3160 | * There are three supported level settings at this time: 3161 | * 3162 | * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs 3163 | * with CHECKSUM_UNNECESSARY. 3164 | * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs 3165 | * with CHECKSUM_UNNECESSARY. 3166 | * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and 3167 | * sets CHECKSUM_NONE to force checksum validation by the stack. 3168 | * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current 3169 | * skb->csum_level. 3170 | * 3171 | * Returns 3172 | * 0 on success, or a negative error in case of failure. In the 3173 | * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level 3174 | * is returned or the error code -EACCES in case the skb is not 3175 | * subject to CHECKSUM_UNNECESSARY. 3176 | */ 3177 | static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; 3178 | 3179 | /* 3180 | * bpf_skc_to_tcp6_sock 3181 | * 3182 | * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. 3183 | * 3184 | * Returns 3185 | * *sk* if casting is valid, or NULL otherwise. 3186 | */ 3187 | static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; 3188 | 3189 | /* 3190 | * bpf_skc_to_tcp_sock 3191 | * 3192 | * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. 3193 | * 3194 | * Returns 3195 | * *sk* if casting is valid, or NULL otherwise. 3196 | */ 3197 | static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; 3198 | 3199 | /* 3200 | * bpf_skc_to_tcp_timewait_sock 3201 | * 3202 | * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. 3203 | * 3204 | * Returns 3205 | * *sk* if casting is valid, or NULL otherwise. 3206 | */ 3207 | static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; 3208 | 3209 | /* 3210 | * bpf_skc_to_tcp_request_sock 3211 | * 3212 | * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. 3213 | * 3214 | * Returns 3215 | * *sk* if casting is valid, or NULL otherwise. 
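 *
 * Editor's note: an illustrative sketch, not part of the original
 * kernel documentation. The **bpf_skc_to_xxx**\ () helpers all follow
 * the same check-for-NULL pattern; dereferencing the returned pointer
 * additionally requires BTF/CO-RE type information, which is outside
 * the scope of this header:
 *
 * ::
 *
 *	struct tcp_request_sock *treq = bpf_skc_to_tcp_request_sock(sk);
 *
 *	if (!treq)
 *		return 0;	// not a request sock, nothing to do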
3216 | */ 3217 | static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; 3218 | 3219 | /* 3220 | * bpf_skc_to_udp6_sock 3221 | * 3222 | * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. 3223 | * 3224 | * Returns 3225 | * *sk* if casting is valid, or NULL otherwise. 3226 | */ 3227 | static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; 3228 | 3229 | /* 3230 | * bpf_get_task_stack 3231 | * 3232 | * Return a user or a kernel stack in bpf program provided buffer. 3233 | * To achieve this, the helper needs *task*, which is a valid 3234 | * pointer to struct task_struct. To store the stacktrace, the 3235 | * bpf program provides *buf* with a nonnegative *size*. 3236 | * 3237 | * The last argument, *flags*, holds the number of stack frames to 3238 | * skip (from 0 to 255), masked with 3239 | * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set 3240 | * the following flags: 3241 | * 3242 | * **BPF_F_USER_STACK** 3243 | * Collect a user space stack instead of a kernel stack. 3244 | * **BPF_F_USER_BUILD_ID** 3245 | * Collect buildid+offset instead of ips for user stack, 3246 | * only valid if **BPF_F_USER_STACK** is also specified. 3247 | * 3248 | * **bpf_get_task_stack**\ () can collect up to 3249 | * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject 3250 | * to sufficient large buffer size. Note that 3251 | * this limit can be controlled with the **sysctl** program, and 3252 | * that it should be manually increased in order to profile long 3253 | * user stacks (such as stacks for Java programs). To do so, use: 3254 | * 3255 | * :: 3256 | * 3257 | * # sysctl kernel.perf_event_max_stack= 3258 | * 3259 | * Returns 3260 | * A non-negative value equal to or less than *size* on success, 3261 | * or a negative error in case of failure. 3262 | */ 3263 | static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; 3264 | 3265 | 3266 | -------------------------------------------------------------------------------- /include/bpf/bpf_helpers.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BPF_HELPERS__ 3 | #define __BPF_HELPERS__ 4 | 5 | /* 6 | * Note that bpf programs need to include either 7 | * vmlinux.h (auto-generated from BTF) or linux/types.h 8 | * in advance since bpf_helper_defs.h uses such types 9 | * as __u64. 10 | */ 11 | #include "bpf_helper_defs.h" 12 | 13 | #define __uint(name, val) int (*name)[val] 14 | #define __type(name, val) typeof(val) *name 15 | #define __array(name, val) typeof(val) *name[] 16 | 17 | /* Helper macro to print out debug messages */ 18 | #define bpf_printk(fmt, ...) \ 19 | ({ \ 20 | char ____fmt[] = fmt; \ 21 | bpf_trace_printk(____fmt, sizeof(____fmt), \ 22 | ##__VA_ARGS__); \ 23 | }) 24 | 25 | /* 26 | * Helper macro to place programs, maps, license in 27 | * different sections in elf_bpf file. 
Section names 28 | * are interpreted by elf_bpf loader 29 | */ 30 | #define SEC(NAME) __attribute__((section(NAME), used)) 31 | 32 | #ifndef __always_inline 33 | #define __always_inline __attribute__((always_inline)) 34 | #endif 35 | #ifndef __weak 36 | #define __weak __attribute__((weak)) 37 | #endif 38 | 39 | /* 40 | * Helper macro to manipulate data structures 41 | */ 42 | #ifndef offsetof 43 | #define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) 44 | #endif 45 | #ifndef container_of 46 | #define container_of(ptr, type, member) \ 47 | ({ \ 48 | void *__mptr = (void *)(ptr); \ 49 | ((type *)(__mptr - offsetof(type, member))); \ 50 | }) 51 | #endif 52 | 53 | /* 54 | * Helper structure used by eBPF C program 55 | * to describe BPF map attributes to libbpf loader 56 | */ 57 | struct bpf_map_def { 58 | unsigned int type; 59 | unsigned int key_size; 60 | unsigned int value_size; 61 | unsigned int max_entries; 62 | unsigned int map_flags; 63 | }; 64 | 65 | enum libbpf_pin_type { 66 | LIBBPF_PIN_NONE, 67 | /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ 68 | LIBBPF_PIN_BY_NAME, 69 | }; 70 | 71 | enum libbpf_tristate { 72 | TRI_NO = 0, 73 | TRI_YES = 1, 74 | TRI_MODULE = 2, 75 | }; 76 | 77 | #define __kconfig __attribute__((section(".kconfig"))) 78 | #define __ksym __attribute__((section(".ksyms"))) 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /include/fasthash.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com) 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the "Software"), to deal in the Software without 8 | restriction, including without limitation the rights to use, copy, 9 | modify, merge, publish, distribute, sublicense, and/or sell copies 10 | of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #pragma once 27 | 28 | #include 29 | #include 30 | 31 | // clang-format off 32 | 33 | // Compression function for Merkle-Damgard construction. 34 | // This function is generated using the framework provided. 
35 | static __attribute__((always_inline)) inline __u64 fasthash_mix(__u64 h) { 36 | h ^= h >> 23; 37 | h *= 0x2127599bf4325c37ULL; 38 | h ^= h >> 47; 39 | return h; 40 | } 41 | 42 | static __attribute__((always_inline)) inline __u64 fasthash64(const void *buf, __u64 len, __u64 seed) 43 | { 44 | const __u64 m = 0x880355f21e6d1965ULL; 45 | const __u64 *pos = (const __u64 *)buf; 46 | const __u64 *end = pos + (len / 8); 47 | __u64 h = seed ^ (len * m); 48 | __u64 v; 49 | 50 | #pragma clang loop unroll(full) 51 | while (pos != end) { 52 | v = *pos++; 53 | h ^= fasthash_mix(v); 54 | h *= m; 55 | } 56 | 57 | if (len & 7) { 58 | v = 0; 59 | __builtin_memcpy(&v, pos, len & 7); 60 | h ^= fasthash_mix(v); 61 | h *= m; 62 | } 63 | 64 | return fasthash_mix(h); 65 | } 66 | 67 | static __attribute__((always_inline)) inline __u32 fasthash32(const void *buf, __u64 len, __u32 seed) 68 | { 69 | // the following trick converts the 64-bit hashcode to Fermat 70 | // residue, which shall retain information from both the higher 71 | // and lower parts of hashcode. 72 | __u64 h = fasthash64(buf, len, seed); 73 | return h - (h >> 32); 74 | } -------------------------------------------------------------------------------- /include/in.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | // musl license: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT 6 | 7 | // from musl: include/netinet/in.h 8 | struct in_addr { 9 | __u32 s_addr; 10 | }; 11 | 12 | struct in6_addr { 13 | union { 14 | __u8 s6_addr[16]; 15 | __u16 s6_addr16[8]; 16 | __u32 s6_addr32[4]; 17 | }; 18 | }; -------------------------------------------------------------------------------- /include/ip.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | // musl license: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT 7 | 8 | // from musl: include/netinet/ip.h 9 | struct iphdr { 10 | __u8 version_ihl; 11 | __u8 tos; 12 | __u16 tot_len; 13 | __u16 id; 14 | __u16 frag_off; 15 | __u8 ttl; 16 | __u8 protocol; 17 | __u16 check; 18 | __u32 saddr; 19 | __u32 daddr; 20 | }; 21 | 22 | // from musl: include/netinet/ip6.h 23 | struct ip6_hdr { 24 | union { 25 | struct ip6_hdrctl { 26 | __u32 ip6_un1_flow; 27 | __u16 ip6_un1_plen; 28 | __u8 ip6_un1_nxt; 29 | __u8 ip6_un1_hlim; 30 | } ip6_un1; 31 | __u8 ip6_un2_vfc; 32 | } ip6_ctlun; 33 | struct in6_addr ip6_src; 34 | struct in6_addr ip6_dst; 35 | }; -------------------------------------------------------------------------------- /include/linux/bpf_common.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 2 | #ifndef __LINUX_BPF_COMMON_H__ 3 | #define __LINUX_BPF_COMMON_H__ 4 | 5 | /* Instruction classes */ 6 | #define BPF_CLASS(code) ((code) & 0x07) 7 | #define BPF_LD 0x00 8 | #define BPF_LDX 0x01 9 | #define BPF_ST 0x02 10 | #define BPF_STX 0x03 11 | #define BPF_ALU 0x04 12 | #define BPF_JMP 0x05 13 | #define BPF_RET 0x06 14 | #define BPF_MISC 0x07 15 | 16 | /* ld/ldx fields */ 17 | #define BPF_SIZE(code) ((code) & 0x18) 18 | #define BPF_W 0x00 /* 32-bit */ 19 | #define BPF_H 0x08 /* 16-bit */ 20 | #define BPF_B 0x10 /* 8-bit */ 21 | /* eBPF BPF_DW 0x18 64-bit */ 22 | #define BPF_MODE(code) ((code) & 0xe0) 23 | #define BPF_IMM 0x00 24 | #define BPF_ABS 0x20 25 | #define BPF_IND 0x40 26 | #define BPF_MEM 0x60 27 | #define BPF_LEN 0x80 28 | #define BPF_MSH 0xa0 29 
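/*
 * Editor's note: an illustrative example, not part of the original
 * header. A classic BPF "load half-word at absolute offset" opcode is
 * composed from the class, size and mode bits defined above:
 *
 *	BPF_LD | BPF_H | BPF_ABS  ==  0x00 | 0x08 | 0x20  ==  0x28
 */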
| 30 | /* alu/jmp fields */ 31 | #define BPF_OP(code) ((code) & 0xf0) 32 | #define BPF_ADD 0x00 33 | #define BPF_SUB 0x10 34 | #define BPF_MUL 0x20 35 | #define BPF_DIV 0x30 36 | #define BPF_OR 0x40 37 | #define BPF_AND 0x50 38 | #define BPF_LSH 0x60 39 | #define BPF_RSH 0x70 40 | #define BPF_NEG 0x80 41 | #define BPF_MOD 0x90 42 | #define BPF_XOR 0xa0 43 | 44 | #define BPF_JA 0x00 45 | #define BPF_JEQ 0x10 46 | #define BPF_JGT 0x20 47 | #define BPF_JGE 0x30 48 | #define BPF_JSET 0x40 49 | #define BPF_SRC(code) ((code) & 0x08) 50 | #define BPF_K 0x00 51 | #define BPF_X 0x08 52 | 53 | #ifndef BPF_MAXINSNS 54 | #define BPF_MAXINSNS 4096 55 | #endif 56 | 57 | #endif /* __LINUX_BPF_COMMON_H__ */ 58 | -------------------------------------------------------------------------------- /include/linux/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef char __s8; 4 | typedef short __s16; 5 | typedef int __s32; 6 | typedef long long __s64; 7 | _Static_assert(sizeof(__s32) == 4, "__s32 must be 4 bytes"); 8 | _Static_assert(sizeof(__s64) == 8, "__s64 must be 8 bytes"); 9 | 10 | typedef unsigned char __u8; 11 | typedef unsigned short __u16; 12 | typedef unsigned int __u32; 13 | typedef unsigned long long __u64; 14 | _Static_assert(sizeof(__u32) == 4, "__u32 must be 4 bytes"); 15 | _Static_assert(sizeof(__u64) == 8, "__u64 must be 8 bytes"); 16 | 17 | typedef __u16 __be16; 18 | typedef __u16 __le16; 19 | typedef __u32 __be32; 20 | typedef __u32 __le32; 21 | typedef __u64 __be64; 22 | typedef __u64 __le64; 23 | 24 | typedef __u32 __wsum; 25 | 26 | typedef __u64 __attribute__((aligned(8))) __aligned_u64; 27 | -------------------------------------------------------------------------------- /include/lookup3.h: -------------------------------------------------------------------------------- 1 | /* 2 | ------------------------------------------------------------------------------- 3 | This file is derived from lookup3 by Bob Jenkins. The main change is that 4 | hashlittle assumes an aligned pointer. This is because BPF doesn't allow 5 | inspecting pointer values. 6 | 7 | lookup3.c, by Bob Jenkins, May 2006, Public Domain. 8 | 9 | These are functions for producing 32-bit hashes for hash table lookup. 10 | hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() 11 | are externally useful functions. Routines to test the hash are included 12 | if SELF_TEST is defined. You can use this free for any purpose. It's in 13 | the public domain. It has no warranty. 14 | 15 | You probably want to use hashlittle(). hashlittle() and hashbig() 16 | hash byte arrays. hashlittle() is is faster than hashbig() on 17 | little-endian machines. Intel and AMD are little-endian machines. 18 | On second thought, you probably want hashlittle2(), which is identical to 19 | hashlittle() except it returns two 32-bit hashes for the price of one. 20 | You could implement hashbig2() if you wanted but I haven't bothered here. 21 | 22 | If you want to find a hash of, say, exactly 7 integers, do 23 | a = i1; b = i2; c = i3; 24 | mix(a,b,c); 25 | a += i4; b += i5; c += i6; 26 | mix(a,b,c); 27 | a += i7; 28 | final(a,b,c); 29 | then use c as the hash value. If you have a variable length array of 30 | 4-byte integers to hash, use hashword(). If you have a byte array (like 31 | a character string), use hashlittle(). If you have several byte arrays, or 32 | a mix of things, see the comments above hashlittle(). 33 | 34 | Why is this so big? 
I read 12 bytes at a time into 3 4-byte integers, 35 | then mix those integers. This is fast (you can do a lot more thorough 36 | mixing with 12*3 instructions on 3 integers than you can with 3 instructions 37 | on 1 byte), but shoehorning those bytes into integers efficiently is messy. 38 | ------------------------------------------------------------------------------- 39 | */ 40 | 41 | #pragma once 42 | 43 | #include 44 | 45 | // clang-format off 46 | 47 | #define hashsize(n) ((__u32)1 << (n)) 48 | #define hashmask(n) (hashsize(n) - 1) 49 | #define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) 50 | 51 | /* 52 | ------------------------------------------------------------------------------- 53 | mix -- mix 3 32-bit values reversibly. 54 | 55 | This is reversible, so any information in (a,b,c) before mix() is 56 | still in (a,b,c) after mix(). 57 | 58 | If four pairs of (a,b,c) inputs are run through mix(), or through 59 | mix() in reverse, there are at least 32 bits of the output that 60 | are sometimes the same for one pair and different for another pair. 61 | This was tested for: 62 | * pairs that differed by one bit, by two bits, in any combination 63 | of top bits of (a,b,c), or in any combination of bottom bits of 64 | (a,b,c). 65 | * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed 66 | the output delta to a Gray code (a^(a>>1)) so a string of 1's (as 67 | is commonly produced by subtraction) look like a single 1-bit 68 | difference. 69 | * the base values were pseudorandom, all zero but one bit set, or 70 | all zero plus a counter that starts at zero. 71 | 72 | Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that 73 | satisfy this are 74 | 4 6 8 16 19 4 75 | 9 15 3 18 27 15 76 | 14 9 3 7 17 3 77 | Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing 78 | for "differ" defined as + with a one-bit base and a two-bit delta. I 79 | used http://burtleburtle.net/bob/hash/avalanche.html to choose 80 | the operations, constants, and arrangements of the variables. 81 | 82 | This does not achieve avalanche. There are input bits of (a,b,c) 83 | that fail to affect some output bits of (a,b,c), especially of a. The 84 | most thoroughly mixed value is c, but it doesn't really even achieve 85 | avalanche in c. 86 | 87 | This allows some parallelism. Read-after-writes are good at doubling 88 | the number of bits affected, so the goal of mixing pulls in the opposite 89 | direction as the goal of parallelism. I did what I could. Rotates 90 | seem to cost as much as shifts on every machine I could lay my hands 91 | on, and rotates are much kinder to the top and bottom bits, so I used 92 | rotates. 93 | ------------------------------------------------------------------------------- 94 | */ 95 | #define mix(a, b, c) \ 96 | { \ 97 | a -= c; \ 98 | a ^= rot(c, 4); \ 99 | c += b; \ 100 | b -= a; \ 101 | b ^= rot(a, 6); \ 102 | a += c; \ 103 | c -= b; \ 104 | c ^= rot(b, 8); \ 105 | b += a; \ 106 | a -= c; \ 107 | a ^= rot(c, 16); \ 108 | c += b; \ 109 | b -= a; \ 110 | b ^= rot(a, 19); \ 111 | a += c; \ 112 | c -= b; \ 113 | c ^= rot(b, 4); \ 114 | b += a; \ 115 | } 116 | 117 | /* 118 | ------------------------------------------------------------------------------- 119 | final -- final mixing of 3 32-bit values (a,b,c) into c 120 | 121 | Pairs of (a,b,c) values differing in only a few bits will usually 122 | produce values of c that look totally different. 
This was tested for 123 | * pairs that differed by one bit, by two bits, in any combination 124 | of top bits of (a,b,c), or in any combination of bottom bits of 125 | (a,b,c). 126 | * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed 127 | the output delta to a Gray code (a^(a>>1)) so a string of 1's (as 128 | is commonly produced by subtraction) look like a single 1-bit 129 | difference. 130 | * the base values were pseudorandom, all zero but one bit set, or 131 | all zero plus a counter that starts at zero. 132 | 133 | These constants passed: 134 | 14 11 25 16 4 14 24 135 | 12 14 25 16 4 14 24 136 | and these came close: 137 | 4 8 15 26 3 22 24 138 | 10 8 15 26 3 22 24 139 | 11 8 15 26 3 22 24 140 | ------------------------------------------------------------------------------- 141 | */ 142 | #define final(a, b, c) \ 143 | { \ 144 | c ^= b; \ 145 | c -= rot(b, 14); \ 146 | a ^= c; \ 147 | a -= rot(c, 11); \ 148 | b ^= a; \ 149 | b -= rot(a, 25); \ 150 | c ^= b; \ 151 | c -= rot(b, 16); \ 152 | a ^= c; \ 153 | a -= rot(c, 4); \ 154 | b ^= a; \ 155 | b -= rot(a, 14); \ 156 | c ^= b; \ 157 | c -= rot(b, 24); \ 158 | } 159 | 160 | static __attribute__((always_inline)) __u32 hashlittle(const void *key, __u64 length, __u32 initval) 161 | { 162 | __u32 a, b, c; /* internal state */ 163 | const __u32 *k = (const __u32 *)key; /* read 32-bit chunks */ 164 | const __u32 *end = k + (length / 12) * 3; 165 | const __u8 *k8; 166 | 167 | /* Set up the internal state */ 168 | a = b = c = 0xdeadbeef + ((__u32)length) + initval; 169 | 170 | /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ 171 | #pragma clang loop unroll(full) 172 | while (k != end) { 173 | a += k[0]; 174 | b += k[1]; 175 | c += k[2]; 176 | mix(a, b, c); 177 | k += 3; 178 | } 179 | 180 | /*----------------------------- handle the last (probably partial) block */ 181 | k8 = (const __u8 *)k; 182 | switch (length % 12) { 183 | case 12: 184 | c += k[2]; 185 | b += k[1]; 186 | a += k[0]; 187 | break; 188 | case 11: 189 | c += ((__u32)k8[10]) << 16; /* fall through */ 190 | case 10: 191 | c += ((__u32)k8[9]) << 8; /* fall through */ 192 | case 9: 193 | c += k8[8]; /* fall through */ 194 | case 8: 195 | b += k[1]; 196 | a += k[0]; 197 | break; 198 | case 7: 199 | b += ((__u32)k8[6]) << 16; /* fall through */ 200 | case 6: 201 | b += ((__u32)k8[5]) << 8; /* fall through */ 202 | case 5: 203 | b += k8[4]; /* fall through */ 204 | case 4: 205 | a += k[0]; 206 | break; 207 | case 3: 208 | a += ((__u32)k8[2]) << 16; /* fall through */ 209 | case 2: 210 | a += ((__u32)k8[1]) << 8; /* fall through */ 211 | case 1: 212 | a += k8[0]; 213 | break; 214 | case 0: 215 | return c; 216 | } 217 | 218 | final(a, b, c); 219 | return c; 220 | } 221 | 222 | #undef hashsize 223 | #undef hashmask 224 | #undef rot 225 | #undef mix 226 | #undef final 227 | -------------------------------------------------------------------------------- /include/mindef.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef NULL 4 | #define NULL ((void *)0) 5 | #endif 6 | 7 | #ifndef offsetof 8 | #define offsetof(type, member) __builtin_offsetof(type, member) 9 | #endif 10 | 11 | #ifndef offsetofend 12 | #define offsetofend(type, member) (offsetof(type, member) + sizeof((((type *)0)->member))) 13 | #endif 14 | -------------------------------------------------------------------------------- /include/stdbool.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define bool _Bool 4 | 5 | #define true 1 6 | #define false 0 7 | -------------------------------------------------------------------------------- /rakelimit.go: -------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "syscall" 7 | 8 | "github.com/cilium/ebpf" 9 | "github.com/cilium/ebpf/asm" 10 | "golang.org/x/sys/unix" 11 | ) 12 | 13 | //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang-12 rake ./src/rakelimit.c -- -I./include -nostdinc -Os 14 | 15 | // Rakelimit holds an instance of a ratelimiter that can be applied on a socket 16 | type Rakelimit struct { 17 | domain int 18 | program *ebpf.Program 19 | bpfObjects *rakeObjects 20 | } 21 | 22 | // New creates a new Rakelimit instance based on the specified ppsLimit 23 | func New(conn syscall.Conn, ppsLimit uint32) (*Rakelimit, error) { 24 | // set ratelimit 25 | spec, err := loadRake() 26 | if err != nil { 27 | return nil, fmt.Errorf("get elf spec: %v", err) 28 | } 29 | 30 | if err := rewriteConstant(spec, "LIMIT", uint64(ppsLimit)); err != nil { 31 | return nil, err 32 | } 33 | 34 | var objs rakeObjects 35 | if err := spec.LoadAndAssign(&objs, nil); err != nil { 36 | return nil, fmt.Errorf("load BPF: %v", err) 37 | } 38 | 39 | raw, err := conn.SyscallConn() 40 | if err != nil { 41 | return nil, fmt.Errorf("raw conn: %s", err) 42 | } 43 | 44 | var opErr error 45 | var domain int 46 | var prog *ebpf.Program 47 | if err := raw.Control(func(s uintptr) { 48 | domain, opErr = unix.GetsockoptInt(int(s), unix.SOL_SOCKET, unix.SO_DOMAIN) 49 | if opErr != nil { 50 | opErr = fmt.Errorf("can't retrieve domain: %s", opErr) 51 | return 52 | } 53 | 54 | switch domain { 55 | case unix.AF_INET: 56 | prog = objs.FilterIpv4 57 | case unix.AF_INET6: 58 | prog = objs.FilterIpv6 59 | default: 60 | opErr = fmt.Errorf("unsupported socket domain: %d", domain) 61 | return 62 | } 63 | 64 | opErr = unix.SetsockoptInt(int(s), unix.SOL_SOCKET, unix.SO_ATTACH_BPF, prog.FD()) 65 | if errors.Is(opErr, unix.ENOMEM) { 66 | opErr = fmt.Errorf("attach filter: net.core.optmem_max might be too low: %s", opErr) 67 | return 68 | } 69 | if opErr != nil { 70 | opErr = fmt.Errorf("attach filter: %s", opErr) 71 | } 72 | }); err != nil { 73 | return nil, fmt.Errorf("can't access fd: %s", err) 74 | } 75 | if opErr != nil { 76 | return nil, opErr 77 | } 78 | 79 | return &Rakelimit{domain, prog, &objs}, nil 80 | } 81 | 82 | // Close cleans up resources occupied and should be called when finished using the structure 83 | func (rl *Rakelimit) Close() error { 84 | return rl.bpfObjects.Close() 85 | } 86 | 87 | func rewriteConstant(spec *ebpf.CollectionSpec, symbol string, value uint64) error { 88 | rewritten := false 89 | for name, prog := range spec.Programs { 90 | for i := range prog.Instructions { 91 | ins := &prog.Instructions[i] 92 | if ins.Reference != symbol { 93 | continue 94 | } 95 | 96 | if !ins.IsConstantLoad(asm.DWord) { 97 | return fmt.Errorf("program %s: instruction %d: not a dword-sized constant load: %s", name, i, ins) 98 | } 99 | 100 | ins.Constant = int64(value) 101 | rewritten = true 102 | } 103 | } 104 | 105 | if !rewritten { 106 | return fmt.Errorf("symbol %s is not referenced", symbol) 107 | } 108 | 109 | return nil 110 | } 111 | -------------------------------------------------------------------------------- /rakelimit_test.go: 
-------------------------------------------------------------------------------- 1 | package rakelimit 2 | 3 | import ( 4 | "math" 5 | "net" 6 | "testing" 7 | "time" 8 | 9 | "github.com/cilium/ebpf" 10 | "github.com/google/gopacket" 11 | "github.com/google/gopacket/layers" 12 | "golang.org/x/sys/unix" 13 | ) 14 | 15 | func TestLoad(t *testing.T) { 16 | spec, err := loadRake() 17 | if err != nil { 18 | t.Fatal(err) 19 | } 20 | 21 | if err := rewriteConstant(spec, "LIMIT", uint64(100)); err != nil { 22 | t.Fatal(err) 23 | } 24 | 25 | t.Run("IPv4", func(t *testing.T) { 26 | var objs struct { 27 | Prog *ebpf.Program `ebpf:"filter_ipv4"` 28 | } 29 | if err := spec.LoadAndAssign(&objs, nil); err != nil { 30 | t.Error(err) 31 | } 32 | }) 33 | t.Run("IPv6", func(t *testing.T) { 34 | var objs struct { 35 | Prog *ebpf.Program `ebpf:"filter_ipv6"` 36 | } 37 | if err := spec.LoadAndAssign(&objs, nil); err != nil { 38 | t.Error(err) 39 | } 40 | }) 41 | } 42 | 43 | const floatBits = 32 44 | 45 | type FixedPointTuple struct { 46 | k, v uint64 47 | } 48 | 49 | /* TestBPFFloatToFixedPoint tests the convesion of integers/floats to fixed-point on the 50 | userspace & the bpf side to ensure both convert it in the same way */ 51 | func TestBPFFloatToFixedPoint(t *testing.T) { 52 | var objs rakeObjects 53 | if err := loadRakeObjects(&objs, nil); err != nil { 54 | t.Fatal("Can't load program", err) 55 | 56 | } 57 | defer objs.Close() 58 | 59 | prog := objs.TestFpCmp 60 | lookupTable := objs.TestSingleResult 61 | payload := make([]byte, 14) 62 | 63 | // check 27 64 | if err := lookupTable.Put(uint32(0), floatToFixed(27.0)); err != nil { 65 | t.Fatal(err) 66 | } 67 | 68 | res, _, err := prog.Test(payload) 69 | if err != nil { 70 | t.Fatal(err) 71 | } 72 | if res != 0 { 73 | t.Fatalf("Error on line %d", res) 74 | } 75 | 76 | var fp uint64 77 | if err := lookupTable.Lookup(uint32(0), &fp); err != nil { 78 | t.Fatal(err) 79 | } 80 | 81 | // check if bpf to go works 82 | fl := fixedToFloat(fp) 83 | if fl != 19 { 84 | t.Fatal("Expected 19, got", fl) 85 | } 86 | } 87 | 88 | func TestBPFFEwma(t *testing.T) { 89 | const ( 90 | rateKey uint32 = iota 91 | oldTSKey 92 | newTSKey 93 | ) 94 | 95 | var objs rakeObjects 96 | if err := loadRakeObjects(&objs, nil); err != nil { 97 | t.Fatal("Can't load program", err) 98 | 99 | } 100 | defer objs.Close() 101 | 102 | prog := objs.TestEwma 103 | sr := objs.TestSingleResult 104 | 105 | sr.Put(rateKey, uint64(50)) 106 | sr.Put(oldTSKey, uint64(346534651)) 107 | sr.Put(newTSKey, uint64(415841581)) 108 | 109 | ret, _, err := prog.Test(make([]byte, 14)) 110 | if err != nil { 111 | t.Fatal(err) 112 | } 113 | if ret == 0 { 114 | t.Fatal("Unexpected return from BPF program") 115 | } 116 | 117 | var result uint64 118 | if err := sr.Lookup(rateKey, &result); err != nil { 119 | t.Fatal(err) 120 | } 121 | 122 | if result != 31 { 123 | t.Error("Expected 31, got", result) 124 | } 125 | } 126 | 127 | func BenchmarkRakelimit(b *testing.B) { 128 | b.Run("IPv4", func(b *testing.B) { 129 | rake := mustNew(b, "127.0.0.1:0", math.MaxUint32) 130 | 131 | packet := mustSerializeLayers(b, 132 | &layers.Ethernet{ 133 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 134 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 135 | EthernetType: layers.EthernetTypeIPv4, 136 | }, 137 | &layers.IPv4{ 138 | Version: 4, 139 | SrcIP: net.IPv4(192, 0, 2, 0), 140 | DstIP: net.IPv4(192, 0, 2, 123), 141 | Protocol: layers.IPProtocolUDP, 142 | }, 143 | &layers.UDP{ 144 | SrcPort: layers.UDPPort(12345), 145 | DstPort: layers.UDPPort(443), 146 | }, 147 | 
gopacket.Payload([]byte{1, 2, 3, 4}), 148 | ) 149 | b.ResetTimer() 150 | 151 | lastRet, duration, err := rake.program.Benchmark(packet, b.N, b.ResetTimer) 152 | if err != nil { 153 | b.Fatal(err) 154 | } 155 | 156 | if lastRet == 0 { 157 | b.Error("Packet was dropped") 158 | } 159 | 160 | b.ReportMetric(float64(duration/time.Nanosecond), "ns/op") 161 | }) 162 | 163 | b.Run("IPv6", func(b *testing.B) { 164 | rake := mustNew(b, "[::1]:0", math.MaxUint32) 165 | 166 | packet := mustSerializeLayers(b, 167 | &layers.Ethernet{ 168 | SrcMAC: []byte{1, 2, 3, 4, 5, 6}, 169 | DstMAC: []byte{6, 5, 4, 3, 2, 1}, 170 | EthernetType: layers.EthernetTypeIPv6, 171 | }, 172 | &layers.IPv6{ 173 | Version: 6, 174 | SrcIP: net.ParseIP("fd::1"), 175 | DstIP: net.ParseIP("fc::1337"), 176 | NextHeader: layers.IPProtocolUDP, 177 | }, 178 | &layers.UDP{ 179 | SrcPort: layers.UDPPort(12345), 180 | DstPort: layers.UDPPort(443), 181 | }, 182 | gopacket.Payload([]byte{1, 2, 3, 4}), 183 | ) 184 | b.ResetTimer() 185 | 186 | lastRet, duration, err := rake.program.Benchmark(packet, b.N, b.ResetTimer) 187 | if err != nil { 188 | b.Fatal(err) 189 | } 190 | 191 | if lastRet == 0 { 192 | b.Error("Packet was dropped") 193 | } 194 | 195 | b.ReportMetric(float64(duration/time.Nanosecond), "ns/op") 196 | }) 197 | } 198 | 199 | func mustSerializeLayers(tb testing.TB, layers ...gopacket.SerializableLayer) []byte { 200 | tb.Helper() 201 | 202 | buf := gopacket.NewSerializeBuffer() 203 | opts := gopacket.SerializeOptions{ 204 | FixLengths: true, 205 | } 206 | err := gopacket.SerializeLayers(buf, opts, layers...) 207 | if err != nil { 208 | tb.Fatal("Can't serialize layers:", err) 209 | } 210 | 211 | return buf.Bytes() 212 | } 213 | 214 | type testRakelimit struct { 215 | *Rakelimit 216 | testProgram *ebpf.Program 217 | args *ebpf.Map 218 | conn *net.UDPConn 219 | } 220 | 221 | const ( 222 | timeArgKey uint32 = iota 223 | randArgKey 224 | rateExceededOnLevelKey 225 | ) 226 | 227 | func mustNew(tb testing.TB, addr string, limit uint32) *testRakelimit { 228 | tb.Helper() 229 | 230 | conn, err := net.ListenPacket("udp", addr) 231 | if err != nil { 232 | tb.Fatal("Can't listen:", err) 233 | } 234 | tb.Cleanup(func() { conn.Close() }) 235 | 236 | udp := conn.(*net.UDPConn) 237 | rake, err := New(udp, limit) 238 | if err != nil { 239 | tb.Fatal("Can't create limiter:", err) 240 | } 241 | tb.Cleanup(func() { rake.Close() }) 242 | 243 | prog := rake.bpfObjects.TestIpv4 244 | if rake.domain == unix.AF_INET6 { 245 | prog = rake.bpfObjects.TestIpv6 246 | } 247 | 248 | args := rake.bpfObjects.TestSingleResult 249 | if err := args.Put(randArgKey, uint64(math.MaxUint32+1)); err != nil { 250 | tb.Fatal("Can't update rand:", err) 251 | } 252 | 253 | return &testRakelimit{rake, prog, args, udp} 254 | } 255 | 256 | func (trl *testRakelimit) updateTime(tb testing.TB, now uint64) { 257 | tb.Helper() 258 | 259 | if now < math.MaxUint64 { 260 | // Make sure we never use a zero time, since the ewma code 261 | // assumes that zero means uninitialised. 
262 | now++ 263 | } 264 | 265 | if err := trl.args.Put(timeArgKey, now); err != nil { 266 | tb.Error("Can't update time:", err) 267 | } 268 | } 269 | 270 | func (trl *testRakelimit) updateRand(tb testing.TB, value uint32) { 271 | tb.Helper() 272 | 273 | if err := trl.args.Put(randArgKey, uint64(value)); err != nil { 274 | tb.Error("Can't update rand:", err) 275 | } 276 | } 277 | 278 | func (trl *testRakelimit) rateExceededOnLevel(tb testing.TB) uint32 { 279 | tb.Helper() 280 | 281 | var level uint64 282 | if err := trl.args.Lookup(rateExceededOnLevelKey, &level); err != nil { 283 | tb.Fatal("Can't lookup drop level:", err) 284 | } 285 | 286 | return uint32(level) 287 | } 288 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #define FORCE_INLINE inline __attribute__((__always_inline__)) 2 | 3 | /* from linux/socket.h */ 4 | #define AF_INET 2 /* Internet IP Protocol */ 5 | #define AF_INET6 10 /* IP version 6 */ 6 | /***********************/ 7 | 8 | /* from linux/filter.h */ 9 | #define BPF_NET_OFF (-0x100000) 10 | #define BPF_LL_OFF (-0x200000) 11 | /***********************/ 12 | 13 | /* Accept - allow any number of bytes */ 14 | #define SKB_PASS -1 15 | /* Drop, cut packet to zero bytes */ 16 | #define SKB_REJECT 0 17 | 18 | #define ETH_P_IP 0x0800 19 | #define ETH_P_IPV6 0x86DD 20 | 21 | unsigned long long load_byte(void *skb, unsigned long long off) asm("llvm.bpf.load.byte"); 22 | unsigned long long load_half(void *skb, unsigned long long off) asm("llvm.bpf.load.half"); 23 | unsigned long long load_word(void *skb, unsigned long long off) asm("llvm.bpf.load.word"); 24 | 25 | #define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) 26 | -------------------------------------------------------------------------------- /src/countmin.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | #include "common.h" 7 | #include "ewma.h" 8 | #include "fasthash.h" 9 | #include "fixed-point.h" 10 | #include "lookup3.h" 11 | 12 | // countmin sketch paper: http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf 13 | // 14 | // A cm sketch can be thought of as a two dimensional array width d rows and 15 | // w columns. Each row uses a distinct hash function to index into its columns. 16 | // 17 | // The paper shows the following error bounds for the estimation, provided we 18 | // choose d = ceil(ln(1/gamma)) and w = ceil(e/E) (see page 7). 19 | // 20 | // a <= a' 21 | // a' <= E * ||a|| with probability at least (1 - gamma) 22 | // a : the true answer 23 | // a' : the estimate made by the cm sketch 24 | // E : a chosen error bound 25 | // gamma: desired probability of the upper bound 26 | // ||a||: the sum of all previous observations (I think) 27 | // 28 | // We always choose w to be a power of two to be able to cheaply index into the cm 29 | // sketch based on a hash value. For d = 2 and w = 512 we get gamma ~0.14 and E ~0.005. 30 | // 31 | // a <= a' <= ~0.005 * ||a|| (with probability ~0.86) 32 | // 33 | // Using 3 instead of 2 hash functions would increase the probability to 0.96. For 34 | // that we need another function however. 
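// Worked example (illustrative, derived by inverting the formulas above rather
// than taken from the paper): picking d rows fixes gamma = exp(-d), and picking
// w columns fixes E = e / w. With d = 2 and w = 512 that gives
//   gamma = exp(-2)  ~= 0.135   (so 1 - gamma ~= 0.86)
//   E     = e / 512  ~= 0.0053
// which is where the ~0.14 and ~0.005 quoted above come from. Choosing w as a
// power of two also means a row can be indexed with a mask instead of a modulo,
// i.e. column = hash & (COLUMNS - 1), as done in cm_add_and_query() below.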
35 | 36 | #define HASHFN_N 2 37 | #define COLUMNS 512 38 | 39 | _Static_assert((COLUMNS & (COLUMNS - 1)) == 0, "COLUMNS must be a power of two"); 40 | 41 | struct cm_value { 42 | __u32 value; 43 | __u64 ts; 44 | }; 45 | 46 | struct cm_hash { 47 | __u32 values[HASHFN_N]; 48 | }; 49 | 50 | struct countmin { 51 | struct cm_value values[HASHFN_N][COLUMNS]; 52 | }; 53 | 54 | // add element and determine count 55 | static __u32 FORCE_INLINE cm_add_and_query(struct countmin *cm, __u64 now, const struct cm_hash *h) 56 | { 57 | __u32 min = -1; 58 | #pragma clang loop unroll(full) 59 | for (int i = 0; i < ARRAY_SIZE(cm->values); i++) { 60 | __u32 target_idx = h->values[i] & (ARRAY_SIZE(cm->values[i]) - 1); 61 | struct cm_value *value = &cm->values[i][target_idx]; 62 | value->value = estimate_rate(value->value, value->ts, now); 63 | value->ts = now; 64 | if (value->value < min) { 65 | min = value->value; 66 | } 67 | } 68 | return min; 69 | } 70 | -------------------------------------------------------------------------------- /src/ewma.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "common.h" 6 | #include "fixed-point.h" 7 | 8 | // estimate_avg_rate takes a previous rate and a duration that elapsed 9 | // since this rate has been determined, and estimates based on these and 10 | // WINDOW the current rate in packets per second. 11 | static __u32 FORCE_INLINE estimate_rate(__u32 old_rate, __u64 old_ts, __u64 now) 12 | { 13 | // The window after which old observations are discarded. 14 | // Chosen to be a power of two so that division can be done 15 | // with a bit shift. 16 | const __u32 WINDOW_NS = 1ull << 27; 17 | const __u32 ONE_SECOND_NS = 1000000000ull; 18 | 19 | if (old_ts >= now) { 20 | // Time went backward or stood still due to clockskew. Return the old value, 21 | // since we can't compute the current rate. 22 | return old_rate; 23 | } 24 | 25 | __s64 elapsed = now - old_ts; 26 | if (old_ts == 0 || elapsed >= WINDOW_NS) { 27 | // Either there is no previous measurement, or it's too old. 28 | // We need another sample to calculate a reliable rate. 29 | return 0; 30 | } 31 | 32 | __u32 rate_current = ONE_SECOND_NS / (__u32)elapsed; 33 | if (old_rate == 0) { 34 | // This is the first time we can calculate a rate, so use that 35 | // to initialize our estimate. 
36 | return rate_current; 37 | } 38 | 39 | const fpoint one = to_fixed_point(1, 0); 40 | fpoint a = div_by_int(to_fixed_point(elapsed, 0), WINDOW_NS); 41 | 42 | return to_int(a * rate_current + (one - a) * old_rate); 43 | } -------------------------------------------------------------------------------- /src/fixed-point.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "common.h" 8 | 9 | #define FRACTION_BITS 32 10 | 11 | typedef __u64 fpoint; 12 | 13 | static __u64 FORCE_INLINE to_fixed_point(__u32 integer, __u32 fraction) 14 | { 15 | return (((__u64)integer) << FRACTION_BITS) | (__u64)fraction; 16 | } 17 | 18 | static __u32 FORCE_INLINE to_int(fpoint a) 19 | { 20 | return a >> FRACTION_BITS; 21 | } 22 | 23 | static fpoint FORCE_INLINE div_by_int(fpoint dividend, __u32 divisor) 24 | { 25 | return dividend / divisor; 26 | } 27 | -------------------------------------------------------------------------------- /src/rakelimit.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "common.h" 12 | #include "countmin.h" 13 | #include "fasthash.h" 14 | 15 | #define FH_SEED (0x2d31e867) 16 | #define L3_SEED (0x6ad611c3) 17 | 18 | #define PARAMETER(type, name) \ 19 | ({ \ 20 | type __tmp; \ 21 | _Static_assert(sizeof(__tmp) <= sizeof(__u64), name " exceeds 64 bits"); \ 22 | asm("%0 = " name " ll" : "=r"(__tmp)); \ 23 | __tmp; \ 24 | }) 25 | 26 | enum address_gen { 27 | ADDRESS_IP = 0, // /32 or /128 28 | ADDRESS_NET = 1, // /24 or /48 29 | ADDRESS_WILDCARD = 2, // /0 30 | }; 31 | 32 | enum port_gen { 33 | PORT_SPECIFIED = 0, 34 | PORT_WILDCARD = 1, 35 | }; 36 | 37 | struct gen { 38 | int level; 39 | enum address_gen source; 40 | enum port_gen source_port; 41 | enum address_gen dest; 42 | enum port_gen dest_port; 43 | bool evaluate; 44 | }; 45 | 46 | struct address_hash { 47 | __u64 vals[ADDRESS_WILDCARD]; 48 | }; 49 | 50 | struct hash { 51 | struct address_hash src; 52 | struct address_hash dst; 53 | __u64 src_port; 54 | __u64 dst_port; 55 | }; 56 | 57 | static const struct gen generalisations[] = { 58 | /*level 0*/ 59 | {0, ADDRESS_IP, PORT_SPECIFIED, ADDRESS_IP, PORT_SPECIFIED, true}, 60 | 61 | /* level 1 */ 62 | {1, ADDRESS_NET, PORT_SPECIFIED, ADDRESS_IP, PORT_SPECIFIED, false}, 63 | {1, ADDRESS_IP, PORT_WILDCARD, ADDRESS_IP, PORT_SPECIFIED, false}, 64 | {1, ADDRESS_IP, PORT_SPECIFIED, ADDRESS_IP, PORT_WILDCARD, true}, 65 | 66 | /* level 2 */ 67 | /* *.*.*.*:i --> w.x.y.z:j */ 68 | {2, ADDRESS_WILDCARD, PORT_SPECIFIED, ADDRESS_IP, PORT_SPECIFIED, false}, 69 | /* a.b.c.*:* --> w.x.y.z:j */ 70 | {2, ADDRESS_NET, PORT_WILDCARD, ADDRESS_IP, PORT_SPECIFIED, false}, 71 | /* a.b.c.*:i --> w.x.y.z:* */ 72 | {2, ADDRESS_NET, PORT_SPECIFIED, ADDRESS_IP, PORT_WILDCARD, false}, 73 | /* a.b.c.d:* --> w.x.y.z:* */ 74 | {2, ADDRESS_IP, PORT_WILDCARD, ADDRESS_IP, PORT_WILDCARD, true}, 75 | 76 | /* level 3 */ 77 | /* *.*.*.*:* --> w.x.y.z:j */ 78 | {3, ADDRESS_WILDCARD, PORT_WILDCARD, ADDRESS_IP, PORT_SPECIFIED, false}, 79 | /* *.*.*.*:i --> w.x.y.z:* */ 80 | {3, ADDRESS_WILDCARD, PORT_SPECIFIED, ADDRESS_IP, PORT_WILDCARD, false}, 81 | /* A.B.C.*:* --> w.x.y.z:* */ 82 | {3, ADDRESS_NET, PORT_WILDCARD, ADDRESS_IP, PORT_WILDCARD, true}, 83 | 84 | /* level 4 */ 85 | {4, ADDRESS_WILDCARD, PORT_WILDCARD, ADDRESS_IP, PORT_WILDCARD, true}, 86 | }; 87 | 88 
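// Illustrative walk-through (not part of the original source): for an IPv4
// packet from a.b.c.d:i to w.x.y.z:j the three level-1 rows above count the
// packet under (a.b.c.0/24, i -> w.x.y.z, j), (a.b.c.d, * -> w.x.y.z, j) and
// (a.b.c.d, i -> w.x.y.z, *). process_packet() below keeps the maximum
// estimated rate seen within each level and, on the row marked evaluate = true,
// compares that maximum against LIMIT to decide whether the packet goes through
// drop_or_accept().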
| // collect number of packet drops per level 89 | struct bpf_map_def SEC("maps") stats = { 90 | .type = BPF_MAP_TYPE_ARRAY, 91 | .key_size = sizeof(__u32), 92 | .value_size = sizeof(__u64), 93 | .max_entries = 5, // 5 levels 94 | }; 95 | 96 | struct bpf_map_def SEC("maps") countmin = { 97 | .type = BPF_MAP_TYPE_ARRAY, 98 | .key_size = sizeof(__u32), 99 | .value_size = sizeof(struct countmin), 100 | .max_entries = ARRAY_SIZE(generalisations), 101 | }; 102 | 103 | static FORCE_INLINE void ipv6_hash(const struct in6_addr *ip, struct address_hash *a, struct address_hash *b) 104 | { 105 | a->vals[ADDRESS_IP] = fasthash64(ip, sizeof(*ip), FH_SEED); 106 | b->vals[ADDRESS_IP] = hashlittle(ip, sizeof(*ip), L3_SEED); 107 | a->vals[ADDRESS_NET] = fasthash64(ip, 48 / 8, FH_SEED); 108 | b->vals[ADDRESS_NET] = hashlittle(ip, 48 / 8, L3_SEED); 109 | } 110 | 111 | static FORCE_INLINE void ipv4_hash(struct in_addr ip, struct address_hash *a, struct address_hash *b) 112 | { 113 | a->vals[ADDRESS_IP] = fasthash64(&ip, sizeof(ip), FH_SEED); 114 | b->vals[ADDRESS_IP] = hashlittle(&ip, sizeof(ip), L3_SEED); 115 | ip.s_addr &= 0xffffff00; 116 | a->vals[ADDRESS_NET] = fasthash64(&ip, sizeof(ip), FH_SEED); 117 | b->vals[ADDRESS_NET] = hashlittle(&ip, sizeof(ip), L3_SEED); 118 | } 119 | 120 | static FORCE_INLINE __u64 hash_mix(__u64 a, __u64 b) 121 | { 122 | // Adapted from https://stackoverflow.com/a/27952689. The constant below 123 | // is derived from the golden ratio. 124 | a ^= b + 0x9e3779b97f4a7c15 + (a << 6) + (a >> 2); 125 | return a; 126 | } 127 | 128 | static FORCE_INLINE __u32 gen_hash(const struct gen *gen, const struct hash *ph) 129 | { 130 | __u64 tmp = 0; 131 | 132 | if (gen->source != ADDRESS_WILDCARD) { 133 | tmp = hash_mix(tmp, ph->src.vals[gen->source]); 134 | } 135 | 136 | if (gen->dest != ADDRESS_WILDCARD) { 137 | tmp = hash_mix(tmp, ph->dst.vals[gen->dest]); 138 | } 139 | 140 | if (gen->source_port != PORT_WILDCARD) { 141 | tmp = hash_mix(tmp, ph->src_port); 142 | } 143 | 144 | if (gen->dest_port != PORT_WILDCARD) { 145 | tmp = hash_mix(tmp, ph->dst_port); 146 | } 147 | 148 | // Adapted from fasthash32 149 | return tmp - (tmp >> 32); 150 | } 151 | 152 | static __u32 FORCE_INLINE add_to_node(__u32 node_idx, __u64 ts, const struct cm_hash *h) 153 | { 154 | struct countmin *node = bpf_map_lookup_elem(&countmin, &node_idx); 155 | if (node == NULL) { 156 | return -1; 157 | } 158 | return cm_add_and_query(node, ts, h); 159 | } 160 | 161 | static FORCE_INLINE void log_level_drop(__u32 level) 162 | { 163 | __u64 *count = bpf_map_lookup_elem(&stats, &level); 164 | if (count == NULL) { 165 | return; 166 | } 167 | (*count)++; 168 | } 169 | 170 | static FORCE_INLINE __u64 transport_offset_ipv4(struct __sk_buff *skb) 171 | { 172 | __u8 version_ihl = load_byte(skb, offsetof(struct iphdr, version_ihl)); 173 | return (version_ihl & 0xf) * sizeof(__u32); 174 | } 175 | 176 | static FORCE_INLINE __u64 transport_offset_ipv6(struct __sk_buff *skb) 177 | { 178 | // TODO: Check nexthdr to make sure it's UDP. 
179 | return sizeof(struct ip6_hdr); 180 | } 181 | 182 | static FORCE_INLINE int load_ipv6(struct in6_addr *ip, struct __sk_buff *skb, __u64 off) 183 | { 184 | return bpf_skb_load_bytes(skb, off, ip, sizeof(*ip)); 185 | } 186 | 187 | static FORCE_INLINE int drop_or_accept(__u32 level, fpoint limit, __u32 max_rate, __u32 rand) 188 | { 189 | if (div_by_int(to_fixed_point(limit, 0), max_rate) < to_fixed_point(0, rand)) { 190 | log_level_drop(level); 191 | return SKB_REJECT; 192 | } 193 | return SKB_PASS; 194 | } 195 | 196 | static FORCE_INLINE int process_packet(struct __sk_buff *skb, __u16 proto, __u64 ts, __u32 rand, __u64 *rate_exceeded_level) 197 | { 198 | __u32 limit = PARAMETER(__u32, "LIMIT"); 199 | struct hash ph[HASHFN_N]; 200 | struct in6_addr ipv6; 201 | struct in_addr ipv4; 202 | __u32 max_rate = 0; 203 | 204 | if (limit == 0) { 205 | return SKB_PASS; 206 | } 207 | 208 | __u64 troff; 209 | switch (proto) { 210 | case ETH_P_IP: 211 | troff = transport_offset_ipv4(skb); 212 | ipv4.s_addr = load_word(skb, BPF_NET_OFF + offsetof(struct iphdr, saddr)); 213 | ipv4_hash(ipv4, &ph[0].src, &ph[1].src); 214 | ipv4.s_addr = load_word(skb, BPF_NET_OFF + offsetof(struct iphdr, daddr)); 215 | ipv4_hash(ipv4, &ph[0].dst, &ph[1].dst); 216 | break; 217 | 218 | case ETH_P_IPV6: 219 | troff = transport_offset_ipv6(skb); 220 | if (load_ipv6(&ipv6, skb, offsetof(struct ip6_hdr, ip6_src))) { 221 | return SKB_REJECT; 222 | } 223 | ipv6_hash(&ipv6, &ph[0].src, &ph[1].src); 224 | if (load_ipv6(&ipv6, skb, offsetof(struct ip6_hdr, ip6_dst))) { 225 | return SKB_REJECT; 226 | } 227 | ipv6_hash(&ipv6, &ph[0].dst, &ph[1].dst); 228 | break; 229 | 230 | default: 231 | return SKB_REJECT; 232 | } 233 | 234 | __u16 src_port = load_half(skb, troff); 235 | ph[0].src_port = fasthash64(&src_port, sizeof(src_port), FH_SEED); 236 | ph[1].src_port = hashlittle(&src_port, sizeof(src_port), L3_SEED); 237 | __u16 dst_port = load_half(skb, troff + 2); 238 | ph[0].dst_port = fasthash64(&dst_port, sizeof(dst_port), FH_SEED); 239 | ph[1].dst_port = hashlittle(&dst_port, sizeof(dst_port), L3_SEED); 240 | 241 | #pragma clang loop unroll(full) 242 | for (int i = 0; i < ARRAY_SIZE(generalisations); i++) { 243 | const struct gen *gen = &generalisations[i]; 244 | const int level = gen->level; 245 | 246 | // Force clang to inline level on the stack rather than loading it from 247 | // .rodata later on. 
248 | asm volatile("" : : "r"(level) : "memory"); 249 | 250 | struct cm_hash h = {{ 251 | gen_hash(gen, &ph[0]), 252 | gen_hash(gen, &ph[1]), 253 | }}; 254 | 255 | __u32 rate = add_to_node(i, ts, &h); 256 | 257 | if (rate > max_rate) { 258 | max_rate = rate; 259 | } 260 | 261 | if (gen->evaluate) { 262 | if (max_rate > limit) { 263 | if (rate_exceeded_level != NULL) { 264 | *rate_exceeded_level = level; 265 | } 266 | return drop_or_accept(level, limit, max_rate, rand); 267 | } 268 | 269 | max_rate = 0; 270 | } 271 | } 272 | 273 | return SKB_PASS; 274 | } 275 | 276 | SEC("socket/ipv4") 277 | int filter_ipv4(struct __sk_buff *skb) 278 | { 279 | return process_packet(skb, ETH_P_IP, bpf_ktime_get_ns(), bpf_get_prandom_u32(), NULL); 280 | } 281 | 282 | SEC("socket/ipv6") 283 | int filter_ipv6(struct __sk_buff *skb) 284 | { 285 | return process_packet(skb, ETH_P_IPV6, bpf_ktime_get_ns(), bpf_get_prandom_u32(), NULL); 286 | } 287 | 288 | // a map used for testing 289 | struct bpf_map_def SEC("maps") test_single_result = { 290 | .type = BPF_MAP_TYPE_ARRAY, 291 | .key_size = sizeof(__u32), 292 | .value_size = sizeof(__u64), 293 | .max_entries = 3, 294 | }; 295 | 296 | static FORCE_INLINE int test_filter(struct __sk_buff *skb, __u16 proto) 297 | { 298 | __u64 *ts, *randp, *rate_exceeded_level; 299 | __u32 rand; 300 | 301 | ts = bpf_map_lookup_elem(&test_single_result, &(__u32){0}); 302 | if (ts == NULL) { 303 | return SKB_PASS; 304 | } 305 | 306 | randp = bpf_map_lookup_elem(&test_single_result, &(__u32){1}); 307 | if (randp == NULL) { 308 | return SKB_PASS; 309 | } 310 | 311 | if (*randp > 0xffffffff) { 312 | rand = bpf_get_prandom_u32(); 313 | } else { 314 | rand = *randp; 315 | } 316 | 317 | rate_exceeded_level = bpf_map_lookup_elem(&test_single_result, &(__u32){2}); 318 | if (rate_exceeded_level == NULL) { 319 | return SKB_PASS; 320 | } 321 | 322 | // Always reset the level to some weird value that isn't zero. 323 | *rate_exceeded_level = -1; 324 | 325 | return process_packet(skb, proto, *ts, rand, rate_exceeded_level); 326 | } 327 | 328 | SEC("socket/test_ipv4") 329 | int test_ipv4(struct __sk_buff *skb) 330 | { 331 | return test_filter(skb, ETH_P_IP); 332 | } 333 | 334 | SEC("socket/test_ipv6") 335 | int test_ipv6(struct __sk_buff *skb) 336 | { 337 | return test_filter(skb, ETH_P_IPV6); 338 | } 339 | 340 | // test_fp_cmp takes the element with the index 0 out of the test_single_result map, and 341 | // compares if it is equal to some randomly chosen integer converted to a fixed-point (27 in this case). 342 | // Then we do the same thing the other way around and put a converted 19 into the map to ensure the userspace 343 | // implementation does its job as well 344 | SEC("socket/test1") 345 | int test_fp_cmp(struct __sk_buff *skb) 346 | { 347 | int i = 0; 348 | __u64 *fp = bpf_map_lookup_elem(&test_single_result, &i); 349 | if (fp == NULL) { 350 | return __LINE__; 351 | } 352 | // first check the value from userside 353 | if (to_fixed_point(27, 0) != *fp) { 354 | return __LINE__; 355 | } 356 | // then replace it 357 | *fp = to_fixed_point(19, 0); 358 | bpf_map_update_elem(&test_single_result, &i, fp, 0); 359 | return 0; 360 | } 361 | 362 | // test_ewma takes a previous rate from index 0 (as a u32) and an old and 363 | // new timestamp from index 1-2 (as u64) and estimates the current rate. 364 | // The result is written to the previous rate. 
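// Worked example (illustrative, using the inputs from TestBPFFEwma in
// rakelimit_test.go): old_rate = 50, old_ts = 346534651, now = 415841581 gives
//   elapsed      = 69306930 ns
//   rate_current = 1000000000 / 69306930    = 14 pps (integer division)
//   a            = 69306930 / 2^27          ~= 0.516
//   estimate     = 0.516 * 14 + 0.484 * 50  ~= 31.4 -> 31
// which is the value the Go test expects to read back from index 0.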
365 | SEC("socket/test2") 366 | int test_ewma(struct __sk_buff *skb) 367 | { 368 | __u64 *old_rate = bpf_map_lookup_elem(&test_single_result, &(__u32){0}); 369 | if (old_rate == NULL) { 370 | return SKB_REJECT; 371 | } 372 | 373 | __u64 *old_ts = bpf_map_lookup_elem(&test_single_result, &(__u32){1}); 374 | if (old_ts == NULL) { 375 | return SKB_REJECT; 376 | } 377 | 378 | __u64 *now = bpf_map_lookup_elem(&test_single_result, &(__u32){2}); 379 | if (now == NULL) { 380 | return SKB_REJECT; 381 | } 382 | 383 | *old_rate = estimate_rate(*old_rate, *old_ts, *now); 384 | return SKB_PASS; 385 | } 386 | 387 | char __license[] SEC("license") = "Dual BSD/GPL"; 388 | --------------------------------------------------------------------------------